├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── docs
│ ├── docs
│ │ ├── ade20k.md
│ │ ├── coco.md
│ │ ├── imagenet.md
│ │ ├── img
│ │ │ ├── ade20k.png
│ │ │ ├── banner.png
│ │ │ ├── coco.jpg
│ │ │ ├── connect.png
│ │ │ ├── connect2.png
│ │ │ ├── examples.png
│ │ │ ├── imagenet.jpeg
│ │ │ ├── language_model.png
│ │ │ ├── pascalvoc2012.png
│ │ │ ├── results.png
│ │ │ ├── sotabencheval.png
│ │ │ └── squad20.png
│ │ ├── index.md
│ │ ├── pascalvoc.md
│ │ ├── squad.md
│ │ ├── wikitext103.md
│ │ └── wmt.md
│ ├── mkdocs.yml
│ └── site
│   ├── img
│   │ └── squad20.png
│   ├── squad
│   │ └── index.html
│   └── wmt
│     └── index.html
├── requirements-dev.txt
├── requirements.txt
├── setup.cfg
├── setup.py
└── sotabencheval
  ├── __init__.py
  ├── core
  │ ├── __init__.py
  │ ├── cache.py
  │ └── evaluator.py
  ├── image_classification
  │ ├── __init__.py
  │ ├── imagenet.py
  │ └── utils.py
  ├── language_modelling
  │ ├── __init__.py
  │ └── wikitext.py
  ├── machine_translation
  │ ├── __init__.py
  │ ├── languages.py
  │ ├── metrics.py
  │ └── wmt.py
  ├── natural_language_inference
  │ ├── __init__.py
  │ └── multinli.py
  ├── object_detection
  │ ├── __init__.py
  │ ├── coco.py
  │ ├── coco_eval.py
  │ └── utils.py
  ├── question_answering
  │ ├── __init__.py
  │ ├── evaluate_v11.py
  │ ├── evaluate_v20.py
  │ ├── squad.py
  │ └── utils.py
  ├── semantic_segmentation
  │ ├── __init__.py
  │ ├── ade20k.py
  │ ├── pascalvoc.py
  │ └── utils.py
  ├── utils.py
  └── version.py
/.gitignore:
--------------------------------------------------------------------------------
1 | __pycache__
2 | *.egg-info
3 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
1 | .PHONY: help default docs build release clean test check fmt
2 | .DEFAULT_GOAL := help
3 | PROJECT := sotabench-eval
4 |
5 |
6 | help: ## Show help.
7 | @grep -E '^[a-zA-Z2_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}'
8 |
9 |
10 | docs: ## Build documentation.
11 | @cd docs && make html && open _build/html/index.html
12 |
13 |
14 | build: ## Build the source and wheel distribution packages.
15 | @python3 setup.py sdist bdist_wheel
16 |
17 |
18 | release: build ## Build and upload the package to PyPI.
19 | @twine upload --repository-url https://upload.pypi.org/legacy/ dist/*
20 | @rm -fr build dist sotabench-eval.egg-info
21 |
22 |
23 | clean: ## Cleanup the project
24 | @find . -type d -name __pycache__ -delete
25 | @find . -type f -name "*.py[cod]" -delete
26 | @rm -fr build dist sotabench-eval.egg-info
27 | @rm -fr docs/_build/*
28 |
29 |
30 | test: ## Run tests and code checks.
31 | @py.test -v --cov "$(PROJECT)" "$(PROJECT)"
32 |
33 |
34 | check: ## Run code checks.
35 | @flake8 "$(PROJECT)"
36 | @pydocstyle "$(PROJECT)"
37 |
38 |
39 | fmt: ## Format the code.
40 | @black --target-version=py37 --safe --line-length=79 "$(PROJECT)"
41 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |

2 |
3 | --------------------------------------------------------------------------------
4 |
5 | [](https://badge.fury.io/py/sotabencheval) [](https://paperswithcode.github.io/sotabench-eval/)
6 |
7 | `sotabencheval` is a framework-agnostic library that contains a collection of deep learning benchmarks you can use to benchmark your models. It can be used in conjunction with the [sotabench](https://www.sotabench.com) service to record results for models, so the community can compare model performance on different tasks. It also acts as a continuous integration style service for your repository, benchmarking your models on each commit.
8 |
9 | ## Benchmarks Supported
10 |
11 | - [ADE20K](https://paperswithcode.github.io/sotabench-eval/ade20k/) (Semantic Segmentation)
12 | - [COCO](https://paperswithcode.github.io/sotabench-eval/coco/) (Object Detection)
13 | - [ImageNet](https://paperswithcode.github.io/sotabench-eval/imagenet/) (Image Classification)
14 | - [SQuAD](https://paperswithcode.github.io/sotabench-eval/squad/) (Question Answering)
15 | - [WikiText-103](https://paperswithcode.github.io/sotabench-eval/wikitext103/) (Language Modelling)
16 | - [WMT](https://paperswithcode.github.io/sotabench-eval/wmt/) (Machine Translation)
17 |
18 | PRs welcome for further benchmarks!
19 |
20 | ## Installation
21 |
22 | Requires Python 3.6+.
23 |
24 | ```bash
25 | pip install sotabencheval
26 | ```
27 |
28 | ## Get Benching! 🏋️
29 |
30 | You should read the [full documentation here](https://paperswithcode.github.io/sotabench-eval/index.html), which contains guidance on getting started and connecting to [sotabench](https://www.sotabench.com).
31 |
32 | Integration is lightweight. For example, if you are evaluating an ImageNet model, you initialize an Evaluator object and (optionally) link it to the paper the model comes from:
33 |
34 | ```python
35 | from sotabencheval.image_classification import ImageNetEvaluator
36 | evaluator = ImageNetEvaluator(
37 | model_name='FixResNeXt-101 32x48d',
38 | paper_arxiv_id='1906.06423')
39 | ```
40 |
41 | Then for each batch of predictions your model makes on ImageNet, pass a dictionary with image IDs as keys and `np.ndarray`s of logits as values to the `evaluator.add` method:
42 |
43 | ```python
44 | evaluator.add(output_dict=dict(zip(image_ids, batch_output)))
45 | ```
46 |
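Once all predictions have been added, call the evaluator's `save` method so the results are recorded when the script runs on the sotabench server (see the documentation for the full workflow):

```python
evaluator.save()
```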
47 | The evaluation logic just needs to be written in a `sotabench.py` file and sotabench will run it on each commit and record the results:
48 |
49 |
50 |
51 | ## Contributing
52 |
53 | All contributions welcome!
54 |
55 |
56 |
57 |
--------------------------------------------------------------------------------
/docs/docs/ade20k.md:
--------------------------------------------------------------------------------
1 | # ADE20K
2 |
3 | 
4 |
5 | You can view the ADE20K leaderboard [here](https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the ADE20K dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server ADE20K data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The ADE20K data is located in the root of your repository on the server at `.data/vision/ade20k`. This folder contains:
30 |
31 | - `ADEChallengeData2016.zip` - containing validation images and annotations
32 |
33 | Your local ADE20K files may have a different file directory structure, so you
34 | can use control flow like below to change the data path if the script is being
35 | run on sotabench servers:
36 |
37 | ``` python
38 | from sotabencheval.utils import is_server
39 |
40 | if is_server():
41 | DATA_ROOT = './.data/vision/ade20k'
42 | else: # local settings
43 | DATA_ROOT = '/home/ubuntu/my_data/'
44 | ```
45 |
46 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
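Note that the server provides `ADEChallengeData2016.zip` as an archive rather than an extracted folder, so your script may need to unpack it before loading the data. A minimal sketch using the standard library (the extraction directory is an assumption; point it wherever your data-loading code expects the files):

``` python
import os
import zipfile

archive_path = os.path.join(DATA_ROOT, 'ADEChallengeData2016.zip')
extract_dir = os.path.join(DATA_ROOT, 'extracted')  # hypothetical target directory

if os.path.exists(archive_path) and not os.path.isdir(extract_dir):
    with zipfile.ZipFile(archive_path) as zf:
        zf.extractall(extract_dir)
```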
47 |
48 | ## How Do I Initialize an Evaluator?
49 |
50 | Add this to your code - before you start batching over the dataset and making predictions:
51 |
52 | ``` python
53 | from sotabencheval.semantic_segmentation import ADE20KEvaluator
54 |
55 | evaluator = ADE20KEvaluator(model_name='My Super Model')
56 | ```
57 |
58 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
59 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val)
60 | then you will enable direct comparison with the paper. For example:
61 |
62 | ``` python
63 | from sotabencheval.semantic_segmentation import ADE20KEvaluator
64 |
65 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065')
66 | ```
67 |
68 | The above will directly compare with the result of the paper when run on the server.
69 |
70 | ## How Do I Evaluate Predictions?
71 |
72 | The evaluator object has an `.add()` method to submit predictions by batch or in full.
73 |
74 | For ADE20K there are two required arguments: `outputs`, a 1D np.ndarray of semantic class predictions per pixel,
75 | and `targets`, a 1D np.ndarray of ground truth semantic classes per pixel. In other words, it requires flattened
76 | inputs and outputs.
77 |
78 | To elaborate, suppose you are making predictions, batch by batch, and have your model output
79 | and the original targets with batch_size `32`, and image size `(520, 480)`. The shape of your outputs might look like:
80 |
81 | ``` python
82 | batch_output.shape
83 | >> (32, 150, 520, 480) # where 150 is the number of ADE20K classes
84 |
85 | batch_target.shape
86 | >> (32, 520, 480)
87 | ```
88 |
89 | We can flatten the entire output and targets to 1D vectors for each pixel:
90 |
91 | ``` python
92 | flattened_batch_output.shape
93 | >> (7987200) # flatten by taking the max class prediction
94 | # (batch_output.argmax(1).flatten() in torch with class as second dimension)
95 |
96 | flattened_batch_target.shape
97 | >> (7987200) # (batch_target.flatten() in torch)
98 | ```
99 |
100 | The output might look something like this:
101 |
102 | ``` python
103 | flattened_batch_output
104 | >> array([6, 6, 6, 6, 6, ...])
105 |
106 | flattened_batch_target
107 | >> array([6, 6, 6, 6, 6, ...])
108 | ```
109 |
110 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5
111 | pixels - so the model is correct.
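If you are not using PyTorch, the same flattening can be done directly with NumPy. A small sketch, assuming `batch_output` has shape `(batch, classes, height, width)` and `batch_target` has shape `(batch, height, width)` as above:

``` python
import numpy as np

# per-pixel predicted class: argmax over the class dimension, then flatten
flattened_batch_output = np.argmax(batch_output, axis=1).flatten()

# per-pixel ground-truth class
flattened_batch_target = batch_target.flatten()
```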
112 |
113 | These flattened arrays can then be passed into the `.add()` method of the evaluator:
114 |
115 | ``` python
116 | evaluator.add(outputs=flattened_batch_output,
117 |               targets=flattened_batch_target)
118 | ```
119 |
120 | You can do this all at once in a single call to `add()`, but more naturally, you will
121 | probably loop over the dataset and call the method for the outputs of each batch.
122 | That would look something like this (for a PyTorch example):
123 |
124 | ``` python
125 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065')
126 |
127 | with torch.no_grad():
128 | for image, target in tqdm.tqdm(data_loader_test):
129 | image, target = image.to('cuda'), target.to('cuda')
130 | output = model(image)
131 | output = output['out']
132 |
133 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
134 | ```
135 |
136 | When you are done, you can get the results locally by running:
137 |
138 | ``` python
139 | evaluator.get_results()
140 | ```
141 |
142 | But for the server you want to save the results by running:
143 |
144 | ``` python
145 | evaluator.save()
146 | ```
147 |
148 | This method serialises the results and model metadata and stores them in the server database.
149 |
150 | ## How Do I Cache Evaluation?
151 |
152 | Sotabench reruns your script on every commit. This is good because it acts like
153 | continuous integration in checking for bugs and changes, but can be annoying
154 | if the model hasn't changed and evaluation is lengthy.
155 |
156 | Fortunately sotabencheval has caching logic that you can use.
157 |
158 | The idea is that after the first batch, we hash the model outputs and the
159 | current metrics and this tells us if the model is the same given the dataset.
160 | You can include hashing within an evaluation loop as follows (the example below is
161 | for a PyTorch repository):
162 |
163 | ``` python
164 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065')
165 |
166 | with torch.no_grad():
167 | for image, target in tqdm.tqdm(data_loader_test):
168 | image, target = image.to('cuda'), target.to('cuda')
169 | output = model(image)
170 | output = output['out']
171 |
172 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
173 | if evaluator.cache_exists:
174 | break
175 |
176 | evaluator.save()
177 | ```
178 |
179 | If the hash is the same as on the server, we infer that the model hasn't changed, so
180 | we simply return the cached results rather than running the whole evaluation again.
181 |
182 | Caching is very useful if you have large models, or a repository that is evaluating
183 | multiple models, as it speeds up evaluation significantly.
184 |
185 | ## Need More Help?
186 |
187 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
188 | forums if you have any questions or difficulties.
189 |
--------------------------------------------------------------------------------
/docs/docs/coco.md:
--------------------------------------------------------------------------------
1 | # COCO
2 |
3 | 
4 |
5 | You can view the COCO minival leaderboard [here](https://sotabench.com/benchmarks/object-detection-on-coco-minival).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the COCO dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server COCO data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The COCO validation data is located in the root of your repository on the server at `.data/vision/coco`. This folder contains:
30 |
31 | - `annotations_trainval2017.zip` - containing annotations for the validation images
32 | - `val2017.zip` - containing the validation images
33 |
34 | Your local COCO files may have a different file directory structure, so you
35 | can use control flow like below to change the data path if the script is being
36 | run on sotabench servers:
37 |
38 | ``` python
39 | from sotabencheval.utils import is_server
40 |
41 | if is_server():
42 | DATA_ROOT = './.data/vision/coco'
43 | else: # local settings
44 | DATA_ROOT = '/home/ubuntu/my_data/'
45 | ```
46 |
47 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
48 |
49 | ## How Do I Initialize an Evaluator?
50 |
51 | Add this to your code - before you start batching over the dataset and making predictions:
52 |
53 | ``` python
54 | from sotabencheval.object_detection import COCOEvaluator
55 |
56 | evaluator = COCOEvaluator(model_name='My Super Model')
57 | ```
58 |
59 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
60 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/object-detection-on-coco-minival)
61 | then you will enable direct comparison with the paper's model. For example:
62 |
63 | ``` python
64 | from sotabencheval.object_detection import COCOEvaluator
65 |
66 | evaluator = COCOEvaluator(model_name='Mask R-CNN', paper_arxiv_id='1703.06870')
67 | ```
68 |
69 | The above will directly compare with the result of the paper when run on the server.
70 |
71 | ## How Do I Evaluate Predictions?
72 |
73 | The evaluator object has an [.add()](https://github.com/paperswithcode/sotabench-eval/blob/a788d17252913e5f2d24733845de80aec23101fb/sotabencheval/object_detection/coco.py#L187) method to submit predictions by batch or in full.
74 |
75 | For COCO the expected input is a list of dictionaries, where each dictionary contains detection information
76 | that will be used by the [loadRes](https://github.com/paperswithcode/sotabench-eval/blob/a788d17252913e5f2d24733845de80aec23101fb/sotabencheval/object_detection/coco_eval.py#L236) method based on the [pycocotools](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI/pycocotools) API.
77 |
78 | Each detection can take a dictionary
79 | like the following:
80 |
81 | ``` python
82 | {'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844],
83 | 'score': 0.999152421951294, 'category_id': 1}
84 | ```
85 |
86 | For this benchmark, only bounding box detection ('bbox') is performed at present.
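If your model returns boxes in `(xmin, ymin, xmax, ymax)` format together with score and label tensors (as torchvision detection models do), a sketch of the conversion into this list-of-dictionaries format might look like the following; the helper name and the assumed prediction keys are illustrative, and a fuller version appears in the complete `sotabench.py` example at the end of this page:

``` python
import torch

def to_coco_detections(image_id, prediction):
    # convert (xmin, ymin, xmax, ymax) boxes into COCO's (x, y, width, height) format
    xmin, ymin, xmax, ymax = prediction["boxes"].unbind(1)
    boxes = torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1).tolist()
    scores = prediction["scores"].tolist()
    labels = prediction["labels"].tolist()
    return [
        {"image_id": image_id, "category_id": labels[k], "bbox": box, "score": scores[k]}
        for k, box in enumerate(boxes)
    ]
```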
87 |
88 | You can do this all at once in a single call to `add()`, but more naturally, you will
89 | probably loop over the dataset and call the method for the outputs of each batch.
90 | That would look something like this (for a PyTorch example):
91 |
92 | ``` python
93 | ...
94 |
95 | evaluator = COCOEvaluator(
96 | model_name='Mask R-CNN',
97 | paper_arxiv_id='1703.06870')
98 |
99 | with torch.no_grad():
100 | for i, (input, target) in enumerate(data_loader):
101 | ...
102 | output = model(input)
103 | # potentially formatting of the output here to be a list of dicts
104 | evaluator.add(output)
105 | ```
106 |
107 | When you are done, you can get the results locally by running:
108 |
109 | ``` python
110 | evaluator.get_results()
111 | ```
112 |
113 | But for the server you want to save the results by running:
114 |
115 | ``` python
116 | evaluator.save()
117 | ```
118 |
119 | This method serialises the results and model metadata and stores them in the server database.
120 |
121 | ## How Do I Cache Evaluation?
122 |
123 | Sotabench reruns your script on every commit. This is good because it acts like
124 | continuous integration in checking for bugs and changes, but can be annoying
125 | if the model hasn't changed and evaluation is lengthy.
126 |
127 | Fortunately sotabencheval has caching logic that you can use.
128 |
129 | The idea is that after the first batch, we hash the model outputs and the
130 | current metrics and this tells us if the model is the same given the dataset.
131 | You can include hashing within an evaluation loop as follows (the example below is
132 | for a PyTorch repository):
133 |
134 | ``` python
135 | with torch.no_grad():
136 | for i, (input, target) in enumerate(data_loader):
137 | ...
138 | output = model(input)
139 | # potentially formatting of the output here to be a list of dicts
140 | evaluator.add(output)
141 |
142 | if evaluator.cache_exists:
143 | break
144 |
145 | evaluator.save()
146 | ```
147 |
148 | If the hash is the same as on the server, we infer that the model hasn't changed, so
149 | we simply return the cached results rather than running the whole evaluation again.
150 |
151 | Caching is very useful if you have large models, or a repository that is evaluating
152 | multiple models, as it speeds up evaluation significantly.
153 |
154 | ## A Full sotabench.py Example
155 |
156 | Below we show an implementation for a model from the torchvision repository. This
157 | incorporates all the features explained above: (a) using the server data root,
158 | (b) using the COCO Evaluator, and (c) caching the evaluation logic. Note that the
159 | torchbench dependency is just to get some processing logic and transforms; the evaluation
160 | is done with sotabencheval.
161 |
162 | ``` python
163 | import os
164 | import tqdm
165 | import torch
166 | from torch.utils.data import DataLoader
167 | from torchbench.utils import send_model_to_device
168 | from torchbench.object_detection.transforms import Compose, ConvertCocoPolysToMask, ToTensor
169 | import torchvision
170 | import torchbench.datasets  # provides torchbench.datasets.CocoDetection used below
171 |
172 | from sotabencheval.object_detection import COCOEvaluator
173 | from sotabencheval.utils import is_server
174 |
175 | if is_server():
176 | DATA_ROOT = './.data/vision/coco'
177 | else: # local settings
178 | DATA_ROOT = '/home/ubuntu/my_data/'
179 |
180 | def coco_data_to_device(input, target, device: str = "cuda", non_blocking: bool = True):
181 | input = list(inp.to(device=device, non_blocking=non_blocking) for inp in input)
182 | target = [{k: v.to(device=device, non_blocking=non_blocking) for k, v in t.items()} for t in target]
183 | return input, target
184 |
185 | def coco_collate_fn(batch):
186 | return tuple(zip(*batch))
187 |
188 | def coco_output_transform(output, target):
189 | output = [{k: v.to("cpu") for k, v in t.items()} for t in output]
190 | return output, target
191 |
192 | transforms = Compose([ConvertCocoPolysToMask(), ToTensor()])
193 |
194 | model = torchvision.models.detection.__dict__['maskrcnn_resnet50_fpn'](num_classes=91, pretrained=True)
195 |
196 | model, device = send_model_to_device(
197 | model, device='cuda', num_gpu=1
198 | )
199 | model.eval()
200 |
201 | model_output_transform = coco_output_transform
202 | send_data_to_device = coco_data_to_device
203 | collate_fn = coco_collate_fn
204 |
205 | test_dataset = torchbench.datasets.CocoDetection(
206 | root=os.path.join(DATA_ROOT, "val%s" % '2017'),
207 | annFile=os.path.join(
208 | DATA_ROOT, "annotations/instances_val%s.json" % '2017'
209 | ),
210 | transform=None,
211 | target_transform=None,
212 | transforms=transforms,
213 | download=True,
214 | )
215 | test_loader = DataLoader(
216 | test_dataset,
217 | batch_size=8,
218 | shuffle=False,
219 | num_workers=4,
220 | pin_memory=True,
221 | collate_fn=collate_fn,
222 | )
223 | test_loader.no_classes = 91 # Number of classes for COCO Detection
224 |
225 | iterator = tqdm.tqdm(test_loader, desc="Evaluation", mininterval=5)
226 |
227 | evaluator = COCOEvaluator(
228 | root=DATA_ROOT,
229 | model_name='Mask R-CNN (ResNet-50-FPN)',
230 | paper_arxiv_id='1703.06870')
231 |
232 | def prepare_for_coco_detection(predictions):
233 | coco_results = []
234 | for original_id, prediction in predictions.items():
235 | if len(prediction) == 0:
236 | continue
237 |
238 | boxes = prediction["boxes"]
239 | boxes = convert_to_xywh(boxes).tolist()
240 | scores = prediction["scores"].tolist()
241 | labels = prediction["labels"].tolist()
242 |
243 | coco_results.extend(
244 | [
245 | {
246 | "image_id": original_id,
247 | "category_id": labels[k],
248 | "bbox": box,
249 | "score": scores[k],
250 | }
251 | for k, box in enumerate(boxes)
252 | ]
253 | )
254 | return coco_results
255 |
256 | def convert_to_xywh(boxes):
257 | xmin, ymin, xmax, ymax = boxes.unbind(1)
258 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1)
259 |
260 | with torch.no_grad():
261 | for i, (input, target) in enumerate(iterator):
262 | input, target = send_data_to_device(input, target, device=device)
263 | original_output = model(input)
264 | output, target = model_output_transform(original_output, target)
265 | result = {
266 | tar["image_id"].item(): out for tar, out in zip(target, output)
267 | }
268 | result = prepare_for_coco_detection(result)
269 |
270 | evaluator.add(result)
271 |
272 | if evaluator.cache_exists:
273 | break
274 |
275 | evaluator.save()
276 | ```
277 |
278 | ## Need More Help?
279 |
280 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
281 | forums if you have any questions or difficulties.
282 |
--------------------------------------------------------------------------------
/docs/docs/imagenet.md:
--------------------------------------------------------------------------------
1 | # ImageNet
2 |
3 | 
4 |
5 | You can view the ImageNet leaderboard [here](https://sotabench.com/benchmarks/image-classification-on-imagenet).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the ImageNet dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server ImageNet data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The ImageNet validation data is located in the root of your repository on the server at `.data/vision/imagenet`. This folder contains:
30 |
31 | - `ILSVRC2012_devkit_t12.tar.gz` - containing metadata
32 | - `ILSVRC2012_img_val.tar` - containing the validation images
33 |
34 | Your local ImageNet files may have a different file directory structure, so you
35 | can use control flow like below to change the data path if the script is being
36 | run on sotabench servers:
37 |
38 | ``` python
39 | from sotabencheval.utils import is_server
40 |
41 | if is_server():
42 | DATA_ROOT = './.data/vision/imagenet'
43 | else: # local settings
44 | DATA_ROOT = '/home/ubuntu/my_data/'
45 | ```
46 |
47 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
48 |
49 | ## How Do I Initialize an Evaluator?
50 |
51 | Add this to your code - before you start batching over the dataset and making predictions:
52 |
53 | ``` python
54 | from sotabencheval.image_classification import ImageNetEvaluator
55 |
56 | evaluator = ImageNetEvaluator(model_name='My Super Model')
57 | ```
58 |
59 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
60 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/image-classification-on-imagenet)
61 | then you will enable direct comparison with the paper's model. For example:
62 |
63 | ``` python
64 | from sotabencheval.image_classification import ImageNetEvaluator
65 |
66 | evaluator = ImageNetEvaluator(model_name='FixResNeXt-101 32x48d',
67 | paper_arxiv_id='1906.06423')
68 | ```
69 |
70 | The above will directly compare with the result of the paper when run on the server.
71 |
72 | ## How Do I Evaluate Predictions?
73 |
74 | The evaluator object has an `.add()` method to submit predictions by batch or in full.
75 |
76 | For ImageNet the expected input is a dictionary of outputs, where each key is an
77 | image ID from ImageNet and each value is a list or 1D numpy array of logits for that
78 | image ID. For example:
79 |
80 | ``` python
81 | evaluator.add({'ILSVRC2012_val_00000293': np.array([1.04243, ...]),
82 | 'ILSVRC2012_val_00000294': np.array([-2.3677, ...])})
83 | ```
84 |
85 | You can do this all at once in a single call to `add()`, but more naturally, you will
86 | probably loop over the dataset and call the method for the outputs of each batch.
87 | That would look something like this (for a PyTorch example):
88 |
89 | ``` python
90 | for i, (input, target) in enumerate(test_loader):
91 | input = input.to(device='cuda', non_blocking=True)
92 | target = target.to(device='cuda', non_blocking=True)
93 | output = model(input)
94 |
95 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
96 |
97 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
98 | ```
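Here `get_img_id` is a small helper that derives the image ID from the validation image filename (assuming a torchvision-style folder layout); the full example at the bottom of this page defines it the same way:

``` python
def get_img_id(image_name):
    # e.g. '.../ILSVRC2012_val_00000293.JPEG' -> 'ILSVRC2012_val_00000293'
    return image_name.split('/')[-1].replace('.JPEG', '')
```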
99 |
100 | When you are done, you can get the results locally by running:
101 |
102 | ``` python
103 | evaluator.get_results()
104 | ```
105 |
106 | But for the server you want to save the results by running:
107 |
108 | ``` python
109 | evaluator.save()
110 | ```
111 |
112 | This method serialises the results and model metadata and stores them in the server database.
113 |
114 | ## How Do I Cache Evaluation?
115 |
116 | Sotabench reruns your script on every commit. This is good because it acts like
117 | continuous integration in checking for bugs and changes, but can be annoying
118 | if the model hasn't changed and evaluation is lengthy.
119 |
120 | Fortunately sotabencheval has caching logic that you can use.
121 |
122 | The idea is that after the first batch, we hash the model outputs and the
123 | current metrics and this tells us if the model is the same given the dataset.
124 | You can include hashing within an evaluation loop as follows (the example below is
125 | for a PyTorch repository):
126 |
127 | ``` python
128 | with torch.no_grad():
129 | for i, (input, target) in enumerate(test_loader):
130 | input = input.to(device='cuda', non_blocking=True)
131 | target = target.to(device='cuda', non_blocking=True)
132 | output = model(input)
133 |
134 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
135 |
136 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
137 |
138 | if evaluator.cache_exists:
139 | break
140 |
141 | evaluator.save()
142 | ```
143 |
144 | If the hash is the same as on the server, we infer that the model hasn't changed, so
145 | we simply return the cached results rather than running the whole evaluation again.
146 |
147 | Caching is very useful if you have large models, or a repository that is evaluating
148 | multiple models, as it speeds up evaluation significantly.
149 |
150 | ## A full sotabench.py example
151 |
152 | Below we show an implementation for a model from the torchvision repository. This
153 | incorporates all the features explained above: (a) using the server data root,
154 | (b) using the ImageNet Evaluator, and (c) caching the evaluation logic:
155 |
156 | ``` python
157 | import numpy as np
158 | import PIL
159 | import torch
160 | from torchvision.models.resnet import resnext101_32x8d
161 | import torchvision.transforms as transforms
162 | from torchvision.datasets import ImageNet
163 | from torch.utils.data import DataLoader
164 |
165 | from sotabencheval.image_classification import ImageNetEvaluator
166 | from sotabencheval.utils import is_server
167 |
168 | if is_server():
169 | DATA_ROOT = './.data/vision/imagenet'
170 | else: # local settings
171 | DATA_ROOT = '/home/ubuntu/my_data/'
172 |
173 | model = resnext101_32x8d(pretrained=True)
174 |
175 | input_transform = transforms.Compose([
176 | transforms.Resize(256, PIL.Image.BICUBIC),
177 | transforms.CenterCrop(224),
178 | transforms.ToTensor(),
179 | transforms.Normalize(
180 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
181 | ])
182 |
183 | test_dataset = ImageNet(
184 | DATA_ROOT,
185 | split="val",
186 | transform=input_transform,
187 | target_transform=None,
188 | download=True,
189 | )
190 |
191 | test_loader = DataLoader(
192 | test_dataset,
193 | batch_size=128,
194 | shuffle=False,
195 | num_workers=4,
196 | pin_memory=True,
197 | )
198 |
199 | model = model.cuda()
200 | model.eval()
201 |
202 | evaluator = ImageNetEvaluator(
203 | model_name='ResNeXt-101-32x8d',
204 | paper_arxiv_id='1611.05431')
205 |
206 | def get_img_id(image_name):
207 | return image_name.split('/')[-1].replace('.JPEG', '')
208 |
209 | with torch.no_grad():
210 | for i, (input, target) in enumerate(test_loader):
211 | input = input.to(device='cuda', non_blocking=True)
212 | target = target.to(device='cuda', non_blocking=True)
213 | output = model(input)
214 |
215 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
216 |
217 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
218 |
219 | if evaluator.cache_exists:
220 | break
221 |
222 | evaluator.save()
223 | ```
224 |
225 | ## Need More Help?
226 |
227 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
228 | forums if you have any questions or difficulties.
229 |
--------------------------------------------------------------------------------
/docs/docs/img/ade20k.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/ade20k.png
--------------------------------------------------------------------------------
/docs/docs/img/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/banner.png
--------------------------------------------------------------------------------
/docs/docs/img/coco.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/coco.jpg
--------------------------------------------------------------------------------
/docs/docs/img/connect.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/connect.png
--------------------------------------------------------------------------------
/docs/docs/img/connect2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/connect2.png
--------------------------------------------------------------------------------
/docs/docs/img/examples.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/examples.png
--------------------------------------------------------------------------------
/docs/docs/img/imagenet.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/imagenet.jpeg
--------------------------------------------------------------------------------
/docs/docs/img/language_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/language_model.png
--------------------------------------------------------------------------------
/docs/docs/img/pascalvoc2012.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/pascalvoc2012.png
--------------------------------------------------------------------------------
/docs/docs/img/results.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/results.png
--------------------------------------------------------------------------------
/docs/docs/img/sotabencheval.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/sotabencheval.png
--------------------------------------------------------------------------------
/docs/docs/img/squad20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/squad20.png
--------------------------------------------------------------------------------
/docs/docs/index.md:
--------------------------------------------------------------------------------
1 | # Welcome to sotabencheval!
2 |
3 | 
4 |
5 | You have reached the docs for the [sotabencheval](https://github.com/paperswithcode/sotabench-eval) library. This library contains a collection of deep learning benchmarks you can use to
6 | benchmark your models. It can be used in conjunction with the
7 | [sotabench.com](http://www.sotabench.com) website to record results for models, so the community
8 | can compare model performance on different tasks. It also acts as a continuous integration style
9 | service for your repository, benchmarking your models on each commit.
10 |
11 | **sotabencheval** is a general benchmarking library, meaning it is designed to support all deep learning frameworks,
12 | and requires minimal code integration. There are alternative sotabench APIs you can use that are
13 | specialized for particular frameworks, e.g. [torchbench](https://github.com/paperswithcode/torchbench) for PyTorch.
14 |
15 |
16 | ## Getting Started : Benchmarking on ImageNet
17 |
18 | **Step One : Create a sotabench.py file in the root of your repository**
19 |
20 | This can contain whatever logic you need to load and process the dataset, and to
21 | produce model predictions for it. To record your results for sotabench, initialise
22 | an ImageNet evaluator object to name the model and (optionally) link it to a paper:
23 |
24 | ``` python
25 | from sotabencheval.image_classification import ImageNetEvaluator
26 |
27 | evaluator = ImageNetEvaluator(
28 | model_name='ResNeXt-101-32x8d',
29 | paper_arxiv_id='1611.05431')
30 | ```
31 |
32 | For each batch of predictions made by your model, pass a dictionary with image IDs as keys and
33 | output predictions as values to the `evaluator.add` method:
34 |
35 | ``` python
36 | evaluator.add(dict(zip(image_ids, batch_output)))
37 | ```
38 | Then after you have accumulated all the predictions:
39 |
40 | ``` python
41 | evaluator.save()
42 | ```
43 |
44 | This will ensure results are evaluated and saved when they are run on the [sotabench](http://www.sotabench.com) server.
45 |
46 | Below you can see a working `sotabench.py` file added to the [torchvision](https://github.com/pytorch/vision) repository
47 | to test one of its models, integrating the evaluation code from above:
48 |
49 | ``` python
50 | import numpy as np
51 | import PIL
52 | import torch
53 | from torch.utils.data import DataLoader
54 | from torchvision.models.resnet import resnext101_32x8d
55 | import torchvision.transforms as transforms
56 | from torchvision.datasets import ImageNet
57 |
58 | from sotabencheval.image_classification import ImageNetEvaluator
59 | from sotabencheval.utils import is_server
60 |
61 | if is_server():
62 | DATA_ROOT = './.data/vision/imagenet'
63 | else: # local settings
64 | DATA_ROOT = '/home/ubuntu/my_data/'
65 |
66 | model = resnext101_32x8d(pretrained=True)
67 |
68 | input_transform = transforms.Compose([
69 | transforms.Resize(256, PIL.Image.BICUBIC),
70 | transforms.CenterCrop(224),
71 | transforms.ToTensor(),
72 | transforms.Normalize(
73 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
74 | ])
75 |
76 | test_dataset = ImageNet(
77 | DATA_ROOT,
78 | split="val",
79 | transform=input_transform,
80 | target_transform=None,
81 | download=True,
82 | )
83 |
84 | test_loader = DataLoader(
85 | test_dataset,
86 | batch_size=128,
87 | shuffle=False,
88 | num_workers=4,
89 | pin_memory=True,
90 | )
91 |
92 | model = model.cuda()
93 | model.eval()
94 |
95 | evaluator = ImageNetEvaluator(
96 | model_name='ResNeXt-101-32x8d',
97 | paper_arxiv_id='1611.05431')
98 |
99 | def get_img_id(image_name):
100 | return image_name.split('/')[-1].replace('.JPEG', '')
101 |
102 | with torch.no_grad():
103 | for i, (input, target) in enumerate(test_loader):
104 | input = input.to(device='cuda', non_blocking=True)
105 | target = target.to(device='cuda', non_blocking=True)
106 | output = model(input)
107 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]]
108 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
109 |
110 | evaluator.save()
111 | ```
112 |
113 | **Step Two : Run locally to verify that it works**
114 |
115 | ```
116 | python sotabench.py
117 | ```
118 |
119 | You can also run the logic in a Jupyter Notebook if that is your preferred workflow.
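When running locally, you can also print the computed metrics before connecting the repository; a short sketch, assuming the `evaluator` from the example above:

``` python
results = evaluator.get_results()
print(results)
```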
120 |
121 | **Step Three : Login and connect your repository to [sotabench](http://www.sotabench.com)**
122 |
123 | Create an account on [sotabench](http://www.sotabench.com), then head to your user page. Click the
124 | **Connect a GitHub repository** button:
125 |
126 | ![Connect](img/connect.png)
127 |
128 | Then follow the steps to connect the repositories that you wish to benchmark:
129 |
130 | 
131 |
132 |
133 | After you connect your repository, the sotabench servers will re-evaluate your model on every commit,
134 | to ensure the model is working and results are up-to-date - including if you add additional models to the benchmark file.
135 |
136 | ## Installation
137 |
138 | The library requires Python 3.6+. You can install via pip:
139 |
140 | ```
141 | pip install sotabencheval
142 | ```
143 |
144 | ## Support
145 |
146 | If you get stuck you can head to our [Discourse](http://forum.sotabench.com) forum where you can ask
147 | questions about how to use the project. You can also find ideas for contributions,
148 | and work with others on exciting projects.
--------------------------------------------------------------------------------
/docs/docs/pascalvoc.md:
--------------------------------------------------------------------------------
1 | # PASCAL VOC 2012
2 |
3 | 
4 |
5 | You can view the PASCAL VOC 2012 leaderboard [here](https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the VOC 2012 dataset. For example,
16 | PyTorch users might use torchvision to load the dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Point to the server VOC 2012 data paths** - popular datasets are pre-downloaded on the server.
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## Server Data Location
28 |
29 | The VOC 2012 data is located in the root of your repository on the server at `.data/vision/voc2012`. This folder contains:
30 |
31 | - `VOCtrainval_11-May-2012.tar` - containing validation images and annotations
32 |
33 | Your local VOC 2012 files may have a different file directory structure, so you
34 | can use control flow like below to change the data path if the script is being
35 | run on sotabench servers:
36 |
37 | ``` python
38 | from sotabencheval.utils import is_server
39 |
40 | if is_server():
41 | DATA_ROOT = './.data/vision/voc2012'
42 | else: # local settings
43 | DATA_ROOT = '/home/ubuntu/my_data/'
44 | ```
45 |
46 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly.
47 |
48 | ## How Do I Initialize an Evaluator?
49 |
50 | Add this to your code - before you start batching over the dataset and making predictions:
51 |
52 | ``` python
53 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator
54 |
55 | evaluator = PASCALVOCEvaluator(model_name='My Super Model')
56 | ```
57 |
58 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
59 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012)
60 | then you will enable direct comparison with the paper. For example:
61 |
62 | ``` python
63 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator
64 |
65 | evaluator = PASCALVOCEvaluator(model_name='PSPNet', paper_arxiv_id='1612.01105')
66 | ```
67 |
68 | The above will directly compare with the result of the paper when run on the server.
69 |
70 | ## How Do I Evaluate Predictions?
71 |
72 | The evaluator object has an `.add()` method to submit predictions by batch or in full.
73 |
74 | For PASCAL VOC there are two required arguments: `outputs`, a 1D np.ndarray of predicted semantic classes per pixel,
75 | and `targets`, a 1D np.ndarray of ground truth semantic classes per pixel. In other words, the method expects flattened
76 | predictions and targets.
77 |
78 | To elaborate, suppose you are making predictions, batch by batch, and have your model output
79 | and the original targets with batch_size `32`, and image size `(520, 480)`. The shape of your outputs might look like:
80 |
81 | ``` python
82 | batch_output.shape
83 | >> (32, 21, 520, 480) # where 21 is the number of VOC classes
84 |
85 | batch_target.shape
86 | >> (32, 520, 480)
87 | ```
88 |
89 | We can flatten the entire output and targets to 1D vectors for each pixel:
90 |
91 | ``` python
92 | flattened_batch_output.shape
93 | >> (7987200,) # flatten by taking the max class prediction per pixel
94 | # (batch_output.argmax(1).flatten() in torch, with the class as the second dimension)
95 | 
96 | flattened_batch_target.shape
97 | >> (7987200,) # (batch_target.flatten() in torch)
98 | ```
99 |
100 | The output might look something like this:
101 |
102 | ``` python
103 | flattened_batch_output
104 | >> array([6, 6, 6, 6, 6, ...])
105 |
106 | flattened_batch_target
107 | >> array([6, 6, 6, 6, 6, ...])
108 | ```
109 |
110 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first five
111 | pixels - so the model is correct for those pixels.
112 |
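As a concrete sketch of that flattening step (assuming `batch_output` and `batch_target` are PyTorch tensors shaped as above):

``` python
# Flatten the raw model output and targets into the 1D arrays the evaluator expects
flattened_batch_output = batch_output.argmax(1).flatten().cpu().numpy()  # shape (32*520*480,)
flattened_batch_target = batch_target.flatten().cpu().numpy()            # shape (32*520*480,)
```
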
113 | These flattened arrays can then be passed into the `.add()` method of the evaluator:
114 | 
115 | ``` python
116 | evaluator.add(outputs=flattened_batch_output,
117 |               targets=flattened_batch_target)
118 | ```
119 |
120 | You can do this all at once in a single call to `add()`, but more naturally, you will
121 | probably loop over the dataset and call the method for the outputs of each batch.
122 | That would look something like this (for a PyTorch example):
123 |
124 | ``` python
125 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211')
126 |
127 | with torch.no_grad():
128 | for image, target in tqdm.tqdm(data_loader_test):
129 | image, target = image.to('cuda'), target.to('cuda')
130 | output = model(image)
131 | output = output['out']
132 |
133 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
134 | ```
135 |
136 | When you are done, you can get the results locally by running:
137 |
138 | ``` python
139 | evaluator.get_results()
140 | ```
141 |
142 | But for the server you want to save the results by running:
143 |
144 | ``` python
145 | evaluator.save()
146 | ```
147 |
148 | This method serialises the results and model metadata and stores them in the server database.
149 |
150 | ## How Do I Cache Evaluation?
151 |
152 | Sotabench reruns your script on every commit. This is good because it acts like
153 | continuous integration in checking for bugs and changes, but can be annoying
154 | if the model hasn't changed and evaluation is lengthy.
155 |
156 | Fortunately sotabencheval has caching logic that you can use.
157 |
158 | The idea is that after the first batch, we hash the model outputs and the
159 | current metrics; this tells us whether the model has changed, given the dataset.
160 | You can include the cache check within an evaluation loop as follows (the example
161 | below is for a PyTorch repository):
162 |
163 | ``` python
164 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211')
165 |
166 | with torch.no_grad():
167 | for image, target in tqdm.tqdm(data_loader_test):
168 | image, target = image.to('cuda'), target.to('cuda')
169 | output = model(image)
170 | output = output['out']
171 |
172 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
173 | if evaluator.cache_exists:
174 | break
175 |
176 | evaluator.save()
177 | ```
178 |
179 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
180 | we simply return the cached results rather than running the whole evaluation again.
181 |
182 | Caching is very useful if you have large models, or a repository that is evaluating
183 | multiple models, as it speeds up evaluation significantly.
184 |
185 | ## A full sotabench.py example
186 |
187 | Below we show an implementation for a model from the torchvision repository. This
188 | incorporates all the features explained above: (a) using the server data root,
189 | (b) using the PASCAL VOC Evaluator, and (c) caching the evaluation logic:
190 |
191 | ``` python
192 | import PIL
193 | import torch
194 | import torchvision
195 | from torchvision.models.segmentation import fcn_resnet101
196 | import torchvision.transforms as transforms
197 | import tqdm
198 |
199 | from sotabench_transforms import Normalize, Compose, Resize, ToTensor
200 |
201 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator
202 | from sotabencheval.utils import is_server
203 |
204 | if is_server():
205 | DATA_ROOT = './.data/vision/voc2012'
206 | else: # local settings
207 | DATA_ROOT = '/home/ubuntu/my_data/'
208 |
209 | MODEL_NAME = 'fcn_resnet101'
210 |
211 | def cat_list(images, fill_value=0):
212 | max_size = tuple(max(s) for s in zip(*[img.shape for img in images]))
213 | batch_shape = (len(images),) + max_size
214 | batched_imgs = images[0].new(*batch_shape).fill_(fill_value)
215 | for img, pad_img in zip(images, batched_imgs):
216 | pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img)
217 | return batched_imgs
218 |
219 | def collate_fn(batch):
220 | images, targets = list(zip(*batch))
221 | batched_imgs = cat_list(images, fill_value=0)
222 | batched_targets = cat_list(targets, fill_value=255)
223 | return batched_imgs, batched_targets
224 |
225 | device = torch.device('cuda')
226 |
227 | normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
228 | my_transforms = Compose([Resize((520, 480)), ToTensor(), normalize])
229 |
230 | dataset_test = torchvision.datasets.VOCSegmentation(root=DATA_ROOT, year='2012', image_set="val",
231 | transforms=my_transforms, download=True)
232 | test_sampler = torch.utils.data.SequentialSampler(dataset_test)
233 |
234 | data_loader_test = torch.utils.data.DataLoader(
235 | dataset_test, batch_size=32,
236 | sampler=test_sampler, num_workers=4,
237 | collate_fn=collate_fn)
238 |
239 | model = torchvision.models.segmentation.__dict__['fcn_resnet101'](num_classes=21, pretrained=True)
240 | model.to(device)
241 | model.eval()
242 |
243 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211')
244 |
245 | with torch.no_grad():
246 | for image, target in tqdm.tqdm(data_loader_test):
247 | image, target = image.to('cuda'), target.to('cuda')
248 | output = model(image)
249 | output = output['out']
250 |
251 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
252 | if evaluator.cache_exists:
253 | break
254 |
255 | evaluator.save()
256 | ```
257 |
258 | ## Need More Help?
259 |
260 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench
261 | forums if you have any questions or difficulties.
262 |
--------------------------------------------------------------------------------
/docs/docs/squad.md:
--------------------------------------------------------------------------------
1 | # SQuAD
2 |
3 | 
4 |
5 | You can view the [SQuAD 1.1](https://sotabench.com/benchmarks/question-answering-on-squad11-dev) and
6 | [SQuAD 2.0](https://sotabench.com/benchmarks/question-answering-on-squad20-dev) leaderboards.
7 |
8 | ## Getting Started
9 |
10 | You'll need the following in the root of your repository:
11 |
12 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
13 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
14 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
15 |
16 | You can write whatever you want in your `sotabench.py` file to get model predictions on the SQuAD dataset.
17 |
18 | But you will need to record your results for the server, and you'll want to avoid doing things like
19 | downloading the dataset on the server. So you should:
20 |
21 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
22 | - **Point to the server SQuAD data path** - popular datasets are pre-downloaded on the server.
23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
24 |
25 | We explain how to do these various steps below.
26 |
27 | ## How Do I Initialize an Evaluator?
28 |
29 | Add this to your code - before you start batching over the dataset and making predictions:
30 |
31 | ``` python
32 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
33 |
34 | # for SQuAD v1.1
35 | evaluator = SQuADEvaluator(model_name='My Super Model', version=SQuADVersion.V11)
36 | # for SQuAD v2.0
37 | evaluator = SQuADEvaluator(model_name='My Super Model', version=SQuADVersion.V20)
38 | ```
39 |
40 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
41 | put in the same model name string as on the
42 | [SQuAD 1.1](https://sotabench.com/benchmarks/question-answering-on-squad11-dev) or
43 | [SQuAD 2.0](https://sotabench.com/benchmarks/question-answering-on-squad20-dev) leaderboard
44 | then you will enable direct comparison with the paper's model. For example:
45 |
46 | ``` python
47 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
48 |
49 | evaluator = SQuADEvaluator(model_name='SpanBERT',
50 | paper_arxiv_id='1907.10529',
51 | version=SQuADVersion.V20)
52 | ```
53 |
54 | The above will directly compare with the result of the paper when run on the server.
55 |
56 | ## Server Data Location
57 |
58 | The SQuAD development data is located in the root of your repository on the server at `.data/nlp/squad`.
59 | This folder contains:
60 |
61 | - `dev-v1.1.json` - containing SQuAD v1.1 development dataset
62 | - `dev-v2.0.json` - containing SQuAD v2.0 development dataset
63 |
64 | You can use `evaluator.dataset_path: Path` to get a path to the dataset json file.
65 | In the example above it resolves to `.data/nlp/squad/dev-v2.0.json` on
66 | sotabench server and `./dev-v2.0.json` when run locally.
67 | If you want to use a non-standard file name or location when running locally
68 | you can override the defaults like this:
69 |
70 | ``` python
71 | evaluator = SQuADEvaluator(
72 | ...,
73 | local_root='mydatasets',
74 | dataset_filename='data.json'
75 | )
76 | ```
77 |
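If you prefer to read the file yourself, here is a minimal sketch (illustrative helper code, not part of `sotabencheval`) that loads the dev set from `evaluator.dataset_path` using the standard SQuAD JSON layout:

``` python
import json

# Collect (question id, question, context) triples from the dev set
with open(evaluator.dataset_path) as f:
    squad = json.load(f)

examples = []
for article in squad["data"]:
    for paragraph in article["paragraphs"]:
        for qa in paragraph["qas"]:
            examples.append((qa["id"], qa["question"], paragraph["context"]))
```
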
78 | ## How Do I Evaluate Predictions?
79 |
80 | The evaluator object has an `.add(answers: Dict[str, str])` method to submit predictions by batch or in full.
81 |
82 | For SQuAD the expected input is a dictionary, where keys are question ids and values are text answers.
83 | For unanswerable questions the answer should be an empty string. For example:
84 |
85 | ``` python
86 | {"57296d571d04691400779413": "itself", "5a89117e19b91f001a626f2d": ""}
87 | ```
88 |
89 | You can do this all at once in a single call to `add()`, but more naturally, you will
90 | probably loop over the dataset and call the method for the outputs of each batch.
91 | That would look something like this (for a PyTorch example):
92 |
93 | ``` python
94 | ...
95 |
96 | evaluator = SQuADEvaluator(model_name='My Super Model',
97 | paper_arxiv_id="1710.10723",
98 | version=SQuADVersion.V11)
99 |
100 | with torch.no_grad():
101 | for i, (input, target) in enumerate(data_loader):
102 | ...
103 | output = model(input)
104 | # potentially formatting of the output here to be a dict
105 | evaluator.add(output)
106 | ```
107 |
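The "formatting" step depends on your model, but it usually just builds the id-to-answer dictionary for the batch. As an illustration (`batch_question_ids` and `batch_answers` are hypothetical names, not part of the library):

``` python
# Pair question ids with predicted answer strings;
# unanswerable questions should map to an empty string "".
answers = {qid: text for qid, text in zip(batch_question_ids, batch_answers)}
evaluator.add(answers)
```
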
108 | When you are done, you can get the results locally by running:
109 |
110 | ``` python
111 | evaluator.get_results()
112 | ```
113 |
114 | But for the server you want to save the results by running:
115 |
116 | ``` python
117 | evaluator.save()
118 | ```
119 |
120 | This method serialises the results and model metadata and stores them in the server database.
121 |
122 | ## How Do I Cache Evaluation?
123 |
124 | Sotabench reruns your script on every commit. This is good because it acts like
125 | continuous integration in checking for bugs and changes, but can be annoying
126 | if the model hasn't changed and evaluation is lengthy.
127 |
128 | Fortunately sotabencheval has caching logic that you can use.
129 |
130 | The idea is that after the first batch, we hash the model outputs and the
131 | current metrics; this tells us whether the model has changed, given the dataset.
132 | You can include the cache check within an evaluation loop as follows (the example
133 | below is for a PyTorch repository):
134 |
135 | ``` python
136 | with torch.no_grad():
137 | for i, (input, target) in enumerate(data_loader):
138 | ...
139 | output = model(input)
140 | # potentially formatting of the output here to be a list of dicts
141 | evaluator.add(output)
142 |
143 | if evaluator.cache_exists:
144 | break
145 |
146 | evaluator.save()
147 | ```
148 |
149 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
150 | we simply return the cached results rather than running the whole evaluation again.
151 |
152 | Caching is very useful if you have large models, or a repository that is evaluating
153 | multiple models, as it speeds up evaluation significantly.
154 |
155 | ## A Full sotabench.py Example
156 |
157 | Below we show an implementation for a model from the AllenNLP repository. This
158 | incorporates all the features explained above: (a) using the SQuAD Evaluator,
159 | (b) using a custom dataset location when run locally, and (c) the evaluation caching logic.
160 |
161 | ``` python
162 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
163 |
164 | from allennlp.data import DatasetReader
165 | from allennlp.data.iterators import DataIterator
166 | from allennlp.models.archival import load_archive
167 | from allennlp.nn.util import move_to_device
168 |
169 | def load_model(url, batch_size=64):
170 | archive = load_archive(url, cuda_device=0)
171 | model = archive.model
172 | reader = DatasetReader.from_params(archive.config["dataset_reader"])
173 | iterator_params = archive.config["iterator"]
174 | iterator_params["batch_size"] = batch_size
175 | data_iterator = DataIterator.from_params(iterator_params)
176 | data_iterator.index_with(model.vocab)
177 | return model, reader, data_iterator
178 |
179 | def evaluate(model, dataset, data_iterator, evaluator):
180 | model.eval()
181 | evaluator.reset_time()
182 | for batch in data_iterator(dataset, num_epochs=1, shuffle=False):
183 | batch = move_to_device(batch, 0)
184 | predictions = model(**batch)
185 | answers = {metadata['id']: prediction
186 | for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])}
187 | evaluator.add(answers)
188 | if evaluator.cache_exists:
189 | break
190 |
191 | evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)",
192 | paper_arxiv_id="1611.01603", version=SQuADVersion.V11)
193 |
194 | model, reader, data_iter =\
195 | load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz")
196 | dataset = reader.read(evaluator.dataset_path)
197 | evaluate(model, dataset, data_iter, evaluator)
198 | evaluator.save()
199 | print(evaluator.results)
200 | ```
201 |
202 | ## Need More Help?
203 |
204 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench
205 | forums if you have any questions or difficulties.
206 |
--------------------------------------------------------------------------------
/docs/docs/wikitext103.md:
--------------------------------------------------------------------------------
1 | # WikiText-103
2 |
3 | 
4 |
5 | You can view the WikiText-103 leaderboard [here](https://sotabench.com/benchmarks/language-modelling-on-wikitext-103).
6 |
7 | ## Getting Started
8 |
9 | You'll need the following in the root of your repository:
10 |
11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
14 |
15 | You can write whatever you want in your `sotabench.py` file to get language model predictions on the WikiText-103 dataset.
16 |
17 | But you will need to record your results for the server, and you'll want to avoid doing things like
18 | downloading the dataset on the server. So you should:
19 |
20 | - **Point to the server WikiText-103 data path** - popular datasets are pre-downloaded on the server.
21 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
22 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
23 |
24 | We explain how to do these various steps below.
25 |
26 | ## Server Data Location
27 |
28 | The WikiText-103 data is located in the root of your repository on the server at `.data/nlp/wikitext-103/wikitext-103-v1.zip`.
29 | The archive contains a folder `wikitext-103` with the following files:
30 |
31 | - `wiki.train.tokens`
32 | - `wiki.valid.tokens`
33 | - `wiki.test.tokens`
34 |
35 | This is the original zip file released [here](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/).
36 | The benchmark is run on the `wiki.test.tokens` file.
37 | We provide two helper methods that unpack the dataset for you and return the `pathlib.Path` to the test file.
38 |
39 | The first option, `evaluator.test_set_path`, is available once you instantiate the `WikiText103Evaluator`:
40 |
41 | ```python
42 | ...
43 |
44 | evaluator = WikiText103Evaluator(
45 | model_name="Transformer-XL Large",
46 | paper_arxiv_id="1901.02860",
47 | paper_pwc_id="transformer-xl-attentive-language-models",
48 | local_root='/content/wikitext-103'
49 | )
50 | # test_set_path is a pathlib.Path that points to wiki.test.tokens
51 | with evaluator.test_set_path.open() as f:
52 | test_data = torch.tensor(tokenizer.encode(f.read())).to("cuda")
53 | ```
54 |
55 | The second option, `WikiText103Evaluator.get_test_set_path(local_root)`, is useful if you are evaluating multiple
56 | models and need to use the same dataset multiple times. It returns the path before
57 | you initialize a WikiText evaluator:
58 |
59 | ```python
60 | from sotabencheval.language_modelling import WikiText103Evaluator
61 |
62 | test_file_path = WikiText103Evaluator.get_test_set_path('/home/ubuntu/my_data/wiki103')
63 | with test_file_path.open() as f:
64 | content = f.read()
65 | ```
66 |
67 | ## How Do I Initialize an Evaluator?
68 |
69 | Add this to your code - before you start batching over the dataset and making predictions:
70 |
71 | ``` python
72 | from sotabencheval.language_modelling import WikiText103Evaluator
73 |
74 | evaluator = WikiText103Evaluator(model_name='Model name as found in paperswithcode website')
75 | ```
76 |
77 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
78 | put in the same model name string as on the
79 | [Wikitext-103](https://sotabench.com/benchmarks/language-modelling-on-wikitext-103) leaderboard
80 | then you will enable direct comparison with the paper's model. If the arXiv ID is not available you
81 | can use the `paperswithcode.com` id (slug). Below is an example of an evaluator that matches `Transformer-XL`:
82 |
83 | ``` python
84 | from sotabencheval.language_modelling import WikiText103Evaluator
85 |
86 | evaluator = WikiText103Evaluator(
87 | model_name="Transformer-XL Large",
88 | paper_arxiv_id="1901.02860",
89 | paper_pwc_id="transformer-xl-attentive-language-models",
90 | local_root="path_to_your_data",
91 | )
92 | ```
93 |
94 | The above will directly compare with the result of the paper when run on the server.
95 |
96 | ## How Do I Evaluate Predictions?
97 |
98 | The evaluator object has an `.add(log_probs, targets)` method to submit predictions by batch or in full.
99 | We expect you to give us the log probabilities of a batch of target tokens and the `targets` tokens themselves.
100 | The `log_probs` can be either:
101 |
102 | - a 0d "tensor" (`np.ndarray`/`torch.tensor`) - summed log probability of all `targets` tokens
103 | - a 2d "tensor" (`np.ndarray`/`torch.tensor`) - log probabilities of each target token, the `log_probs.shape` should match `targets.shape`
104 | - a 3d "tensor" (`np.ndarray`/`torch.tensor`) - distribution of log probabilities for each position in the sequence, we will gather the probabilities of target tokens for you.
105 |
106 | It is recommended to use the second or third option, as it allows us to check your perplexity calculations.
107 |
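For reference, perplexity is the exponentiated average negative log-likelihood per token. If you want a rough local sanity check of the number the evaluator reports, a sketch like the following works (here `all_target_log_probs` and `num_target_tokens` are values you accumulate yourself; see the note on subword tokenization below for how normalization changes):

``` python
import numpy as np

# Rough perplexity check from accumulated target log probabilities
nll = -np.sum(all_target_log_probs)
perplexity = np.exp(nll / num_target_tokens)
```
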
108 | If your model uses subword tokenization you don't need to convert subwords to full words. You are free to report the probability of each subword: we will adjust the perplexity normalization accordingly. Just make sure to set `subword_tokenization=True` in your evaluator.
109 |
110 | Here is an example of how to report results (for a PyTorch example):
111 |
112 | ``` python
113 |
114 | evaluator = WikiText103Evaluator(
115 | model_name='GPT-2 Small',
116 | paper_pwc_id="language-models-are-unsupervised-multitask",
117 | local_root="path_to_your_data",
118 | subword_tokenization = True
119 | )
120 |
121 | # run your data preprocessing; for GPT-2 this removes Moses tokenization artifacts
122 | with torch.no_grad():
123 |     model.eval()
124 |     for input, target in data_loader:
125 |         output = model(input)
126 |         log_probs = torch.log_softmax(output, dim=-1)
127 |         target_log_probs = log_probs.gather(-1, target.unsqueeze(-1))
128 |         evaluator.add(target_log_probs, target)
129 | ```
130 |
131 | When you are done, you can get the results locally by running:
132 |
133 | ``` python
134 | evaluator.get_results()
135 | ```
136 |
137 | But for the server you want to save the results by running:
138 |
139 | ``` python
140 | evaluator.save()
141 | ```
142 |
143 | This method serialises the results and model metadata and stores them in the server database.
144 |
145 | ## How Do I Cache Evaluation?
146 |
147 | Sotabench reruns your script on every commit. This is good because it acts like
148 | continuous integration in checking for bugs and changes, but can be annoying
149 | if the model hasn't changed and evaluation is lengthy.
150 |
151 | Fortunately sotabencheval has caching logic that you can use.
152 |
153 | The idea is that after the first batch, we hash the model outputs and the
154 | current metrics; this tells us whether the model has changed, given the dataset.
155 | You can include the cache check within an evaluation loop as follows (the example
156 | below is for a PyTorch repository):
157 |
158 | ``` python
159 | with torch.no_grad():
160 | for input, target in data_loader:
161 | # ...
162 | output = model(input)
163 |         log_probs = ...  # compute log probabilities of the target tokens
164 | evaluator.add(log_probs, target)
165 |
166 | if evaluator.cache_exists:
167 | break
168 |
169 | evaluator.save()
170 | ```
171 |
172 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
173 | we simply return the cached results rather than running the whole evaluation again.
174 |
175 | Caching is very useful if you have large models, or a repository that is evaluating
176 | multiple models, as it speeds up evaluation significantly.
177 |
178 |
179 | ## A full sotabench.py example
180 |
181 | Below we show an implementation for a model from the `huggingface/transformers` repository. This
182 | incorporates all the features explained above: (a) using the server data,
183 | (b) using the WikiText-103 Evaluator, and (c) caching the evaluation logic:
184 |
185 | ``` python
186 | import torch
187 | from tqdm import tqdm
188 | from sotabencheval.language_modelling import WikiText103Evaluator
189 |
190 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
191 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
192 |
193 | evaluator = WikiText103Evaluator(
194 | model_name="Transformer-XL Large",
195 | paper_arxiv_id="1901.02860",
196 | paper_pwc_id="transformer-xl-attentive-language-models",
197 | local_root='/content/wikitext-103'
198 | )
199 |
200 | with evaluator.test_set_path.open() as f:
201 | test_data = torch.tensor(tokenizer.encode(f.read()))
202 |
203 | seq_len = 128
204 | with torch.no_grad():
205 |     evaluator.reset_time()
206 | model.eval()
207 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
208 | for s in tqdm(range(0, X.shape[-1], seq_len)):
209 | x,y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
210 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
211 | evaluator.add(log_probs, y)
212 | if evaluator.cache_exists:
213 | break
214 | evaluator.save()
215 | evaluator.print_results()
216 | ```
217 |
218 | You can run this example on [Google Colab](https://colab.research.google.com/drive/1Qcp1_Fgo_aMtSgf_PV1gFw1DT6hEv7fW).
219 |
220 | ## Need More Help?
221 |
222 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench forums if you have any questions or difficulties.
223 |
--------------------------------------------------------------------------------
/docs/docs/wmt.md:
--------------------------------------------------------------------------------
1 | # WMT
2 |
3 | You can view the WMT Machine Translation leaderboards:
4 |
5 | - [WMT2014 English-German](https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german)
6 | - [WMT2014 English-French](https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-french)
7 | - [WMT2019 English-German](https://sotabench.com/benchmarks/machine-translation-on-wmt2019-english-german)
8 |
9 | ## Getting Started
10 |
11 | You'll need the following in the root of your repository:
12 |
13 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit
14 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py`
15 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation
16 |
17 | You can write whatever you want in your `sotabench.py` file to get model predictions on the WMT datasets.
18 |
19 | But you will need to record your results for the server, and you'll want to avoid doing things like
20 | downloading the dataset on the server. So you should:
21 |
22 | - **Include an Evaluation object** in `sotabench.py` file to record the results.
23 | - **Point to the server WMT data path** - popular datasets are pre-downloaded on the server.
24 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions.
25 |
26 | We explain how to do these various steps below.
27 |
28 | ## How Do I Initialize an Evaluator?
29 |
30 | Before you start batching over the dataset and making predictions you need
31 | to create an evaluator instance to record results for a given leaderboard.
32 | For example, to evaluate on the WMT2014 News English-French test set, add this
33 | to your code:
34 |
35 | ``` python
36 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language
37 |
38 | evaluator = WMTEvaluator(
39 | dataset=WMTDataset.News2014,
40 | source_lang=Language.English,
41 | target_lang=Language.French,
42 | local_root='mydatasets',
43 | model_name='My Super Model'
44 | )
45 | ```
46 |
47 | You can use `evaluator.source_dataset_path: Path` and `evaluator.target_dataset_path: Path`
48 | to get paths to the source and target SGML files.
49 | In the example above the first one resolves to `.data/nlp/wmt/newstest2014-fren-src.en.sgm` on
50 | sotabench server and `mydatasets/newstest2014-fren-src.en.sgm` when run locally.
51 | If you want to use non-standard file names locally you can override the defaults like this:
52 |
53 | ``` python
54 | evaluator = WMTEvaluator(
55 | ...,
56 |     local_root='mydatasets',
57 | source_dataset_filename='english.sgm',
58 | target_dataset_filename='french.sgm'
59 | )
60 | ```
61 |
62 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
63 | put in the same model name string as on the leaderboard
64 | then you will enable direct comparison with the paper's model. For example:
65 |
66 | ``` python
67 | evaluator = WMTEvaluator(
68 | dataset=WMTDataset.News2019,
69 | source_lang=Language.English,
70 | target_lang=Language.German,
71 | local_root="mydatasets",
72 | model_name="Facebook-FAIR (single)",
73 | paper_arxiv_id="1907.06616"
74 | )
75 | ```
76 |
77 | The above will directly compare with the result of the paper when run on the server.
78 |
79 | By default the evaluator computes a detokenized mixed-case SacreBLEU score.
80 | To get a tokenized BLEU score as well, during construction of the evaluator set
81 | a `tokenization: Callable[[str], str]` parameter to a function that tokenizes
82 | an input segment and returns the segment with tokens separated by spaces, e.g.:
83 |
84 | ``` python
85 | def get_tokenization():
86 | mt = sacremoses.MosesTokenizer()
87 | def tokenize(sentence):
88 | return mt.tokenize(sentence, return_str=True)
89 | return tokenize
90 |
91 | evaluator = WMTEvaluator(
92 | ...,
93 | tokenization=get_tokenization()
94 | )
95 | ```
96 |
97 | Instead of parsing the dataset files yourself, you can access the raw segments as strings:
98 |
99 | ``` python
100 | for segment_id, text in evaluator.source_segments.items():
101 | # translate text
102 |
103 | # or get segments within document context
104 | for document in evaluator.source_documents:
105 | context = [segment.text for segment in document.segments]
106 | for segment in document.segments:
107 | segment_id, text = segment.id, segment.text
108 | # translate text in context
109 | ```
110 |
111 | ## How Do I Evaluate Predictions?
112 |
113 | The evaluator object has an `.add(answers: Dict[str, str])` method to submit predictions by batch or in full.
114 |
115 | For WMT the expected input is a dictionary, where keys are source segment
116 | ids and values are translated segments
117 | (a segment id is created by concatenating the document id and the original segment id,
118 | separated by `#`). For example:
119 |
120 | ``` python
121 | evaluator.add({
122 | 'bbc.381790#1': 'Waliser AMs sorgen sich um "Aussehen wie Muppets"',
123 | 'bbc.381790#2': 'Unter einigen AMs herrscht Bestürzung über einen...',
124 | 'bbc.381790#3': 'Sie ist aufgrund von Plänen entstanden, den Namen...'
125 | })
126 | ```
127 |
128 | You can do this all at once in a single call to `add()`, but more naturally, you will
129 | probably loop over the dataset and call the method for the outputs of each batch.
130 | That would look something like this (for a PyTorch example):
131 |
132 | ``` python
133 | with torch.no_grad():
134 | for i, (input, target) in enumerate(data_loader):
135 | ...
136 | output = model(input)
137 | # potentially formatting of the output here to be a dict
138 | evaluator.add(output)
139 | ```
140 |
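As an illustration of that formatting step (a sketch only; `translate_batch` is a placeholder for your own model, not a library function), you can batch over `evaluator.source_segments` and submit translations keyed by segment id:

``` python
# Translate source segments in small batches and submit each batch keyed by segment id
batch_size = 32
batch_ids, batch_texts = [], []
for sid, text in evaluator.source_segments.items():
    batch_ids.append(sid)
    batch_texts.append(text)
    if len(batch_texts) == batch_size:
        evaluator.add(dict(zip(batch_ids, translate_batch(batch_texts))))
        batch_ids, batch_texts = [], []
if batch_texts:  # flush the final partial batch
    evaluator.add(dict(zip(batch_ids, translate_batch(batch_texts))))
```
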
141 | When you are done, you can get the results locally by running:
142 |
143 | ``` python
144 | evaluator.get_results()
145 | ```
146 |
147 | But for the server you want to save the results by running:
148 |
149 | ``` python
150 | evaluator.save()
151 | ```
152 |
153 | This method serialises the results and model metadata and stores them in the server database.
154 |
155 | ## How Do I Cache Evaluation?
156 |
157 | Sotabench reruns your script on every commit. This is good because it acts like
158 | continuous integration in checking for bugs and changes, but can be annoying
159 | if the model hasn't changed and evaluation is lengthy.
160 |
161 | Fortunately sotabencheval has caching logic that you can use.
162 |
163 | The idea is that after the first batch, we hash the model outputs and the
164 | current metrics; this tells us whether the model has changed, given the dataset.
165 | You can include the cache check within an evaluation loop as follows (the example
166 | below is for a PyTorch repository):
167 |
168 | ``` python
169 | with torch.no_grad():
170 | for i, (input, target) in enumerate(data_loader):
171 | ...
172 | output = model(input)
173 | # potentially formatting of the output here to be a list of dicts
174 | evaluator.add(output)
175 |
176 | if evaluator.cache_exists:
177 | break
178 |
179 | evaluator.save()
180 | ```
181 |
182 | If the hash is the same as the one stored on the server, we infer that the model hasn't changed, so
183 | we simply return the cached results rather than running the whole evaluation again.
184 |
185 | Caching is very useful if you have large models, or a repository that is evaluating
186 | multiple models, as it speeds up evaluation significantly.
187 |
188 | ## A Full sotabench.py Example
189 |
190 | Below we show an implementation for a model loaded via `torch.hub` from `pytorch/fairseq`. This
191 | incorporates all the features explained above: (a) using the WMT Evaluator,
192 | (b) accessing segments from evaluator, and (c) the evaluation caching logic.
193 | For clarity we omit batching and simply translate segment by segment.
194 |
195 | ``` python
196 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language
197 | from tqdm import tqdm
198 | import torch
199 |
200 | evaluator = WMTEvaluator(
201 | dataset=WMTDataset.News2019,
202 | source_lang=Language.English,
203 | target_lang=Language.German,
204 | local_root="data/nlp/wmt",
205 | model_name="Facebook-FAIR (single)",
206 | paper_arxiv_id="1907.06616"
207 | )
208 |
209 | model = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model',
210 | force_reload=True, tokenizer='moses', bpe='fastbpe').cuda()
211 |
212 | for sid, text in tqdm(evaluator.source_segments.items()):
213 | translated = model.translate(text)
214 | evaluator.add({sid: translated})
215 | if evaluator.cache_exists:
216 | break
217 |
218 | evaluator.save()
219 | print(evaluator.results)
220 |
221 | ```
222 |
223 | ## Need More Help?
224 |
225 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench
226 | forums if you have any questions or difficulties.
227 |
--------------------------------------------------------------------------------
/docs/mkdocs.yml:
--------------------------------------------------------------------------------
1 | site_name: sotabencheval Docs
2 | theme:
3 | name: 'material'
4 | palette:
5 | primary: 'cyan'
6 | accent: 'cyan'
7 | logo:
8 | icon: 'explore'
9 | markdown_extensions:
10 | - admonition
11 | - codehilite
--------------------------------------------------------------------------------
/docs/site/img/squad20.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/site/img/squad20.png
--------------------------------------------------------------------------------
/requirements-dev.txt:
--------------------------------------------------------------------------------
1 | -r requirements.txt
2 | black==19.3b0
3 | flake8==3.7.8
4 | mkdocs-material
5 | pre-commit==1.18.3
6 | pydocstyle==4.0.1
7 | pygments
8 | pytest==5.1.1
9 | pytest-cov==2.7.1
10 | recommonmark==0.6.0
11 | sphinx==2.2.0
12 | sphinx-rtd-theme==0.4.3
13 | twine==1.13.0
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Cython
2 | numpy
3 | pycocotools>=2.0.0
4 | sotabenchapi>=0.0.13
5 | tqdm>=4.32.2
6 | beautifulsoup4>=4.7.0
7 | sacrebleu==1.4.1
8 |
--------------------------------------------------------------------------------
/setup.cfg:
--------------------------------------------------------------------------------
1 | [flake8]
2 | ignore = E203,E402,W503,E701
3 |
4 | [pydocstyle]
5 | ignore = D10,D202,D203,D212,D213,D401,D403,D406,D407,D413
6 |
7 | [tool:pytest]
8 | testpaths = sotabench-eval/test
9 | python_files = test_*.py
10 | norecursedirs = .git
11 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import io
2 | from setuptools import setup, find_packages
3 | from sotabencheval.version import __version__
4 |
5 | name = "sotabencheval"
6 | author = "Atlas ML"
7 | author_email = "hello@sotabench.com"
8 | license = "Apache-2.0"
9 | url = "https://sotabench.com"
10 | description = (
11 | "Easily benchmark Machine Learning models on selected tasks and datasets"
12 | )
13 |
14 |
15 | def get_requirements():
16 | with io.open("requirements.txt") as f:
17 | return [
18 | line.strip()
19 | for line in f.readlines()
20 | if not line.strip().startswith("#")
21 | ]
22 |
23 |
24 | setup(
25 | name=name,
26 | version=__version__,
27 | author=author,
28 | author_email=author_email,
29 | maintainer=author,
30 | maintainer_email=author_email,
31 | description=description,
32 | long_description=io.open("README.md", "r", encoding="utf-8").read(),
33 | long_description_content_type="text/markdown",
34 | url=url,
35 | platforms=["Windows", "POSIX", "MacOSX"],
36 | license=license,
37 | packages=find_packages(),
38 | include_package_data=True,
39 | install_requires=get_requirements(),
40 | classifiers=[
41 | "Programming Language :: Python :: 3",
42 | "Programming Language :: Python :: 3.6",
43 | "Programming Language :: Python :: 3.7",
44 | "License :: OSI Approved :: Apache Software License",
45 | "Operating System :: OS Independent",
46 | ],
47 | )
48 |
49 |
--------------------------------------------------------------------------------
/sotabencheval/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/sotabencheval/__init__.py
--------------------------------------------------------------------------------
/sotabencheval/core/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.core.evaluator import BaseEvaluator
2 |
3 | __all__ = ["BaseEvaluator"]
--------------------------------------------------------------------------------
/sotabencheval/core/cache.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | CACHE_FLOAT_PRECISION = 3
4 |
5 |
6 | def cache_value(value):
7 | """
8 | Takes in a value and puts it in a format ready for hashing + caching
9 |
10 | Why? In sotabench we hash the output after the first batch as an indication of whether the model has changed or not.
11 | If the model hasn't changed, then we don't run the whole evaluation on the server - but return the same results
12 | as before. This speeds up evaluation - making "continuous evaluation" more feasible...it also means lower
13 | GPU costs for us :).
14 |
15 | We apply some rounding and reformatting so small low precision changes do not change the hash.
16 |
17 | :param value: example model output
18 | :return: formatted value (rounded and ready for hashing)
19 | """
20 | if isinstance(value, (str, int, bool)) or value is None:
21 | return value
22 | elif isinstance(value, float):
23 | return np.round(value, CACHE_FLOAT_PRECISION)
24 | elif isinstance(value, dict):
25 | return {key: cache_value(val) for key, val in sorted(value.items(), key=lambda x: x[0])}
26 | elif isinstance(value, list):
27 | return [cache_value(val) for val in value]
28 | elif isinstance(value, np.ndarray):
29 | return value.round(CACHE_FLOAT_PRECISION)
30 |
--------------------------------------------------------------------------------
/sotabencheval/core/evaluator.py:
--------------------------------------------------------------------------------
1 | import time
2 |
3 | from sotabenchapi.client import Client
4 | from sotabenchapi.core import BenchmarkResult
5 | from sotabencheval.utils import is_server
6 | from sotabencheval.core.cache import cache_value
7 |
8 |
9 | class BaseEvaluator:
10 | """Base class for evaluator objects on tasks
11 |
12 | Currently SQuAD and WMT use this as a parent.
13 |
14 | TODO: Refactor ImageNet, COCO, ADE20K, PASCAL to utilise this class
15 |
16 | The core API design relies upon:
17 |
18 | (a) Initializing an Evaluator object and linking to a paper, for example:
19 |
20 | .. code-block:: python
21 |
22 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
23 |
24 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
25 | version=SQuADVersion.V20)
26 |
27 | The paper metadata allows the results to be linked to paper results when submitted to sotabench.com.
28 |
29 | (b) Adding Predictions (usually in batch) - example below for PyTorch iterating over DataLoader:
30 |
31 | .. code-block:: python
32 |
33 | for i, (input, target) in enumerate(data_loader):
34 | ...
35 | output = model(input)
36 | # potentially formatting of the output here
37 | evaluator.add(output)
38 |
39 | These results are accumulated and then evaluated - i.e. metrics are calculated once done.
40 |
41 | (c) Saving Results
42 |
43 | .. code-block:: python
44 | evaluator.save()
45 |
46 |     Gets the evaluation results for the current predictions added to the Evaluation object - calculates metrics -
47 |     and then, if run on the server, serializes results to a sotabench_results.json file, which is processed so the
48 |     results are stored on the server.
49 |
50 | These three steps: initialization -> adding predictions -> saving and evaluating results are the core API.
51 | They should be capable of integration with any existing evaluation logic in your repository.
52 | """
53 |
54 | def __init__(self,
55 | model_name: str = None,
56 | paper_arxiv_id: str = None,
57 | paper_pwc_id: str = None,
58 | paper_results: dict = None,
59 | model_description=None,):
60 | """
61 | Initializes a BaseEvaluator like object
62 |
63 | :param model_name: (str) The name of the model, for example 'ResNet-101', which will be saved to sotabench.com
64 | :param paper_arxiv_id: (str, optional) The paper that the model is linked to, e.g. '1906.06423'
65 | :param paper_pwc_id: (str, optional) The PWC paper id (slug), e.g. 'albert-a-lite-bert-for-self-supervised'
66 | :param paper_results: (dict, optional) If the paper you are linking to does not have results on sotabench,
67 | then you can add paper results here. This will be a dictionary with keys as metric names, and values as metric
68 | values. This will be benchmark specific.
69 | :param model_description: (str, optional) Optional description for the model; this can contain details about
70 | where the weights are from, details about training, and more. This will appear in an info box for the model
71 | when it is displayed on sotabench.com.
72 | """
73 |
74 | # Model metadata
75 |
76 | self.model_name = model_name
77 | self.paper_arxiv_id = paper_arxiv_id
78 | self.paper_pwc_id = paper_pwc_id
79 | self.paper_results = paper_results
80 | self.model_description = model_description
81 |
82 | # Backend variables for hashing and caching
83 |
84 | self.first_batch_processed = False
85 | self.batch_hash = None
86 | self.cached_results = False
87 | self.results = None
88 | self._cache_exists = None
89 |
90 | # Speed and memory metrics
91 |
92 | self.init_time = time.time()
93 | self.speed_mem_metrics = {}
94 |
95 | @property
96 | def cache_exists(self):
97 | """
98 | Checks whether the cache exists in the sotabench.com database - if so
99 | then sets self.results to cached results and returns True.
100 |
101 | You can use this property for control flow to break a for loop over a dataset
102 | after the first iteration. This prevents re-running the same calculation for the
103 | same model twice.
104 |
105 | Q: Why should the user use this?
106 |         A: If you want fast "continuous evaluation" and want to avoid rerunning the same model over and over
107 |         each time you commit something new to your repository.
108 |
109 | Examples:
110 | Breaking a for loop if the model is the same as last time we ran
111 |
112 | .. code-block:: python
113 |
114 | ...
115 |
116 | with torch.no_grad():
117 | for i, (input, target) in enumerate(iterator):
118 | ...
119 | output = model(input)
120 | # optional formatting of output here to be a list of detection dicts
121 | evaluator.add(output)
122 |
123 | if evaluator.cache_exists:
124 | break
125 |
126 | evaluator.save()
127 |
128 | This logic is for the server; it will not break the loop if you evaluate locally.
129 |
130 | :return: bool or None (if not on server)
131 | """
132 |
133 | if not is_server(): # we only check the cache on the server
134 | return None
135 |
136 | if not self.first_batch_processed:
137 | return False
138 |
139 | if self._cache_exists is not None:
140 | return self._cache_exists
141 |
142 | client = Client.public()
143 | cached_res = client.get_results_by_run_hash(self.batch_hash)
144 | if cached_res:
145 | self.results = cached_res
146 | self.cached_results = True
147 | print(
148 | "No model change detected (using the first batch run "
149 | f"hash {self.batch_hash}). Will use cached results."
150 | )
151 |
152 | self._cache_exists = True
153 | else:
154 | self._cache_exists = False
155 | return self._cache_exists
156 |
157 | def reset(self):
158 |         """Resets the internal state of the evaluator so you can start over"""
159 | pass
160 |
161 | def cache_values(self, **kwargs):
162 | """
163 |         Takes in keyword arguments and converts each to a hashable (cacheable) format
164 | 
165 |         :param kwargs: keyword arguments
166 |         :return: cacheable version of the keyword arguments
167 | """
168 | return cache_value(kwargs)
169 |
170 | def eval(self, results_generator):
171 |         """Run the full evaluation loop on results_generator"""
172 | self.reset()
173 | self.reset_time()
174 | for results in results_generator:
175 | self.add(*results)
176 | if self.first_batch_processed and self.cache_exists:
177 | break
178 | self.save()
179 | return self
180 |
181 | def get_results(self):
182 | """Calculate results."""
183 | return self.results
184 |
185 | def print_results(self):
186 | """Print results."""
187 | self.get_results()
188 | print(f"results = {self.results}, speed_mem_metrics = {self.speed_mem_metrics}")
189 |
190 | def reset_time(self):
191 | """
192 | Simple method to reset the timer self.init_time. Often used before a loop, to time the evaluation
193 | appropriately, for example:
194 |
195 | .. code-block:: python
196 |
197 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
198 |
199 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
200 | version=SQuADVersion.V20)
201 |
202 | # processing/setup logic here
203 |
204 | evaluator.reset_time()
205 |
206 | for i, (input, target) in enumerate(data_loader):
207 | ...
208 | output = model(input)
209 | # potentially formatting of the output here
210 | evaluator.add(output)
211 |
212 | evaluator.save()
213 |
214 |         Above we may have processing logic in between the evaluator initialization and the actual evaluation loop, so
215 |         we reset the timer so that the timing reflects only the evaluation (and not setup steps like data processing,
216 |         loading the model, etc.).
217 |
218 | :return: void - resets self.init_time
219 | """
220 | self.init_time = time.time()
221 |
222 | def save(self, **kwargs):
223 | """
224 | Calculate results and then put into a BenchmarkResult object
225 |
226 | On the sotabench.com server, this will produce a JSON file serialisation in sotabench_results.json and results
227 | will be recorded on the platform.
228 |
229 | Users should save once all predictions are added, for instance:
230 |
231 | .. code-block:: python
232 |
233 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
234 |
235 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529',
236 | version=SQuADVersion.V20)
237 |
238 | # processing/setup logic here
239 |
240 | evaluator.reset_time()
241 |
242 | for i, (input, target) in enumerate(data_loader):
243 | ...
244 | output = model(input)
245 | # potentially formatting of the output here
246 | evaluator.add(output)
247 |
248 | evaluator.save()
249 |
250 | Here once we have added all the predictions to the evaluator, we .save() so we evaluate and, if on the server,
251 | results are serialized and saved to the server.
252 |
253 | :return: BenchmarkResult object with results and metadata
254 | """
255 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
256 | self.get_results()
257 |
258 | return BenchmarkResult(
259 | task=self.task,
260 | config={},
261 | results=self.results,
262 | speed_mem_metrics=self.speed_mem_metrics,
263 | model=self.model_name,
264 | model_description=self.model_description,
265 | arxiv_id=self.paper_arxiv_id,
266 | pwc_id=self.paper_pwc_id,
267 | paper_results=self.paper_results,
268 | run_hash=self.batch_hash,
269 | **kwargs,
270 | )
271 |
--------------------------------------------------------------------------------
/sotabencheval/image_classification/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["ImageNetEvaluator"]
2 |
3 | from sotabencheval.image_classification.imagenet import ImageNetEvaluator
--------------------------------------------------------------------------------
/sotabencheval/image_classification/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def top_k_accuracy_score(y_true, y_pred, k=5, normalize=True):
4 | """
5 | Top k Accuracy classification score.
6 | :param y_true: the true labels (np.ndarray)
7 | :param y_pred: the predicted labels (np.ndarray)
8 | :param k: calculates top k accuracy (int)
9 | :param normalize: whether to normalize by the number of observations
10 | :return: the top k accuracy
11 | """
12 |
13 | if len(y_true.shape) == 2:
14 | y_true = y_true[0] # should be one-dimensional
15 |
16 | num_obs, num_labels = y_pred.shape
17 |
18 | idx = num_labels - k - 1
19 | counter = 0
20 | argsorted = np.argsort(y_pred, axis=1)
21 |
22 | for i in range(num_obs):
23 | if y_true[i] in argsorted[i, idx+1:]:
24 | counter += 1
25 | if normalize:
26 | return counter / num_obs
27 | else:
28 | return counter
--------------------------------------------------------------------------------
/sotabencheval/language_modelling/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.language_modelling.wikitext import WikiText103Evaluator, WikiText2Evaluator, WikiTextEvaluator, WikiTextDataset
2 |
3 | __all__ = ["WikiText103Evaluator", "WikiText2Evaluator",
4 | "WikiTextEvaluator", "WikiTextDataset"]
5 |
--------------------------------------------------------------------------------
/sotabencheval/language_modelling/wikitext.py:
--------------------------------------------------------------------------------
1 | import time
2 | from enum import Enum
3 | from pathlib import Path
4 |
5 | import numpy as np
6 |
7 | from sotabencheval.core import BaseEvaluator
8 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server, get_max_memory_allocated
9 |
10 |
11 | class WikiTextDataset(Enum):
12 | """Enum used to select the dataset on which evaluation is executed. """
13 | WikiText103 = ('WikiText-103', 245569, 267735)
14 | WikiText2 = ('WikiText-2', 245569, 33278)
15 |
16 | def __init__(self, pwc_name, testset_size, vocab_size):
17 | """
18 | Creates an enum instance
19 | :param pwc_name: the name of the dataset as it is found on paperswithcode leaderboard
20 | :param testset_size: the size of the test set in words
21 |         :param vocab_size: the size of the dataset vocabulary
22 | """
23 | self.pwc_name = pwc_name
24 | self.testset_size = testset_size
25 | self.vocab_size = vocab_size
26 |
27 | def _get_path(self, local_root, local_unzip=False):
28 | root = Path(change_root_if_server(root=local_root,
29 | server_root=".data/nlp/" + self.pwc_name.lower()))
30 | zip_name = self.pwc_name.lower() + "-v1.zip"
31 | dataset_path = root / "wiki.test.tokens"
32 | if not dataset_path.exists(): # unzip
33 | extract_archive(str(root / zip_name), to_path=root.parent)
34 | return dataset_path
35 |
36 | get_path = _get_path # deprecated API, for backward compatibility with existing benchmarks
37 |
38 | def get_test_set_path(self, local_root):
39 | """
40 | Unzips the datasets and returns path to "wiki.test.tokens"
41 | :param local_root: Path to the directory where the dataset files are located locally.
42 | Ignored when run on sotabench server.
43 | """
44 | return self.get_path(local_root).parent / "wiki.test.tokens"
45 |
46 | def get_validation_set_path(self, local_root):
47 | """
48 |         Unzips the datasets and returns path to "wiki.valid.tokens"
49 | :param local_root: Path to the directory where the dataset files are located locally.
50 | Ignored when run on sotabench server.
51 | """
52 | return self.get_path(local_root).parent / "wiki.valid.tokens"
53 |
54 | def _to_numpy(*args):
55 | def convert(a):
56 | if hasattr(a, 'cpu') and hasattr(a, 'numpy'):
57 | return a.cpu().numpy()
58 | if isinstance(a, list):
59 | return np.array(a)
60 | return a
61 | return [convert(a) for a in args]
62 |
63 | def _gather_probs(log_probs, targets):
64 | """
65 | Gather probabilities of each target token, from the model activations after log_softmax
66 | :param log_probs: - `torch.tensor`/`np.ndarray` shape [bs x seq_len x vocab_sz]
67 | with model activations after `log_softmax`, with log probability of each word in the vocab
68 | :param targets: - `torch.tensor`/`np.ndarray` shape [bs x seq_len] with ground truth words
69 | """
70 | if hasattr(log_probs, 'gather'):
71 | # if we work with torch this method is faster than numpy implementation
72 | probs = log_probs.gather(-1, targets.unsqueeze(-1))
73 | elif isinstance(log_probs, np.ndarray):
74 | # use slower numpy implementation if we have ndarrays
75 | vocab_sz = int(log_probs.shape[-1])
76 | log_probs, targets = _to_numpy(log_probs, targets)
77 | log_probs = log_probs.reshape(-1, vocab_sz)
78 | targets = targets.reshape(-1)
79 | probs = log_probs[np.arange(log_probs.shape[0]), targets]
80 | return _to_numpy(probs, targets)
81 |
82 |
83 | class WikiTextEvaluator(BaseEvaluator):
84 | task = "Language Modelling"
85 | dataset = None # defined in a subclass
86 |
87 | def __init__(self,
88 | local_root: str = '.',
89 | model_name: str = None,
90 | paper_arxiv_id: str = None,
91 | paper_pwc_id: str = None,
92 | paper_results: dict = None,
93 | model_description=None,
94 | subword_tokenization: bool = False,
95 | text_transformation: bool = False,
96 | dataset=None):
97 | """
98 | Creates an evaluator for one of the WikiText benchmarks.
99 |
100 | :param local_root: Path to the directory where the dataset files are located locally.
101 | Ignored when run on sotabench server.
102 | :param model_name: The name of the model from the
103 | paper - if you want to link your build to a model from a
104 |             machine learning paper. See the WikiText-103 benchmark page for model names
105 |             (e.g., https://sotabench.com/benchmarks/language-modelling-on-wikitext-103),
106 |             either on the paper leaderboard or on the "models yet to try" tab.
107 | :param paper_arxiv_id: Optional linking to arXiv if you
108 | want to link to papers on the leaderboard; put in the
109 | corresponding paper's arXiv ID, e.g. '1901.02860'.
110 | :param paper_pwc_id: Optional linking to Papers With Code;
111 | put in the corresponding papers with code URL slug, e.g.
112 | "transformer-xl-attentive-language-models"
113 | :param paper_results: If the paper model you are reproducing
114 | does not have model results on sotabench.com, you can specify
115 | the paper results yourself through this argument, where keys
116 | are metric names, values are metric values. e.g:
117 |
118 | {'Test perplexity': 18.2 }.
119 |
120 | Ensure that the metric names match those on the sotabench
121 | leaderboard - for WikiText benchmarks it should be `Test perplexity`.
122 | :param model_description: Optional model description.
123 |         :param subword_tokenization: Should be set to `True` if your model uses subword tokens; defaults to `False`.
124 |         :param text_transformation: Should be set to `True` if you use a detokenizer that removes Moses artefacts, e.g. in a zero-shot setting.
125 |         :param dataset: internal parameter; do not set in subclasses.
126 | """
127 | super().__init__(model_name, paper_arxiv_id,
128 | paper_pwc_id, paper_results, model_description)
129 | if dataset is not None:
130 | self.dataset = dataset
131 | self.subword_tokenization = subword_tokenization
132 | self.text_transformation = text_transformation
133 | self.local_root = local_root
134 | self._neglogloss = 0
135 | self._data_set_size = 0
136 |
137 | @property
138 | def dataset_path(self): # deprecated
139 | return self.dataset.get_path(self.local_root)
140 |
141 | @property
142 | def test_set_path(self):
143 | """Returns path to test set, uses `self.local_root` when it is not on the server"""
144 | return self.get_test_set_path(self.local_root)
145 |
146 | @classmethod
147 | def get_test_set_path(cls, local_root):
148 | """
149 | Unzips the dataset and returns the path to "wiki.test.tokens"
150 | :param local_root: Path to the directory where the dataset files are located locally.
151 | Ignored when run on sotabench server.
152 | """
153 | return cls.dataset.get_test_set_path(local_root)
154 |
155 | def reset(self):
156 | """
157 | Removes already added results
158 |
159 |
160 | When checking if the model should be rerun on the whole dataset it is first run on a smaller subset
161 | and the results are compared with values cached on the sotabench server (the check is not performed
162 | when running locally). Ideally, the smaller subset is just the first batch, so no additional
163 | computation is needed. However, for more complex multistage pipelines it may be simpler to
164 | run a model twice - on a small dataset and (if necessary) on the full dataset. In that case
165 | :func:`reset` needs to be called before the second run so values from the first run are not reported.
166 |
167 | .. seealso:: :func:`cache_exists`
168 | .. seealso:: :func:`reset_time`
169 | """
170 | self._neglogloss = 0
171 | self._data_set_size = 0
172 |
173 | def add(self, log_probs, targets):
174 | """
175 | Updates the evaluator with new results
176 |
177 | :param log_probs: `np.ndarray` or `torch.tensor` with log probabilities of target tokens; can be either:
178 | - a 0d tensor
179 | summed log probability of all `targets` tokens, or
180 | - a 2d tensor [bs x seq_len]
181 | log probabilities of each target token; the shapes of `log_probs` and `targets` must match.
182 | - a 3d tensor [bs x seq_len x vocab_size]
183 | distribution of log probabilities for each position in the sequence,
184 | we will gather the probabilities of target tokens for you.
185 | :param targets: a `np.ndarray` or `torch.tensor` with ids of ground truth tokens.
186 |
187 | Examples:
188 | Update the evaluator with a result for a sentence with 10 tokens:
189 |
190 | .. code-block:: python
191 | targets = np.array([[ 32, 582, 2731, 19, 1, 786, 5, 98693, 55362, 5 ]])
192 | log_probs = np.array([[ -9.8461, -9.3343, -17.8042, -11.2006, -22.3345, -14.4665, -2.0055,
193 | -14.2044, -14.7545, -5.7888]])
194 | my_evaluator.add(log_probs, targets)
195 | """
196 | if isinstance(log_probs, float):
197 | log_probs = np.array([log_probs]) # for sum to work
198 | elif log_probs.shape[:-1] == targets.shape:
199 | log_probs, targets = _gather_probs(log_probs, targets)
200 | else:
201 | assert log_probs.shape == targets.shape, f"log_probs have to be either gathered log probabilities of targets or the full probability distribution, received {log_probs.shape} {repr(log_probs)}"
202 | self._neglogloss += - float(log_probs.sum())
203 | self._data_set_size += int(np.prod(list(targets.shape)))
204 |
205 | if not self.first_batch_processed:
206 | content = self.cache_values(
207 | probs=_to_numpy(log_probs)[0].reshape(-1),
208 | api_version=3)
209 | self.batch_hash = calculate_batch_hash(content)
210 | self.first_batch_processed = True
211 | return self.results
212 |
213 | def print_results(self):
214 | """ Calculates and print results. """
215 | super().print_results()
216 | print("Perplexity:", np.exp(self._neglogloss / self.dataset.testset_size),
217 | "NeglogLoss:", self._neglogloss, "Tokens Count:", self._data_set_size)
218 |
219 | print_stats = print_results
220 |
221 | def get_results(self):
222 | """
223 | Calculates the perplexity and measures the performance of the model
224 |
225 | :return: dict with `Test perplexity`
226 | """
227 | if self.cached_results:
228 | return self.results
229 | perplexity = np.exp(self._neglogloss /
230 | self.dataset.testset_size)
231 |
232 | self.results = {
233 | 'Test perplexity': perplexity
234 | }
235 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
236 | exec_speed = (time.time() - self.init_time)
237 | count = self.dataset.testset_size
238 | self.speed_mem_metrics['Tasks / Evaluation Time'] = count / exec_speed
239 | self.speed_mem_metrics['Tasks'] = count
240 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
241 | return self.results
242 |
243 | def save(self):
244 | """Save results to the server databese/"""
245 | return super().save(dataset=self.dataset.pwc_name)
246 |
247 |
248 | class WikiText103Evaluator(WikiTextEvaluator):
249 | """`WikiText103 `_ benchmark.
250 |
251 | Examples:
252 | Evaluate a language model from the transformers repository:
253 |
254 | .. code-block:: python
255 |
256 | import torch
257 | from tqdm import tqdm
258 | from sotabencheval.language_modelling import WikiText103Evaluator
259 |
260 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
261 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
262 |
263 | evaluator = WikiText103Evaluator(
264 | model_name="Transformer-XL Large",
265 | paper_arxiv_id="1901.02860",
266 | paper_pwc_id="transformer-xl-attentive-language-models",
267 | local_root='/content/wikitext-103'
268 | )
269 |
270 | with evaluator.test_set_path.open() as f:
271 | test_data = torch.tensor(tokenizer.encode(f.read()))
272 |
273 | seq_len = 128
274 | with torch.no_grad():
275 | evaluator.reset_timer()
276 | model.eval()
277 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
278 | for s in tqdm(range(0, X.shape[-1], seq_len)):
279 | x,y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
280 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
281 | evaluator.add(log_probs, y)
282 | if evaluator.cache_exists:
283 | break
284 | evaluator.save()
285 | evaluator.print_results()
286 | """
287 | dataset = WikiTextDataset.WikiText103
288 |
289 |
290 | class WikiText2Evaluator(WikiTextEvaluator):
291 | """`WikiText103 `_ benchmark.
292 |
293 | Examples:
294 | Evaluate a language model from the transformers repository:
295 |
296 | .. code-block:: python
297 |
298 | import torch
299 | from tqdm import tqdm
300 | from sotabencheval.language_modelling import WikiText2Evaluator
301 |
302 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
303 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
304 |
305 | evaluator = WikiText2Evaluator(
306 | model_name="Transformer-XL Large",
307 | paper_arxiv_id="1901.02860",
308 | paper_pwc_id="transformer-xl-attentive-language-models",
309 | local_root='/content/wikitext-2'
310 | )
311 |
312 | with evaluator.test_set_path.open() as f:
313 | test_data = torch.tensor(tokenizer.encode(f.read()))
314 |
315 | seq_len = 128
316 | with torch.no_grad():
317 | evaluator.reset_timer()
318 | model.eval()
319 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
320 | for s in tqdm(range(0, X.shape[-1], seq_len)):
321 | x,y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
322 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
323 | evaluator.add(log_probs, y)
324 | if evaluator.cache_exists:
325 | break
326 | evaluator.save()
327 | evaluator.print_results()
328 | """
329 | dataset = WikiTextDataset.WikiText2
330 |
--------------------------------------------------------------------------------
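As a quick illustration of the three input shapes accepted by `WikiTextEvaluator.add` and of how the reported perplexity follows from the accumulated negative log-likelihood, here is a minimal, self-contained sketch with made-up numbers (no real model involved; the real evaluator divides by `dataset.testset_size` rather than by the toy token count used below):

    import numpy as np

    # Toy setup: one sequence of 4 target tokens over a 5-word vocabulary.
    targets = np.array([[2, 0, 3, 1]])                    # [bs x seq_len] ground-truth token ids
    vocab_log_probs = np.log(np.full((1, 4, 5), 0.2))     # [bs x seq_len x vocab_sz] log_softmax output

    # Option 1: pass the full distribution and let the evaluator gather target probabilities:
    #     evaluator.add(vocab_log_probs, targets)
    # Option 2: gather the per-token log probabilities yourself ([bs x seq_len]):
    gathered = np.take_along_axis(vocab_log_probs, targets[..., None], axis=-1).squeeze(-1)
    #     evaluator.add(gathered, targets)
    # Option 3: pass a single float with the summed log probability of all targets:
    #     evaluator.add(float(gathered.sum()), targets)

    # The evaluator accumulates -sum(log p) and reports exp(neglogloss / testset_size).
    neg_log_likelihood = -float(gathered.sum())
    print(np.exp(neg_log_likelihood / targets.size))      # ~5.0 for this uniform toy distribution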
/sotabencheval/machine_translation/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.machine_translation.wmt import WMTEvaluator, WMTDataset
2 | from sotabencheval.machine_translation.metrics import TranslationMetrics
3 | from sotabencheval.machine_translation.languages import Language
4 |
5 | __all__ = ["WMTDataset", "WMTEvaluator", "TranslationMetrics", "Language"]
6 |
--------------------------------------------------------------------------------
/sotabencheval/machine_translation/languages.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 | _full_forms = {
4 | "en": "English",
5 | "fr": "French",
6 | "de": "German",
7 | }
8 |
9 |
10 | class Language(Enum):
11 | English = "en"
12 | French = "fr"
13 | German = "de"
14 |
15 | @property
16 | def fullname(self):
17 | return _full_forms[self.value]
18 |
--------------------------------------------------------------------------------
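For reference, the enum members can be looked up either by name or by language code, and `fullname` resolves the human-readable form defined in `_full_forms`:

    from sotabencheval.machine_translation import Language

    assert Language("de") is Language.German     # construct from a language code
    assert Language.German.value == "de"
    assert Language.German.fullname == "German"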
/sotabencheval/machine_translation/metrics.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from bs4 import BeautifulSoup
3 | from pathlib import Path
4 | from typing import Dict, List, Callable
5 | from collections import OrderedDict
6 | from sacrebleu import corpus_bleu
7 |
8 |
9 | MIN_CACHE_BATCH_SIZE = 32
10 |
11 |
12 | class TranslationMetrics:
13 | def __init__(self,
14 | source_dataset_path: Path,
15 | target_dataset_path: Path,
16 | tokenization: Callable[[str], str] = None):
17 | self._src_dataset_path = source_dataset_path
18 | self._dst_dataset_path = target_dataset_path
19 | self.answers = {}
20 | self.source_documents, self.source_segments = self._load_dataset(self._src_dataset_path)
21 | self._target_documents, self._target_segments = self._load_dataset(self._dst_dataset_path)
22 | self._tokenization = tokenization
23 | self._results = None
24 |
25 | def _load_dataset(self, dataset_path):
26 | documents = read_sgm_file(dataset_path)
27 | segments = OrderedDict([(segment.id, segment.text) for doc in documents for segment in doc.segments])
28 | return documents, segments
29 |
30 | def add(self, answers: Dict[str, str]):
31 | if not answers:
32 | print("Empty batch added to results")
33 | return
34 | if set(self.answers.keys()) & set(answers.keys()):
35 | print("Multiple translations for the same segment")
36 | self.answers.update(answers)
37 |
38 | def reset(self):
39 | self._results = None
40 | self.answers = {}
41 |
42 | def evaluate(self, ignore_missing=False):
43 | if ignore_missing:
44 | keep = set(self.answers.keys())
45 | target_segments = {sid: text for sid, text in self._target_segments.items() if sid in keep}
46 | else:
47 | target_segments = self._target_segments
48 | answers = [self.answers.get(sid, "") for sid in target_segments]
49 | references = [target for target in target_segments.values()]
50 | bleu = corpus_bleu(answers, [references])
51 | self._results = {'SacreBLEU': bleu.score}
52 |
53 | if self._tokenization is not None:
54 | tok_answers = [self._tokenization(answer) for answer in answers]
55 | tok_references = [self._tokenization(target) for target in references]
56 | tok_bleu = corpus_bleu(tok_answers, [tok_references], tokenize='none', force=True)
57 | self._results['BLEU score'] = tok_bleu.score
58 |
59 | @property
60 | def has_data(self):
61 | return len(self.answers) >= MIN_CACHE_BATCH_SIZE
62 |
63 | def get_results(self, ignore_missing=False):
64 | self.evaluate(ignore_missing)
65 | return self._results
66 |
67 |
68 | @dataclass
69 | class Segment:
70 | id: str
71 | text: str
72 |
73 |
74 | @dataclass
75 | class Document:
76 | id: str
77 | segments: List[Segment]
78 |
79 |
80 | def read_sgm_file(path):
81 | with open(path, 'rb') as f:
82 | soup = BeautifulSoup(f.read(), features="html.parser")
83 |
84 | return [
85 | Document(
86 | id=doc['docid'],
87 | segments=[
88 | Segment(
89 | id=doc['docid'] + '#' + seg['id'],
90 | text=seg.text
91 | ) for seg in doc.find_all('seg')
92 | ]
93 | ) for doc in soup.find_all('doc')
94 | ]
95 |
--------------------------------------------------------------------------------
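To make the `docid#segid` keying and the SacreBLEU call above concrete, here is a small self-contained sketch; the SGML snippet and file names are invented for illustration, and scoring a "translation" against an identical reference should yield a SacreBLEU of 100.0:

    from pathlib import Path
    from sotabencheval.machine_translation.metrics import TranslationMetrics, read_sgm_file

    sgm = '''<doc docid="bbc.381790">
    <seg id="1">Waliser AMs sorgen sich um Aussehen wie Muppets</seg>
    <seg id="2">Unter einigen AMs herrscht Besorgnis ueber den Vorschlag</seg>
    </doc>'''

    src, ref = Path("toy-src.sgm"), Path("toy-ref.sgm")
    src.write_text(sgm)
    ref.write_text(sgm)            # reuse the snippet as the reference to keep the example self-contained

    docs = read_sgm_file(src)
    print(docs[0].id)              # bbc.381790
    print(docs[0].segments[0].id)  # bbc.381790#1  (document id + '#' + original segment id)

    metrics = TranslationMetrics(src, ref)
    metrics.add({seg.id: seg.text for doc in docs for seg in doc.segments})
    print(metrics.get_results(ignore_missing=True))   # should print {'SacreBLEU': 100.0}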
/sotabencheval/machine_translation/wmt.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.core import BaseEvaluator
2 | from sotabencheval.utils import calculate_batch_hash, change_root_if_server, is_server
3 | from sotabencheval.machine_translation.languages import Language
4 | from sotabencheval.machine_translation.metrics import TranslationMetrics
5 | from sotabencheval.utils import get_max_memory_allocated
6 | from typing import Dict, Callable
7 | from pathlib import Path
8 | from enum import Enum
9 | import time
10 |
11 |
12 | class WMTDataset(Enum):
13 | News2014 = "newstest2014"
14 | News2019 = "newstest2019"
15 |
16 |
17 | class WMTEvaluator(BaseEvaluator):
18 | """Evaluator for WMT Machine Translation benchmarks.
19 |
20 | Examples:
21 | Evaluate a Transformer model from the fairseq repository on WMT2019 news test set:
22 |
23 | .. code-block:: python
24 |
25 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language
26 | from tqdm import tqdm
27 | import torch
28 |
29 | evaluator = WMTEvaluator(
30 | dataset=WMTDataset.News2019,
31 | source_lang=Language.English,
32 | target_lang=Language.German,
33 | local_root="data/nlp/wmt",
34 | model_name="Facebook-FAIR (single)",
35 | paper_arxiv_id="1907.06616"
36 | )
37 |
38 | model = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model',
39 | force_reload=True, tokenizer='moses', bpe='fastbpe').cuda()
40 |
41 | for sid, text in tqdm(evaluator.source_segments.items()):
42 | translated = model.translate(text)
43 | evaluator.add({sid: translated})
44 | if evaluator.cache_exists:
45 | break
46 |
47 | evaluator.save()
48 | print(evaluator.results)
49 | """
50 |
51 | task = "Machine Translation"
52 |
53 | _datasets = {
54 | (WMTDataset.News2014, Language.English, Language.German),
55 | (WMTDataset.News2019, Language.English, Language.German),
56 | (WMTDataset.News2014, Language.English, Language.French),
57 | }
58 |
59 | def __init__(self,
60 | dataset: WMTDataset,
61 | source_lang: Language,
62 | target_lang: Language,
63 | local_root: str = '.',
64 | source_dataset_filename: str = None,
65 | target_dataset_filename: str = None,
66 | model_name: str = None,
67 | paper_arxiv_id: str = None,
68 | paper_pwc_id: str = None,
69 | paper_results: dict = None,
70 | model_description: str = None,
71 | tokenization: Callable[[str], str] = None):
72 | """
73 | Creates an evaluator for one of the WMT benchmarks.
74 |
75 | :param dataset: Which dataset to evaluate on, e.g., WMTDataset.News2014.
76 | :param source_lang: Source language of the documents to translate.
77 | :param target_lang: Target language into which the documents are translated.
78 | :param local_root: Path to the directory where the dataset files are located locally.
79 | Ignored when run on sotabench server.
80 | :param source_dataset_filename: Local filename of the SGML file with the source documents.
81 | If None, the standard WMT filename is used, based on :param:`dataset`,
82 | :param:`source_lang` and :param:`target_lang`.
83 | Ignored when run on sotabench server.
84 | :param target_dataset_filename: Local filename of the SGML file with the reference documents.
85 | If None, the standard WMT filename is used, based on :param:`dataset`,
86 | :param:`source_lang` and :param:`target_lang`.
87 | Ignored when run on sotabench server.
88 | :param model_name: The name of the model from the
89 | paper - if you want to link your build to a model from a
90 | machine learning paper. See the WMT benchmark pages
91 | (e.g., https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german)
92 | for model names, listed on the paper leaderboard and "models yet to try" tabs.
93 | :param paper_arxiv_id: Optional linking to arXiv if you
94 | want to link to papers on the leaderboard; put in the
95 | corresponding paper's arXiv ID, e.g. '1907.06616'.
96 | :param paper_pwc_id: Optional linking to Papers With Code;
97 | put in the corresponding papers with code URL slug, e.g.
98 | 'facebook-fairs-wmt19-news-translation-task'
99 | :param paper_results: If the paper model you are reproducing
100 | does not have model results on sotabench.com, you can specify
101 | the paper results yourself through this argument, where keys
102 | are metric names, values are metric values. e.g:
103 |
104 | {'SacreBLEU': 42.7, 'BLEU score': 43.1}.
105 |
106 | Ensure that the metric names match those on the sotabench
107 | leaderboard - for WMT benchmarks it should be `SacreBLEU` for de-tokenized
108 | case sensitive BLEU score and `BLEU score` for tokenized BLEU.
109 | :param model_description: Optional model description.
110 | :param tokenization: An optional tokenization function to compute tokenized BLEU score.
111 | It takes a single string - a segment to tokenize, and returns a string with tokens
112 | separated by spaces, e.g.:
113 |
114 | tokenization = lambda seg: seg.replace("'s", " 's").replace("-", " - ")
115 |
116 | If None, only de-tokenized SacreBLEU score is reported.
117 | """
118 |
119 | super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
120 | self.root = change_root_if_server(root=local_root,
121 | server_root=".data/nlp/wmt")
122 | self.dataset = dataset
123 | self.source_lang = source_lang
124 | self.target_lang = target_lang
125 |
126 | default_src_fn, default_dst_fn = self._get_source_dataset_filename()
127 | if source_dataset_filename is None or is_server():
128 | source_dataset_filename = default_src_fn
129 |
130 | if target_dataset_filename is None or is_server():
131 | target_dataset_filename = default_dst_fn
132 |
133 | self.source_dataset_path = Path(self.root) / source_dataset_filename
134 | self.target_dataset_path = Path(self.root) / target_dataset_filename
135 |
136 | self.metrics = TranslationMetrics(self.source_dataset_path, self.target_dataset_path, tokenization)
137 |
138 | def _get_source_dataset_filename(self):
139 | if self.dataset == WMTDataset.News2014:
140 | other_lang = self.source_lang.value if self.target_lang == Language.English else self.target_lang.value
141 | source = "{0}-{1}en-src.{2}.sgm".format(self.dataset.value, other_lang, self.source_lang.value)
142 | target = "{0}-{1}en-ref.{2}.sgm".format(self.dataset.value, other_lang, self.target_lang.value)
143 | elif self.dataset == WMTDataset.News2019:
144 | source = "{0}-{1}{2}-src.{1}.sgm".format(self.dataset.value, self.source_lang.value, self.target_lang.value)
145 | target = "{0}-{1}{2}-ref.{2}.sgm".format(self.dataset.value, self.source_lang.value, self.target_lang.value)
146 | else:
147 | raise ValueError("Unknown dataset: {}".format(self.dataset))
148 | return source, target
149 |
150 | def _get_dataset_name(self):
151 | cfg = (self.dataset, self.source_lang, self.target_lang)
152 | if cfg not in WMTEvaluator._datasets:
153 | raise ValueError("Unsupported dataset configuration: {} {} {}".format(
154 | self.dataset.name,
155 | self.source_lang.name,
156 | self.target_lang.name
157 | ))
158 |
159 | ds_names = {WMTDataset.News2014: "WMT2014", WMTDataset.News2019: "WMT2019"}
160 | return "{0} {1}-{2}".format(ds_names.get(self.dataset), self.source_lang.fullname, self.target_lang.fullname)
161 |
162 | def add(self, answers: Dict[str, str]):
163 | """
164 | Updates the evaluator with new results
165 |
166 | :param answers: a dict where keys are source segments ids and values are translated segments
167 | (segment id is created by concatenating document id and the original segment id,
168 | separated by `#`.)
169 |
170 | Examples:
171 | Update the evaluator with three results:
172 |
173 | .. code-block:: python
174 |
175 | my_evaluator.add({
176 | 'bbc.381790#1': 'Waliser AMs sorgen sich um "Aussehen wie Muppets"',
177 | 'bbc.381790#2': 'Unter einigen AMs herrscht Bestürzung über einen...',
178 | 'bbc.381790#3': 'Sie ist aufgrund von Plänen entstanden, den Namen...'
179 | })
180 |
181 | .. seealso:: `source_segments`
182 | """
183 |
184 | self.metrics.add(answers)
185 |
186 | if not self.first_batch_processed and self.metrics.has_data:
187 | self.batch_hash = calculate_batch_hash(
188 | self.cache_values(answers=self.metrics.answers,
189 | metrics=self.metrics.get_results(ignore_missing=True))
190 | )
191 | self.first_batch_processed = True
192 |
193 | @property
194 | def source_segments(self):
195 | """
196 | Ordered dictionary of all segments to translate, keyed by segment id. The same segment ids
197 | have to be used when submitting translations with :func:`add`.
198 |
199 | Examples:
200 |
201 | .. code-block:: python
202 |
203 | for segment_id, text in my_evaluator.source_segments.items():
204 | translated = model(text)
205 | my_evaluator.add({segment_id: translated})
206 |
207 | .. seealso: `source_documents`
208 | """
209 |
210 | return self.metrics.source_segments
211 |
212 | @property
213 | def source_documents(self):
214 | """
215 | List of all documents to translate
216 |
217 | Examples:
218 |
219 | .. code-block:: python
220 |
221 | for document in my_evaluator.source_documents:
222 | for segment in document.segments:
223 | translated = model(segment.text)
224 | my_evaluator.add({segment.id: translated})
225 |
226 | .. seealso: `source_segments`
227 | """
228 |
229 | return self.metrics.source_documents
230 |
231 | def reset(self):
232 | """
233 | Removes already added translations
234 |
235 | When checking if the model should be rerun on the whole dataset it is first run on a smaller subset
236 | and the results are compared with values cached on the sotabench server (the check is not performed
237 | when running locally). Ideally, the smaller subset is just the first batch, so no additional
238 | computation is needed. However, for more complex multistage pipelines it may be simpler to
239 | run the model twice - on a small dataset and (if necessary) on the full dataset. In that case
240 | :func:`reset` needs to be called before the second run so values from the first run are not reported.
241 |
242 | .. seealso:: :func:`cache_exists`
243 | .. seealso:: :func:`reset_time`
244 | """
245 |
246 | self.metrics.reset()
247 |
248 | def get_results(self):
249 | """
250 | Gets the results for the evaluator. An empty string is assumed for segments for which no translation
251 | was provided.
252 |
253 | :return: dict with `SacreBLEU` and `BLEU score`.
254 | """
255 |
256 | if self.cached_results:
257 | return self.results
258 | self.results = self.metrics.get_results()
259 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
260 |
261 | return self.results
262 |
263 | def save(self):
264 | dataset = self._get_dataset_name()
265 |
266 | if not self.cached_results:
267 | exec_speed = (time.time() - self.init_time)
268 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(self.metrics.answers) / exec_speed
269 | self.speed_mem_metrics['Tasks'] = len(self.metrics.answers)
270 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
271 | else:
272 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
273 | self.speed_mem_metrics['Tasks'] = None
274 | self.speed_mem_metrics['Evaluation Time'] = None
275 |
276 | return super().save(dataset=dataset)
277 |
278 |
--------------------------------------------------------------------------------
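For reference, the default SGML file names derived by `_get_source_dataset_filename` follow the standard WMT naming scheme; reproducing its format strings for the two supported dataset years gives:

    # News2019 branch (e.g. English -> German):
    ds, src, tgt = "newstest2019", "en", "de"
    print("{0}-{1}{2}-src.{1}.sgm".format(ds, src, tgt))    # newstest2019-ende-src.en.sgm
    print("{0}-{1}{2}-ref.{2}.sgm".format(ds, src, tgt))    # newstest2019-ende-ref.de.sgm

    # News2014 branch (e.g. English -> French; the non-English language is the "other" language):
    ds, other, src, tgt = "newstest2014", "fr", "en", "fr"
    print("{0}-{1}en-src.{2}.sgm".format(ds, other, src))   # newstest2014-fren-src.en.sgm
    print("{0}-{1}en-ref.{2}.sgm".format(ds, other, tgt))   # newstest2014-fren-ref.fr.sgm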
/sotabencheval/natural_language_inference/__init__.py:
--------------------------------------------------------------------------------
1 | from .multinli import MultiNLI
2 |
3 | __all__ = ["MultiNLI"]
4 |
--------------------------------------------------------------------------------
/sotabencheval/natural_language_inference/multinli.py:
--------------------------------------------------------------------------------
1 | import csv
2 | import time
3 |
4 | from itertools import zip_longest
5 | from pathlib import Path
6 |
7 | from sotabencheval.core import BaseEvaluator
8 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server, get_max_memory_allocated
9 |
10 |
11 | def read_csv(path):
12 | with path.open('r') as f:
13 | yield from csv.DictReader(f, delimiter='\t')
14 |
15 |
16 | def get_path(local_root, local_unzip=False):
17 | root = Path(change_root_if_server(root=local_root,
18 | server_root=".data/nlp/multinli"))
19 | zip_name = "MNLI.zip"
20 | dataset_path=root / "MNLI" / "dev_matched.tsv"
21 | if not dataset_path.exists(): # unzip
22 | extract_archive(str(root / zip_name), to_path=root)
23 | return (dataset_path, dataset_path.parent / "dev_mismatched.tsv")
24 |
25 |
26 | class ClassificationEvaluator:
27 | def __init__(self, file_path):
28 | self.dataset_path = file_path
29 | dataset = list(read_csv(file_path))
30 | self.targets = {d['pairID']: d['gold_label'] for d in dataset}
31 | self.dataset = {d['pairID']: (d['sentence1'], d['sentence2']) for d in dataset}
32 | self.reset()
33 |
34 | def reset(self):
35 | self.answers = {}
36 |
37 | @property
38 | def count(self):
39 | return len(self.answers)
40 |
41 | def add(self, pairIds, preds):
42 | for pairId, pred in zip(pairIds,preds):
43 | if pairId not in self.targets:
44 | continue
45 | if pairId not in self.answers:
46 | self.answers[pairId] = pred
47 | else:
48 | print(f"Double prediction for {pairId} former: {self.answers[pairId]} new: {pred}")
49 |
50 | @property
51 | def has_enough_for_cache_hash(self):
52 | return self.count >= 100
53 |
54 | @property
55 | def accuracy(self):
56 | correct = [self.targets[k] == a for k,a in self.answers.items() if a is not None]
57 | accuracy = sum(correct) / self.count if self.count > 0 else 0
58 | if self.count != len(self.targets):
59 | return (accuracy, f"partial on {self.count} out of {len(self.targets)}")
60 | return accuracy
61 |
62 |
63 | class MultiNLI(BaseEvaluator):
64 | task = "Natural Language Inference"
65 | dataset = 'MultiNLI' # defined in subclass
66 |
67 | def __init__(self,
68 | local_root: str = '.',
69 | model_name: str = None,
70 | paper_arxiv_id: str = None,
71 | paper_pwc_id: str = None,
72 | paper_results: dict = None,
73 | model_description=None):
74 |
75 | super().__init__(model_name, paper_arxiv_id,
76 | paper_pwc_id, paper_results, model_description)
77 | self.local_root = local_root
78 | paths = self.dataset_paths
79 | self.matched = ClassificationEvaluator(paths[0])
80 | self.mismatched = ClassificationEvaluator(paths[1])
81 | self.reset()
82 |
83 | @property
84 | def dataset_paths(self):
85 | return get_path(self.local_root)
86 |
87 | @property
88 | def data_generator(self):
89 | for v1, v2 in zip_longest(self.matched.dataset.items(), self.mismatched.dataset.items()):
90 | if v1 is not None:
91 | yield v1
92 | if v2 is not None:
93 | yield v2
94 |
95 | def reset(self):
96 | self.matched.reset()
97 | self.mismatched.reset()
98 | self.batch_hash = None
99 | self.reset_time()
100 |
101 | def add(self, pairIds, predictions):
102 | """
103 | pairIds - a pairID (str) or a list of pairIDs; predictions - the corresponding predicted label(s) (str)
104 | """
105 | if isinstance(pairIds, str):
106 | pairIds = [pairIds]
107 | predictions = [predictions]
108 |
109 | self.matched.add(pairIds, predictions)
110 | self.mismatched.add(pairIds, predictions)
111 | if self.batch_hash is None and self.matched.count + self.mismatched.count > 100:
112 | content = self.cache_values(matched=self.matched.answers, mismatched=self.mismatched.answers)
113 | self.batch_hash = calculate_batch_hash(content)
114 | self.first_batch_processed = True #TODO: do we need this if we have self.batch_hash
115 |
116 |
117 | def get_results(self):
118 | if self.cached_results:
119 | return self.results
120 | self.results = {
121 | 'Matched': self.matched.accuracy,
122 | 'Mismatched': self.mismatched.accuracy
123 | }
124 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
125 | exec_speed = (time.time() - self.init_time)
126 | count = self.mismatched.count + self.matched.count
127 | self.speed_mem_metrics['Tasks / Evaluation Time'] = count / exec_speed
128 | self.speed_mem_metrics['Tasks'] = count
129 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
130 | return self.results
131 |
132 | def save(self):
133 |
134 |
135 | return super().save(dataset=self.dataset)
136 |
--------------------------------------------------------------------------------
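Unlike the other evaluators, `MultiNLI` ships no usage example in its docstring, so here is a hedged sketch of the intended loop; `predict_label` is a hypothetical stand-in for your model, `local_root` must point at a directory containing `MNLI.zip` or the extracted dev files, and the cache check mirrors the pattern used in the other evaluators' examples:

    from sotabencheval.natural_language_inference import MultiNLI

    evaluator = MultiNLI(
        local_root="data/nlp/multinli",   # assumed location of MNLI.zip / extracted dev files
        model_name="My MNLI model",       # illustrative metadata only
    )

    def predict_label(premise, hypothesis):
        # Placeholder for a real model; must return 'entailment', 'neutral' or 'contradiction'.
        return "neutral"

    # data_generator interleaves the matched and mismatched dev sets,
    # yielding (pairID, (sentence1, sentence2)) tuples.
    for pair_id, (premise, hypothesis) in evaluator.data_generator:
        evaluator.add(pair_id, predict_label(premise, hypothesis))
        if evaluator.batch_hash is not None and evaluator.cache_exists:
            break

    evaluator.save()
    print(evaluator.get_results())   # {'Matched': ..., 'Mismatched': ...}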
/sotabencheval/object_detection/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["COCOEvaluator"]
2 |
3 | from sotabencheval.object_detection.coco import COCOEvaluator
--------------------------------------------------------------------------------
/sotabencheval/object_detection/coco.py:
--------------------------------------------------------------------------------
1 | # Some of the processing logic here is based on the torchvision COCO dataset
2 | # https://github.com/pytorch/vision/blob/master/torchvision/datasets/coco.py
3 |
4 | import copy
5 | import numpy as np
6 | import os
7 | from pycocotools.coco import COCO
8 | from sotabenchapi.client import Client
9 | from sotabenchapi.core import BenchmarkResult, check_inputs
10 | import time
11 |
12 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server
13 | from sotabencheval.utils import get_max_memory_allocated
14 | from sotabencheval.object_detection.coco_eval import CocoEvaluator
15 | from sotabencheval.object_detection.utils import get_coco_metrics
16 |
17 |
18 | class COCOEvaluator(object):
19 | """`COCO `_ benchmark.
20 |
21 | Examples:
22 | Evaluate a ResNeXt model from the torchvision repository:
23 |
24 | .. code-block:: python
25 |
26 | ...
27 |
28 | evaluator = COCOEvaluator(model_name='Mask R-CNN', paper_arxiv_id='1703.06870')
29 |
30 | with torch.no_grad():
31 | for i, (input, __) in enumerate(iterator):
32 | ...
33 | output = model(input)
34 | # optional formatting of output here to be a list of detection dicts
35 | evaluator.add(output)
36 |
37 | if evaluator.cache_exists:
38 | break
39 |
40 | evaluator.save()
41 | """
42 |
43 | task = "Object Detection"
44 |
45 | def __init__(self,
46 | root: str = '.',
47 | split: str = "val",
48 | dataset_year: str = "2017",
49 | model_name: str = None,
50 | paper_arxiv_id: str = None,
51 | paper_pwc_id: str = None,
52 | paper_results: dict = None,
53 | model_description=None,):
54 | """Initializes a COCO Evaluator object
55 |
56 | Args:
57 | root (string): Root directory of the COCO Dataset - where the
58 | label data is located (or will be downloaded to).
59 | split (str) : the split for COCO to use, e.g. 'val'
60 | dataset_year (str): the dataset year for COCO to use
61 | model_name (str, optional): The name of the model from the
62 | paper - if you want to link your build to a machine learning
63 | paper. See the COCO benchmark page for model names,
64 | https://sotabench.com/benchmarks/object-detection-on-coco-minival,
65 | e.g. on the paper leaderboard tab.
66 | paper_arxiv_id (str, optional): Optional linking to arXiv if you
67 | want to link to papers on the leaderboard; put in the
68 | corresponding paper's arXiv ID, e.g. '1611.05431'.
69 | paper_pwc_id (str, optional): Optional linking to Papers With Code;
70 | put in the corresponding papers with code URL slug, e.g.
71 | 'u-gat-it-unsupervised-generative-attentional'
72 | paper_results (dict, optional) : If the paper you are reproducing
73 | does not have model results on sotabench.com, you can specify
74 | the paper results yourself through this argument, where keys
75 | are metric names, values are metric values. e.g::
76 |
77 | {'box AP': 0.349, 'AP50': 0.592, ...}.
78 |
79 | Ensure that the metric names match those on the sotabench
80 | leaderboard - for COCO it should be 'box AP', 'AP50',
81 | 'AP75', 'APS', 'APM', 'APL'
82 | model_description (str, optional): Optional model description.
83 | """
84 | root = self.root = change_root_if_server(root=root,
85 | server_root="./.data/vision/coco")
86 |
87 | # Model metadata
88 |
89 | self.model_name = model_name
90 | self.paper_arxiv_id = paper_arxiv_id
91 | self.paper_pwc_id = paper_pwc_id
92 | self.paper_results = paper_results
93 | self.model_description = model_description
94 | self.split = split
95 |
96 | annFile = os.path.join(
97 | root, "annotations/instances_%s%s.json" % (self.split, dataset_year)
98 | )
99 |
100 | self._download(annFile)
101 |
102 | self.coco = COCO(annFile)
103 | self.iou_types = ['bbox']
104 | self.coco_evaluator = CocoEvaluator(self.coco, self.iou_types)
105 |
106 | self.detections = []
107 | self.results = None
108 |
109 | # Backend variables for hashing and caching
110 |
111 | self.first_batch_processed = False
112 | self.batch_hash = None
113 | self.cached_results = False
114 |
115 | # Speed and memory metrics
116 |
117 | self.speed_mem_metrics = {}
118 | self.init_time = time.time()
119 |
120 | def _download(self, annFile):
121 | """
122 | Utility function for downloading the COCO annotation file
123 |
124 | :param annFile: path of the annotations file
125 | :return: void - extracts the archive
126 | """
127 | if not os.path.isfile(annFile):
128 | if "2017" in annFile:
129 | annotations_dir_zip = os.path.join(
130 | self.root, "annotations_train%s2017.zip" % self.split
131 | )
132 | elif "2014" in annFile:
133 | annotations_dir_zip = os.path.join(
134 | self.root, "annotations_train%s2014.zip" % self.split
135 | )
136 | else:
137 | annotations_dir_zip = None
138 |
139 | if annotations_dir_zip is not None:
140 | print('Attempt to extract annotations file at {zip_loc}'.format(zip_loc=annotations_dir_zip))
141 | extract_archive(from_path=annotations_dir_zip, to_path=self.root)
142 |
143 | @property
144 | def cache_exists(self):
145 | """
146 | Checks whether the cache exists in the sotabench.com database - if so
147 | then sets self.results to cached results and returns True.
148 |
149 | You can use this property for control flow to break a for loop over a dataset
150 | after the first iteration. This prevents rerunning the same calculation for the
151 | same model twice.
152 |
153 | Examples:
154 | Breaking a for loop
155 |
156 | .. code-block:: python
157 |
158 | ...
159 |
160 | with torch.no_grad():
161 | for i, (input, target) in enumerate(iterator):
162 | ...
163 | output = model(input)
164 | # optional formatting of output here to be a list of detection dicts
165 | evaluator.add(output)
166 |
167 | if evaluator.cache_exists:
168 | break
169 |
170 | evaluator.save()
171 |
172 | :return: bool or None (if not in check mode)
173 | """
174 | if not self.first_batch_processed:
175 | raise ValueError('No batches of data have been processed so no batch_hash exists')
176 |
177 | if not is_server(): # we only check the cache on the server
178 | return None
179 |
180 | client = Client.public()
181 | cached_res = client.get_results_by_run_hash(self.batch_hash)
182 | if cached_res:
183 | self.results = cached_res
184 | self.cached_results = True
185 | print(
186 | "No model change detected (using the first batch run "
187 | "hash). Will use cached results."
188 | )
189 | return True
190 |
191 | return False
192 |
193 | @staticmethod
194 | def cache_format_ann(ann):
195 | """
196 | Cache-formats an annotation dictionary by rounding its values. We round because otherwise small
197 | floating-point differences between runs would change the hash of the predictions.
198 |
199 | :param ann (dict): A detection dictionary
200 |
201 | :return: ann : A detection dictionary but with rounded values
202 | """
203 | ann['bbox'] = [np.round(el, 3) for el in ann['bbox']]
204 | ann['score'] = np.round(ann['score'], 3)
205 |
206 | if 'segmentation' in ann:
207 | ann['segmentation'] = [np.round(el, 3) for el in ann['segmentation']]
208 |
209 | if 'area' in ann:
210 | ann['area'] = np.round(ann['area'], 3)
211 |
212 | return ann
213 |
214 | def cache_values(self, annotations, metrics):
215 | """
216 | Takes in annotations and metrics, and formats the data to calculate the hash for the cache
217 | :param annotations: list of detections
218 | :param metrics: dictionary of final AP metrics
219 | :return: list of data (combining annotations and metrics)
220 | """
221 | metrics = {k: np.round(v, 3) for k, v in metrics.items()}
222 | new_annotations = copy.deepcopy(annotations)
223 | new_annotations = [self.cache_format_ann(ann) for ann in new_annotations]
224 |
225 | return new_annotations + [metrics]
226 |
227 | def add(self, detections: list):
228 | """
229 | Update the evaluator with new detections
230 |
231 | :param detections (list): List of detections that will be used by the COCO.loadRes method in the
232 | pycocotools API. Each detection can take a dictionary format like the following:
233 |
234 | {'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844],
235 | 'score': 0.999152421951294, 'category_id': 1}
236 |
237 | I.e., it is a list of dictionaries.
238 |
239 | :return: void - updates self.detections with the new IDs and predictions
240 |
241 | Examples:
242 | Update the evaluator with two results:
243 |
244 | .. code-block:: python
245 |
246 | my_evaluator.add([{'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312,
247 | 110.14895629882812, 278.2847595214844], 'score': 0.999152421951294, 'category_id': 1}])
248 | """
249 | self.detections.extend(detections)
250 |
251 | self.coco_evaluator.update(detections)
252 |
253 | if not self.first_batch_processed:
254 | self.coco_evaluator.evaluate()
255 | self.coco_evaluator.accumulate()
256 |
257 | if any([detection['bbox'] for detection in detections]): # we can only hash if we have predictions
258 | self.batch_hash = calculate_batch_hash(
259 | self.cache_values(annotations=detections, metrics=get_coco_metrics(self.coco_evaluator)))
260 | self.first_batch_processed = True
261 |
262 | def get_results(self):
263 | """
264 | Reruns the evaluation using the accumulated detections, returns COCO results with AP metrics
265 |
266 | :return: dict with COCO AP metrics
267 | """
268 | if self.cached_results:
269 | return self.results
270 |
271 | self.coco_evaluator = CocoEvaluator(self.coco, self.iou_types)
272 | self.coco_evaluator.update(self.detections)
273 | self.coco_evaluator.evaluate()
274 | self.coco_evaluator.accumulate()
275 | self.coco_evaluator.summarize()
276 |
277 | self.results = get_coco_metrics(self.coco_evaluator)
278 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
279 |
280 | return self.results
281 |
282 | def reset_time(self):
283 | """
284 | Simple method to reset the timer self.init_time. Often used before an evaluation loop, so that the
285 | evaluation is timed appropriately.
286 |
287 | :return: void - resets self.init_time
288 | """
289 | self.init_time = time.time()
290 |
291 | def save(self):
292 | """
293 | Calculate results and then put into a BenchmarkResult object
294 |
295 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded
296 | on the platform.
297 |
298 | :return: BenchmarkResult object with results and metadata
299 | """
300 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
301 | self.get_results()
302 |
303 | # If this is the first time the model is run, then we record evaluation time information
304 |
305 | if not self.cached_results:
306 | unique_image_ids = set([d['image_id'] for d in self.detections])
307 | exec_speed = (time.time() - self.init_time)
308 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(unique_image_ids) / exec_speed
309 | self.speed_mem_metrics['Tasks'] = len(unique_image_ids)
310 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
311 | else:
312 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
313 | self.speed_mem_metrics['Tasks'] = None
314 | self.speed_mem_metrics['Evaluation Time'] = None
315 |
316 | return BenchmarkResult(
317 | task=self.task,
318 | config={},
319 | dataset='COCO minival',
320 | results=self.results,
321 | speed_mem_metrics=self.speed_mem_metrics,
322 | model=self.model_name,
323 | model_description=self.model_description,
324 | arxiv_id=self.paper_arxiv_id,
325 | pwc_id=self.paper_pwc_id,
326 | paper_results=self.paper_results,
327 | run_hash=self.batch_hash,
328 | )
329 |
--------------------------------------------------------------------------------
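To illustrate the rounding performed by `cache_format_ann` (so that tiny floating-point jitter between otherwise identical runs does not change the batch hash), a minimal sketch with one detection:

    from sotabencheval.object_detection import COCOEvaluator

    ann = {
        'image_id': 397133, 'category_id': 1, 'score': 0.999152421951294,
        'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844],
    }
    rounded = COCOEvaluator.cache_format_ann(dict(ann))   # static method, no evaluator instance needed
    print([float(v) for v in rounded['bbox']])            # [386.163, 69.489, 110.149, 278.285]
    print(float(rounded['score']))                        # 0.999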
/sotabencheval/object_detection/coco_eval.py:
--------------------------------------------------------------------------------
1 | # Code is based on https://github.com/pytorch/vision/blob/master/references/detection/
2 |
3 | import numpy as np
4 | import copy
5 |
6 | from pycocotools.cocoeval import COCOeval
7 | from pycocotools.coco import COCO
8 | import pycocotools.mask as mask_util
9 |
10 | from collections import defaultdict
11 |
12 |
13 | class CocoEvaluator(object):
14 | """
15 | For now this only does BBOX detection - so 'bbox' is the only acceptable iou_type
16 | """
17 | def __init__(self, coco_gt, iou_types):
18 | assert isinstance(iou_types, (list, tuple))
19 | coco_gt = copy.deepcopy(coco_gt)
20 | self.coco_gt = coco_gt
21 |
22 | self.iou_types = iou_types
23 | self.coco_eval = {}
24 | for iou_type in iou_types:
25 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type)
26 |
27 | self.annotation_list = []
28 |
29 | def update(self, annotation_list):
30 | assert(type(annotation_list) == list)
31 |
32 | self.annotation_list.extend(annotation_list)
33 |
34 | for iou_type in self.iou_types:
35 | coco_dt = loadRes(self.coco_gt, self.annotation_list) if self.annotation_list else COCO()
36 | coco_eval = self.coco_eval[iou_type]
37 | coco_eval.cocoDt = coco_dt
38 | coco_eval.params.imgIds = self.coco_gt.getImgIds()
39 |
40 | def accumulate(self):
41 | for coco_eval in self.coco_eval.values():
42 | coco_eval.accumulate()
43 |
44 | def evaluate(self):
45 | for coco_eval in self.coco_eval.values():
46 | coco_eval.evaluate()
47 |
48 | def summarize(self):
49 | for iou_type, coco_eval in self.coco_eval.items():
50 | # print("IoU metric: {}".format(iou_type))
51 | coco_eval.summarize()
52 |
53 |
54 | #################################################################
55 | # From pycocotools, just removed the prints and fixed
56 | # a Python3 bug about unicode not defined
57 | #################################################################
58 |
59 | # Ideally, pycocotools wouldn't have hard-coded prints
60 | # so that we could avoid copy-pasting those two functions
61 |
62 |
63 | def createIndex(self):
64 | # create index
65 | # print('creating index...')
66 | anns, cats, imgs = {}, {}, {}
67 | imgToAnns, catToImgs = defaultdict(list), defaultdict(list)
68 | if "annotations" in self.dataset:
69 | for ann in self.dataset["annotations"]:
70 | imgToAnns[ann["image_id"]].append(ann)
71 | anns[ann["id"]] = ann
72 |
73 | if "images" in self.dataset:
74 | for img in self.dataset["images"]:
75 | imgs[img["id"]] = img
76 |
77 | if "categories" in self.dataset:
78 | for cat in self.dataset["categories"]:
79 | cats[cat["id"]] = cat
80 |
81 | if "annotations" in self.dataset and "categories" in self.dataset:
82 | for ann in self.dataset["annotations"]:
83 | catToImgs[ann["category_id"]].append(ann["image_id"])
84 |
85 | # print('index created!')
86 |
87 | # create class members
88 | self.anns = anns
89 | self.imgToAnns = imgToAnns
90 | self.catToImgs = catToImgs
91 | self.imgs = imgs
92 | self.cats = cats
93 |
94 |
95 | maskUtils = mask_util
96 |
97 |
98 | def loadRes(coco, anns):
99 | """Load result file and return a result api object.
100 |
101 | ``anns`` is a list of dicts containing the results
102 |
103 | In the original pycoco api, a results file is passed in, whereas in this
104 | case we bypass the json file loading and ask for a list of dictionary
105 | annotations to be passed directly in
106 |
107 | Returns:
108 | res (obj): result api object.
109 | """
110 | res = COCO()
111 | res.dataset["images"] = [img for img in coco.dataset["images"]]
112 |
113 | # print('Loading and preparing results...')
114 | # tic = time.time()
115 | # if isinstance(resFile, torch._six.string_classes):
116 | # anns = json.load(open(resFile))
117 | # elif type(resFile) == np.ndarray:
118 | # anns = self.loadNumpyAnnotations(resFile)
119 | # else:
120 | # anns = resFile
121 | assert type(anns) == list, "results is not an array of objects"
122 | annsImgIds = [ann["image_id"] for ann in anns]
123 | assert set(annsImgIds) == (
124 | set(annsImgIds) & set(coco.getImgIds())
125 | ), "Results do not correspond to current coco set"
126 | if "caption" in anns[0]:
127 | imgIds = set([img["id"] for img in res.dataset["images"]]) & set(
128 | [ann["image_id"] for ann in anns]
129 | )
130 | res.dataset["images"] = [
131 | img for img in res.dataset["images"] if img["id"] in imgIds
132 | ]
133 | for id, ann in enumerate(anns):
134 | ann["id"] = id + 1
135 | elif "bbox" in anns[0] and not anns[0]["bbox"] == []:
136 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"])
137 | for id, ann in enumerate(anns):
138 | bb = ann["bbox"]
139 | x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]]
140 | if "segmentation" not in ann:
141 | ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]]
142 | ann["area"] = bb[2] * bb[3]
143 | ann["id"] = id + 1
144 | ann["iscrowd"] = 0
145 | elif "segmentation" in anns[0]:
146 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"])
147 | for id, ann in enumerate(anns):
148 | # now only support compressed RLE format as segmentation results
149 | ann["area"] = maskUtils.area(ann["segmentation"])
150 | if "bbox" not in ann:
151 | ann["bbox"] = maskUtils.toBbox(ann["segmentation"])
152 | ann["id"] = id + 1
153 | ann["iscrowd"] = 0
154 | elif "keypoints" in anns[0]:
155 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"])
156 | for id, ann in enumerate(anns):
157 | s = ann["keypoints"]
158 | x = s[0::3]
159 | y = s[1::3]
160 | x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y)
161 | ann["area"] = (x1 - x0) * (y1 - y0)
162 | ann["id"] = id + 1
163 | ann["bbox"] = [x0, y0, x1 - x0, y1 - y0]
164 | # print('DONE (t={:0.2f}s)'.format(time.time()- tic))
165 |
166 | res.dataset["annotations"] = anns
167 | createIndex(res)
168 | return res
169 |
--------------------------------------------------------------------------------
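For a bbox-only detection, `loadRes` above derives the remaining fields COCOeval expects (`area`, a rectangular `segmentation`, `id`, `iscrowd`); the derivation for a single `[x, y, width, height]` box is just:

    bb = [386.16, 69.49, 110.15, 278.28]                 # [x, y, width, height]
    x1, x2, y1, y2 = bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]

    area = bb[2] * bb[3]                                  # width * height
    segmentation = [[x1, y1, x1, y2, x2, y2, x2, y1]]     # the box corners as a polygon
    print(area)
    print(segmentation)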
/sotabencheval/object_detection/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | def get_coco_metrics(coco_evaluator):
4 |
5 | metrics = {
6 | "box AP": None,
7 | "AP50": None,
8 | "AP75": None,
9 | "APS": None,
10 | "APM": None,
11 | "APL": None,
12 | }
13 | iouThrs = [None, 0.5, 0.75, None, None, None]
14 | maxDets = [100] + [coco_evaluator.coco_eval["bbox"].params.maxDets[2]] * 5
15 | areaRngs = ["all", "all", "all", "small", "medium", "large"]
16 | bounding_box_params = coco_evaluator.coco_eval["bbox"].params
17 |
18 | for metric_no, metric in enumerate(metrics):
19 | aind = [
20 | i
21 | for i, aRng in enumerate(bounding_box_params.areaRngLbl)
22 | if aRng == areaRngs[metric_no]
23 | ]
24 | mind = [
25 | i
26 | for i, mDet in enumerate(bounding_box_params.maxDets)
27 | if mDet == maxDets[metric_no]
28 | ]
29 |
30 | s = coco_evaluator.coco_eval["bbox"].eval["precision"]
31 |
32 | # IoU
33 | if iouThrs[metric_no] is not None:
34 | t = np.where(iouThrs[metric_no] == bounding_box_params.iouThrs)[0]
35 | s = s[t]
36 | s = s[:, :, :, aind, mind]
37 |
38 | if len(s[s > -1]) == 0:
39 | mean_s = -1
40 | else:
41 | mean_s = np.mean(s[s > -1])
42 |
43 | metrics[metric] = mean_s
44 |
45 | return metrics
46 |
--------------------------------------------------------------------------------
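`get_coco_metrics` reads the precision array that COCOeval accumulates; the surrounding pipeline, as used by `COCOEvaluator.get_results`, looks roughly like this (here `coco_gt` is assumed to be an already-loaded pycocotools `COCO` ground-truth object and `detections` a list of detection dicts, neither of which is built in this sketch):

    from sotabencheval.object_detection.coco_eval import CocoEvaluator
    from sotabencheval.object_detection.utils import get_coco_metrics

    coco_evaluator = CocoEvaluator(coco_gt, ['bbox'])   # coco_gt: pycocotools COCO object (assumed)
    coco_evaluator.update(detections)                   # detections: list of detection dicts (assumed)
    coco_evaluator.evaluate()                           # per-image evaluation
    coco_evaluator.accumulate()                         # fills coco_eval['bbox'].eval['precision']
    coco_evaluator.summarize()

    metrics = get_coco_metrics(coco_evaluator)
    print(metrics)   # {'box AP': ..., 'AP50': ..., 'AP75': ..., 'APS': ..., 'APM': ..., 'APL': ...}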
/sotabencheval/question_answering/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.question_answering.squad import SQuADEvaluator, SQuADVersion
2 |
3 | __all__ = ["SQuADEvaluator", "SQuADVersion"]
4 |
--------------------------------------------------------------------------------
/sotabencheval/question_answering/evaluate_v11.py:
--------------------------------------------------------------------------------
1 | """ Official evaluation script for v1.1 of the SQuAD dataset. """
2 | from __future__ import print_function
3 | from collections import Counter
4 | import string
5 | import re
6 | import argparse
7 | import json
8 | import sys
9 |
10 |
11 | def normalize_answer(s):
12 | """Lower text and remove punctuation, articles and extra whitespace."""
13 | def remove_articles(text):
14 | return re.sub(r'\b(a|an|the)\b', ' ', text)
15 |
16 | def white_space_fix(text):
17 | return ' '.join(text.split())
18 |
19 | def remove_punc(text):
20 | exclude = set(string.punctuation)
21 | return ''.join(ch for ch in text if ch not in exclude)
22 |
23 | def lower(text):
24 | return text.lower()
25 |
26 | return white_space_fix(remove_articles(remove_punc(lower(s))))
27 |
28 |
29 | def f1_score(prediction, ground_truth):
30 | prediction_tokens = normalize_answer(prediction).split()
31 | ground_truth_tokens = normalize_answer(ground_truth).split()
32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
33 | num_same = sum(common.values())
34 | if num_same == 0:
35 | return 0
36 | precision = 1.0 * num_same / len(prediction_tokens)
37 | recall = 1.0 * num_same / len(ground_truth_tokens)
38 | f1 = (2 * precision * recall) / (precision + recall)
39 | return f1
40 |
41 |
42 | def exact_match_score(prediction, ground_truth):
43 | return (normalize_answer(prediction) == normalize_answer(ground_truth))
44 |
45 |
46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths):
47 | scores_for_ground_truths = []
48 | for ground_truth in ground_truths:
49 | score = metric_fn(prediction, ground_truth)
50 | scores_for_ground_truths.append(score)
51 | return max(scores_for_ground_truths)
52 |
53 |
54 | def evaluate(dataset, predictions):
55 | f1 = exact_match = total = 0
56 | for article in dataset:
57 | for paragraph in article['paragraphs']:
58 | for qa in paragraph['qas']:
59 | total += 1
60 | if qa['id'] not in predictions:
61 | message = 'Unanswered question ' + qa['id'] + \
62 | ' will receive score 0.'
63 | print(message, file=sys.stderr)
64 | continue
65 | ground_truths = list(map(lambda x: x['text'], qa['answers']))
66 | prediction = predictions[qa['id']]
67 | exact_match += metric_max_over_ground_truths(
68 | exact_match_score, prediction, ground_truths)
69 | f1 += metric_max_over_ground_truths(
70 | f1_score, prediction, ground_truths)
71 |
72 | exact_match = 100.0 * exact_match / total
73 | f1 = 100.0 * f1 / total
74 |
75 | return {'exact_match': exact_match, 'f1': f1}
76 |
77 |
78 | if __name__ == '__main__':
79 | expected_version = '1.1'
80 | parser = argparse.ArgumentParser(
81 | description='Evaluation for SQuAD ' + expected_version)
82 | parser.add_argument('dataset_file', help='Dataset file')
83 | parser.add_argument('prediction_file', help='Prediction File')
84 | args = parser.parse_args()
85 | with open(args.dataset_file) as dataset_file:
86 | dataset_json = json.load(dataset_file)
87 | if (dataset_json['version'] != expected_version):
88 | print('Evaluation expects v-' + expected_version +
89 | ', but got dataset with v-' + dataset_json['version'],
90 | file=sys.stderr)
91 | dataset = dataset_json['data']
92 | with open(args.prediction_file) as prediction_file:
93 | predictions = json.load(prediction_file)
94 | print(json.dumps(evaluate(dataset, predictions)))
95 |
--------------------------------------------------------------------------------
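A quick worked example of the two metrics above: `normalize_answer` lower-cases and strips punctuation and articles, so a prediction can match a gold answer that differs only in those respects.

    from sotabencheval.question_answering.evaluate_v11 import (
        exact_match_score, f1_score, metric_max_over_ground_truths)

    ground_truths = ["Denver Broncos", "The Denver Broncos"]
    prediction = "the denver broncos."

    # normalize_answer drops "the", punctuation and case, so both metrics are perfect here.
    print(metric_max_over_ground_truths(exact_match_score, prediction, ground_truths))  # True (counts as 1)
    print(metric_max_over_ground_truths(f1_score, prediction, ground_truths))           # 1.0

    # A partial overlap gives a fractional F1:
    print(f1_score("Denver", "Denver Broncos"))   # precision 1.0, recall 0.5 -> F1 ~ 0.667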
/sotabencheval/question_answering/evaluate_v20.py:
--------------------------------------------------------------------------------
1 | """Official evaluation script for SQuAD version 2.0.
2 |
3 | In addition to basic functionality, we also compute additional statistics and
4 | plot precision-recall curves if an additional na_prob.json file is provided.
5 | This file is expected to map question ID's to the model's predicted probability
6 | that a question is unanswerable.
7 | """
8 | import argparse
9 | import collections
10 | import json
11 | import numpy as np
12 | import os
13 | import re
14 | import string
15 | import sys
16 |
17 | OPTS = None
18 |
19 | def parse_args():
20 | parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
21 | parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
22 | parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
23 | parser.add_argument('--out-file', '-o', metavar='eval.json',
24 | help='Write accuracy metrics to file (default is stdout).')
25 | parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json',
26 | help='Model estimates of probability of no answer.')
27 | parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0,
28 | help='Predict "" if no-answer probability exceeds this (default = 1.0).')
29 | parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None,
30 | help='Save precision-recall curves to directory.')
31 | parser.add_argument('--verbose', '-v', action='store_true')
32 | if len(sys.argv) == 1:
33 | parser.print_help()
34 | sys.exit(1)
35 | return parser.parse_args()
36 |
37 | def make_qid_to_has_ans(dataset):
38 | qid_to_has_ans = {}
39 | for article in dataset:
40 | for p in article['paragraphs']:
41 | for qa in p['qas']:
42 | qid_to_has_ans[qa['id']] = bool(qa['answers'])
43 | return qid_to_has_ans
44 |
45 | def normalize_answer(s):
46 | """Lower text and remove punctuation, articles and extra whitespace."""
47 | def remove_articles(text):
48 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
49 | return re.sub(regex, ' ', text)
50 | def white_space_fix(text):
51 | return ' '.join(text.split())
52 | def remove_punc(text):
53 | exclude = set(string.punctuation)
54 | return ''.join(ch for ch in text if ch not in exclude)
55 | def lower(text):
56 | return text.lower()
57 | return white_space_fix(remove_articles(remove_punc(lower(s))))
58 |
59 | def get_tokens(s):
60 | if not s: return []
61 | return normalize_answer(s).split()
62 |
63 | def compute_exact(a_gold, a_pred):
64 | return int(normalize_answer(a_gold) == normalize_answer(a_pred))
65 |
66 | def compute_f1(a_gold, a_pred):
67 | gold_toks = get_tokens(a_gold)
68 | pred_toks = get_tokens(a_pred)
69 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
70 | num_same = sum(common.values())
71 | if len(gold_toks) == 0 or len(pred_toks) == 0:
72 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
73 | return int(gold_toks == pred_toks)
74 | if num_same == 0:
75 | return 0
76 | precision = 1.0 * num_same / len(pred_toks)
77 | recall = 1.0 * num_same / len(gold_toks)
78 | f1 = (2 * precision * recall) / (precision + recall)
79 | return f1
80 |
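# Note on the no-answer convention implemented above: F1 for an unanswerable
# question is all-or-nothing, because an empty gold answer and an empty
# prediction both normalize to an empty token list. For example:
#   compute_f1('', '')                           -> 1    (correctly predicts "no answer")
#   compute_f1('', 'some answer')                -> 0    (answers an unanswerable question)
#   compute_f1('late 1990s', 'the late 1990s')   -> 1.0  (articles, case and punctuation are stripped)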
81 | def get_raw_scores(dataset, preds):
82 | exact_scores = {}
83 | f1_scores = {}
84 | for article in dataset:
85 | for p in article['paragraphs']:
86 | for qa in p['qas']:
87 | qid = qa['id']
88 | gold_answers = [a['text'] for a in qa['answers']
89 | if normalize_answer(a['text'])]
90 | if not gold_answers:
91 | # For unanswerable questions, only correct answer is empty string
92 | gold_answers = ['']
93 | if qid not in preds:
94 | print('Missing prediction for %s' % qid)
95 | continue
96 | a_pred = preds[qid]
97 | # Take max over all gold answers
98 | exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers)
99 | f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers)
100 | return exact_scores, f1_scores
101 |
102 | def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
103 | new_scores = {}
104 | for qid, s in scores.items():
105 | pred_na = na_probs[qid] > na_prob_thresh
106 | if pred_na:
107 | new_scores[qid] = float(not qid_to_has_ans[qid])
108 | else:
109 | new_scores[qid] = s
110 | return new_scores
111 |
112 | def make_eval_dict(exact_scores, f1_scores, qid_list=None):
113 | if not qid_list:
114 | total = len(exact_scores)
115 | return collections.OrderedDict([
116 | ('exact', 100.0 * sum(exact_scores.values()) / total),
117 | ('f1', 100.0 * sum(f1_scores.values()) / total),
118 | ('total', total),
119 | ])
120 | else:
121 | total = len(qid_list)
122 | return collections.OrderedDict([
123 | ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total),
124 | ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total),
125 | ('total', total),
126 | ])
127 |
128 | def merge_eval(main_eval, new_eval, prefix):
129 | for k in new_eval:
130 | main_eval['%s_%s' % (prefix, k)] = new_eval[k]
131 |
132 | def plot_pr_curve(precisions, recalls, out_image, title):
133 | plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
134 | plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
135 | plt.xlabel('Recall')
136 | plt.ylabel('Precision')
137 | plt.xlim([0.0, 1.05])
138 | plt.ylim([0.0, 1.05])
139 | plt.title(title)
140 | plt.savefig(out_image)
141 | plt.clf()
142 |
143 | def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
144 | out_image=None, title=None):
145 | qid_list = sorted(na_probs, key=lambda k: na_probs[k])
146 | true_pos = 0.0
147 | cur_p = 1.0
148 | cur_r = 0.0
149 | precisions = [1.0]
150 | recalls = [0.0]
151 | avg_prec = 0.0
152 | for i, qid in enumerate(qid_list):
153 | if qid_to_has_ans[qid]:
154 | true_pos += scores[qid]
155 | cur_p = true_pos / float(i+1)
156 | cur_r = true_pos / float(num_true_pos)
157 | if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]:
158 | # i.e., if we can put a threshold after this point
159 | avg_prec += cur_p * (cur_r - recalls[-1])
160 | precisions.append(cur_p)
161 | recalls.append(cur_r)
162 | if out_image:
163 | plot_pr_curve(precisions, recalls, out_image, title)
164 | return {'ap': 100.0 * avg_prec}
165 |
166 | def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
167 | qid_to_has_ans, out_image_dir):
168 | if out_image_dir and not os.path.exists(out_image_dir):
169 | os.makedirs(out_image_dir)
170 | num_true_pos = sum(1 for v in qid_to_has_ans.values() if v)
171 | if num_true_pos == 0:
172 | return
173 | pr_exact = make_precision_recall_eval(
174 | exact_raw, na_probs, num_true_pos, qid_to_has_ans,
175 | out_image=os.path.join(out_image_dir, 'pr_exact.png'),
176 | title='Precision-Recall curve for Exact Match score')
177 | pr_f1 = make_precision_recall_eval(
178 | f1_raw, na_probs, num_true_pos, qid_to_has_ans,
179 | out_image=os.path.join(out_image_dir, 'pr_f1.png'),
180 | title='Precision-Recall curve for F1 score')
181 | oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
182 | pr_oracle = make_precision_recall_eval(
183 | oracle_scores, na_probs, num_true_pos, qid_to_has_ans,
184 | out_image=os.path.join(out_image_dir, 'pr_oracle.png'),
185 | title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)')
186 | merge_eval(main_eval, pr_exact, 'pr_exact')
187 | merge_eval(main_eval, pr_f1, 'pr_f1')
188 | merge_eval(main_eval, pr_oracle, 'pr_oracle')
189 |
190 | def histogram_na_prob(na_probs, qid_list, image_dir, name):
191 | if not qid_list:
192 | return
193 | x = [na_probs[k] for k in qid_list]
194 | weights = np.ones_like(x) / float(len(x))
195 | plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
196 | plt.xlabel('Model probability of no-answer')
197 | plt.ylabel('Proportion of dataset')
198 | plt.title('Histogram of no-answer probability: %s' % name)
199 | plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
200 | plt.clf()
201 |
202 | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
203 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
204 | cur_score = num_no_ans
205 | best_score = cur_score
206 | best_thresh = 0.0
207 | qid_list = sorted(na_probs, key=lambda k: na_probs[k])
208 | for i, qid in enumerate(qid_list):
209 | if qid not in scores: continue
210 | if qid_to_has_ans[qid]:
211 | diff = scores[qid]
212 | else:
213 | if preds[qid]:
214 | diff = -1
215 | else:
216 | diff = 0
217 | cur_score += diff
218 | if cur_score > best_score:
219 | best_score = cur_score
220 | best_thresh = na_probs[qid]
221 | return 100.0 * best_score / len(scores), best_thresh
222 |
223 | def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
224 | best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
225 | best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
226 | main_eval['best_exact'] = best_exact
227 | main_eval['best_exact_thresh'] = exact_thresh
228 | main_eval['best_f1'] = best_f1
229 | main_eval['best_f1_thresh'] = f1_thresh
230 |
231 | def main():
232 | with open(OPTS.data_file) as f:
233 | dataset_json = json.load(f)
234 | dataset = dataset_json['data']
235 | with open(OPTS.pred_file) as f:
236 | preds = json.load(f)
237 | if OPTS.na_prob_file:
238 | with open(OPTS.na_prob_file) as f:
239 | na_probs = json.load(f)
240 | else:
241 | na_probs = {k: 0.0 for k in preds}
242 | qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False
243 | has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
244 | no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
245 | exact_raw, f1_raw = get_raw_scores(dataset, preds)
246 | exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans,
247 | OPTS.na_prob_thresh)
248 | f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans,
249 | OPTS.na_prob_thresh)
250 | out_eval = make_eval_dict(exact_thresh, f1_thresh)
251 | if has_ans_qids:
252 | has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids)
253 | merge_eval(out_eval, has_ans_eval, 'HasAns')
254 | if no_ans_qids:
255 | no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids)
256 | merge_eval(out_eval, no_ans_eval, 'NoAns')
257 | if OPTS.na_prob_file:
258 | find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
259 | if OPTS.na_prob_file and OPTS.out_image_dir:
260 | run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
261 | qid_to_has_ans, OPTS.out_image_dir)
262 | histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
263 | histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
264 | if OPTS.out_file:
265 | with open(OPTS.out_file, 'w') as f:
266 | json.dump(out_eval, f)
267 | else:
268 | print(json.dumps(out_eval, indent=2))
269 |
270 | if __name__ == '__main__':
271 | OPTS = parse_args()
272 | if OPTS.out_image_dir:
273 | import matplotlib
274 | matplotlib.use('Agg')
275 | import matplotlib.pyplot as plt
276 | main()
277 |
278 |
--------------------------------------------------------------------------------
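The token-overlap scoring above is easy to sanity-check in isolation. A minimal sketch, assuming the repository layout so the script is importable as `sotabencheval.question_answering.evaluate_v20`:

    from sotabencheval.question_answering.evaluate_v20 import compute_exact, compute_f1

    gold = "the Norman conquest"
    pred = "Norman conquest of England"

    # normalize_answer lower-cases and strips punctuation/articles, so the
    # normalised strings still differ -> exact match is 0
    print(compute_exact(gold, pred))   # 0
    # two of the four predicted tokens overlap the two gold tokens:
    # precision 0.5, recall 1.0 -> F1 = 2/3
    print(compute_f1(gold, pred))      # 0.666...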
/sotabencheval/question_answering/squad.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.core import BaseEvaluator
2 | from sotabencheval.utils import calculate_batch_hash, change_root_if_server, is_server, get_max_memory_allocated
3 | from sotabencheval.question_answering.utils import *
4 | from typing import Dict
5 | from enum import Enum
6 | from pathlib import Path
7 | import json
8 | import time
9 |
10 | class SQuADVersion(Enum):
11 | V11 = 'v1.1'
12 | V20 = 'v2.0'
13 |
14 |
15 | class SQuADEvaluator(BaseEvaluator):
16 | """Evaluator for Stanford Question Answering Dataset v1.1 and v2.0 benchmarks.
17 |
18 | Examples:
19 | Evaluate a BiDAF model from the AllenNLP repository on SQuAD 1.1 development set:
20 |
21 | .. code-block:: python
22 |
23 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion
24 |
25 | from allennlp.data import DatasetReader
26 | from allennlp.data.iterators import DataIterator
27 | from allennlp.models.archival import load_archive
28 | from allennlp.nn.util import move_to_device
29 |
30 | def load_model(url, batch_size=64):
31 | archive = load_archive(url, cuda_device=0)
32 | model = archive.model
33 | reader = DatasetReader.from_params(archive.config["dataset_reader"])
34 | iterator_params = archive.config["iterator"]
35 | iterator_params["batch_size"] = batch_size
36 | data_iterator = DataIterator.from_params(iterator_params)
37 | data_iterator.index_with(model.vocab)
38 | return model, reader, data_iterator
39 |
40 | def evaluate(model, dataset, data_iterator, evaluator):
41 | model.eval()
42 | evaluator.reset_time()
43 | for batch in data_iterator(dataset, num_epochs=1, shuffle=False):
44 | batch = move_to_device(batch, 0)
45 | predictions = model(**batch)
46 | answers = {metadata['id']: prediction
47 | for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])}
48 | evaluator.add(answers)
49 | if evaluator.cache_exists:
50 | break
51 |
52 | evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)",
53 | paper_arxiv_id="1611.01603", version=SQuADVersion.V11)
54 |
55 | model, reader, data_iter =\
56 | load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz")
57 | dataset = reader.read(evaluator.dataset_path)
58 | evaluate(model, dataset, data_iter, evaluator)
59 | evaluator.save()
60 | print(evaluator.results)
61 | """
62 |
63 | task = "Question Answering"
64 |
65 | def __init__(self,
66 | local_root: str = '.',
67 | dataset_filename: str = None,
68 | model_name: str = None,
69 | paper_arxiv_id: str = None,
70 | paper_pwc_id: str = None,
71 | paper_results: dict = None,
72 | model_description=None,
73 | version: SQuADVersion = SQuADVersion.V20):
74 | """
75 | Creates an evaluator for SQuAD v1.1 or v2.0 Question Answering benchmarks.
76 |
77 | :param local_root: Path to the directory where the dataset files are located locally.
78 | Ignored when run on sotabench server.
79 | :param dataset_filename: Local filename of the JSON file with the SQuAD dataset.
80 | If None, the standard filename is used, based on :param:`version`.
81 | Ignored when run on sotabench server.
82 | :param model_name: The name of the model from the
83 | paper - if you want to link your build to a model from a
84 |             machine learning paper. See the SQuAD benchmark pages for model names
85 |             (e.g. https://sotabench.com/benchmarks/question-answering-on-squad11-dev),
86 |             under the paper leaderboard or "models yet to try" tabs.
87 | :param paper_arxiv_id: Optional linking to arXiv if you
88 | want to link to papers on the leaderboard; put in the
89 | corresponding paper's arXiv ID, e.g. '1907.10529'.
90 | :param paper_pwc_id: Optional linking to Papers With Code;
91 | put in the corresponding papers with code URL slug, e.g.
92 | 'spanbert-improving-pre-training-by'
93 | :param paper_results: If the paper model you are reproducing
94 | does not have model results on sotabench.com, you can specify
95 | the paper results yourself through this argument, where keys
96 | are metric names, values are metric values. e.g:
97 |
98 | {'EM': 0.858, 'F1': 0.873}.
99 |
100 | Ensure that the metric names match those on the sotabench
101 | leaderboard - for SQuAD benchmarks it should be `EM` for exact match
102 | and `F1` for F1 score. Make sure to use results of evaluation on a development set.
103 | :param model_description: Optional model description.
104 | :param version: Which dataset to evaluate on, either `SQuADVersion.V11` or `SQuADVersion.V20`.
105 | """
106 | super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
107 | self.root = change_root_if_server(root=local_root,
108 | server_root=".data/nlp/squad")
109 | self.version = version
110 | if dataset_filename is None or is_server():
111 | dataset_filename = "dev-{}.json".format(version.value)
112 | self.dataset_path = Path(self.root) / dataset_filename
113 |
114 | self.metrics = SQuADMetrics(self.dataset_path, version)
115 |
116 | def add(self, answers: Dict[str, str]):
117 | """
118 | Updates the evaluator with new results
119 |
120 | :param answers: a dictionary, where keys are question ids and values are text answers.
121 | For unanswerable questions (SQuAD v2.0) the answer should be an empty string.
122 |
123 | Examples:
124 | Update the evaluator with two results:
125 |
126 | .. code-block:: python
127 |
128 | my_evaluator.add({
129 | "57296d571d04691400779413": "itself",
130 | "5a89117e19b91f001a626f2d": ""
131 | })
132 | """
133 |
134 | self.metrics.add(answers)
135 |
136 | if not self.first_batch_processed and self.metrics.has_data:
137 | self.batch_hash = calculate_batch_hash(
138 | self.cache_values(answers=self.metrics.answers,
139 | metrics=self.metrics.get_results(ignore_missing=True))
140 | )
141 | self.first_batch_processed = True
142 |
143 | def reset(self):
144 | """
145 | Removes already added answers
146 |
147 |         When checking whether the model should be rerun on the whole dataset, it is first run on a smaller
148 |         subset and the results are compared with values cached on the sotabench server (the check is not
149 |         performed when running locally). Ideally, the smaller subset is just the first batch, so no additional
150 | computation is needed. However, for more complex multistage pipelines it may be simpler to
151 | run the model twice - on a small dataset and (if necessary) on the full dataset. In that case
152 | :func:`reset` needs to be called before the second run so values from the first run are not reported.
153 |
154 | .. seealso:: :func:`cache_exists`
155 | .. seealso:: :func:`reset_time`
156 | """
157 |
158 | self.metrics.reset()
159 | self.reset_time()
160 |
161 | def get_results(self):
162 | """
163 | Gets the results for the evaluator.
164 |
165 | :return: dict with `EM` (exact match score) and `F1`.
166 | """
167 |
168 | if self.cached_results:
169 | return self.results
170 | self.results = self.metrics.get_results()
171 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
172 |
173 | return self.results
174 |
175 | def save(self):
176 | dataset = "SQuAD{} dev".format(self.metrics.version.value[1:])
177 |
178 | if not self.cached_results:
179 | exec_speed = (time.time() - self.init_time)
180 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(self.metrics.answers) / exec_speed
181 | self.speed_mem_metrics['Tasks'] = len(self.metrics.answers)
182 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
183 | else:
184 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
185 | self.speed_mem_metrics['Tasks'] = None
186 | self.speed_mem_metrics['Evaluation Time'] = None
187 |
188 | return super().save(dataset=dataset)
189 |
190 |
191 | # todo: aggregate batches so that size of the batch used for caching does not depend on evaluation batch size
192 | CACHE_BATCH_SIZE = 1024
193 |
194 |
195 | class SQuADMetrics:
196 | def __init__(self, dataset_path: Path, version: SQuADVersion = SQuADVersion.V20):
197 | self.version = version
198 | self.answers = {}
199 | self._dataset = self._load_dataset(dataset_path)
200 | self._results = None
201 |
202 | def _load_dataset(self, path):
203 | with open(path, 'rt') as f:
204 | ds = json.load(f)
205 | if 'version' not in ds or 'data' not in ds:
206 | raise ValueError("Incorrect dataset format, either 'version' or 'data' is missing")
207 | version = ds['version'].strip().lower()
208 | if version and version[0] != 'v':
209 | version = 'v'+version
210 | if self.version.value != version:
211 | raise ValueError("Incorrect dataset version, found {} but was expecting {}"
212 | .format(version, self.version.value))
213 | return ds['data']
214 |
215 | def reset(self):
216 | self._results = None
217 | self.answers = {}
218 |
219 | def add(self, answers: Dict[str, str]):
220 | if not answers:
221 | print("Empty batch added to results")
222 | return
223 | if set(self.answers.keys()) & set(answers.keys()):
224 | print("Multiple predictions for a single question")
225 |
226 | self.answers.update(answers)
227 |
228 | def evaluate(self, ignore_missing=False):
229 | if ignore_missing:
230 | dataset = [{'paragraphs': [
231 | {'qas': [qa for qa in paragraph['qas'] if qa['id'] in self.answers]}
232 | for paragraph in article['paragraphs']
233 | ]} for article in self._dataset]
234 | else:
235 | dataset = self._dataset
236 | if self.version == SQuADVersion.V11:
237 | eval_fn = evaluate_v11
238 | else:
239 | eval_fn = evaluate_v20
240 | results = eval_fn(dataset, self.answers)
241 | self._results = {
242 | 'EM': results['exact_match'] / 100.0,
243 | 'F1': results['f1'] / 100.0
244 | }
245 |
246 | @property
247 | def has_data(self):
248 | return bool(self.answers)
249 |
250 | def get_results(self, ignore_missing=False):
251 | self.evaluate(ignore_missing)
252 |
253 | return self._results
254 |
--------------------------------------------------------------------------------
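Beyond the AllenNLP example in the docstring, the evaluator only needs a dict of answers, so precomputed predictions can be scored as well. A minimal sketch: `my_predictions` is a hypothetical `{question_id: answer_text}` dict loaded elsewhere, and the batching helper exists only to exercise the caching logic:

    from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion

    def in_batches(predictions, size=1024):
        items = list(predictions.items())
        for i in range(0, len(items), size):
            yield dict(items[i:i + size])

    evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="My model",
                               version=SQuADVersion.V20)

    evaluator.reset_time()
    for batch in in_batches(my_predictions):   # my_predictions: {qid: answer}
        evaluator.add(batch)
        if evaluator.cache_exists:
            break
    evaluator.save()
    print(evaluator.results)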
/sotabencheval/question_answering/utils.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.question_answering.evaluate_v11 import evaluate as evaluate_v11
2 | from sotabencheval.question_answering.evaluate_v20 import get_raw_scores
3 |
4 | __all__ = ["evaluate_v11", "evaluate_v20"]
5 |
6 |
7 | def evaluate_v20(dataset, predictions):
8 | exact_scores, f1_scores = get_raw_scores(dataset, predictions)
9 | total = sum([len(p['qas']) for article in dataset for p in article['paragraphs']])
10 | exact_match = 100.0 * sum(exact_scores.values()) / total
11 | f1 = 100.0 * sum(f1_scores.values()) / total
12 | return {'exact_match': exact_match, 'f1': f1}
13 |
--------------------------------------------------------------------------------
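A tiny worked example of the v2.0 wrapper on an in-memory dataset; both the answerable and the unanswerable question are answered correctly, so EM and F1 are both 100:

    from sotabencheval.question_answering.utils import evaluate_v20

    dataset = [{'paragraphs': [{'qas': [
        {'id': 'q1', 'answers': [{'text': 'Paris'}]},   # answerable
        {'id': 'q2', 'answers': []},                    # unanswerable (v2.0)
    ]}]}]
    predictions = {'q1': 'Paris', 'q2': ''}

    print(evaluate_v20(dataset, predictions))
    # {'exact_match': 100.0, 'f1': 100.0}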
/sotabencheval/semantic_segmentation/__init__.py:
--------------------------------------------------------------------------------
1 | __all__ = ["PASCALVOCEvaluator"]
2 |
3 | from sotabencheval.semantic_segmentation.ade20k import ADE20KEvaluator
4 | from sotabencheval.semantic_segmentation.pascalvoc import PASCALVOCEvaluator
--------------------------------------------------------------------------------
/sotabencheval/semantic_segmentation/ade20k.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sotabenchapi.client import Client
3 | from sotabenchapi.core import BenchmarkResult, check_inputs
4 | import time
5 |
6 | from sotabencheval.utils import calculate_batch_hash, is_server, get_max_memory_allocated
7 | from sotabencheval.semantic_segmentation.utils import ConfusionMatrix
8 |
9 |
10 | class ADE20KEvaluator(object):
11 | """`ADE20K `_ benchmark.
12 |
13 | Examples:
14 | Evaluate a HRNetV2 model from the CSAILVision repository
15 |
16 | .. code-block:: python
17 |
18 | ...
19 |
20 | evaluator = ADE20KEvaluator(model_name='HRNetV2 (HRNetV2-W48)', paper_arxiv_id='1904.04514')
21 |
22 | for batch_data in loader:
23 | # process data
24 | batch_data = batch_data[0]
25 | seg_label = as_numpy(batch_data['seg_label'][0])
26 | img_resized_list = batch_data['img_data']
27 |
28 | torch.cuda.synchronize()
29 | tic = time.perf_counter()
30 | with torch.no_grad():
31 | segSize = (seg_label.shape[0], seg_label.shape[1])
32 | scores = torch.zeros(1, cfg.DATASET.num_class, segSize[0], segSize[1])
33 | scores = async_copy_to(scores, gpu)
34 |
35 | for img in img_resized_list:
36 | feed_dict = batch_data.copy()
37 | feed_dict['img_data'] = img
38 | del feed_dict['img_ori']
39 | del feed_dict['info']
40 | feed_dict = async_copy_to(feed_dict, gpu)
41 |
42 | # forward pass
43 | scores_tmp = segmentation_module(feed_dict, segSize=segSize)
44 | scores = scores + scores_tmp / len(cfg.DATASET.imgSizes)
45 |
46 | _, pred = torch.max(scores, dim=1)
47 | pred = as_numpy(pred.squeeze(0).cpu())
48 |
49 | torch.cuda.synchronize()
50 |
51 |                 evaluator.add(outputs=pred.flatten(),
52 |                               targets=seg_label.flatten())
53 |
54 | if evaluator.cache_exists:
55 | break
56 |
57 | evaluator.save()
58 | """
59 |
60 | task = "Semantic Segmentation"
61 |
62 | def __init__(self,
63 | model_name: str = None,
64 | paper_arxiv_id: str = None,
65 | paper_pwc_id: str = None,
66 | paper_results: dict = None,
67 | model_description=None):
68 | """Initializes a COCO Evaluator object
69 |
70 | Args:
71 | model_name (str, optional): The name of the model from the
72 | paper - if you want to link your build to a machine learning
73 | paper. See the ADE20K benchmark page for model names,
74 | https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val,
75 | e.g. on the paper leaderboard tab.
76 | paper_arxiv_id (str, optional): Optional linking to arXiv if you
77 | want to link to papers on the leaderboard; put in the
78 | corresponding paper's arXiv ID, e.g. '1611.05431'.
79 | paper_pwc_id (str, optional): Optional linking to Papers With Code;
80 | put in the corresponding papers with code URL slug, e.g.
81 | 'u-gat-it-unsupervised-generative-attentional'
82 | paper_results (dict, optional) : If the paper you are reproducing
83 | does not have model results on sotabench.com, you can specify
84 | the paper results yourself through this argument, where keys
85 | are metric names, values are metric values. e.g::
86 |
87 |                 {'Mean IOU': 0.4566, 'Accuracy': 0.543}.
88 |
89 | Ensure that the metric names match those on the sotabench
90 |                 leaderboard - for ADE20K it should be 'Mean IOU', 'Accuracy'
91 | model_description (str, optional): Optional model description.
92 |
93 | """
94 |
95 | # Model metadata
96 |
97 | self.model_name = model_name
98 | self.paper_arxiv_id = paper_arxiv_id
99 | self.paper_pwc_id = paper_pwc_id
100 | self.paper_results = paper_results
101 | self.model_description = model_description
102 |
103 | self.ade20k_evaluator = ConfusionMatrix(150)
104 |
105 | self.outputs = np.array([])
106 | self.targets = np.array([])
107 |
108 | self.results = None
109 |
110 | # Backend variables for hashing and caching
111 |
112 | self.first_batch_processed = False
113 | self.batch_hash = None
114 | self.cached_results = False
115 |
116 | # Speed and memory metrics
117 |
118 | self.init_time = time.time()
119 | self.speed_mem_metrics = {}
120 |
121 | @property
122 | def cache_exists(self):
123 | """
124 | Checks whether the cache exists in the sotabench.com database - if so
125 | then sets self.results to cached results and returns True.
126 |
127 | You can use this property for control flow to break a for loop over a dataset
128 | after the first iteration. This prevents rerunning the same calculation for the
129 | same model twice.
130 |
131 | Examples:
132 | Breaking a for loop
133 |
134 | .. code-block:: python
135 |
136 | ...
137 |
138 | with torch.no_grad():
139 | for i, (input, target) in enumerate(iterator):
140 | ...
141 | output = model(input)
142 | # output and target should then be flattened into 1D np.ndarrays and passed in below
143 |                         evaluator.add(outputs=output, targets=target)
144 |
145 | if evaluator.cache_exists:
146 | break
147 |
148 | evaluator.save()
149 |
150 | :return: bool or None (if not in check mode)
151 | """
152 | if not self.first_batch_processed:
153 | raise ValueError('No batches of data have been processed so no batch_hash exists')
154 |
155 | if not is_server():
156 | return None
157 |
158 | client = Client.public()
159 | cached_res = client.get_results_by_run_hash(self.batch_hash)
160 | if cached_res:
161 | self.results = cached_res
162 | self.cached_results = True
163 | print(
164 | "No model change detected (using the first batch run "
165 | "hash). Will use cached results."
166 | )
167 | return True
168 |
169 | return False
170 |
171 | def add(self, outputs: np.ndarray, targets: np.ndarray):
172 | """
173 | Update the evaluator with new results from the model
174 |
175 | :param outputs (np.ndarray): 1D np.ndarray of semantic class predictions per pixel
176 | :param targets (np.ndarray): 1D np.ndarray of ground truth semantic classes per pixel
177 |
178 | The method requires an outputs input and a targets input - both flattened.
179 |
180 | Suppose you are making predictions, batch by batch, and have your model outputs
181 | and the original targets with batch_size 32, and image size 520 x 480.
182 | The shape of your outputs might look like this:
183 |
184 | batch_output.shape
185 |             >> (32, 150, 520, 480) # where 150 is the number of ADE20K classes
186 |
187 | batch_target.shape
188 | >> (32, 520, 480)
189 |
190 | We can flatten the entire output and targets to 1D vectors for each pixel:
191 |
192 | flattened_batch_output.shape
193 | >> (7987200) # flatten by taking the max class prediction
194 | # (batch_output.argmax(1).flatten() in torch with class as second dimension)
195 |
196 | flattened_batch_target.shape
197 | >> (7987200) # (batch_target.flatten() in torch)
198 |
199 | The output might look something like this:
200 |
201 | flattened_batch_output
202 | >> array([6, 6, 6, 6, 6, ...])
203 |
204 | flattened_batch_target
205 | >> array([6, 6, 6, 6, 6, ...])
206 |
207 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5
208 | pixels - so the model is correct.
209 |
210 | These flattened arrays can then be passed into the .add() method of the evaluator
211 |
212 | .. code-block:: python
213 |
214 |             my_evaluator.add(outputs=flattened_batch_output,
215 |                              targets=flattened_batch_target)
216 |
217 |
218 | :return: void - updates self.ade20k_evaluator with the data, and updates self.targets and self.outputs
219 | """
220 | self.ade20k_evaluator.update(targets, outputs)
221 |
222 | self.targets = np.append(self.targets, targets)
223 | self.outputs = np.append(self.outputs, outputs)
224 |
225 | if not self.first_batch_processed:
226 | acc_global, acc, iu = self.ade20k_evaluator.compute()
227 | self.batch_hash = calculate_batch_hash(np.append(
228 | np.append(np.around(targets, 3), np.around(outputs, 3)),
229 | np.around(np.array([acc_global.item(), iu.mean().item()]), 3)))
230 | self.first_batch_processed = True
231 |
232 | def get_results(self):
233 | """
234 | Reruns the evaluation using the accumulated detections, returns ADE20K results with IOU and
235 | Accuracy metrics
236 |
237 | :return: dict with ADE20K metrics
238 | """
239 | if self.cached_results:
240 | return self.results
241 |
242 | self.ade20k_evaluator = ConfusionMatrix(150)
243 | self.ade20k_evaluator.update(self.targets.astype(np.int64), self.outputs.astype(np.int64))
244 |
245 | acc_global, acc, iu = self.ade20k_evaluator.compute()
246 |
247 | self.results = {
248 | "Accuracy": acc_global.item(),
249 | "Mean IOU": iu.mean().item(),
250 | }
251 |
252 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
253 |
254 | return self.results
255 |
256 | def reset_time(self):
257 | """
258 |         Simple method to reset the timer self.init_time. Often used before the evaluation loop, so the
259 |         evaluation is timed appropriately.
260 |
261 | :return: void - resets self.init_time
262 | """
263 | self.init_time = time.time()
264 |
265 | def save(self):
266 | """
267 | Calculate results and then put into a BenchmarkResult object
268 |
269 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded
270 | on the platform.
271 |
272 | :return: BenchmarkResult object with results and metadata
273 | """
274 |
275 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
276 | self.get_results()
277 |
278 | # If this is the first time the model is run, then we record evaluation time information
279 |
280 | if not self.cached_results:
281 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
282 | self.speed_mem_metrics['Tasks'] = None
283 | self.speed_mem_metrics['Evaluation Time'] = (time.time() - self.init_time)
284 | else:
285 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
286 | self.speed_mem_metrics['Tasks'] = None
287 | self.speed_mem_metrics['Evaluation Time'] = None
288 |
289 | return BenchmarkResult(
290 | task=self.task,
291 | config={},
292 | dataset='ADE20K val',
293 | results=self.results,
294 | speed_mem_metrics=self.speed_mem_metrics,
295 | model=self.model_name,
296 | model_description=self.model_description,
297 | arxiv_id=self.paper_arxiv_id,
298 | pwc_id=self.paper_pwc_id,
299 | paper_results=self.paper_results,
300 | run_hash=self.batch_hash,
301 | )
302 |
--------------------------------------------------------------------------------
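A short sketch of the flattening that the add() docstring describes, assuming a PyTorch model that outputs (batch, 150, H, W) class scores; the tensors below are random placeholders, only the shapes matter:

    import torch
    from sotabencheval.semantic_segmentation import ADE20KEvaluator

    evaluator = ADE20KEvaluator(model_name='HRNetV2 (HRNetV2-W48)',
                                paper_arxiv_id='1904.04514')

    batch_output = torch.randn(32, 150, 520, 480)          # per-class scores
    batch_target = torch.randint(0, 150, (32, 520, 480))   # ground-truth labels

    # reduce over the class dimension, flatten to 1D and convert to numpy
    evaluator.add(outputs=batch_output.argmax(1).flatten().cpu().numpy(),
                  targets=batch_target.flatten().cpu().numpy())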
/sotabencheval/semantic_segmentation/pascalvoc.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from sotabenchapi.client import Client
3 | from sotabenchapi.core import BenchmarkResult, check_inputs
4 | import time
5 |
6 | from sotabencheval.utils import calculate_batch_hash, is_server, get_max_memory_allocated
7 | from sotabencheval.semantic_segmentation.utils import ConfusionMatrix
8 |
9 |
10 | class PASCALVOCEvaluator(object):
11 | """`PASCAL VOC `_ benchmark.
12 |
13 | Examples:
14 | Evaluate a FCN model from the torchvision repository:
15 |
16 | .. code-block:: python
17 |
18 | ...
19 |
20 | evaluator = PASCALVOCEvaluator(model_name='FCN ResNet-101', paper_arxiv_id='1605.06211')
21 |
22 | with torch.no_grad():
23 | for i, (input, target) in enumerate(iterator):
24 | ...
25 | output = model(input)
26 | # output and target should then be flattened into 1D np.ndarrays and passed in below
27 |                 evaluator.add(outputs=output, targets=target)
28 |
29 | if evaluator.cache_exists:
30 | break
31 |
32 | evaluator.save()
33 | """
34 |
35 | task = "Semantic Segmentation"
36 |
37 | def __init__(self,
38 | model_name: str = None,
39 | paper_arxiv_id: str = None,
40 | paper_pwc_id: str = None,
41 | paper_results: dict = None,
42 | model_description=None):
43 | """Initializes a COCO Evaluator object
44 |
45 | Args:
46 | model_name (str, optional): The name of the model from the
47 | paper - if you want to link your build to a machine learning
48 | paper. See the VOC benchmark page for model names,
49 | https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012-val,
50 | e.g. on the paper leaderboard tab.
51 | paper_arxiv_id (str, optional): Optional linking to arXiv if you
52 | want to link to papers on the leaderboard; put in the
53 | corresponding paper's arXiv ID, e.g. '1611.05431'.
54 | paper_pwc_id (str, optional): Optional linking to Papers With Code;
55 | put in the corresponding papers with code URL slug, e.g.
56 | 'u-gat-it-unsupervised-generative-attentional'
57 | paper_results (dict, optional) : If the paper you are reproducing
58 | does not have model results on sotabench.com, you can specify
59 | the paper results yourself through this argument, where keys
60 | are metric names, values are metric values. e.g::
61 |
62 | {'Mean IOU': 76.42709, 'Accuracy': 95.31, ...}.
63 |
64 | Ensure that the metric names match those on the sotabench
65 | leaderboard - for PASCAL VOC it should be 'Mean IOU', 'Accuracy'
66 | model_description (str, optional): Optional model description.
67 | """
68 |
69 | # Model metadata
70 |
71 | self.model_name = model_name
72 | self.paper_arxiv_id = paper_arxiv_id
73 | self.paper_pwc_id = paper_pwc_id
74 | self.paper_results = paper_results
75 | self.model_description = model_description
76 |
77 | self.voc_evaluator = ConfusionMatrix(21)
78 |
79 | self.outputs = np.array([])
80 | self.targets = np.array([])
81 |
82 | self.results = None
83 |
84 | # Backend variables for hashing and caching
85 |
86 | self.first_batch_processed = False
87 | self.batch_hash = None
88 | self.cached_results = False
89 |
90 | # Speed and memory metrics
91 |
92 | self.init_time = time.time()
93 | self.speed_mem_metrics = {}
94 |
95 | @property
96 | def cache_exists(self):
97 | """
98 | Checks whether the cache exists in the sotabench.com database - if so
99 | then sets self.results to cached results and returns True.
100 |
101 | You can use this property for control flow to break a for loop over a dataset
102 | after the first iteration. This prevents rerunning the same calculation for the
103 | same model twice.
104 |
105 | Examples:
106 | Breaking a for loop
107 |
108 | .. code-block:: python
109 |
110 | ...
111 |
112 | with torch.no_grad():
113 | for i, (input, target) in enumerate(iterator):
114 | ...
115 | output = model(input)
116 | # output and target should then be flattened into 1D np.ndarrays and passed in below
117 |                         evaluator.add(outputs=output, targets=target)
118 |
119 | if evaluator.cache_exists:
120 | break
121 |
122 | evaluator.save()
123 |
124 | :return: bool or None (if not in check mode)
125 | """
126 | if not self.first_batch_processed:
127 | raise ValueError('No batches of data have been processed so no batch_hash exists')
128 |
129 | if not is_server():
130 | return None
131 |
132 | client = Client.public()
133 | cached_res = client.get_results_by_run_hash(self.batch_hash)
134 | if cached_res:
135 | self.results = cached_res
136 | self.cached_results = True
137 | print(
138 | "No model change detected (using the first batch run "
139 | "hash). Will use cached results."
140 | )
141 | return True
142 |
143 | return False
144 |
145 | def add(self, outputs: np.ndarray, targets: np.ndarray):
146 | """
147 | Update the evaluator with new results from the model
148 |
149 | :param outputs (np.ndarray): 1D np.ndarray of semantic class predictions per pixel
150 | :param targets (np.ndarray): 1D np.ndarray of ground truth semantic classes per pixel
151 |
152 | The method requires an outputs input and a targets input - both flattened.
153 |
154 | Suppose you are making predictions, batch by batch, and have your model outputs
155 | and the original targets with batch_size 32, and image size 520 x 480.
156 | The shape of your outputs might look like this:
157 |
158 | batch_output.shape
159 | >> (32, 21, 520, 480) # where 21 is the number of VOC classes
160 |
161 | batch_target.shape
162 | >> (32, 520, 480)
163 |
164 | We can flatten the entire output and targets to 1D vectors for each pixel:
165 |
166 | flattened_batch_output.shape
167 | >> (7987200) # flatten by taking the max class prediction
168 | # (batch_output.argmax(1).flatten() in torch with class as second dimension)
169 |
170 | flattened_batch_target.shape
171 | >> (7987200) # (batch_target.flatten() in torch)
172 |
173 | The output might look something like this:
174 |
175 | flattened_batch_output
176 | >> array([6, 6, 6, 6, 6, ...])
177 |
178 | flattened_batch_target
179 | >> array([6, 6, 6, 6, 6, ...])
180 |
181 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5
182 | pixels - so the model is correct.
183 |
184 | These flattened arrays can then be passed into the .add() method of the evaluator
185 |
186 | .. code-block:: python
187 |
188 |             my_evaluator.add(outputs=flattened_batch_output,
189 |                              targets=flattened_batch_target)
190 |
191 |
192 | :return: void - updates self.voc_evaluator with the data, and updates self.targets and self.outputs
193 | """
194 | self.voc_evaluator.update(targets, outputs)
195 |
196 | self.targets = np.append(self.targets, targets)
197 | self.outputs = np.append(self.outputs, outputs)
198 |
199 | if not self.first_batch_processed:
200 | acc_global, acc, iu = self.voc_evaluator.compute()
201 | self.batch_hash = calculate_batch_hash(np.append(
202 | np.append(np.around(targets, 3), np.around(outputs, 3)),
203 | np.around(np.array([acc_global.item(), iu.mean().item()]), 3)))
204 | self.first_batch_processed = True
205 |
206 | def get_results(self):
207 | """
208 | Reruns the evaluation using the accumulated detections, returns VOC results with IOU and
209 | Accuracy metrics
210 |
211 | :return: dict with PASCAL VOC metrics
212 | """
213 | if self.cached_results:
214 | return self.results
215 |
216 | self.voc_evaluator = ConfusionMatrix(21)
217 | self.voc_evaluator.update(self.targets.astype(np.int64), self.outputs.astype(np.int64))
218 |
219 | acc_global, acc, iu = self.voc_evaluator.compute()
220 |
221 | self.results = {
222 | "Accuracy": acc_global.item(),
223 | "Mean IOU": iu.mean().item(),
224 | }
225 |
226 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
227 |
228 | return self.results
229 |
230 | def reset_time(self):
231 | """
232 |         Simple method to reset the timer self.init_time. Often used before the evaluation loop, so the
233 |         evaluation is timed appropriately.
234 |
235 | :return: void - resets self.init_time
236 | """
237 | self.init_time = time.time()
238 |
239 | def save(self):
240 | """
241 | Calculate results and then put into a BenchmarkResult object
242 |
243 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded
244 | on the platform.
245 |
246 | :return: BenchmarkResult object with results and metadata
247 | """
248 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation
249 | self.get_results()
250 |
251 | # If this is the first time the model is run, then we record evaluation time information
252 |
253 | if not self.cached_results:
254 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
255 | self.speed_mem_metrics['Tasks'] = None
256 | self.speed_mem_metrics['Evaluation Time'] = (time.time() - self.init_time)
257 | else:
258 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None
259 | self.speed_mem_metrics['Tasks'] = None
260 | self.speed_mem_metrics['Evaluation Time'] = None
261 |
262 | return BenchmarkResult(
263 | task=self.task,
264 | config={},
265 | dataset='PASCAL VOC 2012 val',
266 | results=self.results,
267 | speed_mem_metrics=self.speed_mem_metrics,
268 | model=self.model_name,
269 | model_description=self.model_description,
270 | arxiv_id=self.paper_arxiv_id,
271 | pwc_id=self.paper_pwc_id,
272 | paper_results=self.paper_results,
273 | run_hash=self.batch_hash,
274 | )
275 |
--------------------------------------------------------------------------------
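Filling out the control flow the class docstring sketches: a hedged end-to-end loop where `model` and `loader` are hypothetical stand-ins for any PASCAL VOC 2012 val setup that yields (N, 21, H, W) scores and (N, H, W) targets:

    import torch
    from sotabencheval.semantic_segmentation import PASCALVOCEvaluator

    evaluator = PASCALVOCEvaluator(model_name='FCN ResNet-101',
                                   paper_arxiv_id='1605.06211')

    evaluator.reset_time()
    with torch.no_grad():
        for input, target in loader:        # hypothetical VOC 2012 val dataloader
            output = model(input.cuda())    # assumed shape (N, 21, H, W)
            evaluator.add(outputs=output.argmax(1).flatten().cpu().numpy(),
                          targets=target.flatten().cpu().numpy())
            if evaluator.cache_exists:
                break

    evaluator.save()
    print(evaluator.get_results())          # {'Accuracy': ..., 'Mean IOU': ...}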
/sotabencheval/semantic_segmentation/utils.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
4 | class ConfusionMatrix(object):
5 | def __init__(self, num_classes):
6 | self.num_classes = num_classes
7 | self.mat = None
8 |
9 | def update(self, a, b):
10 | """
11 | print(a.shape)
12 | print(n.shape)
13 | k = (a >= 0) & (a < n)
14 | inds = n * a[k].to(torch.int64) + b[k]
15 | self.mat += np.bincount(inds, minlength=n ** 2).reshape(n, n)
16 | """
17 | n = self.num_classes
18 |
19 | if self.mat is None:
20 | self.mat = np.zeros((n, n), dtype=np.int64)
21 |
22 | k = (a >= 0) & (a < n)
23 | inds = n * a[k].astype(np.int64) + b[k]
24 | self.mat += np.bincount(inds, minlength=n ** 2).reshape(n, n)
25 |
26 | def reset(self):
27 |         self.mat = None  # re-initialised lazily on the next update()
28 |
29 | def compute(self):
30 | h = self.mat
31 | acc_global = np.diag(h).sum() / h.sum()
32 | acc = np.diag(h) / h.sum(1)
33 | iu = np.diag(h) / (h.sum(1) + h.sum(0) - np.diag(h))
34 | return acc_global, acc, iu
35 |
36 | def __str__(self):
37 | acc_global, acc, iu = self.compute()
38 | return (
39 | "global correct: {:.1f}\n"
40 | "average row correct: {}\n"
41 | "IoU: {}\n"
42 | "mean IoU: {:.1f}"
43 | ).format(
44 | acc_global.item() * 100,
45 | ["{:.1f}".format(i) for i in (acc * 100).tolist()],
46 | ["{:.1f}".format(i) for i in (iu * 100).tolist()],
47 | iu.mean().item() * 100,
48 | )
49 |
50 |
--------------------------------------------------------------------------------
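A worked example of the metrics that compute() returns, small enough to verify by hand:

    import numpy as np
    from sotabencheval.semantic_segmentation.utils import ConfusionMatrix

    cm = ConfusionMatrix(num_classes=2)
    targets = np.array([0, 0, 1, 1])
    outputs = np.array([0, 1, 1, 1])
    cm.update(targets, outputs)

    acc_global, acc, iu = cm.compute()
    print(acc_global)    # 0.75               -- 3 of 4 pixels correct
    print(acc)           # [0.5 1. ]          -- per-class (row-wise) accuracy
    print(iu)            # [0.5 0.66666667]   -- per-class IoU
    print(iu.mean())     # 0.5833...          -- mean IoU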
/sotabencheval/utils.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 | import gzip
3 | import errno
4 | import tarfile
5 | import zipfile
6 | import os
7 | from tqdm import tqdm
8 | from pathlib import Path
9 |
10 |
11 | SOTABENCH_CACHE = Path.home() / ".cache"
12 |
13 |
14 | class AverageMeter(object):
15 | """Computes and stores the average and current value."""
16 |
17 | def __init__(self):
18 | self.val = 0
19 | self.avg = 0
20 | self.sum = 0
21 | self.count = 0
22 |
23 | def reset(self):
24 | self.val = 0
25 | self.avg = 0
26 | self.sum = 0
27 | self.count = 0
28 |
29 | def update(self, val, n=1):
30 | self.val = val
31 | self.sum += val * n
32 | self.count += n
33 | self.avg = self.sum / self.count
34 |
35 |
36 | def calculate_batch_hash(output):
37 | """Calculate the hash for the output of a batch
38 |
39 | Output is passed into this method, stringified, and a hash is taken of the contents. For example,
40 | it could be an list of predictions that is passed in.
41 |
42 | Args:
43 | output: data to be hashed
44 | """
45 | m = hashlib.sha256()
46 | m.update(str(output).encode("utf-8"))
47 | return m.hexdigest()
48 |
49 |
50 | def change_root_if_server(root: str, server_root: str):
51 | """
52 | This method checks whether code is being executed on the sotabench server - if so it returns
53 | server_root, else root. Written as a method so the user doesn't have to fiddle with environmental
54 | variables.
55 |
56 | :param root: (str) a user-specified root
57 | :param server_root: (str) a server root
58 | :return: server_root if SOTABENCH_SERVER env variable is set, else root
59 | """
60 | check_server = os.environ.get("SOTABENCH_SERVER")
61 |
62 | if check_server == 'true':
63 | return server_root
64 |
65 | return root
66 |
67 |
68 | def is_server():
69 | """
70 | Checks whether code is being executed on server; if so, returns True else False.
71 |
72 | Uses env variable SOTABENCH_SERVER to determine whether code is being run on the server.
73 |
74 | You can use this function for your control flow for server specific settings - e.g. the data paths.
75 |
76 | Examples:
77 |
78 | .. code-block:: python
79 |
80 |
81 | from sotabencheval.utils import is_server
82 |
83 | if is_server():
84 | DATA_ROOT = './.data/vision/imagenet'
85 | else: # local settings
86 | DATA_ROOT = '/home/ubuntu/my_data/'
87 |
88 | :return: bool - whether the code is being run on the server or not
89 | """
90 | if os.environ.get("SOTABENCH_SERVER") == 'true':
91 | return True
92 | else:
93 | return False
94 |
95 |
96 | def set_env_on_server(env_name: str, value):
97 | """
98 | If run on sotabench server, sets an environment variable with a given name to value (casted to str).
99 |
100 | :param env_name: (str) environment variable name
101 | :param value: value to set if executed on sotabench
102 | :return: bool - whether code is being run on the server
103 | """
104 | if is_server():
105 | os.environ[env_name] = str(value)
106 | return True
107 | return False
108 |
109 |
110 | def get_max_memory_allocated(device: str = 'cuda'):
111 | """
112 | Finds out the maximum memory allocated, then clears the max memory allocated.
113 |
114 | This currently only works for PyTorch models.
115 |
116 | TODO: Support TensorFlow and MXNet.
117 |
118 | :param device: (str) - name of device (Torch style) -> e.g. 'cuda'
119 | :return: float or None - if torch is in the environment, max memory allocated, else None
120 | """
121 | try:
122 | import torch
123 | max_mem = torch.cuda.max_memory_allocated(device=device)
124 | torch.cuda.reset_max_memory_allocated(device=device)
125 | return max_mem
126 | except ImportError:
127 | return None
128 |
129 | # The utilities below are taken directly from the torchvision repository
130 | # Contains helper functions for unzipping and making directories
131 | # https://github.com/pytorch/vision/tree/master/torchvision
132 |
133 |
134 | def makedir_exist_ok(dirpath):
135 | """
136 | Python2 support for os.makedirs(.., exist_ok=True)
137 | """
138 | try:
139 | os.makedirs(dirpath)
140 | except OSError as e:
141 | if e.errno == errno.EEXIST:
142 | pass
143 | else:
144 | raise
145 |
146 | def gen_bar_updater():
147 | pbar = tqdm(total=None)
148 |
149 | def bar_update(count, block_size, total_size):
150 | if pbar.total is None and total_size:
151 | pbar.total = total_size
152 | progress_bytes = count * block_size
153 | pbar.update(progress_bytes - pbar.n)
154 |
155 | return bar_update
156 |
157 |
158 | def calculate_md5(fpath, chunk_size=1024 * 1024):
159 | md5 = hashlib.md5()
160 | with open(fpath, 'rb') as f:
161 | for chunk in iter(lambda: f.read(chunk_size), b''):
162 | md5.update(chunk)
163 | return md5.hexdigest()
164 |
165 |
166 | def check_md5(fpath, md5, **kwargs):
167 | return md5 == calculate_md5(fpath, **kwargs)
168 |
169 |
170 | def check_integrity(fpath, md5=None):
171 | if not os.path.isfile(fpath):
172 | return False
173 | if md5 is None:
174 | return True
175 | return check_md5(fpath, md5)
176 |
177 |
178 | def download_url(url, root, filename=None, md5=None):
179 | """Download a file from a url and place it in root - utility function taken from torchvision repository
180 | Args:
181 | url (str): URL to download file from
182 | root (str): Directory to place downloaded file in
183 | filename (str, optional): Name to save the file under. If None, use the basename of the URL
184 | md5 (str, optional): MD5 checksum of the download. If None, do not check
185 | """
186 | from six.moves import urllib
187 |
188 | root = os.path.expanduser(root)
189 | if not filename:
190 | filename = os.path.basename(url)
191 | fpath = os.path.join(root, filename)
192 |
193 | makedir_exist_ok(root)
194 |
195 | # downloads file
196 | if check_integrity(fpath, md5):
197 | print('Using downloaded and verified file: ' + fpath)
198 | else:
199 | try:
200 | print('Downloading ' + url + ' to ' + fpath)
201 | urllib.request.urlretrieve(
202 | url, fpath,
203 | reporthook=gen_bar_updater()
204 | )
205 | except (urllib.error.URLError, IOError) as e:
206 | if url[:5] == 'https':
207 | url = url.replace('https:', 'http:')
208 | print('Failed download. Trying https -> http instead.'
209 | ' Downloading ' + url + ' to ' + fpath)
210 | urllib.request.urlretrieve(
211 | url, fpath,
212 | reporthook=gen_bar_updater()
213 | )
214 | else:
215 | raise e
216 |
217 |
218 | def _is_tar(filename):
219 | return filename.endswith(".tar")
220 |
221 |
222 | def _is_targz(filename):
223 | return filename.endswith(".tar.gz")
224 |
225 |
226 | def _is_gzip(filename):
227 | return filename.endswith(".gz") and not filename.endswith(".tar.gz")
228 |
229 |
230 | def _is_zip(filename):
231 | return filename.endswith(".zip")
232 |
233 |
234 | def extract_archive(from_path, to_path=None, remove_finished=False):
235 | if to_path is None:
236 | to_path = os.path.dirname(from_path)
237 |
238 | if _is_tar(from_path):
239 | with tarfile.open(from_path, 'r') as tar:
240 | tar.extractall(path=to_path)
241 | elif _is_targz(from_path):
242 | with tarfile.open(from_path, 'r:gz') as tar:
243 | tar.extractall(path=to_path)
244 | elif _is_gzip(from_path):
245 | to_path = os.path.join(to_path, os.path.splitext(os.path.basename(from_path))[0])
246 | with open(to_path, "wb") as out_f, gzip.GzipFile(from_path) as zip_f:
247 | out_f.write(zip_f.read())
248 | elif _is_zip(from_path):
249 | with zipfile.ZipFile(from_path, 'r') as z:
250 | z.extractall(to_path)
251 | else:
252 | raise ValueError("Extraction of {} not supported".format(from_path))
253 |
254 | if remove_finished:
255 | os.remove(from_path)
256 |
--------------------------------------------------------------------------------
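A brief sketch of the path and download helpers used together; the URL below is a placeholder, not a real dataset location:

    import os
    from sotabencheval.utils import change_root_if_server, download_url, extract_archive

    data_root = change_root_if_server(root='/home/ubuntu/my_data',
                                      server_root='./.data/nlp/squad')

    # placeholder URL -- substitute the real archive for your benchmark
    download_url('https://example.com/dev-v2.0.json.gz', root=data_root,
                 filename='dev-v2.0.json.gz')
    extract_archive(os.path.join(data_root, 'dev-v2.0.json.gz'), to_path=data_root)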
/sotabencheval/version.py:
--------------------------------------------------------------------------------
1 | class Version:
2 | __slots__ = ("major", "minor", "build")
3 |
4 | def __init__(self, major, minor, build):
5 | self.major = major
6 | self.minor = minor
7 | self.build = build
8 |
9 | def __str__(self):
10 | return f"{self.major}.{self.minor}.{self.build}"
11 |
12 | def __repr__(self):
13 | return (
14 | f"Version(major={self.major}, minor={self.minor}, "
15 | f"build={self.build})"
16 | )
17 |
18 | version = Version(0, 0, 38)
19 |
20 | __version__ = str(version)
21 |
--------------------------------------------------------------------------------