├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── docs ├── docs │ ├── ade20k.md │ ├── coco.md │ ├── imagenet.md │ ├── img │ │ ├── ade20k.png │ │ ├── banner.png │ │ ├── coco.jpg │ │ ├── connect.png │ │ ├── connect2.png │ │ ├── examples.png │ │ ├── imagenet.jpeg │ │ ├── language_model.png │ │ ├── pascalvoc2012.png │ │ ├── results.png │ │ ├── sotabencheval.png │ │ └── squad20.png │ ├── index.md │ ├── pascalvoc.md │ ├── squad.md │ ├── wikitext103.md │ └── wmt.md ├── mkdocs.yml └── site │ ├── img │ └── squad20.png │ ├── squad │ └── index.html │ └── wmt │ └── index.html ├── requirements-dev.txt ├── requirements.txt ├── setup.cfg ├── setup.py └── sotabencheval ├── __init__.py ├── core ├── __init__.py ├── cache.py └── evaluator.py ├── image_classification ├── __init__.py ├── imagenet.py └── utils.py ├── language_modelling ├── __init__.py └── wikitext.py ├── machine_translation ├── __init__.py ├── languages.py ├── metrics.py └── wmt.py ├── natural_language_inference ├── __init__.py └── multinli.py ├── object_detection ├── __init__.py ├── coco.py ├── coco_eval.py └── utils.py ├── question_answering ├── __init__.py ├── evaluate_v11.py ├── evaluate_v20.py ├── squad.py └── utils.py ├── semantic_segmentation ├── __init__.py ├── ade20k.py ├── pascalvoc.py └── utils.py ├── utils.py └── version.py /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | *.egg-info 3 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 
39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: help default docs build release clean test check fmt 2 | .DEFAULT_GOAL := help 3 | PROJECT := sotabench-eval 4 | 5 | 6 | help: ## Show help. 7 | @grep -E '^[a-zA-Z2_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[36m%-30s\033[0m %s\n", $$1, $$2}' 8 | 9 | 10 | docs: ## Build documentation. 
11 | @cd docs && make html && open _build/html/index.html 12 | 13 | 14 | build: ## Build the source and wheel distribution packages. 15 | @python3 setup.py sdist bdist_wheel 16 | 17 | 18 | release: build ## Build and upload the package to PyPI. 19 | @twine upload --repository-url https://upload.pypi.org/legacy/ dist/* 20 | @rm -fr build dist sotabench-eval.egg-info 21 | 22 | 23 | clean: ## Cleanup the project 24 | @find . -type d -name __pycache__ -delete 25 | @find . -type f -name "*.py[cod]" -delete 26 | @rm -fr build dist sotabench-eval.egg-info 27 | @rm -fr docs/_build/* 28 | 29 | 30 | test: ## Run tests and code checks. 31 | @py.test -v --cov "$(PROJECT)" "$(PROJECT)" 32 | 33 | 34 | check: ## Run code checks. 35 | @flake8 "$(PROJECT)" 36 | @pydocstyle "$(PROJECT)" 37 | 38 | 39 | fmt: ## Format the code. 40 | @black --target-version=py37 --safe --line-length=79 "$(PROJECT)" 41 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

2 | 3 | -------------------------------------------------------------------------------- 4 | 5 | [![PyPI version](https://badge.fury.io/py/sotabencheval.svg)](https://badge.fury.io/py/sotabencheval) [![Generic badge](https://img.shields.io/badge/Documentation-Here-.svg)](https://paperswithcode.github.io/sotabench-eval/) 6 | 7 | `sotabencheval` is a framework-agnostic library that contains a collection of deep learning benchmarks you can use to benchmark your models. It can be used in conjunction with the [sotabench](https://www.sotabench.com) service to record results for models, so the community can compare model performance on different tasks, and as a continuous-integration-style service for your repository that benchmarks your models on each commit. 8 | 9 | ## Benchmarks Supported 10 | 11 | - [ADE20K](https://paperswithcode.github.io/sotabench-eval/ade20k/) (Semantic Segmentation) 12 | - [COCO](https://paperswithcode.github.io/sotabench-eval/coco/) (Object Detection) 13 | - [ImageNet](https://paperswithcode.github.io/sotabench-eval/imagenet/) (Image Classification) 14 | - [SQuAD](https://paperswithcode.github.io/sotabench-eval/squad/) (Question Answering) 15 | - [WikiText-103](https://paperswithcode.github.io/sotabench-eval/wikitext103/) (Language Modelling) 16 | - [WMT](https://paperswithcode.github.io/sotabench-eval/wmt/) (Machine Translation) 17 | 18 | PRs welcome for further benchmarks! 19 | 20 | ## Installation 21 | 22 | Requires Python 3.6+. 23 | 24 | ```bash 25 | pip install sotabencheval 26 | ``` 27 | 28 | ## Get Benching! 🏋️ 29 | 30 | You should read the [full documentation here](https://paperswithcode.github.io/sotabench-eval/index.html), which contains guidance on getting started and connecting to [sotabench](https://www.sotabench.com). 31 | 32 | Integration is lightweight. For example, if you are evaluating an ImageNet model, you initialize an Evaluator object and (optionally) link to an associated paper: 33 | 34 | ```python 35 | from sotabencheval.image_classification import ImageNetEvaluator 36 | evaluator = ImageNetEvaluator( 37 | model_name='FixResNeXt-101 32x48d', 38 | paper_arxiv_id='1906.06423') 39 | ``` 40 | 41 | Then for each batch of predictions your model makes on ImageNet, pass a dictionary with image IDs as keys and `np.ndarray`s of logits as values to the `evaluator.add` method: 42 | 43 | ```python 44 | evaluator.add(output_dict=dict(zip(image_ids, batch_output))) 45 | ``` 46 | 47 | The evaluation logic just needs to be written in a `sotabench.py` file and sotabench will run it on each commit and record the results: 48 | 49 | 50 | 51 | ## Contributing 52 | 53 | All contributions welcome! 54 | 55 | 56 | 57 | -------------------------------------------------------------------------------- /docs/docs/ade20k.md: -------------------------------------------------------------------------------- 1 | # ADE20K 2 | 3 | ![ADE20K Dataset Examples](img/ade20k.png) 4 | 5 | You can view the ADE20K leaderboard [here](https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val). 6 | 7 | ## Getting Started 8 | 9 | You'll need the following in the root of your repository: 10 | 11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit 12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py` 13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g.
compilation 14 | 15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the ADE20K dataset. For example, 16 | PyTorch users might use torchvision to load the dataset. 17 | 18 | But you will need to record your results for the server, and you'll want to avoid doing things like 19 | downloading the dataset on the server. So you should: 20 | 21 | - **Point to the server ADE20K data paths** - popular datasets are pre-downloaded on the server. 22 | - **Include an Evaluation object** in `sotabench.py` file to record the results. 23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions. 24 | 25 | We explain how to do these various steps below. 26 | 27 | ## Server Data Location 28 | 29 | The ADE20K data is located in the root of your repository on the server at `.data/vision/ade20k`. In this folder is contained: 30 | 31 | - `ADEChallengeData2016.zip` - containing validation images and annotations 32 | 33 | Your local ADE20K files may have a different file directory structure, so you 34 | can use control flow like below to change the data path if the script is being 35 | run on sotabench servers: 36 | 37 | ``` python 38 | from sotabencheval.utils import is_server 39 | 40 | if is_server(): 41 | DATA_ROOT = './.data/vision/ade20k' 42 | else: # local settings 43 | DATA_ROOT = '/home/ubuntu/my_data/' 44 | ``` 45 | 46 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly. 47 | 48 | ## How Do I Initialize an Evaluator? 49 | 50 | Add this to your code - before you start batching over the dataset and making predictions: 51 | 52 | ``` python 53 | from sotabencheval.semantic_segmentation import ADE20KEvaluator 54 | 55 | evaluator = ADE20KEvaluator(model_name='My Super Model') 56 | ``` 57 | 58 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you 59 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val) 60 | then you will enable direct comparison with the paper. For example: 61 | 62 | ``` python 63 | from sotabencheval.semantic_segmentation import ADE20KEvaluator 64 | 65 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065') 66 | ``` 67 | 68 | The above will directly compare with the result of the paper when run on the server. 69 | 70 | ## How Do I Evaluate Predictions? 71 | 72 | The evaluator object has an `.add()` method to submit predictions by batch or in full. 73 | 74 | For ADE20K there are two required arguments: `outputs`, a 1D np.ndarray of semantic class predictions per pixel, 75 | and `targets`, a 1D np.ndarray of ground truth semantic classes per pixel. In other words, it requires flattened 76 | inputs and outputs. 77 | 78 | To elaborate, suppose you are making predictions, batch by batch, and have your model output 79 | and the original targets with batch_size `32`, and image size `(520, 480)`.
The shape of your outputs might look like: 80 | 81 | ``` python 82 | batch_output.shape 83 | >> (32, 150, 520, 480) # where 150 is the number of ADE20K classes 84 | 85 | batch_target.shape 86 | >> (32, 520, 480) 87 | ``` 88 | 89 | We can flatten the entire output and targets to 1D vectors for each pixel: 90 | 91 | ``` python 92 | flattened_batch_output.shape 93 | >> (7987200) # flatten by taking the max class prediction 94 | # (batch_output.argmax(1).flatten() in torch with class as second dimension) 95 | 96 | flattened_batch_target.shape 97 | >> (7987200) # (batch_target.flatten() in torch) 98 | ``` 99 | 100 | The output might look something like this: 101 | 102 | ``` python 103 | flattened_batch_output 104 | >> array([6, 6, 6, 6, 6, ...]) 105 | 106 | flattened_batch_target 107 | >> array([6, 6, 6, 6, 6, ...]) 108 | ``` 109 | 110 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5 111 | pixels - so the model is correct. 112 | 113 | These flattened arrays can then be passed into the `.add()` method of the evaluator: 114 | 115 | ``` python 116 | evaluator.add(outputs=flattened_batch_output, 117 | targets=flattened_batch_target) 118 | ``` 119 | 120 | You can do this all at once in a single call to `add()`, but more naturally, you will 121 | probably loop over the dataset and call the method for the outputs of each batch. 122 | That would look something like this (for a PyTorch example): 123 | 124 | ``` python 125 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065') 126 | 127 | with torch.no_grad(): 128 | for image, target in tqdm.tqdm(data_loader_test): 129 | image, target = image.to('cuda'), target.to('cuda') 130 | output = model(image) 131 | output = output['out'] 132 | 133 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy()) 134 | ``` 135 | 136 | When you are done, you can get the results locally by running: 137 | 138 | ``` python 139 | evaluator.get_results() 140 | ``` 141 | 142 | But for the server you want to save the results by running: 143 | 144 | ``` python 145 | evaluator.save() 146 | ``` 147 | 148 | This method serialises the results and model metadata and stores to the server database. 149 | 150 | ## How Do I Cache Evaluation? 151 | 152 | Sotabench reruns your script on every commit. This is good because it acts like 153 | continuous integration in checking for bugs and changes, but can be annoying 154 | if the model hasn't changed and evaluation is lengthy. 155 | 156 | Fortunately sotabencheval has caching logic that you can use. 157 | 158 | The idea is that after the first batch, we hash the model outputs and the 159 | current metrics and this tells us if the model is the same given the dataset.
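Conceptually, this first-batch fingerprint works something like the sketch below. This is a simplified illustration only, not the library's actual implementation - the real hashing and the comparison against the server-side cache are handled for you by the evaluator:

``` python
import hashlib
import json

import numpy as np

def first_batch_fingerprint(outputs: np.ndarray, metrics: dict) -> str:
    # Combine the raw first-batch predictions with the metrics computed so far
    # and reduce them to a single digest; if the digest matches the one stored
    # from a previous run, the model is assumed to be unchanged.
    payload = outputs.tobytes() + json.dumps(metrics, sort_keys=True).encode("utf-8")
    return hashlib.md5(payload).hexdigest()
```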
160 | You can include hashing within an evaluation loop like follows (in the following 161 | example for a PyTorch repository): 162 | 163 | ``` python 164 | evaluator = ADE20KEvaluator(model_name='OCR (HRNetV2-W48)', paper_arxiv_id='1909.11065') 165 | 166 | with torch.no_grad(): 167 | for image, target in tqdm.tqdm(data_loader_test): 168 | image, target = image.to('cuda'), target.to('cuda') 169 | output = model(image) 170 | output = output['out'] 171 | 172 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy()) 173 | if evaluator.cache_exists: 174 | break 175 | 176 | evaluator.save() 177 | ``` 178 | 179 | If the hash is the same as in the server, we infer that the model hasn't changed, so 180 | we simply return hashed results rather than running the whole evaluation again. 181 | 182 | Caching is very useful if you have large models, or a repository that is evaluating 183 | multiple models, as it speeds up evaluation significantly. 184 | 185 | ## Need More Help? 186 | 187 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench 188 | forums if you have any questions or difficulties. 189 | -------------------------------------------------------------------------------- /docs/docs/coco.md: -------------------------------------------------------------------------------- 1 | # COCO 2 | 3 | ![COCO Dataset Examples](img/coco.jpg) 4 | 5 | You can view the COCO minival leaderboard [here](https://sotabench.com/benchmarks/object-detection-on-coco-minival). 6 | 7 | ## Getting Started 8 | 9 | You'll need the following in the root of your repository: 10 | 11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit 12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py` 13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation 14 | 15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the COCO dataset. For example, 16 | PyTorch users might use torchvision to load the dataset. 17 | 18 | But you will need to record your results for the server, and you'll want to avoid doing things like 19 | downloading the dataset on the server. So you should: 20 | 21 | - **Point to the server COCO data paths** - popular datasets are pre-downloaded on the server. 22 | - **Include an Evaluation object** in `sotabench.py` file to record the results. 23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions. 24 | 25 | We explain how to do these various steps below. 26 | 27 | ## Server Data Location 28 | 29 | The COCO validation data is located in the root of your repository on the server at `.data/vision/coco`. In this folder is contained: 30 | 31 | - `annotations_trainval2017.zip` - containing annotations for the validation images 32 | - `val2017.zip` - containing the validation images 33 | 34 | Your local COCO files may have a different file directory structure, so you 35 | can use control flow like below to change the data path if the script is being 36 | run on sotabench servers: 37 | 38 | ``` python 39 | from sotabencheval.utils import is_server 40 | 41 | if is_server(): 42 | DATA_ROOT = './.data/vision/coco' 43 | else: # local settings 44 | DATA_ROOT = '/home/ubuntu/my_data/' 45 | ``` 46 | 47 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly. 48 | 49 | ## How Do I Initialize an Evaluator? 
50 | 51 | Add this to your code - before you start batching over the dataset and making predictions: 52 | 53 | ``` python 54 | from sotabencheval.object_detection import COCOEvaluator 55 | 56 | evaluator = COCOEvaluator(model_name='My Super Model') 57 | ``` 58 | 59 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you 60 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/object-detection-on-coco-minival) 61 | then you will enable direct comparison with the paper's model. For example: 62 | 63 | ``` python 64 | from sotabencheval.object_detection import COCOEvaluator 65 | 66 | evaluator = COCOEvaluator(model_name='Mask R-CNN', paper_arxiv_id='1703.06870') 67 | ``` 68 | 69 | The above will directly compare with the result of the paper when run on the server. 70 | 71 | ## How Do I Evaluate Predictions? 72 | 73 | The evaluator object has an [.add()](https://github.com/paperswithcode/sotabench-eval/blob/a788d17252913e5f2d24733845de80aec23101fb/sotabencheval/object_detection/coco.py#L187) method to submit predictions by batch or in full. 74 | 75 | For COCO the expected input is a list of dictionaries, where each dictionary contains detection information 76 | that will be used by the [loadRes](https://github.com/paperswithcode/sotabench-eval/blob/a788d17252913e5f2d24733845de80aec23101fb/sotabencheval/object_detection/coco_eval.py#L236) method based on the [pycocotools](https://github.com/cocodataset/cocoapi/tree/master/PythonAPI/pycocotools) API. 77 | 78 | Each detection can take a dictionary 79 | like the following: 80 | 81 | ``` python 82 | {'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844], 83 | 'score': 0.999152421951294, 'category_id': 1} 84 | ``` 85 | 86 | For this benchmark, only bounding box detection ('bbox') is performed at present. 87 | 88 | You can do this all at once in a single call to `add()`, but more naturally, you will 89 | probably loop over the dataset and call the method for the outputs of each batch. 90 | That would look something like this (for a PyTorch example): 91 | 92 | ``` python 93 | ... 94 | 95 | evaluator = COCOEvaluator( 96 | model_name='Mask R-CNN', 97 | paper_arxiv_id='1703.06870') 98 | 99 | with torch.no_grad(): 100 | for i, (input, target) in enumerate(data_loader): 101 | ... 102 | output = model(input) 103 | # potentially formatting of the output here to be a list of dicts 104 | evaluator.add(output) 105 | ``` 106 | 107 | When you are done, you can get the results locally by running: 108 | 109 | ``` python 110 | evaluator.get_results() 111 | ``` 112 | 113 | But for the server you want to save the results by running: 114 | 115 | ``` python 116 | evaluator.save() 117 | ``` 118 | 119 | This method serialises the results and model metadata and stores to the server database. 120 | 121 | ## How Do I Cache Evaluation? 122 | 123 | Sotabench reruns your script on every commit. This is good because it acts like 124 | continuous integration in checking for bugs and changes, but can be annoying 125 | if the model hasn't changed and evaluation is lengthy. 126 | 127 | Fortunately sotabencheval has caching logic that you can use. 128 | 129 | The idea is that after the first batch, we hash the model outputs and the 130 | current metrics and this tells us if the model is the same given the dataset. 
131 | You can include hashing within an evaluation loop like follows (in the following 132 | example for a PyTorch repository): 133 | 134 | ``` python 135 | with torch.no_grad(): 136 | for i, (input, target) in enumerate(data_loader): 137 | ... 138 | output = model(input) 139 | # potentially formatting of the output here to be a list of dicts 140 | evaluator.add(output) 141 | 142 | if evaluator.cache_exists: 143 | break 144 | 145 | evaluator.save() 146 | ``` 147 | 148 | If the hash is the same as in the server, we infer that the model hasn't changed, so 149 | we simply return hashed results rather than running the whole evaluation again. 150 | 151 | Caching is very useful if you have large models, or a repository that is evaluating 152 | multiple models, as it speeds up evaluation significantly. 153 | 154 | ## A Full sotabench.py Example 155 | 156 | Below we show an implementation for a model from the torchvision repository. This 157 | incorporates all the features explained above: (a) using the server data root, 158 | (b) using the COCO Evaluator, and (c) caching the evaluation logic. Note that the 159 | torchbench dependency is just to get some processing logic and transforms; the evaluation 160 | is done with sotabencheval. 161 | 162 | ``` python 163 | import os 164 | import tqdm 165 | import torch 166 | from torch.utils.data import DataLoader 167 | from torchbench.utils import send_model_to_device 168 | from torchbench.object_detection.transforms import Compose, ConvertCocoPolysToMask, ToTensor 169 | import torchvision 170 | import torchbench.datasets  # needed for torchbench.datasets.CocoDetection below 171 | 172 | from sotabencheval.object_detection import COCOEvaluator 173 | from sotabencheval.utils import is_server 174 | 175 | if is_server(): 176 | DATA_ROOT = './.data/vision/coco' 177 | else: # local settings 178 | DATA_ROOT = '/home/ubuntu/my_data/' 179 | 180 | def coco_data_to_device(input, target, device: str = "cuda", non_blocking: bool = True): 181 | input = list(inp.to(device=device, non_blocking=non_blocking) for inp in input) 182 | target = [{k: v.to(device=device, non_blocking=non_blocking) for k, v in t.items()} for t in target] 183 | return input, target 184 | 185 | def coco_collate_fn(batch): 186 | return tuple(zip(*batch)) 187 | 188 | def coco_output_transform(output, target): 189 | output = [{k: v.to("cpu") for k, v in t.items()} for t in output] 190 | return output, target 191 | 192 | transforms = Compose([ConvertCocoPolysToMask(), ToTensor()]) 193 | 194 | model = torchvision.models.detection.__dict__['maskrcnn_resnet50_fpn'](num_classes=91, pretrained=True) 195 | 196 | model, device = send_model_to_device( 197 | model, device='cuda', num_gpu=1 198 | ) 199 | model.eval() 200 | 201 | model_output_transform = coco_output_transform 202 | send_data_to_device = coco_data_to_device 203 | collate_fn = coco_collate_fn 204 | 205 | test_dataset = torchbench.datasets.CocoDetection( 206 | root=os.path.join(DATA_ROOT, "val%s" % '2017'), 207 | annFile=os.path.join( 208 | DATA_ROOT, "annotations/instances_val%s.json" % '2017' 209 | ), 210 | transform=None, 211 | target_transform=None, 212 | transforms=transforms, 213 | download=True, 214 | ) 215 | test_loader = DataLoader( 216 | test_dataset, 217 | batch_size=8, 218 | shuffle=False, 219 | num_workers=4, 220 | pin_memory=True, 221 | collate_fn=collate_fn, 222 | ) 223 | test_loader.no_classes = 91 # Number of classes for COCO Detection 224 | 225 | iterator = tqdm.tqdm(test_loader, desc="Evaluation", mininterval=5) 226 | 227 | evaluator = COCOEvaluator( 228 | root=DATA_ROOT, 229 | model_name='Mask
R-CNN (ResNet-50-FPN)', 230 | paper_arxiv_id='1703.06870') 231 | 232 | def prepare_for_coco_detection(predictions): 233 | coco_results = [] 234 | for original_id, prediction in predictions.items(): 235 | if len(prediction) == 0: 236 | continue 237 | 238 | boxes = prediction["boxes"] 239 | boxes = convert_to_xywh(boxes).tolist() 240 | scores = prediction["scores"].tolist() 241 | labels = prediction["labels"].tolist() 242 | 243 | coco_results.extend( 244 | [ 245 | { 246 | "image_id": original_id, 247 | "category_id": labels[k], 248 | "bbox": box, 249 | "score": scores[k], 250 | } 251 | for k, box in enumerate(boxes) 252 | ] 253 | ) 254 | return coco_results 255 | 256 | def convert_to_xywh(boxes): 257 | xmin, ymin, xmax, ymax = boxes.unbind(1) 258 | return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) 259 | 260 | with torch.no_grad(): 261 | for i, (input, target) in enumerate(iterator): 262 | input, target = send_data_to_device(input, target, device=device) 263 | original_output = model(input) 264 | output, target = model_output_transform(original_output, target) 265 | result = { 266 | tar["image_id"].item(): out for tar, out in zip(target, output) 267 | } 268 | result = prepare_for_coco_detection(result) 269 | 270 | evaluator.add(result) 271 | 272 | if evaluator.cache_exists: 273 | break 274 | 275 | evaluator.save() 276 | ``` 277 | 278 | ## Need More Help? 279 | 280 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench 281 | forums if you have any questions or difficulties. 282 | -------------------------------------------------------------------------------- /docs/docs/imagenet.md: -------------------------------------------------------------------------------- 1 | # ImageNet 2 | 3 | ![ImageNet Dataset Examples](img/imagenet.jpeg) 4 | 5 | You can view the ImageNet leaderboard [here](https://sotabench.com/benchmarks/image-classification-on-imagenet). 6 | 7 | ## Getting Started 8 | 9 | You'll need the following in the root of your repository: 10 | 11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit 12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py` 13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation 14 | 15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the ImageNet dataset. For example, 16 | PyTorch users might use torchvision to load the dataset. 17 | 18 | But you will need to record your results for the server, and you'll want to avoid doing things like 19 | downloading the dataset on the server. So you should: 20 | 21 | - **Point to the server ImageNet data paths** - popular datasets are pre-downloaded on the server. 22 | - **Include an Evaluation object** in `sotabench.py` file to record the results. 23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions. 24 | 25 | We explain how to do these various steps below. 26 | 27 | ## Server Data Location 28 | 29 | The ImageNet validation data is located in the root of your repository on the server at `.data/vision/imagenet`.
In this folder is contained: 30 | 31 | - `ILSVRC2012_devkit_t12.tar.gz` - containing metadata 32 | - `ILSVRC2012_img_val.tar` - containing the validation images 33 | 34 | Your local ImageNet files may have a different file directory structure, so you 35 | can use control flow like below to change the data path if the script is being 36 | run on sotabench servers: 37 | 38 | ``` python 39 | from sotabencheval.utils import is_server 40 | 41 | if is_server(): 42 | DATA_ROOT = './.data/vision/imagenet' 43 | else: # local settings 44 | DATA_ROOT = '/home/ubuntu/my_data/' 45 | ``` 46 | 47 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly. 48 | 49 | ## How Do I Initialize an Evaluator? 50 | 51 | Add this to your code - before you start batching over the dataset and making predictions: 52 | 53 | ``` python 54 | from sotabencheval.image_classification import ImageNetEvaluator 55 | 56 | evaluator = ImageNetEvaluator(model_name='My Super Model') 57 | ``` 58 | 59 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you 60 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/image-classification-on-imagenet) 61 | then you will enable direct comparison with the paper's model. For example: 62 | 63 | ``` python 64 | from sotabencheval.image_classification import ImageNetEvaluator 65 | 66 | evaluator = ImageNetEvaluator(model_name='FixResNeXt-101 32x48d', 67 | paper_arxiv_id='1906.06423') 68 | ``` 69 | 70 | The above will directly compare with the result of the paper when run on the server. 71 | 72 | ## How Do I Evaluate Predictions? 73 | 74 | The evaluator object has an `.add()` method to submit predictions by batch or in full. 75 | 76 | For ImageNet the expected input is a dictionary of outputs, where each key is an 77 | image ID from ImageNet and each value is a list or 1D numpy array of logits for that 78 | image ID. For example: 79 | 80 | ``` python 81 | evaluator.add({'ILSVRC2012_val_00000293': np.array([1.04243, ...]), 82 | 'ILSVRC2012_val_00000294': np.array([-2.3677, ...])}) 83 | ``` 84 | 85 | You can do this all at once in a single call to `add()`, but more naturally, you will 86 | probably loop over the dataset and call the method for the outputs of each batch. 87 | That would look something like this (for a PyTorch example): 88 | 89 | ``` python 90 | for i, (input, target) in enumerate(test_loader): 91 | input = input.to(device='cuda', non_blocking=True) 92 | target = target.to(device='cuda', non_blocking=True) 93 | output = model(input) 94 | 95 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]] 96 | 97 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy())))) 98 | ``` 99 | 100 | When you are done, you can get the results locally by running: 101 | 102 | ``` python 103 | evaluator.get_results() 104 | ``` 105 | 106 | But for the server you want to save the results by running: 107 | 108 | ``` python 109 | evaluator.save() 110 | ``` 111 | 112 | This method serialises the results and model metadata and stores to the server database. 113 | 114 | ## How Do I Cache Evaluation? 115 | 116 | Sotabench reruns your script on every commit. This is good because it acts like 117 | continuous integration in checking for bugs and changes, but can be annoying 118 | if the model hasn't changed and evaluation is lengthy. 119 | 120 | Fortunately sotabencheval has caching logic that you can use.
121 | 122 | The idea is that after the first batch, we hash the model outputs and the 123 | current metrics and this tells us if the model is the same given the dataset. 124 | You can include hashing within an evaluation loop like follows (in the following 125 | example for a PyTorch repository): 126 | 127 | ``` python 128 | with torch.no_grad(): 129 | for i, (input, target) in enumerate(test_loader): 130 | input = input.to(device='cuda', non_blocking=True) 131 | target = target.to(device='cuda', non_blocking=True) 132 | output = model(input) 133 | 134 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]] 135 | 136 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy())))) 137 | 138 | if evaluator.cache_exists: 139 | break 140 | 141 | evaluator.save() 142 | ``` 143 | 144 | If the hash is the same as in the server, we infer that the model hasn't changed, so 145 | we simply return hashed results rather than running the whole evaluation again. 146 | 147 | Caching is very useful if you have large models, or a repository that is evaluating 148 | multiple models, as it speeds up evaluation significantly. 149 | 150 | ## A full sotabench.py example 151 | 152 | Below we show an implementation for a model from the torchvision repository. This 153 | incorporates all the features explained above: (a) using the server data root, 154 | (b) using the ImageNet Evaluator, and (c) caching the evaluation logic: 155 | 156 | ``` python 157 | import numpy as np 158 | import PIL 159 | import torch 160 | from torchvision.models.resnet import resnext101_32x8d 161 | import torchvision.transforms as transforms 162 | from torchvision.datasets import ImageNet 163 | from torch.utils.data import DataLoader 164 | 165 | from sotabencheval.image_classification import ImageNetEvaluator 166 | from sotabencheval.utils import is_server 167 | 168 | if is_server(): 169 | DATA_ROOT = './.data/vision/imagenet' 170 | else: # local settings 171 | DATA_ROOT = '/home/ubuntu/my_data/' 172 | 173 | model = resnext101_32x8d(pretrained=True) 174 | 175 | input_transform = transforms.Compose([ 176 | transforms.Resize(256, PIL.Image.BICUBIC), 177 | transforms.CenterCrop(224), 178 | transforms.ToTensor(), 179 | transforms.Normalize( 180 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 181 | ]) 182 | 183 | test_dataset = ImageNet( 184 | DATA_ROOT, 185 | split="val", 186 | transform=input_transform, 187 | target_transform=None, 188 | download=True, 189 | ) 190 | 191 | test_loader = DataLoader( 192 | test_dataset, 193 | batch_size=128, 194 | shuffle=False, 195 | num_workers=4, 196 | pin_memory=True, 197 | ) 198 | 199 | model = model.cuda() 200 | model.eval() 201 | 202 | evaluator = ImageNetEvaluator( 203 | model_name='ResNeXt-101-32x8d', 204 | paper_arxiv_id='1611.05431') 205 | 206 | def get_img_id(image_name): 207 | return image_name.split('/')[-1].replace('.JPEG', '') 208 | 209 | with torch.no_grad(): 210 | for i, (input, target) in enumerate(test_loader): 211 | input = input.to(device='cuda', non_blocking=True) 212 | target = target.to(device='cuda', non_blocking=True) 213 | output = model(input) 214 | 215 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]] 216 | 217 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy())))) 218 | 219 | if evaluator.cache_exists: 220 | break 221 | 222 | evaluator.save() 223 | ``` 224 | 225 | ## Need More Help? 
226 | 227 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench 228 | forums if you have any questions or difficulties. 229 | -------------------------------------------------------------------------------- /docs/docs/img/ade20k.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/ade20k.png -------------------------------------------------------------------------------- /docs/docs/img/banner.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/banner.png -------------------------------------------------------------------------------- /docs/docs/img/coco.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/coco.jpg -------------------------------------------------------------------------------- /docs/docs/img/connect.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/connect.png -------------------------------------------------------------------------------- /docs/docs/img/connect2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/connect2.png -------------------------------------------------------------------------------- /docs/docs/img/examples.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/examples.png -------------------------------------------------------------------------------- /docs/docs/img/imagenet.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/imagenet.jpeg -------------------------------------------------------------------------------- /docs/docs/img/language_model.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/language_model.png -------------------------------------------------------------------------------- /docs/docs/img/pascalvoc2012.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/pascalvoc2012.png -------------------------------------------------------------------------------- /docs/docs/img/results.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/results.png -------------------------------------------------------------------------------- /docs/docs/img/sotabencheval.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/sotabencheval.png -------------------------------------------------------------------------------- /docs/docs/img/squad20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/docs/img/squad20.png -------------------------------------------------------------------------------- /docs/docs/index.md: -------------------------------------------------------------------------------- 1 | # Welcome to sotabencheval! 2 | 3 | ![SotaBench](img/banner.png) 4 | 5 | You have reached the docs for the [sotabencheval](https://github.com/paperswithcode/sotabench-eval) library. This library contains a collection of deep learning benchmarks you can use to 6 | benchmark your models. It can be used in conjunction with the 7 | [sotabench.com](http://www.sotabench.com) website to record results for models, so the community 8 | can compare model performance on different tasks, as well as a continuous integration style 9 | service for your repository to benchmark your models on each commit. 10 | 11 | **sotabencheval** is a general benchmarking library, meaning it is designed to support all deep learning frameworks, 12 | and requires minimal code integration. There are alternative sotabench APIs you can use that are 13 | specialized for particular frameworks, e.g. [torchbench](https://github.com/paperswithcode/torchbench) for PyTorch. 14 | 15 | 16 | ## Getting Started : Benchmarking on ImageNet 17 | 18 | **Step One : Create a sotabench.py file in the root of your repository** 19 | 20 | This can contain whatever logic you need to load and process the dataset, and to 21 | produce model predictions for it. To record your results for sotabench, initialise 22 | an ImageNet evaluator object to name the model (and optionally) link to a paper: 23 | 24 | ``` python 25 | from sotabencheval.image_classification import ImageNetEvaluator 26 | 27 | evaluator = ImageNetEvaluator( 28 | model_name='ResNeXt-101-32x8d', 29 | paper_arxiv_id='1611.05431') 30 | ``` 31 | 32 | For each batch of predictions made by your model, pass a dictionary of keys as image IDs and values as 33 | output predictions to the `evaluator.add` method: 34 | 35 | ``` python 36 | evaluator.add(dict(zip(image_ids, batch_output))) 37 | ``` 38 | Then after you have accumulated all the predictions: 39 | 40 | ``` python 41 | evaluator.save() 42 | ``` 43 | 44 | This will ensure results are evaluated and saved when they are run on the [sotabench](http://www.sotabench.com) server. 
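If you want to inspect the computed metrics locally before connecting the repository, you can also call the evaluator's `get_results()` method once the loop has finished. The exact metric names returned depend on the benchmark, so treat the snippet below as a sketch:

``` python
# Run the evaluation loop as above, then print the metrics that were computed
# locally (for ImageNet these are accuracy-style metrics such as Top 1 / Top 5).
results = evaluator.get_results()
print(results)
```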
45 | 46 | Below you can see a working `sotabench.py` file added to the [torchvision](https://github.com/pytorch/vision) repository 47 | to test one of its models, integrating the evaluation code from above: 48 | 49 | ``` python 50 | import numpy as np 51 | import PIL 52 | import torch 53 | from torch.utils.data import DataLoader 54 | from torchvision.models.resnet import resnext101_32x8d 55 | import torchvision.transforms as transforms 56 | from torchvision.datasets import ImageNet 57 | 58 | from sotabencheval.image_classification import ImageNetEvaluator 59 | from sotabencheval.utils import is_server 60 | 61 | if is_server(): 62 | DATA_ROOT = './.data/vision/imagenet' 63 | else: # local settings 64 | DATA_ROOT = '/home/ubuntu/my_data/' 65 | 66 | model = resnext101_32x8d(pretrained=True) 67 | 68 | input_transform = transforms.Compose([ 69 | transforms.Resize(256, PIL.Image.BICUBIC), 70 | transforms.CenterCrop(224), 71 | transforms.ToTensor(), 72 | transforms.Normalize( 73 | mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), 74 | ]) 75 | 76 | test_dataset = ImageNet( 77 | DATA_ROOT, 78 | split="val", 79 | transform=input_transform, 80 | target_transform=None, 81 | download=True, 82 | ) 83 | 84 | test_loader = DataLoader( 85 | test_dataset, 86 | batch_size=128, 87 | shuffle=False, 88 | num_workers=4, 89 | pin_memory=True, 90 | ) 91 | 92 | model = model.cuda() 93 | model.eval() 94 | 95 | evaluator = ImageNetEvaluator( 96 | model_name='ResNeXt-101-32x8d', 97 | paper_arxiv_id='1611.05431') 98 | 99 | def get_img_id(image_name): 100 | return image_name.split('/')[-1].replace('.JPEG', '') 101 | 102 | with torch.no_grad(): 103 | for i, (input, target) in enumerate(test_loader): 104 | input = input.to(device='cuda', non_blocking=True) 105 | target = target.to(device='cuda', non_blocking=True) 106 | output = model(input) 107 | image_ids = [get_img_id(img[0]) for img in test_loader.dataset.imgs[i*test_loader.batch_size:(i+1)*test_loader.batch_size]] 108 | evaluator.add(dict(zip(image_ids, list(output.cpu().numpy())))) 109 | 110 | evaluator.save() 111 | ``` 112 | 113 | **Step Two : Run locally to verify that it works** 114 | 115 | ``` 116 | python sotabench.py 117 | ``` 118 | 119 | You can also run the logic in a Jupyter Notebook if that is your preferred workflow. 120 | 121 | **Step Three : Login and connect your repository to [sotabench](http://www.sotabench.com)** 122 | 123 | Create an account on [sotabench](http://www.sotabench.com), then head to your user page. Click the 124 | **Connect a GitHub repository** button: 125 | 126 | ![SotaBench](img/connect.png) 127 | 128 | Then follow the steps to connect the repositories that you wish to benchmark: 129 | 130 | ![SotaBench](img/connect2.png) 131 | 132 | 133 | After you connect your repository, the sotabench servers will re-evaluate your model on every commit, 134 | to ensure the model is working and results are up-to-date - including if you add additional models to the benchmark file. 135 | 136 | ## Installation 137 | 138 | The library requires Python 3.6+. You can install via pip: 139 | 140 | ``` 141 | pip install sotabencheval 142 | ``` 143 | 144 | ## Support 145 | 146 | If you get stuck you can head to our [Discourse](http://forum.sotabench.com) forum where you can ask 147 | questions on how to use the project. You can also find ideas for contributions, 148 | and work with others on exciting projects.
-------------------------------------------------------------------------------- /docs/docs/pascalvoc.md: -------------------------------------------------------------------------------- 1 | # PASCAL VOC 2012 2 | 3 | ![VOC Dataset Examples](img/pascalvoc2012.png) 4 | 5 | You can view the PASCAL VOC 2012 leaderboard [here](https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012). 6 | 7 | ## Getting Started 8 | 9 | You'll need the following in the root of your repository: 10 | 11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit 12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py` 13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation 14 | 15 | You can write whatever you want in your `sotabench.py` file to get model predictions on the VOC 2012 dataset. For example, 16 | PyTorch users might use torchvision to load the dataset. 17 | 18 | But you will need to record your results for the server, and you'll want to avoid doing things like 19 | downloading the dataset on the server. So you should: 20 | 21 | - **Point to the server VOC 2012 data paths** - popular datasets are pre-downloaded on the server. 22 | - **Include an Evaluation object** in `sotabench.py` file to record the results. 23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions. 24 | 25 | We explain how to do these various steps below. 26 | 27 | ## Server Data Location 28 | 29 | The VOC 2012 data is located in the root of your repository on the server at `.data/vision/voc2012`. In this folder is contained: 30 | 31 | - `VOCtrainval_11-May-2012.tar` - containing validation images and annotations 32 | 33 | Your local VOC 2012 files may have a different file directory structure, so you 34 | can use control flow like below to change the data path if the script is being 35 | run on sotabench servers: 36 | 37 | ``` python 38 | from sotabencheval.utils import is_server 39 | 40 | if is_server(): 41 | DATA_ROOT = './.data/vision/voc2012' 42 | else: # local settings 43 | DATA_ROOT = '/home/ubuntu/my_data/' 44 | ``` 45 | 46 | This will detect if `sotabench.py` is being run on the server and change behaviour accordingly. 47 | 48 | ## How Do I Initialize an Evaluator? 49 | 50 | Add this to your code - before you start batching over the dataset and making predictions: 51 | 52 | ``` python 53 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator 54 | 55 | evaluator = PASCALVOCEvaluator(model_name='My Super Model') 56 | ``` 57 | 58 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you 59 | put in the same model name string as on the [leaderboard](https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012) 60 | then you will enable direct comparison with the paper. For example: 61 | 62 | ``` python 63 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator 64 | 65 | evaluator = PASCALVOCEvaluator(model_name='PSPNet', paper_arxiv_id='1612.01105') 66 | ``` 67 | 68 | The above will directly compare with the result of the paper when run on the server. 69 | 70 | ## How Do I Evaluate Predictions? 71 | 72 | The evaluator object has an `.add()` method to submit predictions by batch or in full. 
73 | 
74 | For PASCAL there are two required arguments: `outputs`, a 1D np.ndarray of semantic class predictions per pixel,
75 | and `targets`, a 1D np.ndarray of ground truth semantic classes per pixel. In other words, it requires flattened
76 | inputs and outputs.
77 | 
78 | To elaborate, suppose you are making predictions, batch by batch, and have your model output
79 | and the original targets with batch_size `32`, and image size `(520, 480)`. The shape of your outputs might look like:
80 | 
81 | ``` python
82 | batch_output.shape
83 | >> (32, 21, 520, 480) # where 21 is the number of VOC classes
84 | 
85 | batch_target.shape
86 | >> (32, 520, 480)
87 | ```
88 | 
89 | We can flatten the entire output and targets to 1D vectors for each pixel:
90 | 
91 | ``` python
92 | flattened_batch_output.shape
93 | >> (7987200,) # flatten by taking the max class prediction
94 | # (batch_output.argmax(1).flatten() in torch with class as second dimension)
95 | 
96 | flattened_batch_target.shape
97 | >> (7987200,) # (batch_target.flatten() in torch)
98 | ```
99 | 
100 | The output might look something like this:
101 | 
102 | ``` python
103 | flattened_batch_output
104 | >> array([6, 6, 6, 6, 6, ...])
105 | 
106 | flattened_batch_target
107 | >> array([6, 6, 6, 6, 6, ...])
108 | ```
109 | 
110 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5
111 | pixels - so the model is correct for those pixels.
112 | 
113 | These flattened arrays can then be passed into the `.add()` method of the evaluator:
114 | 
115 | ``` python
116 | my_evaluator.add(outputs=flattened_batch_output,
117 |                  targets=flattened_batch_target)
118 | ```
119 | 
120 | You can do this all at once in a single call to `add()`, but more naturally, you will
121 | probably loop over the dataset and call the method for the outputs of each batch.
122 | That would look something like this (for a PyTorch example):
123 | 
124 | ``` python
125 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211')
126 | 
127 | with torch.no_grad():
128 |     for image, target in tqdm.tqdm(data_loader_test):
129 |         image, target = image.to('cuda'), target.to('cuda')
130 |         output = model(image)
131 |         output = output['out']
132 | 
133 |         evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy())
134 | ```
135 | 
136 | When you are done, you can get the results locally by running:
137 | 
138 | ``` python
139 | evaluator.get_results()
140 | ```
141 | 
142 | But for the server you want to save the results by running:
143 | 
144 | ``` python
145 | evaluator.save()
146 | ```
147 | 
148 | This method serialises the results and model metadata and stores to the server database.
149 | 
150 | ## How Do I Cache Evaluation?
151 | 
152 | Sotabench reruns your script on every commit. This is good because it acts like
153 | continuous integration in checking for bugs and changes, but can be annoying
154 | if the model hasn't changed and evaluation is lengthy.
155 | 
156 | Fortunately sotabencheval has caching logic that you can use.
157 | 
158 | The idea is that after the first batch, we hash the model outputs and the
159 | current metrics and this tells us if the model is the same given the dataset.
160 | You can include hashing within an evaluation loop like follows (in the following 161 | example for a PyTorch repository): 162 | 163 | ``` python 164 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211') 165 | 166 | with torch.no_grad(): 167 | for image, target in tqdm.tqdm(data_loader_test): 168 | image, target = image.to('cuda'), target.to('cuda') 169 | output = model(image) 170 | output = output['out'] 171 | 172 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy()) 173 | if evaluator.cache_exists: 174 | break 175 | 176 | evaluator.save() 177 | ``` 178 | 179 | If the hash is the same as in the server, we infer that the model hasn't changed, so 180 | we simply return hashed results rather than running the whole evaluation again. 181 | 182 | Caching is very useful if you have large models, or a repository that is evaluating 183 | multiple models, as it speeds up evaluation significantly. 184 | 185 | ## A full sotabench.py example 186 | 187 | Below we show an implementation for a model from the torchvision repository. This 188 | incorporates all the features explained above: (a) using the server data root, 189 | (b) using the ImageNet Evaluator, and (c) caching the evaluation logic: 190 | 191 | ``` python 192 | import PIL 193 | import torch 194 | import torchvision 195 | from torchvision.models.segmentation import fcn_resnet101 196 | import torchvision.transforms as transforms 197 | import tqdm 198 | 199 | from sotabench_transforms import Normalize, Compose, Resize, ToTensor 200 | 201 | from sotabencheval.semantic_segmentation import PASCALVOCEvaluator 202 | from sotabencheval.utils import is_server 203 | 204 | if is_server(): 205 | DATA_ROOT = './.data/vision/voc2012' 206 | else: # local settings 207 | DATA_ROOT = '/home/ubuntu/my_data/' 208 | 209 | MODEL_NAME = 'fcn_resnet101' 210 | 211 | def cat_list(images, fill_value=0): 212 | max_size = tuple(max(s) for s in zip(*[img.shape for img in images])) 213 | batch_shape = (len(images),) + max_size 214 | batched_imgs = images[0].new(*batch_shape).fill_(fill_value) 215 | for img, pad_img in zip(images, batched_imgs): 216 | pad_img[..., : img.shape[-2], : img.shape[-1]].copy_(img) 217 | return batched_imgs 218 | 219 | def collate_fn(batch): 220 | images, targets = list(zip(*batch)) 221 | batched_imgs = cat_list(images, fill_value=0) 222 | batched_targets = cat_list(targets, fill_value=255) 223 | return batched_imgs, batched_targets 224 | 225 | device = torch.device('cuda') 226 | 227 | normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) 228 | my_transforms = Compose([Resize((520, 480)), ToTensor(), normalize]) 229 | 230 | dataset_test = torchvision.datasets.VOCSegmentation(root=DATA_ROOT, year='2012', image_set="val", 231 | transforms=my_transforms, download=True) 232 | test_sampler = torch.utils.data.SequentialSampler(dataset_test) 233 | 234 | data_loader_test = torch.utils.data.DataLoader( 235 | dataset_test, batch_size=32, 236 | sampler=test_sampler, num_workers=4, 237 | collate_fn=collate_fn) 238 | 239 | model = torchvision.models.segmentation.__dict__['fcn_resnet101'](num_classes=21, pretrained=True) 240 | model.to(device) 241 | model.eval() 242 | 243 | evaluator = PASCALVOCEvaluator(model_name='FCN (ResNet-101)', paper_arxiv_id='1605.06211') 244 | 245 | with torch.no_grad(): 246 | for image, target in tqdm.tqdm(data_loader_test): 247 | image, target = image.to('cuda'), target.to('cuda') 248 | output = model(image) 249 | output = 
output['out'] 250 | 251 | evaluator.add(output.argmax(1).flatten().cpu().numpy(), target.flatten().cpu().numpy()) 252 | if evaluator.cache_exists: 253 | break 254 | 255 | evaluator.save() 256 | ``` 257 | 258 | ## Need More Help? 259 | 260 | Head on over to the [Computer Vision](https://forum.sotabench.com/c/cv) section of the sotabench 261 | forums if you have any questions or difficulties. 262 | -------------------------------------------------------------------------------- /docs/docs/squad.md: -------------------------------------------------------------------------------- 1 | # SQuAD 2 | 3 | ![SQuAD 2.0 Dataset Examples](img/squad20.png) 4 | 5 | You can view the [SQuAD 1.1](https://sotabench.com/benchmarks/question-answering-on-squad11-dev) and 6 | [SQuAD 2.0](https://sotabench.com/benchmarks/question-answering-on-squad20-dev) leaderboards. 7 | 8 | ## Getting Started 9 | 10 | You'll need the following in the root of your repository: 11 | 12 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit 13 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py` 14 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation 15 | 16 | You can write whatever you want in your `sotabench.py` file to get model predictions on the SQuAD dataset. 17 | 18 | But you will need to record your results for the server, and you'll want to avoid doing things like 19 | downloading the dataset on the server. So you should: 20 | 21 | - **Include an Evaluation object** in `sotabench.py` file to record the results. 22 | - **Point to the server SQuAD data path** - popular datasets are pre-downloaded on the server. 23 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions. 24 | 25 | We explain how to do these various steps below. 26 | 27 | ## How Do I Initialize an Evaluator? 28 | 29 | Add this to your code - before you start batching over the dataset and making predictions: 30 | 31 | ``` python 32 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion 33 | 34 | # for SQuAD v1.1 35 | evaluator = SQuADEvaluator(model_name='My Super Model', version=SQuADVersion.V11) 36 | # for SQuAD v2.0 37 | evaluator = SQuADEvaluator(model_name='My Super Model', version=SQuADVersion.V20) 38 | ``` 39 | 40 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you 41 | put in the same model name string as on the 42 | [SQuAD 1.1](https://sotabench.com/benchmarks/question-answering-on-squad11-dev) or 43 | [SQuAD 2.0](https://sotabench.com/benchmarks/question-answering-on-squad20-dev) leaderboard 44 | then you will enable direct comparison with the paper's model. For example: 45 | 46 | ``` python 47 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion 48 | 49 | evaluator = SQuADEvaluator(model_name='SpanBERT', 50 | paper_arxiv_id='1907.10529', 51 | version=SQuADVersion.V20) 52 | ``` 53 | 54 | The above will directly compare with the result of the paper when run on the server. 55 | 56 | ## Server Data Location 57 | 58 | The SQuAD development data is located in the root of your repository on the server at `.data/nlp/squad`. 59 | In this folder is contained: 60 | 61 | - `dev-v1.1.json` - containing SQuAD v1.1 development dataset 62 | - `dev-v2.0.json` - containing SQuAD v2.0 development dataset 63 | 64 | You can use `evaluator.dataset_path: Path` to get a path to the dataset json file. 
65 | In the example above it resolves to `.data/nlp/squad/dev-v2.0.json` on 66 | sotabench server and `./dev-v2.0.json` when run locally. 67 | If you want to use a non-standard file name or location when running locally 68 | you can override the defaults like this: 69 | 70 | ``` python 71 | evaluator = SQuADEvaluator( 72 | ..., 73 | local_root='mydatasets', 74 | dataset_filename='data.json' 75 | ) 76 | ``` 77 | 78 | ## How Do I Evaluate Predictions? 79 | 80 | The evaluator object has an `.add(answers: Dict[str, str])` method to submit predictions by batch or in full. 81 | 82 | For SQuAD the expected input is a dictionary, where keys are question ids and values are text answers. 83 | For unanswerable questions the answer should be an empty string. For example: 84 | 85 | ``` python 86 | {"57296d571d04691400779413": "itself", "5a89117e19b91f001a626f2d": ""} 87 | ``` 88 | 89 | You can do this all at once in a single call to `add()`, but more naturally, you will 90 | probably loop over the dataset and call the method for the outputs of each batch. 91 | That would look something like this (for a PyTorch example): 92 | 93 | ``` python 94 | ... 95 | 96 | evaluator = SQuADEvaluator(model_name='My Super Model', 97 | paper_arxiv_id="1710.10723", 98 | version=SQuADVersion.V11) 99 | 100 | with torch.no_grad(): 101 | for i, (input, target) in enumerate(data_loader): 102 | ... 103 | output = model(input) 104 | # potentially formatting of the output here to be a dict 105 | evaluator.add(output) 106 | ``` 107 | 108 | When you are done, you can get the results locally by running: 109 | 110 | ``` python 111 | evaluator.get_results() 112 | ``` 113 | 114 | But for the server you want to save the results by running: 115 | 116 | ``` python 117 | evaluator.save() 118 | ``` 119 | 120 | This method serialises the results and model metadata and stores to the server database. 121 | 122 | ## How Do I Cache Evaluation? 123 | 124 | Sotabench reruns your script on every commit. This is good because it acts like 125 | continuous integration in checking for bugs and changes, but can be annoying 126 | if the model hasn't changed and evaluation is lengthy. 127 | 128 | Fortunately sotabencheval has caching logic that you can use. 129 | 130 | The idea is that after the first batch, we hash the model outputs and the 131 | current metrics and this tells us if the model is the same given the dataset. 132 | You can include hashing within an evaluation loop like follows (in the following 133 | example for a PyTorch repository): 134 | 135 | ``` python 136 | with torch.no_grad(): 137 | for i, (input, target) in enumerate(data_loader): 138 | ... 139 | output = model(input) 140 | # potentially formatting of the output here to be a list of dicts 141 | evaluator.add(output) 142 | 143 | if evaluator.cache_exists: 144 | break 145 | 146 | evaluator.save() 147 | ``` 148 | 149 | If the hash is the same as in the server, we infer that the model hasn't changed, so 150 | we simply return hashed results rather than running the whole evaluation again. 151 | 152 | Caching is very useful if you have large models, or a repository that is evaluating 153 | multiple models, as it speeds up evaluation significantly. 154 | 155 | ## A Full sotabench.py Example 156 | 157 | Below we show an implementation for a model from the AllenNLP repository. This 158 | incorporates all the features explained above: (a) using the SQuAD Evaluator, 159 | (b) using custom dataset location when run locally, and (c) the evaluation caching logic. 
160 | 161 | ``` python 162 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion 163 | 164 | from allennlp.data import DatasetReader 165 | from allennlp.data.iterators import DataIterator 166 | from allennlp.models.archival import load_archive 167 | from allennlp.nn.util import move_to_device 168 | 169 | def load_model(url, batch_size=64): 170 | archive = load_archive(url, cuda_device=0) 171 | model = archive.model 172 | reader = DatasetReader.from_params(archive.config["dataset_reader"]) 173 | iterator_params = archive.config["iterator"] 174 | iterator_params["batch_size"] = batch_size 175 | data_iterator = DataIterator.from_params(iterator_params) 176 | data_iterator.index_with(model.vocab) 177 | return model, reader, data_iterator 178 | 179 | def evaluate(model, dataset, data_iterator, evaluator): 180 | model.eval() 181 | evaluator.reset_time() 182 | for batch in data_iterator(dataset, num_epochs=1, shuffle=False): 183 | batch = move_to_device(batch, 0) 184 | predictions = model(**batch) 185 | answers = {metadata['id']: prediction 186 | for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])} 187 | evaluator.add(answers) 188 | if evaluator.cache_exists: 189 | break 190 | 191 | evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)", 192 | paper_arxiv_id="1611.01603", version=SQuADVersion.V11) 193 | 194 | model, reader, data_iter =\ 195 | load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz") 196 | dataset = reader.read(evaluator.dataset_path) 197 | evaluate(model, dataset, data_iter, evaluator) 198 | evaluator.save() 199 | print(evaluator.results) 200 | ``` 201 | 202 | ## Need More Help? 203 | 204 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench 205 | forums if you have any questions or difficulties. 206 | -------------------------------------------------------------------------------- /docs/docs/wikitext103.md: -------------------------------------------------------------------------------- 1 | # WikiText-103 2 | 3 | ![An example text of Wikitext-103](img/language_model.png) 4 | 5 | You can view the WikiText-103 leaderboard [here](https://sotabench.com/benchmarks/language-modelling-on-wikitext-103). 6 | 7 | ## Getting Started 8 | 9 | You'll need the following in the root of your repository: 10 | 11 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit 12 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py` 13 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation 14 | 15 | You can write whatever you want in your `sotabench.py` file to get language model predictions on the WikiText-103 dataset. 16 | 17 | But you will need to record your results for the server, and you'll want to avoid doing things like 18 | downloading the dataset on the server. So you should: 19 | 20 | - **Point to the server WikiText-103 data path** - popular datasets are pre-downloaded on the server. 21 | - **Include an Evaluation object** in `sotabench.py` file to record the results. 22 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions. 23 | 24 | We explain how to do these various steps below. 
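As a point of reference before those steps, the three files listed under Getting Started sit at the top level of your repository, so a minimal layout looks like this (the repository name below is just a placeholder):

```
my-repo/
├── sotabench.py         # benchmarking logic; run by the server on each commit
├── requirements.txt     # Python dependencies installed before sotabench.py runs
└── sotabench_setup.sh   # optional advanced setup, e.g. compilation
```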
25 | 26 | ## Server Data Location 27 | 28 | The WikiText-103 development data is located in the root of your repository on the server at `.data/nlp/wikitext-103/wikitext-103-v1.zip`. 29 | The archive contains a folder `wikitext-103` with the following files: 30 | 31 | - `wiki.train.tokens` 32 | - `wiki.valid.tokens` 33 | - `wiki.test.tokens` 34 | 35 | It is the original zip file released [here](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/). 36 | We are running the benchmark on the `wiki.test.tokens` dataset. 37 | We have two helper methods that will unpack the dataset for you and give you the `pathlib.Path` to the test file. 38 | 39 | The first option `test_set_path` is available once you instantiate the `WikiText103Evaluator`: 40 | 41 | ```python 42 | ... 43 | 44 | evaluator = WikiText103Evaluator( 45 | model_name="Transformer-XL Large", 46 | paper_arxiv_id="1901.02860", 47 | paper_pwc_id="transformer-xl-attentive-language-models", 48 | local_root='/content/wikitext-103' 49 | ) 50 | # dataset_path is pathlib.Path and points to wikitext.test.tokens 51 | with evaluator.test_set_path.open() as f: 52 | test_data = torch.tensor(tokenizer.encode(f.read())).to("cuda") 53 | ``` 54 | 55 | There is a second option available if you are evaluating multiple models and need to use the same 56 | dataset multiple times - `WikiText103Evaluator.get_test_set_path(local_root)`. This will get the path before 57 | you initialize a WikiText evaluator: 58 | 59 | ```python 60 | from sotabencheval.language_modelling import WikiText103Evaluator 61 | 62 | test_file_path = WikiText103Evaluator.get_test_set_path('/home/ubuntu/my_data/wiki103') 63 | with test_file_path.open() as f: 64 | content = f.read() 65 | ``` 66 | 67 | ## How Do I Initialize an Evaluator? 68 | 69 | Add this to your code - before you start batching over the dataset and making predictions: 70 | 71 | ``` python 72 | from sotabencheval.language_modelling import WikiText103Evaluator 73 | 74 | evaluator = WikiText103Evaluator(model_name='Model name as found in paperswithcode website') 75 | ``` 76 | 77 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you 78 | put in the same model name string as on the 79 | [Wikitext-103](https://sotabench.com/benchmarks/language-modelling-on-wikitext-103) leaderboard 80 | then you will enable direct comparison with the paper's model. If the `arxiv_id` is not available you 81 | can use `paperswithcode.com` id. Below is an example of an evaluator that matches `Transformer XL`: 82 | 83 | ``` python 84 | from sotabencheval.language_modelling import WikiText103Evaluator 85 | 86 | evaluator = WikiText103Evaluator( 87 | model_name="Transformer-XL Large", 88 | paper_arxiv_id="1901.02860", 89 | paper_pwc_id="transformer-xl-attentive-language-models", 90 | local_root="path_to_your_data", 91 | ) 92 | ``` 93 | 94 | The above will directly compare with the result of the paper when run on the server. 95 | 96 | ## How Do I Evaluate Predictions? 97 | 98 | The evaluator object has an `.add(log_probs, targets)` method to submit predictions by batch or in full. 99 | We expect you to give us the log probability of a batch of target tokens and the `target` tokens themselves. 
100 | The `log_probs` can be either:
101 | 
102 | - a 0d "tensor" (`np.ndarray`/`torch.tensor`) - summed log probability of all `targets` tokens
103 | - a 2d "tensor" (`np.ndarray`/`torch.tensor`) - log probabilities of each target token; `log_probs.shape` should match `targets.shape`
104 | - a 3d "tensor" (`np.ndarray`/`torch.tensor`) - distribution of log probabilities for each position in the sequence; we will gather the probabilities of target tokens for you.
105 | 
106 | It is recommended to use the second or third option, as it allows us to check your perplexity calculations.
107 | 
108 | If your model uses subword tokenization you don't need to convert subwords to full words. You are free to report the probability of each subword: we will adjust the perplexity normalization accordingly. Just make sure to set `subword_tokenization=True` in your evaluator.
109 | 
110 | Here is an example of how to report results (using PyTorch):
111 | 
112 | ``` python
113 | 
114 | evaluator = WikiText103Evaluator(
115 |     model_name='GPT-2 Small',
116 |     paper_pwc_id="language-models-are-unsupervised-multitask",
117 |     local_root="path_to_your_data",
118 |     subword_tokenization=True
119 | )
120 | 
121 | # run your data preprocessing; in the case of GPT-2 the preprocessing removes moses artifacts
122 | with torch.no_grad():
123 |     model.eval()
124 |     for input, target in data_loader:
125 |         output = model(input)
126 |         log_probs = torch.log_softmax(output, dim=-1)
127 |         target_log_probs = log_probs.gather(-1, target.unsqueeze(-1)).squeeze(-1)
128 |         evaluator.add(target_log_probs, target)
129 | ```
130 | 
131 | When you are done, you can get the results locally by running:
132 | 
133 | ``` python
134 | evaluator.get_results()
135 | ```
136 | 
137 | But for the server you want to save the results by running:
138 | 
139 | ``` python
140 | evaluator.save()
141 | ```
142 | 
143 | This method serialises the results and model metadata and stores to the server database.
144 | 
145 | ## How Do I Cache Evaluation?
146 | 
147 | Sotabench reruns your script on every commit. This is good because it acts like
148 | continuous integration in checking for bugs and changes, but can be annoying
149 | if the model hasn't changed and evaluation is lengthy.
150 | 
151 | Fortunately sotabencheval has caching logic that you can use.
152 | 
153 | The idea is that after the first batch, we hash the model outputs and the
154 | current metrics and this tells us if the model is the same given the dataset.
155 | You can include hashing within an evaluation loop like follows (in the following
156 | example for a PyTorch repository):
157 | 
158 | ``` python
159 | with torch.no_grad():
160 |     for input, target in data_loader:
161 |         # ...
162 |         output = model(input)
163 |         log_probs = ...  # compute the log probabilities of the target tokens
164 |         evaluator.add(log_probs, target)
165 | 
166 |         if evaluator.cache_exists:
167 |             break
168 | 
169 | evaluator.save()
170 | ```
171 | 
172 | If the hash is the same as in the server, we infer that the model hasn't changed, so
173 | we simply return hashed results rather than running the whole evaluation again.
174 | 
175 | Caching is very useful if you have large models, or a repository that is evaluating
176 | multiple models, as it speeds up evaluation significantly.
177 | 
178 | 
179 | ## A full sotabench.py example
180 | 
181 | Below we show an implementation for a model from the `huggingface/transformers` repository.
This 182 | incorporates all the features explained above: (a) using the server data, 183 | (b) using the WikiText-103 Evaluator, and (c) caching the evaluation logic: 184 | 185 | ``` python 186 | import torch 187 | from tqdm import tqdm 188 | from sotabencheval.language_modelling import WikiText103Evaluator 189 | 190 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda") 191 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103') 192 | 193 | evaluator = WikiText103Evaluator( 194 | model_name="Transformer-XL Large", 195 | paper_arxiv_id="1901.02860", 196 | paper_pwc_id="transformer-xl-attentive-language-models", 197 | local_root='/content/wikitext-103' 198 | ) 199 | 200 | with evaluator.test_set_path.open() as f: 201 | test_data = torch.tensor(tokenizer.encode(f.read())) 202 | 203 | seq_len = 128 204 | with torch.no_grad(): 205 | evaluator.reset_timer() 206 | model.eval() 207 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None 208 | for s in tqdm(range(0, X.shape[-1], seq_len)): 209 | x,y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda") 210 | log_probs, mems, *_ = model(input_ids=x, mems=mems) 211 | evaluator.add(log_probs, y) 212 | if evaluator.cache_exists: 213 | break 214 | evaluator.save() 215 | evaluator.print_results() 216 | ``` 217 | 218 | You can run this example on [Google Colab](https://colab.research.google.com/drive/1Qcp1_Fgo_aMtSgf_PV1gFw1DT6hEv7fW). 219 | 220 | ## Need More Help? 221 | 222 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench forums if you have any questions or difficulties. 223 | -------------------------------------------------------------------------------- /docs/docs/wmt.md: -------------------------------------------------------------------------------- 1 | # WMT 2 | 3 | You can view the WMT Machine Translation leaderboards: 4 | 5 | - [WMT2014 English-German](https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german) 6 | - [WMT2014 English-French](https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-french) 7 | - [WMT2019 English-German](https://sotabench.com/benchmarks/machine-translation-on-wmt2019-english-german) 8 | 9 | ## Getting Started 10 | 11 | You'll need the following in the root of your repository: 12 | 13 | - `sotabench.py` file - contains benchmarking logic; the server will run this on each commit 14 | - `requirements.txt` file - Python dependencies to be installed before running `sotabench.py` 15 | - `sotabench_setup.sh` *(optional)* - any advanced dependencies or setup, e.g. compilation 16 | 17 | You can write whatever you want in your `sotabench.py` file to get model predictions on the WMT datasets. 18 | 19 | But you will need to record your results for the server, and you'll want to avoid doing things like 20 | downloading the dataset on the server. So you should: 21 | 22 | - **Include an Evaluation object** in `sotabench.py` file to record the results. 23 | - **Point to the server WMT data path** - popular datasets are pre-downloaded on the server. 24 | - **Use Caching** *(optional)* - to speed up evaluation by hashing the first batch of predictions. 25 | 26 | We explain how to do these various steps below. 27 | 28 | ## How Do I Initialize an Evaluator? 
29 | 
30 | Before you start batching over the dataset and making predictions you need
31 | to create an evaluator instance to record results for a given leaderboard.
32 | For example, to evaluate on the WMT2014 News English-French test set add this
33 | to your code:
34 | 
35 | ``` python
36 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language
37 | 
38 | evaluator = WMTEvaluator(
39 |     dataset=WMTDataset.News2014,
40 |     source_lang=Language.English,
41 |     target_lang=Language.French,
42 |     local_root='mydatasets',
43 |     model_name='My Super Model'
44 | )
45 | ```
46 | 
47 | You can use `evaluator.source_dataset_path: Path` and `evaluator.target_dataset_path: Path`
48 | to get paths to the source and target SGML files.
49 | In the example above the first one resolves to `.data/nlp/wmt/newstest2014-fren-src.en.sgm` on
50 | the sotabench server and `mydatasets/newstest2014-fren-src.en.sgm` when run locally.
51 | If you want to use non-standard file names locally you can override the defaults like this:
52 | 
53 | ``` python
54 | evaluator = WMTEvaluator(
55 |     ...,
56 |     local_root='mydatasets',
57 |     source_dataset_filename='english.sgm',
58 |     target_dataset_filename='french.sgm'
59 | )
60 | ```
61 | 
62 | If you are reproducing a model from a paper, then you can enter the arXiv ID. If you
63 | put in the same model name string as on the leaderboard
64 | then you will enable direct comparison with the paper's model. For example:
65 | 
66 | ``` python
67 | evaluator = WMTEvaluator(
68 |     dataset=WMTDataset.News2019,
69 |     source_lang=Language.English,
70 |     target_lang=Language.German,
71 |     local_root="mydatasets",
72 |     model_name="Facebook-FAIR (single)",
73 |     paper_arxiv_id="1907.06616"
74 | )
75 | ```
76 | 
77 | The above will directly compare with the result of the paper when run on the server.
78 | 
79 | By default the evaluator computes a detokenized mixed-case SacreBLEU score.
80 | To get a tokenized BLEU score as well, during construction of the evaluator set
81 | the `tokenization: Callable[[str], str]` parameter to a function that tokenizes
82 | an input segment and returns the segment with tokens separated by spaces, e.g.:
83 | 
84 | ``` python
85 | def get_tokenization():
86 |     mt = sacremoses.MosesTokenizer()
87 |     def tokenize(sentence):
88 |         return mt.tokenize(sentence, return_str=True)
89 |     return tokenize
90 | 
91 | evaluator = WMTEvaluator(
92 |     ...,
93 |     tokenization=get_tokenization()
94 | )
95 | ```
96 | 
97 | Instead of parsing the dataset files by yourself you can access the raw segments as strings:
98 | 
99 | ``` python
100 | for segment_id, text in evaluator.source_segments.items():
101 |     # translate text
102 | 
103 | # or get segments within document context
104 | for document in evaluator.source_documents:
105 |     context = [segment.text for segment in document.segments]
106 |     for segment in document.segments:
107 |         segment_id, text = segment.id, segment.text
108 |         # translate text in context
109 | ```
110 | 
111 | ## How Do I Evaluate Predictions?
112 | 
113 | The evaluator object has an `.add(answers: Dict[str, str])` method to submit predictions by batch or in full.
114 | 
115 | For WMT the expected input is a dictionary, where keys are source segment
116 | ids and values are translated segments
117 | (the segment id is created by concatenating the document id and the original segment id,
118 | separated by `#`).
For example: 119 | 120 | ``` python 121 | evaluator.add({ 122 | 'bbc.381790#1': 'Waliser AMs sorgen sich um "Aussehen wie Muppets"', 123 | 'bbc.381790#2': 'Unter einigen AMs herrscht Bestürzung über einen...', 124 | 'bbc.381790#3': 'Sie ist aufgrund von Plänen entstanden, den Namen...' 125 | }) 126 | ``` 127 | 128 | You can do this all at once in a single call to `add()`, but more naturally, you will 129 | probably loop over the dataset and call the method for the outputs of each batch. 130 | That would look something like this (for a PyTorch example): 131 | 132 | ``` python 133 | with torch.no_grad(): 134 | for i, (input, target) in enumerate(data_loader): 135 | ... 136 | output = model(input) 137 | # potentially formatting of the output here to be a dict 138 | evaluator.add(output) 139 | ``` 140 | 141 | When you are done, you can get the results locally by running: 142 | 143 | ``` python 144 | evaluator.get_results() 145 | ``` 146 | 147 | But for the server you want to save the results by running: 148 | 149 | ``` python 150 | evaluator.save() 151 | ``` 152 | 153 | This method serialises the results and model metadata and stores to the server database. 154 | 155 | ## How Do I Cache Evaluation? 156 | 157 | Sotabench reruns your script on every commit. This is good because it acts like 158 | continuous integration in checking for bugs and changes, but can be annoying 159 | if the model hasn't changed and evaluation is lengthy. 160 | 161 | Fortunately sotabencheval has caching logic that you can use. 162 | 163 | The idea is that after the first batch, we hash the model outputs and the 164 | current metrics and this tells us if the model is the same given the dataset. 165 | You can include hashing within an evaluation loop like follows (in the following 166 | example for a PyTorch repository): 167 | 168 | ``` python 169 | with torch.no_grad(): 170 | for i, (input, target) in enumerate(data_loader): 171 | ... 172 | output = model(input) 173 | # potentially formatting of the output here to be a list of dicts 174 | evaluator.add(output) 175 | 176 | if evaluator.cache_exists: 177 | break 178 | 179 | evaluator.save() 180 | ``` 181 | 182 | If the hash is the same as in the server, we infer that the model hasn't changed, so 183 | we simply return hashed results rather than running the whole evaluation again. 184 | 185 | Caching is very useful if you have large models, or a repository that is evaluating 186 | multiple models, as it speeds up evaluation significantly. 187 | 188 | ## A Full sotabench.py Example 189 | 190 | Below we show an implementation for a model from the torchhub repository. This 191 | incorporates all the features explained above: (a) using the WMT Evaluator, 192 | (b) accessing segments from evaluator, and (c) the evaluation caching logic. 193 | For clarity we omit batching and simply translate segment by segment. 
194 | 195 | ``` python 196 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language 197 | from tqdm import tqdm 198 | import torch 199 | 200 | evaluator = WMTEvaluator( 201 | dataset=WMTDataset.News2019, 202 | source_lang=Language.English, 203 | target_lang=Language.German, 204 | local_root="data/nlp/wmt", 205 | model_name="Facebook-FAIR (single)", 206 | paper_arxiv_id="1907.06616" 207 | ) 208 | 209 | model = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model', 210 | force_reload=True, tokenizer='moses', bpe='fastbpe').cuda() 211 | 212 | for sid, text in tqdm(evaluator.source_segments.items()): 213 | translated = model.translate(text) 214 | evaluator.add({sid: translated}) 215 | if evaluator.cache_exists: 216 | break 217 | 218 | evaluator.save() 219 | print(evaluator.results) 220 | 221 | ``` 222 | 223 | ## Need More Help? 224 | 225 | Head on over to the [Natural Language Processing](https://forum.sotabench.com/c/natural-language-processing) section of the sotabench 226 | forums if you have any questions or difficulties. 227 | -------------------------------------------------------------------------------- /docs/mkdocs.yml: -------------------------------------------------------------------------------- 1 | site_name: sotabencheval Docs 2 | theme: 3 | name: 'material' 4 | palette: 5 | primary: 'cyan' 6 | accent: 'cyan' 7 | logo: 8 | icon: 'explore' 9 | markdown_extensions: 10 | - admonition 11 | - codehilite -------------------------------------------------------------------------------- /docs/site/img/squad20.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/docs/site/img/squad20.png -------------------------------------------------------------------------------- /requirements-dev.txt: -------------------------------------------------------------------------------- 1 | -r requirements.txt 2 | black==19.3b0 3 | flake8==3.7.8 4 | mkdocs-material 5 | pre-commit==1.18.3 6 | pydocstyle==4.0.1 7 | pygments 8 | pytest==5.1.1 9 | pytest-cov==2.7.1 10 | recommonmark==0.6.0 11 | sphinx==2.2.0 12 | sphinx-rtd-theme==0.4.3 13 | twine==1.13.0 -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Cython 2 | numpy 3 | pycocotools>=2.0.0 4 | sotabenchapi>=0.0.13 5 | tqdm>=4.32.2 6 | beautifulsoup4>=4.7.0 7 | sacrebleu==1.4.1 8 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E203,E402,W503,E701 3 | 4 | [pydocstyle] 5 | ignore = D10,D202,D203,D212,D213,D401,D403,D406,D407,D413 6 | 7 | [tool:pytest] 8 | testpaths = sotabench-eval/test 9 | python_files = test_*.py 10 | norecursedirs = .git 11 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import io 2 | from setuptools import setup, find_packages 3 | from sotabencheval.version import __version__ 4 | 5 | name = "sotabencheval" 6 | author = "Atlas ML" 7 | author_email = "hello@sotabench.com" 8 | license = "Apache-2.0" 9 | url = "https://sotabench.com" 10 | description = ( 11 | "Easily benchmark Machine Learning models on selected tasks and datasets" 12 | ) 13 | 14 | 15 
| def get_requirements(): 16 | with io.open("requirements.txt") as f: 17 | return [ 18 | line.strip() 19 | for line in f.readlines() 20 | if not line.strip().startswith("#") 21 | ] 22 | 23 | 24 | setup( 25 | name=name, 26 | version=__version__, 27 | author=author, 28 | author_email=author_email, 29 | maintainer=author, 30 | maintainer_email=author_email, 31 | description=description, 32 | long_description=io.open("README.md", "r", encoding="utf-8").read(), 33 | long_description_content_type="text/markdown", 34 | url=url, 35 | platforms=["Windows", "POSIX", "MacOSX"], 36 | license=license, 37 | packages=find_packages(), 38 | include_package_data=True, 39 | install_requires=get_requirements(), 40 | classifiers=[ 41 | "Programming Language :: Python :: 3", 42 | "Programming Language :: Python :: 3.6", 43 | "Programming Language :: Python :: 3.7", 44 | "License :: OSI Approved :: Apache Software License", 45 | "Operating System :: OS Independent", 46 | ], 47 | ) 48 | 49 | -------------------------------------------------------------------------------- /sotabencheval/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/paperswithcode/sotabench-eval/667d04e85ea7f2f6b3fdd709e1990ac88d132be7/sotabencheval/__init__.py -------------------------------------------------------------------------------- /sotabencheval/core/__init__.py: -------------------------------------------------------------------------------- 1 | from sotabencheval.core.evaluator import BaseEvaluator 2 | 3 | __all__ = ["BaseEvaluator"] -------------------------------------------------------------------------------- /sotabencheval/core/cache.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | CACHE_FLOAT_PRECISION = 3 4 | 5 | 6 | def cache_value(value): 7 | """ 8 | Takes in a value and puts it in a format ready for hashing + caching 9 | 10 | Why? In sotabench we hash the output after the first batch as an indication of whether the model has changed or not. 11 | If the model hasn't changed, then we don't run the whole evaluation on the server - but return the same results 12 | as before. This speeds up evaluation - making "continuous evaluation" more feasible...it also means lower 13 | GPU costs for us :). 14 | 15 | We apply some rounding and reformatting so small low precision changes do not change the hash. 
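    For example, with CACHE_FLOAT_PRECISION = 3, cache_value({'Top 5 Accuracy': 0.92345678})
    returns {'Top 5 Accuracy': 0.923}, so small floating point differences between runs round
    away and hash to the same value.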
16 | 17 | :param value: example model output 18 | :return: formatted value (rounded and ready for hashing) 19 | """ 20 | if isinstance(value, (str, int, bool)) or value is None: 21 | return value 22 | elif isinstance(value, float): 23 | return np.round(value, CACHE_FLOAT_PRECISION) 24 | elif isinstance(value, dict): 25 | return {key: cache_value(val) for key, val in sorted(value.items(), key=lambda x: x[0])} 26 | elif isinstance(value, list): 27 | return [cache_value(val) for val in value] 28 | elif isinstance(value, np.ndarray): 29 | return value.round(CACHE_FLOAT_PRECISION) 30 | -------------------------------------------------------------------------------- /sotabencheval/core/evaluator.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | from sotabenchapi.client import Client 4 | from sotabenchapi.core import BenchmarkResult 5 | from sotabencheval.utils import is_server 6 | from sotabencheval.core.cache import cache_value 7 | 8 | 9 | class BaseEvaluator: 10 | """Base class for evaluator objects on tasks 11 | 12 | Currently SQuAD and WMT use this as a parent. 13 | 14 | TODO: Refactor ImageNet, COCO, ADE20K, PASCAL to utilise this class 15 | 16 | The core API design relies upon: 17 | 18 | (a) Initializing an Evaluator object and linking to a paper, for example: 19 | 20 | .. code-block:: python 21 | 22 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion 23 | 24 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529', 25 | version=SQuADVersion.V20) 26 | 27 | The paper metadata allows the results to be linked to paper results when submitted to sotabench.com. 28 | 29 | (b) Adding Predictions (usually in batch) - example below for PyTorch iterating over DataLoader: 30 | 31 | .. code-block:: python 32 | 33 | for i, (input, target) in enumerate(data_loader): 34 | ... 35 | output = model(input) 36 | # potentially formatting of the output here 37 | evaluator.add(output) 38 | 39 | These results are accumulated and then evaluated - i.e. metrics are calculated once done. 40 | 41 | (c) Saving Results 42 | 43 | .. code-block:: python 44 | evaluator.save() 45 | 46 | Gets the evaluation results for the current predictions added to the Evaluation object - calculates metrics - 47 | then run if on the server, serializes results to a sotabench_results.json file which is processed and results 48 | are stored on the server. 49 | 50 | These three steps: initialization -> adding predictions -> saving and evaluating results are the core API. 51 | They should be capable of integration with any existing evaluation logic in your repository. 52 | """ 53 | 54 | def __init__(self, 55 | model_name: str = None, 56 | paper_arxiv_id: str = None, 57 | paper_pwc_id: str = None, 58 | paper_results: dict = None, 59 | model_description=None,): 60 | """ 61 | Initializes a BaseEvaluator like object 62 | 63 | :param model_name: (str) The name of the model, for example 'ResNet-101', which will be saved to sotabench.com 64 | :param paper_arxiv_id: (str, optional) The paper that the model is linked to, e.g. '1906.06423' 65 | :param paper_pwc_id: (str, optional) The PWC paper id (slug), e.g. 'albert-a-lite-bert-for-self-supervised' 66 | :param paper_results: (dict, optional) If the paper you are linking to does not have results on sotabench, 67 | then you can add paper results here. This will be a dictionary with keys as metric names, and values as metric 68 | values. This will be benchmark specific. 
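        For example, for a WikiText-103 language model this might be {'Test perplexity': 18.2}; make sure the
        metric names match those used on the corresponding sotabench leaderboard.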
69 | :param model_description: (str, optional) Optional description for the model; this can contain details about 70 | where the weights are from, details about training, and more. This will appear in an info box for the model 71 | when it is displayed on sotabench.com. 72 | """ 73 | 74 | # Model metadata 75 | 76 | self.model_name = model_name 77 | self.paper_arxiv_id = paper_arxiv_id 78 | self.paper_pwc_id = paper_pwc_id 79 | self.paper_results = paper_results 80 | self.model_description = model_description 81 | 82 | # Backend variables for hashing and caching 83 | 84 | self.first_batch_processed = False 85 | self.batch_hash = None 86 | self.cached_results = False 87 | self.results = None 88 | self._cache_exists = None 89 | 90 | # Speed and memory metrics 91 | 92 | self.init_time = time.time() 93 | self.speed_mem_metrics = {} 94 | 95 | @property 96 | def cache_exists(self): 97 | """ 98 | Checks whether the cache exists in the sotabench.com database - if so 99 | then sets self.results to cached results and returns True. 100 | 101 | You can use this property for control flow to break a for loop over a dataset 102 | after the first iteration. This prevents re-running the same calculation for the 103 | same model twice. 104 | 105 | Q: Why should the user use this? 106 | A: If you want fast "continuous evaluation" and don't want to avoid rerunning the same model over and over 107 | each time you commit something new to your repository. 108 | 109 | Examples: 110 | Breaking a for loop if the model is the same as last time we ran 111 | 112 | .. code-block:: python 113 | 114 | ... 115 | 116 | with torch.no_grad(): 117 | for i, (input, target) in enumerate(iterator): 118 | ... 119 | output = model(input) 120 | # optional formatting of output here to be a list of detection dicts 121 | evaluator.add(output) 122 | 123 | if evaluator.cache_exists: 124 | break 125 | 126 | evaluator.save() 127 | 128 | This logic is for the server; it will not break the loop if you evaluate locally. 129 | 130 | :return: bool or None (if not on server) 131 | """ 132 | 133 | if not is_server(): # we only check the cache on the server 134 | return None 135 | 136 | if not self.first_batch_processed: 137 | return False 138 | 139 | if self._cache_exists is not None: 140 | return self._cache_exists 141 | 142 | client = Client.public() 143 | cached_res = client.get_results_by_run_hash(self.batch_hash) 144 | if cached_res: 145 | self.results = cached_res 146 | self.cached_results = True 147 | print( 148 | "No model change detected (using the first batch run " 149 | f"hash {self.batch_hash}). Will use cached results." 
150 | ) 151 | 152 | self._cache_exists = True 153 | else: 154 | self._cache_exists = False 155 | return self._cache_exists 156 | 157 | def reset(self): 158 | """Resets the internal state of evaluator and allows to start over""" 159 | pass 160 | 161 | def cache_values(self, **kwargs): 162 | """ 163 | Takes in keyword argument and converts to a hashable (cachable) format for each 164 | 165 | :param kwargs: keyword argument 166 | :return: cachable version of the keyword arguments 167 | """ 168 | return cache_value(kwargs) 169 | 170 | def eval(self, results_generator): 171 | """Run full evaluation loop on results_genertor""" 172 | self.reset() 173 | self.reset_time() 174 | for results in results_generator: 175 | self.add(*results) 176 | if self.first_batch_processed and self.cache_exists: 177 | break 178 | self.save() 179 | return self 180 | 181 | def get_results(self): 182 | """Calculate results.""" 183 | return self.results 184 | 185 | def print_results(self): 186 | """Print results.""" 187 | self.get_results() 188 | print(f"results = {self.results}, speed_mem_metrics = {self.speed_mem_metrics}") 189 | 190 | def reset_time(self): 191 | """ 192 | Simple method to reset the timer self.init_time. Often used before a loop, to time the evaluation 193 | appropriately, for example: 194 | 195 | .. code-block:: python 196 | 197 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion 198 | 199 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529', 200 | version=SQuADVersion.V20) 201 | 202 | # processing/setup logic here 203 | 204 | evaluator.reset_time() 205 | 206 | for i, (input, target) in enumerate(data_loader): 207 | ... 208 | output = model(input) 209 | # potentially formatting of the output here 210 | evaluator.add(output) 211 | 212 | evaluator.save() 213 | 214 | Above we may have processing logic inbetween the evaluator initialization and the actual evaluation loop, so 215 | we reset the timer so it's a fair timing of the evaluation (and not setup steps like data processing, loading 216 | the model etc). 217 | 218 | :return: void - resets self.init_time 219 | """ 220 | self.init_time = time.time() 221 | 222 | def save(self, **kwargs): 223 | """ 224 | Calculate results and then put into a BenchmarkResult object 225 | 226 | On the sotabench.com server, this will produce a JSON file serialisation in sotabench_results.json and results 227 | will be recorded on the platform. 228 | 229 | Users should save once all predictions are added, for instance: 230 | 231 | .. code-block:: python 232 | 233 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion 234 | 235 | evaluator = SQuADEvaluator(model_name='SpanBERT', paper_arxiv_id='1907.10529', 236 | version=SQuADVersion.V20) 237 | 238 | # processing/setup logic here 239 | 240 | evaluator.reset_time() 241 | 242 | for i, (input, target) in enumerate(data_loader): 243 | ... 244 | output = model(input) 245 | # potentially formatting of the output here 246 | evaluator.add(output) 247 | 248 | evaluator.save() 249 | 250 | Here once we have added all the predictions to the evaluator, we .save() so we evaluate and, if on the server, 251 | results are serialized and saved to the server. 
252 | 253 | :return: BenchmarkResult object with results and metadata 254 | """ 255 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation 256 | self.get_results() 257 | 258 | return BenchmarkResult( 259 | task=self.task, 260 | config={}, 261 | results=self.results, 262 | speed_mem_metrics=self.speed_mem_metrics, 263 | model=self.model_name, 264 | model_description=self.model_description, 265 | arxiv_id=self.paper_arxiv_id, 266 | pwc_id=self.paper_pwc_id, 267 | paper_results=self.paper_results, 268 | run_hash=self.batch_hash, 269 | **kwargs, 270 | ) 271 | -------------------------------------------------------------------------------- /sotabencheval/image_classification/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["ImageNetEvaluator"] 2 | 3 | from sotabencheval.image_classification.imagenet import ImageNetEvaluator -------------------------------------------------------------------------------- /sotabencheval/image_classification/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def top_k_accuracy_score(y_true, y_pred, k=5, normalize=True): 4 | """ 5 | Top k Accuracy classification score. 6 | :param y_true: the true labels (np.ndarray) 7 | :param y_pred: the predicted labels (np.ndarray) 8 | :param k: calculates top k accuracy (int) 9 | :param normalize: whether to normalize by the number of observations 10 | :return: the top k accuracy 11 | """ 12 | 13 | if len(y_true.shape) == 2: 14 | y_true = y_true[0] # should be one-dimensional 15 | 16 | num_obs, num_labels = y_pred.shape 17 | 18 | idx = num_labels - k - 1 19 | counter = 0 20 | argsorted = np.argsort(y_pred, axis=1) 21 | 22 | for i in range(num_obs): 23 | if y_true[i] in argsorted[i, idx+1:]: 24 | counter += 1 25 | if normalize: 26 | return counter / num_obs 27 | else: 28 | return counter -------------------------------------------------------------------------------- /sotabencheval/language_modelling/__init__.py: -------------------------------------------------------------------------------- 1 | from sotabencheval.language_modelling.wikitext import WikiText103Evaluator, WikiText2Evaluator, WikiTextEvaluator, WikiTextDataset 2 | 3 | __all__ = ["WikiText103Evaluator", "WikiText2Evaluator", 4 | "WikiTextEvaluator", "WikiTextDataset"] 5 | -------------------------------------------------------------------------------- /sotabencheval/language_modelling/wikitext.py: -------------------------------------------------------------------------------- 1 | import time 2 | from enum import Enum 3 | from pathlib import Path 4 | 5 | import numpy as np 6 | 7 | from sotabencheval.core import BaseEvaluator 8 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server, get_max_memory_allocated 9 | 10 | 11 | class WikiTextDataset(Enum): 12 | """Enum used to select the dataset on which evaluation is executed. 
""" 13 | WikiText103 = ('WikiText-103', 245569, 267735) 14 | WikiText2 = ('WikiText-2', 245569, 33278) 15 | 16 | def __init__(self, pwc_name, testset_size, vocab_size): 17 | """ 18 | Creates an enum instance 19 | :param pwc_name: the name of the dataset as it is found on paperswithcode leaderboard 20 | :param testset_size: the size of the test set in words 21 | :param vocab_size: the size of the dataset vocabluary 22 | """ 23 | self.pwc_name = pwc_name 24 | self.testset_size = testset_size 25 | self.vocab_size = vocab_size 26 | 27 | def _get_path(self, local_root, local_unzip=False): 28 | root = Path(change_root_if_server(root=local_root, 29 | server_root=".data/nlp/" + self.pwc_name.lower())) 30 | zip_name = self.pwc_name.lower() + "-v1.zip" 31 | dataset_path = root / "wiki.test.tokens" 32 | if not dataset_path.exists(): # unzip 33 | extract_archive(str(root / zip_name), to_path=root.parent) 34 | return dataset_path 35 | 36 | get_path = _get_path # deprecated API, for backward compatibility with existing benchmarks 37 | 38 | def get_test_set_path(self, local_root): 39 | """ 40 | Unzips the datasets and returns path to "wiki.test.tokens" 41 | :param local_root: Path to the directory where the dataset files are located locally. 42 | Ignored when run on sotabench server. 43 | """ 44 | return self.get_path(local_root).parent / "wiki.test.tokens" 45 | 46 | def get_validation_set_path(self, local_root): 47 | """ 48 | Unzips the datasets and returns path to "wiki.test.tokens" 49 | :param local_root: Path to the directory where the dataset files are located locally. 50 | Ignored when run on sotabench server. 51 | """ 52 | return self.get_path(local_root).parent / "wiki.valid.tokens" 53 | 54 | def _to_numpy(*args): 55 | def convert(a): 56 | if hasattr(a, 'cpu') and hasattr(a, 'numpy'): 57 | return a.cpu().numpy() 58 | if isinstance(a, list): 59 | return np.array(a) 60 | return a 61 | return [convert(a) for a in args] 62 | 63 | def _gather_probs(log_probs, targets): 64 | """ 65 | Gather probabilities of each target token, from the model activations after log_softmax 66 | :param log_probs: - `torch.tensor`/`np.ndarray` shape [bs x seq_len x vocab_sz] 67 | with model activations after `log_softmax`, with log probability of each word in the vocab 68 | :param targets: - `torch.tensor`/`np.ndarray` shape [bs x seq_len] with ground truth words 69 | """ 70 | if hasattr(log_probs, 'gather'): 71 | # if we work with torch this method is faster than numpy implementation 72 | probs = log_probs.gather(-1, targets.unsqueeze(-1)) 73 | elif isinstance(log_probs, np.ndarray): 74 | # use slower numpy implementation if we have ndarrays 75 | vocab_sz = int(log_probs.shape[-1]) 76 | log_probs, targets = _to_numpy(log_probs, targets) 77 | log_probs = log_probs.reshape(-1, vocab_sz) 78 | targets = targets.reshape(-1) 79 | probs = log_probs[np.arange(log_probs.shape[0]), targets] 80 | return _to_numpy(probs, targets) 81 | 82 | 83 | class WikiTextEvaluator(BaseEvaluator): 84 | task = "Language Modelling" 85 | dataset = None # defined in a subclass 86 | 87 | def __init__(self, 88 | local_root: str = '.', 89 | model_name: str = None, 90 | paper_arxiv_id: str = None, 91 | paper_pwc_id: str = None, 92 | paper_results: dict = None, 93 | model_description=None, 94 | subword_tokenization: bool = False, 95 | text_transformation: bool = False, 96 | dataset=None): 97 | """ 98 | Creates an evaluator for one of the WikiText benchmarks. 99 | 100 | :param local_root: Path to the directory where the dataset files are located locally. 
101 | Ignored when run on sotabench server. 102 | :param model_name: The name of the model from the 103 | paper - if you want to link your build to a model from a 104 | machine learning paper. See the WikiText-103 benchmarks page for model names, 105 | (f.e., https://sotabench.com/benchmarks/language-modelling-on-wikitext-103) 106 | on the paper leaderboard or models yet to try tab. 107 | :param paper_arxiv_id: Optional linking to arXiv if you 108 | want to link to papers on the leaderboard; put in the 109 | corresponding paper's arXiv ID, e.g. '1901.02860'. 110 | :param paper_pwc_id: Optional linking to Papers With Code; 111 | put in the corresponding papers with code URL slug, e.g. 112 | "transformer-xl-attentive-language-models" 113 | :param paper_results: If the paper model you are reproducing 114 | does not have model results on sotabench.com, you can specify 115 | the paper results yourself through this argument, where keys 116 | are metric names, values are metric values. e.g: 117 | 118 | {'Test perplexity': 18.2 }. 119 | 120 | Ensure that the metric names match those on the sotabench 121 | leaderboard - for WikiText benchmarks it should be `Test perplexity`. 122 | :param model_description: Optional model description. 123 | :param subword_tokenization: Should be set to `True` if your model use subword tokens defaults to `False`, 124 | :param text_transformation: Should be set to `True` if you use detokenizers that removes moses artefacts, f.e. in zero shoot setting, 125 | :param dataset: internal paramtere do not set in subclasses. 126 | """ 127 | super().__init__(model_name, paper_arxiv_id, 128 | paper_pwc_id, paper_results, model_description) 129 | if dataset is not None: 130 | self.dataset = dataset 131 | self.subword_tokenization = subword_tokenization 132 | self.text_transformation = text_transformation 133 | self.local_root = local_root 134 | self._neglogloss = 0 135 | self._data_set_size = 0 136 | 137 | @property 138 | def dataset_path(self): # deprecated 139 | return self.dataset.get_path(self.local_root) 140 | 141 | @property 142 | def test_set_path(self): 143 | """Returns path to test set, uses `self.local_root` when it is not on the server""" 144 | return self.get_test_set_path(self.local_root) 145 | 146 | @classmethod 147 | def get_test_set_path(cls, local_root): 148 | """ 149 | Unzips the datasets and returns path to "wiki.test.tokens" 150 | :param local_root: Path to the directory where the dataset files are located locally. 151 | Ignored when run on sotabench server. 152 | """ 153 | return cls.dataset.get_test_set_path(local_root) 154 | 155 | def reset(self): 156 | """ 157 | Removes already added results 158 | 159 | 160 | When checking if the model should be rerun on whole dataset it is first run on a smaller subset 161 | and the results are compared with values cached on sotabench server (the check is not performed 162 | when running locally.) Ideally, the smaller subset is just the first batch, so no additional 163 | computation is needed. However, for more complex multistage pipelines it maybe simpler to 164 | run a model twice - on a small dataset and (if necessary) on the full dataset. In that case 165 | :func:`reset` needs to be called before the second run so values from the first run are not reported. 166 | 167 | .. seealso:: :func:`cache_exists` 168 | .. 
169 | """
170 | self._neglogloss = 0
171 | self._data_set_size = 0
172 | 
173 | def add(self, log_probs, targets):
174 | """
175 | Updates the evaluator with new results
176 | 
177 | :param log_probs: `np.ndarray` or `torch.tensor` with log probabilities of the target tokens; can be either:
178 | - a 0d tensor
179 | summed log probability of all `targets` tokens, or
180 | - a 2d tensor [bs x seq_len]
181 | log probabilities of each target token; the shapes of `log_probs` and `targets` must match, or
182 | - a 3d tensor [bs x seq_len x vocab_size]
183 | distribution of log probabilities for each position in the sequence;
184 | we will gather the probabilities of the target tokens for you.
185 | :param targets: a `np.ndarray` or `torch.tensor` with ids of the ground truth tokens.
186 | 
187 | Examples:
188 | Update the evaluator with a result for a sentence with 10 tokens:
189 | 
190 | .. code-block:: python
191 | targets = np.array([[ 32, 582, 2731, 19, 1, 786, 5, 98693, 55362, 5 ]])
192 | log_probs = np.array([[ -9.8461, -9.3343, -17.8042, -11.2006, -22.3345, -14.4665, -2.0055,
193 | -14.2044, -14.7545, -5.7888]])
194 | my_evaluator.add(log_probs, targets)
195 | """
196 | if isinstance(log_probs, float):
197 | log_probs = np.array([log_probs]) # for sum to work
198 | elif log_probs.shape[:-1] == targets.shape:
199 | log_probs, targets = _gather_probs(log_probs, targets)
200 | else:
201 | assert log_probs.shape == targets.shape, f"log_probs have to be either gathered log probabilities of the targets or the full distribution over the vocabulary, received {log_probs.shape} {repr(log_probs)}"
202 | self._neglogloss += - float(log_probs.sum())
203 | self._data_set_size += int(np.prod(list(targets.shape)))
204 | 
205 | if not self.first_batch_processed:
206 | content = self.cache_values(
207 | probs=_to_numpy(log_probs)[0].reshape(-1),
208 | api_version=3)
209 | self.batch_hash = calculate_batch_hash(content)
210 | self.first_batch_processed = True
211 | return self.results
212 | 
213 | def print_results(self):
214 | """ Calculates and prints results. """
215 | super().print_results()
216 | print("Perplexity:", np.exp(self._neglogloss / self.dataset.testset_size),
217 | "NeglogLoss:", self._neglogloss, "Tokens Count:", self._data_set_size)
218 | 
219 | print_stats = print_results
220 | 
221 | def get_results(self):
222 | """
223 | Calculates the perplexity and measures the performance of the model
224 | 
225 | :return: dict with `Test perplexity`
226 | """
227 | if self.cached_results:
228 | return self.results
229 | perplexity = np.exp(self._neglogloss /
230 | self.dataset.testset_size)
231 | 
232 | self.results = {
233 | 'Test perplexity': perplexity
234 | }
235 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
236 | exec_speed = (time.time() - self.init_time)
237 | count = self.dataset.testset_size
238 | self.speed_mem_metrics['Tasks / Evaluation Time'] = count / exec_speed
239 | self.speed_mem_metrics['Tasks'] = count
240 | self.speed_mem_metrics['Evaluation Time'] = exec_speed
241 | return self.results
242 | 
243 | def save(self):
244 | """Save results to the server database."""
245 | return super().save(dataset=self.dataset.pwc_name)
246 | 
247 | 
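# --- Editor's illustration (not part of the original module) ---------------------------------
# `get_results` above turns the accumulated negative log-likelihood into perplexity as
#
#     perplexity = np.exp(self._neglogloss / self.dataset.testset_size)
#
# Note that the denominator is always the word-level test set size stored in the dataset enum
# (245,569 words for the WikiText test sets), not the number of (possibly subword) tokens that
# were actually added, so subword-tokenized models remain comparable. For example, a summed
# negative log-likelihood of roughly 712,000 nats over WikiText-103 corresponds to
# np.exp(712000 / 245569) ~ 18.2 perplexity.
# ----------------------------------------------------------------------------------------------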
248 | class WikiText103Evaluator(WikiTextEvaluator):
249 | """`WikiText103 `_ benchmark.
250 | 
251 | Examples:
252 | Evaluate a language model from the transformers repository:
253 | 
254 | .. code-block:: python
255 | 
256 | import torch
257 | from tqdm import tqdm
258 | from sotabencheval.language_modelling import WikiText103Evaluator
259 | 
260 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
261 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
262 | 
263 | evaluator = WikiText103Evaluator(
264 | model_name="Transformer-XL Large",
265 | paper_arxiv_id="1901.02860",
266 | paper_pwc_id="transformer-xl-attentive-language-models",
267 | local_root='/content/wikitext-103'
268 | )
269 | 
270 | with evaluator.test_set_path.open() as f:
271 | test_data = torch.tensor(tokenizer.encode(f.read()))
272 | 
273 | seq_len = 128
274 | with torch.no_grad():
275 | evaluator.reset_timer()
276 | model.eval()
277 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
278 | for s in tqdm(range(0, X.shape[-1], seq_len)):
279 | x, y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
280 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
281 | evaluator.add(log_probs, y)
282 | if evaluator.cache_exists:
283 | break
284 | evaluator.save()
285 | evaluator.print_results()
286 | """
287 | dataset = WikiTextDataset.WikiText103
288 | 
289 | 
290 | class WikiText2Evaluator(WikiTextEvaluator):
291 | """`WikiText2 `_ benchmark.
292 | 
293 | Examples:
294 | Evaluate a language model from the transformers repository:
295 | 
296 | .. code-block:: python
297 | 
298 | import torch
299 | from tqdm import tqdm
300 | from sotabencheval.language_modelling import WikiText2Evaluator
301 | 
302 | model = torch.hub.load('huggingface/transformers', 'modelWithLMHead', 'transfo-xl-wt103').to("cuda")
303 | tokenizer = torch.hub.load('huggingface/transformers', 'tokenizer', 'transfo-xl-wt103')
304 | 
305 | evaluator = WikiText2Evaluator(
306 | model_name="Transformer-XL Large",
307 | paper_arxiv_id="1901.02860",
308 | paper_pwc_id="transformer-xl-attentive-language-models",
309 | local_root='/content/wikitext-2'
310 | )
311 | 
312 | with evaluator.test_set_path.open() as f:
313 | test_data = torch.tensor(tokenizer.encode(f.read()))
314 | 
315 | seq_len = 128
316 | with torch.no_grad():
317 | evaluator.reset_timer()
318 | model.eval()
319 | X, Y, mems = test_data[None, :-1], test_data[None, 1:], None
320 | for s in tqdm(range(0, X.shape[-1], seq_len)):
321 | x, y = X[..., s:s+seq_len].to("cuda"), Y[..., s:s+seq_len].to("cuda")
322 | log_probs, mems, *_ = model(input_ids=x, mems=mems)
323 | evaluator.add(log_probs, y)
324 | if evaluator.cache_exists:
325 | break
326 | evaluator.save()
327 | evaluator.print_results()
328 | """
329 | dataset = WikiTextDataset.WikiText2
330 | 
--------------------------------------------------------------------------------
/sotabencheval/machine_translation/__init__.py:
--------------------------------------------------------------------------------
1 | from sotabencheval.machine_translation.wmt import WMTEvaluator, WMTDataset
2 | from sotabencheval.machine_translation.metrics import TranslationMetrics
3 | from sotabencheval.machine_translation.languages import Language
4 | 
5 | __all__ = ["WMTDataset", "WMTEvaluator", "TranslationMetrics", "Language"]
6 | 
--------------------------------------------------------------------------------
/sotabencheval/machine_translation/languages.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 | 
3 | _full_forms = {
4 | "en": "English",
5 | "fr": "French",
6 | "de": "German",
7 | }
8 | 
9 | 
10 | class 
Language(Enum): 11 | English = "en" 12 | French = "fr" 13 | German = "de" 14 | 15 | @property 16 | def fullname(self): 17 | return _full_forms[self.value] 18 | -------------------------------------------------------------------------------- /sotabencheval/machine_translation/metrics.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | from bs4 import BeautifulSoup 3 | from pathlib import Path 4 | from typing import Dict, List, Callable 5 | from collections import OrderedDict 6 | from sacrebleu import corpus_bleu 7 | 8 | 9 | MIN_CACHE_BATCH_SIZE = 32 10 | 11 | 12 | class TranslationMetrics: 13 | def __init__(self, 14 | source_dataset_path: Path, 15 | target_dataset_path: Path, 16 | tokenization: Callable[[str], str] = None): 17 | self._src_dataset_path = source_dataset_path 18 | self._dst_dataset_path = target_dataset_path 19 | self.answers = {} 20 | self.source_documents, self.source_segments = self._load_dataset(self._src_dataset_path) 21 | self._target_documents, self._target_segments = self._load_dataset(self._dst_dataset_path) 22 | self._tokenization = tokenization 23 | self._results = None 24 | 25 | def _load_dataset(self, dataset_path): 26 | documents = read_sgm_file(dataset_path) 27 | segments = OrderedDict([(segment.id, segment.text) for doc in documents for segment in doc.segments]) 28 | return documents, segments 29 | 30 | def add(self, answers: Dict[str, str]): 31 | if not answers: 32 | print("Empty batch added to results") 33 | return 34 | if set(self.answers.keys()) & set(answers.keys()): 35 | print("Multiple translations for the same segment") 36 | self.answers.update(answers) 37 | 38 | def reset(self): 39 | self._results = None 40 | self.answers = {} 41 | 42 | def evaluate(self, ignore_missing=False): 43 | if ignore_missing: 44 | keep = set(self.answers.keys()) 45 | target_segments = {sid: text for sid, text in self._target_segments.items() if sid in keep} 46 | else: 47 | target_segments = self._target_segments 48 | answers = [self.answers.get(sid, "") for sid in target_segments] 49 | references = [target for target in target_segments.values()] 50 | bleu = corpus_bleu(answers, [references]) 51 | self._results = {'SacreBLEU': bleu.score} 52 | 53 | if self._tokenization is not None: 54 | tok_answers = [self._tokenization(answer) for answer in answers] 55 | tok_references = [self._tokenization(target) for target in references] 56 | tok_bleu = corpus_bleu(tok_answers, [tok_references], tokenize='none', force=True) 57 | self._results['BLEU score'] = tok_bleu.score 58 | 59 | @property 60 | def has_data(self): 61 | return len(self.answers) >= MIN_CACHE_BATCH_SIZE 62 | 63 | def get_results(self, ignore_missing=False): 64 | self.evaluate(ignore_missing) 65 | return self._results 66 | 67 | 68 | @dataclass 69 | class Segment: 70 | id: str 71 | text: str 72 | 73 | 74 | @dataclass 75 | class Document: 76 | id: str 77 | segments: List[Segment] 78 | 79 | 80 | def read_sgm_file(path): 81 | with open(path, 'rb') as f: 82 | soup = BeautifulSoup(f.read(), features="html.parser") 83 | 84 | return [ 85 | Document( 86 | id=doc['docid'], 87 | segments=[ 88 | Segment( 89 | id=doc['docid'] + '#' + seg['id'], 90 | text=seg.text 91 | ) for seg in doc.find_all('seg') 92 | ] 93 | ) for doc in soup.find_all('doc') 94 | ] 95 | -------------------------------------------------------------------------------- /sotabencheval/machine_translation/wmt.py: -------------------------------------------------------------------------------- 1 | 
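# --- Editor's illustration (not part of the original module) ---------------------------------
# The scoring in `TranslationMetrics.evaluate` (metrics.py above) reduces to a single sacrebleu
# call over the collected hypotheses and the reference segments, roughly:
#
#     from sacrebleu import corpus_bleu
#     hypotheses = ["The cat sat on the mat."]            # system translations, in reference order
#     references = ["The cat sat on the mat."]            # one reference per hypothesis
#     print(corpus_bleu(hypotheses, [references]).score)  # 100.0 for an exact match
#
# Segments with no submitted translation are scored as empty strings unless
# `evaluate(ignore_missing=True)` is used.
# ----------------------------------------------------------------------------------------------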
from sotabencheval.core import BaseEvaluator 2 | from sotabencheval.utils import calculate_batch_hash, change_root_if_server, is_server 3 | from sotabencheval.machine_translation.languages import Language 4 | from sotabencheval.machine_translation.metrics import TranslationMetrics 5 | from sotabencheval.utils import get_max_memory_allocated 6 | from typing import Dict, Callable 7 | from pathlib import Path 8 | from enum import Enum 9 | import time 10 | 11 | 12 | class WMTDataset(Enum): 13 | News2014 = "newstest2014" 14 | News2019 = "newstest2019" 15 | 16 | 17 | class WMTEvaluator(BaseEvaluator): 18 | """Evaluator for WMT Machine Translation benchmarks. 19 | 20 | Examples: 21 | Evaluate a Transformer model from the fairseq repository on WMT2019 news test set: 22 | 23 | .. code-block:: python 24 | 25 | from sotabencheval.machine_translation import WMTEvaluator, WMTDataset, Language 26 | from tqdm import tqdm 27 | import torch 28 | 29 | evaluator = WMTEvaluator( 30 | dataset=WMTDataset.News2019, 31 | source_lang=Language.English, 32 | target_lang=Language.German, 33 | local_root="data/nlp/wmt", 34 | model_name="Facebook-FAIR (single)", 35 | paper_arxiv_id="1907.06616" 36 | ) 37 | 38 | model = torch.hub.load('pytorch/fairseq', 'transformer.wmt19.en-de.single_model', 39 | force_reload=True, tokenizer='moses', bpe='fastbpe').cuda() 40 | 41 | for sid, text in tqdm(evaluator.source_segments.items()): 42 | translated = model.translate(text) 43 | evaluator.add({sid: translated}) 44 | if evaluator.cache_exists: 45 | break 46 | 47 | evaluator.save() 48 | print(evaluator.results) 49 | """ 50 | 51 | task = "Machine Translation" 52 | 53 | _datasets = { 54 | (WMTDataset.News2014, Language.English, Language.German), 55 | (WMTDataset.News2019, Language.English, Language.German), 56 | (WMTDataset.News2014, Language.English, Language.French), 57 | } 58 | 59 | def __init__(self, 60 | dataset: WMTDataset, 61 | source_lang: Language, 62 | target_lang: Language, 63 | local_root: str = '.', 64 | source_dataset_filename: str = None, 65 | target_dataset_filename: str = None, 66 | model_name: str = None, 67 | paper_arxiv_id: str = None, 68 | paper_pwc_id: str = None, 69 | paper_results: dict = None, 70 | model_description: str = None, 71 | tokenization: Callable[[str], str] = None): 72 | """ 73 | Creates an evaluator for one of the WMT benchmarks. 74 | 75 | :param dataset: Which dataset to evaluate on, f.e., WMTDataset.News2014. 76 | :param source_lang: Source language of the documents to translate. 77 | :param target_lang: Target language into which the documents are translated. 78 | :param local_root: Path to the directory where the dataset files are located locally. 79 | Ignored when run on sotabench server. 80 | :param source_dataset_filename: Local filename of the SGML file with the source documents. 81 | If None, the standard WMT filename is used, based on :param:`dataset`, 82 | :param:`source_lang` and :param:`target_lang`. 83 | Ignored when run on sotabench server. 84 | :param target_dataset_filename: Local filename of the SGML file with the reference documents. 85 | If None, the standard WMT filename is used, based on :param:`dataset`, 86 | :param:`source_lang` and :param:`target_lang`. 87 | Ignored when run on sotabench server. 88 | :param model_name: The name of the model from the 89 | paper - if you want to link your build to a model from a 90 | machine learning paper. 
See the WMT benchmark pages for model names
91 | (e.g., https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german)
92 | on the paper leaderboard or the "models yet to try" tab.
93 | :param paper_arxiv_id: Optional linking to arXiv if you
94 | want to link to papers on the leaderboard; put in the
95 | corresponding paper's arXiv ID, e.g. '1907.06616'.
96 | :param paper_pwc_id: Optional linking to Papers With Code;
97 | put in the corresponding papers with code URL slug, e.g.
98 | 'facebook-fairs-wmt19-news-translation-task'
99 | :param paper_results: If the paper model you are reproducing
100 | does not have model results on sotabench.com, you can specify
101 | the paper results yourself through this argument, where keys
102 | are metric names and values are metric values, e.g.:
103 | 
104 | {'SacreBLEU': 42.7, 'BLEU score': 43.1}
105 | 
106 | Ensure that the metric names match those on the sotabench
107 | leaderboard - for WMT benchmarks it should be `SacreBLEU` for the detokenized,
108 | case-sensitive BLEU score and `BLEU score` for tokenized BLEU.
109 | :param model_description: Optional model description.
110 | :param tokenization: An optional tokenization function to compute the tokenized BLEU score.
111 | It takes a single string - a segment to tokenize - and returns a string with tokens
112 | separated by spaces, e.g.:
113 | 
114 | tokenization = lambda seg: seg.replace("'s", " 's").replace("-", " - ")
115 | 
116 | If None, only the detokenized SacreBLEU score is reported.
117 | """
118 | 
119 | super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description)
120 | self.root = change_root_if_server(root=local_root,
121 | server_root=".data/nlp/wmt")
122 | self.dataset = dataset
123 | self.source_lang = source_lang
124 | self.target_lang = target_lang
125 | 
126 | default_src_fn, default_dst_fn = self._get_source_dataset_filename()
127 | if source_dataset_filename is None or is_server():
128 | source_dataset_filename = default_src_fn
129 | 
130 | if target_dataset_filename is None or is_server():
131 | target_dataset_filename = default_dst_fn
132 | 
133 | self.source_dataset_path = Path(self.root) / source_dataset_filename
134 | self.target_dataset_path = Path(self.root) / target_dataset_filename
135 | 
136 | self.metrics = TranslationMetrics(self.source_dataset_path, self.target_dataset_path, tokenization)
137 | 
138 | def _get_source_dataset_filename(self):
139 | if self.dataset == WMTDataset.News2014:
140 | other_lang = self.source_lang.value if self.target_lang == Language.English else self.target_lang.value
141 | source = "{0}-{1}en-src.{2}.sgm".format(self.dataset.value, other_lang, self.source_lang.value)
142 | target = "{0}-{1}en-ref.{2}.sgm".format(self.dataset.value, other_lang, self.target_lang.value)
143 | elif self.dataset == WMTDataset.News2019:
144 | source = "{0}-{1}{2}-src.{1}.sgm".format(self.dataset.value, self.source_lang.value, self.target_lang.value)
145 | target = "{0}-{1}{2}-ref.{2}.sgm".format(self.dataset.value, self.source_lang.value, self.target_lang.value)
146 | else:
147 | raise ValueError("Unknown dataset: {}".format(self.dataset))
148 | return source, target
149 | 
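# --- Editor's illustration (not part of the original class) ----------------------------------
# For the supported configurations, `_get_source_dataset_filename` above resolves to the
# standard WMT SGM file names, for example:
#
#     WMTDataset.News2019, English -> German:
#         source = "newstest2019-ende-src.en.sgm", target = "newstest2019-ende-ref.de.sgm"
#     WMTDataset.News2014, English -> German:
#         source = "newstest2014-deen-src.en.sgm", target = "newstest2014-deen-ref.de.sgm"
#
# Unless custom filenames are passed, these are the files expected under `local_root` when
# running outside the sotabench server.
# ----------------------------------------------------------------------------------------------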
150 | def _get_dataset_name(self):
151 | cfg = (self.dataset, self.source_lang, self.target_lang)
152 | if cfg not in WMTEvaluator._datasets:
153 | raise ValueError("Unsupported dataset configuration: {} {} {}".format(
154 | self.dataset.name,
155 | self.source_lang.name,
156 | self.target_lang.name
157 | ))
158 | 
159 | ds_names = {WMTDataset.News2014: "WMT2014", WMTDataset.News2019: "WMT2019"}
160 | return "{0} {1}-{2}".format(ds_names.get(self.dataset), self.source_lang.fullname, self.target_lang.fullname)
161 | 
162 | def add(self, answers: Dict[str, str]):
163 | """
164 | Updates the evaluator with new results
165 | 
166 | :param answers: a dict where keys are source segment ids and values are translated segments
167 | (a segment id is created by concatenating the document id and the original segment id,
168 | separated by `#`.)
169 | 
170 | Examples:
171 | Update the evaluator with three results:
172 | 
173 | .. code-block:: python
174 | 
175 | my_evaluator.add({
176 | 'bbc.381790#1': 'Waliser AMs sorgen sich um "Aussehen wie Muppets"',
177 | 'bbc.381790#2': 'Unter einigen AMs herrscht Bestürzung über einen...',
178 | 'bbc.381790#3': 'Sie ist aufgrund von Plänen entstanden, den Namen...'
179 | })
180 | 
181 | .. seealso:: `source_segments`
182 | """
183 | 
184 | self.metrics.add(answers)
185 | 
186 | if not self.first_batch_processed and self.metrics.has_data:
187 | self.batch_hash = calculate_batch_hash(
188 | self.cache_values(answers=self.metrics.answers,
189 | metrics=self.metrics.get_results(ignore_missing=True))
190 | )
191 | self.first_batch_processed = True
192 | 
193 | @property
194 | def source_segments(self):
195 | """
196 | Ordered dictionary of all segments to translate, with segment ids as keys. The same segment ids
197 | have to be used when submitting translations with :func:`add`.
198 | 
199 | Examples:
200 | 
201 | .. code-block:: python
202 | 
203 | for segment_id, text in my_evaluator.source_segments.items():
204 | translated = model(text)
205 | my_evaluator.add({segment_id: translated})
206 | 
207 | .. seealso:: `source_documents`
208 | """
209 | 
210 | return self.metrics.source_segments
211 | 
212 | @property
213 | def source_documents(self):
214 | """
215 | List of all documents to translate
216 | 
217 | Examples:
218 | 
219 | .. code-block:: python
220 | 
221 | for document in my_evaluator.source_documents:
222 | for segment in document.segments:
223 | translated = model(segment.text)
224 | my_evaluator.add({segment.id: translated})
225 | 
226 | .. seealso:: `source_segments`
227 | """
228 | 
229 | return self.metrics.source_documents
230 | 
231 | def reset(self):
232 | """
233 | Removes already added translations
234 | 
235 | When checking if the model should be rerun on the whole dataset, it is first run on a smaller subset
236 | and the results are compared with values cached on the sotabench server (the check is not performed
237 | when running locally.) Ideally, the smaller subset is just the first batch, so no additional
238 | computation is needed. However, for more complex multistage pipelines it may be simpler to
239 | run the model twice - on a small dataset and (if necessary) on the full dataset. In that case
240 | :func:`reset` needs to be called before the second run so values from the first run are not reported.
241 | 
242 | .. seealso:: :func:`cache_exists`
243 | .. seealso:: :func:`reset_time`
244 | """
245 | 
246 | self.metrics.reset()
247 | 
248 | def get_results(self):
249 | """
250 | Gets the results for the evaluator. An empty string is assumed for segments for which no translation
251 | was provided.
252 | 
253 | :return: dict with `SacreBLEU` and `BLEU score`.
254 | """ 255 | 256 | if self.cached_results: 257 | return self.results 258 | self.results = self.metrics.get_results() 259 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated() 260 | 261 | return self.results 262 | 263 | def save(self): 264 | dataset = self._get_dataset_name() 265 | 266 | if not self.cached_results: 267 | exec_speed = (time.time() - self.init_time) 268 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(self.metrics.answers) / exec_speed 269 | self.speed_mem_metrics['Tasks'] = len(self.metrics.answers) 270 | self.speed_mem_metrics['Evaluation Time'] = exec_speed 271 | else: 272 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None 273 | self.speed_mem_metrics['Tasks'] = None 274 | self.speed_mem_metrics['Evaluation Time'] = None 275 | 276 | return super().save(dataset=dataset) 277 | 278 | -------------------------------------------------------------------------------- /sotabencheval/natural_language_inference/__init__.py: -------------------------------------------------------------------------------- 1 | from .multinli import MultiNLI 2 | 3 | __all__ = ["MultiNLI"] 4 | -------------------------------------------------------------------------------- /sotabencheval/natural_language_inference/multinli.py: -------------------------------------------------------------------------------- 1 | import csv 2 | import time 3 | 4 | from itertools import zip_longest 5 | from pathlib import Path 6 | 7 | from sotabencheval.core import BaseEvaluator 8 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server, get_max_memory_allocated 9 | 10 | 11 | def read_csv(path): 12 | with path.open('r') as f: 13 | yield from csv.DictReader(f, delimiter='\t') 14 | 15 | 16 | def get_path(local_root, local_unzip=False): 17 | root = Path(change_root_if_server(root=local_root, 18 | server_root=".data/nlp/multinli")) 19 | zip_name = "MNLI.zip" 20 | dataset_path=root / "MNLI" / "dev_matched.tsv" 21 | if not dataset_path.exists(): # unzip 22 | extract_archive(str(root / zip_name), to_path=root) 23 | return (dataset_path, dataset_path.parent / "dev_mismatched.tsv") 24 | 25 | 26 | class ClassificationEvaluator: 27 | def __init__(self, file_path): 28 | self.dataset_path = file_path 29 | dataset = list(read_csv(file_path)) 30 | self.targets = {d['pairID']: d['gold_label'] for d in dataset} 31 | self.dataset = {d['pairID']: (d['sentence1'], d['sentence2']) for d in dataset} 32 | self.reset() 33 | 34 | def reset(self): 35 | self.answers = {} 36 | 37 | @property 38 | def count(self): 39 | return len(self.answers) 40 | 41 | def add(self, pairIds, preds): 42 | for pairId, pred in zip(pairIds,preds): 43 | if pairId not in self.targets: 44 | continue 45 | if pairId not in self.answers: 46 | self.answers[pairId] = pred 47 | else: 48 | print(f"Double prediction for {pairId} former: {self.answers[pairId]} new: {pred}") 49 | 50 | @property 51 | def has_enough_for_cache_hash(self): 52 | return self.count >= 100 53 | 54 | @property 55 | def accuracy(self): 56 | correct = [self.targets[k] == a for k,a in self.answers.items() if a is not None] 57 | accuracy = sum(correct) / self.count if self.count > 0 else 0 58 | if self.count != len(self.targets): 59 | return (accuracy, f"partial on {self.count} out of {len(self.targets)}") 60 | return accuracy 61 | 62 | 63 | class MultiNLI(BaseEvaluator): 64 | task = "Natural Language Inference" 65 | dataset = 'MultiNLI' # defined in subclass 66 | 67 | def __init__(self, 68 | local_root: str = '.', 
69 | model_name: str = None, 70 | paper_arxiv_id: str = None, 71 | paper_pwc_id: str = None, 72 | paper_results: dict = None, 73 | model_description=None): 74 | 75 | super().__init__(model_name, paper_arxiv_id, 76 | paper_pwc_id, paper_results, model_description) 77 | self.local_root = local_root 78 | paths = self.dataset_paths 79 | self.matched = ClassificationEvaluator(paths[0]) 80 | self.mismatched = ClassificationEvaluator(paths[1]) 81 | self.reset() 82 | 83 | @property 84 | def dataset_paths(self): 85 | return get_path(self.local_root) 86 | 87 | @property 88 | def data_generator(self): 89 | for v1, v2 in zip_longest(self.matched.dataset.items(), self.mismatched.dataset.items()): 90 | if v1 is not None: 91 | yield v1 92 | if v2 is not None: 93 | yield v2 94 | 95 | def reset(self): 96 | self.matched.reset() 97 | self.mismatched.reset() 98 | self.batch_hash = None 99 | self.reset_time() 100 | 101 | def add(self, pairIds, predictions): 102 | """ 103 | pairIDToLabel - Dictionary mapping pairID (str) to label (str) 104 | """ 105 | if isinstance(pairIds, str): 106 | pairIds = [pairIds] 107 | predictions = [predictions] 108 | 109 | self.matched.add(pairIds, predictions) 110 | self.mismatched.add(pairIds, predictions) 111 | if self.batch_hash is None and self.matched.count + self.mismatched.count > 100: 112 | content = self.cache_values(matched=self.matched.answers, mismatched=self.mismatched.answers) 113 | self.batch_hash = calculate_batch_hash(content) 114 | self.first_batch_processed = True #TODO: do we need this if we have self.batch_hash 115 | 116 | 117 | def get_results(self): 118 | if self.cached_results: 119 | return self.results 120 | self.results = { 121 | 'Matched': self.matched.accuracy, 122 | 'Mismatched': self.mismatched.accuracy 123 | } 124 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated() 125 | exec_speed = (time.time() - self.init_time) 126 | count = self.mismatched.count + self.matched.count 127 | self.speed_mem_metrics['Tasks / Evaluation Time'] = count / exec_speed 128 | self.speed_mem_metrics['Tasks'] = count 129 | self.speed_mem_metrics['Evaluation Time'] = exec_speed 130 | return self.results 131 | 132 | def save(self): 133 | 134 | 135 | return super().save(dataset=self.dataset) 136 | -------------------------------------------------------------------------------- /sotabencheval/object_detection/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["COCOEvaluator"] 2 | 3 | from sotabencheval.object_detection.coco import COCOEvaluator -------------------------------------------------------------------------------- /sotabencheval/object_detection/coco.py: -------------------------------------------------------------------------------- 1 | # Some of the processing logic here is based on the torchvision COCO dataset 2 | # https://github.com/pytorch/vision/blob/master/torchvision/datasets/coco.py 3 | 4 | import copy 5 | import numpy as np 6 | import os 7 | from pycocotools.coco import COCO 8 | from sotabenchapi.client import Client 9 | from sotabenchapi.core import BenchmarkResult, check_inputs 10 | import time 11 | 12 | from sotabencheval.utils import calculate_batch_hash, extract_archive, change_root_if_server, is_server 13 | from sotabencheval.utils import get_max_memory_allocated 14 | from sotabencheval.object_detection.coco_eval import CocoEvaluator 15 | from sotabencheval.object_detection.utils import get_coco_metrics 16 | 17 | 18 | class COCOEvaluator(object): 19 | """`COCO `_ 
benchmark.
20 | 
21 | Examples:
22 | Evaluate a Mask R-CNN model from the torchvision repository:
23 | 
24 | .. code-block:: python
25 | 
26 | ...
27 | 
28 | evaluator = COCOEvaluator(model_name='Mask R-CNN', paper_arxiv_id='1703.06870')
29 | 
30 | with torch.no_grad():
31 | for i, (input, __) in enumerate(iterator):
32 | ...
33 | output = model(input)
34 | # optional formatting of output here to be a list of detection dicts
35 | evaluator.add(output)
36 | 
37 | if evaluator.cache_exists:
38 | break
39 | 
40 | evaluator.save()
41 | """
42 | 
43 | task = "Object Detection"
44 | 
45 | def __init__(self,
46 | root: str = '.',
47 | split: str = "val",
48 | dataset_year: str = "2017",
49 | model_name: str = None,
50 | paper_arxiv_id: str = None,
51 | paper_pwc_id: str = None,
52 | paper_results: dict = None,
53 | model_description=None,):
54 | """Initializes a COCO Evaluator object
55 | 
56 | Args:
57 | root (string): Root directory of the COCO Dataset - where the
58 | label data is located (or will be downloaded to).
59 | split (str): the split for COCO to use, e.g. 'val'
60 | dataset_year (str): the dataset year for COCO to use
61 | model_name (str, optional): The name of the model from the
62 | paper - if you want to link your build to a machine learning
63 | paper. See the COCO benchmark page for model names,
64 | https://sotabench.com/benchmarks/object-detection-on-coco-minival,
65 | e.g. on the paper leaderboard tab.
66 | paper_arxiv_id (str, optional): Optional linking to arXiv if you
67 | want to link to papers on the leaderboard; put in the
68 | corresponding paper's arXiv ID, e.g. '1611.05431'.
69 | paper_pwc_id (str, optional): Optional linking to Papers With Code;
70 | put in the corresponding papers with code URL slug, e.g.
71 | 'u-gat-it-unsupervised-generative-attentional'
72 | paper_results (dict, optional): If the paper you are reproducing
73 | does not have model results on sotabench.com, you can specify
74 | the paper results yourself through this argument, where keys
75 | are metric names and values are metric values, e.g.::
76 | 
77 | {'box AP': 0.349, 'AP50': 0.592, ...}
78 | 
79 | Ensure that the metric names match those on the sotabench
80 | leaderboard - for COCO it should be 'box AP', 'AP50',
81 | 'AP75', 'APS', 'APM', 'APL'
82 | model_description (str, optional): Optional model description. 
83 | """ 84 | root = self.root = change_root_if_server(root=root, 85 | server_root="./.data/vision/coco") 86 | 87 | # Model metadata 88 | 89 | self.model_name = model_name 90 | self.paper_arxiv_id = paper_arxiv_id 91 | self.paper_pwc_id = paper_pwc_id 92 | self.paper_results = paper_results 93 | self.model_description = model_description 94 | self.split = split 95 | 96 | annFile = os.path.join( 97 | root, "annotations/instances_%s%s.json" % (self.split, dataset_year) 98 | ) 99 | 100 | self._download(annFile) 101 | 102 | self.coco = COCO(annFile) 103 | self.iou_types = ['bbox'] 104 | self.coco_evaluator = CocoEvaluator(self.coco, self.iou_types) 105 | 106 | self.detections = [] 107 | self.results = None 108 | 109 | # Backend variables for hashing and caching 110 | 111 | self.first_batch_processed = False 112 | self.batch_hash = None 113 | self.cached_results = False 114 | 115 | # Speed and memory metrics 116 | 117 | self.speed_mem_metrics = {} 118 | self.init_time = time.time() 119 | 120 | def _download(self, annFile): 121 | """ 122 | Utility function for downloading the COCO annotation file 123 | 124 | :param annFile: path of the annotations file 125 | :return: void - extracts the archive 126 | """ 127 | if not os.path.isfile(annFile): 128 | if "2017" in annFile: 129 | annotations_dir_zip = os.path.join( 130 | self.root, "annotations_train%s2017.zip" % self.split 131 | ) 132 | elif "2014" in annFile: 133 | annotations_dir_zip = os.path.join( 134 | self.root, "annotations_train%s2014.zip" % self.split 135 | ) 136 | else: 137 | annotations_dir_zip = None 138 | 139 | if annotations_dir_zip is not None: 140 | print('Attempt to extract annotations file at {zip_loc}'.format(zip_loc=annotations_dir_zip)) 141 | extract_archive(from_path=annotations_dir_zip, to_path=self.root) 142 | 143 | @property 144 | def cache_exists(self): 145 | """ 146 | Checks whether the cache exists in the sotabench.com database - if so 147 | then sets self.results to cached results and returns True. 148 | 149 | You can use this property for control flow to break a for loop over a dataset 150 | after the first iteration. This prevents rerunning the same calculation for the 151 | same model twice. 152 | 153 | Examples: 154 | Breaking a for loop 155 | 156 | .. code-block:: python 157 | 158 | ... 159 | 160 | with torch.no_grad(): 161 | for i, (input, target) in enumerate(iterator): 162 | ... 163 | output = model(input) 164 | # optional formatting of output here to be a list of detection dicts 165 | evaluator.add(output) 166 | 167 | if evaluator.cache_exists: 168 | break 169 | 170 | evaluator.save() 171 | 172 | :return: bool or None (if not in check mode) 173 | """ 174 | if not self.first_batch_processed: 175 | raise ValueError('No batches of data have been processed so no batch_hash exists') 176 | 177 | if not is_server(): # we only check the cache on the server 178 | return None 179 | 180 | client = Client.public() 181 | cached_res = client.get_results_by_run_hash(self.batch_hash) 182 | if cached_res: 183 | self.results = cached_res 184 | self.cached_results = True 185 | print( 186 | "No model change detected (using the first batch run " 187 | "hash). Will use cached results." 188 | ) 189 | return True 190 | 191 | return False 192 | 193 | @staticmethod 194 | def cache_format_ann(ann): 195 | """ 196 | Cache formats an annotation dictionary with rounding. the reason we need to round is that if we have 197 | small floating point originated differences, then changes the hash of the predictions. 
198 | 
199 | :param ann (dict): A detection dictionary
200 | 
201 | :return: ann : A detection dictionary but with rounded values
202 | """
203 | ann['bbox'] = [np.round(el, 3) for el in ann['bbox']]
204 | ann['score'] = np.round(ann['score'], 3)
205 | 
206 | if 'segmentation' in ann:
207 | ann['segmentation'] = [np.round(el, 3) for el in ann['segmentation']]
208 | 
209 | if 'area' in ann:
210 | ann['area'] = np.round(ann['area'], 3)
211 | 
212 | return ann
213 | 
214 | def cache_values(self, annotations, metrics):
215 | """
216 | Takes in annotations and metrics, and formats the data to calculate the hash for the cache
217 | :param annotations: list of detections
218 | :param metrics: dictionary of final AP metrics
219 | :return: list of data (combining annotations and metrics)
220 | """
221 | metrics = {k: np.round(v, 3) for k, v in metrics.items()}
222 | new_annotations = copy.deepcopy(annotations)
223 | new_annotations = [self.cache_format_ann(ann) for ann in new_annotations]
224 | 
225 | return new_annotations + [metrics]
226 | 
227 | def add(self, detections: list):
228 | """
229 | Update the evaluator with new detections
230 | 
231 | :param detections (list): List of detections, that will be used by the COCO.loadRes method in the
232 | pycocotools API. Each detection can take a dictionary format like the following:
233 | 
234 | {'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312, 110.14895629882812, 278.2847595214844],
235 | 'score': 0.999152421951294, 'category_id': 1}
236 | 
237 | I.e., it is a list of dictionaries.
238 | 
239 | :return: void - updates self.detections with the new predictions
240 | 
241 | Examples:
242 | Update the evaluator with one result:
243 | 
244 | .. code-block:: python
245 | 
246 | my_evaluator.add([{'image_id': 397133, 'bbox': [386.1628112792969, 69.48855590820312,
247 | 110.14895629882812, 278.2847595214844], 'score': 0.999152421951294, 'category_id': 1}])
248 | """
249 | self.detections.extend(detections)
250 | 
251 | self.coco_evaluator.update(detections)
252 | 
253 | if not self.first_batch_processed:
254 | self.coco_evaluator.evaluate()
255 | self.coco_evaluator.accumulate()
256 | 
257 | if any([detection['bbox'] for detection in detections]): # we can only hash if we have predictions
258 | self.batch_hash = calculate_batch_hash(
259 | self.cache_values(annotations=detections, metrics=get_coco_metrics(self.coco_evaluator)))
260 | self.first_batch_processed = True
261 | 
262 | def get_results(self):
263 | """
264 | Reruns the evaluation using the accumulated detections and returns COCO results with AP metrics
265 | 
266 | :return: dict with COCO AP metrics
267 | """
268 | if self.cached_results:
269 | return self.results
270 | 
271 | self.coco_evaluator = CocoEvaluator(self.coco, self.iou_types)
272 | self.coco_evaluator.update(self.detections)
273 | self.coco_evaluator.evaluate()
274 | self.coco_evaluator.accumulate()
275 | self.coco_evaluator.summarize()
276 | 
277 | self.results = get_coco_metrics(self.coco_evaluator)
278 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated()
279 | 
280 | return self.results
281 | 
282 | def reset_time(self):
283 | """
284 | Simple method to reset the timer self.init_time. 
Often used before a loop, to time the evaluation 285 | appropriately, for example: 286 | 287 | :return: void - resets self.init_time 288 | """ 289 | self.init_time = time.time() 290 | 291 | def save(self): 292 | """ 293 | Calculate results and then put into a BenchmarkResult object 294 | 295 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded 296 | on the platform. 297 | 298 | :return: BenchmarkResult object with results and metadata 299 | """ 300 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation 301 | self.get_results() 302 | 303 | # If this is the first time the model is run, then we record evaluation time information 304 | 305 | if not self.cached_results: 306 | unique_image_ids = set([d['image_id'] for d in self.detections]) 307 | exec_speed = (time.time() - self.init_time) 308 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(unique_image_ids) / exec_speed 309 | self.speed_mem_metrics['Tasks'] = len(unique_image_ids) 310 | self.speed_mem_metrics['Evaluation Time'] = exec_speed 311 | else: 312 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None 313 | self.speed_mem_metrics['Tasks'] = None 314 | self.speed_mem_metrics['Evaluation Time'] = None 315 | 316 | return BenchmarkResult( 317 | task=self.task, 318 | config={}, 319 | dataset='COCO minival', 320 | results=self.results, 321 | speed_mem_metrics=self.speed_mem_metrics, 322 | model=self.model_name, 323 | model_description=self.model_description, 324 | arxiv_id=self.paper_arxiv_id, 325 | pwc_id=self.paper_pwc_id, 326 | paper_results=self.paper_results, 327 | run_hash=self.batch_hash, 328 | ) 329 | -------------------------------------------------------------------------------- /sotabencheval/object_detection/coco_eval.py: -------------------------------------------------------------------------------- 1 | # Code is based on https://github.com/pytorch/vision/blob/master/references/detection/ 2 | 3 | import numpy as np 4 | import copy 5 | 6 | from pycocotools.cocoeval import COCOeval 7 | from pycocotools.coco import COCO 8 | import pycocotools.mask as mask_util 9 | 10 | from collections import defaultdict 11 | 12 | 13 | class CocoEvaluator(object): 14 | """ 15 | For now this only does BBOX detection - so 'bbox' is the only acceptable iou_type 16 | """ 17 | def __init__(self, coco_gt, iou_types): 18 | assert isinstance(iou_types, (list, tuple)) 19 | coco_gt = copy.deepcopy(coco_gt) 20 | self.coco_gt = coco_gt 21 | 22 | self.iou_types = iou_types 23 | self.coco_eval = {} 24 | for iou_type in iou_types: 25 | self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) 26 | 27 | self.annotation_list = [] 28 | 29 | def update(self, annotation_list): 30 | assert(type(annotation_list) == list) 31 | 32 | self.annotation_list.extend(annotation_list) 33 | 34 | for iou_type in self.iou_types: 35 | coco_dt = loadRes(self.coco_gt, self.annotation_list) if self.annotation_list else COCO() 36 | coco_eval = self.coco_eval[iou_type] 37 | coco_eval.cocoDt = coco_dt 38 | coco_eval.params.imgIds = self.coco_gt.getImgIds() 39 | 40 | def accumulate(self): 41 | for coco_eval in self.coco_eval.values(): 42 | coco_eval.accumulate() 43 | 44 | def evaluate(self): 45 | for coco_eval in self.coco_eval.values(): 46 | coco_eval.evaluate() 47 | 48 | def summarize(self): 49 | for iou_type, coco_eval in self.coco_eval.items(): 50 | # print("IoU metric: {}".format(iou_type)) 51 | coco_eval.summarize() 52 | 53 | 54 | 
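# --- Editor's illustration (not part of the original module) ---------------------------------
# COCOEvaluator.get_results (coco.py above) drives this class roughly as follows, assuming
# `coco_gt` is a loaded pycocotools COCO object and `detections` is a list of detection dicts:
#
#     evaluator = CocoEvaluator(coco_gt, ["bbox"])
#     evaluator.update(detections)   # register predictions for all images
#     evaluator.evaluate()           # run per-image matching
#     evaluator.accumulate()         # build the precision/recall arrays
#     evaluator.summarize()          # print the standard COCO table
#
# The AP numbers are then read out of `coco_eval.eval["precision"]` by `get_coco_metrics`
# in sotabencheval/object_detection/utils.py.
# ----------------------------------------------------------------------------------------------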
################################################################# 55 | # From pycocotools, just removed the prints and fixed 56 | # a Python3 bug about unicode not defined 57 | ################################################################# 58 | 59 | # Ideally, pycocotools wouldn't have hard-coded prints 60 | # so that we could avoid copy-pasting those two functions 61 | 62 | 63 | def createIndex(self): 64 | # create index 65 | # print('creating index...') 66 | anns, cats, imgs = {}, {}, {} 67 | imgToAnns, catToImgs = defaultdict(list), defaultdict(list) 68 | if "annotations" in self.dataset: 69 | for ann in self.dataset["annotations"]: 70 | imgToAnns[ann["image_id"]].append(ann) 71 | anns[ann["id"]] = ann 72 | 73 | if "images" in self.dataset: 74 | for img in self.dataset["images"]: 75 | imgs[img["id"]] = img 76 | 77 | if "categories" in self.dataset: 78 | for cat in self.dataset["categories"]: 79 | cats[cat["id"]] = cat 80 | 81 | if "annotations" in self.dataset and "categories" in self.dataset: 82 | for ann in self.dataset["annotations"]: 83 | catToImgs[ann["category_id"]].append(ann["image_id"]) 84 | 85 | # print('index created!') 86 | 87 | # create class members 88 | self.anns = anns 89 | self.imgToAnns = imgToAnns 90 | self.catToImgs = catToImgs 91 | self.imgs = imgs 92 | self.cats = cats 93 | 94 | 95 | maskUtils = mask_util 96 | 97 | 98 | def loadRes(coco, anns): 99 | """Load result file and return a result api object. 100 | 101 | ``anns`` is a list of dicts containing the results 102 | 103 | In the original pycoco api, a results file is passed in, whereas in this 104 | case we bypass the json file loading and ask for a list of dictionary 105 | annotations to be passed directly in 106 | 107 | Returns: 108 | res (obj): result api object. 109 | """ 110 | res = COCO() 111 | res.dataset["images"] = [img for img in coco.dataset["images"]] 112 | 113 | # print('Loading and preparing results...') 114 | # tic = time.time() 115 | # if isinstance(resFile, torch._six.string_classes): 116 | # anns = json.load(open(resFile)) 117 | # elif type(resFile) == np.ndarray: 118 | # anns = self.loadNumpyAnnotations(resFile) 119 | # else: 120 | # anns = resFile 121 | assert type(anns) == list, "results in not an array of objects" 122 | annsImgIds = [ann["image_id"] for ann in anns] 123 | assert set(annsImgIds) == ( 124 | set(annsImgIds) & set(coco.getImgIds()) 125 | ), "Results do not correspond to current coco set" 126 | if "caption" in anns[0]: 127 | imgIds = set([img["id"] for img in res.dataset["images"]]) & set( 128 | [ann["image_id"] for ann in anns] 129 | ) 130 | res.dataset["images"] = [ 131 | img for img in res.dataset["images"] if img["id"] in imgIds 132 | ] 133 | for id, ann in enumerate(anns): 134 | ann["id"] = id + 1 135 | elif "bbox" in anns[0] and not anns[0]["bbox"] == []: 136 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"]) 137 | for id, ann in enumerate(anns): 138 | bb = ann["bbox"] 139 | x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] 140 | if "segmentation" not in ann: 141 | ann["segmentation"] = [[x1, y1, x1, y2, x2, y2, x2, y1]] 142 | ann["area"] = bb[2] * bb[3] 143 | ann["id"] = id + 1 144 | ann["iscrowd"] = 0 145 | elif "segmentation" in anns[0]: 146 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"]) 147 | for id, ann in enumerate(anns): 148 | # now only support compressed RLE format as segmentation results 149 | ann["area"] = maskUtils.area(ann["segmentation"]) 150 | if "bbox" not in ann: 151 | ann["bbox"] = 
maskUtils.toBbox(ann["segmentation"]) 152 | ann["id"] = id + 1 153 | ann["iscrowd"] = 0 154 | elif "keypoints" in anns[0]: 155 | res.dataset["categories"] = copy.deepcopy(coco.dataset["categories"]) 156 | for id, ann in enumerate(anns): 157 | s = ann["keypoints"] 158 | x = s[0::3] 159 | y = s[1::3] 160 | x0, x1, y0, y1 = np.min(x), np.max(x), np.min(y), np.max(y) 161 | ann["area"] = (x1 - x0) * (y1 - y0) 162 | ann["id"] = id + 1 163 | ann["bbox"] = [x0, y0, x1 - x0, y1 - y0] 164 | # print('DONE (t={:0.2f}s)'.format(time.time()- tic)) 165 | 166 | res.dataset["annotations"] = anns 167 | createIndex(res) 168 | return res 169 | -------------------------------------------------------------------------------- /sotabencheval/object_detection/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | def get_coco_metrics(coco_evaluator): 4 | 5 | metrics = { 6 | "box AP": None, 7 | "AP50": None, 8 | "AP75": None, 9 | "APS": None, 10 | "APM": None, 11 | "APL": None, 12 | } 13 | iouThrs = [None, 0.5, 0.75, None, None, None] 14 | maxDets = [100] + [coco_evaluator.coco_eval["bbox"].params.maxDets[2]] * 5 15 | areaRngs = ["all", "all", "all", "small", "medium", "large"] 16 | bounding_box_params = coco_evaluator.coco_eval["bbox"].params 17 | 18 | for metric_no, metric in enumerate(metrics): 19 | aind = [ 20 | i 21 | for i, aRng in enumerate(bounding_box_params.areaRngLbl) 22 | if aRng == areaRngs[metric_no] 23 | ] 24 | mind = [ 25 | i 26 | for i, mDet in enumerate(bounding_box_params.maxDets) 27 | if mDet == maxDets[metric_no] 28 | ] 29 | 30 | s = coco_evaluator.coco_eval["bbox"].eval["precision"] 31 | 32 | # IoU 33 | if iouThrs[metric_no] is not None: 34 | t = np.where(iouThrs[metric_no] == bounding_box_params.iouThrs)[0] 35 | s = s[t] 36 | s = s[:, :, :, aind, mind] 37 | 38 | if len(s[s > -1]) == 0: 39 | mean_s = -1 40 | else: 41 | mean_s = np.mean(s[s > -1]) 42 | 43 | metrics[metric] = mean_s 44 | 45 | return metrics 46 | -------------------------------------------------------------------------------- /sotabencheval/question_answering/__init__.py: -------------------------------------------------------------------------------- 1 | from sotabencheval.question_answering.squad import SQuADEvaluator, SQuADVersion 2 | 3 | __all__ = ["SQuADEvaluator", "SQuADVersion"] 4 | -------------------------------------------------------------------------------- /sotabencheval/question_answering/evaluate_v11.py: -------------------------------------------------------------------------------- 1 | """ Official evaluation script for v1.1 of the SQuAD dataset. 
""" 2 | from __future__ import print_function 3 | from collections import Counter 4 | import string 5 | import re 6 | import argparse 7 | import json 8 | import sys 9 | 10 | 11 | def normalize_answer(s): 12 | """Lower text and remove punctuation, articles and extra whitespace.""" 13 | def remove_articles(text): 14 | return re.sub(r'\b(a|an|the)\b', ' ', text) 15 | 16 | def white_space_fix(text): 17 | return ' '.join(text.split()) 18 | 19 | def remove_punc(text): 20 | exclude = set(string.punctuation) 21 | return ''.join(ch for ch in text if ch not in exclude) 22 | 23 | def lower(text): 24 | return text.lower() 25 | 26 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 27 | 28 | 29 | def f1_score(prediction, ground_truth): 30 | prediction_tokens = normalize_answer(prediction).split() 31 | ground_truth_tokens = normalize_answer(ground_truth).split() 32 | common = Counter(prediction_tokens) & Counter(ground_truth_tokens) 33 | num_same = sum(common.values()) 34 | if num_same == 0: 35 | return 0 36 | precision = 1.0 * num_same / len(prediction_tokens) 37 | recall = 1.0 * num_same / len(ground_truth_tokens) 38 | f1 = (2 * precision * recall) / (precision + recall) 39 | return f1 40 | 41 | 42 | def exact_match_score(prediction, ground_truth): 43 | return (normalize_answer(prediction) == normalize_answer(ground_truth)) 44 | 45 | 46 | def metric_max_over_ground_truths(metric_fn, prediction, ground_truths): 47 | scores_for_ground_truths = [] 48 | for ground_truth in ground_truths: 49 | score = metric_fn(prediction, ground_truth) 50 | scores_for_ground_truths.append(score) 51 | return max(scores_for_ground_truths) 52 | 53 | 54 | def evaluate(dataset, predictions): 55 | f1 = exact_match = total = 0 56 | for article in dataset: 57 | for paragraph in article['paragraphs']: 58 | for qa in paragraph['qas']: 59 | total += 1 60 | if qa['id'] not in predictions: 61 | message = 'Unanswered question ' + qa['id'] + \ 62 | ' will receive score 0.' 63 | print(message, file=sys.stderr) 64 | continue 65 | ground_truths = list(map(lambda x: x['text'], qa['answers'])) 66 | prediction = predictions[qa['id']] 67 | exact_match += metric_max_over_ground_truths( 68 | exact_match_score, prediction, ground_truths) 69 | f1 += metric_max_over_ground_truths( 70 | f1_score, prediction, ground_truths) 71 | 72 | exact_match = 100.0 * exact_match / total 73 | f1 = 100.0 * f1 / total 74 | 75 | return {'exact_match': exact_match, 'f1': f1} 76 | 77 | 78 | if __name__ == '__main__': 79 | expected_version = '1.1' 80 | parser = argparse.ArgumentParser( 81 | description='Evaluation for SQuAD ' + expected_version) 82 | parser.add_argument('dataset_file', help='Dataset file') 83 | parser.add_argument('prediction_file', help='Prediction File') 84 | args = parser.parse_args() 85 | with open(args.dataset_file) as dataset_file: 86 | dataset_json = json.load(dataset_file) 87 | if (dataset_json['version'] != expected_version): 88 | print('Evaluation expects v-' + expected_version + 89 | ', but got dataset with v-' + dataset_json['version'], 90 | file=sys.stderr) 91 | dataset = dataset_json['data'] 92 | with open(args.prediction_file) as prediction_file: 93 | predictions = json.load(prediction_file) 94 | print(json.dumps(evaluate(dataset, predictions))) 95 | -------------------------------------------------------------------------------- /sotabencheval/question_answering/evaluate_v20.py: -------------------------------------------------------------------------------- 1 | """Official evaluation script for SQuAD version 2.0. 
2 | 3 | In addition to basic functionality, we also compute additional statistics and 4 | plot precision-recall curves if an additional na_prob.json file is provided. 5 | This file is expected to map question ID's to the model's predicted probability 6 | that a question is unanswerable. 7 | """ 8 | import argparse 9 | import collections 10 | import json 11 | import numpy as np 12 | import os 13 | import re 14 | import string 15 | import sys 16 | 17 | OPTS = None 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.') 21 | parser.add_argument('data_file', metavar='data.json', help='Input data JSON file.') 22 | parser.add_argument('pred_file', metavar='pred.json', help='Model predictions.') 23 | parser.add_argument('--out-file', '-o', metavar='eval.json', 24 | help='Write accuracy metrics to file (default is stdout).') 25 | parser.add_argument('--na-prob-file', '-n', metavar='na_prob.json', 26 | help='Model estimates of probability of no answer.') 27 | parser.add_argument('--na-prob-thresh', '-t', type=float, default=1.0, 28 | help='Predict "" if no-answer probability exceeds this (default = 1.0).') 29 | parser.add_argument('--out-image-dir', '-p', metavar='out_images', default=None, 30 | help='Save precision-recall curves to directory.') 31 | parser.add_argument('--verbose', '-v', action='store_true') 32 | if len(sys.argv) == 1: 33 | parser.print_help() 34 | sys.exit(1) 35 | return parser.parse_args() 36 | 37 | def make_qid_to_has_ans(dataset): 38 | qid_to_has_ans = {} 39 | for article in dataset: 40 | for p in article['paragraphs']: 41 | for qa in p['qas']: 42 | qid_to_has_ans[qa['id']] = bool(qa['answers']) 43 | return qid_to_has_ans 44 | 45 | def normalize_answer(s): 46 | """Lower text and remove punctuation, articles and extra whitespace.""" 47 | def remove_articles(text): 48 | regex = re.compile(r'\b(a|an|the)\b', re.UNICODE) 49 | return re.sub(regex, ' ', text) 50 | def white_space_fix(text): 51 | return ' '.join(text.split()) 52 | def remove_punc(text): 53 | exclude = set(string.punctuation) 54 | return ''.join(ch for ch in text if ch not in exclude) 55 | def lower(text): 56 | return text.lower() 57 | return white_space_fix(remove_articles(remove_punc(lower(s)))) 58 | 59 | def get_tokens(s): 60 | if not s: return [] 61 | return normalize_answer(s).split() 62 | 63 | def compute_exact(a_gold, a_pred): 64 | return int(normalize_answer(a_gold) == normalize_answer(a_pred)) 65 | 66 | def compute_f1(a_gold, a_pred): 67 | gold_toks = get_tokens(a_gold) 68 | pred_toks = get_tokens(a_pred) 69 | common = collections.Counter(gold_toks) & collections.Counter(pred_toks) 70 | num_same = sum(common.values()) 71 | if len(gold_toks) == 0 or len(pred_toks) == 0: 72 | # If either is no-answer, then F1 is 1 if they agree, 0 otherwise 73 | return int(gold_toks == pred_toks) 74 | if num_same == 0: 75 | return 0 76 | precision = 1.0 * num_same / len(pred_toks) 77 | recall = 1.0 * num_same / len(gold_toks) 78 | f1 = (2 * precision * recall) / (precision + recall) 79 | return f1 80 | 81 | def get_raw_scores(dataset, preds): 82 | exact_scores = {} 83 | f1_scores = {} 84 | for article in dataset: 85 | for p in article['paragraphs']: 86 | for qa in p['qas']: 87 | qid = qa['id'] 88 | gold_answers = [a['text'] for a in qa['answers'] 89 | if normalize_answer(a['text'])] 90 | if not gold_answers: 91 | # For unanswerable questions, only correct answer is empty string 92 | gold_answers = [''] 93 | if qid not in preds: 94 | print('Missing prediction for %s' 
% qid) 95 | continue 96 | a_pred = preds[qid] 97 | # Take max over all gold answers 98 | exact_scores[qid] = max(compute_exact(a, a_pred) for a in gold_answers) 99 | f1_scores[qid] = max(compute_f1(a, a_pred) for a in gold_answers) 100 | return exact_scores, f1_scores 101 | 102 | def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh): 103 | new_scores = {} 104 | for qid, s in scores.items(): 105 | pred_na = na_probs[qid] > na_prob_thresh 106 | if pred_na: 107 | new_scores[qid] = float(not qid_to_has_ans[qid]) 108 | else: 109 | new_scores[qid] = s 110 | return new_scores 111 | 112 | def make_eval_dict(exact_scores, f1_scores, qid_list=None): 113 | if not qid_list: 114 | total = len(exact_scores) 115 | return collections.OrderedDict([ 116 | ('exact', 100.0 * sum(exact_scores.values()) / total), 117 | ('f1', 100.0 * sum(f1_scores.values()) / total), 118 | ('total', total), 119 | ]) 120 | else: 121 | total = len(qid_list) 122 | return collections.OrderedDict([ 123 | ('exact', 100.0 * sum(exact_scores[k] for k in qid_list) / total), 124 | ('f1', 100.0 * sum(f1_scores[k] for k in qid_list) / total), 125 | ('total', total), 126 | ]) 127 | 128 | def merge_eval(main_eval, new_eval, prefix): 129 | for k in new_eval: 130 | main_eval['%s_%s' % (prefix, k)] = new_eval[k] 131 | 132 | def plot_pr_curve(precisions, recalls, out_image, title): 133 | plt.step(recalls, precisions, color='b', alpha=0.2, where='post') 134 | plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b') 135 | plt.xlabel('Recall') 136 | plt.ylabel('Precision') 137 | plt.xlim([0.0, 1.05]) 138 | plt.ylim([0.0, 1.05]) 139 | plt.title(title) 140 | plt.savefig(out_image) 141 | plt.clf() 142 | 143 | def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans, 144 | out_image=None, title=None): 145 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 146 | true_pos = 0.0 147 | cur_p = 1.0 148 | cur_r = 0.0 149 | precisions = [1.0] 150 | recalls = [0.0] 151 | avg_prec = 0.0 152 | for i, qid in enumerate(qid_list): 153 | if qid_to_has_ans[qid]: 154 | true_pos += scores[qid] 155 | cur_p = true_pos / float(i+1) 156 | cur_r = true_pos / float(num_true_pos) 157 | if i == len(qid_list) - 1 or na_probs[qid] != na_probs[qid_list[i+1]]: 158 | # i.e., if we can put a threshold after this point 159 | avg_prec += cur_p * (cur_r - recalls[-1]) 160 | precisions.append(cur_p) 161 | recalls.append(cur_r) 162 | if out_image: 163 | plot_pr_curve(precisions, recalls, out_image, title) 164 | return {'ap': 100.0 * avg_prec} 165 | 166 | def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs, 167 | qid_to_has_ans, out_image_dir): 168 | if out_image_dir and not os.path.exists(out_image_dir): 169 | os.makedirs(out_image_dir) 170 | num_true_pos = sum(1 for v in qid_to_has_ans.values() if v) 171 | if num_true_pos == 0: 172 | return 173 | pr_exact = make_precision_recall_eval( 174 | exact_raw, na_probs, num_true_pos, qid_to_has_ans, 175 | out_image=os.path.join(out_image_dir, 'pr_exact.png'), 176 | title='Precision-Recall curve for Exact Match score') 177 | pr_f1 = make_precision_recall_eval( 178 | f1_raw, na_probs, num_true_pos, qid_to_has_ans, 179 | out_image=os.path.join(out_image_dir, 'pr_f1.png'), 180 | title='Precision-Recall curve for F1 score') 181 | oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()} 182 | pr_oracle = make_precision_recall_eval( 183 | oracle_scores, na_probs, num_true_pos, qid_to_has_ans, 184 | out_image=os.path.join(out_image_dir, 'pr_oracle.png'), 185 | 
title='Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)') 186 | merge_eval(main_eval, pr_exact, 'pr_exact') 187 | merge_eval(main_eval, pr_f1, 'pr_f1') 188 | merge_eval(main_eval, pr_oracle, 'pr_oracle') 189 | 190 | def histogram_na_prob(na_probs, qid_list, image_dir, name): 191 | if not qid_list: 192 | return 193 | x = [na_probs[k] for k in qid_list] 194 | weights = np.ones_like(x) / float(len(x)) 195 | plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0)) 196 | plt.xlabel('Model probability of no-answer') 197 | plt.ylabel('Proportion of dataset') 198 | plt.title('Histogram of no-answer probability: %s' % name) 199 | plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name)) 200 | plt.clf() 201 | 202 | def find_best_thresh(preds, scores, na_probs, qid_to_has_ans): 203 | num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k]) 204 | cur_score = num_no_ans 205 | best_score = cur_score 206 | best_thresh = 0.0 207 | qid_list = sorted(na_probs, key=lambda k: na_probs[k]) 208 | for i, qid in enumerate(qid_list): 209 | if qid not in scores: continue 210 | if qid_to_has_ans[qid]: 211 | diff = scores[qid] 212 | else: 213 | if preds[qid]: 214 | diff = -1 215 | else: 216 | diff = 0 217 | cur_score += diff 218 | if cur_score > best_score: 219 | best_score = cur_score 220 | best_thresh = na_probs[qid] 221 | return 100.0 * best_score / len(scores), best_thresh 222 | 223 | def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans): 224 | best_exact, exact_thresh = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans) 225 | best_f1, f1_thresh = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans) 226 | main_eval['best_exact'] = best_exact 227 | main_eval['best_exact_thresh'] = exact_thresh 228 | main_eval['best_f1'] = best_f1 229 | main_eval['best_f1_thresh'] = f1_thresh 230 | 231 | def main(): 232 | with open(OPTS.data_file) as f: 233 | dataset_json = json.load(f) 234 | dataset = dataset_json['data'] 235 | with open(OPTS.pred_file) as f: 236 | preds = json.load(f) 237 | if OPTS.na_prob_file: 238 | with open(OPTS.na_prob_file) as f: 239 | na_probs = json.load(f) 240 | else: 241 | na_probs = {k: 0.0 for k in preds} 242 | qid_to_has_ans = make_qid_to_has_ans(dataset) # maps qid to True/False 243 | has_ans_qids = [k for k, v in qid_to_has_ans.items() if v] 244 | no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v] 245 | exact_raw, f1_raw = get_raw_scores(dataset, preds) 246 | exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, 247 | OPTS.na_prob_thresh) 248 | f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, 249 | OPTS.na_prob_thresh) 250 | out_eval = make_eval_dict(exact_thresh, f1_thresh) 251 | if has_ans_qids: 252 | has_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=has_ans_qids) 253 | merge_eval(out_eval, has_ans_eval, 'HasAns') 254 | if no_ans_qids: 255 | no_ans_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=no_ans_qids) 256 | merge_eval(out_eval, no_ans_eval, 'NoAns') 257 | if OPTS.na_prob_file: 258 | find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans) 259 | if OPTS.na_prob_file and OPTS.out_image_dir: 260 | run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs, 261 | qid_to_has_ans, OPTS.out_image_dir) 262 | histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns') 263 | histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns') 264 | if OPTS.out_file: 265 | with open(OPTS.out_file, 
'w') as f: 266 | json.dump(out_eval, f) 267 | else: 268 | print(json.dumps(out_eval, indent=2)) 269 | 270 | if __name__ == '__main__': 271 | OPTS = parse_args() 272 | if OPTS.out_image_dir: 273 | import matplotlib 274 | matplotlib.use('Agg') 275 | import matplotlib.pyplot as plt 276 | main() 277 | 278 | -------------------------------------------------------------------------------- /sotabencheval/question_answering/squad.py: -------------------------------------------------------------------------------- 1 | from sotabencheval.core import BaseEvaluator 2 | from sotabencheval.utils import calculate_batch_hash, change_root_if_server, is_server, get_max_memory_allocated 3 | from sotabencheval.question_answering.utils import * 4 | from typing import Dict 5 | from enum import Enum 6 | from pathlib import Path 7 | import json 8 | import time 9 | 10 | class SQuADVersion(Enum): 11 | V11 = 'v1.1' 12 | V20 = 'v2.0' 13 | 14 | 15 | class SQuADEvaluator(BaseEvaluator): 16 | """Evaluator for Stanford Question Answering Dataset v1.1 and v2.0 benchmarks. 17 | 18 | Examples: 19 | Evaluate a BiDAF model from the AllenNLP repository on SQuAD 1.1 development set: 20 | 21 | .. code-block:: python 22 | 23 | from sotabencheval.question_answering import SQuADEvaluator, SQuADVersion 24 | 25 | from allennlp.data import DatasetReader 26 | from allennlp.data.iterators import DataIterator 27 | from allennlp.models.archival import load_archive 28 | from allennlp.nn.util import move_to_device 29 | 30 | def load_model(url, batch_size=64): 31 | archive = load_archive(url, cuda_device=0) 32 | model = archive.model 33 | reader = DatasetReader.from_params(archive.config["dataset_reader"]) 34 | iterator_params = archive.config["iterator"] 35 | iterator_params["batch_size"] = batch_size 36 | data_iterator = DataIterator.from_params(iterator_params) 37 | data_iterator.index_with(model.vocab) 38 | return model, reader, data_iterator 39 | 40 | def evaluate(model, dataset, data_iterator, evaluator): 41 | model.eval() 42 | evaluator.reset_time() 43 | for batch in data_iterator(dataset, num_epochs=1, shuffle=False): 44 | batch = move_to_device(batch, 0) 45 | predictions = model(**batch) 46 | answers = {metadata['id']: prediction 47 | for metadata, prediction in zip(batch['metadata'], predictions['best_span_str'])} 48 | evaluator.add(answers) 49 | if evaluator.cache_exists: 50 | break 51 | 52 | evaluator = SQuADEvaluator(local_root="data/nlp/squad", model_name="BiDAF (single)", 53 | paper_arxiv_id="1611.01603", version=SQuADVersion.V11) 54 | 55 | model, reader, data_iter =\ 56 | load_model("https://allennlp.s3.amazonaws.com/models/bidaf-model-2017.09.15-charpad.tar.gz") 57 | dataset = reader.read(evaluator.dataset_path) 58 | evaluate(model, dataset, data_iter, evaluator) 59 | evaluator.save() 60 | print(evaluator.results) 61 | """ 62 | 63 | task = "Question Answering" 64 | 65 | def __init__(self, 66 | local_root: str = '.', 67 | dataset_filename: str = None, 68 | model_name: str = None, 69 | paper_arxiv_id: str = None, 70 | paper_pwc_id: str = None, 71 | paper_results: dict = None, 72 | model_description=None, 73 | version: SQuADVersion = SQuADVersion.V20): 74 | """ 75 | Creates an evaluator for SQuAD v1.1 or v2.0 Question Answering benchmarks. 76 | 77 | :param local_root: Path to the directory where the dataset files are located locally. 78 | Ignored when run on sotabench server. 79 | :param dataset_filename: Local filename of the JSON file with the SQuAD dataset. 
80 | If None, the standard filename is used, based on :param:`version`. 81 | Ignored when run on sotabench server. 82 | :param model_name: The name of the model from the 83 | paper - if you want to link your build to a model from a 84 | machine learning paper. See the SQuAD benchmarks pages for model names, 85 | (f.e., https://sotabench.com/benchmarks/question-answering-on-squad11-dev) 86 | on the paper leaderboard or models yet to try tabs. 87 | :param paper_arxiv_id: Optional linking to arXiv if you 88 | want to link to papers on the leaderboard; put in the 89 | corresponding paper's arXiv ID, e.g. '1907.10529'. 90 | :param paper_pwc_id: Optional linking to Papers With Code; 91 | put in the corresponding papers with code URL slug, e.g. 92 | 'spanbert-improving-pre-training-by' 93 | :param paper_results: If the paper model you are reproducing 94 | does not have model results on sotabench.com, you can specify 95 | the paper results yourself through this argument, where keys 96 | are metric names, values are metric values. e.g: 97 | 98 | {'EM': 0.858, 'F1': 0.873}. 99 | 100 | Ensure that the metric names match those on the sotabench 101 | leaderboard - for SQuAD benchmarks it should be `EM` for exact match 102 | and `F1` for F1 score. Make sure to use results of evaluation on a development set. 103 | :param model_description: Optional model description. 104 | :param version: Which dataset to evaluate on, either `SQuADVersion.V11` or `SQuADVersion.V20`. 105 | """ 106 | super().__init__(model_name, paper_arxiv_id, paper_pwc_id, paper_results, model_description) 107 | self.root = change_root_if_server(root=local_root, 108 | server_root=".data/nlp/squad") 109 | self.version = version 110 | if dataset_filename is None or is_server(): 111 | dataset_filename = "dev-{}.json".format(version.value) 112 | self.dataset_path = Path(self.root) / dataset_filename 113 | 114 | self.metrics = SQuADMetrics(self.dataset_path, version) 115 | 116 | def add(self, answers: Dict[str, str]): 117 | """ 118 | Updates the evaluator with new results 119 | 120 | :param answers: a dictionary, where keys are question ids and values are text answers. 121 | For unanswerable questions (SQuAD v2.0) the answer should be an empty string. 122 | 123 | Examples: 124 | Update the evaluator with two results: 125 | 126 | .. code-block:: python 127 | 128 | my_evaluator.add({ 129 | "57296d571d04691400779413": "itself", 130 | "5a89117e19b91f001a626f2d": "" 131 | }) 132 | """ 133 | 134 | self.metrics.add(answers) 135 | 136 | if not self.first_batch_processed and self.metrics.has_data: 137 | self.batch_hash = calculate_batch_hash( 138 | self.cache_values(answers=self.metrics.answers, 139 | metrics=self.metrics.get_results(ignore_missing=True)) 140 | ) 141 | self.first_batch_processed = True 142 | 143 | def reset(self): 144 | """ 145 | Removes already added answers 146 | 147 | When checking if the model should be rerun on whole dataset it is first run on a smaller subset 148 | and the results are compared with values cached on sotabench server (the check is not performed 149 | when running locally.) Ideally, the smaller subset is just the first batch, so no additional 150 | computation is needed. However, for more complex multistage pipelines it may be simpler to 151 | run the model twice - on a small dataset and (if necessary) on the full dataset. In that case 152 | :func:`reset` needs to be called before the second run so values from the first run are not reported. 153 | 154 | .. seealso:: :func:`cache_exists` 155 | .. 
seealso:: :func:`reset_time` 156 | """ 157 | 158 | self.metrics.reset() 159 | self.reset_time() 160 | 161 | def get_results(self): 162 | """ 163 | Gets the results for the evaluator. 164 | 165 | :return: dict with `EM` (exact match score) and `F1`. 166 | """ 167 | 168 | if self.cached_results: 169 | return self.results 170 | self.results = self.metrics.get_results() 171 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated() 172 | 173 | return self.results 174 | 175 | def save(self): 176 | dataset = "SQuAD{} dev".format(self.metrics.version.value[1:]) 177 | 178 | if not self.cached_results: 179 | exec_speed = (time.time() - self.init_time) 180 | self.speed_mem_metrics['Tasks / Evaluation Time'] = len(self.metrics.answers) / exec_speed 181 | self.speed_mem_metrics['Tasks'] = len(self.metrics.answers) 182 | self.speed_mem_metrics['Evaluation Time'] = exec_speed 183 | else: 184 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None 185 | self.speed_mem_metrics['Tasks'] = None 186 | self.speed_mem_metrics['Evaluation Time'] = None 187 | 188 | return super().save(dataset=dataset) 189 | 190 | 191 | # todo: aggregate batches so that size of the batch used for caching does not depend on evaluation batch size 192 | CACHE_BATCH_SIZE = 1024 193 | 194 | 195 | class SQuADMetrics: 196 | def __init__(self, dataset_path: Path, version: SQuADVersion = SQuADVersion.V20): 197 | self.version = version 198 | self.answers = {} 199 | self._dataset = self._load_dataset(dataset_path) 200 | self._results = None 201 | 202 | def _load_dataset(self, path): 203 | with open(path, 'rt') as f: 204 | ds = json.load(f) 205 | if 'version' not in ds or 'data' not in ds: 206 | raise ValueError("Incorrect dataset format, either 'version' or 'data' is missing") 207 | version = ds['version'].strip().lower() 208 | if version and version[0] != 'v': 209 | version = 'v'+version 210 | if self.version.value != version: 211 | raise ValueError("Incorrect dataset version, found {} but was expecting {}" 212 | .format(version, self.version.value)) 213 | return ds['data'] 214 | 215 | def reset(self): 216 | self._results = None 217 | self.answers = {} 218 | 219 | def add(self, answers: Dict[str, str]): 220 | if not answers: 221 | print("Empty batch added to results") 222 | return 223 | if set(self.answers.keys()) & set(answers.keys()): 224 | print("Multiple predictions for a single question") 225 | 226 | self.answers.update(answers) 227 | 228 | def evaluate(self, ignore_missing=False): 229 | if ignore_missing: 230 | dataset = [{'paragraphs': [ 231 | {'qas': [qa for qa in paragraph['qas'] if qa['id'] in self.answers]} 232 | for paragraph in article['paragraphs'] 233 | ]} for article in self._dataset] 234 | else: 235 | dataset = self._dataset 236 | if self.version == SQuADVersion.V11: 237 | eval_fn = evaluate_v11 238 | else: 239 | eval_fn = evaluate_v20 240 | results = eval_fn(dataset, self.answers) 241 | self._results = { 242 | 'EM': results['exact_match'] / 100.0, 243 | 'F1': results['f1'] / 100.0 244 | } 245 | 246 | @property 247 | def has_data(self): 248 | return bool(self.answers) 249 | 250 | def get_results(self, ignore_missing=False): 251 | self.evaluate(ignore_missing) 252 | 253 | return self._results 254 | -------------------------------------------------------------------------------- /sotabencheval/question_answering/utils.py: -------------------------------------------------------------------------------- 1 | from sotabencheval.question_answering.evaluate_v11 import evaluate as evaluate_v11 2 | from 
sotabencheval.question_answering.evaluate_v20 import get_raw_scores 3 | 4 | __all__ = ["evaluate_v11", "evaluate_v20"] 5 | 6 | 7 | def evaluate_v20(dataset, predictions): 8 | exact_scores, f1_scores = get_raw_scores(dataset, predictions) 9 | total = sum([len(p['qas']) for article in dataset for p in article['paragraphs']]) 10 | exact_match = 100.0 * sum(exact_scores.values()) / total 11 | f1 = 100.0 * sum(f1_scores.values()) / total 12 | return {'exact_match': exact_match, 'f1': f1} 13 | -------------------------------------------------------------------------------- /sotabencheval/semantic_segmentation/__init__.py: -------------------------------------------------------------------------------- 1 | __all__ = ["ADE20KEvaluator", "PASCALVOCEvaluator"] 2 | 3 | from sotabencheval.semantic_segmentation.ade20k import ADE20KEvaluator 4 | from sotabencheval.semantic_segmentation.pascalvoc import PASCALVOCEvaluator -------------------------------------------------------------------------------- /sotabencheval/semantic_segmentation/ade20k.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sotabenchapi.client import Client 3 | from sotabenchapi.core import BenchmarkResult, check_inputs 4 | import time 5 | 6 | from sotabencheval.utils import calculate_batch_hash, is_server, get_max_memory_allocated 7 | from sotabencheval.semantic_segmentation.utils import ConfusionMatrix 8 | 9 | 10 | class ADE20KEvaluator(object): 11 | """`ADE20K `_ benchmark. 12 | 13 | Examples: 14 | Evaluate a HRNetV2 model from the CSAILVision repository 15 | 16 | .. code-block:: python 17 | 18 | ... 19 | 20 | evaluator = ADE20KEvaluator(model_name='HRNetV2 (HRNetV2-W48)', paper_arxiv_id='1904.04514') 21 | 22 | for batch_data in loader: 23 | # process data 24 | batch_data = batch_data[0] 25 | seg_label = as_numpy(batch_data['seg_label'][0]) 26 | img_resized_list = batch_data['img_data'] 27 | 28 | torch.cuda.synchronize() 29 | tic = time.perf_counter() 30 | with torch.no_grad(): 31 | segSize = (seg_label.shape[0], seg_label.shape[1]) 32 | scores = torch.zeros(1, cfg.DATASET.num_class, segSize[0], segSize[1]) 33 | scores = async_copy_to(scores, gpu) 34 | 35 | for img in img_resized_list: 36 | feed_dict = batch_data.copy() 37 | feed_dict['img_data'] = img 38 | del feed_dict['img_ori'] 39 | del feed_dict['info'] 40 | feed_dict = async_copy_to(feed_dict, gpu) 41 | 42 | # forward pass 43 | scores_tmp = segmentation_module(feed_dict, segSize=segSize) 44 | scores = scores + scores_tmp / len(cfg.DATASET.imgSizes) 45 | 46 | _, pred = torch.max(scores, dim=1) 47 | pred = as_numpy(pred.squeeze(0).cpu()) 48 | 49 | torch.cuda.synchronize() 50 | 51 | evaluator.add(outputs=pred.flatten(), 52 | targets=seg_label.flatten()) 53 | 54 | if evaluator.cache_exists: 55 | break 56 | 57 | evaluator.save() 58 | """ 59 | 60 | task = "Semantic Segmentation" 61 | 62 | def __init__(self, 63 | model_name: str = None, 64 | paper_arxiv_id: str = None, 65 | paper_pwc_id: str = None, 66 | paper_results: dict = None, 67 | model_description=None): 68 | """Initializes an ADE20K Evaluator object 69 | 70 | Args: 71 | model_name (str, optional): The name of the model from the 72 | paper - if you want to link your build to a machine learning 73 | paper. See the ADE20K benchmark page for model names, 74 | https://sotabench.com/benchmarks/semantic-segmentation-on-ade20k-val, 75 | e.g. on the paper leaderboard tab.
76 | paper_arxiv_id (str, optional): Optional linking to arXiv if you 77 | want to link to papers on the leaderboard; put in the 78 | corresponding paper's arXiv ID, e.g. '1611.05431'. 79 | paper_pwc_id (str, optional): Optional linking to Papers With Code; 80 | put in the corresponding papers with code URL slug, e.g. 81 | 'u-gat-it-unsupervised-generative-attentional' 82 | paper_results (dict, optional) : If the paper you are reproducing 83 | does not have model results on sotabench.com, you can specify 84 | the paper results yourself through this argument, where keys 85 | are metric names, values are metric values. e.g:: 86 | 87 | {'mIOU': 0.4566, 'Accuracy': 0.543}. 88 | 89 | Ensure that the metric names match those on the sotabench 90 | leaderboard - for ADE20K it should be 'mIOU', 'Accuracy' 91 | model_description (str, optional): Optional model description. 92 | download (bool) : whether to download the data or not 93 | """ 94 | 95 | # Model metadata 96 | 97 | self.model_name = model_name 98 | self.paper_arxiv_id = paper_arxiv_id 99 | self.paper_pwc_id = paper_pwc_id 100 | self.paper_results = paper_results 101 | self.model_description = model_description 102 | 103 | self.ade20k_evaluator = ConfusionMatrix(150) 104 | 105 | self.outputs = np.array([]) 106 | self.targets = np.array([]) 107 | 108 | self.results = None 109 | 110 | # Backend variables for hashing and caching 111 | 112 | self.first_batch_processed = False 113 | self.batch_hash = None 114 | self.cached_results = False 115 | 116 | # Speed and memory metrics 117 | 118 | self.init_time = time.time() 119 | self.speed_mem_metrics = {} 120 | 121 | @property 122 | def cache_exists(self): 123 | """ 124 | Checks whether the cache exists in the sotabench.com database - if so 125 | then sets self.results to cached results and returns True. 126 | 127 | You can use this property for control flow to break a for loop over a dataset 128 | after the first iteration. This prevents rerunning the same calculation for the 129 | same model twice. 130 | 131 | Examples: 132 | Breaking a for loop 133 | 134 | .. code-block:: python 135 | 136 | ... 137 | 138 | with torch.no_grad(): 139 | for i, (input, target) in enumerate(iterator): 140 | ... 141 | output = model(input) 142 | # output and target should then be flattened into 1D np.ndarrays and passed in below 143 | evaluator.update(output=output, target=target) 144 | 145 | if evaluator.cache_exists: 146 | break 147 | 148 | evaluator.save() 149 | 150 | :return: bool or None (if not in check mode) 151 | """ 152 | if not self.first_batch_processed: 153 | raise ValueError('No batches of data have been processed so no batch_hash exists') 154 | 155 | if not is_server(): 156 | return None 157 | 158 | client = Client.public() 159 | cached_res = client.get_results_by_run_hash(self.batch_hash) 160 | if cached_res: 161 | self.results = cached_res 162 | self.cached_results = True 163 | print( 164 | "No model change detected (using the first batch run " 165 | "hash). Will use cached results." 166 | ) 167 | return True 168 | 169 | return False 170 | 171 | def add(self, outputs: np.ndarray, targets: np.ndarray): 172 | """ 173 | Update the evaluator with new results from the model 174 | 175 | :param outputs (np.ndarray): 1D np.ndarray of semantic class predictions per pixel 176 | :param targets (np.ndarray): 1D np.ndarray of ground truth semantic classes per pixel 177 | 178 | The method requires an outputs input and a targets input - both flattened. 
179 | 180 | Suppose you are making predictions, batch by batch, and have your model outputs 181 | and the original targets with batch_size 32, and image size 520 x 480. 182 | The shape of your outputs might look like this: 183 | 184 | batch_output.shape 185 | >> (32, 150, 520, 480) # where 150 is the number of ADE20K classes 186 | 187 | batch_target.shape 188 | >> (32, 520, 480) 189 | 190 | We can flatten the entire output and targets to 1D vectors for each pixel: 191 | 192 | flattened_batch_output.shape 193 | >> (7987200) # flatten by taking the max class prediction 194 | # (batch_output.argmax(1).flatten() in torch with class as second dimension) 195 | 196 | flattened_batch_target.shape 197 | >> (7987200) # (batch_target.flatten() in torch) 198 | 199 | The output might look something like this: 200 | 201 | flattened_batch_output 202 | >> array([6, 6, 6, 6, 6, ...]) 203 | 204 | flattened_batch_target 205 | >> array([6, 6, 6, 6, 6, ...]) 206 | 207 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5 208 | pixels - so the model is correct. 209 | 210 | These flattened arrays can then be passed into the .add() method of the evaluator 211 | 212 | .. code-block:: python 213 | 214 | my_evaluator.add(outputs=flattened_batch_output, 215 | targets=flattened_batch_target) 216 | 217 | 218 | :return: void - updates self.ade20k_evaluator with the data, and updates self.targets and self.outputs 219 | """ 220 | self.ade20k_evaluator.update(targets, outputs) 221 | 222 | self.targets = np.append(self.targets, targets) 223 | self.outputs = np.append(self.outputs, outputs) 224 | 225 | if not self.first_batch_processed: 226 | acc_global, acc, iu = self.ade20k_evaluator.compute() 227 | self.batch_hash = calculate_batch_hash(np.append( 228 | np.append(np.around(targets, 3), np.around(outputs, 3)), 229 | np.around(np.array([acc_global.item(), iu.mean().item()]), 3))) 230 | self.first_batch_processed = True 231 | 232 | def get_results(self): 233 | """ 234 | Reruns the evaluation using the accumulated predictions, returns ADE20K results with IOU and 235 | Accuracy metrics 236 | 237 | :return: dict with ADE20K metrics 238 | """ 239 | if self.cached_results: 240 | return self.results 241 | 242 | self.ade20k_evaluator = ConfusionMatrix(150) 243 | self.ade20k_evaluator.update(self.targets.astype(np.int64), self.outputs.astype(np.int64)) 244 | 245 | acc_global, acc, iu = self.ade20k_evaluator.compute() 246 | 247 | self.results = { 248 | "Accuracy": acc_global.item(), 249 | "Mean IOU": iu.mean().item(), 250 | } 251 | 252 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated() 253 | 254 | return self.results 255 | 256 | def reset_time(self): 257 | """ 258 | Simple method to reset the timer self.init_time. Often used before a loop, to time the evaluation 259 | appropriately, for example: 260 | 261 | :return: void - resets self.init_time 262 | """ 263 | self.init_time = time.time() 264 | 265 | def save(self): 266 | """ 267 | Calculate results and then put into a BenchmarkResult object 268 | 269 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded 270 | on the platform.
271 | 272 | :return: BenchmarkResult object with results and metadata 273 | """ 274 | 275 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation 276 | self.get_results() 277 | 278 | # If this is the first time the model is run, then we record evaluation time information 279 | 280 | if not self.cached_results: 281 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None 282 | self.speed_mem_metrics['Tasks'] = None 283 | self.speed_mem_metrics['Evaluation Time'] = (time.time() - self.init_time) 284 | else: 285 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None 286 | self.speed_mem_metrics['Tasks'] = None 287 | self.speed_mem_metrics['Evaluation Time'] = None 288 | 289 | return BenchmarkResult( 290 | task=self.task, 291 | config={}, 292 | dataset='ADE20K val', 293 | results=self.results, 294 | speed_mem_metrics=self.speed_mem_metrics, 295 | model=self.model_name, 296 | model_description=self.model_description, 297 | arxiv_id=self.paper_arxiv_id, 298 | pwc_id=self.paper_pwc_id, 299 | paper_results=self.paper_results, 300 | run_hash=self.batch_hash, 301 | ) 302 | -------------------------------------------------------------------------------- /sotabencheval/semantic_segmentation/pascalvoc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sotabenchapi.client import Client 3 | from sotabenchapi.core import BenchmarkResult, check_inputs 4 | import time 5 | 6 | from sotabencheval.utils import calculate_batch_hash, is_server, get_max_memory_allocated 7 | from sotabencheval.semantic_segmentation.utils import ConfusionMatrix 8 | 9 | 10 | class PASCALVOCEvaluator(object): 11 | """`PASCAL VOC `_ benchmark. 12 | 13 | Examples: 14 | Evaluate a FCN model from the torchvision repository: 15 | 16 | .. code-block:: python 17 | 18 | ... 19 | 20 | evaluator = PASCALVOCEvaluator(model_name='FCN ResNet-101', paper_arxiv_id='1605.06211') 21 | 22 | with torch.no_grad(): 23 | for i, (input, target) in enumerate(iterator): 24 | ... 25 | output = model(input) 26 | # output and target should then be flattened into 1D np.ndarrays and passed in below 27 | evaluator.add(outputs=output, targets=target) 28 | 29 | if evaluator.cache_exists: 30 | break 31 | 32 | evaluator.save() 33 | """ 34 | 35 | task = "Semantic Segmentation" 36 | 37 | def __init__(self, 38 | model_name: str = None, 39 | paper_arxiv_id: str = None, 40 | paper_pwc_id: str = None, 41 | paper_results: dict = None, 42 | model_description=None): 43 | """Initializes a PASCAL VOC Evaluator object 44 | 45 | Args: 46 | model_name (str, optional): The name of the model from the 47 | paper - if you want to link your build to a machine learning 48 | paper. See the VOC benchmark page for model names, 49 | https://sotabench.com/benchmarks/semantic-segmentation-on-pascal-voc-2012-val, 50 | e.g. on the paper leaderboard tab. 51 | paper_arxiv_id (str, optional): Optional linking to arXiv if you 52 | want to link to papers on the leaderboard; put in the 53 | corresponding paper's arXiv ID, e.g. '1611.05431'. 54 | paper_pwc_id (str, optional): Optional linking to Papers With Code; 55 | put in the corresponding papers with code URL slug, e.g. 56 | 'u-gat-it-unsupervised-generative-attentional' 57 | paper_results (dict, optional) : If the paper you are reproducing 58 | does not have model results on sotabench.com, you can specify 59 | the paper results yourself through this argument, where keys 60 | are metric names, values are metric values.
e.g:: 61 | 62 | {'Mean IOU': 76.42709, 'Accuracy': 95.31, ...}. 63 | 64 | Ensure that the metric names match those on the sotabench 65 | leaderboard - for PASCAL VOC it should be 'Mean IOU', 'Accuracy' 66 | model_description (str, optional): Optional model description. 67 | """ 68 | 69 | # Model metadata 70 | 71 | self.model_name = model_name 72 | self.paper_arxiv_id = paper_arxiv_id 73 | self.paper_pwc_id = paper_pwc_id 74 | self.paper_results = paper_results 75 | self.model_description = model_description 76 | 77 | self.voc_evaluator = ConfusionMatrix(21) 78 | 79 | self.outputs = np.array([]) 80 | self.targets = np.array([]) 81 | 82 | self.results = None 83 | 84 | # Backend variables for hashing and caching 85 | 86 | self.first_batch_processed = False 87 | self.batch_hash = None 88 | self.cached_results = False 89 | 90 | # Speed and memory metrics 91 | 92 | self.init_time = time.time() 93 | self.speed_mem_metrics = {} 94 | 95 | @property 96 | def cache_exists(self): 97 | """ 98 | Checks whether the cache exists in the sotabench.com database - if so 99 | then sets self.results to cached results and returns True. 100 | 101 | You can use this property for control flow to break a for loop over a dataset 102 | after the first iteration. This prevents rerunning the same calculation for the 103 | same model twice. 104 | 105 | Examples: 106 | Breaking a for loop 107 | 108 | .. code-block:: python 109 | 110 | ... 111 | 112 | with torch.no_grad(): 113 | for i, (input, target) in enumerate(iterator): 114 | ... 115 | output = model(input) 116 | # output and target should then be flattened into 1D np.ndarrays and passed in below 117 | evaluator.update(output=output, target=target) 118 | 119 | if evaluator.cache_exists: 120 | break 121 | 122 | evaluator.save() 123 | 124 | :return: bool or None (if not in check mode) 125 | """ 126 | if not self.first_batch_processed: 127 | raise ValueError('No batches of data have been processed so no batch_hash exists') 128 | 129 | if not is_server(): 130 | return None 131 | 132 | client = Client.public() 133 | cached_res = client.get_results_by_run_hash(self.batch_hash) 134 | if cached_res: 135 | self.results = cached_res 136 | self.cached_results = True 137 | print( 138 | "No model change detected (using the first batch run " 139 | "hash). Will use cached results." 140 | ) 141 | return True 142 | 143 | return False 144 | 145 | def add(self, outputs: np.ndarray, targets: np.ndarray): 146 | """ 147 | Update the evaluator with new results from the model 148 | 149 | :param outputs (np.ndarray): 1D np.ndarray of semantic class predictions per pixel 150 | :param targets (np.ndarray): 1D np.ndarray of ground truth semantic classes per pixel 151 | 152 | The method requires an outputs input and a targets input - both flattened. 153 | 154 | Suppose you are making predictions, batch by batch, and have your model outputs 155 | and the original targets with batch_size 32, and image size 520 x 480. 
156 | The shape of your outputs might look like this: 157 | 158 | batch_output.shape 159 | >> (32, 21, 520, 480) # where 21 is the number of VOC classes 160 | 161 | batch_target.shape 162 | >> (32, 520, 480) 163 | 164 | We can flatten the entire output and targets to 1D vectors for each pixel: 165 | 166 | flattened_batch_output.shape 167 | >> (7987200) # flatten by taking the max class prediction 168 | # (batch_output.argmax(1).flatten() in torch with class as second dimension) 169 | 170 | flattened_batch_target.shape 171 | >> (7987200) # (batch_target.flatten() in torch) 172 | 173 | The output might look something like this: 174 | 175 | flattened_batch_output 176 | >> array([6, 6, 6, 6, 6, ...]) 177 | 178 | flattened_batch_target 179 | >> array([6, 6, 6, 6, 6, ...]) 180 | 181 | In both cases, the prediction and ground truth have class 6 as the semantic label for the first 5 182 | pixels - so the model is correct. 183 | 184 | These flattened arrays can then be passed into the .add() method of the evaluator 185 | 186 | .. code-block:: python 187 | 188 | my_evaluator.add(outputs=flattened_batch_output, 189 | targets=flattened_batch_target) 190 | 191 | 192 | :return: void - updates self.voc_evaluator with the data, and updates self.targets and self.outputs 193 | """ 194 | self.voc_evaluator.update(targets, outputs) 195 | 196 | self.targets = np.append(self.targets, targets) 197 | self.outputs = np.append(self.outputs, outputs) 198 | 199 | if not self.first_batch_processed: 200 | acc_global, acc, iu = self.voc_evaluator.compute() 201 | self.batch_hash = calculate_batch_hash(np.append( 202 | np.append(np.around(targets, 3), np.around(outputs, 3)), 203 | np.around(np.array([acc_global.item(), iu.mean().item()]), 3))) 204 | self.first_batch_processed = True 205 | 206 | def get_results(self): 207 | """ 208 | Reruns the evaluation using the accumulated predictions, returns VOC results with IOU and 209 | Accuracy metrics 210 | 211 | :return: dict with PASCAL VOC metrics 212 | """ 213 | if self.cached_results: 214 | return self.results 215 | 216 | self.voc_evaluator = ConfusionMatrix(21) 217 | self.voc_evaluator.update(self.targets.astype(np.int64), self.outputs.astype(np.int64)) 218 | 219 | acc_global, acc, iu = self.voc_evaluator.compute() 220 | 221 | self.results = { 222 | "Accuracy": acc_global.item(), 223 | "Mean IOU": iu.mean().item(), 224 | } 225 | 226 | self.speed_mem_metrics['Max Memory Allocated (Total)'] = get_max_memory_allocated() 227 | 228 | return self.results 229 | 230 | def reset_time(self): 231 | """ 232 | Simple method to reset the timer self.init_time. Often used before a loop, to time the evaluation 233 | appropriately, for example: 234 | 235 | :return: void - resets self.init_time 236 | """ 237 | self.init_time = time.time() 238 | 239 | def save(self): 240 | """ 241 | Calculate results and then put into a BenchmarkResult object 242 | 243 | On the sotabench.com server, this will produce a JSON file serialisation and results will be recorded 244 | on the platform.
245 | 246 | :return: BenchmarkResult object with results and metadata 247 | """ 248 | # recalculate to ensure no mistakes made during batch-by-batch metric calculation 249 | self.get_results() 250 | 251 | # If this is the first time the model is run, then we record evaluation time information 252 | 253 | if not self.cached_results: 254 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None 255 | self.speed_mem_metrics['Tasks'] = None 256 | self.speed_mem_metrics['Evaluation Time'] = (time.time() - self.init_time) 257 | else: 258 | self.speed_mem_metrics['Tasks / Evaluation Time'] = None 259 | self.speed_mem_metrics['Tasks'] = None 260 | self.speed_mem_metrics['Evaluation Time'] = None 261 | 262 | return BenchmarkResult( 263 | task=self.task, 264 | config={}, 265 | dataset='PASCAL VOC 2012 val', 266 | results=self.results, 267 | speed_mem_metrics=self.speed_mem_metrics, 268 | model=self.model_name, 269 | model_description=self.model_description, 270 | arxiv_id=self.paper_arxiv_id, 271 | pwc_id=self.paper_pwc_id, 272 | paper_results=self.paper_results, 273 | run_hash=self.batch_hash, 274 | ) 275 | -------------------------------------------------------------------------------- /sotabencheval/semantic_segmentation/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | class ConfusionMatrix(object): 5 | def __init__(self, num_classes): 6 | self.num_classes = num_classes 7 | self.mat = None 8 | 9 | def update(self, a, b): 10 | """Accumulates a batch of per-pixel labels into the confusion matrix. 11 | 12 | :param a: 1D np.ndarray of ground truth class indices 13 | :param b: 1D np.ndarray of predicted class indices 14 | 15 | Entries of a outside the range [0, num_classes) are ignored. 16 | """ 17 | n = self.num_classes 18 | 19 | if self.mat is None: 20 | self.mat = np.zeros((n, n), dtype=np.int64) 21 | 22 | k = (a >= 0) & (a < n) 23 | inds = n * a[k].astype(np.int64) + b[k] 24 | self.mat += np.bincount(inds, minlength=n ** 2).reshape(n, n) 25 | 26 | def reset(self): 27 | self.mat.fill(0) 28 | 29 | def compute(self): 30 | h = self.mat 31 | acc_global = np.diag(h).sum() / h.sum() 32 | acc = np.diag(h) / h.sum(1) 33 | iu = np.diag(h) / (h.sum(1) + h.sum(0) - np.diag(h)) 34 | return acc_global, acc, iu 35 | 36 | def __str__(self): 37 | acc_global, acc, iu = self.compute() 38 | return ( 39 | "global correct: {:.1f}\n" 40 | "average row correct: {}\n" 41 | "IoU: {}\n" 42 | "mean IoU: {:.1f}" 43 | ).format( 44 | acc_global.item() * 100, 45 | ["{:.1f}".format(i) for i in (acc * 100).tolist()], 46 | ["{:.1f}".format(i) for i in (iu * 100).tolist()], 47 | iu.mean().item() * 100, 48 | ) 49 | 50 | -------------------------------------------------------------------------------- /sotabencheval/utils.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import gzip 3 | import errno 4 | import tarfile 5 | import zipfile 6 | import os 7 | from tqdm import tqdm 8 | from pathlib import Path 9 | 10 | 11 | SOTABENCH_CACHE = Path.home() / ".cache" 12 | 13 | 14 | class AverageMeter(object): 15 | """Computes and stores the average and current value.""" 16 | 17 | def __init__(self): 18 | self.val = 0 19 | self.avg = 0 20 | self.sum = 0 21 | self.count = 0 22 | 23 | def reset(self): 24 | self.val = 0 25 | self.avg = 0 26 | self.sum = 0 27 | self.count = 0 28 | 29 | def update(self, val, n=1): 30 | self.val = val 31 | self.sum += val * n 32 | self.count += n 33 | self.avg = self.sum / self.count 34 | 35 | 36 | def calculate_batch_hash(output): 37 | """Calculate the hash
for the output of a batch 38 | 39 | Output is passed into this method, stringified, and a hash is taken of the contents. For example, 40 | it could be an list of predictions that is passed in. 41 | 42 | Args: 43 | output: data to be hashed 44 | """ 45 | m = hashlib.sha256() 46 | m.update(str(output).encode("utf-8")) 47 | return m.hexdigest() 48 | 49 | 50 | def change_root_if_server(root: str, server_root: str): 51 | """ 52 | This method checks whether code is being executed on the sotabench server - if so it returns 53 | server_root, else root. Written as a method so the user doesn't have to fiddle with environmental 54 | variables. 55 | 56 | :param root: (str) a user-specified root 57 | :param server_root: (str) a server root 58 | :return: server_root if SOTABENCH_SERVER env variable is set, else root 59 | """ 60 | check_server = os.environ.get("SOTABENCH_SERVER") 61 | 62 | if check_server == 'true': 63 | return server_root 64 | 65 | return root 66 | 67 | 68 | def is_server(): 69 | """ 70 | Checks whether code is being executed on server; if so, returns True else False. 71 | 72 | Uses env variable SOTABENCH_SERVER to determine whether code is being run on the server. 73 | 74 | You can use this function for your control flow for server specific settings - e.g. the data paths. 75 | 76 | Examples: 77 | 78 | .. code-block:: python 79 | 80 | 81 | from sotabencheval.utils import is_server 82 | 83 | if is_server(): 84 | DATA_ROOT = './.data/vision/imagenet' 85 | else: # local settings 86 | DATA_ROOT = '/home/ubuntu/my_data/' 87 | 88 | :return: bool - whether the code is being run on the server or not 89 | """ 90 | if os.environ.get("SOTABENCH_SERVER") == 'true': 91 | return True 92 | else: 93 | return False 94 | 95 | 96 | def set_env_on_server(env_name: str, value): 97 | """ 98 | If run on sotabench server, sets an environment variable with a given name to value (casted to str). 99 | 100 | :param env_name: (str) environment variable name 101 | :param value: value to set if executed on sotabench 102 | :return: bool - whether code is being run on the server 103 | """ 104 | if is_server(): 105 | os.environ[env_name] = str(value) 106 | return True 107 | return False 108 | 109 | 110 | def get_max_memory_allocated(device: str = 'cuda'): 111 | """ 112 | Finds out the maximum memory allocated, then clears the max memory allocated. 113 | 114 | This currently only works for PyTorch models. 115 | 116 | TODO: Support TensorFlow and MXNet. 117 | 118 | :param device: (str) - name of device (Torch style) -> e.g. 
'cuda' 119 | :return: float or None - if torch is in the environment, max memory allocated, else None 120 | """ 121 | try: 122 | import torch 123 | max_mem = torch.cuda.max_memory_allocated(device=device) 124 | torch.cuda.reset_max_memory_allocated(device=device) 125 | return max_mem 126 | except ImportError: 127 | return None 128 | 129 | # Below the utilities have been taken directly from the torchvision repository 130 | # Contains helper functions for unzipping and making directories 131 | # https://github.com/pytorch/vision/tree/master/torchvision 132 | 133 | 134 | def makedir_exist_ok(dirpath): 135 | """ 136 | Python2 support for os.makedirs(.., exist_ok=True) 137 | """ 138 | try: 139 | os.makedirs(dirpath) 140 | except OSError as e: 141 | if e.errno == errno.EEXIST: 142 | pass 143 | else: 144 | raise 145 | 146 | def gen_bar_updater(): 147 | pbar = tqdm(total=None) 148 | 149 | def bar_update(count, block_size, total_size): 150 | if pbar.total is None and total_size: 151 | pbar.total = total_size 152 | progress_bytes = count * block_size 153 | pbar.update(progress_bytes - pbar.n) 154 | 155 | return bar_update 156 | 157 | 158 | def calculate_md5(fpath, chunk_size=1024 * 1024): 159 | md5 = hashlib.md5() 160 | with open(fpath, 'rb') as f: 161 | for chunk in iter(lambda: f.read(chunk_size), b''): 162 | md5.update(chunk) 163 | return md5.hexdigest() 164 | 165 | 166 | def check_md5(fpath, md5, **kwargs): 167 | return md5 == calculate_md5(fpath, **kwargs) 168 | 169 | 170 | def check_integrity(fpath, md5=None): 171 | if not os.path.isfile(fpath): 172 | return False 173 | if md5 is None: 174 | return True 175 | return check_md5(fpath, md5) 176 | 177 | 178 | def download_url(url, root, filename=None, md5=None): 179 | """Download a file from a url and place it in root - utility function taken from torchvision repository 180 | Args: 181 | url (str): URL to download file from 182 | root (str): Directory to place downloaded file in 183 | filename (str, optional): Name to save the file under. If None, use the basename of the URL 184 | md5 (str, optional): MD5 checksum of the download. If None, do not check 185 | """ 186 | from six.moves import urllib 187 | 188 | root = os.path.expanduser(root) 189 | if not filename: 190 | filename = os.path.basename(url) 191 | fpath = os.path.join(root, filename) 192 | 193 | makedir_exist_ok(root) 194 | 195 | # downloads file 196 | if check_integrity(fpath, md5): 197 | print('Using downloaded and verified file: ' + fpath) 198 | else: 199 | try: 200 | print('Downloading ' + url + ' to ' + fpath) 201 | urllib.request.urlretrieve( 202 | url, fpath, 203 | reporthook=gen_bar_updater() 204 | ) 205 | except (urllib.error.URLError, IOError) as e: 206 | if url[:5] == 'https': 207 | url = url.replace('https:', 'http:') 208 | print('Failed download. Trying https -> http instead.' 
209 | ' Downloading ' + url + ' to ' + fpath) 210 | urllib.request.urlretrieve( 211 | url, fpath, 212 | reporthook=gen_bar_updater() 213 | ) 214 | else: 215 | raise e 216 | 217 | 218 | def _is_tar(filename): 219 | return filename.endswith(".tar") 220 | 221 | 222 | def _is_targz(filename): 223 | return filename.endswith(".tar.gz") 224 | 225 | 226 | def _is_gzip(filename): 227 | return filename.endswith(".gz") and not filename.endswith(".tar.gz") 228 | 229 | 230 | def _is_zip(filename): 231 | return filename.endswith(".zip") 232 | 233 | 234 | def extract_archive(from_path, to_path=None, remove_finished=False): 235 | if to_path is None: 236 | to_path = os.path.dirname(from_path) 237 | 238 | if _is_tar(from_path): 239 | with tarfile.open(from_path, 'r') as tar: 240 | tar.extractall(path=to_path) 241 | elif _is_targz(from_path): 242 | with tarfile.open(from_path, 'r:gz') as tar: 243 | tar.extractall(path=to_path) 244 | elif _is_gzip(from_path): 245 | to_path = os.path.join(to_path, os.path.splitext(os.path.basename(from_path))[0]) 246 | with open(to_path, "wb") as out_f, gzip.GzipFile(from_path) as zip_f: 247 | out_f.write(zip_f.read()) 248 | elif _is_zip(from_path): 249 | with zipfile.ZipFile(from_path, 'r') as z: 250 | z.extractall(to_path) 251 | else: 252 | raise ValueError("Extraction of {} not supported".format(from_path)) 253 | 254 | if remove_finished: 255 | os.remove(from_path) 256 | -------------------------------------------------------------------------------- /sotabencheval/version.py: -------------------------------------------------------------------------------- 1 | class Version: 2 | __slots__ = ("major", "minor", "build") 3 | 4 | def __init__(self, major, minor, build): 5 | self.major = major 6 | self.minor = minor 7 | self.build = build 8 | 9 | def __str__(self): 10 | return f"{self.major}.{self.minor}.{self.build}" 11 | 12 | def __repr__(self): 13 | return ( 14 | f"Version(major={self.major}, minor={self.minor}, " 15 | f"build={self.build})" 16 | ) 17 | 18 | version = Version(0, 0, 38) 19 | 20 | __version__ = str(version) 21 | --------------------------------------------------------------------------------
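A note on the ConfusionMatrix helper in sotabencheval/semantic_segmentation/utils.py: both segmentation evaluators accumulate flattened per-pixel targets and argmax predictions into it, then read Accuracy and Mean IOU from its compute() output. The following is a minimal, self-contained sketch of that flow; the label arrays are made up (six pixels, three classes) purely for illustration.

import numpy as np
from sotabencheval.semantic_segmentation.utils import ConfusionMatrix

# Hypothetical flattened per-pixel labels for a tiny 3-class example.
targets = np.array([0, 1, 2, 2, 1, 0])  # ground truth class per pixel
outputs = np.array([0, 1, 2, 1, 1, 0])  # predicted class per pixel (one pixel wrong)

cm = ConfusionMatrix(num_classes=3)
cm.update(targets, outputs)         # call once per batch to accumulate counts
acc_global, acc, iu = cm.compute()  # overall accuracy, per-class accuracy, per-class IoU

print(acc_global)   # 5 of 6 pixels correct -> ~0.833
print(iu.mean())    # mean IoU over the three classes

In the evaluators, targets and outputs would come from batch_target.flatten() and batch_output.argmax(1).flatten() respectively, as described in the add() docstrings above.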