├── CODE_OF_CONDUCT.md ├── COMET ├── .github │ ├── ISSUE_TEMPLATE │ │ ├── bug_report.md │ │ ├── feature_request.md │ │ ├── questions-and-help.md │ │ └── typos-and-doc-fixes.md │ └── workflows │ │ └── ci.yaml ├── .gitignore ├── .idea │ ├── COMET.iml │ ├── deployment.xml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── runConfigurations │ │ └── contrapro_comet.xml │ └── workspace.xml ├── CONTRIBUTING.md ├── LICENSE ├── MANIFEST.in ├── METRICS.md ├── README.md ├── add_context.py ├── comet │ ├── __init__.py │ ├── cli │ │ ├── compare.py │ │ ├── mbr.py │ │ ├── score.py │ │ └── train.py │ ├── download_utils.py │ ├── encoders │ │ ├── __init__.py │ │ ├── base.py │ │ ├── bert.py │ │ ├── minilm.py │ │ ├── xlmr.py │ │ └── xlmr_xl.py │ ├── models │ │ ├── __init__.py │ │ ├── base.py │ │ ├── lru_cache.py │ │ ├── metrics.py │ │ ├── pooling_utils.py │ │ ├── predict_pbar.py │ │ ├── ranking │ │ │ ├── __init__.py │ │ │ └── ranking_metric.py │ │ └── regression │ │ │ ├── __init__.py │ │ │ ├── referenceless.py │ │ │ └── regression_metric.py │ └── modules │ │ ├── __init__.py │ │ ├── feedforward.py │ │ └── layerwise_attention.py ├── configs │ ├── early_stopping.yaml │ ├── model_checkpoint.yaml │ ├── models │ │ ├── ranking_metric.yaml │ │ ├── referenceless_metric.yaml │ │ └── regression_metric.yaml │ └── trainer.yaml ├── docs │ ├── Makefile │ ├── make.bat │ └── source │ │ ├── _static │ │ ├── css │ │ │ └── comet.css │ │ └── img │ │ │ ├── COMET_lockup-dark.png │ │ │ ├── COMET_lockup-white.png │ │ │ ├── estimator_model.jpg │ │ │ ├── logo.png │ │ │ ├── models.png │ │ │ └── ranking_model.jpg │ │ ├── conf.py │ │ ├── index.rst │ │ ├── installation.rst │ │ ├── library.rst │ │ ├── models.md │ │ ├── running.rst │ │ └── training.md ├── poetry.lock ├── pyproject.toml ├── requirements.txt └── tests │ ├── __init__.py │ ├── integration │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── test_ranking_metric.py │ │ ├── test_referenceless_regression.py │ │ └── test_regression_metric.py │ └── modules │ │ └── test_feedforward.py │ └── unit │ ├── __init__.py │ ├── encoders │ ├── __init__.py │ ├── test_bert.py │ └── test_xlmr.py │ └── test_download_utils.py ├── CONTRIBUTING.md ├── Config ├── LICENSE ├── NOTICE ├── Prism ├── README.md ├── add_context.py └── prism.py ├── README.md ├── THIRD-PARTY-LICENSES_DOC_MT_METRICS.txt ├── bert_score ├── .gitignore ├── .idea │ ├── bert_score.iml │ ├── deployment.xml │ ├── inspectionProfiles │ │ └── profiles_settings.xml │ ├── misc.xml │ ├── modules.xml │ ├── remote-mappings.xml │ └── workspace.xml ├── .travis.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── add_context.py ├── bert_score.png ├── bert_score │ ├── __init__.py │ ├── rescale_baseline │ │ ├── cs │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── de │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── en-sci │ │ │ └── allenai │ │ │ │ └── scibert_scivocab_uncased.tsv │ │ ├── en │ │ │ ├── albert-base-v1.tsv │ │ │ ├── albert-base-v2.tsv │ │ │ ├── albert-large-v1.tsv │ │ │ ├── albert-large-v2.tsv │ │ │ ├── albert-xlarge-v1.tsv │ │ │ ├── albert-xlarge-v2.tsv │ │ │ ├── albert-xxlarge-v1.tsv │ │ │ ├── albert-xxlarge-v2.tsv │ │ │ ├── bert-base-cased-finetuned-mrpc.tsv │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── bert-base-uncased.tsv │ │ │ ├── bert-large-uncased.tsv │ │ │ ├── 
distilbert-base-multilingual-cased.tsv │ │ │ ├── distilbert-base-uncased-distilled-squad.tsv │ │ │ ├── distilbert-base-uncased.tsv │ │ │ ├── distilroberta-base.tsv │ │ │ ├── microsoft │ │ │ │ ├── deberta-base-mnli.tsv │ │ │ │ ├── deberta-base.tsv │ │ │ │ ├── deberta-large-mnli.tsv │ │ │ │ ├── deberta-large.tsv │ │ │ │ ├── deberta-xlarge-mnli.tsv │ │ │ │ └── deberta-xlarge.tsv │ │ │ ├── roberta-base.tsv │ │ │ ├── roberta-large-mnli.tsv │ │ │ ├── roberta-large.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-mlm-en-2048.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ ├── xlm-roberta-large.tsv │ │ │ ├── xlnet-base-cased.tsv │ │ │ └── xlnet-large-cased.tsv │ │ ├── es │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── et │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── fi │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── fr │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── it │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── lv │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ ├── pt │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ ├── xlm-roberta-base.tsv │ │ │ └── xlm-roberta-large.tsv │ │ └── zh │ │ │ ├── bert-base-chinese.tsv │ │ │ ├── bert-base-multilingual-cased.tsv │ │ │ ├── xlm-mlm-100-1280.tsv │ │ │ └── xlm-roberta-base.tsv │ ├── score.py │ ├── scorer.py │ └── utils.py ├── bert_score_cli │ ├── __init__.py │ ├── score.py │ └── visualize.py ├── get_rescale_baseline │ ├── README.md │ ├── download_text_data.sh │ ├── get_baseline_example.sh │ └── get_rescale_baseline.py ├── journal │ ├── rescale_baseline.md │ └── static │ │ ├── .png │ │ ├── after.png │ │ └── before.png ├── requirements.txt ├── setup.py ├── tests │ ├── __init__.py │ ├── custom_assertions.py │ ├── test_score_function.py │ └── test_scorer.py ├── tune_layers │ ├── README.md │ ├── download_data.sh │ ├── tune.sh │ └── tune_layers.py └── upload_pypi.sh ├── media └── bertscore.png └── score_doc-metrics.py /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | ## Code of Conduct 2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 4 | opensource-codeofconduct@amazon.com with any additional questions or comments. 5 | -------------------------------------------------------------------------------- /COMET/.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🐛 Bug 11 | 12 | 13 | 14 | ### To Reproduce 15 | 16 | Before reporting a bug, make sure that the bug can be reproduced with a minimal example and add your relevant changes, to see if the issue persists. 
17 | 18 | If the test is failing, please add your test cases to the issue (as a draft PR, or simply paste the code into the issue description here). 19 | 20 | ### Expected behaviour 21 | A clear and concise description of what you expected to happen. 22 | 23 | ### Screenshots 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | ### Environment 27 | OS: [e.g. iOS, Linux, Win] 28 | Packaging: [e.g. pip, conda] 29 | Version: [e.g. 0.5.2.1] 30 | 31 | ### Additional context 32 | 33 | 34 | -------------------------------------------------------------------------------- /COMET/.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 🚀 Feature 11 | 12 | 13 | ### Motivation 14 | 15 | 16 | 17 | ### Alternatives 18 | 19 | 20 | 21 | ### Additional context 22 | 23 | 24 | -------------------------------------------------------------------------------- /COMET/.github/ISSUE_TEMPLATE/questions-and-help.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Questions and Help 3 | about: Ask questions about COMET 4 | title: "[QUESTION]" 5 | labels: question 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## ❓ Questions and Help 11 | 12 | ### Before asking: 13 | 1. Search for similar [issues](https://github.com/Unbabel/COMET/issues). 14 | 2. Search the [docs](https://unbabel.github.io/COMET/html/index.html). 15 | 16 | 17 | 18 | #### What is your question? 19 | 20 | #### Code 21 | 22 | 23 | 24 | #### What have you tried? 25 | 26 | #### What's your environment? 27 | 28 | - OS: [e.g. iOS, Linux, Win] 29 | - Packaging: [e.g. pip, conda] 30 | - Version: [e.g. 0.5.2.1] 31 | -------------------------------------------------------------------------------- /COMET/.github/ISSUE_TEMPLATE/typos-and-doc-fixes.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Typos and doc fixes 3 | about: Typos and doc fixes 4 | title: '' 5 | labels: documentation, enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## 📚 Documentation 11 | 12 | If you find a typo or something that is not well explained in the documentation, please use this template to report it!
13 | -------------------------------------------------------------------------------- /COMET/.github/workflows/ci.yaml: -------------------------------------------------------------------------------- 1 | name: Python package 2 | 3 | on: [push, pull_request] 4 | 5 | jobs: 6 | build: 7 | 8 | runs-on: ubuntu-latest 9 | strategy: 10 | matrix: 11 | python-version: [3.7, 3.8, 3.9] 12 | 13 | steps: 14 | - uses: actions/checkout@v2 15 | - name: Set up Python ${{ matrix.python-version }} 16 | uses: actions/setup-python@v2 17 | with: 18 | python-version: ${{ matrix.python-version }} 19 | - name: Install Requirements 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install unbabel-comet 23 | comet-score --help 24 | -------------------------------------------------------------------------------- /COMET/.gitignore: -------------------------------------------------------------------------------- 1 | *.pkl 2 | *DS_Store 3 | data/ 4 | lightning_logs/ 5 | wmt21/ 6 | 7 | .vscode 8 | # Byte-compiled / optimized / DLL files 9 | __pycache__/ 10 | *.py[cod] 11 | *$py.class 12 | 13 | # C extensions 14 | *.so 15 | 16 | # Distribution / packaging 17 | .Python 18 | build/ 19 | develop-eggs/ 20 | dist/ 21 | downloads/ 22 | eggs/ 23 | .eggs/ 24 | lib/ 25 | lib64/ 26 | parts/ 27 | sdist/ 28 | var/ 29 | wheels/ 30 | pip-wheel-metadata/ 31 | share/python-wheels/ 32 | *.egg-info/ 33 | .installed.cfg 34 | *.egg 35 | MANIFEST 36 | 37 | # PyInstaller 38 | # Usually these files are written by a python script from a template 39 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 40 | *.manifest 41 | *.spec 42 | 43 | # Installer logs 44 | pip-log.txt 45 | pip-delete-this-directory.txt 46 | 47 | # Unit test / coverage reports 48 | htmlcov/ 49 | .tox/ 50 | .nox/ 51 | .coverage 52 | .coverage.* 53 | .cache 54 | nosetests.xml 55 | coverage.xml 56 | *.cover 57 | .hypothesis/ 58 | .pytest_cache/ 59 | 60 | # Translations 61 | *.mo 62 | *.pot 63 | 64 | # Django stuff: 65 | *.log 66 | local_settings.py 67 | db.sqlite3 68 | db.sqlite3-journal 69 | 70 | # Flask stuff: 71 | instance/ 72 | .webassets-cache 73 | 74 | # Scrapy stuff: 75 | .scrapy 76 | 77 | # Sphinx documentation 78 | docs/_build/ 79 | 80 | # PyBuilder 81 | target/ 82 | 83 | # Jupyter Notebook 84 | .ipynb_checkpoints 85 | 86 | # IPython 87 | profile_default/ 88 | ipython_config.py 89 | 90 | # pyenv 91 | .python-version 92 | 93 | # pipenv 94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 97 | # install all needed dependencies. 
98 | #Pipfile.lock 99 | 100 | # celery beat schedule file 101 | celerybeat-schedule 102 | 103 | # SageMath parsed files 104 | *.sage.py 105 | 106 | # Environments 107 | .env 108 | .venv 109 | env/ 110 | venv/ 111 | ENV/ 112 | env.bak/ 113 | venv.bak/ 114 | 115 | # Spyder project settings 116 | .spyderproject 117 | .spyproject 118 | 119 | # Rope project settings 120 | .ropeproject 121 | 122 | # mkdocs documentation 123 | /site 124 | 125 | # mypy 126 | .mypy_cache/ 127 | .dmypy.json 128 | dmypy.json 129 | 130 | # Pyre type checker 131 | .pyre/ 132 | 133 | # local files 134 | docids 135 | hyp.en 136 | ref.en 137 | src.de 138 | -------------------------------------------------------------------------------- /COMET/.idea/COMET.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /COMET/.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 22 | -------------------------------------------------------------------------------- /COMET/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /COMET/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /COMET/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /COMET/.idea/runConfigurations/contrapro_comet.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 24 | -------------------------------------------------------------------------------- /COMET/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contribution guide 2 | 3 | ## Overview 4 | 5 | COMET is an open-source toolkit aimed at developing state-of-the-art models that can act as MT evaluation metrics. While we do welcome contributions, guaranteeing their quality and usefulness requires that we follow some basic guidelines that ease development, collaboration and readability. 6 | 7 | ## Basic guidelines 8 | 9 | * The project must fully support Python 3.6 or later. 10 | * Code formatting must stick to the Facebook style: 80 columns and single quotes. Please make sure you have [black](https://github.com/ambv/black) installed, and run it before submitting changes. 11 | * Imports are sorted with [isort](https://github.com/timothycrosley/isort). 12 | * Filenames must be lowercase. 13 | * Tests run with [unittest](https://docs.python.org/3/library/unittest.html). Unittest implements standard test discovery, which means it searches for `test_*.py` files. We do not enforce a minimum code coverage, but it is preferable to have at least basic tests running for critical pieces of code. Always test functions that take or return tensor arguments, to document the expected sizes (see the sketch after this list). 14 | * The `comet` folder contains core features.
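As one concrete illustration of that last testing guideline, here is a minimal sketch of a shape-documenting test. It uses the `FeedForward` module from `comet/modules`; the dimensions are made up purely for illustration:

```python
import unittest

import torch

from comet.modules import FeedForward


class TestFeedForwardShapes(unittest.TestCase):
    def test_output_shape(self):
        # Documents the expected sizes: (batch, in_dim) -> (batch, out_dim).
        model = FeedForward(in_dim=8, out_dim=1, hidden_sizes=[4])
        in_features = torch.randn(2, 8)
        out = model(in_features)
        self.assertEqual(out.shape, torch.Size([2, 1]))


if __name__ == "__main__":
    unittest.main()
```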
15 | 16 | ## Contributing 17 | 18 | * Keep track of everything by creating issues and editing them with references to the code! Explain succinctly the problem you are trying to solve and your solution. 19 | * Contributions to `master` should be made through GitHub pull requests. 20 | * Work in a clean environment (`virtualenv` is nice). 21 | * Your commit message must start with an infinitive verb (Add, Fix, Remove, ...). 22 | * If your change is based on a paper, please include a clear comment and reference in the code and in the related issue. 23 | * To test your local changes, install COMET following the instructions in the [documentation](https://unbabel.github.io/COMET/html/index.html). 24 | -------------------------------------------------------------------------------- /COMET/MANIFEST.in: -------------------------------------------------------------------------------- 1 | include requirements.txt -------------------------------------------------------------------------------- /COMET/METRICS.md: -------------------------------------------------------------------------------- 1 | # WMT20 COMET Metrics: 2 | 3 | Our **Primary Metrics** are the models described in our [participation in the WMT20 Metrics Shared Task](https://aclanthology.org/2020.wmt-1.101.pdf): 4 | - `wmt20-comet-da`: This model was trained to predict _Direct Assessments_ from WMT17 to WMT19 using source, translation and reference. (Same as `wmt-large-da-estimator-1719` from previous versions.) 5 | - `wmt20-comet-qe-da`: This model was trained to predict _Direct Assessments_ from WMT17 to WMT19 using **source and translation only**! Also, this model is bounded between 0 and 1, which improves interpretability compared with the previous model. 6 | 7 | These two models were the best-performing metrics in the large-scale metrics study performed by Microsoft Research [Kocmi et al., 2021](https://arxiv.org/abs/2107.10821), which validates our findings. 8 | 9 | # EMNLP20 Metric: 10 | 11 | In our [initial COMET release](https://aclanthology.org/2020.emnlp-main.213/) we developed a Translation Ranking Model based on daRR from previous WMT shared tasks. This model achieves **some of the highest Kendall tau-like correlations on the WMT19 daRR benchmark** but does not perform as well on later WMT benchmarks, especially those using MQM annotations. 12 | 13 | 14 | # WMT21 COMET Metrics: 15 | 16 | In our participation in the WMT21 shared task we steered COMET towards higher correlations with MQM. We did so by first pre-training on _Direct Assessments_ and then fine-tuning on z-normalized MQM scores. 17 | - `wmt21-comet-mqm`: This model was pre-trained on _Direct Assessments_ from WMT15 to WMT20 and then fine-tuned on MQM scores from [Freitag et al., 2021](https://arxiv.org/pdf/2104.14478.pdf). 18 | - `wmt21-comet-qe-mqm`: Reference-free version of `wmt21-comet-mqm`. 19 | 20 | Additionally, we present COMETinho (`wmt21-cometinho-da`), a lightweight COMET model that is 19x faster on CPU than the original model. 21 | 22 | **NOTE:** The WMT21 models produce scores with lower variance than previous models, which makes their predictions look very similar to each other, even though the overall correlations with human judgments improve and the system rankings are correct.
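For reference, the model names above are the identifiers accepted by `comet.download_model`. A minimal scoring sketch, mirroring the Python usage shown in the README below; the sentence pair is made up purely for illustration, and `gpus=0` runs on CPU:

```python
from comet import download_model, load_from_checkpoint

# Download one of the metrics listed above and load its checkpoint.
model_path = download_model("wmt20-comet-da")
model = load_from_checkpoint(model_path)

# Each sample carries a source ("src"), a translation ("mt") and a reference ("ref").
data = [{"src": "Hallo Welt!", "mt": "Hello world!", "ref": "Hello world!"}]
seg_scores, sys_score = model.predict(data, batch_size=8, gpus=0)
```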
23 | -------------------------------------------------------------------------------- /COMET/README.md: -------------------------------------------------------------------------------- 1 | # Doc-COMET(-QE) 2 | 3 | This README describes how to use **Doc-COMET**, an extension of the original COMET metric that supports document-level evaluation. The same extension can be applied to the referenceless version of COMET, i.e. COMET-QE (QE-as-a-metric), resulting in the corresponding **Doc-COMET-QE** metric. 4 | 5 | ## Installation 6 | 7 | This codebase is built upon the original [COMET code](https://github.com/Unbabel/COMET). For detailed documentation of the COMET metric, including usage examples and instructions, see the [Full Documentation](https://unbabel.github.io/COMET/html/index.html). 8 | 9 | To run Doc-COMET you will need a local development install: 10 | ```bash 11 | git clone https://github.com/amazon-science/doc-mt-metrics.git 12 | cd doc-mt-metrics/COMET 13 | conda create -n doc-metrics-env python=3.9 14 | conda activate doc-metrics-env 15 | pip install --upgrade pip 16 | pip install -r requirements.txt 17 | pip install -e . 18 | ``` 19 | 20 | ### Get some files to score 21 | ```bash 22 | sacrebleu -t wmt21 -l en-de --echo src | head -n 20 > src.en 23 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > ref.de 24 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > hyp.de # put your system output here 25 | ``` 26 | 27 | To evaluate at the document level we need to know where the document boundaries are in the test set, so that we only use valid context. These boundaries are passed in as a file where each line contains a document ID. 28 | 29 | For WMT test sets this file can be obtained via [sacreBLEU](https://github.com/mjpost/sacrebleu): 30 | ```bash 31 | sacrebleu -t wmt21 -l en-de --echo docid | head -n 20 > docids.ende 32 | ``` 33 | 34 | ### Command Line usage 35 | 36 | COMET and COMET-QE are run just as before, except we add the `--doc` flag to the `comet-score` command: 37 | ```bash 38 | comet-score -s src.en -t hyp.de -r ref.de --doc docids.ende --model wmt21-comet-mqm 39 | comet-score -s src.en -t hyp.de --doc docids.ende --model wmt21-comet-qe-mqm 40 | ``` 41 | > Note: you can set `--gpus 0` to run on CPU. 42 | 43 | In the paper we use the `wmt21-comet-mqm` and `wmt21-comet-qe-mqm` models. To select a different model from the [available COMET models/metrics](https://unbabel.github.io/COMET/html/models.html), set the `--model` flag accordingly. 44 | 45 | ### Python usage: 46 | 47 | To use Doc-COMET(-QE) with Python, simply add `model.set_document_level()` after loading the model.
48 | 49 | ```python 50 | from comet import download_model, load_from_checkpoint 51 | from add_context import add_context 52 | 53 | # load data files 54 | doc_ids = [x.strip() for x in open('docids.ende', 'rt').readlines()] 55 | src = [x.strip() for x in open('src.en', 'rt').readlines()] 56 | hyp = [x.strip() for x in open('hyp.de', 'rt').readlines()] 57 | ref = [x.strip() for x in open('ref.de', 'rt').readlines()] 58 | 59 | # load comet model 60 | model_path = download_model("wmt21-comet-mqm") 61 | model = load_from_checkpoint(model_path) 62 | 63 | # enable document-level evaluation 64 | model.set_document_level() 65 | 66 | # add contexts to reference, source and hypothesis texts 67 | src = add_context(orig_txt=src, context=src, doc_ids=doc_ids, sep_token=model.encoder.tokenizer.sep_token) 68 | hyp = add_context(orig_txt=hyp, context=ref, doc_ids=doc_ids, sep_token=model.encoder.tokenizer.sep_token) 69 | ref = add_context(orig_txt=ref, context=ref, doc_ids=doc_ids, sep_token=model.encoder.tokenizer.sep_token) 70 | 71 | data = [{"src": x, "mt": y, "ref": z} for x, y, z in zip(src, hyp, ref)] 72 | 73 | seg_scores, sys_score = model.predict(data, batch_size=8, gpus=1) 74 | ``` 75 | 76 | ## Reproduce 77 | To reproduce the Doc-COMET results from the paper, run the [score_doc-metrics.py](/score_doc-metrics.py) script with the flags `--model comet` and `--doc`. For the Doc-COMET-QE results, also add the `--qe` flag. 78 | 79 | 80 | ## Paper 81 | 82 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf): 83 | 84 | ``` 85 | @inproceedings{easy_doc_mt, 86 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric}, 87 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello}, 88 | booktitle = "Proceedings of the Seventh Conference on Machine Translation", 89 | month = dec, 90 | year = "2022", 91 | address = "Abu Dhabi, United Arab Emirates", 92 | publisher = "Association for Computational Linguistics", 93 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf", 94 | } 95 | ``` 96 | -------------------------------------------------------------------------------- /COMET/add_context.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | 4 | 5 | def add_context(orig_txt: List[str], context: List[str], doc_ids: List[str], sep_token: str = "", 6 | ws: int = 2) -> List[str]: 7 | """Function that adds the previous sentences as context to the current sentence, respecting document boundaries. 8 | :param orig_txt: the original text 9 | :param context: the text from which the context will be taken (same as orig_txt for source/reference) 10 | :param doc_ids: the document ID each segment belongs to 11 | :param sep_token: the separator token of the tokenizer for the specific model 12 | :param ws: the window size, i.e. the maximum number of previous sentences to be considered as context 13 | :return: the original text augmented with context 14 | """ 15 | if not (len(orig_txt) == len(context) == len(doc_ids)): 16 | raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(context)={len(context)}, len(doc_ids)={len(doc_ids)}') 17 | i, k = 0, 0 18 | augm_txt = [] 19 | doc_id = doc_ids[0] 20 | while i < len(orig_txt): 21 | if doc_ids[i] == doc_id: 22 | context_window = context[i - 
min(k, ws):i] 23 | augm_txt.append(" {} ".format(sep_token).join(context_window + [orig_txt[i]])) 24 | i += 1 25 | else: 26 | doc_id = doc_ids[i] 27 | k = -1 28 | k += 1 29 | return augm_txt 30 | 31 | 32 | -------------------------------------------------------------------------------- /COMET/comet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2020 Unbabel 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import logging 18 | 19 | from .download_utils import download_model 20 | from .models import load_from_checkpoint 21 | 22 | logging.basicConfig(level=logging.INFO, format="%(message)s") 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | __version__ = "1.1.2" 27 | __copyright__ = "2020 Unbabel. All rights reserved." 28 | -------------------------------------------------------------------------------- /COMET/comet/cli/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (C) 2020 Unbabel 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """ 17 | 18 | Command for training new Metrics. 
19 | ================================= 20 | 21 | e.g: 22 | ``` 23 | comet-train --cfg configs/models/regression_metric.yaml 24 | ``` 25 | 26 | For more details run the following command: 27 | ``` 28 | comet-train --help 29 | ``` 30 | """ 31 | import json 32 | import warnings 33 | 34 | 35 | from comet.models import ( 36 | CometModel, 37 | RankingMetric, 38 | ReferencelessRegression, 39 | RegressionMetric, 40 | ) 41 | from jsonargparse import ActionConfigFile, ArgumentParser, namespace_to_dict 42 | from pytorch_lightning import seed_everything 43 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint 44 | from pytorch_lightning.trainer.trainer import Trainer 45 | 46 | 47 | def train_command() -> None: 48 | parser = ArgumentParser(description="Command for training COMET models.") 49 | parser.add_argument( 50 | "--seed_everything", 51 | type=int, 52 | default=12, 53 | help="Training Seed.", 54 | ) 55 | parser.add_argument("--cfg", action=ActionConfigFile) 56 | parser.add_class_arguments(CometModel, "model") 57 | parser.add_subclass_arguments(RegressionMetric, "regression_metric") 58 | parser.add_subclass_arguments( 59 | ReferencelessRegression, "referenceless_regression_metric" 60 | ) 61 | parser.add_subclass_arguments(RankingMetric, "ranking_metric") 62 | parser.add_subclass_arguments(EarlyStopping, "early_stopping") 63 | parser.add_subclass_arguments(ModelCheckpoint, "model_checkpoint") 64 | parser.add_subclass_arguments(Trainer, "trainer") 65 | cfg = parser.parse_args() 66 | seed_everything(cfg.seed_everything) 67 | 68 | checkpoint_callback = ModelCheckpoint( 69 | **namespace_to_dict(cfg.model_checkpoint.init_args) 70 | ) 71 | early_stop_callback = EarlyStopping( 72 | **namespace_to_dict(cfg.early_stopping.init_args) 73 | ) 74 | trainer_args = namespace_to_dict(cfg.trainer.init_args) 75 | trainer_args["callbacks"] = [early_stop_callback, checkpoint_callback] 76 | print("TRAINER ARGUMENTS: ") 77 | print(json.dumps(trainer_args, indent=4, default=lambda x: x.__dict__)) 78 | trainer = Trainer(**trainer_args) 79 | 80 | print("MODEL ARGUMENTS: ") 81 | if cfg.regression_metric is not None: 82 | print( 83 | json.dumps( 84 | cfg.regression_metric.init_args, indent=4, default=lambda x: x.__dict__ 85 | ) 86 | ) 87 | model = RegressionMetric(**namespace_to_dict(cfg.regression_metric.init_args)) 88 | elif cfg.referenceless_regression_metric is not None: 89 | print( 90 | json.dumps( 91 | cfg.referenceless_regression_metric.init_args, 92 | indent=4, 93 | default=lambda x: x.__dict__, 94 | ) 95 | ) 96 | model = ReferencelessRegression( 97 | **namespace_to_dict(cfg.referenceless_regression_metric.init_args) 98 | ) 99 | elif cfg.ranking_metric is not None: 100 | print( 101 | json.dumps( 102 | cfg.ranking_metric.init_args, indent=4, default=lambda x: x.__dict__ 103 | ) 104 | ) 105 | model = RankingMetric(**namespace_to_dict(cfg.ranking_metric.init_args)) 106 | else: 107 | raise Exception("Model configurations missing!") 108 | # Related to train/val_dataloaders: 109 | 110 | # 2 workers per gpu is enough! If set to the number of cpus on this machine 111 | # it throws another exception saying its too many workers. 
112 | warnings.filterwarnings( 113 | "ignore", 114 | category=UserWarning, 115 | message=".*Consider increasing the value of the `num_workers` argument` .*", 116 | ) 117 | trainer.fit(model) 118 | 119 | 120 | if __name__ == "__main__": 121 | train_command() 122 | -------------------------------------------------------------------------------- /COMET/comet/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | import transformers 15 | from packaging import version 16 | 17 | from .bert import BERTEncoder 18 | from .minilm import MiniLMEncoder 19 | from .xlmr import XLMREncoder 20 | 21 | str2encoder = {"BERT": BERTEncoder, "XLM-RoBERTa": XLMREncoder, "MiniLM": MiniLMEncoder} 22 | 23 | if version.parse(transformers.__version__) >= version.parse("4.17.0"): 24 | from .xlmr_xl import XLMRXLEncoder 25 | 26 | str2encoder["XLM-RoBERTa-XL"] = XLMRXLEncoder 27 | -------------------------------------------------------------------------------- /COMET/comet/encoders/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | r""" 15 | Encoder Model base 16 | ==================== 17 | Module defining the common interface between all pretrained encoder models. 18 | """ 19 | import abc 20 | from typing import Dict, List 21 | 22 | import torch 23 | import torch.nn as nn 24 | 25 | 26 | class Encoder(nn.Module, metaclass=abc.ABCMeta): 27 | """Base class for an encoder model.""" 28 | 29 | @property 30 | @abc.abstractmethod 31 | def output_units(self): 32 | """Max number of tokens the encoder handles.""" 33 | pass 34 | 35 | @property 36 | @abc.abstractmethod 37 | def max_positions(self): 38 | """Max number of tokens the encoder handles.""" 39 | pass 40 | 41 | @property 42 | @abc.abstractmethod 43 | def num_layers(self): 44 | """Number of model layers available.""" 45 | pass 46 | 47 | @classmethod 48 | @abc.abstractmethod 49 | def from_pretrained(cls, pretrained_model): 50 | """Function that loads a pretrained encoder and the respective tokenizer. 51 | 52 | :return: Encoder model 53 | """ 54 | raise NotImplementedError 55 | 56 | def prepare_sample(self, sample: List[str]) -> Dict[str, torch.Tensor]: 57 | """Receives a list of strings and applies tokenization and vectorization. 
58 | 59 | :param sample: List with text segments to be tokenized and padded. 60 | 61 | :return: Dictionary with HF model inputs. 62 | """ 63 | tokenizer_output = self.tokenizer( 64 | sample, 65 | return_tensors="pt", 66 | padding=True, 67 | truncation=True, 68 | max_length=self.max_positions - 2, 69 | ) 70 | return tokenizer_output 71 | 72 | def freeze(self) -> None: 73 | """Freezes the entire encoder.""" 74 | for param in self.parameters(): 75 | param.requires_grad = False 76 | 77 | def unfreeze(self) -> None: 78 | """Unfreezes the entire encoder.""" 79 | for param in self.parameters(): 80 | param.requires_grad = True 81 | 82 | @abc.abstractmethod 83 | def freeze_embeddings(self) -> None: 84 | """Freezes the embedding layer.""" 85 | pass 86 | 87 | @abc.abstractmethod 88 | def layerwise_lr(self, lr: float, decay: float): 89 | """ 90 | :param lr: Learning rate for the highest encoder layer. 91 | :param decay: decay percentage for the lower layers. 92 | 93 | :return: List of model parameters with layer-wise decay learning rate 94 | """ 95 | pass 96 | 97 | @abc.abstractmethod 98 | def forward( 99 | self, tokens: torch.Tensor, lengths: torch.Tensor 100 | ) -> Dict[str, torch.Tensor]: 101 | pass 102 | -------------------------------------------------------------------------------- /COMET/comet/encoders/bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | BERT Encoder 17 | ============== 18 | Pretrained BERT encoder from Hugging Face. 19 | """ 20 | from typing import Dict 21 | 22 | import torch 23 | from comet.encoders.base import Encoder 24 | from transformers import AutoModel, AutoTokenizer 25 | 26 | 27 | class BERTEncoder(Encoder): 28 | """BERT encoder. 29 | 30 | :param pretrained_model: Pretrained model from Hugging Face. 31 | """ 32 | 33 | def __init__(self, pretrained_model: str) -> None: 34 | super().__init__() 35 | self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True) 36 | self.model = AutoModel.from_pretrained(pretrained_model) 37 | self.model.encoder.output_hidden_states = True 38 | 39 | @property 40 | def output_units(self): 41 | """Hidden size of the encoder (number of output features).""" 42 | return self.model.config.hidden_size 43 | 44 | @property 45 | def max_positions(self): 46 | """Max number of tokens the encoder handles.""" 47 | return self.model.config.max_position_embeddings 48 | 49 | @property 50 | def num_layers(self): 51 | """Number of model layers available.""" 52 | return self.model.config.num_hidden_layers + 1 53 | 54 | @classmethod 55 | def from_pretrained(cls, pretrained_model: str) -> Encoder: 56 | """Function that loads a pretrained encoder from Hugging Face. 57 | :param pretrained_model: Name of the pretrained model to be loaded.
58 | 59 | :return: Encoder model 60 | """ 61 | return BERTEncoder(pretrained_model) 62 | 63 | def freeze_embeddings(self) -> None: 64 | """Freezes the embedding layer.""" 65 | for param in self.model.embeddings.parameters(): 66 | param.requires_grad = False 67 | 68 | def layerwise_lr(self, lr: float, decay: float): 69 | """ 70 | :param lr: Learning rate for the highest encoder layer. 71 | :param decay: decay percentage for the lower layers. 72 | 73 | :return: List of model parameters with layer-wise decay learning rate 74 | """ 75 | # Embedding Layer 76 | opt_parameters = [ 77 | { 78 | "params": self.model.embeddings.parameters(), 79 | "lr": lr * decay ** (self.num_layers), 80 | } 81 | ] 82 | # All layers 83 | opt_parameters += [ 84 | { 85 | "params": self.model.encoder.layer[i].parameters(), 86 | "lr": lr * decay**i, 87 | } 88 | for i in range(self.num_layers - 2, 0, -1) 89 | ] 90 | return opt_parameters 91 | 92 | def forward( 93 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs 94 | ) -> Dict[str, torch.Tensor]: 95 | last_hidden_states, pooler_output, all_layers = self.model( 96 | input_ids=input_ids, 97 | attention_mask=attention_mask, 98 | output_hidden_states=True, 99 | return_dict=False, 100 | ) 101 | return { 102 | "sentemb": pooler_output, 103 | "wordemb": last_hidden_states, 104 | "all_layers": all_layers, 105 | "attention_mask": attention_mask, 106 | } 107 | -------------------------------------------------------------------------------- /COMET/comet/encoders/minilm.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | MiniLM Encoder 17 | ============== 18 | Pretrained MiniLM encoder from Microsoft. This encoder uses a BERT 19 | architecture with an XLMR tokenizer. 20 | """ 21 | from comet.encoders.bert import BERTEncoder 22 | from transformers import BertModel, XLMRobertaTokenizer 23 | 24 | 25 | class MiniLMEncoder(BERTEncoder): 26 | """MiniLM encoder. 27 | 28 | :param pretrained_model: Pretrained model from Hugging Face. 29 | """ 30 | 31 | def __init__(self, pretrained_model: str) -> None: 32 | super(BERTEncoder, self).__init__()  # skip BERTEncoder.__init__ (which loads its own model) and initialize nn.Module directly, mirroring xlmr.py 33 | self.tokenizer = XLMRobertaTokenizer.from_pretrained( 34 | pretrained_model, use_fast=True 35 | ) 36 | self.model = BertModel.from_pretrained(pretrained_model) 37 | self.model.encoder.output_hidden_states = True 38 | -------------------------------------------------------------------------------- /COMET/comet/encoders/xlmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | XLM-RoBERTa Encoder 17 | =================== 18 | Pretrained XLM-RoBERTa encoder from Hugging Face. 19 | """ 20 | from typing import Dict 21 | 22 | import torch 23 | from comet.encoders.base import Encoder 24 | from comet.encoders.bert import BERTEncoder 25 | from transformers import XLMRobertaModel, XLMRobertaTokenizer 26 | 27 | 28 | class XLMREncoder(BERTEncoder): 29 | """XLM-RoBERTa encoder. 30 | 31 | :param pretrained_model: Pretrained model from Hugging Face. 32 | """ 33 | 34 | def __init__(self, pretrained_model: str) -> None: 35 | super(Encoder, self).__init__() 36 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_model) 37 | self.model = XLMRobertaModel.from_pretrained( 38 | pretrained_model, add_pooling_layer=False 39 | ) 40 | self.model.encoder.output_hidden_states = True 41 | 42 | @classmethod 43 | def from_pretrained(cls, pretrained_model: str) -> Encoder: 44 | """Function that loads a pretrained encoder from Hugging Face. 45 | :param pretrained_model: Name of the pretrained model to be loaded. 46 | 47 | :return: Encoder model 48 | """ 49 | return XLMREncoder(pretrained_model) 50 | 51 | def forward( 52 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs 53 | ) -> Dict[str, torch.Tensor]: 54 | last_hidden_states, _, all_layers = self.model( 55 | input_ids=input_ids, 56 | attention_mask=attention_mask, 57 | output_hidden_states=True, 58 | return_dict=False, 59 | ) 60 | return { 61 | "sentemb": last_hidden_states[:, 0, :], 62 | "wordemb": last_hidden_states, 63 | "all_layers": all_layers, 64 | "attention_mask": attention_mask, 65 | } 66 | -------------------------------------------------------------------------------- /COMET/comet/encoders/xlmr_xl.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | XLM-RoBERTa-XL Encoder 17 | ====================== 18 | Pretrained XLM-RoBERTa-XL encoder from Hugging Face. 19 | """ 20 | from comet.encoders.base import Encoder 21 | from comet.encoders.xlmr import XLMREncoder 22 | from transformers import XLMRobertaTokenizer, XLMRobertaXLModel 23 | 24 | 25 | class XLMRXLEncoder(XLMREncoder): 26 | """XLM-RoBERTa-XL encoder. 27 | 28 | :param pretrained_model: Pretrained model from Hugging Face.
29 | """ 30 | 31 | def __init__(self, pretrained_model: str) -> None: 32 | super(Encoder, self).__init__() 33 | self.tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large") 34 | self.model = XLMRobertaXLModel.from_pretrained( 35 | pretrained_model, add_pooling_layer=False 36 | ) 37 | self.model.encoder.output_hidden_states = True 38 | 39 | @classmethod 40 | def from_pretrained(cls, pretrained_model: str) -> Encoder: 41 | """Function that loads a pretrained encoder from Hugging Face. 42 | :param pretrained_model: Name of the pretrain model to be loaded. 43 | 44 | :return: Encoder model 45 | """ 46 | return XLMRXLEncoder(pretrained_model) 47 | -------------------------------------------------------------------------------- /COMET/comet/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2020 Unbabel 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from .regression.regression_metric import RegressionMetric 17 | from .ranking.ranking_metric import RankingMetric 18 | from .regression.referenceless import ReferencelessRegression 19 | from .base import CometModel 20 | 21 | import os 22 | import yaml 23 | 24 | str2model = { 25 | "referenceless_regression_metric": ReferencelessRegression, 26 | "regression_metric": RegressionMetric, 27 | "ranking_metric": RankingMetric, 28 | } 29 | 30 | available_metrics = { 31 | # WMT20 Models 32 | "emnlp20-comet-rank": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/emnlp20-comet-rank.tar.gz", 33 | "wmt20-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-da.tar.gz", 34 | "wmt20-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-qe-da.tar.gz", 35 | "wmt20-comet-qe-da-v2": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-qe-da-v2.tar.gz", 36 | 37 | # WMT21 Models 38 | "wmt21-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-da.tar.gz", 39 | "wmt21-comet-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-mqm.tar.gz", 40 | "wmt21-cometinho-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-mqm.tar.gz", 41 | "wmt21-cometinho-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-da.tar.gz", 42 | "wmt21-comet-qe-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-qe-mqm.tar.gz", 43 | "wmt21-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-qe-da.tar.gz", 44 | 45 | #EAMT22 Models 46 | "eamt22-cometinho-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/eamt22/eamt22-cometinho-da.tar.gz", 47 | "eamt22-prune-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/eamt22/eamt22-prune-comet-da.tar.gz", 48 | } 49 | 50 | 51 | def 
load_from_checkpoint(checkpoint_path: str) -> CometModel: 52 | """Loads models from a checkpoint path. 53 | :param checkpoint_path: Path to a model checkpoint. 54 | 55 | :return: Returns a COMET model. 56 | """ 57 | if not os.path.exists(checkpoint_path): 58 | raise Exception(f"Invalid checkpoint path: {checkpoint_path}") 59 | 60 | hparams_file = "/".join(checkpoint_path.split("/")[:-2] + ["hparams.yaml"]) 61 | if os.path.exists(hparams_file): 62 | with open(hparams_file) as yaml_file: 63 | hparams = yaml.load(yaml_file.read(), Loader=yaml.FullLoader) 64 | model_class = str2model[hparams["class_identifier"]] 65 | model = model_class.load_from_checkpoint(checkpoint_path, **hparams) 66 | return model 67 | else: 68 | raise Exception("hparams.yaml file is missing!") 69 | -------------------------------------------------------------------------------- /COMET/comet/models/metrics.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | Metrics 18 | ======= 19 | Regression and Ranking metrics to be used during training to measure 20 | correlations with human judgements 21 | """ 22 | import torch 23 | from torch import Tensor 24 | 25 | from torchmetrics import Metric 26 | from typing import Any, Callable, List, Optional 27 | import scipy.stats as stats 28 | 29 | 30 | class RegressionMetrics(Metric): 31 | is_differentiable = False 32 | higher_is_better = True 33 | preds: List[Tensor] 34 | target: List[Tensor] 35 | 36 | def __init__( 37 | self, 38 | prefix: str = "", 39 | compute_on_step: bool = False, 40 | dist_sync_on_step: bool = False, 41 | process_group: Optional[Any] = None, 42 | dist_sync_fn: Optional[Callable] = None, 43 | ) -> None: 44 | super().__init__( 45 | compute_on_step=compute_on_step, 46 | dist_sync_on_step=dist_sync_on_step, 47 | process_group=process_group, 48 | dist_sync_fn=dist_sync_fn, 49 | ) 50 | self.add_state("preds", default=[], dist_reduce_fx="cat") 51 | self.add_state("target", default=[], dist_reduce_fx="cat") 52 | self.prefix = prefix 53 | 54 | 55 | def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore 56 | """Update state with predictions and targets. 57 | Args: 58 | preds: Predictions from model 59 | target: Ground truth values 60 | """ 61 | self.preds.append(preds) 62 | self.target.append(target) 63 | 64 | def compute(self) -> Tensor: 65 | """Computes Kendall, Spearman and Pearson correlation coefficients.
""" 66 | preds = torch.cat(self.preds, dim=0) 67 | target = torch.cat(self.target, dim=0) 68 | kendall, _ = stats.kendalltau(preds.tolist(), target.tolist()) 69 | spearman, _ = stats.spearmanr(preds.tolist(), target.tolist()) 70 | pearson, _ = stats.pearsonr(preds.tolist(), target.tolist()) 71 | return { 72 | self.prefix + "_kendall": kendall, 73 | self.prefix + "_spearman": spearman, 74 | self.prefix + "_pearson": pearson, 75 | } 76 | 77 | class WMTKendall(Metric): 78 | def __init__( 79 | self, 80 | prefix: str = "", 81 | compute_on_step: bool = False, 82 | dist_sync_on_step: bool = False, 83 | process_group: Optional[Any] = None, 84 | dist_sync_fn: Optional[Callable] = None, 85 | ) -> None: 86 | super().__init__( 87 | compute_on_step=compute_on_step, 88 | dist_sync_on_step=dist_sync_on_step, 89 | process_group=process_group, 90 | dist_sync_fn=dist_sync_fn, 91 | ) 92 | self.add_state("concordance", default=torch.tensor(0), dist_reduce_fx="sum") 93 | self.add_state("discordance", default=torch.tensor(0), dist_reduce_fx="sum") 94 | self.prefix = prefix 95 | 96 | def update(self, distance_pos: torch.Tensor, distance_neg: torch.Tensor): 97 | assert distance_pos.shape == distance_neg.shape 98 | self.concordance = torch.sum((distance_pos < distance_neg).float()) 99 | self.discordance = torch.sum((distance_pos >= distance_neg).float()) 100 | 101 | def compute(self): 102 | return { 103 | self.prefix 104 | + "_kendall": (self.concordance - self.discordance) 105 | / (self.concordance + self.discordance) 106 | } 107 | -------------------------------------------------------------------------------- /COMET/comet/models/predict_pbar.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | import pytorch_lightning as ptl 4 | from tqdm import tqdm 5 | 6 | 7 | class PredictProgressBar(ptl.callbacks.progress.tqdm_progress.TQDMProgressBar): 8 | """Default Lightning Progress bar writes to stdout, we replace stdout with stderr""" 9 | 10 | def init_predict_tqdm(self) -> tqdm: 11 | bar = tqdm( 12 | desc="Predicting", 13 | initial=self.train_batch_idx, 14 | position=(2 * self.process_position), 15 | disable=self.is_disabled, 16 | leave=True, 17 | dynamic_ncols=True, 18 | file=sys.stderr, 19 | smoothing=0, 20 | ) 21 | return bar 22 | -------------------------------------------------------------------------------- /COMET/comet/models/ranking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/comet/models/ranking/__init__.py -------------------------------------------------------------------------------- /COMET/comet/models/regression/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/comet/models/regression/__init__.py -------------------------------------------------------------------------------- /COMET/comet/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .feedforward import FeedForward 3 | from .layerwise_attention import LayerwiseAttention 4 | -------------------------------------------------------------------------------- /COMET/comet/modules/feedforward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 
Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | Feed Forward 17 | ============== 18 | Feed Forward Neural Network module that can be used for classification or regression 19 | """ 20 | from typing import List, Optional 21 | 22 | import torch 23 | from torch import nn 24 | 25 | 26 | class FeedForward(nn.Module): 27 | """ 28 | Feed Forward Neural Network. 29 | 30 | :param in_dim: Number input features. 31 | :param out_dim: Number of output features. Default is just a score. 32 | :param hidden_sizes: List with hidden layer sizes. 33 | :param activations: Name of the activation function to be used in the hidden layers. 34 | :param final_activation: Name of the final activation function if any. 35 | :param dropout: dropout to be used in the hidden layers. 36 | """ 37 | 38 | def __init__( 39 | self, 40 | in_dim: int, 41 | out_dim: int = 1, 42 | hidden_sizes: List[int] = [3072, 768], 43 | activations: str = "Sigmoid", 44 | final_activation: Optional[str] = None, 45 | dropout: float = 0.1, 46 | ) -> None: 47 | super().__init__() 48 | modules = [] 49 | modules.append(nn.Linear(in_dim, hidden_sizes[0])) 50 | modules.append(self.build_activation(activations)) 51 | modules.append(nn.Dropout(dropout)) 52 | 53 | for i in range(1, len(hidden_sizes)): 54 | modules.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i])) 55 | modules.append(self.build_activation(activations)) 56 | modules.append(nn.Dropout(dropout)) 57 | 58 | modules.append(nn.Linear(hidden_sizes[-1], int(out_dim))) 59 | if final_activation is not None: 60 | modules.append(self.build_activation(final_activation)) 61 | 62 | self.ff = nn.Sequential(*modules) 63 | 64 | def build_activation(self, activation: str) -> nn.Module: 65 | if hasattr(nn, activation.title()): 66 | return getattr(nn, activation.title())() 67 | else: 68 | raise Exception(f"{activation} is not a valid activation function!") 69 | 70 | def forward(self, in_features: torch.Tensor) -> torch.Tensor: 71 | return self.ff(in_features) 72 | -------------------------------------------------------------------------------- /COMET/configs/early_stopping.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping 2 | init_args: 3 | monitor: val_pearson 4 | min_delta: 0. 
5 | patience: 2 6 | verbose: False 7 | mode: max 8 | strict: True 9 | check_finite: True 10 | stopping_threshold: null 11 | divergence_threshold: null 12 | check_on_train_epoch_end: False 13 | -------------------------------------------------------------------------------- /COMET/configs/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 2 | init_args: 3 | dirpath: null 4 | filename: null 5 | monitor: val_pearson 6 | verbose: True 7 | save_last: False 8 | save_top_k: 3 9 | mode: max 10 | auto_insert_metric_name: True 11 | save_weights_only: True 12 | every_n_train_steps: null 13 | train_time_interval: null 14 | every_n_epochs: 1 15 | save_on_train_epoch_end: null 16 | -------------------------------------------------------------------------------- /COMET/configs/models/ranking_metric.yaml: -------------------------------------------------------------------------------- 1 | ranking_metric: 2 | class_path: comet.models.RankingMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | optimizer: AdamW 7 | encoder_learning_rate: 1.0e-05 8 | learning_rate: 3.0e-05 9 | layerwise_decay: 0.95 10 | encoder_model: XLM-RoBERTa 11 | pretrained_model: xlm-roberta-base 12 | pool: avg 13 | layer: mix 14 | dropout: 0.1 15 | batch_size: 4 16 | train_data: data/TRAIN.csv 17 | validation_data: data/DEV.csv 18 | 19 | trainer: ../trainer.yaml 20 | early_stopping: ../early_stopping.yaml 21 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /COMET/configs/models/referenceless_metric.yaml: -------------------------------------------------------------------------------- 1 | referenceless_regression_metric: 2 | class_path: comet.models.ReferencelessRegression 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | optimizer: AdamW 7 | encoder_learning_rate: 1.0e-05 8 | learning_rate: 3.1e-05 9 | layerwise_decay: 0.95 10 | encoder_model: XLM-RoBERTa 11 | pretrained_model: xlm-roberta-large 12 | pool: avg 13 | layer: mix 14 | dropout: 0.1 15 | batch_size: 4 16 | train_data: data/TRAIN.csv 17 | validation_data: data/DEV.csv 18 | hidden_sizes: 19 | - 2048 20 | - 1024 21 | 22 | trainer: ../trainer.yaml 23 | early_stopping: ../early_stopping.yaml 24 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /COMET/configs/models/regression_metric.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | optimizer: AdamW 7 | encoder_learning_rate: 1.0e-05 8 | learning_rate: 3.1e-05 9 | layerwise_decay: 0.95 10 | encoder_model: MiniLM 11 | pretrained_model: microsoft/Multilingual-MiniLM-L12-H384 12 | pool: avg 13 | layer: mix 14 | dropout: 0.15 15 | batch_size: 8 16 | train_data: data/TRAIN.csv 17 | validation_data: data/DEV.csv 18 | hidden_sizes: 19 | - 384 20 | 21 | trainer: ../trainer.yaml 22 | early_stopping: ../early_stopping.yaml 23 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /COMET/configs/trainer.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.trainer.trainer.Trainer 2 | 
init_args: 3 | accelerator: gpu 4 | devices: null 5 | accumulate_grad_batches: 4 6 | amp_backend: native #apex 7 | amp_level: null #'01' 8 | auto_lr_find: False 9 | auto_scale_batch_size: False 10 | auto_select_gpus: False 11 | benchmark: null 12 | check_val_every_n_epoch: 1 13 | default_root_dir: null 14 | deterministic: True 15 | fast_dev_run: False 16 | gradient_clip_val: 1.0 17 | gradient_clip_algorithm: norm 18 | limit_train_batches: 1.0 19 | limit_val_batches: 1.0 20 | limit_test_batches: 1.0 21 | limit_predict_batches: 1.0 22 | log_every_n_steps: 50 23 | profiler: null 24 | overfit_batches: 0 25 | plugins: null 26 | precision: 16 27 | max_epochs: 3 28 | min_epochs: 1 29 | max_steps: -1 30 | min_steps: null 31 | max_time: null 32 | num_nodes: 1 33 | num_processes: 1 34 | num_sanity_val_steps: 10 35 | reload_dataloaders_every_n_epochs: 0 36 | replace_sampler_ddp: True 37 | sync_batchnorm: False 38 | detect_anomaly: False 39 | tpu_cores: null 40 | track_grad_norm: -1 41 | val_check_interval: 1.0 42 | enable_model_summary: True 43 | move_metrics_to_cpu: True 44 | multiple_trainloader_mode: max_size_cycle 45 | -------------------------------------------------------------------------------- /COMET/docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /COMET/docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /COMET/docs/source/_static/css/comet.css: -------------------------------------------------------------------------------- 1 | .wy-side-nav-search { 2 | background-color: #3852de; 3 | } 4 | 5 | .wy-side-nav-search > div.version { 6 | color: white; 7 | } 8 | 9 | .wy-menu-vertical p.caption { 10 | color: #3852de; 11 | } 12 | 13 | .wy-side-nav-search input[type=text] { 14 | border-color: #1a1a1a; 15 | } 16 | 17 | a { 18 | color: #3852de; 19 | } 20 | 21 | a:hover { 22 | color: #1a1a1a; 23 | } -------------------------------------------------------------------------------- /COMET/docs/source/_static/img/COMET_lockup-dark.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/COMET_lockup-dark.png -------------------------------------------------------------------------------- /COMET/docs/source/_static/img/COMET_lockup-white.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/COMET_lockup-white.png -------------------------------------------------------------------------------- /COMET/docs/source/_static/img/estimator_model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/estimator_model.jpg -------------------------------------------------------------------------------- /COMET/docs/source/_static/img/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/logo.png -------------------------------------------------------------------------------- /COMET/docs/source/_static/img/models.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/models.png -------------------------------------------------------------------------------- /COMET/docs/source/_static/img/ranking_model.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/ranking_model.jpg -------------------------------------------------------------------------------- /COMET/docs/source/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. 
For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | # import os 14 | # import sys 15 | # sys.path.insert(0, os.path.abspath('.')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = "COMET" 21 | copyright = ( 22 | "2020, Unbabel. All rights reserved." 23 | "Source code available under Apache License 2.0" 24 | ) 25 | author = "Unbabel" 26 | 27 | # The full version, including alpha/beta/rc tags 28 | release = "0.0.3" 29 | 30 | 31 | # -- General configuration --------------------------------------------------- 32 | 33 | # Add any Sphinx extension module names here, as strings. They can be 34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 35 | # ones. 36 | extensions = [ 37 | "sphinx.ext.autodoc", 38 | # 'sphinx.ext.doctest', 39 | "sphinx.ext.intersphinx", 40 | "sphinx.ext.todo", 41 | "sphinx.ext.coverage", 42 | "sphinx.ext.mathjax", 43 | "sphinx.ext.viewcode", 44 | "sphinx.ext.githubpages", 45 | "sphinx.ext.napoleon", 46 | "recommonmark", 47 | #'sphinxarg.ext', 48 | #'m2r', 49 | # 'sphinx-issues', 50 | # 'pytest-sphinx', 51 | "sphinx_markdown_tables", 52 | "sphinx.ext.autosectionlabel", 53 | ] 54 | autosectionlabel_prefix_document = True 55 | 56 | # Add any paths that contain templates here, relative to this directory. 57 | templates_path = ["_templates"] 58 | 59 | source_suffix = { 60 | ".rst": "restructuredtext", 61 | ".txt": "markdown", 62 | ".md": "markdown", 63 | } 64 | 65 | # List of patterns, relative to source directory, that match files and 66 | # directories to ignore when looking for source files. 67 | # This pattern also affects html_static_path and html_extra_path. 68 | exclude_patterns = [] 69 | 70 | # The master toctree document. 71 | master_doc = "index" 72 | 73 | # List of patterns, relative to source directory, that match files and 74 | # directories to ignore when looking for source files. 75 | # This pattern also affects html_static_path and html_extra_path. 76 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] 77 | 78 | # The name of the Pygments (syntax highlighting) style to use. 79 | pygments_style = None 80 | 81 | # -- Options for HTML output ------------------------------------------------- 82 | 83 | # html_logo = '_static/img/LOGO.png' 84 | 85 | # The theme to use for HTML and HTML Help pages. See the documentation for 86 | # a list of builtin themes. 87 | # 88 | html_theme = "sphinx_rtd_theme" 89 | 90 | 91 | # Add any paths that contain custom static files (such as style sheets) here, 92 | # relative to this directory. They are copied after the builtin static files, 93 | # so a file named "default.css" will overwrite the builtin "default.css". 94 | html_static_path = ["_static"] 95 | 96 | 97 | def setup(app): 98 | app.add_css_file("css/comet.css") 99 | -------------------------------------------------------------------------------- /COMET/docs/source/index.rst: -------------------------------------------------------------------------------- 1 | COMET: High-quality Machine Translation Evaluation 2 | =================================================== 3 | 4 | .. 
image:: _static/img/COMET_lockup-dark.png 5 | :width: 800 6 | :alt: COMET by Unbabel 7 | 8 | What is COMET 9 | ============== 10 | 11 | COMET is an open-source framework for MT evaluation that can be used for two purposes: 12 | 13 | * To evaluate MT systems with our currently available high-performing metrics (check: :ref:`models:COMET Metrics`). 14 | * To train and develop new metrics. 15 | 16 | 17 | 18 | 19 | Contents: 20 | ========= 21 | 22 | .. toctree:: 23 | :maxdepth: 2 24 | 25 | installation 26 | running 27 | models 28 | training 29 | 30 | 31 | License 32 | ============== 33 | 34 | Free software: Apache License 2.0 35 | 36 | Citation 37 | ========= 38 | 39 | If you use COMET to evaluate your MT system or to develop new metrics, please cite the following paper: 40 | `COMET: A Neural Framework for MT Evaluation `_ 41 | 42 | Library Reference 43 | ================== 44 | 45 | .. toctree:: 46 | :maxdepth: 2 47 | 48 | library 49 | -------------------------------------------------------------------------------- /COMET/docs/source/installation.rst: -------------------------------------------------------------------------------- 1 | .. _installation: 2 | Installation 3 | ============ 4 | 5 | Please note that Python>=3.5 is required, so all the commands below, especially `pip`, 6 | also have to be the Python 3 versions. This might require that you run `pip3` instead. 7 | 8 | 9 | Install COMET as a package with pip:: 10 | 11 | pip install unbabel-comet 12 | 13 | Inside your project you can now:: 14 | 15 | import comet 16 | 17 | or run it with our command line interface:: 18 | 19 | comet --help 20 | 21 | -------------------------------------------------------------------------------- /COMET/docs/source/library.rst: -------------------------------------------------------------------------------- 1 | Library Reference 2 | =================================================== 3 | 4 | ****************************** 5 | Multilingual Encoders 6 | ****************************** 7 | 8 | .. automodule:: comet.models.encoders.encoder_base 9 | :members: 10 | 11 | .. automodule:: comet.models.encoders.laser 12 | :members: 13 | 14 | .. automodule:: comet.models.encoders.bert 15 | :members: 16 | 17 | .. automodule:: comet.models.encoders.xlmr 18 | :members: 19 | 20 | ****************** 21 | Model Base 22 | ****************** 23 | 24 | .. automodule:: comet.models.model_base 25 | :members: 26 | 27 | ********** 28 | Estimators 29 | ********** 30 | 31 | .. automodule:: comet.models.estimators.estimator_base 32 | :members: 33 | 34 | .. automodule:: comet.models.estimators.comet_estimator 35 | :members: 36 | 37 | .. automodule:: comet.models.estimators.quality_estimator 38 | :members: 39 | 40 | ************************** 41 | Translation Ranking Model 42 | ************************** 43 | 44 | .. automodule:: comet.models.ranking.ranking_base 45 | :members: 46 | 47 | .. automodule:: comet.models.ranking.comet_ranker 48 | :members: 49 | 50 | ***************** 51 | Auxiliary Modules 52 | ***************** 53 | 54 | .. automodule:: comet.modules.feedforward 55 | :members: 56 | 57 | .. automodule:: comet.modules.scalar_mix 58 | :members: -------------------------------------------------------------------------------- /COMET/docs/source/models.md: -------------------------------------------------------------------------------- 1 | ## COMET Metrics 2 | 3 | COMET models can be optimized towards different kinds of human judgements (for example HTER or DAs). 
Because of this, we provide a list of different metrics that you can use to test your systems: 4 | 5 | | Model | Description | 6 | | :--------------------- | :------------------------------------------------ | 7 | | ↑`wmt-large-da-estimator-1719` | **RECOMMENDED:** Estimator model built on top of XLM-R (large) trained on DA from WMT17, WMT18 and WMT19 | 8 | | ↑`wmt-base-da-estimator-1719` | Estimator model built on top of XLM-R (base) trained on DA from WMT17, WMT18 and WMT19 | 9 | | ↓`wmt-large-hter-estimator` | Estimator model built on top of XLM-R (large) trained to regress on HTER. | 10 | | ↓`wmt-base-hter-estimator` | Estimator model built on top of XLM-R (base) trained to regress on HTER. | 11 | | ↑`emnlp-base-da-ranker` | Translation ranking model that uses XLM-R to encode sentences. This model was trained with WMT17 and WMT18 Direct Assessments Relative Ranks (DARR). | 12 | 13 | The first four models (`wmt-*`) were trained and tested for the WMT2020 shared task, and were first introduced in our submission to that shared task (paper still under review). 14 | 15 | **NOTE:** Even when regressing on the same type of human judgement, scores between metrics are not comparable (e.g. scores from a large and a base model are not comparable even when trained on the same type of judgements)! Please make sure you use the same metric when comparing two systems! 16 | 17 | Also, since HTER measures the amount of edits needed to correct an MT hypothesis, models regressing on HTER produce low scores for better systems. -------------------------------------------------------------------------------- /COMET/docs/source/running.rst: -------------------------------------------------------------------------------- 1 | .. _running: 2 | Running COMET 3 | ============== 4 | 5 | Command Line Interface 6 | ################################ 7 | 8 | After installing COMET you can score your MT outputs with the following command:: 9 | 10 | comet score -s sources.txt -h hypothesis.txt -r references.txt --model wmt-large-da-estimator-1719 11 | 12 | You can export your results to a JSON file using the `--to_json` flag:: 13 | 14 | comet score -s sources.txt -h hypothesis.txt -r references.txt --model wmt-large-da-estimator-1719 --to_json output.json 15 | 16 | 17 | Using Python 18 | ############# 19 | 20 | Instead of using the CLI you can score your models using Python with the `predict` function:: 21 | 22 | from comet.models import download_model 23 | model = download_model("wmt-large-da-estimator-1719", "path/where/to/save/models") 24 | data = [ 25 | { 26 | "src": "Hello world!", 27 | "mt": "Oi mundo!", 28 | "ref": "Olá mundo!" 29 | }, 30 | { 31 | "src": "This is a sample", 32 | "mt": "este é um exemplo", 33 | "ref": "isto é um exemplo!" 34 | } 35 | ] 36 | model.predict(data) 37 | 38 | Scoring MT outputs using lists:: 39 | 40 | source = ["Hello world!", "This is a sample"] 41 | hypothesis = ["Oi mundo!", "este é um exemplo"] 42 | reference = ["Olá mundo!", "isto é um exemplo!"] 43 | data = {"src": source, "mt": hypothesis, "ref": reference} 44 | data = [dict(zip(data, t)) for t in zip(*data.values())] 45 | model.predict(data) -------------------------------------------------------------------------------- /COMET/docs/source/training.md: -------------------------------------------------------------------------------- 1 | # Train your own Metric 2 | 3 | To train our models we rely on the [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/0.8.4/) library. 
This means that all our models are [Lightning Modules](https://pytorch-lightning.readthedocs.io/en/0.8.4/lightning-module.html). 4 | 5 | To train a new metric we just need to run one command: 6 | 7 | ```bash 8 | comet train -f {my_configs}.yaml 9 | ``` 10 | 11 | This will set up a [Lightning Trainer](https://pytorch-lightning.readthedocs.io/en/0.8.4/trainer.html) and fit your module accordingly. 12 | ## Data Format 13 | To train your metric we expect your data to be a csv with the following columns: 14 | - `src`: The source segment. 15 | - `mt`: The machine translation hypothesis. 16 | - `ref`: The reference segment. 17 | - `score`: The human judgment score. 18 | 19 | Example: 20 | 21 | | src | mt | ref | score | 22 | | :---------: | :------: | :------: | :------: | 23 | | Hello world! | Oi mundo. | Olá mundo! | 0.5 | 24 | | This is a sample | este é um exemplo | isto é um exemplo! | 0.8 | 25 | 26 | ## Training flags 27 | 28 | ### Lightning Trainer Configurations 29 | 30 | | Argument | Default | Description | 31 | | :--------- | :------ | :------ | 32 | | `seed` | 3 | Training seed. | 33 | | `deterministic` | True | If true enables cudnn.deterministic. Might make your system slower, but ensures reproducibility. | 34 | | `verbose` | False | Verbosity mode. | 35 | | `early_stopping` | True | Enables early stopping. | 36 | | `save_top_k` | 1 | Sets how many checkpoints we want to save (keeping only the best ones). | 37 | | `monitor` | Kendall | Metric to monitor during training. | 38 | | `metric_mode` | max | 'min' or 'max' depending if we wish to maximize or minimize the metric. | 39 | | `min_delta` | 0 | Sensitivity to the metric. | 40 | | `patience` | 1 | Number of epochs without improvement before stopping training. | 41 | | `accumulate_grad_batches` | 1 | Gradient accumulation steps. | 42 | | `lr_finder` | False | Enables the learning rate finder described in [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186) | 43 | 44 | 45 | ### Base Model Configurations 46 | 47 | | Argument | Default | Description | 48 | | :--------- | :------ | :------ | 49 | | `model` | `required` | Type of metric we want to train. Options: [`CometEstimator`, `CometRanker`, `QualityEstimator`] | 50 | | `batch_size` | 8 | Batch size used to train the model. | 51 | | `nr_frozen_epochs` | 0 | Number of epochs we keep the encoder frozen. | 52 | | `keep_embeddings_frozen` | False | If set to True, keeps the embedding layer frozen during training. Useful to save some GPU memory. | 53 | | `optimizer` | Adam | PyTorch Optimizer class name | 54 | | `learning_rate` | 1e-05 | Learning rate to be used during training. | 55 | | `scheduler` | constant | Learning Rate scheduler. Options: [`constant`, `linear_warmup`, `warmup_constant`] | 56 | | `warmup_steps` | None | Scheduler warmup steps. | 57 | | `encoder_model` | XLMR | Encoder Model to be used: Options: [`LASER`, `BERT`, `XLMR`]. | 58 | | `pretrained_model` | xlmr.base | Pretrained model to be used, e.g. xlmr.base vs xlmr.large (for LASER this is ignored) | 59 | | `pool` | avg | Pooling technique to create the sentence embeddings. Options: [`avg`, `avg+cls`, `max`, `cls`, `default`] | 60 | | `load_weights` | False | Loads compatible weights from another checkpoint file. | 61 | | `train_path` | `required` | Path to the training csv. | 62 | | `val_path` | `required` | Path to the validation csv. | 63 | | `test_path` | None | Path to the test csv. 
(Not used) | 64 | | `loader_workers` | False | Number of workers for loading and preparing the batches. | 65 | 66 | **Note:** The `Ranker` model requires no further configs. 67 | 68 | ### Estimator Specific Configurations 69 | 70 | | Argument | Default | Description | 71 | | :--------- | :------ | :------ | 72 | | `encoder_learning_rate` | `required` | Learning rate used to fine-tune the encoder. Note that this is different from `learning_rate` config that will be used only for the top layer. | 73 | | `layerwise_decay` | 1.0 | Decay for the layer wise learning rates. If 1.0 no decay is applied. | 74 | | `layer` | mix | Layer from the pretrained encoder that we wish to extract the word embeddings. If `mix` uses a layer-wise attention mechanism to combine different layers. | 75 | | `scalar_mix_dropout` | mix | Sets the layer-wise dropout. Ignored if `layer != mix`. | 76 | | `loss` | mse | `mse` for Mean Squared Error or `binary_xent`for Binary Cross Entropy. | 77 | | `hidden_sizes` | 1536,768 | Hidden sizes of the different Feed-Forward layers on top. | 78 | | `activations` | Tanh | Activation functions for the Feed-Forward on top. | 79 | | `dropout` | 0.1 | Dropout used in the Feed-Forward on top. | 80 | | `final_activation` | Sigmoid | Feed-Forward final activation function. If `False` the model outputs the logits | 81 | 82 | 83 | -------------------------------------------------------------------------------- /COMET/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "unbabel-comet" 3 | version = "1.1.2" 4 | description = "High-quality Machine Translation Evaluation" 5 | authors = ["Ricardo Rei, Craig Stewart, Catarina Farinha, Alon Lavie"] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | homepage = "https://github.com/Unbabel/COMET" 9 | repository = "https://github.com/Unbabel/COMET" 10 | documentation = "https://unbabel.github.io/COMET/html/index.html" 11 | keywords = [ 12 | "Machine Translation", 13 | "Evaluation", 14 | "Unbabel", 15 | "COMET" 16 | ] 17 | classifiers = [ 18 | 'Development Status :: 4 - Beta', 19 | 'Environment :: Console', 20 | 'Intended Audience :: Science/Research', 21 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 22 | ] 23 | packages = [ 24 | {include = "comet"}, 25 | ] 26 | include = [ 27 | "LICENSE", 28 | "pyproject.toml", 29 | "CONTRIBUTING.md" 30 | ] 31 | 32 | [tool.poetry.scripts] 33 | comet-train = 'comet.cli.train:train_command' 34 | comet-score = 'comet.cli.score:score_command' 35 | comet-compare = 'comet.cli.compare:compare_command' 36 | comet-mbr = 'comet.cli.mbr:mbr_command' 37 | 38 | [tool.poetry.dependencies] 39 | python = ">=3.7.0,<4.0.0" 40 | sentencepiece = "^0.1.96" 41 | pandas = "1.1.5" 42 | transformers = ">=4.8" 43 | pytorch-lightning = "1.6.4" 44 | jsonargparse = "3.13.1" 45 | torch = ">=1.6.0,<2" 46 | numpy = ">=1.20.0" 47 | torchmetrics = "0.8.2" 48 | sacrebleu = ">=2.0.0" 49 | scipy = ">=1.5.4" 50 | 51 | [tool.poetry.dev-dependencies] 52 | sphinx-markdown-tables = "0.0.15" 53 | coverage = "^5.5" 54 | scikit-learn = "0.24" 55 | 56 | [build-system] 57 | requires = ["poetry-core>=1.0.0"] 58 | build-backend = "poetry.core.masonry.api" 59 | -------------------------------------------------------------------------------- /COMET/requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece==0.1.96 2 | pandas==1.1.5 3 | transformers>=4.8 4 | pytorch-lightning==1.6.4 5 | jsonargparse==3.13.1 6 | 
torch>=1.6.0,<2 7 | numpy>=1.20.0 8 | torchmetrics==0.8.2 9 | sacrebleu >= 2.0.0 10 | scipy>=1.5.4 11 | -------------------------------------------------------------------------------- /COMET/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/__init__.py -------------------------------------------------------------------------------- /COMET/tests/integration/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/integration/__init__.py -------------------------------------------------------------------------------- /COMET/tests/integration/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/integration/models/__init__.py -------------------------------------------------------------------------------- /COMET/tests/integration/models/test_ranking_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import shutil 4 | import unittest 5 | import warnings 6 | 7 | import torch 8 | from comet.models import RankingMetric 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestRankingMetric(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 | 24 | def test_training(self): 25 | seed_everything(12) 26 | warnings.filterwarnings( 27 | "ignore", 28 | #category=PossibleUserWarning, 29 | message="GPU available but not used.*", 30 | ) 31 | trainer = Trainer( 32 | accelerator="cpu", 33 | max_epochs=4, 34 | deterministic=True, 35 | enable_checkpointing=True, 36 | default_root_dir=DATA_PATH, 37 | logger=False, 38 | enable_progress_bar=False, 39 | ) 40 | model = RankingMetric( 41 | encoder_model="BERT", 42 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 43 | train_data=os.path.join(DATA_PATH, "test_ranking_data.csv"), 44 | validation_data=os.path.join(DATA_PATH, "test_ranking_data.csv"), 45 | layerwise_decay=0.95, 46 | batch_size=32, 47 | learning_rate=1e-04, 48 | encoder_learning_rate=1e-04, 49 | ) 50 | warnings.filterwarnings( 51 | "ignore", 52 | category=UserWarning, 53 | message=".*Consider increasing the value of the `num_workers` argument` .*", 54 | ) 55 | trainer.fit(model) 56 | self.assertTrue( 57 | os.path.exists( 58 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=16.ckpt") 59 | ) 60 | ) 61 | saved_model = RankingMetric.load_from_checkpoint( 62 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=16.ckpt") 63 | ) 64 | dataset = saved_model.read_csv( 65 | os.path.join(DATA_PATH, "test_regression_data.csv"), regression=True 66 | ) 67 | y = [s["score"] for s in dataset] 68 | dataloader = DataLoader( 69 | dataset=dataset, 70 | batch_size=256, 71 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 72 | num_workers=2, 73 | ) 74 | y_hat = ( 75 | 
torch.cat( 76 | trainer.predict( 77 | ckpt_path="best", dataloaders=dataloader, return_predictions=True 78 | ), 79 | dim=0, 80 | ) 81 | .cpu() 82 | .tolist() 83 | ) 84 | # This shouldn't break! 85 | pearsonr(y_hat, y)[0] 86 | -------------------------------------------------------------------------------- /COMET/tests/integration/models/test_referenceless_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import shutil 4 | import unittest 5 | import warnings 6 | 7 | import torch 8 | from comet.models import ReferencelessRegression 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestReferencelessRegression(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 | 24 | def test_training(self): 25 | seed_everything(12) 26 | warnings.filterwarnings( 27 | "ignore", 28 | #category=PossibleUserWarning, 29 | message="GPU available but not used.*", 30 | ) 31 | trainer = Trainer( 32 | accelerator="cpu", 33 | max_epochs=10, 34 | deterministic=True, 35 | enable_checkpointing=True, 36 | default_root_dir=DATA_PATH, 37 | logger=False, 38 | enable_progress_bar=False, 39 | ) 40 | model = ReferencelessRegression( 41 | encoder_model="BERT", 42 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 43 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 44 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 45 | hidden_sizes=[256], 46 | layerwise_decay=0.95, 47 | batch_size=32, 48 | learning_rate=1e-04, 49 | encoder_learning_rate=1e-04, 50 | ) 51 | warnings.filterwarnings( 52 | "ignore", 53 | category=UserWarning, 54 | message=".*Consider increasing the value of the `num_workers` argument` .*", 55 | ) 56 | trainer.fit(model) 57 | self.assertTrue( 58 | os.path.exists( 59 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt") 60 | ) 61 | ) 62 | 63 | saved_model = ReferencelessRegression.load_from_checkpoint( 64 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt") 65 | ) 66 | dataset = saved_model.read_csv( 67 | os.path.join(DATA_PATH, "test_regression_data.csv") 68 | ) 69 | y = [s["score"] for s in dataset] 70 | dataloader = DataLoader( 71 | dataset=dataset, 72 | batch_size=256, 73 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 74 | num_workers=2, 75 | ) 76 | y_hat = ( 77 | torch.cat( 78 | trainer.predict( 79 | ckpt_path="best", dataloaders=dataloader, return_predictions=True 80 | ), 81 | dim=0, 82 | ) 83 | .cpu() 84 | .tolist() 85 | ) 86 | assert pearsonr(y_hat, y)[0] > 0.77 -------------------------------------------------------------------------------- /COMET/tests/integration/models/test_regression_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import os 3 | import shutil 4 | import unittest 5 | import warnings 6 | 7 | import torch 8 | from comet.models import RegressionMetric 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import 
DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestRegressionMetric(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 | 24 | def test_training(self): 25 | seed_everything(12) 26 | warnings.filterwarnings( 27 | "ignore", 28 | #category=PossibleUserWarning, 29 | message="GPU available but not used.*", 30 | ) 31 | trainer = Trainer( 32 | accelerator="cpu", 33 | max_epochs=10, 34 | deterministic=True, 35 | enable_checkpointing=True, 36 | default_root_dir=DATA_PATH, 37 | logger=False, 38 | enable_progress_bar=False, 39 | ) 40 | model = RegressionMetric( 41 | encoder_model="BERT", 42 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 43 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 44 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 45 | hidden_sizes=[384], 46 | layerwise_decay=0.95, 47 | batch_size=32, 48 | learning_rate=1e-04, 49 | encoder_learning_rate=1e-04, 50 | ) 51 | warnings.filterwarnings( 52 | "ignore", 53 | category=UserWarning, 54 | message=".*Consider increasing the value of the `num_workers` argument` .*", 55 | ) 56 | trainer.fit(model) 57 | self.assertTrue( 58 | os.path.exists( 59 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt") 60 | ) 61 | ) 62 | 63 | saved_model = RegressionMetric.load_from_checkpoint( 64 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt") 65 | ) 66 | dataset = saved_model.read_csv( 67 | os.path.join(DATA_PATH, "test_regression_data.csv") 68 | ) 69 | y = [s["score"] for s in dataset] 70 | dataloader = DataLoader( 71 | dataset=dataset, 72 | batch_size=256, 73 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 74 | num_workers=2, 75 | ) 76 | y_hat = ( 77 | torch.cat( 78 | trainer.predict( 79 | ckpt_path="best", dataloaders=dataloader, return_predictions=True 80 | ), 81 | dim=0, 82 | ) 83 | .cpu() 84 | .tolist() 85 | ) 86 | assert pearsonr(y_hat, y)[0] > 0.77 -------------------------------------------------------------------------------- /COMET/tests/integration/modules/test_feedforward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | import torch 5 | from sklearn.datasets import load_digits 6 | from sklearn.model_selection import train_test_split 7 | from torch import nn 8 | 9 | from comet.modules.feedforward import FeedForward 10 | from pytorch_lightning import seed_everything 11 | 12 | 13 | class TestFeedForward(unittest.TestCase): 14 | def test_MNIST(self): 15 | seed_everything(3) 16 | """ 17 | STEP 1: LOADING DATASET 18 | """ 19 | images, labels = load_digits(return_X_y=True) 20 | images = [torch.Tensor(images[i, :]) for i in range(images.shape[0])] 21 | labels = torch.tensor(labels, dtype=torch.long) 22 | 23 | train_images, test_images, train_labels, test_labels = train_test_split( 24 | images, labels, test_size=0.2, random_state=42 25 | ) 26 | 27 | train_dataset = list(zip(train_images, train_labels)) 28 | test_dataset = list(zip(test_images, test_labels)) 29 | 30 | """ 31 | STEP 2: MAKING DATASET ITERABLE 32 | """ 33 | batch_size = 256 34 | n_iters = 80 35 | num_epochs = n_iters / (len(train_dataset) / batch_size) 36 | num_epochs = int(num_epochs) 37 | 38 | train_loader = torch.utils.data.DataLoader( 39 | dataset=train_dataset, batch_size=batch_size, shuffle=True 40 | ) 41 | 42 | test_loader = 
torch.utils.data.DataLoader( 43 | dataset=test_dataset, batch_size=batch_size, shuffle=False 44 | ) 45 | 46 | """ 47 | STEP 3: INSTANTIATE MODEL CLASS 48 | """ 49 | model = FeedForward( 50 | in_dim=8 * 8, 51 | out_dim=10, 52 | hidden_sizes=[100], 53 | activations="Tanh", 54 | ) 55 | 56 | """ 57 | STEP 4: INSTANTIATE LOSS CLASS 58 | """ 59 | criterion = nn.CrossEntropyLoss() 60 | 61 | """ 62 | STEP 5: INSTANTIATE OPTIMIZER CLASS 63 | """ 64 | learning_rate = 0.1 65 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 66 | 67 | """ 68 | STEP 7: TRAIN THE MODEL 69 | """ 70 | iter = 0 71 | for epoch in range(num_epochs): 72 | for i, (images, labels) in enumerate(train_loader): 73 | # Load images with gradient accumulation capabilities 74 | images = images.view(-1, 8 * 8).requires_grad_() 75 | 76 | # Clear gradients w.r.t. parameters 77 | optimizer.zero_grad() 78 | 79 | # Forward pass to get output/logits 80 | outputs = model(images) 81 | 82 | # Calculate Loss: softmax --> cross entropy loss 83 | loss = criterion(outputs, labels) 84 | 85 | # Getting gradients w.r.t. parameters 86 | loss.backward() 87 | 88 | # Updating parameters 89 | optimizer.step() 90 | 91 | iter += 1 92 | 93 | if iter % 10 == 0: 94 | # Calculate Accuracy 95 | correct = 0 96 | total = 0 97 | # Iterate through test dataset 98 | for images, labels in test_loader: 99 | # Load images with gradient accumulation capabilities 100 | images = images.view(-1, 8 * 8).requires_grad_() 101 | 102 | # Forward pass only to get logits/output 103 | outputs = model(images) 104 | 105 | # Get predictions from the maximum value 106 | _, predicted = torch.max(outputs.data, 1) 107 | 108 | # Total number of labels 109 | total += labels.size(0) 110 | 111 | # Total correct predictions 112 | correct += (predicted == labels).sum() 113 | 114 | accuracy = 100 * correct // total 115 | self.assertGreaterEqual(accuracy, 95) 116 | -------------------------------------------------------------------------------- /COMET/tests/unit/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/unit/__init__.py -------------------------------------------------------------------------------- /COMET/tests/unit/encoders/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/unit/encoders/__init__.py -------------------------------------------------------------------------------- /COMET/tests/unit/encoders/test_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from comet.encoders.bert import BERTEncoder 5 | 6 | 7 | class TestBERTEncoder(unittest.TestCase): 8 | 9 | bert = BERTEncoder.from_pretrained("google/bert_uncased_L-2_H-128_A-2") 10 | 11 | def test_num_layers(self): 12 | self.assertEqual(self.bert.num_layers, 3) 13 | 14 | def test_output_units(self): 15 | self.assertEqual(self.bert.output_units, 128) 16 | 17 | def test_max_positions(self): 18 | self.assertEqual(self.bert.max_positions, 512) 19 | 20 | def test_prepare_sample(self): 21 | sample = ["hello world, welcome to COMET!", "This is a batch"] 22 | model_input = self.bert.prepare_sample(sample) 23 | self.assertIn("input_ids", model_input) 24 | self.assertIn("attention_mask", model_input) 25 | 26 | def 
test_forward(self): 27 | sample = ["hello world, welcome to COMET!", "This is a batch"] 28 | model_input = self.bert.prepare_sample(sample) 29 | model_output = self.bert(**model_input) 30 | self.assertIn("wordemb", model_output) 31 | self.assertIn("sentemb", model_output) 32 | self.assertIn("all_layers", model_output) 33 | self.assertIn("attention_mask", model_output) 34 | -------------------------------------------------------------------------------- /COMET/tests/unit/encoders/test_xlmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from comet.encoders.xlmr import XLMREncoder 5 | 6 | 7 | class TestXLMREncoder(unittest.TestCase): 8 | 9 | xlmr = XLMREncoder.from_pretrained("Unbabel/xlm-roberta-comet-small") 10 | 11 | def test_num_layers(self): 12 | self.assertEqual(self.xlmr.num_layers, 7) 13 | 14 | def test_output_units(self): 15 | self.assertEqual(self.xlmr.output_units, 384) 16 | 17 | def test_max_positions(self): 18 | self.assertEqual(self.xlmr.max_positions, 514) 19 | 20 | def test_prepare_sample(self): 21 | sample = ["hello world, welcome to COMET!", "This is a batch"] 22 | model_input = self.xlmr.prepare_sample(sample) 23 | self.assertIn("input_ids", model_input) 24 | self.assertIn("attention_mask", model_input) 25 | 26 | def test_forward(self): 27 | sample = ["hello world, welcome to COMET!", "This is a batch"] 28 | model_input = self.xlmr.prepare_sample(sample) 29 | model_output = self.xlmr(**model_input) 30 | self.assertIn("wordemb", model_output) 31 | self.assertIn("sentemb", model_output) 32 | self.assertIn("all_layers", model_output) 33 | self.assertIn("attention_mask", model_output) 34 | -------------------------------------------------------------------------------- /COMET/tests/unit/test_download_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import os 4 | import shutil 5 | from tests.data import DATA_PATH 6 | from comet.download_utils import download_model 7 | from comet.models import load_from_checkpoint 8 | 9 | 10 | class TestDownloadModel(unittest.TestCase): 11 | @classmethod 12 | def tearDownClass(cls): 13 | shutil.rmtree(os.path.join(DATA_PATH, "wmt21-cometinho-da")) 14 | 15 | def test_download_from_s3(self): 16 | data_path = download_model("wmt21-cometinho-da", saving_directory=DATA_PATH) 17 | self.assertTrue( 18 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/hparams.yaml")) 19 | ) 20 | self.assertTrue( 21 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/checkpoints/")) 22 | ) 23 | load_from_checkpoint(data_path) 24 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing Guidelines 2 | 3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional 4 | documentation, we greatly value feedback and contributions from our community. 5 | 6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary 7 | information to effectively respond to your bug report or contribution. 8 | 9 | 10 | ## Reporting Bugs/Feature Requests 11 | 12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features. 
13 | 14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already 15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful: 16 | 17 | * A reproducible test case or series of steps 18 | * The version of our code being used 19 | * Any modifications you've made relevant to the bug 20 | * Anything unusual about your environment or deployment 21 | 22 | 23 | ## Contributing via Pull Requests 24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that: 25 | 26 | 1. You are working against the latest source on the *main* branch. 27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already. 28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted. 29 | 30 | To send us a pull request, please: 31 | 32 | 1. Fork the repository. 33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change. 34 | 3. Ensure local tests pass. 35 | 4. Commit to your fork using clear commit messages. 36 | 5. Send us a pull request, answering any default questions in the pull request interface. 37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation. 38 | 39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and 40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/). 41 | 42 | 43 | ## Finding contributions to work on 44 | Looking at the existing issues is a great way to find something to work on. As our projects use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start. 45 | 46 | 47 | ## Code of Conduct 48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct). 49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact 50 | opensource-codeofconduct@amazon.com with any additional questions or comments. 51 | 52 | 53 | ## Security issue notifications 54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public GitHub issue. 55 | 56 | 57 | ## Licensing 58 | 59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution. 60 | -------------------------------------------------------------------------------- /Config: -------------------------------------------------------------------------------- 1 | package.Giorgos_internship_code = { 2 | interfaces = (1.0); 3 | 4 | # Use NoOpBuild. See https://w.amazon.com/index.php/BrazilBuildSystem/NoOpBuild 5 | build-system = no-op; 6 | build-tools = { 7 | 1.0 = { 8 | NoOpBuild = 1.0; 9 | }; 10 | }; 11 | 12 | # Use runtime-dependencies for when you want to bring in additional 13 | # packages when deploying. 14 | # Use dependencies instead if you intend for these dependencies to 15 | # be exported to other packages that build against you. 
16 | dependencies = { 17 | 1.0 = { 18 | }; 19 | }; 20 | 21 | runtime-dependencies = { 22 | 1.0 = { 23 | }; 24 | }; 25 | 26 | }; 27 | -------------------------------------------------------------------------------- /NOTICE: -------------------------------------------------------------------------------- 1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 2 | -------------------------------------------------------------------------------- /Prism/README.md: -------------------------------------------------------------------------------- 1 | # Doc-Prism (mBART-50) 2 | 3 | This README describes how to use **Doc-Prism**, an extension of the original Prism metric that can be used for document-level evaluation. 4 | 5 | In contrast to the original implementation, which used a multilingual MT model, we use [mBART-50](https://arxiv.org/abs/2008.00401), a multilingual language model that is pre-trained at the document level, to score the MT outputs. 6 | 7 | ## Installation 8 | 9 | This codebase is an implementation of the [Prism metric](https://github.com/thompsonb/prism) using the [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library. For a detailed presentation of the Prism metric, including usage examples and instructions, see the original documentation. 10 | 11 | ### Get some files to score 12 | ```bash 13 | sacrebleu -t wmt21 -l en-de --echo src | head -n 20 > src.en 14 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > ref.de 15 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > hyp.de # put your system output here 16 | ``` 17 | 18 | To evaluate at the document level we need to know where the document boundaries are in the test set, so that we only use valid context. This is passed in as a file where each line contains a document ID. 19 | 20 | For WMT test sets this can be obtained via [sacreBLEU](https://github.com/mjpost/sacrebleu): 21 | ```bash 22 | sacrebleu -t wmt21 -l en-de --echo docid | head -n 20 > docids.ende 23 | ``` 24 | 25 | 26 | ### Python usage: 27 | 28 | In order to use Doc-Prism with Python, simply add `doc=True` when calling the score function. 29 | 30 | ```python 31 | from prism import MBARTPrism 32 | from add_context import add_context 33 | 34 | # load data files 35 | doc_ids = [x.strip() for x in open('docids.ende', 'rt').readlines()] 36 | hyp = [x.strip() for x in open('hyp.de', 'rt').readlines()] 37 | ref = [x.strip() for x in open('ref.de', 'rt').readlines()] 38 | 39 | # load prism model 40 | model_path = "facebook/mbart-large-50" 41 | prism = MBARTPrism(checkpoint=model_path, src_lang="en", tgt_lang="de") 42 | 43 | # add contexts to reference and hypothesis texts 44 | hyp = add_context(orig_txt=hyp, context=ref, doc_ids=doc_ids, sep_token=prism.tokenizer.sep_token) 45 | ref = add_context(orig_txt=ref, context=ref, doc_ids=doc_ids, sep_token=prism.tokenizer.sep_token) 46 | 47 | seg_score = prism.score(cand=hyp, ref=ref, batch_size=8, doc=True) 48 | ``` 49 | 50 | ## Reproduce 51 | To reproduce the Doc-Prism results from the paper run the [score_doc-metrics.py](/score_doc-metrics.py) script with the flags `--model prism` and `--doc`. 52 | 53 | ```bash 54 | git clone https://github.com/google-research/mt-metrics-eval.git 55 | cd mt-metrics-eval 56 | pip install . 57 | alias mtme='python3 -m mt_metrics_eval.mtme' 58 | mtme --download # Puts ~1G of data into $HOME/.mt-metrics-eval. 
59 | ``` 60 | To obtain system-level scores of Doc-Prism (mBART-50) for the WMT21 test set run: 61 | ```bash 62 | python score_doc-metrics.py --campaign wmt21.news --lp en-de --model prism --doc --level sys 63 | ``` 64 | 65 | ## Paper 66 | 67 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf): 68 | 69 | ``` 70 | @inproceedings{easy_doc_mt, 71 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric}, 72 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello}, 73 | booktitle = "Proceedings of the Seventh Conference on Machine Translation", 74 | month = dec, 75 | year = "2022", 76 | address = "Abu Dhabi, United Arab Emirates", 77 | publisher = "Association for Computational Linguistics", 78 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf", 79 | } 80 | ``` 81 | -------------------------------------------------------------------------------- /Prism/add_context.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | 4 | 5 | def add_context(orig_txt: List[str], context: List[str], doc_ids: List[str], sep_token: str = "</s>", 6 | ws: int = 2) -> List[str]: 7 | """Function that adds the previous sentences as context to the current sentence, respecting document boundaries 8 | :param orig_txt: the original text 9 | :param context: the text from which the context will be taken (same as orig_txt for source/reference) 10 | :param doc_ids: the document where each segment belongs to 11 | :param sep_token: the separator token of the tokenizer for the specific model 12 | :param ws: the window size, maximum of the previous sentences to be considered as context 13 | :return: the original text augmented with context 14 | """ 15 | if not (len(orig_txt) == len(context) == len(doc_ids)): 16 | raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(context)={len(context)}, len(doc_ids)={len(doc_ids)}') 17 | i, k = 0, 0 18 | augm_txt = [] 19 | doc_id = doc_ids[0] 20 | while i < len(orig_txt): 21 | if doc_ids[i] == doc_id: 22 | context_window = context[i - min(k, ws):i] 23 | augm_txt.append(" {} ".format(sep_token).join(context_window + [orig_txt[i]])) 24 | i += 1 25 | else: 26 | doc_id = doc_ids[i] 27 | k = -1 28 | k += 1 29 | return augm_txt 30 | 31 | 32 | -------------------------------------------------------------------------------- /Prism/prism.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from tqdm import tqdm 3 | from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, MBart50Tokenizer 4 | import torch.nn.functional as F 5 | import torch.nn as nn 6 | import numpy as np 7 | 8 | 9 | class MBARTPrism: 10 | def __init__(self, src_lang, tgt_lang, checkpoint='facebook/mbart-large-cc25', device=None): 11 | langs = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT", 12 | "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK", 13 | "tr_TR", "vi_VN", "zh_CN", "pl_PL", "ta_IN"] 14 | src_lang = [l for l in langs if src_lang in l][0] 15 | tgt_lang = [l for l in langs if tgt_lang in l][0] 16 | 17 | if device is None: 18 | self.device = "cuda" if torch.cuda.is_available() else "cpu" 19 | else: 20 | 
self.device = device 21 | self.tokenizer = MBart50Tokenizer.from_pretrained(checkpoint, src_lang=src_lang, tgt_lang=tgt_lang) 22 | self.model = MBartForConditionalGeneration.from_pretrained(checkpoint) 23 | self.model.eval() 24 | self.model.to(self.device) 25 | 26 | # Set up loss 27 | self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id) 28 | self.lsm = nn.LogSoftmax(dim=1) 29 | 30 | def score(self, cand, ref, batch_size, doc, segment_scores=True): 31 | """ Score a batch of examples """ 32 | 33 | if len(cand) != len(ref): 34 | raise Exception(f'Length of cand ({len(cand)}) does not match length of ref ({len(ref)})') 35 | 36 | sent_scores = [[], []] 37 | 38 | with torch.no_grad(): 39 | for sent_idx, (srcs, tgts) in enumerate([(ref, cand), (cand, ref)]): 40 | for i in tqdm(range(0, len(srcs), batch_size)): 41 | src_list = srcs[i: i + batch_size] 42 | tgt_list = tgts[i: i + batch_size] 43 | with torch.no_grad(): 44 | encoded_src = self.tokenizer( 45 | src_list, 46 | truncation=True, 47 | padding=True, 48 | return_tensors='pt', 49 | max_length=self.tokenizer.model_max_length 50 | ) 51 | with self.tokenizer.as_target_tokenizer(): 52 | encoded_tgt = self.tokenizer( 53 | tgt_list, 54 | truncation=True, 55 | padding=True, 56 | return_tensors='pt', 57 | max_length=self.tokenizer.model_max_length 58 | ) 59 | tgt_len = [len(self.tokenizer(sent.split(self.tokenizer.sep_token)[-1]).input_ids) for sent 60 | in tgt_list] 61 | if doc: 62 | start_toks = [len(self.tokenizer(sent).input_ids) - tgt_len[i] for i, sent in 63 | enumerate(tgt_list)] 64 | else: 65 | start_toks = [0] * len(tgt_list) 66 | 67 | src_tokens = encoded_src['input_ids'].to(self.device) 68 | src_mask = encoded_src['attention_mask'].to(self.device) 69 | 70 | tgt_tokens = encoded_tgt['input_ids'].to(self.device) 71 | 72 | output = self.model( 73 | input_ids=src_tokens, 74 | attention_mask=src_mask, 75 | labels=tgt_tokens 76 | ) 77 | logits = output.logits.view(-1, self.model.config.vocab_size) 78 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1)) 79 | loss = loss.view(tgt_tokens.shape[0], -1) 80 | ppl = [] 81 | for j, s in enumerate(loss):  # within-batch index; avoid shadowing the batch offset i 82 | ppl.append(s[start_toks[j]:start_toks[j] + tgt_len[j] - 1].sum() / (tgt_len[j] - 1)) 83 | curr_score_list = [-x.item() for x in ppl] 84 | sent_scores[sent_idx] += curr_score_list 85 | 86 | segm_scores = np.mean(sent_scores, axis=0) 87 | sys_score = np.mean(segm_scores) if not segment_scores else segm_scores 88 | 89 | return sys_score 90 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Embarrassingly Easy Document-Level MT Metrics 2 | 3 | ## Overview 4 | 5 | In this work we extend state-of-the-art Machine Translation metrics, namely [Prism](https://github.com/thompsonb/prism), [COMET](https://github.com/Unbabel/COMET), [COMET-QE](https://github.com/Unbabel/COMET) and [BERTScore](https://github.com/Tiiiger/bert_score) to the document level. Our approach is _embarrassingly simple_: instead of encoding only the hypothesis and reference, we also encode the previous reference sentences as context. We still compute the metric score at the sentence level but also attend to previous context. 
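For illustration, here is a minimal sketch of the idea using the `add_context` helper shipped in this repository (the toy inputs and the `score_fn` placeholder below are hypothetical, and the `</s>` separator assumes an mBART-style tokenizer):

```python
from add_context import add_context

# Hypothetical toy inputs: one entry per segment, plus the ID of the document
# each segment belongs to (context never crosses a document boundary).
doc_ids = ["doc1", "doc1", "doc2"]
hyp = ["Oi mundo!", "este é um exemplo", "Tchau."]
ref = ["Olá mundo!", "isto é um exemplo!", "Adeus."]

# Prepend up to ws=2 previous *reference* sentences to every hypothesis and
# reference segment before scoring.
hyp_ctx = add_context(orig_txt=hyp, context=ref, doc_ids=doc_ids, sep_token="</s>", ws=2)
ref_ctx = add_context(orig_txt=ref, context=ref, doc_ids=doc_ids, sep_token="</s>", ws=2)

# score_fn stands in for any sentence-level metric (Doc-COMET, Doc-BERTScore, ...):
# scores = score_fn(hyp_ctx, ref_ctx)
```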
6 | 7 | ![image](media/bertscore.png) 8 | 9 | 10 | The extended metrics outperform their sentence-level counterparts in about 85% of the tested conditions ([WMT 2021 Metrics Shared Task](https://wmt-metrics-task.github.io/)) and dramatically improve the ability of the corresponding models to handle discourse phenomena. 11 | 12 | ## Usage 13 | 14 | The current repository contains code that extends the original MT metrics to the document level by providing the option to encode additional context. The code is presented as an extension of the corresponding original codebase. For information on how to use each metric, see the corresponding README: 15 | * [COMET/COMET-QE](COMET/README.md) 16 | * [BERTScore](bert_score/README.md) 17 | * [Prism](Prism/README.md) 18 | 19 | It is recommended to create a dedicated environment for this project: 20 | ```bash 21 | conda create -n doc-metrics-env python=3.9 22 | conda activate doc-metrics-env 23 | ``` 24 | 25 | ## Reproducibility 26 | 27 | To reproduce the paper's results on the correlation of document- and sentence-level metrics with human annotations on the test sets from the [WMT Metrics Shared Task](https://wmt-metrics-task.github.io/), first install the required packages for the [BERTScore](/bert_score) and [COMET](/COMET) models. Next, install the [MT Metrics Eval](https://github.com/google-research/mt-metrics-eval) toolkit 28 | and download the database. 29 | ```bash 30 | git clone https://github.com/google-research/mt-metrics-eval.git 31 | cd mt-metrics-eval 32 | pip install . 33 | alias mtme='python3 -m mt_metrics_eval.mtme' 34 | mtme --download # Puts ~1G of data into $HOME/.mt-metrics-eval. 35 | ``` 36 | Then use the `score_doc-metrics.py` script to obtain the scores for the model, domain and language pair of your choice from the WMT21 test sets. 37 | For example, to obtain system-level scores of Doc-COMET for the en-de language pair in the news domain, run: 38 | ```bash 39 | python score_doc-metrics.py --campaign wmt21.news --model comet --lp en-de --level sys --doc 40 | ``` 41 | ## Acknowledgments 42 | 43 | We would like to thank the community for releasing their code! This repository contains code from the [COMET](https://github.com/Unbabel/COMET), [BERTScore](https://github.com/Tiiiger/bert_score) and [Prism](https://github.com/thompsonb/prism) repositories. 44 | 45 | 46 | ## Paper 47 | 48 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf): 49 | 50 | ``` 51 | @inproceedings{easy_doc_mt, 52 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric}, 53 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello}, 54 | booktitle = "Proceedings of the Seventh Conference on Machine Translation", 55 | month = dec, 56 | year = "2022", 57 | address = "Abu Dhabi, United Arab Emirates", 58 | publisher = "Association for Computational Linguistics", 59 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf", 60 | } 61 | ``` 62 | 63 | ## Security 64 | 65 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information. 66 | 67 | ## License 68 | 69 | This project is licensed under the Apache-2.0 License.
70 | 71 | -------------------------------------------------------------------------------- /bert_score/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # vscode 107 | .vscode -------------------------------------------------------------------------------- /bert_score/.idea/bert_score.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 12 | -------------------------------------------------------------------------------- /bert_score/.idea/deployment.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /bert_score/.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /bert_score/.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /bert_score/.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /bert_score/.idea/remote-mappings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | -------------------------------------------------------------------------------- /bert_score/.travis.yml: -------------------------------------------------------------------------------- 1 | 
env: 2 | TOKENIZERS_PARALLELISM=false # parallelized fast tokenizers don't fit into the Travis CI VM 3 | language: python 4 | python: 5 | - '3.6' 6 | - '3.7' 7 | install: 8 | pip install . 9 | script: travis_wait 30 python -m unittest discover 10 | -------------------------------------------------------------------------------- /bert_score/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, and Yoav Artzi. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /bert_score/MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include bert_score/rescale_baseline * 2 | -------------------------------------------------------------------------------- /bert_score/README.md: -------------------------------------------------------------------------------- 1 | # Doc-BERTScore 2 | 3 | This README describes how to use **Doc-BERTScore**, an extension of the BERTScore metric that can be used for document-level evaluation. 4 | 5 | ## Installation 6 | 7 | This codebase is built upon the original [BERTScore code](https://github.com/Tiiiger/bert_score). For a detailed presentation of the BERTScore metric, including usage examples and instructions, see the original documentation. 8 | 9 | To run Doc-BERTScore you will need to install it locally from source: 10 | ```bash 11 | git clone https://github.com/amazon-science/doc-mt-metrics.git 12 | cd doc-mt-metrics/bert_score 13 | pip install . 14 | ``` 15 | 16 | ### Get some files to score 17 | ```bash 18 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > ref.de 19 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > hyp.de # put your system output here 20 | ``` 21 | To evaluate at the document level we need to know where the document boundaries are in the test set, so that we only use valid context. This is passed in as a file where each line contains a document ID.
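For illustration, a `docids.ende` file aligned line-by-line with `ref.de` and `hyp.de` could look like this (the IDs below are made up):

```
newsdoc-001
newsdoc-001
newsdoc-001
newsdoc-002
newsdoc-002
```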
22 | 23 | For WMT test sets this can be obtained via [sacreBLEU](https://github.com/mjpost/sacrebleu): 24 | ```bash 25 | sacrebleu -t wmt21 -l en-de --echo docid | head -n 20 > docids.ende 26 | ``` 27 | 28 | ### Command Line usage: 29 | 30 | To score using the document-level BERTScore simply add the `--doc` flag: 31 | ```bash 32 | bert-score -r ref.de -c hyp.de --lang de --doc docids.ende 33 | ``` 34 | 35 | In the paper we use `roberta-large` for X->En pairs and `bert-base-multilingual-cased` for En->X pairs (the defaults at the time), but you can select another model with the `-m MODEL_TYPE` flag. See the [spreadsheet](https://docs.google.com/spreadsheets/d/1RKOVpselB98Nnh_EOC4A2BYn8_201tmPODpNWu4w7xI/edit?usp=sharing) provided by the authors of BERTScore for a full list of supported models. 36 | 37 | ### Python usage (Object-oriented API): 38 | 39 | The [BERTScore](https://github.com/Tiiiger/bert_score) framework provides two APIs for using the BERTScore metric with Python: an object-oriented one that caches the model and is recommended for multiple evaluations, and a functional one that can be used for a single evaluation. For more details see the [demo](https://github.com/Tiiiger/bert_score/blob/master/example/Demo.ipynb) provided by the authors. 40 | 41 | In order to use Doc-BERTScore, simply add `doc=True` when calling the `score` method: 42 | 43 | ```python 44 | from bert_score import BERTScorer 45 | from add_context import add_context 46 | 47 | with open("hyp.de") as f: 48 | cands = [line.strip() for line in f] 49 | 50 | with open("ref.de") as f: 51 | refs = [line.strip() for line in f] 52 | 53 | with open("docids.ende") as f: 54 | doc_ids = [line.strip() for line in f] 55 | 56 | scorer = BERTScorer(lang="de") 57 | 58 | # add contexts to reference and hypothesis texts 59 | cands = add_context(orig_txt=cands, context=refs, doc_ids=doc_ids, sep_token=scorer._tokenizer.sep_token) 60 | refs = add_context(orig_txt=refs, context=refs, doc_ids=doc_ids, sep_token=scorer._tokenizer.sep_token) 61 | 62 | # set doc=True to evaluate at the document level 63 | P, R, F1 = scorer.score(cands, refs, doc=True) 64 | ``` 65 | ### Python usage (Function API): 66 | 67 | In order to use Doc-BERTScore, simply add `doc=True` when calling the `score` function: 68 | 69 | ```python 70 | from bert_score import score 71 | from add_context import add_context 72 | 73 | with open("hyp.de") as f: 74 | cands = [line.strip() for line in f] 75 | 76 | with open("ref.de") as f: 77 | refs = [line.strip() for line in f] 78 | 79 | with open("docids.ende") as f: 80 | doc_ids = [line.strip() for line in f] 81 | 82 | # add contexts to reference and hypothesis texts 83 | cands = add_context(orig_txt=cands, context=refs, doc_ids=doc_ids, sep_token="[SEP]") 84 | refs = add_context(orig_txt=refs, context=refs, doc_ids=doc_ids, sep_token="[SEP]") 85 | 86 | # set doc=True to evaluate at the document level 87 | P, R, F1 = score(cands, refs, lang="de", verbose=True, doc=True) 88 | ``` 89 | 90 | To use another model, set `model_type=MODEL_TYPE` when calling the `score` function. 91 | 92 | ## Reproduce 93 | To reproduce the Doc-BERTScore results from the paper, run the [score_doc-metrics.py](/score_doc-metrics.py) script with the flags `--model bertscore` and `--doc`.
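For instance, mirroring the Doc-COMET invocation in the top-level README (the flags below are assumed to carry over unchanged), system-level en-de scores for the news domain would be obtained with:

```bash
python score_doc-metrics.py --campaign wmt21.news --model bertscore --lp en-de --level sys --doc
```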
94 | 95 | ## Paper 96 | 97 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf): 98 | 99 | ``` 100 | @inproceedings{easy_doc_mt, 101 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric}, 102 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello}, 103 | booktitle = "Proceedings of the Seventh Conference on Machine Translation", 104 | month = dec, 105 | year = "2022", 106 | address = "Abu Dhabi, United Arab Emirates", 107 | publisher = "Association for Computational Linguistics", 108 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf", 109 | } 110 | ``` 111 | -------------------------------------------------------------------------------- /bert_score/add_context.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from typing import List 3 | 4 | 5 | def add_context(orig_txt: List[str], context: List[str], doc_ids: List[str], sep_token: str = "", 6 | ws: int = 2) -> List[str]: 7 | """Function that adds the previous sentences as context to the current sentence, respecting document boundaries 8 | :param orig_txt: the original text 9 | :param context: the text from which the context will be taken (same as orig_txt for source/reference) 10 | :param doc_ids: the document where each segment belongs to 11 | :param sep_token: the separator token of the tokenizer for the specific model 12 | :param ws: the window size, maximum of the previous sentences to be considered as context 13 | :return: the original text augmented with context 14 | """ 15 | if not (len(orig_txt) == len(context) == len(doc_ids)): 16 | raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(context)={len(context)}, len(doc_ids)={len(doc_ids)}') 17 | i, k = 0, 0 18 | augm_txt = [] 19 | doc_id = doc_ids[0] 20 | while i < len(orig_txt): 21 | if doc_ids[i] == doc_id: 22 | context_window = context[i - min(k, ws):i] 23 | augm_txt.append(" {} ".format(sep_token).join(context_window + [orig_txt[i]])) 24 | i += 1 25 | else: 26 | doc_id = doc_ids[i] 27 | k = -1 28 | k += 1 29 | return augm_txt 30 | 31 | 32 | -------------------------------------------------------------------------------- /bert_score/bert_score.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/bert_score.png -------------------------------------------------------------------------------- /bert_score/bert_score/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.11" 2 | from .score import * 3 | from .scorer import * 4 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/cs/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.28803304,0.28811806,0.28382972 3 | 1,0.36045152,0.3605044,0.35791346 4 | 2,0.35763955,0.3577387,0.35552806 5 | 3,0.4382742,0.43832803,0.4371357 6 | 4,0.49264902,0.4926875,0.49187797 7 | 5,0.5753039,0.5753327,0.57483304 8 | 6,0.63127446,0.6313224,0.6309864 9 | 7,0.5324934,0.532565,0.53202814 10 | 8,0.5102161,0.5103038,0.5096529 11 |
9,0.6044539,0.6045382,0.604006 12 | 10,0.6814313,0.68149376,0.6810876 13 | 11,0.7187933,0.7188438,0.71841186 14 | 12,0.386078,0.38613266,0.38548917 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/cs/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.24679352,0.24680473,0.24270211 3 | 1,0.29235435,0.29231834,0.28975013 4 | 2,0.3138872,0.31386852,0.31213808 5 | 3,0.3285111,0.3284912,0.32616478 6 | 4,0.34355187,0.34352767,0.3409594 7 | 5,0.40920743,0.4091819,0.40708998 8 | 6,0.5143928,0.5143628,0.51312447 9 | 7,0.5684746,0.56843746,0.5675548 10 | 8,0.55277854,0.55274475,0.55174726 11 | 9,0.4946325,0.49455652,0.49314302 12 | 10,0.425077,0.42500603,0.42305094 13 | 11,0.37143245,0.37136525,0.3687799 14 | 12,0.38431773,0.38426274,0.38162753 15 | 13,0.40205154,0.40199956,0.3993145 16 | 14,0.41208863,0.412054,0.40980735 17 | 15,0.4243431,0.42427495,0.4220649 18 | 16,0.32602695,0.3260445,0.32438898 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/cs/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.28832704,0.28834337,0.28409466 3 | 1,0.42489076,0.42484972,0.42346135 4 | 2,0.6489359,0.64890593,0.6484903 5 | 3,0.7212477,0.7212302,0.7210182 6 | 4,0.70944715,0.7094549,0.70922697 7 | 5,0.7286318,0.72864425,0.7284186 8 | 6,0.71929383,0.71930563,0.71912307 9 | 7,0.75613487,0.756147,0.7559896 10 | 8,0.7593519,0.759376,0.75920963 11 | 9,0.801281,0.80129445,0.8010951 12 | 10,0.8243164,0.82432646,0.8241175 13 | 11,0.86058,0.86058563,0.8604526 14 | 12,0.97968304,0.9796832,0.9796791 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/cs/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.36036962,0.3603732,0.35725367 3 | 1,0.6612272,0.661179,0.66074145 4 | 2,0.722742,0.72273415,0.72256047 5 | 3,0.73125947,0.73123205,0.7310358 6 | 4,0.7825561,0.7825642,0.78245354 7 | 5,0.78133506,0.7813208,0.7811937 8 | 6,0.8079803,0.8079664,0.8078874 9 | 7,0.8139315,0.8139195,0.8138673 10 | 8,0.82575524,0.82575536,0.8256901 11 | 9,0.8267652,0.8267674,0.8267081 12 | 10,0.826633,0.826636,0.82654697 13 | 11,0.8310137,0.8310095,0.83087397 14 | 12,0.8320955,0.83211106,0.83181846 15 | 13,0.82811135,0.8281364,0.827703 16 | 14,0.8271892,0.8272189,0.8265785 17 | 15,0.8306057,0.8306258,0.82997155 18 | 16,0.81801736,0.81803435,0.8175852 19 | 17,0.8253589,0.825372,0.8250096 20 | 18,0.82938665,0.82940817,0.8290164 21 | 19,0.82824516,0.8282779,0.827922 22 | 20,0.8445639,0.84459394,0.84429437 23 | 21,0.86360985,0.8636378,0.86333483 24 | 22,0.8661244,0.8661579,0.86584014 25 | 23,0.8638866,0.86392677,0.8635829 26 | 24,0.97858095,0.9785705,0.9785698 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/de/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.29239914,0.29233938,0.28799337 3 | 1,0.37400073,0.37395933,0.37138724 4 | 2,0.36879358,0.36874846,0.3663888 5 | 3,0.4502482,0.4501956,0.44887444 6 | 4,0.4982386,0.49817833,0.49722672 7 | 5,0.5760319,0.5759751,0.5754043 8 | 6,0.62940514,0.62935334,0.6289917 9 | 
7,0.5357095,0.53565013,0.53505087 10 | 8,0.5146575,0.51462156,0.5138855 11 | 9,0.61532813,0.61528224,0.6147353 12 | 10,0.68632543,0.6862504,0.6858456 13 | 11,0.7214881,0.7214059,0.72098553 14 | 12,0.36572546,0.36572027,0.36501065 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/de/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.25753325,0.25744554,0.25318053 3 | 1,0.2981514,0.2980718,0.2952621 4 | 2,0.3208413,0.32078207,0.3187413 5 | 3,0.33565432,0.33562624,0.33315146 6 | 4,0.34684345,0.34679237,0.3443796 7 | 5,0.4133209,0.41324788,0.41142154 8 | 6,0.514071,0.51400465,0.51292115 9 | 7,0.5642201,0.56416416,0.56339765 10 | 8,0.54623514,0.5461879,0.54531705 11 | 9,0.49143773,0.4913597,0.4903938 12 | 10,0.42275012,0.42266262,0.42136824 13 | 11,0.36494458,0.36484274,0.36310795 14 | 12,0.37404448,0.37393928,0.37217715 15 | 13,0.38868552,0.3885813,0.38668826 16 | 14,0.39440155,0.39433125,0.39241815 17 | 15,0.4055417,0.40547967,0.4035052 18 | 16,0.30379978,0.30370486,0.30213118 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/de/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.30777588,0.30777684,0.3031559 3 | 1,0.44505233,0.44509125,0.4434747 4 | 2,0.66170436,0.66174895,0.6612669 5 | 3,0.73550326,0.7355261,0.735256 6 | 4,0.7208496,0.72085893,0.720586 7 | 5,0.73704386,0.73705214,0.7367808 8 | 6,0.73208153,0.7320707,0.7318679 9 | 7,0.7680251,0.76800215,0.76783967 10 | 8,0.77268696,0.77266395,0.7724989 11 | 9,0.8099519,0.80989397,0.809723 12 | 10,0.8310105,0.83095115,0.83080244 13 | 11,0.86770487,0.86765665,0.86756754 14 | 12,0.9819623,0.9819598,0.9819579 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/de/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.37937975,0.379358,0.37601414 3 | 1,0.6650208,0.66500705,0.6645409 4 | 2,0.72824335,0.7282381,0.72805566 5 | 3,0.74166065,0.7416417,0.741436 6 | 4,0.7924967,0.7925062,0.7923915 7 | 5,0.7885143,0.7884954,0.7883624 8 | 6,0.8117979,0.8117669,0.81168765 9 | 7,0.8173677,0.8173395,0.81728506 10 | 8,0.82804793,0.828012,0.82794595 11 | 9,0.83066076,0.8306335,0.83057094 12 | 10,0.82999426,0.8299607,0.82988906 13 | 11,0.83342683,0.83340013,0.8332831 14 | 12,0.83806795,0.83803594,0.83778083 15 | 13,0.83596325,0.83591455,0.83558387 16 | 14,0.8378458,0.8377797,0.83741814 17 | 15,0.8420356,0.84196484,0.84161186 18 | 16,0.83186066,0.8318187,0.8314605 19 | 17,0.83927697,0.83923465,0.83889884 20 | 18,0.84405965,0.84401745,0.8436563 21 | 19,0.8409746,0.8409399,0.84059715 22 | 20,0.8542512,0.85422283,0.8539368 23 | 21,0.8734287,0.8733914,0.87314016 24 | 22,0.8774618,0.87741566,0.87717056 25 | 23,0.87821764,0.8781659,0.8779116 26 | 24,0.9817083,0.98170334,0.9817008 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en-sci/allenai/scibert_scivocab_uncased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3247314,0.32477322,0.32055983 3 | 1,0.34701017,0.34706187,0.344079 4 | 2,0.41985375,0.41988486,0.4179418 5 | 3,0.4668236,0.46684003,0.4656058 6 | 4,0.45860615,0.4586492,0.4573681 
7 | 5,0.41228917,0.4123522,0.41066456 8 | 6,0.4395095,0.43956795,0.43794444 9 | 7,0.48392966,0.4839865,0.48246792 10 | 8,0.5335945,0.5336341,0.5322364 11 | 9,0.60744065,0.6074917,0.60612226 12 | 10,0.66027635,0.66033924,0.65897125 13 | 11,0.6890247,0.6891011,0.6878515 14 | 12,0.54997945,0.55007255,0.54844016 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-base-v1.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.42279568,0.42285842,0.4198645 3 | 1,0.38239375,0.3824535,0.3795375 4 | 2,0.35127786,0.35131463,0.34854048 5 | 3,0.3402314,0.34027407,0.33761653 6 | 4,0.34001094,0.3400646,0.33745667 7 | 5,0.34310105,0.34314916,0.34054983 8 | 6,0.3478834,0.34792796,0.34530792 9 | 7,0.3523316,0.35237584,0.34973368 10 | 8,0.35546654,0.35550496,0.35283387 11 | 9,0.35682797,0.35686156,0.3541417 12 | 10,0.3572713,0.35730729,0.35451323 13 | 11,0.35916516,0.35920846,0.35632935 14 | 12,0.3620535,0.3621047,0.35911387 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-base-v2.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.43284354,0.4329465,0.42670736 3 | 1,0.4085349,0.40857056,0.4041539 4 | 2,0.42302486,0.42304876,0.41986418 5 | 3,0.43835327,0.43837532,0.43578437 6 | 4,0.46398157,0.4640153,0.46179092 7 | 5,0.487097,0.48714137,0.48507443 8 | 6,0.50701046,0.5070602,0.50516284 9 | 7,0.5251579,0.5252073,0.52346826 10 | 8,0.5432063,0.5432638,0.5416856 11 | 9,0.56169736,0.56174135,0.56031275 12 | 10,0.58207834,0.58211654,0.58080167 13 | 11,0.5087994,0.5088567,0.50630754 14 | 12,0.4822224,0.48224902,0.4795803 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-large-v1.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.48447838,0.48450485,0.4821886 3 | 1,0.5124409,0.51243365,0.5109167 4 | 2,0.49396634,0.49394318,0.49285302 5 | 3,0.48355308,0.48351732,0.48258644 6 | 4,0.48206407,0.48202685,0.4811013 7 | 5,0.48171225,0.48167655,0.48073986 8 | 6,0.48402956,0.48400134,0.48304388 9 | 7,0.48760605,0.48758495,0.4866279 10 | 8,0.49034056,0.4903293,0.4893756 11 | 9,0.4919946,0.49199188,0.4910255 12 | 10,0.49351045,0.4935107,0.49251547 13 | 11,0.4953505,0.49535286,0.4943231 14 | 12,0.49792922,0.4979353,0.49686712 15 | 13,0.50119936,0.5012099,0.5001017 16 | 14,0.50464475,0.50465906,0.5035164 17 | 15,0.5072171,0.50723296,0.5060587 18 | 16,0.50804037,0.50805837,0.506836 19 | 17,0.50674427,0.5067624,0.5054734 20 | 18,0.5028615,0.5028785,0.50150096 21 | 19,0.4957624,0.49577576,0.49427336 22 | 20,0.48470628,0.48471764,0.48304176 23 | 21,0.46942177,0.4694329,0.46755382 24 | 22,0.45182654,0.45184082,0.44979697 25 | 23,0.4372368,0.43725976,0.43516964 26 | 24,0.43032366,0.4303518,0.42831102 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-large-v2.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.43137488,0.4314412,0.4271023 3 | 1,0.47189355,0.47192886,0.46977237 4 | 2,0.4965904,0.49659666,0.49521467 5 | 3,0.4952368,0.4952206,0.49390256 6 | 4,0.49991024,0.4998804,0.49865857 7 | 5,0.5061125,0.5060827,0.50490576 8 | 6,0.52520007,0.5251885,0.5241151 9 | 
7,0.5463337,0.54633546,0.54536676 10 | 8,0.56268036,0.56267744,0.5618048 11 | 9,0.5788636,0.5788671,0.5780607 12 | 10,0.59798187,0.5979915,0.5972454 13 | 11,0.6093569,0.6093737,0.60867995 14 | 12,0.61832786,0.6183305,0.6176837 15 | 13,0.6298888,0.62988657,0.6292773 16 | 14,0.63760334,0.6376027,0.6370052 17 | 15,0.6402277,0.6402217,0.63963217 18 | 16,0.6457506,0.6457368,0.64517874 19 | 17,0.6488497,0.6488231,0.6482803 20 | 18,0.6473536,0.6473276,0.6467711 21 | 19,0.65181977,0.6517948,0.6512418 22 | 20,0.65941834,0.6593918,0.65884435 23 | 21,0.65883756,0.65882397,0.65822756 24 | 22,0.6599824,0.6599794,0.6593097 25 | 23,0.6140344,0.6140205,0.6131047 26 | 24,0.54314095,0.54311645,0.5419062 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-xlarge-v1.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.37603918,0.37612942,0.37049496 3 | 1,0.31145602,0.3114958,0.3073803 4 | 2,0.25227228,0.2522994,0.24795091 5 | 3,0.22015819,0.22017719,0.21600199 6 | 4,0.21572605,0.21576598,0.21187688 7 | 5,0.21390381,0.21393314,0.21024637 8 | 6,0.21366087,0.21368802,0.21022928 9 | 7,0.2149553,0.21497151,0.2116843 10 | 8,0.21902423,0.21904334,0.215865 11 | 9,0.22598784,0.22601976,0.22294162 12 | 10,0.23651579,0.23656204,0.2335378 13 | 11,0.2508,0.25083283,0.24782418 14 | 12,0.26735264,0.26740175,0.2642045 15 | 13,0.2851571,0.2852036,0.28140694 16 | 14,0.30159834,0.3016559,0.2969648 17 | 15,0.31582344,0.31589058,0.31032172 18 | 16,0.33028397,0.3303347,0.32389277 19 | 17,0.34479943,0.34483773,0.33757344 20 | 18,0.3576801,0.35770583,0.34980485 21 | 19,0.36997133,0.36996147,0.3615338 22 | 20,0.3813416,0.38132015,0.37257645 23 | 21,0.3904368,0.39041746,0.38146585 24 | 22,0.4026223,0.40261322,0.39356884 25 | 23,0.41755676,0.41755086,0.4090774 26 | 24,0.40913486,0.40914643,0.40243107 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-xlarge-v2.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.379094,0.37919718,0.37330297 3 | 1,0.27352002,0.27357075,0.26852632 4 | 2,0.24191533,0.24194317,0.23669504 5 | 3,0.2238661,0.22388461,0.21928357 6 | 4,0.22812894,0.22815062,0.22410771 7 | 5,0.22398795,0.22402358,0.22023973 8 | 6,0.22606015,0.22609216,0.22241953 9 | 7,0.22955626,0.22957715,0.2261971 10 | 8,0.23346025,0.23349406,0.230283 11 | 9,0.23933677,0.23937275,0.23639005 12 | 10,0.24947925,0.2495169,0.24674372 13 | 11,0.25879192,0.25879982,0.25623834 14 | 12,0.26840612,0.2684224,0.2659429 15 | 13,0.28223696,0.2822432,0.27990422 16 | 14,0.3007411,0.30081397,0.298456 17 | 15,0.32065493,0.32073346,0.31820792 18 | 16,0.3489667,0.34909493,0.34612358 19 | 17,0.37499505,0.37513632,0.37153322 20 | 18,0.39365283,0.3937659,0.3894278 21 | 19,0.3985198,0.39858896,0.39375183 22 | 20,0.40377426,0.4038127,0.3987301 23 | 21,0.4162669,0.41631454,0.41127917 24 | 22,0.4385093,0.43853307,0.43359485 25 | 23,0.50211877,0.5021498,0.49820283 26 | 24,0.6450441,0.6450727,0.64176905 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-xxlarge-v1.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.44518736,0.44525033,0.44190475 3 | 1,0.26892486,0.26893654,0.26619813 4 | 2,0.25225964,0.25227055,0.2495048 5 | 
3,0.23626596,0.23626427,0.23414151 6 | 4,0.24108262,0.24108647,0.23914734 7 | 5,0.2402725,0.24029303,0.23852193 8 | 6,0.24204335,0.24206877,0.24038398 9 | 7,0.24432875,0.24436904,0.2427339 10 | 8,0.24470611,0.24472676,0.24312295 11 | 9,0.24761276,0.24763304,0.2458257 12 | 10,0.26654655,0.26657295,0.26450548 13 | 11,0.30993807,0.309992,0.3073111 14 | 12,0.46560258,0.46563277,0.463768 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/albert-xxlarge-v2.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.4414845,0.4415628,0.4378333 3 | 1,0.26729813,0.26729846,0.26443842 4 | 2,0.25006709,0.25006858,0.2470538 5 | 3,0.22912578,0.22914563,0.22677879 6 | 4,0.23676835,0.23678702,0.23474906 7 | 5,0.23712093,0.23712862,0.23520498 8 | 6,0.2357785,0.23579709,0.2339876 9 | 7,0.2375271,0.2375658,0.2357691 10 | 8,0.23694733,0.2369875,0.23519956 11 | 9,0.24043696,0.24048997,0.23847668 12 | 10,0.25991938,0.25997588,0.257621 13 | 11,0.3076668,0.30775174,0.30460533 14 | 12,0.5213576,0.52133,0.5192018 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/bert-base-cased-finetuned-mrpc.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.32524315,0.32527947,0.32047534 3 | 1,0.3697738,0.3697855,0.36682808 4 | 2,0.3912412,0.39124438,0.38884974 5 | 3,0.38678017,0.3867508,0.3849363 6 | 4,0.4306143,0.43059555,0.4291982 7 | 5,0.47680253,0.47676748,0.4757307 8 | 6,0.4937383,0.4937078,0.49275663 9 | 7,0.47395828,0.47392154,0.47275484 10 | 8,0.48822877,0.48818707,0.48712534 11 | 9,0.55345184,0.55342007,0.5525519 12 | 10,0.6535154,0.6534775,0.6529064 13 | 11,0.76415604,0.7641147,0.76378924 14 | 12,0.72067815,0.7206308,0.72023565 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.31651747,0.3166142,0.31180394 3 | 1,0.38737702,0.38744056,0.38455048 4 | 2,0.37912813,0.37916443,0.37648088 5 | 3,0.46451283,0.46451145,0.46312103 6 | 4,0.5066057,0.50659287,0.5054953 7 | 5,0.5804824,0.5804496,0.5797646 8 | 6,0.63067275,0.630636,0.63018715 9 | 7,0.54218787,0.5421653,0.5414328 10 | 8,0.5240471,0.5240057,0.5232123 11 | 9,0.6320527,0.6320019,0.63146895 12 | 10,0.69633687,0.6962761,0.6958725 13 | 11,0.7193143,0.7192363,0.7188216 14 | 12,0.3473233,0.34732684,0.34655094 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/bert-base-uncased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3231512,0.32322776,0.31853873 3 | 1,0.32517454,0.32522815,0.32197207 4 | 2,0.3708038,0.37080705,0.36834884 5 | 3,0.36287847,0.36286885,0.36059204 6 | 4,0.3786389,0.37860426,0.3767926 7 | 5,0.4018232,0.401791,0.40032896 8 | 6,0.38439456,0.38434005,0.38282546 9 | 7,0.37114623,0.3710986,0.36949417 10 | 8,0.37231025,0.37226102,0.37049443 11 | 9,0.35375935,0.3537393,0.35219112 12 | 10,0.38161838,0.3816211,0.37991408 13 | 11,0.4421448,0.4421776,0.44040316 14 | 12,0.40192786,0.40191513,0.40038353 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/bert-large-uncased.tsv: 
-------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.33945993,0.33952734,0.3353803 3 | 1,0.46529758,0.46534532,0.4629573 4 | 2,0.5190359,0.51904607,0.5170987 5 | 3,0.55551875,0.5555247,0.5540426 6 | 4,0.47806495,0.4780755,0.47663376 7 | 5,0.39333034,0.3933407,0.391598 8 | 6,0.30678865,0.30683848,0.30446944 9 | 7,0.40164435,0.40167126,0.39997557 10 | 8,0.44429466,0.4443099,0.44277325 11 | 9,0.5114804,0.5114661,0.5102474 12 | 10,0.53322667,0.5332073,0.5323144 13 | 11,0.56793964,0.56791747,0.56725395 14 | 12,0.56360143,0.5635814,0.5629889 15 | 13,0.5358492,0.5358346,0.53522795 16 | 14,0.42079058,0.42078197,0.41975206 17 | 15,0.3509417,0.3509411,0.34957188 18 | 16,0.4534342,0.45341223,0.45231807 19 | 17,0.46370843,0.46370083,0.46265444 20 | 18,0.4278576,0.42786714,0.42646673 21 | 19,0.38974905,0.3897353,0.3877319 22 | 20,0.3966205,0.3966191,0.3942883 23 | 21,0.4981153,0.49813268,0.4955151 24 | 22,0.5868029,0.58685154,0.584482 25 | 23,0.7136535,0.7137033,0.7118858 26 | 24,0.5152624,0.5152391,0.5146088 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/distilbert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.27245584,0.27247205,0.26611173 3 | 1,0.45394143,0.453942,0.45178676 4 | 2,0.5374658,0.5374726,0.53619426 5 | 3,0.61241305,0.61244136,0.6116679 6 | 4,0.63282156,0.632836,0.63219804 7 | 5,0.8164157,0.81645757,0.81623197 8 | 6,0.4648941,0.4649093,0.4638737 9 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/distilbert-base-uncased-distilled-squad.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.28725642,0.2872663,0.28207442 3 | 1,0.37234208,0.37233955,0.37046063 4 | 2,0.403689,0.4037149,0.4020736 5 | 3,0.5399291,0.53997463,0.53930676 6 | 4,0.6591859,0.65919137,0.65882134 7 | 5,0.65313077,0.6531313,0.65279835 8 | 6,0.74920315,0.7491901,0.7487158 9 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/distilbert-base-uncased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.2884445,0.2884457,0.28333962 3 | 1,0.39316687,0.3931663,0.39123002 4 | 2,0.42905498,0.4290923,0.42735597 5 | 3,0.5222444,0.52227175,0.52129734 6 | 4,0.6019937,0.6019904,0.6014007 7 | 5,0.6666034,0.66660464,0.66620487 8 | 6,0.51401854,0.51404256,0.5131456 9 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/distilroberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.42608285,0.4272089,0.42462298 3 | 1,0.7367886,0.7370362,0.736573 4 | 2,0.79922664,0.799593,0.7991632 5 | 3,0.8329021,0.8333321,0.83291864 6 | 4,0.8442,0.84462386,0.84425896 7 | 5,0.84732,0.84759504,0.8473319 8 | 6,0.89334005,0.8935088,0.8933471 9 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/microsoft/deberta-base-mnli.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.24991891,0.25001466,0.24200068 3 | 1,0.29392833,0.29395026,0.28912014 4 | 2,0.36113718,0.36123025,0.3575888 5 | 
3,0.41445282,0.41459718,0.41148487 6 | 4,0.4386812,0.43877414,0.4361292 7 | 5,0.45521808,0.4552972,0.45306677 8 | 6,0.4797258,0.4797979,0.47779492 9 | 7,0.48204568,0.48210686,0.480253 10 | 8,0.50440174,0.5044583,0.5025705 11 | 9,0.53045946,0.5304829,0.52866036 12 | 10,0.53781724,0.5377958,0.53583634 13 | 11,0.5402823,0.5402229,0.53816986 14 | 12,0.57382584,0.57370174,0.57160807 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/microsoft/deberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.2517391,0.2518335,0.24388544 3 | 1,0.36171424,0.36175922,0.35748047 4 | 2,0.4423475,0.442458,0.44021225 5 | 3,0.50618786,0.5063445,0.5045984 6 | 4,0.5250692,0.525192,0.5236118 7 | 5,0.55415064,0.5542385,0.5528668 8 | 6,0.5684745,0.5685567,0.5672051 9 | 7,0.5721026,0.5721756,0.5708452 10 | 8,0.60626274,0.6063245,0.6049902 11 | 9,0.6282066,0.62825406,0.6269483 12 | 10,0.6643668,0.66438687,0.66297233 13 | 11,0.65951246,0.6595324,0.6584084 14 | 12,0.70749044,0.70750576,0.7064498 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/microsoft/deberta-large-mnli.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.24490805,0.24501415,0.23715581 3 | 1,0.29400384,0.2940948,0.28872925 4 | 2,0.30570883,0.3057956,0.30113816 5 | 3,0.2957167,0.29578057,0.2915654 6 | 4,0.2884288,0.28847086,0.2843156 7 | 5,0.30902475,0.3090854,0.3057 8 | 6,0.3267471,0.32683545,0.32377866 9 | 7,0.32664096,0.32672828,0.3239887 10 | 8,0.33238792,0.3324875,0.32986364 11 | 9,0.35454232,0.3546663,0.35220724 12 | 10,0.37474304,0.37486178,0.3723941 13 | 11,0.38948673,0.38959926,0.38713577 14 | 12,0.40499082,0.4051212,0.4027381 15 | 13,0.40869987,0.40882573,0.40650842 16 | 14,0.41533,0.41543606,0.41318002 17 | 15,0.42891178,0.4289993,0.42687863 18 | 16,0.43574512,0.43581918,0.43376175 19 | 17,0.44409868,0.44415665,0.4421444 20 | 18,0.45358238,0.45362508,0.45173016 21 | 19,0.4614291,0.46146432,0.45968512 22 | 20,0.4612395,0.46127385,0.45946208 23 | 21,0.47897574,0.47901914,0.4772938 24 | 22,0.49526486,0.49531218,0.49363694 25 | 23,0.48103315,0.4810869,0.4794539 26 | 24,0.5131625,0.51319313,0.51193404 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/microsoft/deberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.24543904,0.24554524,0.23771858 3 | 1,0.32400694,0.3240889,0.31910792 4 | 2,0.35317397,0.35325843,0.34890434 5 | 3,0.34494445,0.34502625,0.34082443 6 | 4,0.34670925,0.34677663,0.3425456 7 | 5,0.36661133,0.3667012,0.36314553 8 | 6,0.38046056,0.38056228,0.37710926 9 | 7,0.38267714,0.3827855,0.37945607 10 | 8,0.3922755,0.3924098,0.38914645 11 | 9,0.41027483,0.41045374,0.4072962 12 | 10,0.43634042,0.4365225,0.43331632 13 | 11,0.4587171,0.45889324,0.45575032 14 | 12,0.47399956,0.47417867,0.47109136 15 | 13,0.48888516,0.48905894,0.4862424 16 | 14,0.4966528,0.49680543,0.49413764 17 | 15,0.5117451,0.51189446,0.50938886 18 | 16,0.5341927,0.53433174,0.53205305 19 | 17,0.55080074,0.5509329,0.5488182 20 | 18,0.5715738,0.571711,0.5698007 21 | 19,0.58424556,0.5843769,0.5826535 22 | 20,0.59171396,0.5918352,0.5901539 23 | 21,0.60953987,0.60965025,0.60810995 24 | 22,0.620468,0.6205763,0.6191674 25 | 23,0.57499653,0.575068,0.573669 26 
| 24,0.5698042,0.5698687,0.5686779 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/microsoft/deberta-xlarge-mnli.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.2493579,0.24956034,0.24190253 3 | 1,0.3013932,0.30158633,0.2964718 4 | 2,0.317363,0.31756195,0.31315005 5 | 3,0.3117229,0.3118849,0.30764845 6 | 4,0.3074649,0.3076071,0.30345994 7 | 5,0.3140126,0.31414607,0.31065413 8 | 6,0.32410583,0.3242222,0.32100978 9 | 7,0.32173893,0.3218549,0.3187024 10 | 8,0.32544047,0.32556787,0.3224075 11 | 9,0.344368,0.3445152,0.34142512 12 | 10,0.3655007,0.36567506,0.3623955 13 | 11,0.38081372,0.38100296,0.37764993 14 | 12,0.38874978,0.38893828,0.38563213 15 | 13,0.38537422,0.38555342,0.38225004 16 | 14,0.39434314,0.39452493,0.3914539 17 | 15,0.40501443,0.40519157,0.40221062 18 | 16,0.41383415,0.414013,0.41118416 19 | 17,0.43424043,0.4344097,0.4318083 20 | 18,0.4456768,0.44583458,0.4435271 21 | 19,0.4616012,0.46173084,0.45967415 22 | 20,0.46671286,0.46683112,0.4647799 23 | 21,0.49091575,0.49103191,0.4892095 24 | 22,0.5345532,0.53466916,0.53317034 25 | 23,0.52739257,0.5275056,0.52598923 26 | 24,0.4812145,0.48132038,0.47937903 27 | 25,0.47786388,0.47797868,0.4758911 28 | 26,0.4767261,0.476854,0.4747504 29 | 27,0.45120457,0.45133275,0.44898003 30 | 28,0.43487516,0.43499732,0.43227148 31 | 29,0.4418857,0.44200745,0.439456 32 | 30,0.45188263,0.4520089,0.44948938 33 | 31,0.44309646,0.443208,0.44067165 34 | 32,0.44934252,0.44945362,0.44696212 35 | 33,0.47058168,0.470693,0.46848273 36 | 34,0.48300824,0.4831242,0.480923 37 | 35,0.49022266,0.49034286,0.48815507 38 | 36,0.49732342,0.49744752,0.49531126 39 | 37,0.49466616,0.494789,0.49265566 40 | 38,0.4995418,0.4996657,0.49754837 41 | 39,0.5116362,0.5117548,0.50974 42 | 40,0.5169066,0.5170288,0.5150192 43 | 41,0.53604615,0.5361662,0.534255 44 | 42,0.5560917,0.5562141,0.55443686 45 | 43,0.5699871,0.5701181,0.56848437 46 | 44,0.5755175,0.5756376,0.5740404 47 | 45,0.5944691,0.59459156,0.59314805 48 | 46,0.61108196,0.6111957,0.60986704 49 | 47,0.5935245,0.59361994,0.5924131 50 | 48,0.6343621,0.6344516,0.63365686 51 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/microsoft/deberta-xlarge.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.24957183,0.24977477,0.24214219 3 | 1,0.30639872,0.3065908,0.30158743 4 | 2,0.3331396,0.33333635,0.3290473 5 | 3,0.32949522,0.32968733,0.3253696 6 | 4,0.31661382,0.316769,0.31258678 7 | 5,0.32896715,0.32910535,0.32568523 8 | 6,0.33770096,0.33782086,0.3345446 9 | 7,0.3326147,0.33272395,0.32957816 10 | 8,0.3367821,0.33687654,0.33380622 11 | 9,0.3546219,0.3547327,0.35179362 12 | 10,0.38037142,0.38049275,0.37740862 13 | 11,0.40171945,0.40185076,0.39869636 14 | 12,0.4163913,0.41652367,0.4133557 15 | 13,0.43222922,0.43235204,0.42938623 16 | 14,0.4416328,0.44175574,0.43894717 17 | 15,0.45403007,0.45415205,0.45151842 18 | 16,0.47758847,0.47770745,0.47528616 19 | 17,0.49413732,0.49424222,0.49203014 20 | 18,0.5177917,0.5178813,0.51596016 21 | 19,0.54055035,0.54061955,0.53895485 22 | 20,0.5554671,0.55553156,0.553943 23 | 21,0.5871218,0.5871978,0.585844 24 | 22,0.6379372,0.6380021,0.6369301 25 | 23,0.62672323,0.6267863,0.6256873 26 | 24,0.5497838,0.5498381,0.5483379 27 | 25,0.543943,0.5440018,0.54246646 28 | 26,0.55943567,0.55949783,0.5578509 29 | 
27,0.5522361,0.5523346,0.55041844 30 | 28,0.5384432,0.53856134,0.53645724 31 | 29,0.541011,0.5411351,0.53916043 32 | 30,0.53560615,0.5357274,0.5337449 33 | 31,0.5211553,0.5212751,0.51924247 34 | 32,0.52553123,0.52564174,0.5235451 35 | 33,0.53930295,0.5394204,0.5372786 36 | 34,0.5591909,0.55931133,0.5570341 37 | 35,0.5712996,0.5714208,0.5691194 38 | 36,0.57959074,0.57972014,0.5774709 39 | 37,0.58818644,0.5883232,0.58616716 40 | 38,0.5925551,0.5926871,0.5905953 41 | 39,0.6026835,0.60282564,0.6008043 42 | 40,0.6189861,0.6191251,0.6172279 43 | 41,0.62964463,0.62977934,0.62799156 44 | 42,0.6451681,0.64530563,0.6436409 45 | 43,0.6539978,0.65413773,0.65264153 46 | 44,0.65711796,0.65726453,0.65580934 47 | 45,0.66835105,0.66850114,0.6671609 48 | 46,0.67004806,0.6701847,0.6689483 49 | 47,0.611536,0.61166185,0.6104823 50 | 48,0.6487418,0.64883584,0.6481099 51 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.4043224,0.40432808,0.40218553 3 | 1,0.6423126,0.6422804,0.6414617 4 | 2,0.768273,0.7682535,0.76791227 5 | 3,0.7803166,0.78030443,0.7800415 6 | 4,0.7839782,0.78397924,0.7836174 7 | 5,0.7959116,0.7959033,0.79557085 8 | 6,0.80936664,0.80936354,0.80908644 9 | 7,0.81720984,0.81721514,0.816965 10 | 8,0.80465585,0.80464727,0.8043641 11 | 9,0.7911581,0.79115206,0.7908595 12 | 10,0.8146725,0.8146619,0.814463 13 | 11,0.8243949,0.8244051,0.82420003 14 | 12,0.8557132,0.85571885,0.8555707 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/roberta-large-mnli.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.36816803,0.36820343,0.3650997 3 | 1,0.6424572,0.64243424,0.6408211 4 | 2,0.62199366,0.6219771,0.62105906 5 | 3,0.65479594,0.65479946,0.6542115 6 | 4,0.66220766,0.66219413,0.66147035 7 | 5,0.6841878,0.6841976,0.6835943 8 | 6,0.6993157,0.6993184,0.698729 9 | 7,0.7363659,0.7363538,0.73597246 10 | 8,0.76699406,0.76697797,0.7666572 11 | 9,0.76385623,0.76387703,0.76359564 12 | 10,0.7751121,0.7751162,0.7748585 13 | 11,0.7607176,0.7607192,0.7604293 14 | 12,0.75846714,0.75850517,0.7582122 15 | 13,0.7660639,0.766093,0.7658386 16 | 14,0.76723933,0.7672636,0.76692307 17 | 15,0.76183504,0.7618548,0.7615043 18 | 16,0.77503896,0.7750635,0.77476084 19 | 17,0.7572284,0.75724494,0.7568846 20 | 18,0.72981,0.72983533,0.7294623 21 | 19,0.6901594,0.69018,0.6896288 22 | 20,0.6456024,0.6456534,0.6447707 23 | 21,0.6733705,0.6734108,0.672755 24 | 22,0.7964235,0.79642963,0.7961781 25 | 23,0.83942956,0.839427,0.8393037 26 | 24,0.87867236,0.8787309,0.8781039 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3712891,0.37132213,0.36826715 3 | 1,0.67176163,0.6717439,0.6703483 4 | 2,0.70031923,0.7003052,0.69969934 5 | 3,0.7080897,0.7081011,0.707698 6 | 4,0.6976306,0.69762677,0.69710517 7 | 5,0.7187199,0.71873325,0.71828526 8 | 6,0.74678195,0.74678224,0.74642223 9 | 7,0.7772428,0.7772184,0.77691925 10 | 8,0.8021733,0.8021747,0.8019093 11 | 9,0.8067641,0.80678225,0.8065291 12 | 10,0.8366976,0.8367098,0.8364913 13 | 11,0.8163513,0.816369,0.8161064 14 | 12,0.8175406,0.8175611,0.81728977 15 | 
13,0.82106245,0.8210674,0.82080233 16 | 14,0.81487834,0.8148861,0.8145652 17 | 15,0.8243552,0.8243522,0.8240494 18 | 16,0.8341641,0.8341684,0.833912 19 | 17,0.83150584,0.8314941,0.83122575 20 | 18,0.8314624,0.83146274,0.8311686 21 | 19,0.82761073,0.8276117,0.8273196 22 | 20,0.799873,0.79988,0.79956234 23 | 21,0.8082163,0.80819315,0.8079286 24 | 22,0.83196104,0.83195347,0.83174026 25 | 23,0.8408042,0.8408027,0.8405716 26 | 24,0.96022236,0.96021587,0.960168 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.2929519,0.29297927,0.28788087 3 | 1,0.32307193,0.32305866,0.31955993 4 | 2,0.33333376,0.33329934,0.3307059 5 | 3,0.34018472,0.34019333,0.3369147 6 | 4,0.35193846,0.35196185,0.34877294 7 | 5,0.41633913,0.41635182,0.41389906 8 | 6,0.52230054,0.5223191,0.5208747 9 | 7,0.57117224,0.5711975,0.57016635 10 | 8,0.55626523,0.55628437,0.55513597 11 | 9,0.5035621,0.5035617,0.5023768 12 | 10,0.43660313,0.4366135,0.43496045 13 | 11,0.37350416,0.37354943,0.3712711 14 | 12,0.3694557,0.36947483,0.36708415 15 | 13,0.38296118,0.38296735,0.38057274 16 | 14,0.3801941,0.38019708,0.37771493 17 | 15,0.39073846,0.39073724,0.38804337 18 | 16,0.27941948,0.2793937,0.27774334 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/xlm-mlm-en-2048.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.48034036,0.48027167,0.4755281 3 | 1,0.68549955,0.68547165,0.68418026 4 | 2,0.7502881,0.7502652,0.7497456 5 | 3,0.7662417,0.7662214,0.7659151 6 | 4,0.7910623,0.7910466,0.79085386 7 | 5,0.8090659,0.8090618,0.80895317 8 | 6,0.82148397,0.8214852,0.821408 9 | 7,0.8091143,0.8091184,0.8090199 10 | 8,0.77966934,0.7796406,0.77937865 11 | 9,0.75278246,0.7527972,0.7524639 12 | 10,0.72071564,0.7207407,0.7202978 13 | 11,0.7175687,0.7176211,0.7170889 14 | 12,0.22130837,0.22130068,0.21938775 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.31767526,0.31771243,0.31208947 3 | 1,0.45930108,0.45930612,0.4573549 4 | 2,0.6739723,0.6739605,0.67332643 5 | 3,0.7428563,0.7428622,0.74252146 6 | 4,0.7270618,0.7270706,0.7267292 7 | 5,0.7459538,0.7459533,0.74563044 8 | 6,0.7416182,0.74162334,0.74136156 9 | 7,0.7766629,0.7766664,0.7764565 10 | 8,0.7827196,0.78271383,0.78251594 11 | 9,0.81658614,0.8165717,0.81639785 12 | 10,0.83839214,0.83837646,0.8382293 13 | 11,0.8711623,0.8711581,0.87106025 14 | 12,0.9843661,0.98436636,0.9843645 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.38918123,0.38920417,0.3852401 3 | 1,0.66835684,0.6683084,0.6677018 4 | 2,0.7323929,0.7323684,0.7321559 5 | 3,0.7391762,0.7391537,0.73889536 6 | 4,0.7922834,0.79227173,0.7921484 7 | 5,0.79589903,0.795871,0.7957138 8 | 6,0.8166894,0.816673,0.8165898 9 | 7,0.8223533,0.8223572,0.82228154 10 | 8,0.834576,0.8345772,0.8344947 11 | 9,0.8377803,0.83777326,0.8376894 12 | 10,0.8380223,0.8380033,0.83791 13 | 11,0.8415803,0.84157884,0.8414282 14 | 
12,0.84659237,0.8466055,0.84632146 15 | 13,0.8437288,0.84372836,0.84340864 16 | 14,0.846515,0.84650415,0.8461781 17 | 15,0.8514585,0.8514379,0.85112184 18 | 16,0.84461045,0.8446081,0.8442589 19 | 17,0.85291016,0.8529066,0.8525485 20 | 18,0.8582745,0.8582787,0.85787606 21 | 19,0.85327464,0.8532746,0.85287833 22 | 20,0.86624545,0.86624,0.86592185 23 | 21,0.8854349,0.88543147,0.88515806 24 | 22,0.8891757,0.8891605,0.88892245 25 | 23,0.88805044,0.88803035,0.88777393 26 | 24,0.9840399,0.98404247,0.984038 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/xlnet-base-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.29910204,0.29919305,0.29052314 3 | 1,0.29633516,0.29640594,0.2915415 4 | 2,0.28782755,0.28787795,0.28492415 5 | 3,0.29966587,0.2996727,0.29745364 6 | 4,0.32897076,0.32897395,0.3263186 7 | 5,0.34247187,0.3424195,0.34024557 8 | 6,0.61728173,0.61718243,0.6160013 9 | 7,0.6704566,0.6703779,0.66936857 10 | 8,0.8596307,0.8595696,0.859391 11 | 9,0.8611796,0.8611522,0.8610164 12 | 10,0.89382625,0.8938215,0.8937337 13 | 11,0.97762144,0.9776183,0.97761476 14 | 12,0.93146294,0.93134,0.93100053 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/en/xlnet-large-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.41637358,0.41643414,0.41258112 3 | 1,0.32545134,0.32545993,0.3204785 4 | 2,0.29599807,0.29601985,0.29176536 5 | 3,0.21799843,0.2180424,0.21441601 6 | 4,0.2619272,0.261958,0.25913864 7 | 5,0.30362618,0.30360785,0.30147976 8 | 6,0.31371272,0.3136575,0.31170228 9 | 7,0.3085695,0.30850938,0.30676135 10 | 8,0.3251663,0.32509723,0.32402074 11 | 9,0.34611195,0.34610417,0.3449464 12 | 10,0.33172518,0.3316963,0.32996267 13 | 11,0.32673666,0.32671896,0.3252777 14 | 12,0.3015574,0.30154356,0.29979268 15 | 13,0.33127543,0.33126998,0.33017284 16 | 14,0.33191463,0.33192313,0.3307891 17 | 15,0.3753324,0.3753503,0.374231 18 | 16,0.37750244,0.37751338,0.37648135 19 | 17,0.3678608,0.3678761,0.36674905 20 | 18,0.305072,0.3050984,0.3042137 21 | 19,0.42524177,0.4253285,0.42387673 22 | 20,0.59149736,0.59153783,0.5901478 23 | 21,0.6070587,0.607099,0.6057612 24 | 22,0.80884385,0.80882186,0.8085461 25 | 23,0.9555436,0.9555404,0.95551467 26 | 24,0.96873486,0.9687297,0.9685215 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/es/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.32142487,0.32125905,0.31729683 3 | 1,0.39584324,0.395717,0.39326182 4 | 2,0.3895418,0.38945207,0.38716727 5 | 3,0.47731403,0.47727716,0.47604948 6 | 4,0.5232235,0.5231792,0.52232313 7 | 5,0.5989939,0.59892774,0.59843445 8 | 6,0.6496523,0.6496062,0.649302 9 | 7,0.5524209,0.5523591,0.55184853 10 | 8,0.52988493,0.5298184,0.52922106 11 | 9,0.63474494,0.6346978,0.6342529 12 | 10,0.70397323,0.7039352,0.703585 13 | 11,0.7417414,0.74173224,0.74136305 14 | 12,0.39257455,0.39254928,0.39194846 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/es/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.29741356,0.29718205,0.29312935 3 | 1,0.33412832,0.33395684,0.3312904 4 | 
2,0.35136887,0.35124466,0.3492788 5 | 3,0.36096326,0.3608026,0.35864976 6 | 4,0.36783966,0.36770988,0.36555293 7 | 5,0.4318944,0.4317502,0.4300937 8 | 6,0.54022354,0.54010266,0.5391772 9 | 7,0.5873484,0.5872481,0.58660454 10 | 8,0.56757474,0.5674764,0.566725 11 | 9,0.50883144,0.5087277,0.5079181 12 | 10,0.43789023,0.43777642,0.4366415 13 | 11,0.37517586,0.37504935,0.3734603 14 | 12,0.37935427,0.37921786,0.37755096 15 | 13,0.39596176,0.39583504,0.39407465 16 | 14,0.40488854,0.4047234,0.40284988 17 | 15,0.41720447,0.41700417,0.41506332 18 | 16,0.321014,0.32089671,0.31943843 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/es/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3246688,0.32442638,0.31979698 3 | 1,0.4669744,0.46682546,0.46536762 4 | 2,0.682952,0.68287796,0.6824639 5 | 3,0.75232756,0.7522827,0.7520721 6 | 4,0.73857796,0.73851913,0.73830944 7 | 5,0.7549688,0.7549195,0.75471216 8 | 6,0.7463499,0.74629426,0.7461334 9 | 7,0.7811989,0.78114533,0.78101724 10 | 8,0.78642476,0.7863655,0.7862384 11 | 9,0.8234212,0.823385,0.8232284 12 | 10,0.8446837,0.8446493,0.8445056 13 | 11,0.87540615,0.8753815,0.8752877 14 | 12,0.9844347,0.9844323,0.9844318 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/es/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.39393866,0.39371517,0.3905179 3 | 1,0.6807716,0.6807661,0.680328 4 | 2,0.7418765,0.74186456,0.74167407 5 | 3,0.74935234,0.7493611,0.7491301 6 | 4,0.79821396,0.79822713,0.7980995 7 | 5,0.7988987,0.7989139,0.7987521 8 | 6,0.8229017,0.8228938,0.8228024 9 | 7,0.8280001,0.8279914,0.8279237 10 | 8,0.8397697,0.8397626,0.8396876 11 | 9,0.8410181,0.8410066,0.84094054 12 | 10,0.8409921,0.8409992,0.8409067 13 | 11,0.8431543,0.8431424,0.84302104 14 | 12,0.8459719,0.84595364,0.84571356 15 | 13,0.8396326,0.839628,0.83931595 16 | 14,0.84028465,0.84028375,0.83993286 17 | 15,0.8447372,0.84472674,0.8444034 18 | 16,0.8363781,0.8363222,0.8360513 19 | 17,0.84482056,0.8447689,0.8445116 20 | 18,0.85074264,0.85068643,0.8504014 21 | 19,0.84944814,0.8493866,0.8491228 22 | 20,0.86171687,0.86166567,0.8614421 23 | 21,0.87797874,0.8779322,0.8777276 24 | 22,0.87815136,0.87810516,0.87791014 25 | 23,0.87712365,0.87708676,0.8768752 26 | 24,0.9812538,0.9812441,0.9812441 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/et/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.27440318,0.27450797,0.2698814 3 | 1,0.36711293,0.3672066,0.3646441 4 | 2,0.36751607,0.36758184,0.36546195 5 | 3,0.44396114,0.4440282,0.44275236 6 | 4,0.49434176,0.49438694,0.49351478 7 | 5,0.5781191,0.57814497,0.57762396 8 | 6,0.6325188,0.63253754,0.63219965 9 | 7,0.5371272,0.5371553,0.53662723 10 | 8,0.51365125,0.5136854,0.5130298 11 | 9,0.61113626,0.6111767,0.6106605 12 | 10,0.68986833,0.6898959,0.6895253 13 | 11,0.72481495,0.7248488,0.72443366 14 | 12,0.41427994,0.414279,0.41360843 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/et/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 
0,0.22709163,0.2271243,0.22315507 3 | 1,0.28697282,0.28699732,0.284561 4 | 2,0.31591207,0.31594896,0.3142282 5 | 3,0.3272068,0.3271873,0.3251662 6 | 4,0.33797315,0.33791435,0.3357934 7 | 5,0.39506105,0.39499047,0.39325175 8 | 6,0.49566302,0.4955908,0.49454838 9 | 7,0.55213124,0.5520629,0.55135715 10 | 8,0.5356107,0.53553146,0.53473157 11 | 9,0.48094663,0.4808736,0.4799986 12 | 10,0.41156343,0.41149083,0.410293 13 | 11,0.36135536,0.36126482,0.3597544 14 | 12,0.3840661,0.38395354,0.3824061 15 | 13,0.3990762,0.39895925,0.39723954 16 | 14,0.40530387,0.40517297,0.40322375 17 | 15,0.41519368,0.41508782,0.4130928 18 | 16,0.3316055,0.33152688,0.32986996 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/et/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.27967498,0.27960515,0.27565888 3 | 1,0.4284523,0.4284252,0.42707014 4 | 2,0.6428217,0.64281446,0.6423762 5 | 3,0.7207139,0.72071636,0.72050244 6 | 4,0.7149051,0.7149116,0.7146955 7 | 5,0.7364546,0.7364631,0.73624396 8 | 6,0.72894406,0.7289576,0.7287705 9 | 7,0.76335233,0.76335174,0.7631944 10 | 8,0.7660467,0.7660525,0.76588887 11 | 9,0.80481553,0.8047997,0.8046123 12 | 10,0.824247,0.8242213,0.8240452 13 | 11,0.8616431,0.8616192,0.8615053 14 | 12,0.9794287,0.9794275,0.9794225 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/et/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.34819788,0.3482508,0.34528664 3 | 1,0.66133285,0.6613388,0.66087705 4 | 2,0.7167448,0.71674955,0.71656334 5 | 3,0.7235102,0.7235182,0.7232822 6 | 4,0.7763208,0.77634054,0.7762066 7 | 5,0.7787467,0.77879304,0.77860975 8 | 6,0.8065161,0.8065541,0.8064417 9 | 7,0.8130386,0.8130481,0.81297535 10 | 8,0.8232221,0.8232352,0.82315505 11 | 9,0.8259885,0.8259966,0.82592285 12 | 10,0.8261345,0.8261378,0.82604396 13 | 11,0.8302732,0.83030087,0.8301369 14 | 12,0.832208,0.8322509,0.83195096 15 | 13,0.8284099,0.82843494,0.8280566 16 | 14,0.8308385,0.830874,0.83043313 17 | 15,0.83593214,0.83598274,0.8355737 18 | 16,0.8225831,0.8226431,0.8222514 19 | 17,0.83149856,0.83155996,0.83119583 20 | 18,0.8360739,0.83612305,0.83573186 21 | 19,0.8338515,0.8339162,0.8335273 22 | 20,0.85045713,0.8505154,0.8501959 23 | 21,0.866938,0.8670008,0.86669517 24 | 22,0.86754334,0.86759776,0.86728114 25 | 23,0.86495036,0.86501676,0.86465526 26 | 24,0.97565717,0.97566575,0.97565126 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/fi/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.27203143,0.2718125,0.2674527 3 | 1,0.3669008,0.36672932,0.36434102 4 | 2,0.36613643,0.36596906,0.36401412 5 | 3,0.4369806,0.43683773,0.43563512 6 | 4,0.4888657,0.48875853,0.48795715 7 | 5,0.5726952,0.5726454,0.572158 8 | 6,0.62713367,0.62711185,0.6267881 9 | 7,0.5336007,0.53355575,0.53305566 10 | 8,0.51138526,0.51132864,0.51072747 11 | 9,0.6112424,0.6111909,0.6107369 12 | 10,0.6913106,0.6912809,0.6909531 13 | 11,0.7289148,0.7289066,0.7285409 14 | 12,0.40449622,0.4044448,0.4038471 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/fi/xlm-mlm-100-1280.tsv: 
-------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.23438127,0.23428068,0.23040625 3 | 1,0.2891474,0.2890501,0.28663626 4 | 2,0.31794775,0.3178401,0.3161117 5 | 3,0.3314175,0.3313274,0.32932603 6 | 4,0.342742,0.34266472,0.34063184 7 | 5,0.40328184,0.40322024,0.40158102 8 | 6,0.5053177,0.5052804,0.50429296 9 | 7,0.55995744,0.5599387,0.55925107 10 | 8,0.5432386,0.5432242,0.5424414 11 | 9,0.48718062,0.4871476,0.48624423 12 | 10,0.41743338,0.41739362,0.4161943 13 | 11,0.36450592,0.36447832,0.3629536 14 | 12,0.38068864,0.38065174,0.37914556 15 | 13,0.40042648,0.40037584,0.39876702 16 | 14,0.40577888,0.405764,0.40404075 17 | 15,0.4122403,0.412242,0.4104758 18 | 16,0.32324278,0.3231793,0.3216624 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/fi/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.28559503,0.2854355,0.28159738 3 | 1,0.4194148,0.41935146,0.417895 4 | 2,0.6369165,0.63687444,0.63647 5 | 3,0.7129336,0.71288896,0.71269244 6 | 4,0.705694,0.7056649,0.7054607 7 | 5,0.7278231,0.72779924,0.7275826 8 | 6,0.7264064,0.72638345,0.72620934 9 | 7,0.76126385,0.7612437,0.7610952 10 | 8,0.76516724,0.76513124,0.76499206 11 | 9,0.8022079,0.8021703,0.8020057 12 | 10,0.8249256,0.8248923,0.824736 13 | 11,0.86274844,0.8627164,0.862623 14 | 12,0.98083913,0.9808375,0.9808362 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/fi/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.36092442,0.36074498,0.3580085 3 | 1,0.6607407,0.66068447,0.66028255 4 | 2,0.7176594,0.7175985,0.71745765 5 | 3,0.7285679,0.72852075,0.72832286 6 | 4,0.78595924,0.7859272,0.7858384 7 | 5,0.7865282,0.7864904,0.7863824 8 | 6,0.8087826,0.80874693,0.80867803 9 | 7,0.81360877,0.8135691,0.8135222 10 | 8,0.8234922,0.82345206,0.82339203 11 | 9,0.82659143,0.8265619,0.8265032 12 | 10,0.82844096,0.8284168,0.8283329 13 | 11,0.833159,0.83313984,0.83300036 14 | 12,0.83688194,0.8368595,0.8365941 15 | 13,0.83482826,0.8348066,0.83445275 16 | 14,0.8371448,0.8371316,0.8367094 17 | 15,0.8411402,0.8411226,0.8407152 18 | 16,0.8285362,0.828508,0.8281295 19 | 17,0.8365054,0.8364763,0.8361323 20 | 18,0.84074885,0.84073424,0.8403675 21 | 19,0.8374997,0.8374846,0.83713585 22 | 20,0.85316974,0.8531484,0.85286206 23 | 21,0.8724993,0.87247324,0.87221074 24 | 22,0.87472016,0.87468535,0.87441224 25 | 23,0.8715076,0.8714718,0.8711863 26 | 24,0.97793525,0.97793806,0.9779296 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/fr/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3169629,0.31686378,0.31290925 3 | 1,0.3973607,0.39730093,0.3948981 4 | 2,0.3917096,0.39167702,0.38944873 5 | 3,0.471558,0.4715446,0.4703017 6 | 4,0.51729333,0.5172892,0.5164051 7 | 5,0.5921461,0.59214556,0.59160614 8 | 6,0.64118487,0.6411703,0.64082944 9 | 7,0.54434645,0.5443365,0.5437896 10 | 8,0.52369165,0.5237088,0.5230594 11 | 9,0.62573117,0.62573653,0.6252499 12 | 10,0.69342446,0.6934141,0.6930288 13 | 11,0.72644377,0.72643,0.7260432 14 | 12,0.37622055,0.3762342,0.37555423 15 | -------------------------------------------------------------------------------- 
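All of these baseline tables share one layout: one row per encoder layer, holding the mean random-pair P, R, and F scores used for rescaling, as described in journal/rescale_baseline.md further below. A minimal sketch of applying such a table by hand follows; the file path, chosen layer, and raw scores are illustrative assumptions, and this is not the library's internal code path (that is enabled with `rescale_with_baseline=True`):

```python
# Minimal sketch: rescaling raw BERTScore values with one of the baseline
# tables in this folder. Despite the .tsv extension, the files are
# comma-separated, so pandas' default read_csv works. The path, layer,
# and raw scores below are illustrative assumptions.
import pandas as pd

baselines = pd.read_csv(
    "bert_score/bert_score/rescale_baseline/fr/xlm-roberta-large.tsv"
).set_index("LAYER")

layer = 17                                # layer that produced the raw scores
raw = {"P": 0.93, "R": 0.92, "F": 0.925}  # hypothetical raw BERTScore values

# Linear rescaling from journal/rescale_baseline.md: (x - b) / (1 - b)
rescaled = {
    m: (x - baselines.loc[layer, m]) / (1 - baselines.loc[layer, m])
    for m, x in raw.items()
}
print(rescaled)  # rescaled values now typically fall between 0 and 1
```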
/bert_score/bert_score/rescale_baseline/fr/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.29251722,0.29243135,0.28824046 3 | 1,0.3270653,0.32702345,0.32414123 4 | 2,0.34298986,0.34297037,0.34081614 5 | 3,0.35257423,0.35255608,0.3502542 6 | 4,0.36079553,0.36077785,0.35852012 7 | 5,0.4250942,0.425059,0.42333168 8 | 6,0.5288226,0.5288088,0.5278067 9 | 7,0.57518166,0.5751787,0.5744667 10 | 8,0.5556386,0.5556409,0.554816 11 | 9,0.50031036,0.50027037,0.49935982 12 | 10,0.431764,0.43173033,0.43051916 13 | 11,0.3727856,0.37272698,0.37108865 14 | 12,0.3785679,0.37849474,0.37677515 15 | 13,0.3937992,0.39371702,0.39193982 16 | 14,0.3963082,0.39625022,0.3943681 17 | 15,0.40861925,0.408575,0.40663382 18 | 16,0.3136189,0.3135873,0.3120113 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/fr/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.32206962,0.32194686,0.31745258 3 | 1,0.4602701,0.46020856,0.45864564 4 | 2,0.6719199,0.6718807,0.67143625 5 | 3,0.74045163,0.74043214,0.74017817 6 | 4,0.72625005,0.7262382,0.72596425 7 | 5,0.74321467,0.7431929,0.7429401 8 | 6,0.73884493,0.7388181,0.7386279 9 | 7,0.77495724,0.77493846,0.77478385 10 | 8,0.78073204,0.7807058,0.7805547 11 | 9,0.8198895,0.81987315,0.81971085 12 | 10,0.84097534,0.84096044,0.8408151 13 | 11,0.8744024,0.87438446,0.8742934 14 | 12,0.98294896,0.98294634,0.9829455 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/fr/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.38867676,0.38860926,0.38541874 3 | 1,0.6748494,0.6748672,0.6744121 4 | 2,0.7362502,0.7362605,0.73607147 5 | 3,0.74364084,0.7436084,0.7434166 6 | 4,0.79383326,0.79382324,0.7937171 7 | 5,0.79398805,0.7939688,0.79384273 8 | 6,0.8189012,0.818897,0.81879824 9 | 7,0.8252554,0.8252701,0.82518923 10 | 8,0.8372976,0.8373069,0.837221 11 | 9,0.83934426,0.8393484,0.8392743 12 | 10,0.8396223,0.8396263,0.8395396 13 | 11,0.8413963,0.8414122,0.84128094 14 | 12,0.8425236,0.84252983,0.8422956 15 | 13,0.836232,0.8362653,0.8359306 16 | 14,0.8365411,0.8365994,0.83620155 17 | 15,0.84075475,0.84081256,0.84043586 18 | 16,0.8336484,0.83366156,0.83334255 19 | 17,0.8420401,0.84204596,0.84175515 20 | 18,0.84736043,0.847369,0.8470594 21 | 19,0.8457147,0.84572095,0.84543604 22 | 20,0.8610545,0.86105347,0.8608192 23 | 21,0.8796009,0.87960935,0.87939256 24 | 22,0.87826204,0.8782994,0.8780729 25 | 23,0.8757684,0.8757959,0.8755639 26 | 24,0.9783308,0.9783405,0.9783287 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/it/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.2844219,0.28444692,0.28044853 3 | 1,0.37012622,0.37011567,0.3678241 4 | 2,0.37155172,0.3715547,0.36946896 5 | 3,0.4603244,0.46031958,0.45919034 6 | 4,0.50872415,0.5087325,0.50791526 7 | 5,0.5868436,0.5868716,0.5863534 8 | 6,0.6397911,0.63983333,0.63949335 9 | 7,0.5409238,0.54094136,0.54040617 10 | 8,0.5172371,0.51725966,0.5166258 11 | 9,0.62051994,0.6205607,0.62006223 12 | 10,0.6916372,0.6916744,0.69127834 13 | 11,0.7267179,0.72675204,0.7263427 14 | 12,0.38121554,0.38125327,0.38057736 15 | 
-------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/it/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.26145306,0.26147375,0.25741524 3 | 1,0.3025566,0.30248818,0.2998916 4 | 2,0.32179558,0.32175842,0.31985885 5 | 3,0.33394024,0.33392504,0.33166507 6 | 4,0.34498307,0.3450014,0.34262 7 | 5,0.41246715,0.4124828,0.41061035 8 | 6,0.51987356,0.51987606,0.51879877 9 | 7,0.56968486,0.5696755,0.56891495 10 | 8,0.5526059,0.55259466,0.55172443 11 | 9,0.49650237,0.49646214,0.49543175 12 | 10,0.42862728,0.42857316,0.42718053 13 | 11,0.36833626,0.36827257,0.3663598 14 | 12,0.37506276,0.3750053,0.37302044 15 | 13,0.38766044,0.38759127,0.3855202 16 | 14,0.39820743,0.39815167,0.39614087 17 | 15,0.4081781,0.40812454,0.4060677 18 | 16,0.31196377,0.31188112,0.31034505 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/it/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.29116568,0.29114744,0.2868295 3 | 1,0.44197133,0.4419524,0.4405804 4 | 2,0.6624113,0.662375,0.661958 5 | 3,0.73566717,0.7356161,0.73541015 6 | 4,0.72424763,0.72419375,0.7239804 7 | 5,0.74316144,0.743101,0.7429021 8 | 6,0.7358866,0.7358457,0.7356837 9 | 7,0.7717992,0.77175343,0.7716246 10 | 8,0.77671385,0.77666664,0.7765362 11 | 9,0.8156109,0.8155815,0.81542325 12 | 10,0.8353943,0.8353729,0.83522326 13 | 11,0.8693978,0.86938006,0.86928266 14 | 12,0.98234653,0.98234344,0.9823433 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/it/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3594449,0.35943142,0.35630783 3 | 1,0.662669,0.662666,0.66222733 4 | 2,0.7278628,0.72787654,0.727691 5 | 3,0.7365745,0.73659885,0.7363675 6 | 4,0.7886599,0.7886995,0.7885636 7 | 5,0.7880371,0.7880777,0.7879012 8 | 6,0.8136995,0.813729,0.8136207 9 | 7,0.8208896,0.8209284,0.82083935 10 | 8,0.83259714,0.8326411,0.8325414 11 | 9,0.83512545,0.83517814,0.83508295 12 | 10,0.8349007,0.8349598,0.83484215 13 | 11,0.8370682,0.83713394,0.83697623 14 | 12,0.83735925,0.8374388,0.83716834 15 | 13,0.8307876,0.8308769,0.8305337 16 | 14,0.8304336,0.83052474,0.83013064 17 | 15,0.8350196,0.83511585,0.8347356 18 | 16,0.8262828,0.8263541,0.82602215 19 | 17,0.8352246,0.83529,0.8349814 20 | 18,0.8413706,0.8414452,0.84111327 21 | 19,0.84041846,0.84048223,0.840178 22 | 20,0.85462093,0.8546788,0.854426 23 | 21,0.8733275,0.87337714,0.8731498 24 | 22,0.87235314,0.8724075,0.87218434 25 | 23,0.86924857,0.86931163,0.8690715 26 | 24,0.97641337,0.9764174,0.97640705 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/lv/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.31880563,0.31885436,0.314444 3 | 1,0.39939553,0.399461,0.39680958 4 | 2,0.39936826,0.39942977,0.39705652 5 | 3,0.4639698,0.46403775,0.462585 6 | 4,0.51133174,0.511391,0.5104017 7 | 5,0.58995867,0.59001076,0.5894416 8 | 6,0.64041185,0.6404576,0.640104 9 | 7,0.5489481,0.5489947,0.5485002 10 | 8,0.5241059,0.5241476,0.5235563 11 | 9,0.61489826,0.6149375,0.614489 12 | 10,0.69464105,0.6946774,0.69437844 13 | 
11,0.73005176,0.7301036,0.72975814 14 | 12,0.42655912,0.42657772,0.42596325 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/lv/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.25295407,0.2529038,0.24914542 3 | 1,0.30763087,0.30758235,0.3050838 4 | 2,0.3358753,0.33583683,0.33403713 5 | 3,0.35062265,0.35062423,0.34857833 6 | 4,0.36368594,0.36370537,0.36166307 7 | 5,0.4208051,0.42084384,0.41921085 8 | 6,0.52163017,0.5216751,0.5207288 9 | 7,0.5748712,0.57491165,0.57426846 10 | 8,0.5565561,0.55660844,0.55587536 11 | 9,0.50083,0.50086135,0.5000541 12 | 10,0.4287173,0.42873642,0.4276349 13 | 11,0.37965864,0.37967306,0.37825358 14 | 12,0.407949,0.40795445,0.40652746 15 | 13,0.43800756,0.437995,0.43645307 16 | 14,0.45024598,0.4502631,0.4485536 17 | 15,0.45746338,0.45749146,0.4557482 18 | 16,0.38742596,0.38743827,0.3859296 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/lv/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3158023,0.31595236,0.31170404 3 | 1,0.44148916,0.44161677,0.43999764 4 | 2,0.65698195,0.65707916,0.65660477 5 | 3,0.7291459,0.72921735,0.72896385 6 | 4,0.72035086,0.720424,0.72016907 7 | 5,0.7387083,0.73877054,0.73851764 8 | 6,0.7331035,0.7331564,0.73294306 9 | 7,0.7675076,0.767555,0.7673717 10 | 8,0.7721929,0.7722387,0.77205515 11 | 9,0.8134348,0.81347644,0.81327385 12 | 10,0.8337028,0.8337392,0.8335502 13 | 11,0.86931133,0.869342,0.86921614 14 | 12,0.98048294,0.9804859,0.980481 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/lv/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.38303047,0.38316694,0.38004494 3 | 1,0.6715437,0.67160815,0.671146 4 | 2,0.72935355,0.7293971,0.7292059 5 | 3,0.7367963,0.7368109,0.73659825 6 | 4,0.78665257,0.78668237,0.7865436 7 | 5,0.7876893,0.7877102,0.7875555 8 | 6,0.8118788,0.811887,0.8117912 9 | 7,0.81852704,0.8185447,0.81847227 10 | 8,0.82763994,0.8276652,0.8275769 11 | 9,0.829965,0.8299915,0.8299079 12 | 10,0.8325733,0.8325909,0.832492 13 | 11,0.83627987,0.8362996,0.8361541 14 | 12,0.83918196,0.8392273,0.8389481 15 | 13,0.83649516,0.8365421,0.83620304 16 | 14,0.8397933,0.8398624,0.8394616 17 | 15,0.84358793,0.8436498,0.8432716 18 | 16,0.83227086,0.8323361,0.8319669 19 | 17,0.83968145,0.83974457,0.8394039 20 | 18,0.8447246,0.84477955,0.8444194 21 | 19,0.8420644,0.8421077,0.84175897 22 | 20,0.85892874,0.85897714,0.85869175 23 | 21,0.8784681,0.87850714,0.8782475 24 | 22,0.8812008,0.8812461,0.8809788 25 | 23,0.8809998,0.88104856,0.8807619 26 | 24,0.9765009,0.9765086,0.9764968 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/pt/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.29287484,0.29276612,0.28896153 3 | 1,0.37047997,0.3703816,0.36809015 4 | 2,0.36899555,0.36891612,0.36687937 5 | 3,0.46113333,0.46108902,0.46000266 6 | 4,0.5105716,0.5105592,0.5097696 7 | 5,0.5895024,0.5895065,0.5890118 8 | 6,0.6431079,0.6431254,0.64279854 9 | 7,0.5462664,0.54628617,0.5457523 10 | 8,0.5256067,0.5256172,0.5249814 11 | 
9,0.6314677,0.6314837,0.63099706 12 | 10,0.70045394,0.7004913,0.7001096 13 | 11,0.73772144,0.7377826,0.73737544 14 | 12,0.376468,0.37648788,0.37585765 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/pt/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.27077773,0.27081063,0.26688486 3 | 1,0.31458914,0.31458747,0.3120791 4 | 2,0.33407974,0.3341051,0.33230388 5 | 3,0.3424346,0.34244755,0.3402508 6 | 4,0.35230166,0.35230178,0.35003006 7 | 5,0.4220341,0.42202395,0.4202843 8 | 6,0.53116757,0.53115034,0.530122 9 | 7,0.5820105,0.5819889,0.5812692 10 | 8,0.5657533,0.56574196,0.56491727 11 | 9,0.5071269,0.5071374,0.5062459 12 | 10,0.43731558,0.43734074,0.43611565 13 | 11,0.37808847,0.37813658,0.37643093 14 | 12,0.38327742,0.3833187,0.38146868 15 | 13,0.39855412,0.39860153,0.3966159 16 | 14,0.40502536,0.4050431,0.4029639 17 | 15,0.4187931,0.4188173,0.41667226 18 | 16,0.3218223,0.32182986,0.32025683 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/pt/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.302053,0.30184427,0.2976399 3 | 1,0.45684627,0.45677197,0.45546424 4 | 2,0.6735796,0.6735233,0.6731233 5 | 3,0.74322075,0.74317265,0.74297166 6 | 4,0.72940576,0.72935677,0.72914743 7 | 5,0.74814695,0.748095,0.74789345 8 | 6,0.7392128,0.739171,0.7390097 9 | 7,0.7750178,0.774979,0.7748446 10 | 8,0.7798485,0.77981144,0.779676 11 | 9,0.81777656,0.8177529,0.8175885 12 | 10,0.83894795,0.8389314,0.838778 13 | 11,0.870458,0.87044656,0.8703426 14 | 12,0.9830619,0.98306423,0.98306143 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/pt/xlm-roberta-large.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3740441,0.37399486,0.3710023 3 | 1,0.67301875,0.6729773,0.6725724 4 | 2,0.7341082,0.73409325,0.733912 5 | 3,0.74055076,0.7405203,0.7403056 6 | 4,0.7904661,0.79042983,0.79032314 7 | 5,0.7880771,0.78803754,0.78790236 8 | 6,0.81661665,0.8166169,0.81652534 9 | 7,0.8221869,0.82219744,0.82212555 10 | 8,0.8350775,0.83508027,0.8350043 11 | 9,0.8372719,0.83726805,0.8372026 12 | 10,0.8372136,0.8371918,0.8371133 13 | 11,0.8399054,0.8398653,0.83975667 14 | 12,0.84060127,0.8405483,0.84033316 15 | 13,0.8341999,0.8341561,0.83385843 16 | 14,0.83416283,0.83414257,0.8337824 17 | 15,0.8384014,0.83838236,0.8380531 18 | 16,0.8296981,0.82966036,0.8293861 19 | 17,0.83966845,0.8396195,0.8393744 20 | 18,0.84589136,0.8458346,0.845562 21 | 19,0.84492606,0.8448792,0.8446221 22 | 20,0.8584489,0.8584082,0.858195 23 | 21,0.87398726,0.8739412,0.87374836 24 | 22,0.8719638,0.8719428,0.8717388 25 | 23,0.87165064,0.87161326,0.87140054 26 | 24,0.97964114,0.9796477,0.979639 27 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/zh/bert-base-chinese.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.2786982,0.2785878,0.27494037 3 | 1,0.33671036,0.33662596,0.33471334 4 | 2,0.42845756,0.4284101,0.4273608 5 | 3,0.45149758,0.45147166,0.45057997 6 | 4,0.5184017,0.5184023,0.51783705 7 | 5,0.573508,0.5734958,0.57311326 8 | 6,0.6330495,0.6330315,0.63276017 9 | 7,0.59864044,0.5986131,0.59829366 10 | 
8,0.54804957,0.5480091,0.54755783 11 | 9,0.51617336,0.516132,0.5156478 12 | 10,0.5561151,0.55609417,0.55573994 13 | 11,0.5984755,0.5984512,0.5981564 14 | 12,0.56038475,0.5603337,0.5599188 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/zh/bert-base-multilingual-cased.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.3118959,0.31177446,0.3086432 3 | 1,0.3425565,0.34244964,0.3399823 4 | 2,0.35352883,0.35343447,0.35129714 5 | 3,0.43610418,0.43604368,0.43494177 6 | 4,0.489178,0.4891102,0.48830378 7 | 5,0.5690116,0.5689432,0.5684761 8 | 6,0.6265541,0.6264865,0.6262059 9 | 7,0.54113525,0.5410629,0.54064935 10 | 8,0.5284168,0.52834535,0.5279011 11 | 9,0.62840384,0.62833464,0.62803453 12 | 10,0.69999313,0.69992936,0.6997184 13 | 11,0.732485,0.73242646,0.73219264 14 | 12,0.37793094,0.37789607,0.3773833 15 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/zh/xlm-mlm-100-1280.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.32776257,0.3276813,0.32470536 3 | 1,0.3356181,0.3355479,0.332959 4 | 2,0.35034394,0.35026166,0.3482373 5 | 3,0.36442822,0.36435747,0.36224666 6 | 4,0.3771403,0.37707978,0.37516475 7 | 5,0.43258497,0.4325344,0.43104237 8 | 6,0.5181599,0.5181224,0.5172326 9 | 7,0.5792645,0.57922333,0.57866186 10 | 8,0.5692134,0.5691731,0.56858486 11 | 9,0.5324812,0.5324232,0.53178775 12 | 10,0.47810394,0.47805268,0.47723517 13 | 11,0.4319199,0.43188363,0.43088776 14 | 12,0.44747546,0.447443,0.44653583 15 | 13,0.45633683,0.4563076,0.45531917 16 | 14,0.45723236,0.457195,0.45610127 17 | 15,0.46675017,0.46670267,0.4656479 18 | 16,0.40051928,0.40046176,0.39960644 19 | -------------------------------------------------------------------------------- /bert_score/bert_score/rescale_baseline/zh/xlm-roberta-base.tsv: -------------------------------------------------------------------------------- 1 | LAYER,P,R,F 2 | 0,0.36188287,0.36180493,0.35862362 3 | 1,0.4372344,0.43716717,0.43550655 4 | 2,0.64521,0.64515334,0.6446227 5 | 3,0.734053,0.7340016,0.7337482 6 | 4,0.730163,0.73011726,0.72988415 7 | 5,0.7542184,0.7541747,0.7539484 8 | 6,0.7611062,0.7610684,0.76089287 9 | 7,0.79163146,0.7915949,0.79145956 10 | 8,0.79859376,0.79856044,0.7984367 11 | 9,0.82988167,0.8298588,0.82975745 12 | 10,0.8522986,0.8522761,0.8521975 13 | 11,0.8852355,0.88521546,0.88517046 14 | 12,0.98287344,0.98286974,0.9828698 15 | -------------------------------------------------------------------------------- /bert_score/bert_score_cli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/bert_score_cli/__init__.py -------------------------------------------------------------------------------- /bert_score/bert_score_cli/score.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import argparse 4 | import torch 5 | 6 | import bert_score 7 | from add_context import add_context 8 | 9 | def main(): 10 | torch.multiprocessing.set_sharing_strategy("file_system") 11 | 12 | parser = argparse.ArgumentParser("Calculate BERTScore") 13 | parser.add_argument( 14 | "--lang", 15 | type=str, 16 | default=None, 17 | help='two-letter abbreviation of the language 
(e.g., en) or "en-sci" for scientific text', 18 | ) 19 | parser.add_argument( 20 | "-m", "--model", default=None, help="BERT model name (default: bert-base-uncased) or path to a pretrain model", 21 | ) 22 | parser.add_argument("-l", "--num_layers", type=int, default=None, help="use first N layer in BERT (default: 8)") 23 | parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size (default: 64)") 24 | parser.add_argument("--nthreads", type=int, default=4, help="number of cpu workers (default: 4)") 25 | parser.add_argument("--idf", action="store_true", help="BERT Score with IDF scaling") 26 | parser.add_argument( 27 | "--rescale_with_baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines", 28 | ) 29 | parser.add_argument("--baseline_path", default=None, type=str, help="path of custom baseline csv file") 30 | parser.add_argument("--use_fast_tokenizer", action="store_false", help="whether to use HF fast tokenizer") 31 | parser.add_argument("-s", "--seg_level", action="store_true", help="show individual score of each pair") 32 | parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity") 33 | parser.add_argument("-r", "--ref", type=str, nargs="+", required=True, help="reference file path(s) or a string") 34 | parser.add_argument( 35 | "-c", "--cand", type=str, required=True, help="candidate (system outputs) file path or a string", 36 | ) 37 | parser.add_argument("--doc", type=str, default=None, help="File containing document IDs to evaluate at the document level.") 38 | 39 | args = parser.parse_args() 40 | 41 | if args.doc: 42 | print('Running at document level') 43 | with open(args.doc, encoding="utf-8") as fp: 44 | doc_ids = [line.strip() for line in fp.readlines()] 45 | assert not args.idf, "do not support idf mode for document-level evaluation" 46 | 47 | if os.path.isfile(args.cand): 48 | with open(args.cand) as f: 49 | cands = [line.strip() for line in f] 50 | 51 | refs = [] 52 | for ref_file in args.ref: 53 | assert os.path.exists(ref_file), f"reference file {ref_file} doesn't exist" 54 | with open(ref_file) as f: 55 | curr_ref = [line.strip() for line in f] 56 | assert len(curr_ref) == len(cands), f"# of sentences in {ref_file} doesn't match the # of candidates" 57 | if args.doc: 58 | sep_token = "[SEP]" if args.lang != "en" else "" 59 | sent_ref = curr_ref 60 | curr_ref = add_context(orig_txt=curr_ref, context=curr_ref, doc_ids=doc_ids, sep_token=sep_token) 61 | refs.append(curr_ref) 62 | refs = list(zip(*refs)) 63 | elif os.path.isfile(args.ref[0]): 64 | assert os.path.exists(args.cand), f"candidate file {args.cand} doesn't exist" 65 | else: 66 | cands = [args.cand] 67 | refs = [args.ref] 68 | assert not args.idf, "do not support idf mode for a single pair of sentences" 69 | 70 | if args.doc: 71 | print('Adding reference context to MT') 72 | cands = add_context(orig_txt=cands, context=sent_ref, doc_ids=doc_ids, sep_token=sep_token) 73 | 74 | all_preds, hash_code = bert_score.score( 75 | cands, 76 | refs, 77 | model_type=args.model, 78 | num_layers=args.num_layers, 79 | verbose=args.verbose, 80 | idf=args.idf, 81 | batch_size=args.batch_size, 82 | lang=args.lang, 83 | return_hash=True, 84 | rescale_with_baseline=args.rescale_with_baseline, 85 | baseline_path=args.baseline_path, 86 | use_fast_tokenizer=args.use_fast_tokenizer, 87 | doc=True if args.doc else False 88 | ) 89 | avg_scores = [s.mean(dim=0) for s in all_preds] 90 | P = avg_scores[0].cpu().item() 91 | R = avg_scores[1].cpu().item() 
92 | F1 = avg_scores[2].cpu().item() 93 | msg = hash_code + f" P: {P:.6f} R: {R:.6f} F1: {F1:.6f}" 94 | print(msg) 95 | if args.seg_level: 96 | ps, rs, fs = all_preds 97 | for p, r, f in zip(ps, rs, fs): 98 | print("{:.6f}\t{:.6f}\t{:.6f}".format(p, r, f)) 99 | 100 | 101 | if __name__ == "__main__": 102 | main() 103 | -------------------------------------------------------------------------------- /bert_score/bert_score_cli/visualize.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import time 4 | import argparse 5 | import torch 6 | from collections import defaultdict 7 | 8 | import bert_score 9 | 10 | 11 | def main(): 12 | torch.multiprocessing.set_sharing_strategy("file_system") 13 | 14 | parser = argparse.ArgumentParser("Visualize BERTScore") 15 | parser.add_argument("--lang", type=str, default="en", help="two-letter abbreviation of the language (e.g., en)") 16 | parser.add_argument("-m", "--model", default=None, help="BERT model name (default: bert-base-uncased)") 17 | parser.add_argument("-l", "--num_layers", type=int, default=None, help="use first N layer in BERT (default: 8)") 18 | parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity") 19 | parser.add_argument("-r", "--ref", type=str, required=True, help="reference sentence") 20 | parser.add_argument("-c", "--cand", type=str, required=True, help="candidate sentence") 21 | parser.add_argument( 22 | "-f", "--file", type=str, default="visualize.png", help="name of file to save output matrix in", 23 | ) 24 | parser.add_argument( 25 | "--rescale_with_baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines", 26 | ) 27 | parser.add_argument("--baseline_path", default=None, type=str, help="path of custom baseline csv file") 28 | 29 | args = parser.parse_args() 30 | 31 | bert_score.plot_example( 32 | args.cand, 33 | args.ref, 34 | model_type=args.model, 35 | lang=args.lang, 36 | num_layers=args.num_layers, 37 | fname=args.file, 38 | rescale_with_baseline=args.rescale_with_baseline, 39 | baseline_path=args.baseline_path, 40 | ) 41 | 42 | 43 | if __name__ == "__main__": 44 | main() 45 | -------------------------------------------------------------------------------- /bert_score/get_rescale_baseline/README.md: -------------------------------------------------------------------------------- 1 | # Computing rescale baselines for English models 2 | ### Downloading the dataset 3 | This downloads the WMT17 English text data. 4 | ```sh 5 | bash download_text_data.sh 6 | ``` 7 | 8 | ### Computing the baselines 9 | Here is an example of computing the rescale baseline files for two models: 10 | ```sh 11 | python get_rescale_baseline.py --lang en -b 16 -m \ 12 | microsoft/deberta-large \ 13 | microsoft/deberta-large-mnli 14 | ``` 15 | The baseline files will be written to the `rescale_baseline` folder. -------------------------------------------------------------------------------- /bert_score/get_rescale_baseline/download_text_data.sh: -------------------------------------------------------------------------------- 1 | mkdir -p data 2 | cd data 3 | if !
[ -f news.2017.en.shuffled.deduped ]; then 4 | wget http://data.statmt.org/wmt18/translation-task/news.2017.en.shuffled.deduped.gz 5 | gzip -d news.2017.en.shuffled.deduped.gz 6 | fi 7 | 8 | echo "finished downloading data" -------------------------------------------------------------------------------- /bert_score/get_rescale_baseline/get_baseline_example.sh: -------------------------------------------------------------------------------- 1 | bash download_text_data.sh 2 | python get_rescale_baseline.py --lang en -b 16 -m \ 3 | microsoft/deberta-large \ 4 | microsoft/deberta-large-mnli \ 5 | -------------------------------------------------------------------------------- /bert_score/get_rescale_baseline/get_rescale_baseline.py: -------------------------------------------------------------------------------- 1 | import matplotlib 2 | import matplotlib.pyplot as plt 3 | import bert_score 4 | import torch 5 | from random import shuffle 6 | import sacrebleu 7 | import numpy as np 8 | import pandas as pd 9 | from tqdm.auto import tqdm 10 | import os 11 | import argparse 12 | import gzip 13 | 14 | 15 | def get_data(lang="en"): 16 | 17 | if lang == "en": 18 | file_path = "data/news.2017.en.shuffled.deduped" 19 | elif lang == "zh": 20 | file_path = "data/paracrawl/crawl_chinese.txt" 21 | else: 22 | file_path = f"data/paracrawl/rand_{lang}.txt" 23 | 24 | with open(file_path, "r") as f: 25 | lines = [] 26 | for i, line in enumerate(f): 27 | if i == 1_000_000: 28 | break 29 | line = line.strip() 30 | if len(line.split(" ")) < 32 and len(line.split(" ")) > 0: 31 | lines.append(line) 32 | 33 | samples = np.random.choice(range(len(lines)), size=(2, len(lines) // 2), replace=False) 34 | 35 | hyp = [lines[i] for i in samples[0]] 36 | cand = [lines[i] for i in samples[1]] 37 | 38 | return hyp, cand 39 | 40 | 41 | def chunk(l, n): 42 | # yield successive chunks of size n from l 43 | for i in range(0, len(l), n): 44 | yield l[i : i + n] 45 | 46 | 47 | if __name__ == "__main__": 48 | parser = argparse.ArgumentParser(description="Compute BERTScore rescale baselines.") 49 | parser.add_argument("--lang", type=str, required=True, help="language to compute baseline with") 50 | parser.add_argument("-m", "--model", nargs="+", help="models to compute baselines for") 51 | parser.add_argument("-b", "--batch_size", type=int, default=64) 52 | 53 | args = parser.parse_args() 54 | 55 | hyp, cand = get_data(lang=args.lang) 56 | 57 | for model_type in args.model: 58 | baseline_file_path = f"rescale_baseline/{args.lang}/{model_type}.tsv" 59 | if os.path.isfile(baseline_file_path): 60 | print(f"{model_type} baseline exists for {args.lang}") 61 | continue 62 | else: 63 | print(f"computing baseline for {model_type} on {args.lang}") 64 | scorer = bert_score.BERTScorer(model_type=model_type, all_layers=True) 65 | with torch.no_grad(): 66 | score_means = None 67 | count = 0 68 | for batches in tqdm(chunk(list(zip(hyp, cand)), 1000), total=len(hyp) / 1000): 69 | batch_hyp, batch_cand = zip(*batches) 70 | scores = scorer.score(batch_hyp, batch_cand, batch_size=args.batch_size) 71 | scores = torch.stack(scores, dim=0) 72 | if score_means is None: 73 | score_means = scores.mean(dim=-1) 74 | else: 75 | score_means = score_means * count / (count + len(batches)) + scores.mean(dim=-1) * len( 76 | batches 77 | ) / (count + len(batches)) 78 | count += len(batches) 79 | 80 | pd_baselines = pd.DataFrame(score_means.numpy().transpose(), columns=["P", "R", "F"]) 81 | pd_baselines.index.name = "LAYER" 82 | 83 | os.makedirs(os.path.dirname(baseline_file_path), exist_ok=True) 84 |
pd_baselines.to_csv(baseline_file_path) 85 | del scorer 86 | -------------------------------------------------------------------------------- /bert_score/journal/rescale_baseline.md: -------------------------------------------------------------------------------- 1 | # Rescaling BERTScore with Baselines 2 | 3 | BERTScore computes a sentence-level similarity score by making use of token-level similarities, 4 | produced by cosine similarities between contextual embeddings. 5 | The numerical range of BERTScore is between -1 and 1, the same as the underlying cosine similarity. 6 | In practice, however, BERTScore is usually found to be in a small range. 7 | For an extreme case, BERTScore computed with the large RoBERTa model is often between 0.85 and 0.95. 8 | 9 | Although BERTScore correlates highly with human judgment in spite of the above-mentioned caveat, BERTScore will 10 | be easier to interpret and work with if it has a natural range (for example, between 0 and 1). 11 | Therefore, we seek a method to rescale BERTScore to have an intuitive range. 12 | Let's denote the BERTScore for a pair of candidate and reference sentences as $x$. 13 | Let $b$ be a lower bound for BERTScores that we typically observe in practice (i.e. $b \le x$). 14 | We obtain a rescaled BERTScore $\hat{x}$ through a simple linear transformation, 15 | $\hat{x} = \frac{x - b}{1 - b}$. 16 | With a reliable baseline $b$, we will typically observe $\hat{x}$ to be between 0 and 1. 17 | 18 | We highlight that this rescaling operation does not affect BERTScore's correlation with human judgment, as measured by Pearson's $r$ and Kendall's $\tau$ coefficients. So we preserve BERTScore's high correlation as reported in our [study](https://arxiv.org/abs/1904.09675). 19 | We now describe how we compute a reliable baseline. 20 | 21 | For each language, we select a million sentences from some large monolingual corpus. 22 | We randomly group sentences into candidate-reference pairs, resulting in half a million pairs. 23 | For each contextual embedding model, we compute BERTScore on the random pairs and take the average to be the baseline. 24 | We compute the baseline with different layers of representations and separate the baselines for precision, recall, and F1. 25 | So far, we have supported 11 different languages (English, Chinese, French, German...) with all models we support. 26 | The baseline numbers are collected [here](../rescale_baseline). We plan to release the experiment code soon so you can compute baselines with any data of your choice. 27 | 28 | With this rescaling, the average BERTScore (computed with RoBERTa-large, layer 17) on the WMT18 de-en translation evaluation dataset drops from 0.9311 to 0.5758. 29 | For a concrete example, we can plot the similarity matrix between two sentences using `bert-score-show`. 30 | 31 | Before scaling: 32 | 33 | ![](./static/before.png) 34 | 35 | After scaling: 36 | 37 | ![](./static/after.png) 38 | 39 | Clearly, the rescaling produces a more readable output. Occasionally, some of the similarity entries will become negative after rescaling, but they won't affect BERTScore results because the rescaling is done after BERTScore is computed. 40 | 41 | We package this feature into our library (>=0.3.0).
Here's an example of how to use it (note that the language needs to be specified in order to use this feature): 42 | ```python 43 | out = bert_score.score( 44 | cands, refs, 45 | rescale_with_baseline=True, lang="en" 46 | ) 47 | ``` 48 | and for the command-line version: 49 | ```bash 50 | bert-score -r example/refs.txt -c example/hyps.txt \ 51 | --lang en --rescale_with_baseline 52 | ``` 53 | 54 | 55 | 56 | Hope you enjoy this new feature! 57 | 58 | ---Tianyi, Varsha, and Felix 59 | -------------------------------------------------------------------------------- /bert_score/journal/static/.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/journal/static/.png -------------------------------------------------------------------------------- /bert_score/journal/static/after.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/journal/static/after.png -------------------------------------------------------------------------------- /bert_score/journal/static/before.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/journal/static/before.png -------------------------------------------------------------------------------- /bert_score/requirements.txt: -------------------------------------------------------------------------------- 1 | # PyTorch 2 | torch>=1.0.0 3 | # progress bars in model download and training scripts 4 | tqdm>=4.31.1 5 | # BERT 6 | transformers>=3.0.0 7 | matplotlib 8 | pandas>=1.0.1 9 | numpy 10 | packaging>=20.9 11 | -------------------------------------------------------------------------------- /bert_score/setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | from setuptools import find_packages, setup 3 | 4 | setup( 5 | name="bert_score", 6 | version='0.3.11', 7 | author="Tianyi Zhang*, Varsha Kishore*, Felix Wu*, Kilian Q.
Weinberger, and Yoav Artzi", 8 | author_email="tzhang@asapp.com", 9 | description="PyTorch implementation of BERT score", 10 | long_description=open("README.md", "r", encoding='utf-8').read(), 11 | long_description_content_type="text/markdown", 12 | keywords='BERT NLP deep learning google metric', 13 | license='MIT', 14 | url="https://github.com/Tiiiger/bert_score", 15 | packages=find_packages(exclude=["*.tests", "*.tests.*", 16 | "tests.*", "tests"]), 17 | install_requires=['torch>=1.0.0', 18 | 'pandas>=1.0.1', 19 | 'transformers>=3.0.0', 20 | 'numpy', 21 | 'requests', 22 | 'tqdm>=4.31.1', 23 | 'matplotlib', 24 | 'packaging>=20.9', 25 | ], 26 | entry_points={ 27 | 'console_scripts': [ 28 | "bert-score=bert_score_cli.score:main", 29 | "bert-score-show=bert_score_cli.visualize:main", 30 | ] 31 | }, 32 | include_package_data=True, 33 | python_requires='>=3.6', 34 | tests_require=['pytest'], 35 | classifiers=[ 36 | 'Intended Audience :: Science/Research', 37 | 'License :: OSI Approved :: MIT License', 38 | 'Programming Language :: Python :: 3', 39 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 40 | ], 41 | 42 | ) 43 | -------------------------------------------------------------------------------- /bert_score/tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/tests/__init__.py -------------------------------------------------------------------------------- /bert_score/tests/custom_assertions.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | 4 | 5 | class CustomAssertions: 6 | def assertAreTensors(self, *args): 7 | if not all([torch.is_tensor(arg) for arg in args]): 8 | raise AssertionError("All values should be of type torch.Tensor") 9 | 10 | def assertTensorsAlmostEqual(self, expected, actual, decimal=5): 11 | """ 12 | Test tensors are almost equal (EPS = 1e-5 by default) 13 | """ 14 | np.testing.assert_almost_equal(expected, actual, decimal=decimal) 15 | -------------------------------------------------------------------------------- /bert_score/tests/test_scorer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from transformers import __version__ as ht_version 3 | import bert_score 4 | 5 | from tests.custom_assertions import CustomAssertions 6 | 7 | cands = [ 8 | "28-year-old chef found dead in San Francisco mall", 9 | "A 28-year-old chef who recently moved to San Francisco was found dead in the staircase of a local shopping center.", 10 | 'The victim\'s brother said he cannot imagine anyone who would want to harm him,"Finally, it went uphill again at him."', 11 | ] 12 | refs = [ 13 | "28-Year-Old Chef Found Dead at San Francisco Mall", 14 | "A 28-year-old chef who had recently moved to San Francisco was found dead in the stairwell of a local mall this week.", 15 | "But the victim's brother says he can't think of anyone who would want to hurt him, saying, \"Things were finally going well for him.\"", 16 | ] 17 | 18 | 19 | class TestScore(unittest.TestCase, CustomAssertions): 20 | def test_scorer(self): 21 | scorer = bert_score.BERTScorer(lang="en", batch_size=3) 22 | 23 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True) 24 | self.assertAreTensors(P, R, F) 25 | self.assertTensorsAlmostEqual(P, [0.9843302369117737, 0.9832239747047424, 0.9120386242866516]) 26 |
self.assertTensorsAlmostEqual(R, [0.9823839068412781, 0.9732863903045654, 0.920428991317749]) 27 | self.assertTensorsAlmostEqual(F, [0.9833561182022095, 0.9782299995422363, 0.916214644908905]) 28 | self.assertEqual( 29 | hash_code, f"roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})", 30 | ) 31 | 32 | def test_idf_scorer(self): 33 | scorer = bert_score.BERTScorer(lang="en", idf=True, idf_sents=refs, batch_size=3) 34 | 35 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True) 36 | self.assertAreTensors(P, R, F) 37 | self.assertTensorsAlmostEqual(P, [0.9837872385978699, 0.9754738807678223, 0.8947395086288452]) 38 | self.assertTensorsAlmostEqual(R, [0.9827190637588501, 0.9697767496109009, 0.9172918796539307]) 39 | self.assertTensorsAlmostEqual(F, [0.9832529425621033, 0.972616970539093, 0.9058753848075867]) 40 | self.assertEqual( 41 | hash_code, f"roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})", 42 | ) 43 | 44 | def test_scorer_rescale(self): 45 | scorer = bert_score.BERTScorer(lang="en", rescale_with_baseline=True, batch_size=3) 46 | 47 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True) 48 | self.assertAreTensors(P, R, F) 49 | self.assertTensorsAlmostEqual(P, [0.907000780105591, 0.900435566902161, 0.477955609560013]) 50 | self.assertTensorsAlmostEqual(R, [0.895456790924072, 0.841467440128326, 0.527785062789917]) 51 | self.assertTensorsAlmostEqual(F, [0.901383399963379, 0.871010780334473, 0.503565192222595]) 52 | self.assertEqual( 53 | hash_code, f"roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled", 54 | ) 55 | 56 | def test_idf_scorer_rescale(self): 57 | scorer = bert_score.BERTScorer(lang="en", rescale_with_baseline=True, idf=True, idf_sents=refs, batch_size=3) 58 | 59 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True) 60 | self.assertAreTensors(P, R, F) 61 | self.assertTensorsAlmostEqual(P, [0.903778135776520, 0.854439020156860, 0.375287383794785]) 62 | self.assertTensorsAlmostEqual(R, [0.897446095943451, 0.820639789104462, 0.509167850017548]) 63 | self.assertTensorsAlmostEqual(F, [0.900772094726562, 0.837753534317017, 0.442304641008377]) 64 | self.assertEqual( 65 | hash_code, f"roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled", 66 | ) 67 | 68 | def test_multi_refs(self): 69 | scorer = bert_score.BERTScorer(lang="en", batch_size=3, rescale_with_baseline=True) 70 | 71 | cands = ["I like lemons."] 72 | refs = [["I am proud of you.", "I love lemons.", "Go go go."]] 73 | P_mul, R_mul, F_mul = scorer.score(cands, refs,) 74 | P_best, R_best, F_best = scorer.score(cands, [refs[0][1]],) 75 | self.assertTensorsAlmostEqual(P_mul, P_best) 76 | self.assertTensorsAlmostEqual(R_mul, R_best) 77 | self.assertTensorsAlmostEqual(F_mul, F_best) 78 | 79 | def test_multi_refs_working(self): 80 | scorer = bert_score.BERTScorer(lang="en", batch_size=3, rescale_with_baseline=True) 81 | 82 | cands = ["I like lemons.", "Hi", "Hey", "Hello", "Go", ""] 83 | refs = [ 84 | ["I am proud of you.", "I love lemons.", "Go go go."], 85 | ["I am proud of you.", "Go go go."], 86 | ["Hi", ""], 87 | ["I am proud of you.", "I love lemons.", "Go go go.", "hello"], 88 | ["I am proud of you.", "Go go go.", "Go", "Go to school"], 89 | ["test"], 90 | ] 91 | P_mul, R_mul, F_mul = scorer.score(cands, refs,) 92 | self.assertAreTensors(P_mul, R_mul, F_mul) 93 | 94 | 95 | if __name__ == "__main__": 96 | unittest.main() 97 | 
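The tests above exercise the standard sentence-level `BERTScorer` API. The document-level path that this repository adds (the `--doc` flag in `bert_score_cli/score.py` above) can also be driven directly from Python. Below is a minimal sketch mirroring the CLI's flow of prepending reference context with `add_context` and scoring with `doc=True`; it assumes the modified `bert_score` shipped in this repository, `add_context.py` on the path, and toy German data:

```python
# Sketch of the document-level flow from bert_score_cli/score.py: preceding
# reference sentences are prepended as context to both the references and
# the MT output, then scored with doc=True. The two-sentence "document"
# below is an illustrative assumption.
import bert_score
from add_context import add_context

cands = ["Sie ging nach Hause.", "Er folgte ihr."]
refs = ["Sie ging heim.", "Er folgte ihr nach."]
doc_ids = ["doc1", "doc1"]   # both sentences belong to the same document

sep_token = "[SEP]"          # the CLI uses "[SEP]" for non-English languages
ctx_refs = add_context(orig_txt=refs, context=refs,
                       doc_ids=doc_ids, sep_token=sep_token)
ctx_cands = add_context(orig_txt=cands, context=refs,
                        doc_ids=doc_ids, sep_token=sep_token)

P, R, F = bert_score.score(ctx_cands, ctx_refs, lang="de", doc=True)
print(f"F1: {F.mean().item():.6f}")
```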
-------------------------------------------------------------------------------- /bert_score/tune_layers/README.md: -------------------------------------------------------------------------------- 1 | # Tuning the best layer of a pre-trained English model on the WMT16 dataset 2 | 3 | ### Downloading the dataset 4 | This downloads the WMT16 dataset and extracts it into a new folder called `wmt16`. If the folder `wmt16` exists, it will skip the process. 5 | ```sh 6 | bash download_data.sh 7 | ``` 8 | 9 | ### Tuning the models 10 | Here is an example of tuning three models in a row: 11 | ```sh 12 | python tune_layers.py -m bert-base-uncased roberta-base albert-base-v2 13 | ``` 14 | The results will be appended to `best_layers_log.txt`. 15 | The last three lines of `best_layers_log.txt` will be 16 | ``` 17 | 'bert-base-uncased': 9, # 0.692518813886652 18 | 'roberta-base': 10, # 0.7062886932674598 19 | 'albert-base-v2': 9, # 0.6682362357086912 20 | ``` 21 | which show the model name, the best layer, and its Pearson correlation with human judgment. 22 | These can be copied and pasted into `model2layers` in `bert_score/utils.py`. -------------------------------------------------------------------------------- /bert_score/tune_layers/download_data.sh: -------------------------------------------------------------------------------- 1 | if ! [ -d wmt16 ]; then 2 | mkdir wmt16 3 | gz_file=wmt16-metrics-results.tar.gz 4 | if ! [ -f $gz_file ]; then 5 | wget https://www.scss.tcd.ie/~ygraham/wmt16-metrics-results.tar.gz 6 | fi 7 | tar -xzf $gz_file -C wmt16 8 | rm -f $gz_file 9 | echo "Finished downloading and extracting the dataset" 10 | else 11 | echo "Folder 'wmt16' exists already." 12 | fi 13 | -------------------------------------------------------------------------------- /bert_score/tune_layers/tune.sh: -------------------------------------------------------------------------------- 1 | bash download_data.sh 2 | python tune_layers.py -m bert-base-uncased roberta-base albert-base-v2 -------------------------------------------------------------------------------- /bert_score/tune_layers/tune_layers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import argparse 4 | import torch 5 | import numpy as np 6 | 7 | from tqdm.auto import tqdm, trange 8 | from collections import defaultdict 9 | from scipy.stats import pearsonr 10 | 11 | import bert_score 12 | 13 | 14 | def get_wmt16(lang_pair, data_folder="wmt16"): 15 | with open( 16 | os.path.join( 17 | data_folder, 18 | f"wmt16-metrics-results/seg-level-results/DAseg-newstest2016/DAseg-newstest2016.human.{lang_pair}", 19 | ) 20 | ) as f: 21 | gold_scores = list(map(float, f.read().strip().split("\n"))) 22 | 23 | with open( 24 | os.path.join( 25 | data_folder, 26 | f"wmt16-metrics-results/seg-level-results/DAseg-newstest2016/DAseg-newstest2016.reference.{lang_pair}", 27 | ) 28 | ) as f: 29 | all_refs = f.read().strip().split("\n") 30 | 31 | with open( 32 | os.path.join( 33 | data_folder, 34 | f"wmt16-metrics-results/seg-level-results/DAseg-newstest2016/DAseg-newstest2016.mt-system.{lang_pair}", 35 | ) 36 | ) as f: 37 | all_hyps = f.read().strip().split("\n") 38 | 39 | return gold_scores, all_refs, all_hyps 40 | 41 | 42 | def get_wmt16_seg_to_bert_score(lang_pair, scorer, data_folder="wmt16", batch_size=64): 43 | # os.makedirs(f"cache_score/{network}", exist_ok=True) 44 | # path = "cache_score/{}/wmt16_seg_to_{}_{}.pkl".format(network, *lang_pair.split("-")) 45 | 46 | gold_scores,
refs, cands = get_wmt16(lang_pair, data_folder=data_folder) 47 | if scorer.idf: 48 | scorer.compute_idf(refs) 49 | scores = scorer.score(cands, refs, verbose=False, batch_size=batch_size) 50 | scores = list(scores) 51 | max_length = scorer._tokenizer.max_len_single_sentence 52 | 53 | return scores, gold_scores, max_length 54 | 55 | 56 | def main(): 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("-d", "--data", default="wmt16", help="path to wmt16 data") 59 | parser.add_argument("-m", "--model", nargs="+", help="models to tune") 60 | parser.add_argument("-l", "--log_file", default="best_layers_log.txt", help="log file path") 61 | parser.add_argument("--idf", action="store_true") 62 | parser.add_argument("-b", "--batch_size", type=int, default=64) 63 | parser.add_argument( 64 | "--lang_pairs", 65 | nargs="+", 66 | default=["cs-en", "de-en", "fi-en", "ro-en", "ru-en", "tr-en"], 67 | help="language pairs used for tuning", 68 | ) 69 | args = parser.parse_args() 70 | 71 | if args.log_file.endswith(".txt"): 72 | csv_file = args.log_file.replace(".txt", ".csv") 73 | else: 74 | csv_file = args.log_file + ".csv" 75 | 76 | torch.set_grad_enabled(False) 77 | 78 | networks = args.model 79 | for network in networks: 80 | model_type = network 81 | scorer = bert_score.scorer.BERTScorer(model_type=model_type, num_layers=100, idf=False, all_layers=True) 82 | results = defaultdict(dict) 83 | for lang_pair in tqdm(args.lang_pairs): 84 | scores, gold_scores, max_length = get_wmt16_seg_to_bert_score(lang_pair, scorer, batch_size=args.batch_size) 85 | for i, score in enumerate(scores[2]): 86 | results[lang_pair + " " + str(i)]["%s %s" % (network, "F")] = pearsonr(score, gold_scores)[0] 87 | 88 | best_layer, best_corr = 0, 0.0 89 | for num_layer in range(100): 90 | temp = [] 91 | if f"{args.lang_pairs[0]} {num_layer}" not in results: 92 | break 93 | for lp in args.lang_pairs: 94 | temp.append(results[f"{lp} {num_layer}"][f"{network} F"]) 95 | corr = np.mean(temp) 96 | results["avg" + " " + str(num_layer)]["%s %s" % (network, "F")] = corr 97 | print(network, num_layer, corr) 98 | if corr > best_corr: 99 | best_layer, best_corr = num_layer, corr 100 | 101 | if args.idf: 102 | msg = f"'{network}' (idf): {best_layer}, # {best_corr}" 103 | else: 104 | msg = f"'{network}': {best_layer}, # {best_corr}" 105 | print(msg) 106 | with open(args.log_file, "a") as f: 107 | print(msg, file=f) 108 | csv_msg = f"{network},{best_layer},{best_corr},,{max_length}" 109 | with open(csv_file, "a") as f: 110 | print(csv_msg, file=f) 111 | 112 | del scorer 113 | 114 | 115 | if __name__ == "__main__": 116 | main() 117 | -------------------------------------------------------------------------------- /bert_score/upload_pypi.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | rm -rf dist 3 | python setup.py sdist bdist_wheel 4 | python -m twine upload dist/* -------------------------------------------------------------------------------- /media/bertscore.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/media/bertscore.png --------------------------------------------------------------------------------