├── CODE_OF_CONDUCT.md
├── COMET
│ ├── .github
│ │ ├── ISSUE_TEMPLATE
│ │ │ ├── bug_report.md
│ │ │ ├── feature_request.md
│ │ │ ├── questions-and-help.md
│ │ │ └── typos-and-doc-fixes.md
│ │ └── workflows
│ │ │ └── ci.yaml
│ ├── .gitignore
│ ├── .idea
│ │ ├── COMET.iml
│ │ ├── deployment.xml
│ │ ├── inspectionProfiles
│ │ │ └── profiles_settings.xml
│ │ ├── misc.xml
│ │ ├── modules.xml
│ │ ├── runConfigurations
│ │ │ └── contrapro_comet.xml
│ │ └── workspace.xml
│ ├── CONTRIBUTING.md
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── METRICS.md
│ ├── README.md
│ ├── add_context.py
│ ├── comet
│ │ ├── __init__.py
│ │ ├── cli
│ │ │ ├── compare.py
│ │ │ ├── mbr.py
│ │ │ ├── score.py
│ │ │ └── train.py
│ │ ├── download_utils.py
│ │ ├── encoders
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── bert.py
│ │ │ ├── minilm.py
│ │ │ ├── xlmr.py
│ │ │ └── xlmr_xl.py
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── base.py
│ │ │ ├── lru_cache.py
│ │ │ ├── metrics.py
│ │ │ ├── pooling_utils.py
│ │ │ ├── predict_pbar.py
│ │ │ ├── ranking
│ │ │ │ ├── __init__.py
│ │ │ │ └── ranking_metric.py
│ │ │ └── regression
│ │ │ │ ├── __init__.py
│ │ │ │ ├── referenceless.py
│ │ │ │ └── regression_metric.py
│ │ └── modules
│ │ │ ├── __init__.py
│ │ │ ├── feedforward.py
│ │ │ └── layerwise_attention.py
│ ├── configs
│ │ ├── early_stopping.yaml
│ │ ├── model_checkpoint.yaml
│ │ ├── models
│ │ │ ├── ranking_metric.yaml
│ │ │ ├── referenceless_metric.yaml
│ │ │ └── regression_metric.yaml
│ │ └── trainer.yaml
│ ├── docs
│ │ ├── Makefile
│ │ ├── make.bat
│ │ └── source
│ │ │ ├── _static
│ │ │ │ ├── css
│ │ │ │ │ └── comet.css
│ │ │ │ └── img
│ │ │ │ │ ├── COMET_lockup-dark.png
│ │ │ │ │ ├── COMET_lockup-white.png
│ │ │ │ │ ├── estimator_model.jpg
│ │ │ │ │ ├── logo.png
│ │ │ │ │ ├── models.png
│ │ │ │ │ └── ranking_model.jpg
│ │ │ ├── conf.py
│ │ │ ├── index.rst
│ │ │ ├── installation.rst
│ │ │ ├── library.rst
│ │ │ ├── models.md
│ │ │ ├── running.rst
│ │ │ └── training.md
│ ├── poetry.lock
│ ├── pyproject.toml
│ ├── requirements.txt
│ └── tests
│ │ ├── __init__.py
│ │ ├── integration
│ │ │ ├── __init__.py
│ │ │ ├── models
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_ranking_metric.py
│ │ │ │ ├── test_referenceless_regression.py
│ │ │ │ └── test_regression_metric.py
│ │ │ └── modules
│ │ │ │ └── test_feedforward.py
│ │ └── unit
│ │ │ ├── __init__.py
│ │ │ ├── encoders
│ │ │ │ ├── __init__.py
│ │ │ │ ├── test_bert.py
│ │ │ │ └── test_xlmr.py
│ │ │ └── test_download_utils.py
├── CONTRIBUTING.md
├── Config
├── LICENSE
├── NOTICE
├── Prism
│ ├── README.md
│ ├── add_context.py
│ └── prism.py
├── README.md
├── THIRD-PARTY-LICENSES_DOC_MT_METRICS.txt
├── bert_score
│ ├── .gitignore
│ ├── .idea
│ │ ├── bert_score.iml
│ │ ├── deployment.xml
│ │ ├── inspectionProfiles
│ │ │ └── profiles_settings.xml
│ │ ├── misc.xml
│ │ ├── modules.xml
│ │ ├── remote-mappings.xml
│ │ └── workspace.xml
│ ├── .travis.yml
│ ├── LICENSE
│ ├── MANIFEST.in
│ ├── README.md
│ ├── add_context.py
│ ├── bert_score.png
│ ├── bert_score
│ │ ├── __init__.py
│ │ ├── rescale_baseline
│ │ │ ├── cs
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── de
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── en-sci
│ │ │ │ └── allenai
│ │ │ │ │ └── scibert_scivocab_uncased.tsv
│ │ │ ├── en
│ │ │ │ ├── albert-base-v1.tsv
│ │ │ │ ├── albert-base-v2.tsv
│ │ │ │ ├── albert-large-v1.tsv
│ │ │ │ ├── albert-large-v2.tsv
│ │ │ │ ├── albert-xlarge-v1.tsv
│ │ │ │ ├── albert-xlarge-v2.tsv
│ │ │ │ ├── albert-xxlarge-v1.tsv
│ │ │ │ ├── albert-xxlarge-v2.tsv
│ │ │ │ ├── bert-base-cased-finetuned-mrpc.tsv
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── bert-base-uncased.tsv
│ │ │ │ ├── bert-large-uncased.tsv
│ │ │ │ ├── distilbert-base-multilingual-cased.tsv
│ │ │ │ ├── distilbert-base-uncased-distilled-squad.tsv
│ │ │ │ ├── distilbert-base-uncased.tsv
│ │ │ │ ├── distilroberta-base.tsv
│ │ │ │ ├── microsoft
│ │ │ │ │ ├── deberta-base-mnli.tsv
│ │ │ │ │ ├── deberta-base.tsv
│ │ │ │ │ ├── deberta-large-mnli.tsv
│ │ │ │ │ ├── deberta-large.tsv
│ │ │ │ │ ├── deberta-xlarge-mnli.tsv
│ │ │ │ │ └── deberta-xlarge.tsv
│ │ │ │ ├── roberta-base.tsv
│ │ │ │ ├── roberta-large-mnli.tsv
│ │ │ │ ├── roberta-large.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-mlm-en-2048.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ ├── xlm-roberta-large.tsv
│ │ │ │ ├── xlnet-base-cased.tsv
│ │ │ │ └── xlnet-large-cased.tsv
│ │ │ ├── es
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── et
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── fi
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── fr
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── it
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── lv
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ ├── pt
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ ├── xlm-roberta-base.tsv
│ │ │ │ └── xlm-roberta-large.tsv
│ │ │ └── zh
│ │ │ │ ├── bert-base-chinese.tsv
│ │ │ │ ├── bert-base-multilingual-cased.tsv
│ │ │ │ ├── xlm-mlm-100-1280.tsv
│ │ │ │ └── xlm-roberta-base.tsv
│ │ ├── score.py
│ │ ├── scorer.py
│ │ └── utils.py
│ ├── bert_score_cli
│ │ ├── __init__.py
│ │ ├── score.py
│ │ └── visualize.py
│ ├── get_rescale_baseline
│ │ ├── README.md
│ │ ├── download_text_data.sh
│ │ ├── get_baseline_example.sh
│ │ └── get_rescale_baseline.py
│ ├── journal
│ │ ├── rescale_baseline.md
│ │ └── static
│ │ │ ├── .png
│ │ │ ├── after.png
│ │ │ └── before.png
│ ├── requirements.txt
│ ├── setup.py
│ ├── tests
│ │ ├── __init__.py
│ │ ├── custom_assertions.py
│ │ ├── test_score_function.py
│ │ └── test_scorer.py
│ ├── tune_layers
│ │ ├── README.md
│ │ ├── download_data.sh
│ │ ├── tune.sh
│ │ └── tune_layers.py
│ └── upload_pypi.sh
├── media
│ └── bertscore.png
└── score_doc-metrics.py
/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | ## Code of Conduct
2 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
3 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
4 | opensource-codeofconduct@amazon.com with any additional questions or comments.
5 |
--------------------------------------------------------------------------------
/COMET/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## 🐛 Bug
11 |
12 |
13 |
14 | ### To Reproduce
15 |
16 | Before reporting a bug, make sure it can be reproduced with a minimal example, and check whether the issue persists after adding your relevant changes.
17 |
18 | If a test is failing, please add your test cases to the issue (as a draft PR, or simply paste the code into the issue description here).
19 |
20 | ### Expected behaviour
21 | A clear and concise description of what you expected to happen.
22 |
23 | ### Screenshots
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | ### Environment
27 | OS: [e.g. iOS, Linux, Win]
28 | Packaging [e.g. pip, conda]
29 | Version [e.g. 0.5.2.1]
30 |
31 | ### Additional context
32 |
33 |
34 |
--------------------------------------------------------------------------------
/COMET/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## 🚀 Feature
11 |
12 |
13 | ### Motivation
14 |
15 |
16 |
17 | ### Alternatives
18 |
19 |
20 |
21 | ### Additional context
22 |
23 |
24 |
--------------------------------------------------------------------------------
/COMET/.github/ISSUE_TEMPLATE/questions-and-help.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Questions and Help
3 | about: Ask questions about COMET
4 | title: "[QUESTION]"
5 | labels: question
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## ❓ Questions and Help
11 |
12 | ### Before asking:
13 | 1. Search for similar [issues](https://github.com/Unbabel/COMET/issues).
14 | 2. Search the [docs](https://unbabel.github.io/COMET/html/index.html).
15 |
16 |
17 |
18 | #### What is your question?
19 |
20 | #### Code
21 |
22 |
23 |
24 | #### What have you tried?
25 |
26 | #### What's your environment?
27 |
28 | - OS: [e.g. iOS, Linux, Win]
29 | - Packaging [e.g. pip, conda]
30 | - Version [e.g. 0.5.2.1]
31 |
--------------------------------------------------------------------------------
/COMET/.github/ISSUE_TEMPLATE/typos-and-doc-fixes.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Typos and doc fixes
3 | about: Typos and doc fixes
4 | title: ''
5 | labels: documentation, enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## 📚 Documentation
11 |
12 | If you find a typo or something that is not well explained in the documentation, please use this template to report it!
13 |
--------------------------------------------------------------------------------
/COMET/.github/workflows/ci.yaml:
--------------------------------------------------------------------------------
1 | name: Python package
2 |
3 | on: [push, pull_request]
4 |
5 | jobs:
6 | build:
7 |
8 | runs-on: ubuntu-latest
9 | strategy:
10 | matrix:
11 | python-version: [3.7, 3.8, 3.9]
12 |
13 | steps:
14 | - uses: actions/checkout@v2
15 | - name: Set up Python ${{ matrix.python-version }}
16 | uses: actions/setup-python@v2
17 | with:
18 | python-version: ${{ matrix.python-version }}
19 | - name: Install Requirements
20 | run: |
21 | python -m pip install --upgrade pip
22 | pip install unbabel-comet
23 | comet-score --help
24 |
--------------------------------------------------------------------------------
/COMET/.gitignore:
--------------------------------------------------------------------------------
1 | *.pkl
2 | *DS_Store
3 | data/
4 | lightning_logs/
5 | wmt21/
6 |
7 | .vscode
8 | # Byte-compiled / optimized / DLL files
9 | __pycache__/
10 | *.py[cod]
11 | *$py.class
12 |
13 | # C extensions
14 | *.so
15 |
16 | # Distribution / packaging
17 | .Python
18 | build/
19 | develop-eggs/
20 | dist/
21 | downloads/
22 | eggs/
23 | .eggs/
24 | lib/
25 | lib64/
26 | parts/
27 | sdist/
28 | var/
29 | wheels/
30 | pip-wheel-metadata/
31 | share/python-wheels/
32 | *.egg-info/
33 | .installed.cfg
34 | *.egg
35 | MANIFEST
36 |
37 | # PyInstaller
38 | # Usually these files are written by a python script from a template
39 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
40 | *.manifest
41 | *.spec
42 |
43 | # Installer logs
44 | pip-log.txt
45 | pip-delete-this-directory.txt
46 |
47 | # Unit test / coverage reports
48 | htmlcov/
49 | .tox/
50 | .nox/
51 | .coverage
52 | .coverage.*
53 | .cache
54 | nosetests.xml
55 | coverage.xml
56 | *.cover
57 | .hypothesis/
58 | .pytest_cache/
59 |
60 | # Translations
61 | *.mo
62 | *.pot
63 |
64 | # Django stuff:
65 | *.log
66 | local_settings.py
67 | db.sqlite3
68 | db.sqlite3-journal
69 |
70 | # Flask stuff:
71 | instance/
72 | .webassets-cache
73 |
74 | # Scrapy stuff:
75 | .scrapy
76 |
77 | # Sphinx documentation
78 | docs/_build/
79 |
80 | # PyBuilder
81 | target/
82 |
83 | # Jupyter Notebook
84 | .ipynb_checkpoints
85 |
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 |
90 | # pyenv
91 | .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # celery beat schedule file
101 | celerybeat-schedule
102 |
103 | # SageMath parsed files
104 | *.sage.py
105 |
106 | # Environments
107 | .env
108 | .venv
109 | env/
110 | venv/
111 | ENV/
112 | env.bak/
113 | venv.bak/
114 |
115 | # Spyder project settings
116 | .spyderproject
117 | .spyproject
118 |
119 | # Rope project settings
120 | .ropeproject
121 |
122 | # mkdocs documentation
123 | /site
124 |
125 | # mypy
126 | .mypy_cache/
127 | .dmypy.json
128 | dmypy.json
129 |
130 | # Pyre type checker
131 | .pyre/
132 |
133 | # local files
134 | docids
135 | hyp.en
136 | ref.en
137 | src.de
138 |
--------------------------------------------------------------------------------
/COMET/.idea/COMET.iml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/COMET/.idea/deployment.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/COMET/.idea/inspectionProfiles/profiles_settings.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/COMET/.idea/misc.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
--------------------------------------------------------------------------------
/COMET/.idea/modules.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/COMET/.idea/runConfigurations/contrapro_comet.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
23 |
24 |
--------------------------------------------------------------------------------
/COMET/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contribution guide
2 |
3 | ## Overview
4 |
5 | COMET is an Open Source toolkit aimed at developing state-of-the-art models that can act as MT evaluation metrics. While we do welcome contributions, guaranteeing their quality and usefulness requires that we follow basic guidelines to ease development, collaboration and readability.
6 |
7 | ## Basic guidelines
8 |
9 | * The project must fully support Python 3.6 or later.
10 | * Code formatting must stick to the Facebook style: 80 columns and single quotes. Please make sure you have [black](https://github.com/ambv/black) installed, and run it before submitting changes.
11 | * Imports are sorted with [isort](https://github.com/timothycrosley/isort).
12 | * Filenames must be in lowercase.
13 | * Tests run with [unittest](https://docs.python.org/3/library/unittest.html). Unittest implements standard test discovery, which means it will search for `test_*.py` files. We do not enforce a minimum code coverage, but it is preferable to have at least basic tests for critical pieces of code. Always test functions that take or return tensor arguments, to document the sizes.
14 | * The `comet` folder contains core features.
15 |
16 | ## Contributing
17 |
18 | * Keep track of everything by creating issues and editing them with reference to the code! Explain succinctly the problem you are trying to solve and your solution.
19 | * Contributions to `master` should be made through GitHub pull requests.
20 | * Work in a clean environment (`virtualenv` is nice).
21 | * Your commit message must start with an infinitive verb (Add, Fix, Remove, ...).
22 | * If your change is based on a paper, please include a clear comment and reference in the code and in the related issue.
23 | * In order to test your local changes, install COMET following the instructions in the [documentation](https://unbabel.github.io/COMET/html/index.html).
24 |
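25 | For example, a typical pre-submission check (assuming `black` and `isort` are installed in your environment) could be:
26 |
27 | ```bash
28 | black comet/ tests/
29 | isort comet/ tests/
30 | python -m unittest discover
31 | ```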
--------------------------------------------------------------------------------
/COMET/MANIFEST.in:
--------------------------------------------------------------------------------
1 | include requirements.txt
--------------------------------------------------------------------------------
/COMET/METRICS.md:
--------------------------------------------------------------------------------
1 | # WMT20 COMET Metrics:
2 |
3 | Our **Primary Metrics** are the models described in our [participation to the WMT20 Metrics Shared task](https://aclanthology.org/2020.wmt-1.101.pdf):
4 | - `wmt20-comet-da`: This model was trained to predict _Direct Assessments_ from WMT17 to WMT19 using source, translation and reference. (Same as `wmt-large-da-estimator-1719` from previous versions.)
5 | - `wmt20-comet-qe-da`: This model was trained to predict _Direct Assessments_ from WMT17 to WMT19 using **source and translation only**! Also, this model is bounded between 0 and 1 which improves interpretability in comparison with the previous model.
6 |
7 | These two models were the best performing metrics in the large-scale metrics study performed by Microsoft Research [Kocmi et al., 2021](https://arxiv.org/abs/2107.10821), which validates our findings.
8 |
9 | # EMNLP20 Metric:
10 |
11 | In our [initial COMET release](https://aclanthology.org/2020.emnlp-main.213/) we developed a Translation Ranking Model based on daRR from previous WMT shared tasks. This model achieves **some of the highest Kendall tau-like correlations on the WMT19 daRR benchmark** but does not perform as well on later WMT benchmarks, especially those using MQM annotations.
12 |
13 |
14 | # WMT21 COMET Metrics:
15 |
16 | In our participation to the WMT21 shared task we steer COMET towards higher correlations with MQM. We do so by first pre-training on _Direct Assessments_ and then fine-tuning on z-normalized MQM scores.
17 | - `wmt21-comet-mqm`: This model was pre-trained on _Direct Assessments_ from WMT15 to WMT20 and then fine-tuned on MQM scores from [Freitag et al., 2021](https://arxiv.org/pdf/2104.14478.pdf).
18 | - `wmt21-comet-qe-mqm`: Reference-free version of `wmt21-comet-mqm`.
19 |
20 | Additionally, we present COMETinho (`wmt21-cometinho-da`), a light-weight COMET model that is 19x faster on CPU than the original model.
21 |
22 | **NOTE:** One thing we noticed in the WMT21 models is that the variance between predicted scores is lower than in previous models, which makes their predictions look very similar to each other even when the overall correlations with human judgments improve and the system ranking is correct.
23 |
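24 | Any of these models can be passed to the `--model` flag of `comet-score`; for example, assuming the `src.en`, `hyp.de` and `ref.de` files from the main README:
25 |
26 | ```bash
27 | comet-score -s src.en -t hyp.de -r ref.de --model wmt21-comet-mqm
28 | ```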
--------------------------------------------------------------------------------
/COMET/README.md:
--------------------------------------------------------------------------------
1 | # Doc-COMET(-QE)
2 |
3 | This README describes how to use **Doc-COMET**, an extension of the original COMET metric that can be used for document-level evaluation. The same extension can be applied to the referenceless version of COMET, i.e. COMET-QE (QE-as-a-metric), resulting in the corresponding **Doc-COMET-QE** metric.
4 |
5 | ## Installation
6 |
7 | This codebase is built upon the original [COMET code](https://github.com/Unbabel/COMET). For a detailed documentation of the COMET metric, including usage examples and instructions see the [Full Documentation](https://unbabel.github.io/COMET/html/index.html).
8 |
9 | To run Doc-COMET you will need a local development install:
10 | ```bash
11 | git clone https://github.com/amazon-science/doc-mt-metrics.git
12 | cd doc-mt-metrics/COMET
13 | conda create -n doc-metrics-env python=3.9
14 | conda activate doc-metrics-env
15 | pip install --upgrade pip
16 | pip install -r requirements.txt
17 | pip install -e .
18 | ```
19 |
20 | ### Get some files to score
21 | ```bash
22 | sacrebleu -t wmt21 -l en-de --echo src | head -n 20 > src.en
23 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > ref.de
24 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > hyp.de # put your system output here
25 | ```
26 |
27 | To evaluate at the document level we need to know where the document boundaries are in the test set, so that we only use valid context. This is passed in as a file where each line contains a document ID.
28 |
29 | For WMT test sets this can be obtained via [sacreBLEU](https://github.com/mjpost/sacrebleu):
30 | ```bash
31 | sacrebleu -t wmt21 -l en-de --echo docid | head -n 20 > docids.ende
32 | ```
33 |
34 | ### Command Line usage
35 |
36 | COMET and COMET-QE are run just as before, except we add the `--doc` flag to the `comet-score` command:
37 | ```bash
38 | comet-score -s src.en -t hyp.de -r ref.de --doc docids.ende --model wmt21-comet-mqm
39 | comet-score -s src.en -t hyp.de --doc docids.ende --model wmt21-comet-qe-mqm
40 | ```
41 | > Note: you can set `--gpus 0` to run on CPU.
42 |
43 | In the paper we use the `wmt21-comet-mqm` and `wmt21-comet-qe-mqm` models. To select a different model from the [available COMET models/metrics](https://unbabel.github.io/COMET/html/models.html), set the `--model` flag accordingly.
44 |
45 | ### Python usage:
46 |
47 | To use Doc-COMET(-QE) from Python, simply add `model.set_document_level()` after loading the model.
48 |
49 | ```python
50 | from comet import download_model, load_from_checkpoint
51 | from add_context import add_context
52 |
53 | # load data files
54 | doc_ids = [x.strip() for x in open('docids.ende', 'rt').readlines()]
55 | src = [x.strip() for x in open('src.en', 'rt').readlines()]
56 | hyp = [x.strip() for x in open('hyp.de', 'rt').readlines()]
57 | ref = [x.strip() for x in open('ref.de', 'rt').readlines()]
58 |
59 | # load comet model
60 | model_path = download_model("wmt21-comet-mqm")
61 | model = load_from_checkpoint(model_path)
62 |
63 | # enable document-level evaluation
64 | model.set_document_level()
65 |
66 | # add contexts to reference, source and hypothesis texts
67 | src = add_context(orig_txt=src, context=src, doc_ids=doc_ids, sep_token=model.encoder.tokenizer.sep_token)
68 | hyp = add_context(orig_txt=hyp, context=ref, doc_ids=doc_ids, sep_token=model.encoder.tokenizer.sep_token)
69 | ref = add_context(orig_txt=ref, context=ref, doc_ids=doc_ids, sep_token=model.encoder.tokenizer.sep_token)
70 |
71 | data = [{"src": x, "mt": y, "ref": z} for x, y, z in zip(src, hyp, ref)]
72 |
73 | seg_scores, sys_score = model.predict(data, batch_size=8, gpus=1)
74 | ```
75 |
76 | ## Reproduce
77 | To reproduce the Doc-COMET results from the paper, run the [score_doc-metrics.py](/score_doc-metrics.py) script with the flags `--model comet` and `--doc`. For the Doc-COMET-QE results, also add the `--qe` flag.
78 |
79 |
80 | ## Paper
81 |
82 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf):
83 |
84 | ```
85 | @inproceedings{easy_doc_mt,
86 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric},
87 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello},
88 | booktitle = "Proceedings of the Seventh Conference on Machine Translation",
89 | month = dec,
90 | year = "2022",
91 | address = "Abu Dhabi, United Arab Emirates",
92 | publisher = "Association for Computational Linguistics",
93 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf",
94 | }
95 | ```
96 |
--------------------------------------------------------------------------------
/COMET/add_context.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from typing import List
3 |
4 |
5 | def add_context(orig_txt: List[str], context: List[str], doc_ids: List[str], sep_token: str = "",
6 | ws: int = 2) -> List[str]:
7 | """Function that adds the previous sentences as context to the current sentence, respecting document boundaries
8 | :param orig_txt: the original text
9 | :param context: the text from which the context will be taken (same as orig_txt for source/reference)
10 | :param doc_ids: the document where each segment belongs to
11 | :param sep_token: the separator token of the tokenizer for the specific model
12 | :param ws: the window size, i.e. the maximum number of previous sentences to use as context
13 | :return: the original text augmented with context
14 | """
15 | if not (len(orig_txt) == len(context) == len(doc_ids)):
16 | raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(context)={len(context)}, len(doc_ids)={len(doc_ids)}')
17 | i, k = 0, 0
18 | augm_txt = []
19 | doc_id = doc_ids[0]
20 | while i < len(orig_txt):
21 | if doc_ids[i] == doc_id:
22 | context_window = context[i - min(k, ws):i]  # up to ws previous segments from the same document
23 | augm_txt.append(" {} ".format(sep_token).join(context_window + [orig_txt[i]]))
24 | i += 1
25 | else:
26 | doc_id = doc_ids[i]  # new document: reprocess segment i with an empty context window
27 | k = -1
28 | k += 1  # k counts segments seen so far in the current document
29 | return augm_txt
30 |
31 |
32 |
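33 | # Hypothetical smoke test (not part of the original script): two documents,
34 | # doc_ids ["A", "A", "B"]; the second "A" segment gains the first as context,
35 | # while "B" starts with an empty window.
36 | if __name__ == "__main__":
37 |     txt = ["a1", "a2", "b1"]
38 |     print(add_context(txt, txt, ["A", "A", "B"], sep_token="</s>"))
39 |     # -> ['a1', 'a1 </s> a2', 'b1']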
--------------------------------------------------------------------------------
/COMET/comet/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # -*- coding: utf-8 -*-
3 | # Copyright (C) 2020 Unbabel
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | import logging
18 |
19 | from .download_utils import download_model
20 | from .models import load_from_checkpoint
21 |
22 | logging.basicConfig(level=logging.INFO, format="%(message)s")
23 | logger = logging.getLogger(__name__)
24 |
25 |
26 | __version__ = "1.1.2"
27 | __copyright__ = "2020 Unbabel. All rights reserved."
28 |
--------------------------------------------------------------------------------
/COMET/comet/cli/train.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 |
3 | # Copyright (C) 2020 Unbabel
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | """
17 |
18 | Command for training new Metrics.
19 | =================================
20 |
21 | e.g:
22 | ```
23 | comet-train --cfg configs/models/regression_metric.yaml
24 | ```
25 |
26 | For more details run the following command:
27 | ```
28 | comet-train --help
29 | ```
30 | """
31 | import json
32 | import warnings
33 |
34 |
35 | from comet.models import (
36 | CometModel,
37 | RankingMetric,
38 | ReferencelessRegression,
39 | RegressionMetric,
40 | )
41 | from jsonargparse import ActionConfigFile, ArgumentParser, namespace_to_dict
42 | from pytorch_lightning import seed_everything
43 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
44 | from pytorch_lightning.trainer.trainer import Trainer
45 |
46 |
47 | def train_command() -> None:
48 | parser = ArgumentParser(description="Command for training COMET models.")
49 | parser.add_argument(
50 | "--seed_everything",
51 | type=int,
52 | default=12,
53 | help="Training Seed.",
54 | )
55 | parser.add_argument("--cfg", action=ActionConfigFile)
56 | parser.add_class_arguments(CometModel, "model")
57 | parser.add_subclass_arguments(RegressionMetric, "regression_metric")
58 | parser.add_subclass_arguments(
59 | ReferencelessRegression, "referenceless_regression_metric"
60 | )
61 | parser.add_subclass_arguments(RankingMetric, "ranking_metric")
62 | parser.add_subclass_arguments(EarlyStopping, "early_stopping")
63 | parser.add_subclass_arguments(ModelCheckpoint, "model_checkpoint")
64 | parser.add_subclass_arguments(Trainer, "trainer")
65 | cfg = parser.parse_args()
66 | seed_everything(cfg.seed_everything)
67 |
68 | checkpoint_callback = ModelCheckpoint(
69 | **namespace_to_dict(cfg.model_checkpoint.init_args)
70 | )
71 | early_stop_callback = EarlyStopping(
72 | **namespace_to_dict(cfg.early_stopping.init_args)
73 | )
74 | trainer_args = namespace_to_dict(cfg.trainer.init_args)
75 | trainer_args["callbacks"] = [early_stop_callback, checkpoint_callback]
76 | print("TRAINER ARGUMENTS: ")
77 | print(json.dumps(trainer_args, indent=4, default=lambda x: x.__dict__))
78 | trainer = Trainer(**trainer_args)
79 |
80 | print("MODEL ARGUMENTS: ")
81 | if cfg.regression_metric is not None:
82 | print(
83 | json.dumps(
84 | cfg.regression_metric.init_args, indent=4, default=lambda x: x.__dict__
85 | )
86 | )
87 | model = RegressionMetric(**namespace_to_dict(cfg.regression_metric.init_args))
88 | elif cfg.referenceless_regression_metric is not None:
89 | print(
90 | json.dumps(
91 | cfg.referenceless_regression_metric.init_args,
92 | indent=4,
93 | default=lambda x: x.__dict__,
94 | )
95 | )
96 | model = ReferencelessRegression(
97 | **namespace_to_dict(cfg.referenceless_regression_metric.init_args)
98 | )
99 | elif cfg.ranking_metric is not None:
100 | print(
101 | json.dumps(
102 | cfg.ranking_metric.init_args, indent=4, default=lambda x: x.__dict__
103 | )
104 | )
105 | model = RankingMetric(**namespace_to_dict(cfg.ranking_metric.init_args))
106 | else:
107 | raise Exception("Model configurations missing!")
108 | # Related to train/val_dataloaders:
109 |
110 | # 2 workers per gpu is enough! If set to the number of cpus on this machine
111 | # it throws another exception saying it's too many workers.
112 | warnings.filterwarnings(
113 | "ignore",
114 | category=UserWarning,
115 | message=".*Consider increasing the value of the `num_workers` argument` .*",
116 | )
117 | trainer.fit(model)
118 |
119 |
120 | if __name__ == "__main__":
121 | train_command()
122 |
--------------------------------------------------------------------------------
/COMET/comet/encoders/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Unbabel
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | import transformers
15 | from packaging import version
16 |
17 | from .bert import BERTEncoder
18 | from .minilm import MiniLMEncoder
19 | from .xlmr import XLMREncoder
20 |
21 | str2encoder = {"BERT": BERTEncoder, "XLM-RoBERTa": XLMREncoder, "MiniLM": MiniLMEncoder}
22 |
23 | if version.parse(transformers.__version__) >= version.parse("4.17.0"):
24 | from .xlmr_xl import XLMRXLEncoder
25 |
26 | str2encoder["XLM-RoBERTa-XL"] = XLMRXLEncoder
27 |
--------------------------------------------------------------------------------
/COMET/comet/encoders/base.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2020 Unbabel
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | r"""
15 | Encoder Model base
16 | ====================
17 | Module defining the common interface between all pretrained encoder models.
18 | """
19 | import abc
20 | from typing import Dict, List
21 |
22 | import torch
23 | import torch.nn as nn
24 |
25 |
26 | class Encoder(nn.Module, metaclass=abc.ABCMeta):
27 | """Base class for an encoder model."""
28 |
29 | @property
30 | @abc.abstractmethod
31 | def output_units(self):
32 | """Max number of tokens the encoder handles."""
33 | pass
34 |
35 | @property
36 | @abc.abstractmethod
37 | def max_positions(self):
38 | """Max number of tokens the encoder handles."""
39 | pass
40 |
41 | @property
42 | @abc.abstractmethod
43 | def num_layers(self):
44 | """Number of model layers available."""
45 | pass
46 |
47 | @classmethod
48 | @abc.abstractmethod
49 | def from_pretrained(cls, pretrained_model):
50 | """Function that loads a pretrained encoder and the respective tokenizer.
51 |
52 | :return: Encoder model
53 | """
54 | raise NotImplementedError
55 |
56 | def prepare_sample(self, sample: List[str]) -> Dict[str, torch.Tensor]:
57 | """Receives a list of strings and applies tokenization and vectorization.
58 |
59 | :param sample: List with text segments to be tokenized and padded.
60 |
61 | :return: Dictionary with HF model inputs.
62 | """
63 | tokenizer_output = self.tokenizer(
64 | sample,
65 | return_tensors="pt",
66 | padding=True,
67 | truncation=True,
68 | max_length=self.max_positions - 2,
69 | )
70 | return tokenizer_output
71 |
72 | def freeze(self) -> None:
73 | """Frezees the entire encoder."""
74 | for param in self.parameters():
75 | param.requires_grad = False
76 |
77 | def unfreeze(self) -> None:
78 | """Unfrezees the entire encoder."""
79 | for param in self.parameters():
80 | param.requires_grad = True
81 |
82 | @abc.abstractmethod
83 | def freeze_embeddings(self) -> None:
84 | """Frezees the embedding layer."""
85 | pass
86 |
87 | @abc.abstractmethod
88 | def layerwise_lr(self, lr: float, decay: float):
89 | """
90 | :param lr: Learning rate for the highest encoder layer.
91 | :param decay: decay percentage for the lower layers.
92 |
93 | :return: List of model parameters with layer-wise decay learning rate
94 | """
95 | pass
96 |
97 | @abc.abstractmethod
98 | def forward(
99 | self, tokens: torch.Tensor, lengths: torch.Tensor
100 | ) -> Dict[str, torch.Tensor]:
101 | pass
102 |
--------------------------------------------------------------------------------
/COMET/comet/encoders/bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2020 Unbabel
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | r"""
16 | BERT Encoder
17 | ==============
18 | Pretrained BERT encoder from Hugging Face.
19 | """
20 | from typing import Dict
21 |
22 | import torch
23 | from comet.encoders.base import Encoder
24 | from transformers import AutoModel, AutoTokenizer
25 |
26 |
27 | class BERTEncoder(Encoder):
28 | """BERT encoder.
29 |
30 | :param pretrained_model: Pretrained model from hugging face.
31 | """
32 |
33 | def __init__(self, pretrained_model: str) -> None:
34 | super().__init__()
35 | self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True)
36 | self.model = AutoModel.from_pretrained(pretrained_model)
37 | self.model.encoder.output_hidden_states = True
38 |
39 | @property
40 | def output_units(self):
41 | """Max number of tokens the encoder handles."""
42 | return self.model.config.hidden_size
43 |
44 | @property
45 | def max_positions(self):
46 | """Max number of tokens the encoder handles."""
47 | return self.model.config.max_position_embeddings
48 |
49 | @property
50 | def num_layers(self):
51 | """Number of model layers available."""
52 | return self.model.config.num_hidden_layers + 1
53 |
54 | @classmethod
55 | def from_pretrained(cls, pretrained_model: str) -> Encoder:
56 | """Function that loads a pretrained encoder from Hugging Face.
57 | :param pretrained_model: Name of the pretrained model to be loaded.
58 |
59 | :return: Encoder model
60 | """
61 | return BERTEncoder(pretrained_model)
62 |
63 | def freeze_embeddings(self) -> None:
64 | """Frezees the embedding layer."""
65 | for param in self.model.embeddings.parameters():
66 | param.requires_grad = False
67 |
68 | def layerwise_lr(self, lr: float, decay: float):
69 | """
70 | :param lr: Learning rate for the highest encoder layer.
71 | :param decay: decay percentage for the lower layers.
72 |
73 | :return: List of model parameters with layer-wise decay learning rate
74 | """
75 | # Embedding Layer
76 | opt_parameters = [
77 | {
78 | "params": self.model.embeddings.parameters(),
79 | "lr": lr * decay ** (self.num_layers),
80 | }
81 | ]
82 | # All layers
83 | opt_parameters += [
84 | {
85 | "params": self.model.encoder.layer[i].parameters(),
86 | "lr": lr * decay**i,
87 | }
88 | for i in range(self.num_layers - 2, 0, -1)
89 | ]
90 | return opt_parameters
91 |
92 | def forward(
93 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs
94 | ) -> Dict[str, torch.Tensor]:
95 | last_hidden_states, pooler_output, all_layers = self.model(
96 | input_ids=input_ids,
97 | attention_mask=attention_mask,
98 | output_hidden_states=True,
99 | return_dict=False,
100 | )
101 | return {
102 | "sentemb": pooler_output,
103 | "wordemb": last_hidden_states,
104 | "all_layers": all_layers,
105 | "attention_mask": attention_mask,
106 | }
107 |
--------------------------------------------------------------------------------
/COMET/comet/encoders/minilm.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2020 Unbabel
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | r"""
16 | MiniLM Encoder
17 | ==============
18 | Pretrained MiniLM encoder from Microsoft. This encoder uses a BERT
19 | architecture with an XLMR tokenizer.
20 | """
21 | from comet.encoders.bert import BERTEncoder
22 | from transformers import BertModel, XLMRobertaTokenizer
23 |
24 |
25 | class MiniLMEncoder(BERTEncoder):
26 | """MiniLMEncoder encoder.
27 |
28 | :param pretrained_model: Pretrained model from hugging face.
29 | """
30 |
31 | def __init__(self, pretrained_model: str) -> None:
32 | super(BERTEncoder, self).__init__()  # bypass BERTEncoder.__init__, which expects a model name
33 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(
34 | pretrained_model, use_fast=True
35 | )
36 | self.model = BertModel.from_pretrained(pretrained_model)
37 | self.model.encoder.output_hidden_states = True
38 |
--------------------------------------------------------------------------------
/COMET/comet/encoders/xlmr.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2020 Unbabel
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | r"""
16 | XLM-RoBERTa Encoder
17 | ==============
18 | Pretrained XLM-RoBERTa encoder from Hugging Face.
19 | """
20 | from typing import Dict
21 |
22 | import torch
23 | from comet.encoders.base import Encoder
24 | from comet.encoders.bert import BERTEncoder
25 | from transformers import XLMRobertaModel, XLMRobertaTokenizer
26 |
27 |
28 | class XLMREncoder(BERTEncoder):
29 | """XLM-RoBERTA Encoder encoder.
30 |
31 | :param pretrained_model: Pretrained model from hugging face.
32 | """
33 |
34 | def __init__(self, pretrained_model: str) -> None:
35 | super(Encoder, self).__init__()
36 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_model)
37 | self.model = XLMRobertaModel.from_pretrained(
38 | pretrained_model, add_pooling_layer=False
39 | )
40 | self.model.encoder.output_hidden_states = True
41 |
42 | @classmethod
43 | def from_pretrained(cls, pretrained_model: str) -> Encoder:
44 | """Function that loads a pretrained encoder from Hugging Face.
45 | :param pretrained_model: Name of the pretrained model to be loaded.
46 |
47 | :return: Encoder model
48 | """
49 | return XLMREncoder(pretrained_model)
50 |
51 | def forward(
52 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs
53 | ) -> Dict[str, torch.Tensor]:
54 | last_hidden_states, _, all_layers = self.model(
55 | input_ids=input_ids,
56 | attention_mask=attention_mask,
57 | output_hidden_states=True,
58 | return_dict=False,
59 | )
60 | return {
61 | "sentemb": last_hidden_states[:, 0, :],
62 | "wordemb": last_hidden_states,
63 | "all_layers": all_layers,
64 | "attention_mask": attention_mask,
65 | }
66 |
--------------------------------------------------------------------------------
/COMET/comet/encoders/xlmr_xl.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2020 Unbabel
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | r"""
16 | XLM-RoBERTa-XL Encoder
17 | ==============
18 | Pretrained XLM-RoBERTa-XL encoder from Hugging Face.
19 | """
20 | from comet.encoders.base import Encoder
21 | from comet.encoders.xlmr import XLMREncoder
22 | from transformers import XLMRobertaTokenizer, XLMRobertaXLModel
23 |
24 |
25 | class XLMRXLEncoder(XLMREncoder):
26 | """XLM-RoBERTA-XL Encoder encoder.
27 |
28 | :param pretrained_model: Pretrained model from hugging face.
29 | """
30 |
31 | def __init__(self, pretrained_model: str) -> None:
32 | super(Encoder, self).__init__()
33 | self.tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-large")
34 | self.model = XLMRobertaXLModel.from_pretrained(
35 | pretrained_model, add_pooling_layer=False
36 | )
37 | self.model.encoder.output_hidden_states = True
38 |
39 | @classmethod
40 | def from_pretrained(cls, pretrained_model: str) -> Encoder:
41 | """Function that loads a pretrained encoder from Hugging Face.
42 | :param pretrained_model: Name of the pretrained model to be loaded.
43 |
44 | :return: Encoder model
45 | """
46 | return XLMRXLEncoder(pretrained_model)
47 |
--------------------------------------------------------------------------------
/COMET/comet/models/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | # -*- coding: utf-8 -*-
3 | # Copyright (C) 2020 Unbabel
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | from .regression.regression_metric import RegressionMetric
17 | from .ranking.ranking_metric import RankingMetric
18 | from .regression.referenceless import ReferencelessRegression
19 | from .base import CometModel
20 |
21 | import os
22 | import yaml
23 |
24 | str2model = {
25 | "referenceless_regression_metric": ReferencelessRegression,
26 | "regression_metric": RegressionMetric,
27 | "ranking_metric": RankingMetric,
28 | }
29 |
30 | available_metrics = {
31 | # WMT20 Models
32 | "emnlp20-comet-rank": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/emnlp20-comet-rank.tar.gz",
33 | "wmt20-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-da.tar.gz",
34 | "wmt20-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-qe-da.tar.gz",
35 | "wmt20-comet-qe-da-v2": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-qe-da-v2.tar.gz",
36 |
37 | # WMT21 Models
38 | "wmt21-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-da.tar.gz",
39 | "wmt21-comet-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-mqm.tar.gz",
40 | "wmt21-cometinho-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-mqm.tar.gz",
41 | "wmt21-cometinho-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-da.tar.gz",
42 | "wmt21-comet-qe-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-qe-mqm.tar.gz",
43 | "wmt21-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-qe-da.tar.gz",
44 |
45 | #EAMT22 Models
46 | "eamt22-cometinho-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/eamt22/eamt22-cometinho-da.tar.gz",
47 | "eamt22-prune-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/eamt22/eamt22-prune-comet-da.tar.gz",
48 | }
49 |
50 |
51 | def load_from_checkpoint(checkpoint_path: str) -> CometModel:
52 | """Loads models from a checkpoint path.
53 | :param checkpoint_path: Path to a model checkpoint.
54 |
55 | :return: Returns a COMET model.
56 | """
57 | if not os.path.exists(checkpoint_path):
58 | raise Exception(f"Invalid checkpoint path: {checkpoint_path}")
59 |
60 | hparams_file = "/".join(checkpoint_path.split("/")[:-2] + ["hparams.yaml"])
61 | if os.path.exists(hparams_file):
62 | with open(hparams_file) as yaml_file:
63 | hparams = yaml.load(yaml_file.read(), Loader=yaml.FullLoader)
64 | model_class = str2model[hparams["class_identifier"]]
65 | model = model_class.load_from_checkpoint(checkpoint_path, **hparams)
66 | return model
67 | else:
68 | raise Exception("hparams.yaml file is missing!")
69 |
--------------------------------------------------------------------------------
/COMET/comet/models/metrics.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2020 Unbabel
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 |
16 | r"""
17 | Metrics
18 | =======
19 | Regression and Ranking metrics to be used during training to measure
20 | correlations with human judgements
21 | """
22 | import torch
23 | from torch import Tensor
24 |
25 | from torchmetrics import Metric
26 | from typing import Any, Callable, Dict, List, Optional
27 | import scipy.stats as stats
28 |
29 |
30 | class RegressionMetrics(Metric):
31 | is_differentiable = False
32 | higher_is_better = True
33 | preds: List[Tensor]
34 | target: List[Tensor]
35 |
36 | def __init__(
37 | self,
38 | prefix: str = "",
39 | compute_on_step: bool = False,
40 | dist_sync_on_step: bool = False,
41 | process_group: Optional[Any] = None,
42 | dist_sync_fn: Optional[Callable] = None,
43 | ) -> None:
44 | super().__init__(
45 | compute_on_step=compute_on_step,
46 | dist_sync_on_step=dist_sync_on_step,
47 | process_group=process_group,
48 | dist_sync_fn=dist_sync_fn,
49 | )
50 | self.add_state("preds", default=[], dist_reduce_fx="cat")
51 | self.add_state("target", default=[], dist_reduce_fx="cat")
52 | self.prefix = prefix
53 |
54 |
55 | def update(self, preds: Tensor, target: Tensor) -> None: # type: ignore
56 | """Update state with predictions and targets.
57 | Args:
58 | preds: Predictions from model
59 | target: Ground truth values
60 | """
61 | self.preds.append(preds)
62 | self.target.append(target)
63 |
64 | def compute(self) -> Dict[str, float]:
65 | """Computes Kendall, Spearman and Pearson correlation coefficients."""
66 | preds = torch.cat(self.preds, dim=0)
67 | target = torch.cat(self.target, dim=0)
68 | kendall, _ = stats.kendalltau(preds.tolist(), target.tolist())
69 | spearman, _ = stats.spearmanr(preds.tolist(), target.tolist())
70 | pearson, _ = stats.pearsonr(preds.tolist(), target.tolist())
71 | return {
72 | self.prefix + "_kendall": kendall,
73 | self.prefix + "_spearman": spearman,
74 | self.prefix + "_pearson": pearson,
75 | }
76 |
77 | class WMTKendall(Metric):
78 | def __init__(
79 | self,
80 | prefix: str = "",
81 | compute_on_step: bool = False,
82 | dist_sync_on_step: bool = False,
83 | process_group: Optional[Any] = None,
84 | dist_sync_fn: Optional[Callable] = None,
85 | ) -> None:
86 | super().__init__(
87 | compute_on_step=compute_on_step,
88 | dist_sync_on_step=dist_sync_on_step,
89 | process_group=process_group,
90 | dist_sync_fn=dist_sync_fn,
91 | )
92 | self.add_state("concordance", default=torch.tensor(0), dist_reduce_fx="sum")
93 | self.add_state("discordance", default=torch.tensor(0), dist_reduce_fx="sum")
94 | self.prefix = prefix
95 |
96 | def update(self, distance_pos: torch.Tensor, distance_neg: torch.Tensor):
97 | assert distance_pos.shape == distance_neg.shape
98 | self.concordance = self.concordance + torch.sum((distance_pos < distance_neg).float())  # accumulate across update calls
99 | self.discordance = self.discordance + torch.sum((distance_pos >= distance_neg).float())
100 |
101 | def compute(self):
102 | return {
103 | self.prefix
104 | + "_kendall": (self.concordance - self.discordance)
105 | / (self.concordance + self.discordance)
106 | }
107 |
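108 | # Hypothetical sanity check (not part of the original file): three concordant
109 | # pairs and one discordant pair give (3 - 1) / (3 + 1) = 0.5.
110 | if __name__ == "__main__":
111 |     metric = WMTKendall(prefix="val")
112 |     metric.update(torch.tensor([0.1, 0.2, 0.3, 0.9]), torch.tensor([0.5, 0.6, 0.7, 0.2]))
113 |     print(metric.compute())  # {'val_kendall': tensor(0.5000)}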
--------------------------------------------------------------------------------
/COMET/comet/models/predict_pbar.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | import pytorch_lightning as ptl
4 | from tqdm import tqdm
5 |
6 |
7 | class PredictProgressBar(ptl.callbacks.progress.tqdm_progress.TQDMProgressBar):
8 | """Default Lightning Progress bar writes to stdout, we replace stdout with stderr"""
9 |
10 | def init_predict_tqdm(self) -> tqdm:
11 | bar = tqdm(
12 | desc="Predicting",
13 | initial=self.train_batch_idx,
14 | position=(2 * self.process_position),
15 | disable=self.is_disabled,
16 | leave=True,
17 | dynamic_ncols=True,
18 | file=sys.stderr,
19 | smoothing=0,
20 | )
21 | return bar
22 |
--------------------------------------------------------------------------------
/COMET/comet/models/ranking/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/comet/models/ranking/__init__.py
--------------------------------------------------------------------------------
/COMET/comet/models/regression/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/comet/models/regression/__init__.py
--------------------------------------------------------------------------------
/COMET/comet/modules/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 | from .feedforward import FeedForward
3 | from .layerwise_attention import LayerwiseAttention
4 |
--------------------------------------------------------------------------------
/COMET/comet/modules/feedforward.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # Copyright (C) 2020 Unbabel
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | #
8 | # http://www.apache.org/licenses/LICENSE-2.0
9 | #
10 | # Unless required by applicable law or agreed to in writing, software
11 | # distributed under the License is distributed on an "AS IS" BASIS,
12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | # See the License for the specific language governing permissions and
14 | # limitations under the License.
15 | r"""
16 | Feed Forward
17 | ==============
18 | Feed Forward Neural Network module that can be used for classification or regression
19 | """
20 | from typing import List, Optional
21 |
22 | import torch
23 | from torch import nn
24 |
25 |
26 | class FeedForward(nn.Module):
27 | """
28 | Feed Forward Neural Network.
29 |
30 | :param in_dim: Number input features.
31 | :param out_dim: Number of output features. Default is just a score.
32 | :param hidden_sizes: List with hidden layer sizes.
33 | :param activations: Name of the activation function to be used in the hidden layers.
34 | :param final_activation: Name of the final activation function if any.
35 | :param dropout: dropout to be used in the hidden layers.
36 | """
37 |
38 | def __init__(
39 | self,
40 | in_dim: int,
41 | out_dim: int = 1,
42 | hidden_sizes: List[int] = [3072, 768],
43 | activations: str = "Sigmoid",
44 | final_activation: Optional[str] = None,
45 | dropout: float = 0.1,
46 | ) -> None:
47 | super().__init__()
48 | modules = []
49 | modules.append(nn.Linear(in_dim, hidden_sizes[0]))
50 | modules.append(self.build_activation(activations))
51 | modules.append(nn.Dropout(dropout))
52 |
53 | for i in range(1, len(hidden_sizes)):
54 | modules.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i]))
55 | modules.append(self.build_activation(activations))
56 | modules.append(nn.Dropout(dropout))
57 |
58 | modules.append(nn.Linear(hidden_sizes[-1], int(out_dim)))
59 | if final_activation is not None:
60 | modules.append(self.build_activation(final_activation))
61 |
62 | self.ff = nn.Sequential(*modules)
63 |
64 | def build_activation(self, activation: str) -> nn.Module:
65 | if hasattr(nn, activation.title()):
66 | return getattr(nn, activation.title())()
67 | else:
68 | raise Exception(f"{activation} is not a valid activation function!")
69 |
70 | def forward(self, in_features: torch.Tensor) -> torch.Tensor:
71 | return self.ff(in_features)
72 |
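For reference, here is a minimal usage sketch of the `FeedForward` module defined above; the dimensions and activation names are illustrative, not defaults taken from any config in this repository.

```python
# Hypothetical usage sketch of the FeedForward module defined above.
import torch

from comet.modules import FeedForward

ff = FeedForward(
    in_dim=1024,                 # e.g. the size of a pooled sentence embedding
    out_dim=1,                   # a single quality score
    hidden_sizes=[3072, 768],
    activations="Tanh",
    final_activation="Sigmoid",  # squash the score into [0, 1]
)
features = torch.randn(8, 1024)  # a batch of 8 feature vectors
scores = ff(features)            # shape: (8, 1)
```

Note that `build_activation` resolves names via `activation.title()`, so activation classes whose canonical name is not title-cased (e.g. `nn.ReLU`) will not resolve with this lookup.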
--------------------------------------------------------------------------------
/COMET/configs/early_stopping.yaml:
--------------------------------------------------------------------------------
1 | class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping
2 | init_args:
3 | monitor: val_pearson
4 | min_delta: 0.
5 | patience: 2
6 | verbose: False
7 | mode: max
8 | strict: True
9 | check_finite: True
10 | stopping_threshold: null
11 | divergence_threshold: null
12 | check_on_train_epoch_end: False
13 |
--------------------------------------------------------------------------------
/COMET/configs/model_checkpoint.yaml:
--------------------------------------------------------------------------------
1 | class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint
2 | init_args:
3 | dirpath: null
4 | filename: null
5 | monitor: val_pearson
6 | verbose: True
7 | save_last: False
8 | save_top_k: 3
9 | mode: max
10 | auto_insert_metric_name: True
11 | save_weights_only: True
12 | every_n_train_steps: null
13 | train_time_interval: null
14 | every_n_epochs: 1
15 | save_on_train_epoch_end: null
16 |
--------------------------------------------------------------------------------
/COMET/configs/models/ranking_metric.yaml:
--------------------------------------------------------------------------------
1 | ranking_metric:
2 | class_path: comet.models.RankingMetric
3 | init_args:
4 | nr_frozen_epochs: 0.3
5 | keep_embeddings_frozen: True
6 | optimizer: AdamW
7 | encoder_learning_rate: 1.0e-05
8 | learning_rate: 3.0e-05
9 | layerwise_decay: 0.95
10 | encoder_model: XLM-RoBERTa
11 | pretrained_model: xlm-roberta-base
12 | pool: avg
13 | layer: mix
14 | dropout: 0.1
15 | batch_size: 4
16 | train_data: data/TRAIN.csv
17 | validation_data: data/DEV.csv
18 |
19 | trainer: ../trainer.yaml
20 | early_stopping: ../early_stopping.yaml
21 | model_checkpoint: ../model_checkpoint.yaml
--------------------------------------------------------------------------------
/COMET/configs/models/referenceless_metric.yaml:
--------------------------------------------------------------------------------
1 | referenceless_regression_metric:
2 | class_path: comet.models.ReferencelessRegression
3 | init_args:
4 | nr_frozen_epochs: 0.3
5 | keep_embeddings_frozen: True
6 | optimizer: AdamW
7 | encoder_learning_rate: 1.0e-05
8 | learning_rate: 3.1e-05
9 | layerwise_decay: 0.95
10 | encoder_model: XLM-RoBERTa
11 | pretrained_model: xlm-roberta-large
12 | pool: avg
13 | layer: mix
14 | dropout: 0.1
15 | batch_size: 4
16 | train_data: data/TRAIN.csv
17 | validation_data: data/DEV.csv
18 | hidden_sizes:
19 | - 2048
20 | - 1024
21 |
22 | trainer: ../trainer.yaml
23 | early_stopping: ../early_stopping.yaml
24 | model_checkpoint: ../model_checkpoint.yaml
--------------------------------------------------------------------------------
/COMET/configs/models/regression_metric.yaml:
--------------------------------------------------------------------------------
1 | regression_metric:
2 | class_path: comet.models.RegressionMetric
3 | init_args:
4 | nr_frozen_epochs: 0.3
5 | keep_embeddings_frozen: True
6 | optimizer: AdamW
7 | encoder_learning_rate: 1.0e-05
8 | learning_rate: 3.1e-05
9 | layerwise_decay: 0.95
10 | encoder_model: MiniLM
11 | pretrained_model: microsoft/Multilingual-MiniLM-L12-H384
12 | pool: avg
13 | layer: mix
14 | dropout: 0.15
15 | batch_size: 8
16 | train_data: data/TRAIN.csv
17 | validation_data: data/DEV.csv
18 | hidden_sizes:
19 | - 384
20 |
21 | trainer: ../trainer.yaml
22 | early_stopping: ../early_stopping.yaml
23 | model_checkpoint: ../model_checkpoint.yaml
--------------------------------------------------------------------------------
/COMET/configs/trainer.yaml:
--------------------------------------------------------------------------------
1 | class_path: pytorch_lightning.trainer.trainer.Trainer
2 | init_args:
3 | accelerator: gpu
4 | devices: null
5 | accumulate_grad_batches: 4
6 | amp_backend: native #apex
7 | amp_level: null #'01'
8 | auto_lr_find: False
9 | auto_scale_batch_size: False
10 | auto_select_gpus: False
11 | benchmark: null
12 | check_val_every_n_epoch: 1
13 | default_root_dir: null
14 | deterministic: True
15 | fast_dev_run: False
16 | gradient_clip_val: 1.0
17 | gradient_clip_algorithm: norm
18 | limit_train_batches: 1.0
19 | limit_val_batches: 1.0
20 | limit_test_batches: 1.0
21 | limit_predict_batches: 1.0
22 | log_every_n_steps: 50
23 | profiler: null
24 | overfit_batches: 0
25 | plugins: null
26 | precision: 16
27 | max_epochs: 3
28 | min_epochs: 1
29 | max_steps: -1
30 | min_steps: null
31 | max_time: null
32 | num_nodes: 1
33 | num_processes: 1
34 | num_sanity_val_steps: 10
35 | reload_dataloaders_every_n_epochs: 0
36 | replace_sampler_ddp: True
37 | sync_batchnorm: False
38 | detect_anomaly: False
39 | tpu_cores: null
40 | track_grad_norm: -1
41 | val_check_interval: 1.0
42 | enable_model_summary: True
43 | move_metrics_to_cpu: True
44 | multiple_trainloader_mode: max_size_cycle
45 |
--------------------------------------------------------------------------------
/COMET/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/COMET/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/COMET/docs/source/_static/css/comet.css:
--------------------------------------------------------------------------------
1 | .wy-side-nav-search {
2 | background-color: #3852de;
3 | }
4 |
5 | .wy-side-nav-search > div.version {
6 | color: white;
7 | }
8 |
9 | .wy-menu-vertical p.caption {
10 | color: #3852de;
11 | }
12 |
13 | .wy-side-nav-search input[type=text] {
14 | border-color: #1a1a1a;
15 | }
16 |
17 | a {
18 | color: #3852de;
19 | }
20 |
21 | a:hover {
22 | color: #1a1a1a;
23 | }
--------------------------------------------------------------------------------
/COMET/docs/source/_static/img/COMET_lockup-dark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/COMET_lockup-dark.png
--------------------------------------------------------------------------------
/COMET/docs/source/_static/img/COMET_lockup-white.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/COMET_lockup-white.png
--------------------------------------------------------------------------------
/COMET/docs/source/_static/img/estimator_model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/estimator_model.jpg
--------------------------------------------------------------------------------
/COMET/docs/source/_static/img/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/logo.png
--------------------------------------------------------------------------------
/COMET/docs/source/_static/img/models.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/models.png
--------------------------------------------------------------------------------
/COMET/docs/source/_static/img/ranking_model.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/docs/source/_static/img/ranking_model.jpg
--------------------------------------------------------------------------------
/COMET/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = "COMET"
21 | copyright = (
22 | "2020, Unbabel. All rights reserved."
23 | "Source code available under Apache License 2.0"
24 | )
25 | author = "Unbabel"
26 |
27 | # The full version, including alpha/beta/rc tags
28 | release = "0.0.3"
29 |
30 |
31 | # -- General configuration ---------------------------------------------------
32 |
33 | # Add any Sphinx extension module names here, as strings. They can be
34 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
35 | # ones.
36 | extensions = [
37 | "sphinx.ext.autodoc",
38 | # 'sphinx.ext.doctest',
39 | "sphinx.ext.intersphinx",
40 | "sphinx.ext.todo",
41 | "sphinx.ext.coverage",
42 | "sphinx.ext.mathjax",
43 | "sphinx.ext.viewcode",
44 | "sphinx.ext.githubpages",
45 | "sphinx.ext.napoleon",
46 | "recommonmark",
47 | #'sphinxarg.ext',
48 | #'m2r',
49 | # 'sphinx-issues',
50 | # 'pytest-sphinx',
51 | "sphinx_markdown_tables",
52 | "sphinx.ext.autosectionlabel",
53 | ]
54 | autosectionlabel_prefix_document = True
55 |
56 | # Add any paths that contain templates here, relative to this directory.
57 | templates_path = ["_templates"]
58 |
59 | source_suffix = {
60 | ".rst": "restructuredtext",
61 | ".txt": "markdown",
62 | ".md": "markdown",
63 | }
64 |
65 | # List of patterns, relative to source directory, that match files and
66 | # directories to ignore when looking for source files.
67 | # This pattern also affects html_static_path and html_extra_path.
68 | exclude_patterns = []
69 |
70 | # The master toctree document.
71 | master_doc = "index"
72 |
73 | # List of patterns, relative to source directory, that match files and
74 | # directories to ignore when looking for source files.
75 | # This pattern also affects html_static_path and html_extra_path.
76 | exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"]
77 |
78 | # The name of the Pygments (syntax highlighting) style to use.
79 | pygments_style = None
80 |
81 | # -- Options for HTML output -------------------------------------------------
82 |
83 | # html_logo = '_static/img/LOGO.png'
84 |
85 | # The theme to use for HTML and HTML Help pages. See the documentation for
86 | # a list of builtin themes.
87 | #
88 | html_theme = "sphinx_rtd_theme"
89 |
90 |
91 | # Add any paths that contain custom static files (such as style sheets) here,
92 | # relative to this directory. They are copied after the builtin static files,
93 | # so a file named "default.css" will overwrite the builtin "default.css".
94 | html_static_path = ["_static"]
95 |
96 |
97 | def setup(app):
98 | app.add_css_file("css/comet.css")
99 |
--------------------------------------------------------------------------------
/COMET/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | COMET: High-quality Machine Translation Evaluation
2 | ===================================================
3 |
4 | .. image:: _static/img/COMET_lockup-dark.png
5 | :width: 800
6 | :alt: COMET by Unbabel
7 |
8 | What is COMET
9 | ==============
10 |
11 | COMET is an open-source framework for MT evaluation that can be used for two purposes:
12 |
13 | * To evaluate MT systems with our currently available high-performing metrics (check: :ref:`models:COMET Metrics`).
14 | * To train and develop new metrics.
15 |
16 |
17 |
18 |
19 | Contents:
20 | =========
21 |
22 | .. toctree::
23 | :maxdepth: 2
24 |
25 | installation
26 | running
27 | models
28 | training
29 |
30 |
31 | License
32 | ==============
33 |
34 | Free software: Apache License 2.0
35 |
36 | Citation
37 | =========
38 |
39 | If you use COMET to evaluate your MT system or to develop new metrics, please cite the following paper:
40 | `COMET: A Neural Framework for MT Evaluation <https://aclanthology.org/2020.emnlp-main.213/>`_
41 |
42 | Library Reference
43 | ==================
44 |
45 | .. toctree::
46 | :maxdepth: 2
47 |
48 | library
49 |
--------------------------------------------------------------------------------
/COMET/docs/source/installation.rst:
--------------------------------------------------------------------------------
1 | .. _installation:
2 | Installation
3 | ============
4 |
5 | Please note that since Python >= 3.7 is required, all the commands below, especially `pip`,
6 | also have to use the Python 3 version. This might require running `pip3` instead.
7 |
8 |
9 | Install COMET as a package with pip::
10 |
11 | pip install unbabel-comet
12 |
13 | Inside your project you can now::
14 |
15 | import comet
16 |
17 | or run it with our command line interface::
18 |
19 | comet --help
20 |
21 |
--------------------------------------------------------------------------------
/COMET/docs/source/library.rst:
--------------------------------------------------------------------------------
1 | Library Reference
2 | ===================================================
3 |
4 | ******************************
5 | Multilingual Encoders
6 | ******************************
7 |
8 | .. automodule:: comet.models.encoders.encoder_base
9 | :members:
10 |
11 | .. automodule:: comet.models.encoders.laser
12 | :members:
13 |
14 | .. automodule:: comet.models.encoders.bert
15 | :members:
16 |
17 | .. automodule:: comet.models.encoders.xlmr
18 | :members:
19 |
20 | ******************
21 | Model Base
22 | ******************
23 |
24 | .. automodule:: comet.models.model_base
25 | :members:
26 |
27 | **********
28 | Estimators
29 | **********
30 |
31 | .. automodule:: comet.models.estimators.estimator_base
32 | :members:
33 |
34 | .. automodule:: comet.models.estimators.comet_estimator
35 | :members:
36 |
37 | .. automodule:: comet.models.estimators.quality_estimator
38 | :members:
39 |
40 | **************************
41 | Translation Ranking Model
42 | **************************
43 |
44 | .. automodule:: comet.models.ranking.ranking_base
45 | :members:
46 |
47 | .. automodule:: comet.models.ranking.comet_ranker
48 | :members:
49 |
50 | *****************
51 | Auxiliary Modules
52 | *****************
53 |
54 | .. automodule:: comet.modules.feedforward
55 | :members:
56 |
57 | .. automodule:: comet.modules.scalar_mix
58 | :members:
--------------------------------------------------------------------------------
/COMET/docs/source/models.md:
--------------------------------------------------------------------------------
1 | ## COMET Metrics
2 |
3 | COMET models can be optimized towards different kinds of human judgements (for example HTER or DAs). Because of this, we provide a list of different metrics that you can use to test your systems:
4 |
5 | | Model | Description |
6 | | :--------------------- | :------------------------------------------------ |
7 | | ↑`wmt-large-da-estimator-1719` | **RECOMMENDED:** Estimator model built on top of XLM-R (large) trained on DA from WMT17, WMT18 and WMT19 |
8 | | ↑`wmt-base-da-estimator-1719` | Estimator model built on top of XLM-R (base) trained on DA from WMT17, WMT18 and WMT19 |
9 | | ↓`wmt-large-hter-estimator` | Estimator model built on top of XLM-R (large) trained to regress on HTER. |
10 | | ↓`wmt-base-hter-estimator` | Estimator model built on top of XLM-R (base) trained to regress on HTER. |
11 | | ↑`emnlp-base-da-ranker` | Translation ranking model that uses XLM-R to encode sentences. This model was trained with WMT17 and WMT18 Direct Assessments Relative Ranks (DARR). |
12 |
13 | The first four models (`wmt-*`) were trained and tested for the WMT2020 shared task, and were therefore only introduced in our submission to the shared task (paper still under review).
14 |
15 | **NOTE:** Even when regressing on the same human judgement, scores between metrics are not comparable (e.g. scores from a large and a base model are not comparable even when trained on the same type of judgements)! Please make sure you use the same metric when comparing 2 systems!
16 |
17 | Also, since HTER measures the amount of edits needed to correct an MT hypothesis, models regressing on HTER produce lower scores for better systems.
--------------------------------------------------------------------------------
/COMET/docs/source/running.rst:
--------------------------------------------------------------------------------
1 | .. _running:
2 | Running COMET
3 | ==============
4 |
5 | Command Line Interface
6 | ################################
7 |
8 | After installing COMET you can score your MT outputs with the following command::
9 |
10 | comet score -s sources.txt -h hypothesis.txt -r references.txt --model wmt-large-da-estimator-1719
11 |
12 | You can export your results to a JSON file using the `--to_json` flag::
13 |
14 | comet score -s sources.txt -h hypothesis.txt -r references.txt --model wmt-large-da-estimator-1719 --to_json output.json
15 |
16 |
17 | Using Python
18 | #############
19 |
20 | Instead of using the CLI, you can score your models using Python with the `predict` function::
21 |
22 | from comet.models import download_model
23 | model = download_model("wmt-large-da-estimator-1719", "path/where/to/save/models")
24 | data = [
25 | {
26 | "src": "Hello world!",
27 | "mt": "Oi mundo!",
28 | "ref": "Olá mundo!"
29 | },
30 | {
31 | "src": "This is a sample",
32 | "mt": "este é um exemplo",
33 | "ref": "isto é um exemplo!"
34 | }
35 | ]
36 | model.predict(data)
37 |
38 | Scoring MT outputs using lists::
39 |
40 | source = ["Hello world!", "This is a sample"]
41 | hypothesis = ["Oi mundo!", "este é um exemplo"]
42 | reference = ["Olá mundo!", "isto é um exemplo!"]
43 | data = {"src": source, "mt": hypothesis, "ref": reference}
44 | data = [dict(zip(data, t)) for t in zip(*data.values())]
45 | model.predict(data)
--------------------------------------------------------------------------------
/COMET/docs/source/training.md:
--------------------------------------------------------------------------------
1 | # Train your own Metric
2 |
3 | To train our models we rely on the [PyTorch Lightning](https://pytorch-lightning.readthedocs.io/en/0.8.4/) library. This means that all our models are [Lightning Modules](https://pytorch-lightning.readthedocs.io/en/0.8.4/lightning-module.html).
4 |
5 | To train a new metric we just need to run a single command:
6 |
7 | ```bash
8 | comet train -f {my_configs}.yaml
9 | ```
10 |
11 | This will setup a [Lightning Trainer](https://pytorch-lightning.readthedocs.io/en/0.8.4/trainer.html) and fit your module accordingly.
12 | ## Data Format
13 | To train your metric we expect your data to be a CSV with the following columns:
14 | - `src`: The source segment.
15 | - `mt`: The machine translation hypothesis.
16 | - `ref`: The reference segment.
17 | - `score`: The human judgment score.
18 |
19 | Example:
20 |
21 | | src | mt | ref | score |
22 | | :---------: | :------: | :------: | :------: |
23 | | Hello world! | Oi mundo. | Olá mundo! | 0.5 |
24 | | This is a sample | este é um exemplo | isto é um exemplo! | 0.8 |
25 |
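As a quick illustration, a CSV in this format could be produced with pandas (the file name and score values here are just examples, not part of the COMET API):

```python
# Hypothetical sketch: writing a training CSV with the expected columns.
import pandas as pd

rows = [
    {"src": "Hello world!", "mt": "Oi mundo.", "ref": "Olá mundo!", "score": 0.5},
    {"src": "This is a sample", "mt": "este é um exemplo", "ref": "isto é um exemplo!", "score": 0.8},
]
pd.DataFrame(rows).to_csv("data/TRAIN.csv", index=False)
```
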
26 | ## Training flags
27 |
28 | ### Lightning Trainer Configurations
29 |
30 | | Argument | Default | Description |
31 | | :--------- | :------ | :------ |
32 | | `seed` | 3 | Training seed. |
33 | | `deterministic` | True | If true enables cudnn.deterministic. Might make your system slower, but ensures reproducibility. |
34 | | `verbose` | False | Verbosity mode. |
35 | | `early_stopping` | True | Enables early stopping. |
36 | | `save_top_k` | 1 | Sets how many checkpoints we want to save (keeping only the best ones). |
37 | | `monitor` | Kendall | Metric to monitor during training. |
38 | | `metric_mode` | max | 'min' or 'max', depending on whether we wish to minimize or maximize the metric. |
39 | | `min_delta` | 0 | Sensitivity to the metric. |
40 | | `patience` | 1 | Number of epochs without improvement before stopping training. |
41 | | `accumulate_grad_batches` | 1 | Gradient accumulation steps. |
42 | | `lr_finder` | False | Enables the learning rate finder described in [Cyclical Learning Rates for Training Neural Networks](https://arxiv.org/abs/1506.01186) |
43 |
44 |
45 | ### Base Model Configurations
46 |
47 | | Argument | Default | Description |
48 | | :--------- | :------ | :------ |
49 | | `model` | `required` | Type of metric we want to train. Options: [`CometEstimator`, `CometRanker`, `QualityEstimator`] |
50 | | `batch_size` | 8 | Batch size used to train the model. |
51 | | `nr_frozen_epochs` | 0 | Number of epochs we keep the encoder frozen. |
52 | | `keep_embeddings_frozen` | False | If set to True, keeps the embedding layer frozen during training. Useful to save some GPU memory. |
53 | | `optimizer` | Adam | PyTorch Optimizer class name |
54 | | `learning_rate` | 1e-05 | Learning rate to be used during training. |
55 | | `scheduler` | constant | Learning Rate scheduler. Options: [`constant`, `linear_warmup`, `warmup_constant`] |
56 | | `warmup_steps` | None | Scheduler warmup steps. |
57 | | `encoder_model` | XLMR | Encoder model to be used. Options: [`LASER`, `BERT`, `XLMR`]. |
58 | | `pretrained_model` | xlmr.base | Pretrained model to be used, e.g. xlmr.base vs. xlmr.large (ignored for LASER). |
59 | | `pool` | avg | Pooling technique to create the sentence embeddings. Options: [`avg`, `avg+cls`, `max`, `cls`, `default`] |
60 | | `load_weights` | False | Loads compatible weights from another checkpoint file. |
61 | | `train_path` | `required` | Path to the training csv. |
62 | | `val_path` | `required` | Path to the validation csv. |
63 | | `test_path` | None | Path to the test csv. (Not used) |
64 | | `loader_workers` | False | Number of workers for loading and preparing the batches. |
65 |
66 | **Note:** The `Ranker` model requires no further configs.
67 |
68 | ### Estimator Specific Configurations
69 |
70 | | Argument | Default | Description |
71 | | :--------- | :------ | :------ |
72 | | `encoder_learning_rate` | `required` | Learning rate used to fine-tune the encoder. Note that this is different from the `learning_rate` config, which is used only for the layers on top. |
73 | | `layerwise_decay` | 1.0 | Decay for the layer-wise learning rates. If 1.0, no decay is applied. |
74 | | `layer` | mix | Layer of the pretrained encoder from which we extract the word embeddings. If `mix`, a layer-wise attention mechanism is used to combine different layers. |
75 | | `scalar_mix_dropout` | mix | Sets the layer-wise dropout. Ignored if `layer != mix`. |
76 | | `loss` | mse | `mse` for Mean Squared Error or `binary_xent` for Binary Cross Entropy. |
77 | | `hidden_sizes` | 1536,768 | Hidden sizes of the different feed-forward layers on top. |
78 | | `activations` | Tanh | Activation functions for the feed-forward layers on top. |
79 | | `dropout` | 0.1 | Dropout used in the feed-forward layers on top. |
80 | | `final_activation` | Sigmoid | Final activation function of the feed-forward. If `False`, the model outputs the logits. |
81 |
82 |
83 |
--------------------------------------------------------------------------------
/COMET/pyproject.toml:
--------------------------------------------------------------------------------
1 | [tool.poetry]
2 | name = "unbabel-comet"
3 | version = "1.1.2"
4 | description = "High-quality Machine Translation Evaluation"
5 | authors = ["Ricardo Rei, Craig Stewart, Catarina Farinha, Alon Lavie"]
6 | license = "Apache-2.0"
7 | readme = "README.md"
8 | homepage = "https://github.com/Unbabel/COMET"
9 | repository = "https://github.com/Unbabel/COMET"
10 | documentation = "https://unbabel.github.io/COMET/html/index.html"
11 | keywords = [
12 | "Machine Translation",
13 | "Evaluation",
14 | "Unbabel",
15 | "COMET"
16 | ]
17 | classifiers = [
18 | 'Development Status :: 4 - Beta',
19 | 'Environment :: Console',
20 | 'Intended Audience :: Science/Research',
21 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
22 | ]
23 | packages = [
24 | {include = "comet"},
25 | ]
26 | include = [
27 | "LICENSE",
28 | "pyproject.toml",
29 | "CONTRIBUTING.md"
30 | ]
31 |
32 | [tool.poetry.scripts]
33 | comet-train = 'comet.cli.train:train_command'
34 | comet-score = 'comet.cli.score:score_command'
35 | comet-compare = 'comet.cli.compare:compare_command'
36 | comet-mbr = 'comet.cli.mbr:mbr_command'
37 |
38 | [tool.poetry.dependencies]
39 | python = ">=3.7.0,<4.0.0"
40 | sentencepiece = "^0.1.96"
41 | pandas = "1.1.5"
42 | transformers = ">=4.8"
43 | pytorch-lightning = "1.6.4"
44 | jsonargparse = "3.13.1"
45 | torch = ">=1.6.0,<2"
46 | numpy = ">=1.20.0"
47 | torchmetrics = "0.8.2"
48 | sacrebleu = ">=2.0.0"
49 | scipy = ">=1.5.4"
50 |
51 | [tool.poetry.dev-dependencies]
52 | sphinx-markdown-tables = "0.0.15"
53 | coverage = "^5.5"
54 | scikit-learn = "0.24"
55 |
56 | [build-system]
57 | requires = ["poetry-core>=1.0.0"]
58 | build-backend = "poetry.core.masonry.api"
59 |
--------------------------------------------------------------------------------
/COMET/requirements.txt:
--------------------------------------------------------------------------------
1 | sentencepiece==0.1.96
2 | pandas==1.1.5
3 | transformers>=4.8
4 | pytorch-lightning==1.6.4
5 | jsonargparse==3.13.1
6 | torch>=1.6.0,<2
7 | numpy>=1.20.0
8 | torchmetrics==0.8.2
9 | sacrebleu>=2.0.0
10 | scipy>=1.5.4
11 |
--------------------------------------------------------------------------------
/COMET/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/__init__.py
--------------------------------------------------------------------------------
/COMET/tests/integration/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/integration/__init__.py
--------------------------------------------------------------------------------
/COMET/tests/integration/models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/integration/models/__init__.py
--------------------------------------------------------------------------------
/COMET/tests/integration/models/test_ranking_metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import shutil
4 | import unittest
5 | import warnings
6 |
7 | import torch
8 | from comet.models import RankingMetric
9 | from pytorch_lightning import seed_everything
10 | from pytorch_lightning.trainer.trainer import Trainer
11 | from scipy.stats import pearsonr
12 | from tests.data import DATA_PATH
13 | from torch.utils.data import DataLoader
14 |
15 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
16 | os.environ["OMP_NUM_THREADS"] = "1"
17 |
18 |
19 | class TestRankingMetric(unittest.TestCase):
20 | @classmethod
21 | def tearDownClass(cls):
22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints"))
23 |
24 | def test_training(self):
25 | seed_everything(12)
26 | warnings.filterwarnings(
27 | "ignore",
28 | #category=PossibleUserWarning,
29 | message="GPU available but not used.*",
30 | )
31 | trainer = Trainer(
32 | accelerator="cpu",
33 | max_epochs=4,
34 | deterministic=True,
35 | enable_checkpointing=True,
36 | default_root_dir=DATA_PATH,
37 | logger=False,
38 | enable_progress_bar=False,
39 | )
40 | model = RankingMetric(
41 | encoder_model="BERT",
42 | pretrained_model="google/bert_uncased_L-2_H-128_A-2",
43 | train_data=os.path.join(DATA_PATH, "test_ranking_data.csv"),
44 | validation_data=os.path.join(DATA_PATH, "test_ranking_data.csv"),
45 | layerwise_decay=0.95,
46 | batch_size=32,
47 | learning_rate=1e-04,
48 | encoder_learning_rate=1e-04,
49 | )
50 | warnings.filterwarnings(
51 | "ignore",
52 | category=UserWarning,
53 | message=".*Consider increasing the value of the `num_workers` argument` .*",
54 | )
55 | trainer.fit(model)
56 | self.assertTrue(
57 | os.path.exists(
58 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=16.ckpt")
59 | )
60 | )
61 | saved_model = RankingMetric.load_from_checkpoint(
62 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=16.ckpt")
63 | )
64 | dataset = saved_model.read_csv(
65 | os.path.join(DATA_PATH, "test_regression_data.csv"), regression=True
66 | )
67 | y = [s["score"] for s in dataset]
68 | dataloader = DataLoader(
69 | dataset=dataset,
70 | batch_size=256,
71 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True),
72 | num_workers=2,
73 | )
74 | y_hat = (
75 | torch.cat(
76 | trainer.predict(
77 | ckpt_path="best", dataloaders=dataloader, return_predictions=True
78 | ),
79 | dim=0,
80 | )
81 | .cpu()
82 | .tolist()
83 | )
84 | # This shouldn't break!
85 | pearsonr(y_hat, y)[0]
86 |
--------------------------------------------------------------------------------
/COMET/tests/integration/models/test_referenceless_regression.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import shutil
4 | import unittest
5 | import warnings
6 |
7 | import torch
8 | from comet.models import ReferencelessRegression
9 | from pytorch_lightning import seed_everything
10 | from pytorch_lightning.trainer.trainer import Trainer
11 | from scipy.stats import pearsonr
12 | from tests.data import DATA_PATH
13 | from torch.utils.data import DataLoader
14 |
15 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
16 | os.environ["OMP_NUM_THREADS"] = "1"
17 |
18 |
19 | class TestReferencelessRegression(unittest.TestCase):
20 | @classmethod
21 | def tearDownClass(cls):
22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints"))
23 |
24 | def test_training(self):
25 | seed_everything(12)
26 | warnings.filterwarnings(
27 | "ignore",
28 | #category=PossibleUserWarning,
29 | message="GPU available but not used.*",
30 | )
31 | trainer = Trainer(
32 | accelerator="cpu",
33 | max_epochs=10,
34 | deterministic=True,
35 | enable_checkpointing=True,
36 | default_root_dir=DATA_PATH,
37 | logger=False,
38 | enable_progress_bar=False,
39 | )
40 | model = ReferencelessRegression(
41 | encoder_model="BERT",
42 | pretrained_model="google/bert_uncased_L-2_H-128_A-2",
43 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"),
44 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"),
45 | hidden_sizes=[256],
46 | layerwise_decay=0.95,
47 | batch_size=32,
48 | learning_rate=1e-04,
49 | encoder_learning_rate=1e-04,
50 | )
51 | warnings.filterwarnings(
52 | "ignore",
53 | category=UserWarning,
54 | message=".*Consider increasing the value of the `num_workers` argument` .*",
55 | )
56 | trainer.fit(model)
57 | self.assertTrue(
58 | os.path.exists(
59 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt")
60 | )
61 | )
62 |
63 | saved_model = ReferencelessRegression.load_from_checkpoint(
64 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt")
65 | )
66 | dataset = saved_model.read_csv(
67 | os.path.join(DATA_PATH, "test_regression_data.csv")
68 | )
69 | y = [s["score"] for s in dataset]
70 | dataloader = DataLoader(
71 | dataset=dataset,
72 | batch_size=256,
73 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True),
74 | num_workers=2,
75 | )
76 | y_hat = (
77 | torch.cat(
78 | trainer.predict(
79 | ckpt_path="best", dataloaders=dataloader, return_predictions=True
80 | ),
81 | dim=0,
82 | )
83 | .cpu()
84 | .tolist()
85 | )
86 | assert pearsonr(y_hat, y)[0] > 0.77
--------------------------------------------------------------------------------
/COMET/tests/integration/models/test_regression_metric.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import os
3 | import shutil
4 | import unittest
5 | import warnings
6 |
7 | import torch
8 | from comet.models import RegressionMetric
9 | from pytorch_lightning import seed_everything
10 | from pytorch_lightning.trainer.trainer import Trainer
11 | from scipy.stats import pearsonr
12 | from tests.data import DATA_PATH
13 | from torch.utils.data import DataLoader
14 |
15 | os.environ["TOKENIZERS_PARALLELISM"] = "false"
16 | os.environ["OMP_NUM_THREADS"] = "1"
17 |
18 |
19 | class TestRegressionMetric(unittest.TestCase):
20 | @classmethod
21 | def tearDownClass(cls):
22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints"))
23 |
24 | def test_training(self):
25 | seed_everything(12)
26 | warnings.filterwarnings(
27 | "ignore",
28 | #category=PossibleUserWarning,
29 | message="GPU available but not used.*",
30 | )
31 | trainer = Trainer(
32 | accelerator="cpu",
33 | max_epochs=10,
34 | deterministic=True,
35 | enable_checkpointing=True,
36 | default_root_dir=DATA_PATH,
37 | logger=False,
38 | enable_progress_bar=False,
39 | )
40 | model = RegressionMetric(
41 | encoder_model="BERT",
42 | pretrained_model="google/bert_uncased_L-2_H-128_A-2",
43 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"),
44 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"),
45 | hidden_sizes=[384],
46 | layerwise_decay=0.95,
47 | batch_size=32,
48 | learning_rate=1e-04,
49 | encoder_learning_rate=1e-04,
50 | )
51 | warnings.filterwarnings(
52 | "ignore",
53 | category=UserWarning,
54 | message=".*Consider increasing the value of the `num_workers` argument` .*",
55 | )
56 | trainer.fit(model)
57 | self.assertTrue(
58 | os.path.exists(
59 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt")
60 | )
61 | )
62 |
63 | saved_model = RegressionMetric.load_from_checkpoint(
64 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=160.ckpt")
65 | )
66 | dataset = saved_model.read_csv(
67 | os.path.join(DATA_PATH, "test_regression_data.csv")
68 | )
69 | y = [s["score"] for s in dataset]
70 | dataloader = DataLoader(
71 | dataset=dataset,
72 | batch_size=256,
73 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True),
74 | num_workers=2,
75 | )
76 | y_hat = (
77 | torch.cat(
78 | trainer.predict(
79 | ckpt_path="best", dataloaders=dataloader, return_predictions=True
80 | ),
81 | dim=0,
82 | )
83 | .cpu()
84 | .tolist()
85 | )
86 | assert pearsonr(y_hat, y)[0] > 0.77
--------------------------------------------------------------------------------
/COMET/tests/integration/modules/test_feedforward.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import unittest
3 |
4 | import torch
5 | from sklearn.datasets import load_digits
6 | from sklearn.model_selection import train_test_split
7 | from torch import nn
8 |
9 | from comet.modules.feedforward import FeedForward
10 | from pytorch_lightning import seed_everything
11 |
12 |
13 | class TestFeedForward(unittest.TestCase):
14 | def test_MNIST(self):
15 | seed_everything(3)
16 | """
17 | STEP 1: LOADING DATASET
18 | """
19 | images, labels = load_digits(return_X_y=True)
20 | images = [torch.Tensor(images[i, :]) for i in range(images.shape[0])]
21 | labels = torch.tensor(labels, dtype=torch.long)
22 |
23 | train_images, test_images, train_labels, test_labels = train_test_split(
24 | images, labels, test_size=0.2, random_state=42
25 | )
26 |
27 | train_dataset = list(zip(train_images, train_labels))
28 | test_dataset = list(zip(test_images, test_labels))
29 |
30 | """
31 | STEP 2: MAKING DATASET ITERABLE
32 | """
33 | batch_size = 256
34 | n_iters = 80
35 | num_epochs = n_iters / (len(train_dataset) / batch_size)
36 | num_epochs = int(num_epochs)
37 |
38 | train_loader = torch.utils.data.DataLoader(
39 | dataset=train_dataset, batch_size=batch_size, shuffle=True
40 | )
41 |
42 | test_loader = torch.utils.data.DataLoader(
43 | dataset=test_dataset, batch_size=batch_size, shuffle=False
44 | )
45 |
46 | """
47 | STEP 3: INSTANTIATE MODEL CLASS
48 | """
49 | model = FeedForward(
50 | in_dim=8 * 8,
51 | out_dim=10,
52 | hidden_sizes=[100],
53 | activations="Tanh",
54 | )
55 |
56 | """
57 | STEP 4: INSTANTIATE LOSS CLASS
58 | """
59 | criterion = nn.CrossEntropyLoss()
60 |
61 | """
62 | STEP 5: INSTANTIATE OPTIMIZER CLASS
63 | """
64 | learning_rate = 0.1
65 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
66 |
67 | """
68 |         STEP 6: TRAIN THE MODEL
69 | """
70 | iter = 0
71 | for epoch in range(num_epochs):
72 | for i, (images, labels) in enumerate(train_loader):
73 | # Load images with gradient accumulation capabilities
74 | images = images.view(-1, 8 * 8).requires_grad_()
75 |
76 | # Clear gradients w.r.t. parameters
77 | optimizer.zero_grad()
78 |
79 | # Forward pass to get output/logits
80 | outputs = model(images)
81 |
82 | # Calculate Loss: softmax --> cross entropy loss
83 | loss = criterion(outputs, labels)
84 |
85 | # Getting gradients w.r.t. parameters
86 | loss.backward()
87 |
88 | # Updating parameters
89 | optimizer.step()
90 |
91 | iter += 1
92 |
93 | if iter % 10 == 0:
94 | # Calculate Accuracy
95 | correct = 0
96 | total = 0
97 | # Iterate through test dataset
98 | for images, labels in test_loader:
99 | # Load images with gradient accumulation capabilities
100 | images = images.view(-1, 8 * 8).requires_grad_()
101 |
102 | # Forward pass only to get logits/output
103 | outputs = model(images)
104 |
105 | # Get predictions from the maximum value
106 | _, predicted = torch.max(outputs.data, 1)
107 |
108 | # Total number of labels
109 | total += labels.size(0)
110 |
111 | # Total correct predictions
112 | correct += (predicted == labels).sum()
113 |
114 | accuracy = 100 * correct // total
115 | self.assertGreaterEqual(accuracy, 95)
116 |
--------------------------------------------------------------------------------
/COMET/tests/unit/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/unit/__init__.py
--------------------------------------------------------------------------------
/COMET/tests/unit/encoders/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/COMET/tests/unit/encoders/__init__.py
--------------------------------------------------------------------------------
/COMET/tests/unit/encoders/test_bert.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import unittest
3 |
4 | from comet.encoders.bert import BERTEncoder
5 |
6 |
7 | class TestBERTEncoder(unittest.TestCase):
8 |
9 | bert = BERTEncoder.from_pretrained("google/bert_uncased_L-2_H-128_A-2")
10 |
11 | def test_num_layers(self):
12 | self.assertEqual(self.bert.num_layers, 3)
13 |
14 | def test_output_units(self):
15 | self.assertEqual(self.bert.output_units, 128)
16 |
17 | def test_max_positions(self):
18 | self.assertEqual(self.bert.max_positions, 512)
19 |
20 | def test_prepare_sample(self):
21 | sample = ["hello world, welcome to COMET!", "This is a batch"]
22 | model_input = self.bert.prepare_sample(sample)
23 | self.assertIn("input_ids", model_input)
24 | self.assertIn("attention_mask", model_input)
25 |
26 | def test_forward(self):
27 | sample = ["hello world, welcome to COMET!", "This is a batch"]
28 | model_input = self.bert.prepare_sample(sample)
29 | model_output = self.bert(**model_input)
30 | self.assertIn("wordemb", model_output)
31 | self.assertIn("sentemb", model_output)
32 | self.assertIn("all_layers", model_output)
33 | self.assertIn("attention_mask", model_output)
34 |
--------------------------------------------------------------------------------
/COMET/tests/unit/encoders/test_xlmr.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import unittest
3 |
4 | from comet.encoders.xlmr import XLMREncoder
5 |
6 |
7 | class TestXLMREncoder(unittest.TestCase):
8 |
9 | xlmr = XLMREncoder.from_pretrained("Unbabel/xlm-roberta-comet-small")
10 |
11 | def test_num_layers(self):
12 | self.assertEqual(self.xlmr.num_layers, 7)
13 |
14 | def test_output_units(self):
15 | self.assertEqual(self.xlmr.output_units, 384)
16 |
17 | def test_max_positions(self):
18 | self.assertEqual(self.xlmr.max_positions, 514)
19 |
20 | def test_prepare_sample(self):
21 | sample = ["hello world, welcome to COMET!", "This is a batch"]
22 | model_input = self.xlmr.prepare_sample(sample)
23 | self.assertIn("input_ids", model_input)
24 | self.assertIn("attention_mask", model_input)
25 |
26 | def test_forward(self):
27 | sample = ["hello world, welcome to COMET!", "This is a batch"]
28 | model_input = self.xlmr.prepare_sample(sample)
29 | model_output = self.xlmr(**model_input)
30 | self.assertIn("wordemb", model_output)
31 | self.assertIn("sentemb", model_output)
32 | self.assertIn("all_layers", model_output)
33 | self.assertIn("attention_mask", model_output)
34 |
--------------------------------------------------------------------------------
/COMET/tests/unit/test_download_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import unittest
3 | import os
4 | import shutil
5 | from tests.data import DATA_PATH
6 | from comet.download_utils import download_model
7 | from comet.models import load_from_checkpoint
8 |
9 |
10 | class TestDownloadModel(unittest.TestCase):
11 | @classmethod
12 | def tearDownClass(cls):
13 | shutil.rmtree(os.path.join(DATA_PATH, "wmt21-cometinho-da"))
14 |
15 | def test_download_from_s3(self):
16 | data_path = download_model("wmt21-cometinho-da", saving_directory=DATA_PATH)
17 | self.assertTrue(
18 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/hparams.yaml"))
19 | )
20 | self.assertTrue(
21 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/checkpoints/"))
22 | )
23 | load_from_checkpoint(data_path)
24 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing Guidelines
2 |
3 | Thank you for your interest in contributing to our project. Whether it's a bug report, new feature, correction, or additional
4 | documentation, we greatly value feedback and contributions from our community.
5 |
6 | Please read through this document before submitting any issues or pull requests to ensure we have all the necessary
7 | information to effectively respond to your bug report or contribution.
8 |
9 |
10 | ## Reporting Bugs/Feature Requests
11 |
12 | We welcome you to use the GitHub issue tracker to report bugs or suggest features.
13 |
14 | When filing an issue, please check existing open, or recently closed, issues to make sure somebody else hasn't already
15 | reported the issue. Please try to include as much information as you can. Details like these are incredibly useful:
16 |
17 | * A reproducible test case or series of steps
18 | * The version of our code being used
19 | * Any modifications you've made relevant to the bug
20 | * Anything unusual about your environment or deployment
21 |
22 |
23 | ## Contributing via Pull Requests
24 | Contributions via pull requests are much appreciated. Before sending us a pull request, please ensure that:
25 |
26 | 1. You are working against the latest source on the *main* branch.
27 | 2. You check existing open, and recently merged, pull requests to make sure someone else hasn't addressed the problem already.
28 | 3. You open an issue to discuss any significant work - we would hate for your time to be wasted.
29 |
30 | To send us a pull request, please:
31 |
32 | 1. Fork the repository.
33 | 2. Modify the source; please focus on the specific change you are contributing. If you also reformat all the code, it will be hard for us to focus on your change.
34 | 3. Ensure local tests pass.
35 | 4. Commit to your fork using clear commit messages.
36 | 5. Send us a pull request, answering any default questions in the pull request interface.
37 | 6. Pay attention to any automated CI failures reported in the pull request, and stay involved in the conversation.
38 |
39 | GitHub provides additional documentation on [forking a repository](https://help.github.com/articles/fork-a-repo/) and
40 | [creating a pull request](https://help.github.com/articles/creating-a-pull-request/).
41 |
42 |
43 | ## Finding contributions to work on
44 | Looking at the existing issues is a great way to find something to contribute to. As our projects, by default, use the default GitHub issue labels (enhancement/bug/duplicate/help wanted/invalid/question/wontfix), looking at any 'help wanted' issues is a great place to start.
45 |
46 |
47 | ## Code of Conduct
48 | This project has adopted the [Amazon Open Source Code of Conduct](https://aws.github.io/code-of-conduct).
49 | For more information see the [Code of Conduct FAQ](https://aws.github.io/code-of-conduct-faq) or contact
50 | opensource-codeofconduct@amazon.com with any additional questions or comments.
51 |
52 |
53 | ## Security issue notifications
54 | If you discover a potential security issue in this project we ask that you notify AWS/Amazon Security via our [vulnerability reporting page](http://aws.amazon.com/security/vulnerability-reporting/). Please do **not** create a public github issue.
55 |
56 |
57 | ## Licensing
58 |
59 | See the [LICENSE](LICENSE) file for our project's licensing. We will ask you to confirm the licensing of your contribution.
60 |
--------------------------------------------------------------------------------
/Config:
--------------------------------------------------------------------------------
1 | package.Giorgos_internship_code = {
2 | interfaces = (1.0);
3 |
4 | # Use NoOpBuild. See https://w.amazon.com/index.php/BrazilBuildSystem/NoOpBuild
5 | build-system = no-op;
6 | build-tools = {
7 | 1.0 = {
8 | NoOpBuild = 1.0;
9 | };
10 | };
11 |
12 | # Use runtime-dependencies for when you want to bring in additional
13 | # packages when deploying.
14 | # Use dependencies instead if you intend for these dependencies to
15 | # be exported to other packages that build against you.
16 | dependencies = {
17 | 1.0 = {
18 | };
19 | };
20 |
21 | runtime-dependencies = {
22 | 1.0 = {
23 | };
24 | };
25 |
26 | };
27 |
--------------------------------------------------------------------------------
/NOTICE:
--------------------------------------------------------------------------------
1 | Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
2 |
--------------------------------------------------------------------------------
/Prism/README.md:
--------------------------------------------------------------------------------
1 | # Doc-Prism (mBART-50)
2 |
3 | This README describes how to use **Doc-Prism**, an extension of the original Prism metric that can be used for document-level evaluation.
4 |
5 | Unlike the original implementation, which used a multilingual MT model, we use [mBART-50](https://arxiv.org/abs/2008.00401), a multilingual language model that is pre-trained at the document level, to score the MT outputs.
6 |
7 | ## Installation
8 |
9 | This codebase is an implementation of the [Prism metric](https://github.com/thompsonb/prism) using the [Hugging Face Transformers](https://huggingface.co/docs/transformers/index) library. For a detailed presentation of the Prism metric, including usage examples and instructions, see the original documentation.
10 |
11 | ### Get some files to score
12 | ```bash
13 | sacrebleu -t wmt21 -l en-de --echo src | head -n 20 > src.en
14 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > ref.de
15 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > hyp.de # put your system output here
16 | ```
17 |
18 | To evaluate at the document level we need to know where the document boundaries are in the test set, so that we only use valid context. This is passed in as a file where each line contains a document ID.
19 |
20 | For WMT test sets this can be obtained via [sacreBLEU](https://github.com/mjpost/sacrebleu):
21 | ```bash
22 | sacrebleu -t wmt21 -l en-de --echo docid | head -n 20 > docids.ende
23 | ```
24 |
25 |
26 | ### Python usage:
27 |
28 | In order to use Doc-Prism from Python, simply add `doc=True` when calling the `score` function.
29 |
30 | ```python
31 | from prism import MBARTPrism
32 | from add_context import add_context
33 |
34 | # load data files
35 | doc_ids = [x.strip() for x in open('docids.ende', 'rt').readlines()]
36 | hyp = [x.strip() for x in open('hyp.de', 'rt').readlines()]
37 | ref = [x.strip() for x in open('ref.de', 'rt').readlines()]
38 |
39 | # load prism model
40 | model_path = "facebook/mbart-large-50"
41 | prism = MBARTPrism(checkpoint=model_path, src_lang="en", tgt_lang="de")
42 |
43 | # add contexts to reference and hypothesis texts
44 | hyp = add_context(orig_txt=hyp, context=ref, doc_ids=doc_ids, sep_token=prism.encoder.tokenizer.sep_token)
45 | ref = add_context(orig_txt=ref, context=ref, doc_ids=doc_ids, sep_token=prism.encoder.tokenizer.sep_token)
46 |
47 | seg_score = prism.score(cand=hyp, ref=ref, batch_size=8, doc=True)
48 | ```
49 |
50 | ## Reproduce
51 | To reproduce the Doc-Prism results from the paper run the [score_doc-metrics.py](/score_doc-metrics.py) script with the flags `--model prism` and `--doc`.
52 |
53 | ```bash
54 | git clone https://github.com/google-research/mt-metrics-eval.git
55 | cd mt-metrics-eval
56 | pip install .
57 | alias mtme='python3 -m mt_metrics_eval.mtme'
58 | mtme --download # Puts ~1G of data into $HOME/.mt-metrics-eval.
59 | ```
60 | To obtain system-level scores of Doc-Prism (mBART-50) for the WMT21 test set run:
61 | ```bash
62 | python score_doc-metrics.py --campaign wmt21.news --lp en-de --model prism --doc --level sys
63 | ```
64 |
65 | ## Paper
66 |
67 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf):
68 |
69 | ```
70 | @inproceedings{easy_doc_mt,
71 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric},
72 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello},
73 | booktitle = "Proceedings of the Seventh Conference on Machine Translation",
74 | month = dec,
75 | year = "2022",
76 | address = "Abu Dhabi, United Arab Emirates",
77 | publisher = "Association for Computational Linguistics",
78 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf",
79 | }
80 | ```
81 |
--------------------------------------------------------------------------------
/Prism/add_context.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from typing import List
3 |
4 |
5 | def add_context(orig_txt: List[str], context: List[str], doc_ids: List[str], sep_token: str = "",
6 | ws: int = 2) -> List[str]:
7 | """Function that adds the previous sentences as context to the current sentence, respecting document boundaries
8 | :param orig_txt: the original text
9 | :param context: the text from which the context will be taken (same as orig_txt for source/reference)
10 | :param doc_ids: the document where each segment belongs to
11 | :param sep_token: the separator token of the tokenizer for the specific model
12 | :param ws: the window size, maximum of the previous sentences to be considered as context
13 | :return: the original text augmented with context
14 | """
15 | if not (len(orig_txt) == len(context) == len(doc_ids)):
16 | raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(context)={len(context)}, len(doc_ids)={len(doc_ids)}')
17 | i, k = 0, 0
18 | augm_txt = []
19 | doc_id = doc_ids[0]
20 | while i < len(orig_txt):
21 | if doc_ids[i] == doc_id:
22 | context_window = context[i - min(k, ws):i]
23 | augm_txt.append(" {} ".format(sep_token).join(context_window + [orig_txt[i]]))
24 | i += 1
25 | else:
26 | doc_id = doc_ids[i]
27 | k = -1
28 | k += 1
29 | return augm_txt
30 |
31 |
32 |
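A minimal sketch of `add_context` on a toy two-document input (the sentences, doc ids, and separator token here are illustrative; in practice the separator comes from the model's tokenizer, as the README above shows):

```python
# Hypothetical sketch of add_context, assuming the function above is importable.
from add_context import add_context

text = ["Good morning.", "How are you?", "Fine, thanks."]
doc_ids = ["docA", "docA", "docB"]

augmented = add_context(orig_txt=text, context=text, doc_ids=doc_ids,
                        sep_token="</s>", ws=2)
# augmented == ["Good morning.",
#               "Good morning. </s> How are you?",
#               "Fine, thanks."]  # new document, so no context is added
```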
--------------------------------------------------------------------------------
/Prism/prism.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from tqdm import tqdm
3 | from transformers import MBartForConditionalGeneration, MBart50TokenizerFast, MBart50Tokenizer
4 | import torch.nn.functional as F
5 | import torch.nn as nn
6 | import numpy as np
7 |
8 |
9 | class MBARTPrism:
10 |     def __init__(self, src_lang, tgt_lang, checkpoint='facebook/mbart-large-cc25', device=None):
11 | langs = ["ar_AR", "cs_CZ", "de_DE", "en_XX", "es_XX", "et_EE", "fi_FI", "fr_XX", "gu_IN", "hi_IN", "it_IT",
12 | "ja_XX", "kk_KZ", "ko_KR", "lt_LT", "lv_LV", "my_MM", "ne_NP", "nl_XX", "ro_RO", "ru_RU", "si_LK",
13 | "tr_TR", "vi_VN", "zh_CN", "pl_PL", "ta_IN"]
14 | src_lang = [l for l in langs if src_lang in l][0]
15 | tgt_lang = [l for l in langs if tgt_lang in l][0]
16 |
17 | if device is None:
18 | self.device = "cuda" if torch.cuda.is_available() else "cpu"
19 | else:
20 | self.device = device
21 | self.tokenizer = MBart50Tokenizer.from_pretrained(checkpoint, src_lang=src_lang, tgt_lang=tgt_lang)
22 | self.model = MBartForConditionalGeneration.from_pretrained(checkpoint)
23 | self.model.eval()
24 | self.model.to(self.device)
25 |
26 | # Set up loss
27 | self.loss_fct = nn.NLLLoss(reduction='none', ignore_index=self.model.config.pad_token_id)
28 | self.lsm = nn.LogSoftmax(dim=1)
29 |
30 | def score(self, cand, ref, batch_size, doc, segment_scores=True):
31 | """ Score a batch of examples """
32 |
33 | if len(cand) != len(ref):
34 | raise Exception(f'Length of cand ({len(cand)}) does not match length of ref ({len(ref)})')
35 |
36 | sent_scores = [[], []]
37 |
38 | with torch.no_grad():
39 | for sent_idx, (srcs, tgts) in enumerate([(ref, cand), (cand, ref)]):
40 | for i in tqdm(range(0, len(srcs), batch_size)):
41 | src_list = srcs[i: i + batch_size]
42 | tgt_list = tgts[i: i + batch_size]
43 | with torch.no_grad():
44 | encoded_src = self.tokenizer(
45 | src_list,
46 | truncation=True,
47 | padding=True,
48 | return_tensors='pt',
49 | max_length=self.tokenizer.model_max_length
50 | )
51 | with self.tokenizer.as_target_tokenizer():
52 | encoded_tgt = self.tokenizer(
53 | tgt_list,
54 | truncation=True,
55 | padding=True,
56 | return_tensors='pt',
57 | max_length=self.tokenizer.model_max_length
58 | )
59 | tgt_len = [len(self.tokenizer(sent.split(self.tokenizer.sep_token)[-1]).input_ids) for sent
60 | in tgt_list]
61 |                         if doc:  # offset past the context tokens so only the current sentence is scored
62 | start_toks = [len(self.tokenizer(sent).input_ids) - tgt_len[i] for i, sent in
63 | enumerate(tgt_list)]
64 | else:
65 | start_toks = [0] * len(tgt_list)
66 |
67 | src_tokens = encoded_src['input_ids'].to(self.device)
68 | src_mask = encoded_src['attention_mask'].to(self.device)
69 |
70 | tgt_tokens = encoded_tgt['input_ids'].to(self.device)
71 |
72 | output = self.model(
73 | input_ids=src_tokens,
74 | attention_mask=src_mask,
75 | labels=tgt_tokens
76 | )
77 | logits = output.logits.view(-1, self.model.config.vocab_size)
78 | loss = self.loss_fct(self.lsm(logits), tgt_tokens.view(-1))
79 | loss = loss.view(tgt_tokens.shape[0], -1)
80 |                         ppl = []  # per-segment mean token negative log-likelihood
81 | for i, s in enumerate(loss):
82 | ppl.append(s[start_toks[i]:start_toks[i] + tgt_len[i] - 1].sum() / (tgt_len[i] - 1))
83 | curr_score_list = [-x.item() for x in ppl]
84 | sent_scores[sent_idx] += curr_score_list
85 |
86 | segm_scores = np.mean(sent_scores, axis=0)
87 | sys_score = np.mean(segm_scores) if not segment_scores else segm_scores
88 |
89 | return sys_score
90 |
--------------------------------------------------------------------------------
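A minimal sketch of how `MBARTPrism` above might be combined with `add_context` for document-level scoring (the language pair, texts, and batch size are illustrative; the checkpoint and tokenizer follow the class defaults):

```python
from add_context import add_context
from prism import MBARTPrism

# candidates and references are both in the target language
model = MBARTPrism(src_lang="de", tgt_lang="de")  # device is chosen automatically

cands = ["Das ist ein Test.", "Es funktioniert."]
refs = ["Dies ist ein Test.", "Es funktioniert gut."]
doc_ids = ["doc1", "doc1"]

# prepend previous *reference* sentences as context to both sides
sep = model.tokenizer.sep_token
cands = add_context(orig_txt=cands, context=refs, doc_ids=doc_ids, sep_token=sep)
refs = add_context(orig_txt=refs, context=refs, doc_ids=doc_ids, sep_token=sep)

# doc=True excludes the context tokens from each segment's score
seg_scores = model.score(cands, refs, batch_size=2, doc=True)
```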
/README.md:
--------------------------------------------------------------------------------
1 | # Embarrassingly Easy Document-Level MT Metrics
2 |
3 | ## Overview
4 |
5 | In this work we extend state-of-the-art Machine Translation metrics, namely [Prism](https://github.com/thompsonb/prism), [COMET](https://github.com/Unbabel/COMET), [COMET-QE](https://github.com/Unbabel/COMET) and [BERTScore](https://github.com/Tiiiger/bert_score), to the document level. Our approach is _embarrassingly simple_: instead of encoding only the hypothesis and reference, we also encode the previous reference sentences as context. We still compute the metric score at the sentence level, but we also attend to previous context.
6 |
7 | 
8 |
9 |
10 | The extended metrics outperform their sentence-level counterparts in about 85% of the tested conditions ([WMT 2021 Metrics Shared Task](https://wmt-metrics-task.github.io/)) and dramatically improve the ability of the corresponding model to handle discourse phenomena.
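
Concretely, with a window of two previous sentences, the current hypothesis and reference are each prefixed with the preceding *reference* sentences before being fed to the metric (a schematic sketch; the separator token depends on the model's tokenizer):

```python
sep = "</s>"  # model-specific separator token (illustrative)
refs = ["First reference sentence.", "Second reference sentence.", "Third reference sentence."]
hyp = "Third sentence as translated by a system."

augmented_hyp = f"{refs[0]} {sep} {refs[1]} {sep} {hyp}"
augmented_ref = f"{refs[0]} {sep} {refs[1]} {sep} {refs[2]}"
```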
11 |
12 | ## Usage
13 |
14 | The current repository contains code that extends the original MT metrics to document level by providing the option to encode additional context. The code is presented as an extension of the corresponding original codebase. For information on how to use each metric see the corresponding README:
15 | * [COMET/COMET-QE](COMET/README.md)
16 | * [BERTScore](bert_score/README.md)
17 | * [Prism](Prism/README.md)
18 |
19 | It is recommended to create a dedicated environment for this project:
20 | ```bash
21 | conda create -n doc-metrics-env python=3.9
22 | conda activate doc-metrics-env
23 | ```
24 |
25 | ## Reproducibility
26 |
27 | To reproduce the paper's results on the correlation of document- and sentence-level metrics with human annotations on the test sets from the [WMT Metrics Shared Task](https://wmt-metrics-task.github.io/), first install the required packages for the [BERTScore](/bert_score) and [COMET](/COMET) models. Next, install the [MT Metrics Eval](https://github.com/google-research/mt-metrics-eval) toolkit
28 | and download its database.
29 | ```bash
30 | git clone https://github.com/google-research/mt-metrics-eval.git
31 | cd mt-metrics-eval
32 | pip install .
33 | alias mtme='python3 -m mt_metrics_eval.mtme'
34 | mtme --download # Puts ~1G of data into $HOME/.mt-metrics-eval.
35 | ```
36 | Then use the `score_doc-metrics.py` script to obtain the scores for the model, domain and language pair of your choice from the WMT21 test sets.
37 | For example, to obtain system-level scores of Doc-COMET for the en-de language pair in the news domain, run:
38 | ```bash
39 | python score_doc-metrics.py --campaign wmt21.news --model comet --lp en-de --level sys --doc
40 | ```
41 | ## Acknowledgments
42 |
43 | We would like to thank the community for releasing their code! This repository contains code from [COMET](https://github.com/Unbabel/COMET), [BERTScore](https://github.com/Tiiiger/bert_score) and [Prism](https://github.com/thompsonb/prism) repositories.
44 |
45 |
46 | ## Paper
47 |
48 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf):
49 |
50 | ```
51 | @inproceedings{easy_doc_mt,
52 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric},
53 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello},
54 | booktitle = "Proceedings of the Seventh Conference on Machine Translation",
55 | month = dec,
56 | year = "2022",
57 | address = "Abu Dhabi, United Arab Emirates",
58 | publisher = "Association for Computational Linguistics",
59 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf",
60 | }
61 | ```
62 |
63 | ## Security
64 |
65 | See [CONTRIBUTING](CONTRIBUTING.md#security-issue-notifications) for more information.
66 |
67 | ## License
68 |
69 | This project is licensed under the Apache-2.0 License.
70 |
71 |
--------------------------------------------------------------------------------
/bert_score/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
106 | # vscode
107 | .vscode
--------------------------------------------------------------------------------
/bert_score/.travis.yml:
--------------------------------------------------------------------------------
1 | env:
2 |   TOKENIZERS_PARALLELISM=false # parallelized fast tokenizers don't fit into the Travis CI VM
3 | language: python
4 | python:
5 | - '3.6'
6 | - '3.7'
7 | install:
8 | pip install .
9 | script: travis_wait 30 python -m unittest discover
10 |
--------------------------------------------------------------------------------
/bert_score/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Tianyi Zhang, Varsha Kishore, Felix Wu, Kilian Q. Weinberger, and Yoav Artzi.
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/bert_score/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include bert_score/rescale_baseline *
2 |
--------------------------------------------------------------------------------
/bert_score/README.md:
--------------------------------------------------------------------------------
1 | # Doc-BERTScore
2 |
3 | This README describes how to use **Doc-BERTScore**, an extension of the BERTScore metric that can be used for document-level evaluation.
4 |
5 | ## Installation
6 |
7 | This codebase is built upon the original [BERTScore code](https://github.com/Tiiiger/bert_score). For a detailed presentation of the BERTScore metric, including usage examples and instructions, see the original documentation.
8 |
9 | To run Doc-BERTScore you will need to install it locally:
10 | ```bash
11 | git clone https://github.com/amazon-science/doc-mt-metrics.git
12 | cd doc-mt-metrics/bert_score
13 | pip install .
14 | ```
15 |
16 | ### Get some files to score
17 | ```bash
18 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > ref.de
19 | sacrebleu -t wmt21 -l en-de --echo ref | head -n 20 > hyp.de # put your system output here
20 | ```
21 | To evaluate at the document level we need to know where the document boundaries are in the test set, so that we only use valid context. This is passed in as a file where each line contains a document ID.
22 |
23 | For WMT test sets this can be obtained via [sacreBLEU](https://github.com/mjpost/sacrebleu):
24 | ```bash
25 | sacrebleu -t wmt21 -l en-de --echo docid | head -n 20 > docids.ende
26 | ```
27 |
28 | ### Command Line usage:
29 |
30 | To score using document-level BERTScore, simply add the `--doc` flag:
31 | ```bash
32 | bert-score -r ref.de -c hyp.de --lang de --doc docids.ende
33 | ```
34 |
35 | In the paper we use `roberta-large` for X->En pairs and `bert-base-multilingual-cased` for En->X pairs (the defaults at the time), but you can select another model with the `-m MODEL_TYPE` flag. See the [spreadsheet](https://docs.google.com/spreadsheets/d/1RKOVpselB98Nnh_EOC4A2BYn8_201tmPODpNWu4w7xI/edit?usp=sharing) provided by the authors of BERTScore for a full list of supported models.
36 |
37 | ### Python usage (Object-oriented API):
38 |
39 | The [BERTScore](https://github.com/Tiiiger/bert_score) framework provides two APIs for using the BERTScore metric from Python: an object-oriented one that caches the model and is recommended for multiple evaluations, and a functional one for single evaluations. For more details see the [demo](https://github.com/Tiiiger/bert_score/blob/master/example/Demo.ipynb) provided by the authors.
40 |
41 | To use Doc-BERTScore, simply add `doc=True` when calling the `score` function:
42 |
43 | ```python
44 | from bert_score import BERTScorer
45 | from add_context import add_context
46 |
47 | with open("hyp.de") as f:
48 | cands = [line.strip() for line in f]
49 |
50 | with open("ref.de") as f:
51 | refs = [line.strip() for line in f]
52 |
53 | with open("docids.ende") as f:
54 | doc_ids = [line.strip() for line in f]
55 |
56 | scorer = BERTScorer(lang="de")
57 |
58 | # add contexts to reference and hypothesis texts
59 | cands = add_context(orig_txt=cands, context=refs, doc_ids=doc_ids, sep_token=scorer._tokenizer.sep_token)
60 | refs = add_context(orig_txt=refs, context=refs, doc_ids=doc_ids, sep_token=scorer._tokenizer.sep_token)
61 |
62 | # set doc=True to evaluate at the document level
63 | P, R, F1 = scorer.score(cands, refs, doc=True)
64 | ```
65 | ### Python usage (Function API):
66 |
67 | To use Doc-BERTScore, simply add `doc=True` when calling the `score` function:
68 |
69 | ```python
70 | from bert_score import score
71 | from add_context import add_context
72 |
73 | with open("hyp.de") as f:
74 | cands = [line.strip() for line in f]
75 |
76 | with open("ref.de") as f:
77 | refs = [line.strip() for line in f]
78 |
79 | with open("docids.ende") as f:
80 | doc_ids = [line.strip() for line in f]
81 |
82 | # add contexts to reference and hypothesis texts
83 | cands = add_context(orig_txt=cands, context=refs, doc_ids=doc_ids, sep_token="[SEP]")
84 | refs = add_context(orig_txt=refs, context=refs, doc_ids=doc_ids, sep_token="[SEP]")
85 |
86 | # set doc=True to evaluate at the document level
87 | P, R, F1 = score(cands, refs, lang="de", verbose=True, doc=True)
88 | ```
89 |
90 | To use another model, set `model_type=MODEL_TYPE` when calling the `score` function, as sketched below.
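
For instance (model name illustrative), reusing `cands` and `refs` from the example above:

```python
P, R, F1 = score(cands, refs, model_type="xlm-roberta-large", doc=True)
```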
91 |
92 | ## Reproduce
93 | To reproduce the Doc-BERTScore results from the paper run the [score_doc-metrics.py](/score_doc-metrics.py) script with the flags `--model bertscore` and `--doc`.
94 |
95 | ## Paper
96 |
97 | If you use the code in your work, please cite [Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric](https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf):
98 |
99 | ```
100 | @inproceedings{easy_doc_mt,
101 | title = {Embarrassingly Easy Document-Level MT Metrics: How to Convert Any Pretrained Metric Into a Document-Level Metric},
102 | author = {Vernikos, Giorgos and Thompson, Brian and Mathur, Prashant and Federico, Marcello},
103 | booktitle = "Proceedings of the Seventh Conference on Machine Translation",
104 | month = dec,
105 | year = "2022",
106 | address = "Abu Dhabi, United Arab Emirates",
107 | publisher = "Association for Computational Linguistics",
108 | url = "https://statmt.org/wmt22/pdf/2022.wmt-1.6.pdf",
109 | }
110 | ```
111 |
--------------------------------------------------------------------------------
/bert_score/add_context.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from typing import List
3 |
4 |
5 | def add_context(orig_txt: List[str], context: List[str], doc_ids: List[str], sep_token: str = "",
6 | ws: int = 2) -> List[str]:
7 | """Function that adds the previous sentences as context to the current sentence, respecting document boundaries
8 | :param orig_txt: the original text
9 | :param context: the text from which the context will be taken (same as orig_txt for source/reference)
10 |     :param doc_ids: the ID of the document each segment belongs to
11 | :param sep_token: the separator token of the tokenizer for the specific model
12 |     :param ws: the window size, i.e. the maximum number of previous sentences used as context
13 | :return: the original text augmented with context
14 | """
15 | if not (len(orig_txt) == len(context) == len(doc_ids)):
16 | raise Exception(f'Lengths should match: len(orig_txt)={len(orig_txt)}, len(context)={len(context)}, len(doc_ids)={len(doc_ids)}')
17 | i, k = 0, 0
18 | augm_txt = []
19 | doc_id = doc_ids[0]
20 | while i < len(orig_txt):
21 | if doc_ids[i] == doc_id:
22 | context_window = context[i - min(k, ws):i]
23 | augm_txt.append(" {} ".format(sep_token).join(context_window + [orig_txt[i]]))
24 | i += 1
25 | else:
26 | doc_id = doc_ids[i]
27 | k = -1
28 | k += 1
29 | return augm_txt
30 |
31 |
32 |
--------------------------------------------------------------------------------
/bert_score/bert_score.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/bert_score.png
--------------------------------------------------------------------------------
/bert_score/bert_score/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.3.11"
2 | from .score import *
3 | from .scorer import *
4 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/cs/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.28803304,0.28811806,0.28382972
3 | 1,0.36045152,0.3605044,0.35791346
4 | 2,0.35763955,0.3577387,0.35552806
5 | 3,0.4382742,0.43832803,0.4371357
6 | 4,0.49264902,0.4926875,0.49187797
7 | 5,0.5753039,0.5753327,0.57483304
8 | 6,0.63127446,0.6313224,0.6309864
9 | 7,0.5324934,0.532565,0.53202814
10 | 8,0.5102161,0.5103038,0.5096529
11 | 9,0.6044539,0.6045382,0.604006
12 | 10,0.6814313,0.68149376,0.6810876
13 | 11,0.7187933,0.7188438,0.71841186
14 | 12,0.386078,0.38613266,0.38548917
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/cs/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.24679352,0.24680473,0.24270211
3 | 1,0.29235435,0.29231834,0.28975013
4 | 2,0.3138872,0.31386852,0.31213808
5 | 3,0.3285111,0.3284912,0.32616478
6 | 4,0.34355187,0.34352767,0.3409594
7 | 5,0.40920743,0.4091819,0.40708998
8 | 6,0.5143928,0.5143628,0.51312447
9 | 7,0.5684746,0.56843746,0.5675548
10 | 8,0.55277854,0.55274475,0.55174726
11 | 9,0.4946325,0.49455652,0.49314302
12 | 10,0.425077,0.42500603,0.42305094
13 | 11,0.37143245,0.37136525,0.3687799
14 | 12,0.38431773,0.38426274,0.38162753
15 | 13,0.40205154,0.40199956,0.3993145
16 | 14,0.41208863,0.412054,0.40980735
17 | 15,0.4243431,0.42427495,0.4220649
18 | 16,0.32602695,0.3260445,0.32438898
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/cs/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.28832704,0.28834337,0.28409466
3 | 1,0.42489076,0.42484972,0.42346135
4 | 2,0.6489359,0.64890593,0.6484903
5 | 3,0.7212477,0.7212302,0.7210182
6 | 4,0.70944715,0.7094549,0.70922697
7 | 5,0.7286318,0.72864425,0.7284186
8 | 6,0.71929383,0.71930563,0.71912307
9 | 7,0.75613487,0.756147,0.7559896
10 | 8,0.7593519,0.759376,0.75920963
11 | 9,0.801281,0.80129445,0.8010951
12 | 10,0.8243164,0.82432646,0.8241175
13 | 11,0.86058,0.86058563,0.8604526
14 | 12,0.97968304,0.9796832,0.9796791
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/cs/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.36036962,0.3603732,0.35725367
3 | 1,0.6612272,0.661179,0.66074145
4 | 2,0.722742,0.72273415,0.72256047
5 | 3,0.73125947,0.73123205,0.7310358
6 | 4,0.7825561,0.7825642,0.78245354
7 | 5,0.78133506,0.7813208,0.7811937
8 | 6,0.8079803,0.8079664,0.8078874
9 | 7,0.8139315,0.8139195,0.8138673
10 | 8,0.82575524,0.82575536,0.8256901
11 | 9,0.8267652,0.8267674,0.8267081
12 | 10,0.826633,0.826636,0.82654697
13 | 11,0.8310137,0.8310095,0.83087397
14 | 12,0.8320955,0.83211106,0.83181846
15 | 13,0.82811135,0.8281364,0.827703
16 | 14,0.8271892,0.8272189,0.8265785
17 | 15,0.8306057,0.8306258,0.82997155
18 | 16,0.81801736,0.81803435,0.8175852
19 | 17,0.8253589,0.825372,0.8250096
20 | 18,0.82938665,0.82940817,0.8290164
21 | 19,0.82824516,0.8282779,0.827922
22 | 20,0.8445639,0.84459394,0.84429437
23 | 21,0.86360985,0.8636378,0.86333483
24 | 22,0.8661244,0.8661579,0.86584014
25 | 23,0.8638866,0.86392677,0.8635829
26 | 24,0.97858095,0.9785705,0.9785698
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/de/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.29239914,0.29233938,0.28799337
3 | 1,0.37400073,0.37395933,0.37138724
4 | 2,0.36879358,0.36874846,0.3663888
5 | 3,0.4502482,0.4501956,0.44887444
6 | 4,0.4982386,0.49817833,0.49722672
7 | 5,0.5760319,0.5759751,0.5754043
8 | 6,0.62940514,0.62935334,0.6289917
9 | 7,0.5357095,0.53565013,0.53505087
10 | 8,0.5146575,0.51462156,0.5138855
11 | 9,0.61532813,0.61528224,0.6147353
12 | 10,0.68632543,0.6862504,0.6858456
13 | 11,0.7214881,0.7214059,0.72098553
14 | 12,0.36572546,0.36572027,0.36501065
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/de/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.25753325,0.25744554,0.25318053
3 | 1,0.2981514,0.2980718,0.2952621
4 | 2,0.3208413,0.32078207,0.3187413
5 | 3,0.33565432,0.33562624,0.33315146
6 | 4,0.34684345,0.34679237,0.3443796
7 | 5,0.4133209,0.41324788,0.41142154
8 | 6,0.514071,0.51400465,0.51292115
9 | 7,0.5642201,0.56416416,0.56339765
10 | 8,0.54623514,0.5461879,0.54531705
11 | 9,0.49143773,0.4913597,0.4903938
12 | 10,0.42275012,0.42266262,0.42136824
13 | 11,0.36494458,0.36484274,0.36310795
14 | 12,0.37404448,0.37393928,0.37217715
15 | 13,0.38868552,0.3885813,0.38668826
16 | 14,0.39440155,0.39433125,0.39241815
17 | 15,0.4055417,0.40547967,0.4035052
18 | 16,0.30379978,0.30370486,0.30213118
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/de/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.30777588,0.30777684,0.3031559
3 | 1,0.44505233,0.44509125,0.4434747
4 | 2,0.66170436,0.66174895,0.6612669
5 | 3,0.73550326,0.7355261,0.735256
6 | 4,0.7208496,0.72085893,0.720586
7 | 5,0.73704386,0.73705214,0.7367808
8 | 6,0.73208153,0.7320707,0.7318679
9 | 7,0.7680251,0.76800215,0.76783967
10 | 8,0.77268696,0.77266395,0.7724989
11 | 9,0.8099519,0.80989397,0.809723
12 | 10,0.8310105,0.83095115,0.83080244
13 | 11,0.86770487,0.86765665,0.86756754
14 | 12,0.9819623,0.9819598,0.9819579
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/de/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.37937975,0.379358,0.37601414
3 | 1,0.6650208,0.66500705,0.6645409
4 | 2,0.72824335,0.7282381,0.72805566
5 | 3,0.74166065,0.7416417,0.741436
6 | 4,0.7924967,0.7925062,0.7923915
7 | 5,0.7885143,0.7884954,0.7883624
8 | 6,0.8117979,0.8117669,0.81168765
9 | 7,0.8173677,0.8173395,0.81728506
10 | 8,0.82804793,0.828012,0.82794595
11 | 9,0.83066076,0.8306335,0.83057094
12 | 10,0.82999426,0.8299607,0.82988906
13 | 11,0.83342683,0.83340013,0.8332831
14 | 12,0.83806795,0.83803594,0.83778083
15 | 13,0.83596325,0.83591455,0.83558387
16 | 14,0.8378458,0.8377797,0.83741814
17 | 15,0.8420356,0.84196484,0.84161186
18 | 16,0.83186066,0.8318187,0.8314605
19 | 17,0.83927697,0.83923465,0.83889884
20 | 18,0.84405965,0.84401745,0.8436563
21 | 19,0.8409746,0.8409399,0.84059715
22 | 20,0.8542512,0.85422283,0.8539368
23 | 21,0.8734287,0.8733914,0.87314016
24 | 22,0.8774618,0.87741566,0.87717056
25 | 23,0.87821764,0.8781659,0.8779116
26 | 24,0.9817083,0.98170334,0.9817008
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en-sci/allenai/scibert_scivocab_uncased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3247314,0.32477322,0.32055983
3 | 1,0.34701017,0.34706187,0.344079
4 | 2,0.41985375,0.41988486,0.4179418
5 | 3,0.4668236,0.46684003,0.4656058
6 | 4,0.45860615,0.4586492,0.4573681
7 | 5,0.41228917,0.4123522,0.41066456
8 | 6,0.4395095,0.43956795,0.43794444
9 | 7,0.48392966,0.4839865,0.48246792
10 | 8,0.5335945,0.5336341,0.5322364
11 | 9,0.60744065,0.6074917,0.60612226
12 | 10,0.66027635,0.66033924,0.65897125
13 | 11,0.6890247,0.6891011,0.6878515
14 | 12,0.54997945,0.55007255,0.54844016
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-base-v1.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.42279568,0.42285842,0.4198645
3 | 1,0.38239375,0.3824535,0.3795375
4 | 2,0.35127786,0.35131463,0.34854048
5 | 3,0.3402314,0.34027407,0.33761653
6 | 4,0.34001094,0.3400646,0.33745667
7 | 5,0.34310105,0.34314916,0.34054983
8 | 6,0.3478834,0.34792796,0.34530792
9 | 7,0.3523316,0.35237584,0.34973368
10 | 8,0.35546654,0.35550496,0.35283387
11 | 9,0.35682797,0.35686156,0.3541417
12 | 10,0.3572713,0.35730729,0.35451323
13 | 11,0.35916516,0.35920846,0.35632935
14 | 12,0.3620535,0.3621047,0.35911387
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-base-v2.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.43284354,0.4329465,0.42670736
3 | 1,0.4085349,0.40857056,0.4041539
4 | 2,0.42302486,0.42304876,0.41986418
5 | 3,0.43835327,0.43837532,0.43578437
6 | 4,0.46398157,0.4640153,0.46179092
7 | 5,0.487097,0.48714137,0.48507443
8 | 6,0.50701046,0.5070602,0.50516284
9 | 7,0.5251579,0.5252073,0.52346826
10 | 8,0.5432063,0.5432638,0.5416856
11 | 9,0.56169736,0.56174135,0.56031275
12 | 10,0.58207834,0.58211654,0.58080167
13 | 11,0.5087994,0.5088567,0.50630754
14 | 12,0.4822224,0.48224902,0.4795803
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-large-v1.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.48447838,0.48450485,0.4821886
3 | 1,0.5124409,0.51243365,0.5109167
4 | 2,0.49396634,0.49394318,0.49285302
5 | 3,0.48355308,0.48351732,0.48258644
6 | 4,0.48206407,0.48202685,0.4811013
7 | 5,0.48171225,0.48167655,0.48073986
8 | 6,0.48402956,0.48400134,0.48304388
9 | 7,0.48760605,0.48758495,0.4866279
10 | 8,0.49034056,0.4903293,0.4893756
11 | 9,0.4919946,0.49199188,0.4910255
12 | 10,0.49351045,0.4935107,0.49251547
13 | 11,0.4953505,0.49535286,0.4943231
14 | 12,0.49792922,0.4979353,0.49686712
15 | 13,0.50119936,0.5012099,0.5001017
16 | 14,0.50464475,0.50465906,0.5035164
17 | 15,0.5072171,0.50723296,0.5060587
18 | 16,0.50804037,0.50805837,0.506836
19 | 17,0.50674427,0.5067624,0.5054734
20 | 18,0.5028615,0.5028785,0.50150096
21 | 19,0.4957624,0.49577576,0.49427336
22 | 20,0.48470628,0.48471764,0.48304176
23 | 21,0.46942177,0.4694329,0.46755382
24 | 22,0.45182654,0.45184082,0.44979697
25 | 23,0.4372368,0.43725976,0.43516964
26 | 24,0.43032366,0.4303518,0.42831102
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-large-v2.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.43137488,0.4314412,0.4271023
3 | 1,0.47189355,0.47192886,0.46977237
4 | 2,0.4965904,0.49659666,0.49521467
5 | 3,0.4952368,0.4952206,0.49390256
6 | 4,0.49991024,0.4998804,0.49865857
7 | 5,0.5061125,0.5060827,0.50490576
8 | 6,0.52520007,0.5251885,0.5241151
9 | 7,0.5463337,0.54633546,0.54536676
10 | 8,0.56268036,0.56267744,0.5618048
11 | 9,0.5788636,0.5788671,0.5780607
12 | 10,0.59798187,0.5979915,0.5972454
13 | 11,0.6093569,0.6093737,0.60867995
14 | 12,0.61832786,0.6183305,0.6176837
15 | 13,0.6298888,0.62988657,0.6292773
16 | 14,0.63760334,0.6376027,0.6370052
17 | 15,0.6402277,0.6402217,0.63963217
18 | 16,0.6457506,0.6457368,0.64517874
19 | 17,0.6488497,0.6488231,0.6482803
20 | 18,0.6473536,0.6473276,0.6467711
21 | 19,0.65181977,0.6517948,0.6512418
22 | 20,0.65941834,0.6593918,0.65884435
23 | 21,0.65883756,0.65882397,0.65822756
24 | 22,0.6599824,0.6599794,0.6593097
25 | 23,0.6140344,0.6140205,0.6131047
26 | 24,0.54314095,0.54311645,0.5419062
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-xlarge-v1.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.37603918,0.37612942,0.37049496
3 | 1,0.31145602,0.3114958,0.3073803
4 | 2,0.25227228,0.2522994,0.24795091
5 | 3,0.22015819,0.22017719,0.21600199
6 | 4,0.21572605,0.21576598,0.21187688
7 | 5,0.21390381,0.21393314,0.21024637
8 | 6,0.21366087,0.21368802,0.21022928
9 | 7,0.2149553,0.21497151,0.2116843
10 | 8,0.21902423,0.21904334,0.215865
11 | 9,0.22598784,0.22601976,0.22294162
12 | 10,0.23651579,0.23656204,0.2335378
13 | 11,0.2508,0.25083283,0.24782418
14 | 12,0.26735264,0.26740175,0.2642045
15 | 13,0.2851571,0.2852036,0.28140694
16 | 14,0.30159834,0.3016559,0.2969648
17 | 15,0.31582344,0.31589058,0.31032172
18 | 16,0.33028397,0.3303347,0.32389277
19 | 17,0.34479943,0.34483773,0.33757344
20 | 18,0.3576801,0.35770583,0.34980485
21 | 19,0.36997133,0.36996147,0.3615338
22 | 20,0.3813416,0.38132015,0.37257645
23 | 21,0.3904368,0.39041746,0.38146585
24 | 22,0.4026223,0.40261322,0.39356884
25 | 23,0.41755676,0.41755086,0.4090774
26 | 24,0.40913486,0.40914643,0.40243107
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-xlarge-v2.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.379094,0.37919718,0.37330297
3 | 1,0.27352002,0.27357075,0.26852632
4 | 2,0.24191533,0.24194317,0.23669504
5 | 3,0.2238661,0.22388461,0.21928357
6 | 4,0.22812894,0.22815062,0.22410771
7 | 5,0.22398795,0.22402358,0.22023973
8 | 6,0.22606015,0.22609216,0.22241953
9 | 7,0.22955626,0.22957715,0.2261971
10 | 8,0.23346025,0.23349406,0.230283
11 | 9,0.23933677,0.23937275,0.23639005
12 | 10,0.24947925,0.2495169,0.24674372
13 | 11,0.25879192,0.25879982,0.25623834
14 | 12,0.26840612,0.2684224,0.2659429
15 | 13,0.28223696,0.2822432,0.27990422
16 | 14,0.3007411,0.30081397,0.298456
17 | 15,0.32065493,0.32073346,0.31820792
18 | 16,0.3489667,0.34909493,0.34612358
19 | 17,0.37499505,0.37513632,0.37153322
20 | 18,0.39365283,0.3937659,0.3894278
21 | 19,0.3985198,0.39858896,0.39375183
22 | 20,0.40377426,0.4038127,0.3987301
23 | 21,0.4162669,0.41631454,0.41127917
24 | 22,0.4385093,0.43853307,0.43359485
25 | 23,0.50211877,0.5021498,0.49820283
26 | 24,0.6450441,0.6450727,0.64176905
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-xxlarge-v1.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.44518736,0.44525033,0.44190475
3 | 1,0.26892486,0.26893654,0.26619813
4 | 2,0.25225964,0.25227055,0.2495048
5 | 3,0.23626596,0.23626427,0.23414151
6 | 4,0.24108262,0.24108647,0.23914734
7 | 5,0.2402725,0.24029303,0.23852193
8 | 6,0.24204335,0.24206877,0.24038398
9 | 7,0.24432875,0.24436904,0.2427339
10 | 8,0.24470611,0.24472676,0.24312295
11 | 9,0.24761276,0.24763304,0.2458257
12 | 10,0.26654655,0.26657295,0.26450548
13 | 11,0.30993807,0.309992,0.3073111
14 | 12,0.46560258,0.46563277,0.463768
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/albert-xxlarge-v2.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.4414845,0.4415628,0.4378333
3 | 1,0.26729813,0.26729846,0.26443842
4 | 2,0.25006709,0.25006858,0.2470538
5 | 3,0.22912578,0.22914563,0.22677879
6 | 4,0.23676835,0.23678702,0.23474906
7 | 5,0.23712093,0.23712862,0.23520498
8 | 6,0.2357785,0.23579709,0.2339876
9 | 7,0.2375271,0.2375658,0.2357691
10 | 8,0.23694733,0.2369875,0.23519956
11 | 9,0.24043696,0.24048997,0.23847668
12 | 10,0.25991938,0.25997588,0.257621
13 | 11,0.3076668,0.30775174,0.30460533
14 | 12,0.5213576,0.52133,0.5192018
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/bert-base-cased-finetuned-mrpc.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.32524315,0.32527947,0.32047534
3 | 1,0.3697738,0.3697855,0.36682808
4 | 2,0.3912412,0.39124438,0.38884974
5 | 3,0.38678017,0.3867508,0.3849363
6 | 4,0.4306143,0.43059555,0.4291982
7 | 5,0.47680253,0.47676748,0.4757307
8 | 6,0.4937383,0.4937078,0.49275663
9 | 7,0.47395828,0.47392154,0.47275484
10 | 8,0.48822877,0.48818707,0.48712534
11 | 9,0.55345184,0.55342007,0.5525519
12 | 10,0.6535154,0.6534775,0.6529064
13 | 11,0.76415604,0.7641147,0.76378924
14 | 12,0.72067815,0.7206308,0.72023565
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.31651747,0.3166142,0.31180394
3 | 1,0.38737702,0.38744056,0.38455048
4 | 2,0.37912813,0.37916443,0.37648088
5 | 3,0.46451283,0.46451145,0.46312103
6 | 4,0.5066057,0.50659287,0.5054953
7 | 5,0.5804824,0.5804496,0.5797646
8 | 6,0.63067275,0.630636,0.63018715
9 | 7,0.54218787,0.5421653,0.5414328
10 | 8,0.5240471,0.5240057,0.5232123
11 | 9,0.6320527,0.6320019,0.63146895
12 | 10,0.69633687,0.6962761,0.6958725
13 | 11,0.7193143,0.7192363,0.7188216
14 | 12,0.3473233,0.34732684,0.34655094
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/bert-base-uncased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3231512,0.32322776,0.31853873
3 | 1,0.32517454,0.32522815,0.32197207
4 | 2,0.3708038,0.37080705,0.36834884
5 | 3,0.36287847,0.36286885,0.36059204
6 | 4,0.3786389,0.37860426,0.3767926
7 | 5,0.4018232,0.401791,0.40032896
8 | 6,0.38439456,0.38434005,0.38282546
9 | 7,0.37114623,0.3710986,0.36949417
10 | 8,0.37231025,0.37226102,0.37049443
11 | 9,0.35375935,0.3537393,0.35219112
12 | 10,0.38161838,0.3816211,0.37991408
13 | 11,0.4421448,0.4421776,0.44040316
14 | 12,0.40192786,0.40191513,0.40038353
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/bert-large-uncased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.33945993,0.33952734,0.3353803
3 | 1,0.46529758,0.46534532,0.4629573
4 | 2,0.5190359,0.51904607,0.5170987
5 | 3,0.55551875,0.5555247,0.5540426
6 | 4,0.47806495,0.4780755,0.47663376
7 | 5,0.39333034,0.3933407,0.391598
8 | 6,0.30678865,0.30683848,0.30446944
9 | 7,0.40164435,0.40167126,0.39997557
10 | 8,0.44429466,0.4443099,0.44277325
11 | 9,0.5114804,0.5114661,0.5102474
12 | 10,0.53322667,0.5332073,0.5323144
13 | 11,0.56793964,0.56791747,0.56725395
14 | 12,0.56360143,0.5635814,0.5629889
15 | 13,0.5358492,0.5358346,0.53522795
16 | 14,0.42079058,0.42078197,0.41975206
17 | 15,0.3509417,0.3509411,0.34957188
18 | 16,0.4534342,0.45341223,0.45231807
19 | 17,0.46370843,0.46370083,0.46265444
20 | 18,0.4278576,0.42786714,0.42646673
21 | 19,0.38974905,0.3897353,0.3877319
22 | 20,0.3966205,0.3966191,0.3942883
23 | 21,0.4981153,0.49813268,0.4955151
24 | 22,0.5868029,0.58685154,0.584482
25 | 23,0.7136535,0.7137033,0.7118858
26 | 24,0.5152624,0.5152391,0.5146088
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/distilbert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.27245584,0.27247205,0.26611173
3 | 1,0.45394143,0.453942,0.45178676
4 | 2,0.5374658,0.5374726,0.53619426
5 | 3,0.61241305,0.61244136,0.6116679
6 | 4,0.63282156,0.632836,0.63219804
7 | 5,0.8164157,0.81645757,0.81623197
8 | 6,0.4648941,0.4649093,0.4638737
9 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/distilbert-base-uncased-distilled-squad.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.28725642,0.2872663,0.28207442
3 | 1,0.37234208,0.37233955,0.37046063
4 | 2,0.403689,0.4037149,0.4020736
5 | 3,0.5399291,0.53997463,0.53930676
6 | 4,0.6591859,0.65919137,0.65882134
7 | 5,0.65313077,0.6531313,0.65279835
8 | 6,0.74920315,0.7491901,0.7487158
9 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/distilbert-base-uncased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.2884445,0.2884457,0.28333962
3 | 1,0.39316687,0.3931663,0.39123002
4 | 2,0.42905498,0.4290923,0.42735597
5 | 3,0.5222444,0.52227175,0.52129734
6 | 4,0.6019937,0.6019904,0.6014007
7 | 5,0.6666034,0.66660464,0.66620487
8 | 6,0.51401854,0.51404256,0.5131456
9 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/distilroberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.42608285,0.4272089,0.42462298
3 | 1,0.7367886,0.7370362,0.736573
4 | 2,0.79922664,0.799593,0.7991632
5 | 3,0.8329021,0.8333321,0.83291864
6 | 4,0.8442,0.84462386,0.84425896
7 | 5,0.84732,0.84759504,0.8473319
8 | 6,0.89334005,0.8935088,0.8933471
9 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/microsoft/deberta-base-mnli.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.24991891,0.25001466,0.24200068
3 | 1,0.29392833,0.29395026,0.28912014
4 | 2,0.36113718,0.36123025,0.3575888
5 | 3,0.41445282,0.41459718,0.41148487
6 | 4,0.4386812,0.43877414,0.4361292
7 | 5,0.45521808,0.4552972,0.45306677
8 | 6,0.4797258,0.4797979,0.47779492
9 | 7,0.48204568,0.48210686,0.480253
10 | 8,0.50440174,0.5044583,0.5025705
11 | 9,0.53045946,0.5304829,0.52866036
12 | 10,0.53781724,0.5377958,0.53583634
13 | 11,0.5402823,0.5402229,0.53816986
14 | 12,0.57382584,0.57370174,0.57160807
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/microsoft/deberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.2517391,0.2518335,0.24388544
3 | 1,0.36171424,0.36175922,0.35748047
4 | 2,0.4423475,0.442458,0.44021225
5 | 3,0.50618786,0.5063445,0.5045984
6 | 4,0.5250692,0.525192,0.5236118
7 | 5,0.55415064,0.5542385,0.5528668
8 | 6,0.5684745,0.5685567,0.5672051
9 | 7,0.5721026,0.5721756,0.5708452
10 | 8,0.60626274,0.6063245,0.6049902
11 | 9,0.6282066,0.62825406,0.6269483
12 | 10,0.6643668,0.66438687,0.66297233
13 | 11,0.65951246,0.6595324,0.6584084
14 | 12,0.70749044,0.70750576,0.7064498
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/microsoft/deberta-large-mnli.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.24490805,0.24501415,0.23715581
3 | 1,0.29400384,0.2940948,0.28872925
4 | 2,0.30570883,0.3057956,0.30113816
5 | 3,0.2957167,0.29578057,0.2915654
6 | 4,0.2884288,0.28847086,0.2843156
7 | 5,0.30902475,0.3090854,0.3057
8 | 6,0.3267471,0.32683545,0.32377866
9 | 7,0.32664096,0.32672828,0.3239887
10 | 8,0.33238792,0.3324875,0.32986364
11 | 9,0.35454232,0.3546663,0.35220724
12 | 10,0.37474304,0.37486178,0.3723941
13 | 11,0.38948673,0.38959926,0.38713577
14 | 12,0.40499082,0.4051212,0.4027381
15 | 13,0.40869987,0.40882573,0.40650842
16 | 14,0.41533,0.41543606,0.41318002
17 | 15,0.42891178,0.4289993,0.42687863
18 | 16,0.43574512,0.43581918,0.43376175
19 | 17,0.44409868,0.44415665,0.4421444
20 | 18,0.45358238,0.45362508,0.45173016
21 | 19,0.4614291,0.46146432,0.45968512
22 | 20,0.4612395,0.46127385,0.45946208
23 | 21,0.47897574,0.47901914,0.4772938
24 | 22,0.49526486,0.49531218,0.49363694
25 | 23,0.48103315,0.4810869,0.4794539
26 | 24,0.5131625,0.51319313,0.51193404
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/microsoft/deberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.24543904,0.24554524,0.23771858
3 | 1,0.32400694,0.3240889,0.31910792
4 | 2,0.35317397,0.35325843,0.34890434
5 | 3,0.34494445,0.34502625,0.34082443
6 | 4,0.34670925,0.34677663,0.3425456
7 | 5,0.36661133,0.3667012,0.36314553
8 | 6,0.38046056,0.38056228,0.37710926
9 | 7,0.38267714,0.3827855,0.37945607
10 | 8,0.3922755,0.3924098,0.38914645
11 | 9,0.41027483,0.41045374,0.4072962
12 | 10,0.43634042,0.4365225,0.43331632
13 | 11,0.4587171,0.45889324,0.45575032
14 | 12,0.47399956,0.47417867,0.47109136
15 | 13,0.48888516,0.48905894,0.4862424
16 | 14,0.4966528,0.49680543,0.49413764
17 | 15,0.5117451,0.51189446,0.50938886
18 | 16,0.5341927,0.53433174,0.53205305
19 | 17,0.55080074,0.5509329,0.5488182
20 | 18,0.5715738,0.571711,0.5698007
21 | 19,0.58424556,0.5843769,0.5826535
22 | 20,0.59171396,0.5918352,0.5901539
23 | 21,0.60953987,0.60965025,0.60810995
24 | 22,0.620468,0.6205763,0.6191674
25 | 23,0.57499653,0.575068,0.573669
26 | 24,0.5698042,0.5698687,0.5686779
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/microsoft/deberta-xlarge-mnli.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.2493579,0.24956034,0.24190253
3 | 1,0.3013932,0.30158633,0.2964718
4 | 2,0.317363,0.31756195,0.31315005
5 | 3,0.3117229,0.3118849,0.30764845
6 | 4,0.3074649,0.3076071,0.30345994
7 | 5,0.3140126,0.31414607,0.31065413
8 | 6,0.32410583,0.3242222,0.32100978
9 | 7,0.32173893,0.3218549,0.3187024
10 | 8,0.32544047,0.32556787,0.3224075
11 | 9,0.344368,0.3445152,0.34142512
12 | 10,0.3655007,0.36567506,0.3623955
13 | 11,0.38081372,0.38100296,0.37764993
14 | 12,0.38874978,0.38893828,0.38563213
15 | 13,0.38537422,0.38555342,0.38225004
16 | 14,0.39434314,0.39452493,0.3914539
17 | 15,0.40501443,0.40519157,0.40221062
18 | 16,0.41383415,0.414013,0.41118416
19 | 17,0.43424043,0.4344097,0.4318083
20 | 18,0.4456768,0.44583458,0.4435271
21 | 19,0.4616012,0.46173084,0.45967415
22 | 20,0.46671286,0.46683112,0.4647799
23 | 21,0.49091575,0.49103191,0.4892095
24 | 22,0.5345532,0.53466916,0.53317034
25 | 23,0.52739257,0.5275056,0.52598923
26 | 24,0.4812145,0.48132038,0.47937903
27 | 25,0.47786388,0.47797868,0.4758911
28 | 26,0.4767261,0.476854,0.4747504
29 | 27,0.45120457,0.45133275,0.44898003
30 | 28,0.43487516,0.43499732,0.43227148
31 | 29,0.4418857,0.44200745,0.439456
32 | 30,0.45188263,0.4520089,0.44948938
33 | 31,0.44309646,0.443208,0.44067165
34 | 32,0.44934252,0.44945362,0.44696212
35 | 33,0.47058168,0.470693,0.46848273
36 | 34,0.48300824,0.4831242,0.480923
37 | 35,0.49022266,0.49034286,0.48815507
38 | 36,0.49732342,0.49744752,0.49531126
39 | 37,0.49466616,0.494789,0.49265566
40 | 38,0.4995418,0.4996657,0.49754837
41 | 39,0.5116362,0.5117548,0.50974
42 | 40,0.5169066,0.5170288,0.5150192
43 | 41,0.53604615,0.5361662,0.534255
44 | 42,0.5560917,0.5562141,0.55443686
45 | 43,0.5699871,0.5701181,0.56848437
46 | 44,0.5755175,0.5756376,0.5740404
47 | 45,0.5944691,0.59459156,0.59314805
48 | 46,0.61108196,0.6111957,0.60986704
49 | 47,0.5935245,0.59361994,0.5924131
50 | 48,0.6343621,0.6344516,0.63365686
51 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/microsoft/deberta-xlarge.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.24957183,0.24977477,0.24214219
3 | 1,0.30639872,0.3065908,0.30158743
4 | 2,0.3331396,0.33333635,0.3290473
5 | 3,0.32949522,0.32968733,0.3253696
6 | 4,0.31661382,0.316769,0.31258678
7 | 5,0.32896715,0.32910535,0.32568523
8 | 6,0.33770096,0.33782086,0.3345446
9 | 7,0.3326147,0.33272395,0.32957816
10 | 8,0.3367821,0.33687654,0.33380622
11 | 9,0.3546219,0.3547327,0.35179362
12 | 10,0.38037142,0.38049275,0.37740862
13 | 11,0.40171945,0.40185076,0.39869636
14 | 12,0.4163913,0.41652367,0.4133557
15 | 13,0.43222922,0.43235204,0.42938623
16 | 14,0.4416328,0.44175574,0.43894717
17 | 15,0.45403007,0.45415205,0.45151842
18 | 16,0.47758847,0.47770745,0.47528616
19 | 17,0.49413732,0.49424222,0.49203014
20 | 18,0.5177917,0.5178813,0.51596016
21 | 19,0.54055035,0.54061955,0.53895485
22 | 20,0.5554671,0.55553156,0.553943
23 | 21,0.5871218,0.5871978,0.585844
24 | 22,0.6379372,0.6380021,0.6369301
25 | 23,0.62672323,0.6267863,0.6256873
26 | 24,0.5497838,0.5498381,0.5483379
27 | 25,0.543943,0.5440018,0.54246646
28 | 26,0.55943567,0.55949783,0.5578509
29 | 27,0.5522361,0.5523346,0.55041844
30 | 28,0.5384432,0.53856134,0.53645724
31 | 29,0.541011,0.5411351,0.53916043
32 | 30,0.53560615,0.5357274,0.5337449
33 | 31,0.5211553,0.5212751,0.51924247
34 | 32,0.52553123,0.52564174,0.5235451
35 | 33,0.53930295,0.5394204,0.5372786
36 | 34,0.5591909,0.55931133,0.5570341
37 | 35,0.5712996,0.5714208,0.5691194
38 | 36,0.57959074,0.57972014,0.5774709
39 | 37,0.58818644,0.5883232,0.58616716
40 | 38,0.5925551,0.5926871,0.5905953
41 | 39,0.6026835,0.60282564,0.6008043
42 | 40,0.6189861,0.6191251,0.6172279
43 | 41,0.62964463,0.62977934,0.62799156
44 | 42,0.6451681,0.64530563,0.6436409
45 | 43,0.6539978,0.65413773,0.65264153
46 | 44,0.65711796,0.65726453,0.65580934
47 | 45,0.66835105,0.66850114,0.6671609
48 | 46,0.67004806,0.6701847,0.6689483
49 | 47,0.611536,0.61166185,0.6104823
50 | 48,0.6487418,0.64883584,0.6481099
51 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.4043224,0.40432808,0.40218553
3 | 1,0.6423126,0.6422804,0.6414617
4 | 2,0.768273,0.7682535,0.76791227
5 | 3,0.7803166,0.78030443,0.7800415
6 | 4,0.7839782,0.78397924,0.7836174
7 | 5,0.7959116,0.7959033,0.79557085
8 | 6,0.80936664,0.80936354,0.80908644
9 | 7,0.81720984,0.81721514,0.816965
10 | 8,0.80465585,0.80464727,0.8043641
11 | 9,0.7911581,0.79115206,0.7908595
12 | 10,0.8146725,0.8146619,0.814463
13 | 11,0.8243949,0.8244051,0.82420003
14 | 12,0.8557132,0.85571885,0.8555707
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/roberta-large-mnli.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.36816803,0.36820343,0.3650997
3 | 1,0.6424572,0.64243424,0.6408211
4 | 2,0.62199366,0.6219771,0.62105906
5 | 3,0.65479594,0.65479946,0.6542115
6 | 4,0.66220766,0.66219413,0.66147035
7 | 5,0.6841878,0.6841976,0.6835943
8 | 6,0.6993157,0.6993184,0.698729
9 | 7,0.7363659,0.7363538,0.73597246
10 | 8,0.76699406,0.76697797,0.7666572
11 | 9,0.76385623,0.76387703,0.76359564
12 | 10,0.7751121,0.7751162,0.7748585
13 | 11,0.7607176,0.7607192,0.7604293
14 | 12,0.75846714,0.75850517,0.7582122
15 | 13,0.7660639,0.766093,0.7658386
16 | 14,0.76723933,0.7672636,0.76692307
17 | 15,0.76183504,0.7618548,0.7615043
18 | 16,0.77503896,0.7750635,0.77476084
19 | 17,0.7572284,0.75724494,0.7568846
20 | 18,0.72981,0.72983533,0.7294623
21 | 19,0.6901594,0.69018,0.6896288
22 | 20,0.6456024,0.6456534,0.6447707
23 | 21,0.6733705,0.6734108,0.672755
24 | 22,0.7964235,0.79642963,0.7961781
25 | 23,0.83942956,0.839427,0.8393037
26 | 24,0.87867236,0.8787309,0.8781039
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3712891,0.37132213,0.36826715
3 | 1,0.67176163,0.6717439,0.6703483
4 | 2,0.70031923,0.7003052,0.69969934
5 | 3,0.7080897,0.7081011,0.707698
6 | 4,0.6976306,0.69762677,0.69710517
7 | 5,0.7187199,0.71873325,0.71828526
8 | 6,0.74678195,0.74678224,0.74642223
9 | 7,0.7772428,0.7772184,0.77691925
10 | 8,0.8021733,0.8021747,0.8019093
11 | 9,0.8067641,0.80678225,0.8065291
12 | 10,0.8366976,0.8367098,0.8364913
13 | 11,0.8163513,0.816369,0.8161064
14 | 12,0.8175406,0.8175611,0.81728977
15 | 13,0.82106245,0.8210674,0.82080233
16 | 14,0.81487834,0.8148861,0.8145652
17 | 15,0.8243552,0.8243522,0.8240494
18 | 16,0.8341641,0.8341684,0.833912
19 | 17,0.83150584,0.8314941,0.83122575
20 | 18,0.8314624,0.83146274,0.8311686
21 | 19,0.82761073,0.8276117,0.8273196
22 | 20,0.799873,0.79988,0.79956234
23 | 21,0.8082163,0.80819315,0.8079286
24 | 22,0.83196104,0.83195347,0.83174026
25 | 23,0.8408042,0.8408027,0.8405716
26 | 24,0.96022236,0.96021587,0.960168
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.2929519,0.29297927,0.28788087
3 | 1,0.32307193,0.32305866,0.31955993
4 | 2,0.33333376,0.33329934,0.3307059
5 | 3,0.34018472,0.34019333,0.3369147
6 | 4,0.35193846,0.35196185,0.34877294
7 | 5,0.41633913,0.41635182,0.41389906
8 | 6,0.52230054,0.5223191,0.5208747
9 | 7,0.57117224,0.5711975,0.57016635
10 | 8,0.55626523,0.55628437,0.55513597
11 | 9,0.5035621,0.5035617,0.5023768
12 | 10,0.43660313,0.4366135,0.43496045
13 | 11,0.37350416,0.37354943,0.3712711
14 | 12,0.3694557,0.36947483,0.36708415
15 | 13,0.38296118,0.38296735,0.38057274
16 | 14,0.3801941,0.38019708,0.37771493
17 | 15,0.39073846,0.39073724,0.38804337
18 | 16,0.27941948,0.2793937,0.27774334
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/xlm-mlm-en-2048.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.48034036,0.48027167,0.4755281
3 | 1,0.68549955,0.68547165,0.68418026
4 | 2,0.7502881,0.7502652,0.7497456
5 | 3,0.7662417,0.7662214,0.7659151
6 | 4,0.7910623,0.7910466,0.79085386
7 | 5,0.8090659,0.8090618,0.80895317
8 | 6,0.82148397,0.8214852,0.821408
9 | 7,0.8091143,0.8091184,0.8090199
10 | 8,0.77966934,0.7796406,0.77937865
11 | 9,0.75278246,0.7527972,0.7524639
12 | 10,0.72071564,0.7207407,0.7202978
13 | 11,0.7175687,0.7176211,0.7170889
14 | 12,0.22130837,0.22130068,0.21938775
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.31767526,0.31771243,0.31208947
3 | 1,0.45930108,0.45930612,0.4573549
4 | 2,0.6739723,0.6739605,0.67332643
5 | 3,0.7428563,0.7428622,0.74252146
6 | 4,0.7270618,0.7270706,0.7267292
7 | 5,0.7459538,0.7459533,0.74563044
8 | 6,0.7416182,0.74162334,0.74136156
9 | 7,0.7766629,0.7766664,0.7764565
10 | 8,0.7827196,0.78271383,0.78251594
11 | 9,0.81658614,0.8165717,0.81639785
12 | 10,0.83839214,0.83837646,0.8382293
13 | 11,0.8711623,0.8711581,0.87106025
14 | 12,0.9843661,0.98436636,0.9843645
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.38918123,0.38920417,0.3852401
3 | 1,0.66835684,0.6683084,0.6677018
4 | 2,0.7323929,0.7323684,0.7321559
5 | 3,0.7391762,0.7391537,0.73889536
6 | 4,0.7922834,0.79227173,0.7921484
7 | 5,0.79589903,0.795871,0.7957138
8 | 6,0.8166894,0.816673,0.8165898
9 | 7,0.8223533,0.8223572,0.82228154
10 | 8,0.834576,0.8345772,0.8344947
11 | 9,0.8377803,0.83777326,0.8376894
12 | 10,0.8380223,0.8380033,0.83791
13 | 11,0.8415803,0.84157884,0.8414282
14 | 12,0.84659237,0.8466055,0.84632146
15 | 13,0.8437288,0.84372836,0.84340864
16 | 14,0.846515,0.84650415,0.8461781
17 | 15,0.8514585,0.8514379,0.85112184
18 | 16,0.84461045,0.8446081,0.8442589
19 | 17,0.85291016,0.8529066,0.8525485
20 | 18,0.8582745,0.8582787,0.85787606
21 | 19,0.85327464,0.8532746,0.85287833
22 | 20,0.86624545,0.86624,0.86592185
23 | 21,0.8854349,0.88543147,0.88515806
24 | 22,0.8891757,0.8891605,0.88892245
25 | 23,0.88805044,0.88803035,0.88777393
26 | 24,0.9840399,0.98404247,0.984038
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/xlnet-base-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.29910204,0.29919305,0.29052314
3 | 1,0.29633516,0.29640594,0.2915415
4 | 2,0.28782755,0.28787795,0.28492415
5 | 3,0.29966587,0.2996727,0.29745364
6 | 4,0.32897076,0.32897395,0.3263186
7 | 5,0.34247187,0.3424195,0.34024557
8 | 6,0.61728173,0.61718243,0.6160013
9 | 7,0.6704566,0.6703779,0.66936857
10 | 8,0.8596307,0.8595696,0.859391
11 | 9,0.8611796,0.8611522,0.8610164
12 | 10,0.89382625,0.8938215,0.8937337
13 | 11,0.97762144,0.9776183,0.97761476
14 | 12,0.93146294,0.93134,0.93100053
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/en/xlnet-large-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.41637358,0.41643414,0.41258112
3 | 1,0.32545134,0.32545993,0.3204785
4 | 2,0.29599807,0.29601985,0.29176536
5 | 3,0.21799843,0.2180424,0.21441601
6 | 4,0.2619272,0.261958,0.25913864
7 | 5,0.30362618,0.30360785,0.30147976
8 | 6,0.31371272,0.3136575,0.31170228
9 | 7,0.3085695,0.30850938,0.30676135
10 | 8,0.3251663,0.32509723,0.32402074
11 | 9,0.34611195,0.34610417,0.3449464
12 | 10,0.33172518,0.3316963,0.32996267
13 | 11,0.32673666,0.32671896,0.3252777
14 | 12,0.3015574,0.30154356,0.29979268
15 | 13,0.33127543,0.33126998,0.33017284
16 | 14,0.33191463,0.33192313,0.3307891
17 | 15,0.3753324,0.3753503,0.374231
18 | 16,0.37750244,0.37751338,0.37648135
19 | 17,0.3678608,0.3678761,0.36674905
20 | 18,0.305072,0.3050984,0.3042137
21 | 19,0.42524177,0.4253285,0.42387673
22 | 20,0.59149736,0.59153783,0.5901478
23 | 21,0.6070587,0.607099,0.6057612
24 | 22,0.80884385,0.80882186,0.8085461
25 | 23,0.9555436,0.9555404,0.95551467
26 | 24,0.96873486,0.9687297,0.9685215
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/es/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.32142487,0.32125905,0.31729683
3 | 1,0.39584324,0.395717,0.39326182
4 | 2,0.3895418,0.38945207,0.38716727
5 | 3,0.47731403,0.47727716,0.47604948
6 | 4,0.5232235,0.5231792,0.52232313
7 | 5,0.5989939,0.59892774,0.59843445
8 | 6,0.6496523,0.6496062,0.649302
9 | 7,0.5524209,0.5523591,0.55184853
10 | 8,0.52988493,0.5298184,0.52922106
11 | 9,0.63474494,0.6346978,0.6342529
12 | 10,0.70397323,0.7039352,0.703585
13 | 11,0.7417414,0.74173224,0.74136305
14 | 12,0.39257455,0.39254928,0.39194846
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/es/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.29741356,0.29718205,0.29312935
3 | 1,0.33412832,0.33395684,0.3312904
4 | 2,0.35136887,0.35124466,0.3492788
5 | 3,0.36096326,0.3608026,0.35864976
6 | 4,0.36783966,0.36770988,0.36555293
7 | 5,0.4318944,0.4317502,0.4300937
8 | 6,0.54022354,0.54010266,0.5391772
9 | 7,0.5873484,0.5872481,0.58660454
10 | 8,0.56757474,0.5674764,0.566725
11 | 9,0.50883144,0.5087277,0.5079181
12 | 10,0.43789023,0.43777642,0.4366415
13 | 11,0.37517586,0.37504935,0.3734603
14 | 12,0.37935427,0.37921786,0.37755096
15 | 13,0.39596176,0.39583504,0.39407465
16 | 14,0.40488854,0.4047234,0.40284988
17 | 15,0.41720447,0.41700417,0.41506332
18 | 16,0.321014,0.32089671,0.31943843
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/es/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3246688,0.32442638,0.31979698
3 | 1,0.4669744,0.46682546,0.46536762
4 | 2,0.682952,0.68287796,0.6824639
5 | 3,0.75232756,0.7522827,0.7520721
6 | 4,0.73857796,0.73851913,0.73830944
7 | 5,0.7549688,0.7549195,0.75471216
8 | 6,0.7463499,0.74629426,0.7461334
9 | 7,0.7811989,0.78114533,0.78101724
10 | 8,0.78642476,0.7863655,0.7862384
11 | 9,0.8234212,0.823385,0.8232284
12 | 10,0.8446837,0.8446493,0.8445056
13 | 11,0.87540615,0.8753815,0.8752877
14 | 12,0.9844347,0.9844323,0.9844318
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/es/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.39393866,0.39371517,0.3905179
3 | 1,0.6807716,0.6807661,0.680328
4 | 2,0.7418765,0.74186456,0.74167407
5 | 3,0.74935234,0.7493611,0.7491301
6 | 4,0.79821396,0.79822713,0.7980995
7 | 5,0.7988987,0.7989139,0.7987521
8 | 6,0.8229017,0.8228938,0.8228024
9 | 7,0.8280001,0.8279914,0.8279237
10 | 8,0.8397697,0.8397626,0.8396876
11 | 9,0.8410181,0.8410066,0.84094054
12 | 10,0.8409921,0.8409992,0.8409067
13 | 11,0.8431543,0.8431424,0.84302104
14 | 12,0.8459719,0.84595364,0.84571356
15 | 13,0.8396326,0.839628,0.83931595
16 | 14,0.84028465,0.84028375,0.83993286
17 | 15,0.8447372,0.84472674,0.8444034
18 | 16,0.8363781,0.8363222,0.8360513
19 | 17,0.84482056,0.8447689,0.8445116
20 | 18,0.85074264,0.85068643,0.8504014
21 | 19,0.84944814,0.8493866,0.8491228
22 | 20,0.86171687,0.86166567,0.8614421
23 | 21,0.87797874,0.8779322,0.8777276
24 | 22,0.87815136,0.87810516,0.87791014
25 | 23,0.87712365,0.87708676,0.8768752
26 | 24,0.9812538,0.9812441,0.9812441
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/et/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.27440318,0.27450797,0.2698814
3 | 1,0.36711293,0.3672066,0.3646441
4 | 2,0.36751607,0.36758184,0.36546195
5 | 3,0.44396114,0.4440282,0.44275236
6 | 4,0.49434176,0.49438694,0.49351478
7 | 5,0.5781191,0.57814497,0.57762396
8 | 6,0.6325188,0.63253754,0.63219965
9 | 7,0.5371272,0.5371553,0.53662723
10 | 8,0.51365125,0.5136854,0.5130298
11 | 9,0.61113626,0.6111767,0.6106605
12 | 10,0.68986833,0.6898959,0.6895253
13 | 11,0.72481495,0.7248488,0.72443366
14 | 12,0.41427994,0.414279,0.41360843
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/et/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.22709163,0.2271243,0.22315507
3 | 1,0.28697282,0.28699732,0.284561
4 | 2,0.31591207,0.31594896,0.3142282
5 | 3,0.3272068,0.3271873,0.3251662
6 | 4,0.33797315,0.33791435,0.3357934
7 | 5,0.39506105,0.39499047,0.39325175
8 | 6,0.49566302,0.4955908,0.49454838
9 | 7,0.55213124,0.5520629,0.55135715
10 | 8,0.5356107,0.53553146,0.53473157
11 | 9,0.48094663,0.4808736,0.4799986
12 | 10,0.41156343,0.41149083,0.410293
13 | 11,0.36135536,0.36126482,0.3597544
14 | 12,0.3840661,0.38395354,0.3824061
15 | 13,0.3990762,0.39895925,0.39723954
16 | 14,0.40530387,0.40517297,0.40322375
17 | 15,0.41519368,0.41508782,0.4130928
18 | 16,0.3316055,0.33152688,0.32986996
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/et/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.27967498,0.27960515,0.27565888
3 | 1,0.4284523,0.4284252,0.42707014
4 | 2,0.6428217,0.64281446,0.6423762
5 | 3,0.7207139,0.72071636,0.72050244
6 | 4,0.7149051,0.7149116,0.7146955
7 | 5,0.7364546,0.7364631,0.73624396
8 | 6,0.72894406,0.7289576,0.7287705
9 | 7,0.76335233,0.76335174,0.7631944
10 | 8,0.7660467,0.7660525,0.76588887
11 | 9,0.80481553,0.8047997,0.8046123
12 | 10,0.824247,0.8242213,0.8240452
13 | 11,0.8616431,0.8616192,0.8615053
14 | 12,0.9794287,0.9794275,0.9794225
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/et/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.34819788,0.3482508,0.34528664
3 | 1,0.66133285,0.6613388,0.66087705
4 | 2,0.7167448,0.71674955,0.71656334
5 | 3,0.7235102,0.7235182,0.7232822
6 | 4,0.7763208,0.77634054,0.7762066
7 | 5,0.7787467,0.77879304,0.77860975
8 | 6,0.8065161,0.8065541,0.8064417
9 | 7,0.8130386,0.8130481,0.81297535
10 | 8,0.8232221,0.8232352,0.82315505
11 | 9,0.8259885,0.8259966,0.82592285
12 | 10,0.8261345,0.8261378,0.82604396
13 | 11,0.8302732,0.83030087,0.8301369
14 | 12,0.832208,0.8322509,0.83195096
15 | 13,0.8284099,0.82843494,0.8280566
16 | 14,0.8308385,0.830874,0.83043313
17 | 15,0.83593214,0.83598274,0.8355737
18 | 16,0.8225831,0.8226431,0.8222514
19 | 17,0.83149856,0.83155996,0.83119583
20 | 18,0.8360739,0.83612305,0.83573186
21 | 19,0.8338515,0.8339162,0.8335273
22 | 20,0.85045713,0.8505154,0.8501959
23 | 21,0.866938,0.8670008,0.86669517
24 | 22,0.86754334,0.86759776,0.86728114
25 | 23,0.86495036,0.86501676,0.86465526
26 | 24,0.97565717,0.97566575,0.97565126
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fi/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.27203143,0.2718125,0.2674527
3 | 1,0.3669008,0.36672932,0.36434102
4 | 2,0.36613643,0.36596906,0.36401412
5 | 3,0.4369806,0.43683773,0.43563512
6 | 4,0.4888657,0.48875853,0.48795715
7 | 5,0.5726952,0.5726454,0.572158
8 | 6,0.62713367,0.62711185,0.6267881
9 | 7,0.5336007,0.53355575,0.53305566
10 | 8,0.51138526,0.51132864,0.51072747
11 | 9,0.6112424,0.6111909,0.6107369
12 | 10,0.6913106,0.6912809,0.6909531
13 | 11,0.7289148,0.7289066,0.7285409
14 | 12,0.40449622,0.4044448,0.4038471
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fi/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.23438127,0.23428068,0.23040625
3 | 1,0.2891474,0.2890501,0.28663626
4 | 2,0.31794775,0.3178401,0.3161117
5 | 3,0.3314175,0.3313274,0.32932603
6 | 4,0.342742,0.34266472,0.34063184
7 | 5,0.40328184,0.40322024,0.40158102
8 | 6,0.5053177,0.5052804,0.50429296
9 | 7,0.55995744,0.5599387,0.55925107
10 | 8,0.5432386,0.5432242,0.5424414
11 | 9,0.48718062,0.4871476,0.48624423
12 | 10,0.41743338,0.41739362,0.4161943
13 | 11,0.36450592,0.36447832,0.3629536
14 | 12,0.38068864,0.38065174,0.37914556
15 | 13,0.40042648,0.40037584,0.39876702
16 | 14,0.40577888,0.405764,0.40404075
17 | 15,0.4122403,0.412242,0.4104758
18 | 16,0.32324278,0.3231793,0.3216624
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fi/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.28559503,0.2854355,0.28159738
3 | 1,0.4194148,0.41935146,0.417895
4 | 2,0.6369165,0.63687444,0.63647
5 | 3,0.7129336,0.71288896,0.71269244
6 | 4,0.705694,0.7056649,0.7054607
7 | 5,0.7278231,0.72779924,0.7275826
8 | 6,0.7264064,0.72638345,0.72620934
9 | 7,0.76126385,0.7612437,0.7610952
10 | 8,0.76516724,0.76513124,0.76499206
11 | 9,0.8022079,0.8021703,0.8020057
12 | 10,0.8249256,0.8248923,0.824736
13 | 11,0.86274844,0.8627164,0.862623
14 | 12,0.98083913,0.9808375,0.9808362
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fi/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.36092442,0.36074498,0.3580085
3 | 1,0.6607407,0.66068447,0.66028255
4 | 2,0.7176594,0.7175985,0.71745765
5 | 3,0.7285679,0.72852075,0.72832286
6 | 4,0.78595924,0.7859272,0.7858384
7 | 5,0.7865282,0.7864904,0.7863824
8 | 6,0.8087826,0.80874693,0.80867803
9 | 7,0.81360877,0.8135691,0.8135222
10 | 8,0.8234922,0.82345206,0.82339203
11 | 9,0.82659143,0.8265619,0.8265032
12 | 10,0.82844096,0.8284168,0.8283329
13 | 11,0.833159,0.83313984,0.83300036
14 | 12,0.83688194,0.8368595,0.8365941
15 | 13,0.83482826,0.8348066,0.83445275
16 | 14,0.8371448,0.8371316,0.8367094
17 | 15,0.8411402,0.8411226,0.8407152
18 | 16,0.8285362,0.828508,0.8281295
19 | 17,0.8365054,0.8364763,0.8361323
20 | 18,0.84074885,0.84073424,0.8403675
21 | 19,0.8374997,0.8374846,0.83713585
22 | 20,0.85316974,0.8531484,0.85286206
23 | 21,0.8724993,0.87247324,0.87221074
24 | 22,0.87472016,0.87468535,0.87441224
25 | 23,0.8715076,0.8714718,0.8711863
26 | 24,0.97793525,0.97793806,0.9779296
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fr/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3169629,0.31686378,0.31290925
3 | 1,0.3973607,0.39730093,0.3948981
4 | 2,0.3917096,0.39167702,0.38944873
5 | 3,0.471558,0.4715446,0.4703017
6 | 4,0.51729333,0.5172892,0.5164051
7 | 5,0.5921461,0.59214556,0.59160614
8 | 6,0.64118487,0.6411703,0.64082944
9 | 7,0.54434645,0.5443365,0.5437896
10 | 8,0.52369165,0.5237088,0.5230594
11 | 9,0.62573117,0.62573653,0.6252499
12 | 10,0.69342446,0.6934141,0.6930288
13 | 11,0.72644377,0.72643,0.7260432
14 | 12,0.37622055,0.3762342,0.37555423
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fr/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.29251722,0.29243135,0.28824046
3 | 1,0.3270653,0.32702345,0.32414123
4 | 2,0.34298986,0.34297037,0.34081614
5 | 3,0.35257423,0.35255608,0.3502542
6 | 4,0.36079553,0.36077785,0.35852012
7 | 5,0.4250942,0.425059,0.42333168
8 | 6,0.5288226,0.5288088,0.5278067
9 | 7,0.57518166,0.5751787,0.5744667
10 | 8,0.5556386,0.5556409,0.554816
11 | 9,0.50031036,0.50027037,0.49935982
12 | 10,0.431764,0.43173033,0.43051916
13 | 11,0.3727856,0.37272698,0.37108865
14 | 12,0.3785679,0.37849474,0.37677515
15 | 13,0.3937992,0.39371702,0.39193982
16 | 14,0.3963082,0.39625022,0.3943681
17 | 15,0.40861925,0.408575,0.40663382
18 | 16,0.3136189,0.3135873,0.3120113
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fr/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.32206962,0.32194686,0.31745258
3 | 1,0.4602701,0.46020856,0.45864564
4 | 2,0.6719199,0.6718807,0.67143625
5 | 3,0.74045163,0.74043214,0.74017817
6 | 4,0.72625005,0.7262382,0.72596425
7 | 5,0.74321467,0.7431929,0.7429401
8 | 6,0.73884493,0.7388181,0.7386279
9 | 7,0.77495724,0.77493846,0.77478385
10 | 8,0.78073204,0.7807058,0.7805547
11 | 9,0.8198895,0.81987315,0.81971085
12 | 10,0.84097534,0.84096044,0.8408151
13 | 11,0.8744024,0.87438446,0.8742934
14 | 12,0.98294896,0.98294634,0.9829455
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/fr/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.38867676,0.38860926,0.38541874
3 | 1,0.6748494,0.6748672,0.6744121
4 | 2,0.7362502,0.7362605,0.73607147
5 | 3,0.74364084,0.7436084,0.7434166
6 | 4,0.79383326,0.79382324,0.7937171
7 | 5,0.79398805,0.7939688,0.79384273
8 | 6,0.8189012,0.818897,0.81879824
9 | 7,0.8252554,0.8252701,0.82518923
10 | 8,0.8372976,0.8373069,0.837221
11 | 9,0.83934426,0.8393484,0.8392743
12 | 10,0.8396223,0.8396263,0.8395396
13 | 11,0.8413963,0.8414122,0.84128094
14 | 12,0.8425236,0.84252983,0.8422956
15 | 13,0.836232,0.8362653,0.8359306
16 | 14,0.8365411,0.8365994,0.83620155
17 | 15,0.84075475,0.84081256,0.84043586
18 | 16,0.8336484,0.83366156,0.83334255
19 | 17,0.8420401,0.84204596,0.84175515
20 | 18,0.84736043,0.847369,0.8470594
21 | 19,0.8457147,0.84572095,0.84543604
22 | 20,0.8610545,0.86105347,0.8608192
23 | 21,0.8796009,0.87960935,0.87939256
24 | 22,0.87826204,0.8782994,0.8780729
25 | 23,0.8757684,0.8757959,0.8755639
26 | 24,0.9783308,0.9783405,0.9783287
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/it/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.2844219,0.28444692,0.28044853
3 | 1,0.37012622,0.37011567,0.3678241
4 | 2,0.37155172,0.3715547,0.36946896
5 | 3,0.4603244,0.46031958,0.45919034
6 | 4,0.50872415,0.5087325,0.50791526
7 | 5,0.5868436,0.5868716,0.5863534
8 | 6,0.6397911,0.63983333,0.63949335
9 | 7,0.5409238,0.54094136,0.54040617
10 | 8,0.5172371,0.51725966,0.5166258
11 | 9,0.62051994,0.6205607,0.62006223
12 | 10,0.6916372,0.6916744,0.69127834
13 | 11,0.7267179,0.72675204,0.7263427
14 | 12,0.38121554,0.38125327,0.38057736
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/it/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.26145306,0.26147375,0.25741524
3 | 1,0.3025566,0.30248818,0.2998916
4 | 2,0.32179558,0.32175842,0.31985885
5 | 3,0.33394024,0.33392504,0.33166507
6 | 4,0.34498307,0.3450014,0.34262
7 | 5,0.41246715,0.4124828,0.41061035
8 | 6,0.51987356,0.51987606,0.51879877
9 | 7,0.56968486,0.5696755,0.56891495
10 | 8,0.5526059,0.55259466,0.55172443
11 | 9,0.49650237,0.49646214,0.49543175
12 | 10,0.42862728,0.42857316,0.42718053
13 | 11,0.36833626,0.36827257,0.3663598
14 | 12,0.37506276,0.3750053,0.37302044
15 | 13,0.38766044,0.38759127,0.3855202
16 | 14,0.39820743,0.39815167,0.39614087
17 | 15,0.4081781,0.40812454,0.4060677
18 | 16,0.31196377,0.31188112,0.31034505
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/it/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.29116568,0.29114744,0.2868295
3 | 1,0.44197133,0.4419524,0.4405804
4 | 2,0.6624113,0.662375,0.661958
5 | 3,0.73566717,0.7356161,0.73541015
6 | 4,0.72424763,0.72419375,0.7239804
7 | 5,0.74316144,0.743101,0.7429021
8 | 6,0.7358866,0.7358457,0.7356837
9 | 7,0.7717992,0.77175343,0.7716246
10 | 8,0.77671385,0.77666664,0.7765362
11 | 9,0.8156109,0.8155815,0.81542325
12 | 10,0.8353943,0.8353729,0.83522326
13 | 11,0.8693978,0.86938006,0.86928266
14 | 12,0.98234653,0.98234344,0.9823433
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/it/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3594449,0.35943142,0.35630783
3 | 1,0.662669,0.662666,0.66222733
4 | 2,0.7278628,0.72787654,0.727691
5 | 3,0.7365745,0.73659885,0.7363675
6 | 4,0.7886599,0.7886995,0.7885636
7 | 5,0.7880371,0.7880777,0.7879012
8 | 6,0.8136995,0.813729,0.8136207
9 | 7,0.8208896,0.8209284,0.82083935
10 | 8,0.83259714,0.8326411,0.8325414
11 | 9,0.83512545,0.83517814,0.83508295
12 | 10,0.8349007,0.8349598,0.83484215
13 | 11,0.8370682,0.83713394,0.83697623
14 | 12,0.83735925,0.8374388,0.83716834
15 | 13,0.8307876,0.8308769,0.8305337
16 | 14,0.8304336,0.83052474,0.83013064
17 | 15,0.8350196,0.83511585,0.8347356
18 | 16,0.8262828,0.8263541,0.82602215
19 | 17,0.8352246,0.83529,0.8349814
20 | 18,0.8413706,0.8414452,0.84111327
21 | 19,0.84041846,0.84048223,0.840178
22 | 20,0.85462093,0.8546788,0.854426
23 | 21,0.8733275,0.87337714,0.8731498
24 | 22,0.87235314,0.8724075,0.87218434
25 | 23,0.86924857,0.86931163,0.8690715
26 | 24,0.97641337,0.9764174,0.97640705
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/lv/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.31880563,0.31885436,0.314444
3 | 1,0.39939553,0.399461,0.39680958
4 | 2,0.39936826,0.39942977,0.39705652
5 | 3,0.4639698,0.46403775,0.462585
6 | 4,0.51133174,0.511391,0.5104017
7 | 5,0.58995867,0.59001076,0.5894416
8 | 6,0.64041185,0.6404576,0.640104
9 | 7,0.5489481,0.5489947,0.5485002
10 | 8,0.5241059,0.5241476,0.5235563
11 | 9,0.61489826,0.6149375,0.614489
12 | 10,0.69464105,0.6946774,0.69437844
13 | 11,0.73005176,0.7301036,0.72975814
14 | 12,0.42655912,0.42657772,0.42596325
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/lv/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.25295407,0.2529038,0.24914542
3 | 1,0.30763087,0.30758235,0.3050838
4 | 2,0.3358753,0.33583683,0.33403713
5 | 3,0.35062265,0.35062423,0.34857833
6 | 4,0.36368594,0.36370537,0.36166307
7 | 5,0.4208051,0.42084384,0.41921085
8 | 6,0.52163017,0.5216751,0.5207288
9 | 7,0.5748712,0.57491165,0.57426846
10 | 8,0.5565561,0.55660844,0.55587536
11 | 9,0.50083,0.50086135,0.5000541
12 | 10,0.4287173,0.42873642,0.4276349
13 | 11,0.37965864,0.37967306,0.37825358
14 | 12,0.407949,0.40795445,0.40652746
15 | 13,0.43800756,0.437995,0.43645307
16 | 14,0.45024598,0.4502631,0.4485536
17 | 15,0.45746338,0.45749146,0.4557482
18 | 16,0.38742596,0.38743827,0.3859296
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/lv/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3158023,0.31595236,0.31170404
3 | 1,0.44148916,0.44161677,0.43999764
4 | 2,0.65698195,0.65707916,0.65660477
5 | 3,0.7291459,0.72921735,0.72896385
6 | 4,0.72035086,0.720424,0.72016907
7 | 5,0.7387083,0.73877054,0.73851764
8 | 6,0.7331035,0.7331564,0.73294306
9 | 7,0.7675076,0.767555,0.7673717
10 | 8,0.7721929,0.7722387,0.77205515
11 | 9,0.8134348,0.81347644,0.81327385
12 | 10,0.8337028,0.8337392,0.8335502
13 | 11,0.86931133,0.869342,0.86921614
14 | 12,0.98048294,0.9804859,0.980481
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/lv/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.38303047,0.38316694,0.38004494
3 | 1,0.6715437,0.67160815,0.671146
4 | 2,0.72935355,0.7293971,0.7292059
5 | 3,0.7367963,0.7368109,0.73659825
6 | 4,0.78665257,0.78668237,0.7865436
7 | 5,0.7876893,0.7877102,0.7875555
8 | 6,0.8118788,0.811887,0.8117912
9 | 7,0.81852704,0.8185447,0.81847227
10 | 8,0.82763994,0.8276652,0.8275769
11 | 9,0.829965,0.8299915,0.8299079
12 | 10,0.8325733,0.8325909,0.832492
13 | 11,0.83627987,0.8362996,0.8361541
14 | 12,0.83918196,0.8392273,0.8389481
15 | 13,0.83649516,0.8365421,0.83620304
16 | 14,0.8397933,0.8398624,0.8394616
17 | 15,0.84358793,0.8436498,0.8432716
18 | 16,0.83227086,0.8323361,0.8319669
19 | 17,0.83968145,0.83974457,0.8394039
20 | 18,0.8447246,0.84477955,0.8444194
21 | 19,0.8420644,0.8421077,0.84175897
22 | 20,0.85892874,0.85897714,0.85869175
23 | 21,0.8784681,0.87850714,0.8782475
24 | 22,0.8812008,0.8812461,0.8809788
25 | 23,0.8809998,0.88104856,0.8807619
26 | 24,0.9765009,0.9765086,0.9764968
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/pt/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.29287484,0.29276612,0.28896153
3 | 1,0.37047997,0.3703816,0.36809015
4 | 2,0.36899555,0.36891612,0.36687937
5 | 3,0.46113333,0.46108902,0.46000266
6 | 4,0.5105716,0.5105592,0.5097696
7 | 5,0.5895024,0.5895065,0.5890118
8 | 6,0.6431079,0.6431254,0.64279854
9 | 7,0.5462664,0.54628617,0.5457523
10 | 8,0.5256067,0.5256172,0.5249814
11 | 9,0.6314677,0.6314837,0.63099706
12 | 10,0.70045394,0.7004913,0.7001096
13 | 11,0.73772144,0.7377826,0.73737544
14 | 12,0.376468,0.37648788,0.37585765
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/pt/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.27077773,0.27081063,0.26688486
3 | 1,0.31458914,0.31458747,0.3120791
4 | 2,0.33407974,0.3341051,0.33230388
5 | 3,0.3424346,0.34244755,0.3402508
6 | 4,0.35230166,0.35230178,0.35003006
7 | 5,0.4220341,0.42202395,0.4202843
8 | 6,0.53116757,0.53115034,0.530122
9 | 7,0.5820105,0.5819889,0.5812692
10 | 8,0.5657533,0.56574196,0.56491727
11 | 9,0.5071269,0.5071374,0.5062459
12 | 10,0.43731558,0.43734074,0.43611565
13 | 11,0.37808847,0.37813658,0.37643093
14 | 12,0.38327742,0.3833187,0.38146868
15 | 13,0.39855412,0.39860153,0.3966159
16 | 14,0.40502536,0.4050431,0.4029639
17 | 15,0.4187931,0.4188173,0.41667226
18 | 16,0.3218223,0.32182986,0.32025683
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/pt/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.302053,0.30184427,0.2976399
3 | 1,0.45684627,0.45677197,0.45546424
4 | 2,0.6735796,0.6735233,0.6731233
5 | 3,0.74322075,0.74317265,0.74297166
6 | 4,0.72940576,0.72935677,0.72914743
7 | 5,0.74814695,0.748095,0.74789345
8 | 6,0.7392128,0.739171,0.7390097
9 | 7,0.7750178,0.774979,0.7748446
10 | 8,0.7798485,0.77981144,0.779676
11 | 9,0.81777656,0.8177529,0.8175885
12 | 10,0.83894795,0.8389314,0.838778
13 | 11,0.870458,0.87044656,0.8703426
14 | 12,0.9830619,0.98306423,0.98306143
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/pt/xlm-roberta-large.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3740441,0.37399486,0.3710023
3 | 1,0.67301875,0.6729773,0.6725724
4 | 2,0.7341082,0.73409325,0.733912
5 | 3,0.74055076,0.7405203,0.7403056
6 | 4,0.7904661,0.79042983,0.79032314
7 | 5,0.7880771,0.78803754,0.78790236
8 | 6,0.81661665,0.8166169,0.81652534
9 | 7,0.8221869,0.82219744,0.82212555
10 | 8,0.8350775,0.83508027,0.8350043
11 | 9,0.8372719,0.83726805,0.8372026
12 | 10,0.8372136,0.8371918,0.8371133
13 | 11,0.8399054,0.8398653,0.83975667
14 | 12,0.84060127,0.8405483,0.84033316
15 | 13,0.8341999,0.8341561,0.83385843
16 | 14,0.83416283,0.83414257,0.8337824
17 | 15,0.8384014,0.83838236,0.8380531
18 | 16,0.8296981,0.82966036,0.8293861
19 | 17,0.83966845,0.8396195,0.8393744
20 | 18,0.84589136,0.8458346,0.845562
21 | 19,0.84492606,0.8448792,0.8446221
22 | 20,0.8584489,0.8584082,0.858195
23 | 21,0.87398726,0.8739412,0.87374836
24 | 22,0.8719638,0.8719428,0.8717388
25 | 23,0.87165064,0.87161326,0.87140054
26 | 24,0.97964114,0.9796477,0.979639
27 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/zh/bert-base-chinese.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.2786982,0.2785878,0.27494037
3 | 1,0.33671036,0.33662596,0.33471334
4 | 2,0.42845756,0.4284101,0.4273608
5 | 3,0.45149758,0.45147166,0.45057997
6 | 4,0.5184017,0.5184023,0.51783705
7 | 5,0.573508,0.5734958,0.57311326
8 | 6,0.6330495,0.6330315,0.63276017
9 | 7,0.59864044,0.5986131,0.59829366
10 | 8,0.54804957,0.5480091,0.54755783
11 | 9,0.51617336,0.516132,0.5156478
12 | 10,0.5561151,0.55609417,0.55573994
13 | 11,0.5984755,0.5984512,0.5981564
14 | 12,0.56038475,0.5603337,0.5599188
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/zh/bert-base-multilingual-cased.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.3118959,0.31177446,0.3086432
3 | 1,0.3425565,0.34244964,0.3399823
4 | 2,0.35352883,0.35343447,0.35129714
5 | 3,0.43610418,0.43604368,0.43494177
6 | 4,0.489178,0.4891102,0.48830378
7 | 5,0.5690116,0.5689432,0.5684761
8 | 6,0.6265541,0.6264865,0.6262059
9 | 7,0.54113525,0.5410629,0.54064935
10 | 8,0.5284168,0.52834535,0.5279011
11 | 9,0.62840384,0.62833464,0.62803453
12 | 10,0.69999313,0.69992936,0.6997184
13 | 11,0.732485,0.73242646,0.73219264
14 | 12,0.37793094,0.37789607,0.3773833
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/zh/xlm-mlm-100-1280.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.32776257,0.3276813,0.32470536
3 | 1,0.3356181,0.3355479,0.332959
4 | 2,0.35034394,0.35026166,0.3482373
5 | 3,0.36442822,0.36435747,0.36224666
6 | 4,0.3771403,0.37707978,0.37516475
7 | 5,0.43258497,0.4325344,0.43104237
8 | 6,0.5181599,0.5181224,0.5172326
9 | 7,0.5792645,0.57922333,0.57866186
10 | 8,0.5692134,0.5691731,0.56858486
11 | 9,0.5324812,0.5324232,0.53178775
12 | 10,0.47810394,0.47805268,0.47723517
13 | 11,0.4319199,0.43188363,0.43088776
14 | 12,0.44747546,0.447443,0.44653583
15 | 13,0.45633683,0.4563076,0.45531917
16 | 14,0.45723236,0.457195,0.45610127
17 | 15,0.46675017,0.46670267,0.4656479
18 | 16,0.40051928,0.40046176,0.39960644
19 |
--------------------------------------------------------------------------------
/bert_score/bert_score/rescale_baseline/zh/xlm-roberta-base.tsv:
--------------------------------------------------------------------------------
1 | LAYER,P,R,F
2 | 0,0.36188287,0.36180493,0.35862362
3 | 1,0.4372344,0.43716717,0.43550655
4 | 2,0.64521,0.64515334,0.6446227
5 | 3,0.734053,0.7340016,0.7337482
6 | 4,0.730163,0.73011726,0.72988415
7 | 5,0.7542184,0.7541747,0.7539484
8 | 6,0.7611062,0.7610684,0.76089287
9 | 7,0.79163146,0.7915949,0.79145956
10 | 8,0.79859376,0.79856044,0.7984367
11 | 9,0.82988167,0.8298588,0.82975745
12 | 10,0.8522986,0.8522761,0.8521975
13 | 11,0.8852355,0.88521546,0.88517046
14 | 12,0.98287344,0.98286974,0.9828698
15 |
--------------------------------------------------------------------------------
/bert_score/bert_score_cli/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/bert_score_cli/__init__.py
--------------------------------------------------------------------------------
/bert_score/bert_score_cli/score.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import argparse
4 | import torch
5 |
6 | import bert_score
7 | from add_context import add_context
8 |
9 | def main():
10 | torch.multiprocessing.set_sharing_strategy("file_system")
11 |
12 | parser = argparse.ArgumentParser("Calculate BERTScore")
13 | parser.add_argument(
14 | "--lang",
15 | type=str,
16 | default=None,
17 | help='two-letter abbreviation of the language (e.g., en) or "en-sci" for scientific text',
18 | )
19 | parser.add_argument(
20 | "-m", "--model", default=None, help="BERT model name (default: bert-base-uncased) or path to a pretrain model",
21 | )
22 | parser.add_argument("-l", "--num_layers", type=int, default=None, help="use first N layer in BERT (default: 8)")
23 | parser.add_argument("-b", "--batch_size", type=int, default=64, help="batch size (default: 64)")
24 | parser.add_argument("--nthreads", type=int, default=4, help="number of cpu workers (default: 4)")
25 | parser.add_argument("--idf", action="store_true", help="BERT Score with IDF scaling")
26 | parser.add_argument(
27 | "--rescale_with_baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines",
28 | )
29 | parser.add_argument("--baseline_path", default=None, type=str, help="path of custom baseline csv file")
30 | parser.add_argument("--use_fast_tokenizer", action="store_false", help="whether to use HF fast tokenizer")
31 | parser.add_argument("-s", "--seg_level", action="store_true", help="show individual score of each pair")
32 | parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
33 | parser.add_argument("-r", "--ref", type=str, nargs="+", required=True, help="reference file path(s) or a string")
34 | parser.add_argument(
35 | "-c", "--cand", type=str, required=True, help="candidate (system outputs) file path or a string",
36 | )
37 | parser.add_argument("--doc", type=str, default=None, help="File containing document IDs to evaluate at the document level.")
38 |
39 | args = parser.parse_args()
40 |
41 | if args.doc:
42 | print('Running at document level')
43 | with open(args.doc, encoding="utf-8") as fp:
44 | doc_ids = [line.strip() for line in fp.readlines()]
45 | assert not args.idf, "idf mode is not supported for document-level evaluation"
46 |
47 | if os.path.isfile(args.cand):
48 | with open(args.cand) as f:
49 | cands = [line.strip() for line in f]
50 |
51 | refs = []
52 | for ref_file in args.ref:
53 | assert os.path.exists(ref_file), f"reference file {ref_file} doesn't exist"
54 | with open(ref_file) as f:
55 | curr_ref = [line.strip() for line in f]
56 | assert len(curr_ref) == len(cands), f"# of sentences in {ref_file} doesn't match the # of candidates"
57 | if args.doc:
58 | sep_token = "[SEP]" if args.lang != "en" else ""
59 | sent_ref = curr_ref  # keep the sentence-level references; reused below as context for the candidates
60 | curr_ref = add_context(orig_txt=curr_ref, context=curr_ref, doc_ids=doc_ids, sep_token=sep_token)
61 | refs.append(curr_ref)
62 | refs = list(zip(*refs))
63 | elif os.path.isfile(args.ref[0]):
64 | assert os.path.exists(args.cand), f"candidate file {args.cand} doesn't exist"
65 | else:
66 | cands = [args.cand]
67 | refs = [args.ref]
68 | assert not args.idf, "idf mode is not supported for a single pair of sentences"
69 |
70 | if args.doc:
71 | print('Adding reference context to MT')
72 | cands = add_context(orig_txt=cands, context=sent_ref, doc_ids=doc_ids, sep_token=sep_token)
73 |
74 | all_preds, hash_code = bert_score.score(
75 | cands,
76 | refs,
77 | model_type=args.model,
78 | num_layers=args.num_layers,
79 | verbose=args.verbose,
80 | idf=args.idf,
81 | batch_size=args.batch_size,
82 | lang=args.lang,
83 | return_hash=True,
84 | rescale_with_baseline=args.rescale_with_baseline,
85 | baseline_path=args.baseline_path,
86 | use_fast_tokenizer=args.use_fast_tokenizer,
87 | doc=bool(args.doc)
88 | )
89 | avg_scores = [s.mean(dim=0) for s in all_preds]
90 | P = avg_scores[0].cpu().item()
91 | R = avg_scores[1].cpu().item()
92 | F1 = avg_scores[2].cpu().item()
93 | msg = hash_code + f" P: {P:.6f} R: {R:.6f} F1: {F1:.6f}"
94 | print(msg)
95 | if args.seg_level:
96 | ps, rs, fs = all_preds
97 | for p, r, f in zip(ps, rs, fs):
98 | print("{:.6f}\t{:.6f}\t{:.6f}".format(p, r, f))
99 |
100 |
101 | if __name__ == "__main__":
102 | main()
103 |
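For orientation, here is a minimal sketch of the document-level path above. It assumes `add_context` (the repo helper imported at the top of the script, defined in the repository's `add_context.py`) augments each sentence with preceding sentences from the same document, as its call sites here suggest; all data is illustrative:

```python
from add_context import add_context

# Illustrative mini-corpus: two documents, marked by doc_ids.
refs = ["A first reference.", "Its follow-up sentence.",
        "A new document starts.", "Its second sentence."]
doc_ids = ["doc1", "doc1", "doc2", "doc2"]

# Mirror score.py: augment the references with same-document context.
# The empty sep_token matches the English branch above; other languages use "[SEP]".
ctx_refs = add_context(orig_txt=refs, context=refs, doc_ids=doc_ids, sep_token="")

# score.py then augments the candidates with the *reference* context (sent_ref)
# and calls bert_score.score(..., doc=True) on the augmented texts.
```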
--------------------------------------------------------------------------------
/bert_score/bert_score_cli/visualize.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import os
3 | import time
4 | import argparse
5 | import torch
6 | from collections import defaultdict
7 |
8 | import bert_score
9 |
10 |
11 | def main():
12 | torch.multiprocessing.set_sharing_strategy("file_system")
13 |
14 | parser = argparse.ArgumentParser("Visualize BERTScore")
15 | parser.add_argument("--lang", type=str, default="en", help="two-letter abbreviation of the language (e.g., en)")
16 | parser.add_argument("-m", "--model", default=None, help="BERT model name (default: bert-base-uncased)")
17 | parser.add_argument("-l", "--num_layers", type=int, default=None, help="use first N layer in BERT (default: 8)")
18 | parser.add_argument("-v", "--verbose", action="store_true", help="increase output verbosity")
19 | parser.add_argument("-r", "--ref", type=str, required=True, help="reference sentence")
20 | parser.add_argument("-c", "--cand", type=str, required=True, help="candidate sentence")
21 | parser.add_argument(
22 | "-f", "--file", type=str, default="visualize.png", help="name of file to save output matrix in",
23 | )
24 | parser.add_argument(
25 | "--rescale_with_baseline", action="store_true", help="Rescaling the numerical score with precomputed baselines",
26 | )
27 | parser.add_argument("--baseline_path", default=None, type=str, help="path of custom baseline csv file")
28 |
29 | args = parser.parse_args()
30 |
31 | bert_score.plot_example(
32 | args.cand,
33 | args.ref,
34 | model_type=args.model,
35 | lang=args.lang,
36 | num_layers=args.num_layers,
37 | fname=args.file,
38 | rescale_with_baseline=args.rescale_with_baseline,
39 | baseline_path=args.baseline_path,
40 | )
41 |
42 |
43 | if __name__ == "__main__":
44 | main()
45 |
--------------------------------------------------------------------------------
/bert_score/get_rescale_baseline/README.md:
--------------------------------------------------------------------------------
1 | # Computing rescale baselines for English models
2 | ### Downloading the dataset
3 | This downloads the WMT17 English text data.
4 | ```sh
5 | bash download_text_data.sh
6 | ```
7 |
8 | ### Tuning the models
9 | Here is an example of producing the rescale baseline files for two models:
10 | ```sh
11 | python get_rescale_baseline.py --lang en -b 16 -m \
12 | microsoft/deberta-large \
13 | microsoft/deberta-large-mnli
14 | ```
15 | The baseline files will be written to the `rescale_baseline` folder.
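Despite the `.tsv` extension, the produced files are comma-separated with a `LAYER` index and `P`/`R`/`F` columns (see the dumps earlier in this listing). A minimal sketch for inspecting one, with an illustrative path:

```python
import pandas as pd

# Path is illustrative; files are laid out as rescale_baseline/<lang>/<model>.tsv
df = pd.read_csv("rescale_baseline/en/microsoft/deberta-large.tsv")
print(df.columns.tolist())          # ['LAYER', 'P', 'R', 'F']
print(df.loc[df.LAYER == 17, "F"])  # F baseline at layer 17
```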
--------------------------------------------------------------------------------
/bert_score/get_rescale_baseline/download_text_data.sh:
--------------------------------------------------------------------------------
1 | mkdir -p data
2 | cd data
3 | if ! [ -f news.2017.en.shuffled.deduped ]; then
4 | wget http://data.statmt.org/wmt18/translation-task/news.2017.en.shuffled.deduped.gz
5 | gzip -d news.2017.en.shuffled.deduped.gz
6 | fi
7 |
8 | echo "finish downloading data"
--------------------------------------------------------------------------------
/bert_score/get_rescale_baseline/get_baseline_example.sh:
--------------------------------------------------------------------------------
1 | bash download_text_data.sh
2 | python get_rescale_baseline.py --lang en -b 16 -m \
3 | microsoft/deberta-large \
4 | microsoft/deberta-large-mnli
5 |
--------------------------------------------------------------------------------
/bert_score/get_rescale_baseline/get_rescale_baseline.py:
--------------------------------------------------------------------------------
1 | import matplotlib
2 | import matplotlib.pyplot as plt
3 | import bert_score
4 | import torch
5 | from random import shuffle
6 | import sacrebleu
7 | import numpy as np
8 | import pandas as pd
9 | from tqdm.auto import tqdm
10 | import os
11 | import argparse
12 | import gzip
13 |
14 |
15 | def get_data(lang="en"):
16 |
17 | if lang == "en":
18 | file_path = "data/news.2017.en.shuffled.deduped"
19 | elif lang == "zh":
20 | file_path = "data/paracrawl/crawl_chinese.txt"
21 | else:
22 | file_path = f"data/paracrawl/rand_{lang}.txt"
23 |
24 | with open(file_path, "r") as f:
25 | lines = []
26 | for i, line in enumerate(f):
27 | if i == 1_000_000:
28 | break
29 | line = line.strip()
30 | if len(line.split(" ")) < 32 and len(line.split(" ")) > 0:
31 | lines.append(line)
32 |
33 | samples = np.random.choice(range(len(lines)), size=(2, len(lines) // 2), replace=False)
34 |
35 | hyp = [lines[i] for i in samples[0]]
36 | cand = [lines[i] for i in samples[1]]
37 |
38 | return hyp, cand
39 |
40 |
41 | def chunk(l, n):
42 | # yield successive chunks of size n from list l
43 | for i in range(0, len(l), n):
44 | yield l[i : i + n]
45 |
46 |
47 | if __name__ == "__main__":
48 | parser = argparse.ArgumentParser(description="Process some integers.")
49 | parser.add_argument("--lang", type=str, required=True, help="language to compute baseline with")
50 | parser.add_argument("-m", "--model", nargs="+", help="models to tune")
51 | parser.add_argument("-b", "--batch_size", type=int, default=64)
52 |
53 | args = parser.parse_args()
54 |
55 | hyp, cand = get_data(lang=args.lang)
56 |
57 | for model_type in args.model:
58 | baseline_file_path = f"rescale_baseline/{args.lang}/{model_type}.tsv"
59 | if os.path.isfile(baseline_file_path):
60 | print(f"{model_type} baseline exists for {args.lang}")
61 | continue
62 | else:
63 | print(f"computing baseline for {model_type} on {args.lang}")
64 | scorer = bert_score.BERTScorer(model_type=model_type, all_layers=True)
65 | with torch.no_grad():
66 | score_means = None
67 | count = 0
68 | for batches in tqdm(chunk(list(zip(hyp, cand)), 1000), total=len(hyp) / 1000):
69 | batch_hyp, batch_cand = zip(*batches)
70 | scores = scorer.score(batch_hyp, batch_cand, batch_size=args.batch_size)
71 | scores = torch.stack(scores, dim=0)  # (3, num_layers, batch): P, R, F for every layer
72 | if score_means is None:
73 | score_means = scores.mean(dim=-1)
74 | else:  # fold this chunk's means into a running weighted mean
75 | score_means = score_means * count / (count + len(batches)) + scores.mean(dim=-1) * len(
76 | batches
77 | ) / (count + len(batches))
78 | count += len(batches)
79 |
80 | pd_baselines = pd.DataFrame(score_means.numpy().transpose(), columns=["P", "R", "F"])
81 | pd_baselines.index.name = "LAYER"
82 |
83 | os.makedirs(os.path.dirname(baseline_file_path), exist_ok=True)
84 | pd_baselines.to_csv(baseline_file_path)
85 | del scorer
86 |
--------------------------------------------------------------------------------
/bert_score/journal/rescale_baseline.md:
--------------------------------------------------------------------------------
1 | # Rescaling BERTScore with Baselines
2 |
3 | BERTScore computes a sentence-level similarity score by aggregating token-level similarities,
4 | computed as cosine similarities between contextual embeddings.
5 | The numerical range of BERTScore is between -1 and 1, the same as the underlying cosine similarity.
6 | In practice, however, BERTScore usually falls in a much smaller range.
7 | In an extreme case, BERTScore computed with the large RoBERTa model is often between 0.85 and 0.95.
8 |
9 | Although BERTScore correlates highly with human judgment despite this caveat, BERTScore will
10 | be easier to interpret and work with if it has a natural range (for example, between 0 and 1).
11 | Therefore, we seek a method to rescale BERTScore to have an intuitive range.
12 | Let's denote the BERTScore for a pair of candidate and reference sentences as $x$.
13 | Let $b$ be a lower bound for the BERTScores that we typically observe in practice (i.e. $b \leq x$).
14 | We obtain a rescaled BERTScore $\hat{x}$ through a simple linear transformation,
15 | $\hat{x} = \frac{x - b}{1 - b}$.
16 | With a reliable baseline $b$, we will typically observe $\hat{x}$ to be between 0 and 1.
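In code, the transformation is a single line. A minimal sketch (the numbers are illustrative, not taken from any released baseline file):

```python
def rescale(score: float, baseline: float) -> float:
    # Map the typical range [baseline, 1] linearly onto [0, 1].
    return (score - baseline) / (1.0 - baseline)

print(rescale(0.93, 0.84))  # 0.5625: a top-of-range raw score becomes mid-range
```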
17 |
18 | We highlight that this rescaling operation does not affect BERTScore's correlation with human judgment, as measured by Pearson's $r$ and Kendall's $\tau$ coefficients. So we preserve BERTScore's high correlation as reported in our [study](https://arxiv.org/abs/1904.09675).
19 | We now describe how we compute a reliable baseline.
20 |
21 | For each language, we select a million sentences from some large monolingual corpus.
22 | We randomly group sentences into candidate-reference pairs, resulting in half a million pairs.
23 | For each contextual embedding model, we compute BERTScore on the random pairs and take the average to be the baseline.
24 | We compute the baseline with different layers of representations and separate the baselines for precision, recall, and F1.
25 | So far, we support 11 different languages (English, Chinese, French, German...) across all of the models we support.
26 | The baseline numbers are collected [here](../rescale_baseline). We plan to release the experiment code soon so you can compute baselines with any data of your choice.
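In this repository the experiment code is in fact included under `get_rescale_baseline/` (shown earlier in this listing). A minimal sketch of the procedure it implements, using the public `bert_score.score` API:

```python
import numpy as np
import bert_score

def compute_baseline(sentences, model_type, num_layers):
    # Randomly split the monolingual corpus into disjoint candidate/reference halves.
    idx = np.random.permutation(len(sentences))
    half = len(sentences) // 2
    cands = [sentences[i] for i in idx[:half]]
    refs = [sentences[i] for i in idx[half:2 * half]]
    # BERTScore between unrelated sentences; the averages are the baselines b.
    P, R, F = bert_score.score(cands, refs, model_type=model_type, num_layers=num_layers)
    return P.mean().item(), R.mean().item(), F.mean().item()
```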
27 |
28 | With this rescaling, the average BERTScore (computed with RoBERTa-large, layer 17) on the WMT18 De-En translation evaluation dataset drops from 0.9311 to 0.5758.
29 | For a concrete example, we can plot the similarity matrix between two sentences using `bert-score-show`.
30 |
31 | Before scaling:
32 |
33 | 
34 |
35 | After scaling:
36 |
37 | 
38 |
39 | Clearly, the rescaling produces a more readable output. Occasionally, some of the similarity entries become negative after rescaling, but they don't affect BERTScore results because the rescaling is applied after BERTScore is computed.
40 |
41 | We package this feature into our library (>=0.3.0). Here's an example of how to use it (note that the language needs to be specified in order to use this feature):
42 | ```python
43 | out = bert_score.score(
44 | cands, refs,
45 | rescale_with_baseline=True, lang="en"
46 | )
47 | ```
48 | and for the command-line version:
49 | ```bash
50 | bert-score -r example/refs.txt -c example/hyps.txt \
51 | --lang en --rescale_with_baseline
52 | ```
53 |
54 |
55 |
56 | Hope you enjoy this new feature!
57 |
58 | ---Tianyi, Varsha, and Felix
59 |
--------------------------------------------------------------------------------
/bert_score/journal/static/.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/journal/static/.png
--------------------------------------------------------------------------------
/bert_score/journal/static/after.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/journal/static/after.png
--------------------------------------------------------------------------------
/bert_score/journal/static/before.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/journal/static/before.png
--------------------------------------------------------------------------------
/bert_score/requirements.txt:
--------------------------------------------------------------------------------
1 | # PyTorch
2 | torch>=1.0.0
3 | # progress bars in model download and training scripts
4 | tqdm>=4.31.1
5 | # BERT
6 | transformers>=3.0.0
7 | matplotlib
8 | pandas>=1.0.1
9 | numpy
10 | packaging>=20.9
11 |
--------------------------------------------------------------------------------
/bert_score/setup.py:
--------------------------------------------------------------------------------
1 | from io import open
2 | from setuptools import find_packages, setup
3 |
4 | setup(
5 | name="bert_score",
6 | version='0.3.11',
7 | author="Tianyi Zhang*, Varsha Kishore*, Felix Wu*, Kilian Q. Weinberger, and Yoav Artzi",
8 | author_email="tzhang@asapp.com",
9 | description="PyTorch implementation of BERT score",
10 | long_description=open("README.md", "r", encoding='utf-8').read(),
11 | long_description_content_type="text/markdown",
12 | keywords='BERT NLP deep learning google metric',
13 | license='MIT',
14 | url="https://github.com/Tiiiger/bert_score",
15 | packages=find_packages(exclude=["*.tests", "*.tests.*",
16 | "tests.*", "tests"]),
17 | install_requires=['torch>=1.0.0',
18 | 'pandas>=1.0.1',
19 | 'transformers>=3.0.0',
20 | 'numpy',
21 | 'requests',
22 | 'tqdm>=4.31.1',
23 | 'matplotlib',
24 | 'packaging>=20.9',
25 | ],
26 | entry_points={
27 | 'console_scripts': [
28 | "bert-score=bert_score_cli.score:main",
29 | "bert-score-show=bert_score_cli.visualize:main",
30 | ]
31 | },
32 | include_package_data=True,
33 | python_requires='>=3.6',
34 | tests_require=['pytest'],
35 | classifiers=[
36 | 'Intended Audience :: Science/Research',
37 | 'License :: OSI Approved :: MIT License',
38 | 'Programming Language :: Python :: 3',
39 | 'Topic :: Scientific/Engineering :: Artificial Intelligence',
40 | ],
41 |
42 | )
43 |
--------------------------------------------------------------------------------
/bert_score/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/bert_score/tests/__init__.py
--------------------------------------------------------------------------------
/bert_score/tests/custom_assertions.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 |
4 |
5 | class CustomAssertions:
6 | def assertAreTensors(self, *args):
7 | if not all([torch.is_tensor(arg) for arg in args]):
8 | raise AssertionError("All values should be of type torch.Tensor")
9 |
10 | def assertTensorsAlmostEqual(self, expected, actual, decimal=5):
11 | """
12 | Test tensors are almost equal (EPS = 1e-5 by default)
13 | """
14 | np.testing.assert_almost_equal(expected, actual, decimal=decimal)
15 |
--------------------------------------------------------------------------------
/bert_score/tests/test_scorer.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from transformers import __version__ as ht_version
3 | import bert_score
4 |
5 | from tests.custom_assertions import CustomAssertions
6 |
7 | cands = [
8 | "28-year-old chef found dead in San Francisco mall",
9 | "A 28-year-old chef who recently moved to San Francisco was found dead in the staircase of a local shopping center.",
10 | 'The victim\'s brother said he cannot imagine anyone who would want to harm him,"Finally, it went uphill again at him."',
11 | ]
12 | refs = [
13 | "28-Year-Old Chef Found Dead at San Francisco Mall",
14 | "A 28-year-old chef who had recently moved to San Francisco was found dead in the stairwell of a local mall this week.",
15 | "But the victim's brother says he can't think of anyone who would want to hurt him, saying, \"Things were finally going well for him.\"",
16 | ]
17 |
18 |
19 | class TestScore(unittest.TestCase, CustomAssertions):
20 | def test_scorer(self):
21 | scorer = bert_score.BERTScorer(lang="en", batch_size=3)
22 |
23 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True)
24 | self.assertAreTensors(P, R, F)
25 | self.assertTensorsAlmostEqual(P, [0.9843302369117737, 0.9832239747047424, 0.9120386242866516])
26 | self.assertTensorsAlmostEqual(R, [0.9823839068412781, 0.9732863903045654, 0.920428991317749])
27 | self.assertTensorsAlmostEqual(F, [0.9833561182022095, 0.9782299995422363, 0.916214644908905])
28 | self.assertEqual(
29 | hash_code, f"roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})",
30 | )
31 |
32 | def test_idf_scorer(self):
33 | scorer = bert_score.BERTScorer(lang="en", idf=True, idf_sents=refs, batch_size=3)
34 |
35 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True)
36 | self.assertAreTensors(P, R, F)
37 | self.assertTensorsAlmostEqual(P, [0.9837872385978699, 0.9754738807678223, 0.8947395086288452])
38 | self.assertTensorsAlmostEqual(R, [0.9827190637588501, 0.9697767496109009, 0.9172918796539307])
39 | self.assertTensorsAlmostEqual(F, [0.9832529425621033, 0.972616970539093, 0.9058753848075867])
40 | self.assertEqual(
41 | hash_code, f"roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})",
42 | )
43 |
44 | def test_scorer_rescale(self):
45 | scorer = bert_score.BERTScorer(lang="en", rescale_with_baseline=True, batch_size=3)
46 |
47 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True)
48 | self.assertAreTensors(P, R, F)
49 | self.assertTensorsAlmostEqual(P, [0.907000780105591, 0.900435566902161, 0.477955609560013])
50 | self.assertTensorsAlmostEqual(R, [0.895456790924072, 0.841467440128326, 0.527785062789917])
51 | self.assertTensorsAlmostEqual(F, [0.901383399963379, 0.871010780334473, 0.503565192222595])
52 | self.assertEqual(
53 | hash_code, f"roberta-large_L17_no-idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled",
54 | )
55 |
56 | def test_idf_scorer_rescale(self):
57 | scorer = bert_score.BERTScorer(lang="en", rescale_with_baseline=True, idf=True, idf_sents=refs, batch_size=3)
58 |
59 | (P, R, F), hash_code = scorer.score(cands, refs, return_hash=True)
60 | self.assertAreTensors(P, R, F)
61 | self.assertTensorsAlmostEqual(P, [0.903778135776520, 0.854439020156860, 0.375287383794785])
62 | self.assertTensorsAlmostEqual(R, [0.897446095943451, 0.820639789104462, 0.509167850017548])
63 | self.assertTensorsAlmostEqual(F, [0.900772094726562, 0.837753534317017, 0.442304641008377])
64 | self.assertEqual(
65 | hash_code, f"roberta-large_L17_idf_version={bert_score.__version__}(hug_trans={ht_version})-rescaled",
66 | )
67 |
68 | def test_multi_refs(self):
69 | scorer = bert_score.BERTScorer(lang="en", batch_size=3, rescale_with_baseline=True)
70 |
71 | cands = ["I like lemons."]
72 | refs = [["I am proud of you.", "I love lemons.", "Go go go."]]
73 | P_mul, R_mul, F_mul = scorer.score(cands, refs,)
74 | P_best, R_best, F_best = scorer.score(cands, [refs[0][1]],)
75 | self.assertTensorsAlmostEqual(P_mul, P_best)
76 | self.assertTensorsAlmostEqual(R_mul, R_best)
77 | self.assertTensorsAlmostEqual(F_mul, F_best)
78 |
79 | def test_multi_refs_working(self):
80 | scorer = bert_score.BERTScorer(lang="en", batch_size=3, rescale_with_baseline=True)
81 |
82 | cands = ["I like lemons.", "Hi", "Hey", "Hello", "Go", ""]
83 | refs = [
84 | ["I am proud of you.", "I love lemons.", "Go go go."],
85 | ["I am proud of you.", "Go go go."],
86 | ["Hi", ""],
87 | ["I am proud of you.", "I love lemons.", "Go go go.", "hello"],
88 | ["I am proud of you.", "Go go go.", "Go", "Go to school"],
89 | ["test"],
90 | ]
91 | P_mul, R_mul, F_mul = scorer.score(cands, refs,)
92 | self.assertAreTensors(P_mul, R_mul, F_mul)
93 |
94 |
95 | if __name__ == "__main__":
96 | unittest.main()
97 |
--------------------------------------------------------------------------------
/bert_score/tune_layers/README.md:
--------------------------------------------------------------------------------
1 | # Tuning the best layer of a pre-trained English model on the WMT16 dataset
2 |
3 | ### Downloading the dataset
4 | This downloads the WMT16 dataset and extracts it into a new folder called `wmt16`. If the `wmt16` folder already exists, the download is skipped.
5 | ```sh
6 | bash download_data.sh
7 | ```
8 |
9 | ### Tuning the models
10 | Here is an example of tuning three models in a row:
11 | ```sh
12 | python tune_layers.py -m bert-base-uncased roberta-base albert-base-v2
13 | ```
14 | The results are appended to `best_layers_log.txt`.
15 | The last three lines of `best_layers_log.txt` will then be
16 | ```
17 | 'bert-base-uncased': 9, # 0.692518813886652
18 | 'roberta-base': 10, # 0.7062886932674598
19 | 'albert-base-v2': 9, # 0.6682362357086912
20 | ```
21 | which show the model name, the best layer, and the Pearson correlation with human judgment.
22 | These can be copied and pasted into `model2layers` in `bert_score/utils.py`.
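At runtime these tuned values serve as the default layer choice. A minimal sketch of the lookup, assuming `model2layers` keeps the plain dict format the log excerpt shows:

```python
from bert_score.utils import model2layers

# Default layer used when num_layers is not passed explicitly.
print(model2layers["roberta-base"])  # 10, per the log excerpt above
```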
--------------------------------------------------------------------------------
/bert_score/tune_layers/download_data.sh:
--------------------------------------------------------------------------------
1 | if ! [ -d wmt16 ]; then
2 | mkdir wmt16
3 | gz_file=wmt16-metrics-results.tar.gz
4 | if ! [ -f $gz_file ]; then
5 | wget https://www.scss.tcd.ie/~ygraham/wmt16-metrics-results.tar.gz
6 | fi
7 | tar -xzf $gz_file -C wmt16
8 | rm -f $gz_file
9 | echo "Finished downloading and extracting the dataset"
10 | else
11 | echo "Folder 'wmt16' exists already."
12 | fi
13 |
--------------------------------------------------------------------------------
/bert_score/tune_layers/tune.sh:
--------------------------------------------------------------------------------
1 | bash download_data.sh
2 | python tune_layers.py -m bert-base-uncased roberta-base albert-base-v2
--------------------------------------------------------------------------------
/bert_score/tune_layers/tune_layers.py:
--------------------------------------------------------------------------------
1 | import os
2 | import re
3 | import argparse
4 | import torch
5 | import numpy as np
6 |
7 | from tqdm.auto import tqdm, trange
8 | from collections import defaultdict
9 | from scipy.stats import pearsonr
10 |
11 | import bert_score
12 |
13 |
14 | def get_wmt16(lang_pair, data_folder="wmt16"):
15 | with open(
16 | os.path.join(
17 | data_folder,
18 | f"wmt16-metrics-results/seg-level-results/DAseg-newstest2016/DAseg-newstest2016.human.{lang_pair}",
19 | )
20 | ) as f:
21 | gold_scores = list(map(float, f.read().strip().split("\n")))
22 |
23 | with open(
24 | os.path.join(
25 | data_folder,
26 | f"wmt16-metrics-results/seg-level-results/DAseg-newstest2016/DAseg-newstest2016.reference.{lang_pair}",
27 | )
28 | ) as f:
29 | all_refs = f.read().strip().split("\n")
30 |
31 | with open(
32 | os.path.join(
33 | data_folder,
34 | f"wmt16-metrics-results/seg-level-results/DAseg-newstest2016/DAseg-newstest2016.mt-system.{lang_pair}",
35 | )
36 | ) as f:
37 | all_hyps = f.read().strip().split("\n")
38 |
39 | return gold_scores, all_refs, all_hyps
40 |
41 |
42 | def get_wmt16_seg_to_bert_score(lang_pair, scorer, data_folder="wmt16", batch_size=64):
43 | # os.makedirs(f"cache_score/{network}", exist_ok=True)
44 | # path = "cache_score/{}/wmt16_seg_to_{}_{}.pkl".format(network, *lang_pair.split("-"))
45 |
46 | gold_scores, refs, cands = get_wmt16(lang_pair, data_folder=data_folder)
47 | if scorer.idf:
48 | scorer.compute_idf(refs)
49 | scores = scorer.score(cands, refs, verbose=False, batch_size=batch_size)
50 | scores = list(scores)
51 | max_length = scorer._tokenizer.max_len_single_sentence
52 |
53 | return scores, gold_scores, max_length
54 |
55 |
56 | def main():
57 | parser = argparse.ArgumentParser()
58 | parser.add_argument("-d", "--data", default="wmt16", help="path to wmt16 data")
59 | parser.add_argument("-m", "--model", nargs="+", help="models to tune")
60 | parser.add_argument("-l", "--log_file", default="best_layers_log.txt", help="log file path")
61 | parser.add_argument("--idf", action="store_true")
62 | parser.add_argument("-b", "--batch_size", type=int, default=64)
63 | parser.add_argument(
64 | "--lang_pairs",
65 | nargs="+",
66 | default=["cs-en", "de-en", "fi-en", "ro-en", "ru-en", "tr-en"],
67 | help="language pairs used for tuning",
68 | )
69 | args = parser.parse_args()
70 |
71 | if args.log_file.endswith(".txt"):
72 | csv_file = args.log_file.replace(".txt", ".csv")
73 | else:
74 | csv_file = args.log_file + ".csv"
75 |
76 | torch.set_grad_enabled(False)
77 |
78 | networks = args.model
79 | for network in networks:
80 | model_type = network
81 | scorer = bert_score.scorer.BERTScorer(model_type=model_type, num_layers=100, idf=False, all_layers=True)
82 | results = defaultdict(dict)
83 | for lang_pair in tqdm(args.lang_pairs):
84 | scores, gold_scores, max_length = get_wmt16_seg_to_bert_score(lang_pair, scorer, batch_size=args.batch_size)
85 | for i, score in enumerate(scores[2]):
86 | results[lang_pair + " " + str(i)]["%s %s" % (network, "F")] = pearsonr(score, gold_scores)[0]
87 |
88 | best_layer, best_corr = 0, 0.0
89 | for num_layer in range(100):
90 | temp = []
91 | if f"{args.lang_pairs[0]} {num_layer}" not in results:
92 | break
93 | for lp in args.lang_pairs:
94 | temp.append(results[f"{lp} {num_layer}"][f"{network} F"])
95 | corr = np.mean(temp)
96 | results["avg" + " " + str(num_layer)]["%s %s" % (network, "F")] = corr
97 | print(network, num_layer, corr)
98 | if corr > best_corr:
99 | best_layer, best_corr = num_layer, corr
100 |
101 | if args.idf:
102 | msg = f"'{network}' (idf): {best_layer}, # {best_corr}"
103 | else:
104 | msg = f"'{network}': {best_layer}, # {best_corr}"
105 | print(msg)
106 | with open(args.log_file, "a") as f:
107 | print(msg, file=f)
108 | csv_msg = f"{network},{best_layer},{best_corr},,{max_length}"
109 | with open(csv_file, "a") as f:
110 | print(csv_msg, file=f)
111 |
112 | del scorer
113 |
114 |
115 | if __name__ == "__main__":
116 | main()
117 |
--------------------------------------------------------------------------------
/bert_score/upload_pypi.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | rm -rf dist
3 | python setup.py sdist bdist_wheel
4 | python -m twine upload dist/*
--------------------------------------------------------------------------------
/media/bertscore.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/amazon-science/doc-mt-metrics/5385cc28930aae9924edcb3201645dd3810b12c0/media/bertscore.png
--------------------------------------------------------------------------------