## Implemented Papers
12 |
13 | * [UScore: An Effective Approach to Fully Unsupervised Evaluation Metrics for Machine Translation](https://aclanthology.org/2023.eacl-main.27/)
14 | * [On the Limitations of Cross-lingual Encoders as Exposed by Reference-Free Machine Translation Evaluation](https://aclanthology.org/2020.acl-main.151)
15 | * [SentSim: Crosslingual Semantic Evaluation of Machine Translation](https://aclanthology.org/2021.naacl-main.252)
16 |
17 |
18 | ## Installation
19 | If you want to use this project as a library, you can install it as a regular
20 | package with [pip](https://pip.pypa.io/en/stable):
21 | ```sh
22 | pip install 'git+https://github.com/potamides/unsupervised-metrics.git#egg=metrics'
23 | ```
24 | If your goal is to run the included [experiments](experiments) (e.g., to
25 | replicate the results of UScore), clone the repository and install it in
26 | editable mode:
27 | ```sh
28 | git clone https://github.com/potamides/unsupervised-metrics
29 | pip install -e 'unsupervised-metrics[experiments]'
30 | ```
31 | If you want to use [fast-align](https://github.com/clab/fast_align), follow its
32 | installation instructions and make sure that the `fast_align` and `atools`
33 | programs are on your `PATH`. This dependency is optional.
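
A minimal build sketch, assuming a standard CMake toolchain (consult the
fast-align README for authoritative instructions):
```sh
git clone https://github.com/clab/fast_align
cmake -S fast_align -B fast_align/build && cmake --build fast_align/build
export PATH="$PWD/fast_align/build:$PATH"  # exposes fast_align and atools
```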
34 |
35 | ## Usage
36 |
37 | ### Train an existing metric
38 | One focus of this library is to make it easy to fine-tune existing
39 | state-of-the-art metrics for arbitrary language pairs and domains.
40 | A simple example is provided in the code block below. For more involved
41 | examples and for details on how to instantiate a pre-trained metric, take a
42 | look at the [experiments](experiments).
43 |
44 | ```python
45 | from metrics.contrastscore import ContrastScore
46 | from metrics.utils.dataset import DatasetLoader
47 |
48 | src_lang, tgt_lang = "de", "en"
49 |
50 | dataset = DatasetLoader(src_lang, tgt_lang)
51 | # instantiate ContrastScore and enable parallel training on multiple GPUs
52 | scorer = ContrastScore(source_language=src_lang, target_language=tgt_lang, parallelize=True)
53 | # train the underlying language model on pseudo-parallel sentence pairs
54 | scorer.train(*dataset.load("monolingual-train"))
55 |
56 | # print correlations with human judgments
57 | print("Pearson's r: {}, Spearman's ρ: {}".format(*scorer.correlation(*dataset.load("scored"))))
58 | ```
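
After training, the scorer can also be applied to new translations directly. A
minimal sketch (`score` expects index-aligned lists of source sentences and
translations; the example pair is made up):
```python
print(scorer.score(["Der Hund bellt."], ["The dog barks."]))
```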
59 |
60 | ### Create your own metric
61 | This library can also be used as a framework to create new metrics, as
62 | demonstrated in the code block below. Existing metrics are defined in the
63 | [metrics](metrics) package, which could serve as a source of inspiration.
64 |
65 | ```python
66 | from metrics.common import CommonScore
67 |
68 | class MyOwnMetric(CommonScore):
69 |     def align(self, source_sents, target_sents):
70 | """
71 | This method receives a list of sentences in the source language and a
72 | list of sentences in the target language as parameters and returns
73 | a list of pseudo aligned sentence pairs.
74 | """
75 |
76 |     def _embed(self, source_sents, target_sents):
77 | """
78 | This method receives a list of sentences in the source language and a
79 | list of sentences in the target language as parameters and returns
80 |         their embeddings, inverse document frequencies, tokens, and padding
81 | masks.
82 | """
83 |
84 |     def score(self, source_sents, target_sents):
85 | """
86 | This method receives a list of sentences in the source language and a
87 | list of sentences in the target language as parameters, which are
88 | assumed to be aligned according to their index. For each sentence pair
89 | a similarity score is computed and the list of scores is returned.
90 | """
91 | ```
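
To make the contract concrete, here is a toy implementation sketch. It is not
a useful metric and assumes the three methods above are the only abstract
members of `CommonScore` that need overriding:
```python
from metrics.common import CommonScore

class TokenOverlapMetric(CommonScore):
    def align(self, source_sents, target_sents):
        # naively pair sentences by index
        return list(zip(source_sents, target_sents))

    def _embed(self, source_sents, target_sents):
        raise NotImplementedError  # not needed for this toy example

    def score(self, source_sents, target_sents):
        # Jaccard overlap of whitespace tokens for each sentence pair
        scores = []
        for src, tgt in zip(source_sents, target_sents):
            src_tok, tgt_tok = set(src.split()), set(tgt.split())
            scores.append(len(src_tok & tgt_tok) / max(len(src_tok | tgt_tok), 1))
        return scores
```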
92 |
93 | ## Acknowledgments
94 | This library is based on the following projects:
95 | * [ACL20-Reference-Free-MT-Evaluation](https://github.com/AIPHES/ACL20-Reference-Free-MT-Evaluation)
96 | * [Unsupervised-crosslingual-Compound-Method-For-MT](https://github.com/Rain9876/Unsupervised-crosslingual-Compound-Method-For-MT)
97 | * [Seq2Seq examples](https://github.com/huggingface/transformers/tree/v4.5.1/examples/seq2seq) of [transformers](https://github.com/huggingface/transformers)
98 | * [VecMap](https://github.com/artetxem/vecmap)
99 | * [CRISS](https://github.com/pytorch/fairseq/tree/master/examples/criss)
100 |
101 | ## Citation
102 | If you like/use our work, please [cite](https://aclanthology.org/2023.eacl-main.27.bib) as follows:
103 |
104 | ```bibtex
105 | @inproceedings{belouadi-eger-2023-uscore,
106 | title = "{US}core: An Effective Approach to Fully Unsupervised Evaluation Metrics for Machine Translation",
107 | author = "Belouadi, Jonas and
108 | Eger, Steffen",
109 | booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics",
110 | month = may,
111 | year = "2023",
112 | address = "Dubrovnik, Croatia",
113 | publisher = "Association for Computational Linguistics",
114 | url = "https://aclanthology.org/2023.eacl-main.27",
115 | pages = "358--374",
116 | }
117 | ```
118 |
--------------------------------------------------------------------------------
/experiments/README.md:
--------------------------------------------------------------------------------
1 | # Experiments
2 | This directory contains experiments which were conducted during the master's
3 | thesis *Self-Learning for Unsupervised Evaluation Metrics* and published
4 | in the paper *UScore: An Effective Approach to Fully Unsupervised Evaluation
5 | Metrics for Machine Translation*. By default, the experiments train the used
6 | models from scratch, since it is difficult to distribute all created model
7 | files due to storage limitations. If you need the original model files for
8 | reproducibility reasons, please contact the maintainers of this repository.
9 | Created files are cached in
10 | `${METRICS_HOME:-${XDG_CACHE_HOME:-~/.cache}/metrics}`, so training and
11 | pre-processing only happen once. Please be careful when interrupting a running
12 | process, as created files are not yet checked for their integrity.
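
For example, to redirect the cache to a larger volume before launching an
experiment from this directory (a sketch; `METRICS_HOME` takes precedence, as
the expansion above shows):
```sh
export METRICS_HOME=/data/metrics-cache
./remap.py  # any experiment script works; artifacts now land under /data/metrics-cache
```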
13 |
14 | Also, please bear in mind that most models were trained on beefy workstations
15 | like the [NVIDIA DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100).
16 | The majority of experiments require considerably fewer resources, but on
17 | weaker hardware out-of-memory errors are to be expected. Model inference is,
18 | of course, less resource-intensive.
19 |
20 | ## Included Experiments
21 | When coming from UScore, the `uscore_tests` function in `comparison.py` contains
22 | most of the experiments present in the paper (see the run sketch below the
23 | list). Please note that the names used for the metrics differ slightly from the ones used in the paper.
24 |
25 | * `remap.py` Remap XMoverScore on pseudo-parallel sentences.
26 | * `quantity.py` Remap XMoverScore on pseudo-parallel sentences mined from different amounts of monolingual data.
27 | * `vecmap.py` Use XMoverScore and mean-pooling metrics with [VecMap](https://github.com/artetxem/vecmap) embeddings.
28 | * `nmt.py` Combine XMoverScore with an unsupervised NMT model.
29 | * `lm.py` Combine XMoverScore with an unsupervised NMT model and a language model of the target language.
30 | * `distil.py` Create distilled cross-lingual sentence embeddings using pseudo-parallel sentences.
31 | * `contrast.py` Create cross-lingual sentence embeddings using a contrastive learning objective.
32 | * `combine.py` Combine word-embeddings with sentence-embeddings in a single metric (XMoverScore + ContrastScore).
33 | * `comparison.py` Compare all self-learned metrics with strong baselines on multiple language directions and datasets.
34 | * `finetune.py` Finetune induced self-learned metrics on small parallel corpora.
35 | * `parallel.py` Create distilled and contrastive cross-lingual sentence embeddings only on parallel data.
36 | * `ensemble.py` Ensemble SentSim with XMoverScore + ContrastScore.
37 | * `sentsim.py` Try to reproduce scores of SentSim metrics.
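
For instance, to reproduce most of the UScore results (a sketch; run from this
directory, and expect long runtimes when models are trained from scratch):
```sh
./comparison.py | tee comparison-results.txt
```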
38 |
--------------------------------------------------------------------------------
/experiments/combine.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore
3 | from metrics.contrastscore import ContrastScore
4 | from collections import defaultdict
5 | from tabulate import tabulate
6 | from numpy import linspace
7 | from numpy import corrcoef, argsort
8 | from torch.nn.functional import mse_loss, l1_loss
9 | from torch import FloatTensor
10 | from metrics.utils.dataset import DatasetLoader
11 | import logging
12 |
13 | source_lang, target_lang = "de", "en"
14 | remap_iterations = 1
15 | nmt_iterations = 1
16 | contrast_iterations = 6
17 |
18 | def error(model_scores, ref_scores):
19 | rmse = mse_loss(FloatTensor(ref_scores), FloatTensor(model_scores)).sqrt().item()
20 | mae = l1_loss(FloatTensor(ref_scores), FloatTensor(model_scores)).item()
21 | return rmse, mae
22 |
23 | def correlation(model_scores, ref_scores):
24 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(model_scores).argsort()
25 | return corrcoef(ref_scores, model_scores)[0,1], corrcoef(ref_ranks, ranks)[0,1]
26 |
27 | def combine_tests(max_len=30):
28 | assert target_lang == "en", "Target language has to be English for LM to work"
29 | xmover = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, lm_weights=[1, 0.1], nmt_weights=[0.5, 0.4], use_lm=True)
30 | contrast = ContrastScore(source_language=source_lang, target_language=target_lang, parallelize=True)
31 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
32 | mono_src, mono_tgt = dataset.load("monolingual-align")
33 | eval_src, eval_system, eval_scores = dataset.load("scored")
34 | train_src, train_tgt = dataset.load("monolingual-train")
35 | suffix = f"{source_lang}-{target_lang}-awesome-wmd-{xmover.mapping}-monolingual-align-{xmover.k}-{xmover.remap_size}-{40000}-{max_len}"
36 | results, index = defaultdict(list), [f"{round(weight, 2)}-{round(1-weight, 2)}" for weight in linspace(1, 0, 11)]
37 |
38 | logging.info("Preparing XMoverScore")
39 | for iteration in range(1, remap_iterations + 1):
40 | logging.info(f"Remapping iteration {iteration}.")
41 | xmover.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False)
42 | for iteration in range(nmt_iterations):
43 | logging.info(f"NMT training iteration {iteration}.")
44 | xmover.train(train_src, train_tgt, suffix=suffix+f"-{remap_iterations}", iteration=iteration, overwrite=False, k=1)
45 |
46 | logging.info("Preparing ContrastScore")
47 | for iteration in range(1, contrast_iterations + 1):
48 | logging.info(f"Contrastive Learning iteration {iteration}.")
49 | contrast.suffix = f"{max_len}-{iteration}"
50 | contrast.train(mono_src, mono_tgt, overwrite=False)
51 |
52 | wmd_scores = xmover.score(eval_src, eval_system)
53 | contrast_scores = contrast.score(eval_src, eval_system)
54 |
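    # sweep convex combinations of XMoverScore (WMD) and ContrastScore in steps of 0.1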
55 | for weight in linspace(1, 0, 11):
56 | pearson, spearman = correlation([weight * x + (1 - weight) * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores)
57 | rmse, mae = error([weight * x + (1 - weight) * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores)
58 | results["pearson"].append(round(100 * pearson, 2))
59 | results["spearman"].append(round(100 * spearman, 2))
60 | results["rmse"].append(round(rmse, 2))
61 | results["mae"].append(round(mae, 2))
62 |
63 | return tabulate(results, headers="keys", showindex=index)
64 |
65 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
66 | print("Maximum Sentence Length: 30", combine_tests(), sep="\n")
67 | print("Maximum Sentence Length: 50", combine_tests(max_len=50), sep="\n")
68 |
--------------------------------------------------------------------------------
/experiments/comparison.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from collections import defaultdict
4 | import logging
5 |
6 | from numpy import argsort, corrcoef
7 | from tabulate import tabulate
8 |
9 | from comet import download_model, load_from_checkpoint
10 | from datasets import load_metric
11 | from metrics.contrastscore import ContrastScore
12 | from metrics.distilscore import DistilScore
13 | from metrics.sentsim import SentSim
14 | from metrics.utils.dataset import DatasetLoader
15 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore, XMoverScore
16 | from torch.cuda import is_available as cuda_is_available
17 | from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel
18 |
19 | newstest2016 = [
20 | ("de", "en"),
21 | ("en", "ru"),
22 | ("ru", "en"),
23 | ("ro", "en"),
24 | ("cs", "en"),
25 | ("fi", "en"),
26 | ("tr", "en"),
27 | ]
28 | newstest2017 = [
29 | ("cs", "en"),
30 | ("de", "en"),
31 | ("fi", "en"),
32 | ("lv", "en"),
33 | ("ru", "en"),
34 | ("tr", "en"),
35 | ("zh", "en"),
36 | ]
37 | mlqe = [
38 | ("en", "de"),
39 | ("en", "zh"),
40 | ("ru", "en"),
41 | ("ro", "en"),
42 | ("et", "en"),
43 | ("ne", "en"),
44 | ("si", "en"),
45 | ]
46 | eval4nlp = [
47 | (src, tgt)
48 | for src, tgt in [("de", "zh"), ("ru", "de")]
49 | if DatasetLoader(src, tgt).has_eval4nlp_access()
50 | ]
51 | mqm = [("en", "de"), ("zh", "en")]
52 |
53 | lm_model = defaultdict(
54 | lambda: None,
55 | en="gpt2",
56 | ru="sberbank-ai/rugpt3small_based_on_gpt2",
57 | de="dbmdz/german-gpt2",
58 | zh="uer/gpt2-chinese-cluecorpussmall",
59 | )
60 |
61 | remap_iterations = 1
62 | nmt_iterations = 1
63 | contrast_iterations = 6
64 |
65 |
66 | def correlation(model_scores, ref_scores):
67 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(model_scores).argsort()
68 | return corrcoef(ref_scores, model_scores)[0, 1], corrcoef(ref_ranks, ranks)[0, 1]
69 |
70 |
71 | def comet_tests(source_lang, target_lang, dataset_name):
72 | model = load_from_checkpoint(download_model("wmt20-comet-qe-da"))
73 | dataset = DatasetLoader(source_lang, target_lang)
74 | results, index = defaultdict(list), ["Comet-QE"]
75 |
76 | eval_src, eval_system, eval_scores = dataset.load(dataset_name)
77 | data = [{"src": src, "mt": system} for src, system in zip(eval_src, eval_system)]
78 |     scores, _ = model.predict(data, batch_size=8, gpus=1 if cuda_is_available() else 0)
79 |
80 | pearson, spearman = correlation(scores, eval_scores)
81 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
82 | results["pearson"].append(round(100 * pearson, 2))
83 | results["spearman"].append(round(100 * spearman, 2))
84 |
85 | return tabulate(results, headers="keys", showindex=index)
86 |
87 |
88 | def transquest_tests(source_lang, target_lang, dataset_name):
89 | transquest_models = {
90 | ("ro", "en"): "TransQuest/monotransquest-da-ro_en-wiki",
91 | ("et", "en"): "TransQuest/monotransquest-da-et_en-wiki",
92 | ("ne", "en"): "TransQuest/monotransquest-da-ne_en-wiki",
93 | ("si", "en"): "TransQuest/monotransquest-da-si_en-wiki",
94 | ("ru", "en"): "TransQuest/monotransquest-da-ru_en-reddit_wikiquotes",
95 | ("en", "de"): "TransQuest/monotransquest-da-en_de-wiki",
96 | ("en", "zh"): "TransQuest/monotransquest-da-en_zh-wiki",
97 | ("en", None): "TransQuest/monotransquest-da-en_any",
98 | (None, "en"): "TransQuest/monotransquest-da-any_en",
99 | (None, None): "TransQuest/monotransquest-da-multilingual",
100 | }
101 |
102 | if (source_lang, target_lang) in transquest_models:
103 | model_name = transquest_models[(source_lang, target_lang)]
104 | elif (source_lang, None) in transquest_models:
105 | model_name = transquest_models[(source_lang, None)]
106 | elif (None, target_lang) in transquest_models:
107 | model_name = transquest_models[(None, target_lang)]
108 | else:
109 | model_name = transquest_models[(None, None)]
110 |
111 | model = MonoTransQuestModel(
112 | "xlmroberta", model_name, num_labels=1, use_cuda=cuda_is_available()
113 | )
114 | dataset = DatasetLoader(source_lang, target_lang)
115 | eval_src, eval_system, eval_scores = dataset.load(dataset_name)
116 | results, index = defaultdict(list), ["MonoTransQuest"]
117 | scores, _ = model.predict(list(map(list, zip(eval_src, eval_system))))
118 |
119 | pearson, spearman = correlation(scores, eval_scores)
120 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
121 | results["pearson"].append(round(100 * pearson, 2))
122 | results["spearman"].append(round(100 * spearman, 2))
123 |
124 | return tabulate(results, headers="keys", showindex=index)
125 |
126 |
127 | def bleu_test(source_lang, target_lang, dataset_name):
128 | metric = load_metric("sacrebleu")
129 | dataset = DatasetLoader(source_lang, target_lang, return_references=True)
130 | _, eval_ref, eval_system, eval_scores = dataset.load(dataset_name)
131 | results, index = defaultdict(list), ["BLEU"]
132 |
133 | scores = list()
134 | for system, ref in zip(eval_system, eval_ref):
135 | scores.append(metric.compute(predictions=[system], references=[[ref]])["score"])
136 | pearson, spearman = correlation(scores, eval_scores)
137 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
138 | results["pearson"].append(round(100 * pearson, 2))
139 | results["spearman"].append(round(100 * spearman, 2))
140 |
141 | return tabulate(results, headers="keys", showindex=index)
142 |
143 |
144 | def xmoverscore_tests(source_lang, target_lang, dataset_name, mapping="UMD"):
145 | scorer = XMoverScore(
146 | mapping=mapping, use_lm=True, lm_model_name=lm_model[target_lang]
147 | )
148 | dataset = DatasetLoader(source_lang, target_lang)
149 | eval_src, eval_system, eval_scores = dataset.load(dataset_name)
150 | results, index = defaultdict(list), [f"XMoverScore ({mapping})"]
151 |
152 | try:
153 | scorer.remap(source_lang, target_lang)
154 | except ValueError:
155 | results["pearson"].append("-")
156 | results["spearman"].append("-")
157 | else:
158 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
159 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
160 | results["pearson"].append(round(100 * pearson, 2))
161 | results["spearman"].append(round(100 * spearman, 2))
162 |
163 | return tabulate(results, headers="keys", showindex=index)
164 |
165 |
166 | def sentsim_tests(source_lang, target_lang, dataset_name, word_metric="BERTScore"):
167 | scorer = SentSim(use_wmd=word_metric == "WMD")
168 | dataset = DatasetLoader(source_lang, target_lang)
169 | eval_src, eval_system, eval_scores = dataset.load(dataset_name)
170 | results, index = defaultdict(list), [f"SentSim ({word_metric})"]
171 |
172 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
173 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
174 | results["pearson"].append(round(100 * pearson, 2))
175 | results["spearman"].append(round(100 * spearman, 2))
176 |
177 | return tabulate(results, headers="keys", showindex=index)
178 |
179 |
180 | def distilscore_tests(source_lang, target_lang, dataset_name):
181 | scorer = DistilScore(
182 | student_model_name="xlm-r-bert-base-nli-stsb-mean-tokens",
183 | source_language=source_lang,
184 | target_language=target_lang,
185 | student_is_pretrained=True,
186 | suffix="1",
187 | )
188 | dataset = DatasetLoader(source_lang, target_lang)
189 | eval_src, eval_system, eval_scores = dataset.load(dataset_name)
190 | results, index = defaultdict(list), ["DistilScore"]
191 |
192 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
193 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
194 | results["pearson"].append(round(100 * pearson, 2))
195 | results["spearman"].append(round(100 * spearman, 2))
196 |
197 | return tabulate(results, headers="keys", showindex=index)
198 |
199 |
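# Evaluate the three UScore variants from the paper: UScore (WRD) based on
# remapped XMoverScore with NMT self-training, UScore (SNT) based on
# contrastively trained sentence embeddings, and their fixed 0.6/0.4 combination.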
200 | def uscore_tests(source_lang, target_lang, dataset_name, max_len=30):
201 | xmover = XMoverNMTLMBertAlignScore(
202 | src_lang=source_lang,
203 | tgt_lang=target_lang,
204 | lm_weights=[1, 0.1],
205 | nmt_weights=[0.5, 0.4],
206 | use_lm=True,
207 | lm_model_name=lm_model[target_lang],
208 | )
209 | contrast = ContrastScore(
210 | source_language=source_lang, target_language=target_lang, parallelize=True
211 | )
212 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
213 | mono_src, mono_tgt = dataset.load("monolingual-align")
214 | train_src, train_tgt = dataset.load("monolingual-train")
215 | eval_src, eval_system, eval_scores = dataset.load(dataset_name)
216 | suffix = f"{source_lang}-{target_lang}-awesome-wmd-{xmover.mapping}-monolingual-align-{xmover.k}-{xmover.remap_size}-{40000}-{max_len}"
217 | results, index = defaultdict(list), [
218 | f"UScore (WRD) ({max_len} tokens)",
219 | f"UScore (SNT) ({max_len} tokens)",
220 | f"UScore (WRD) + UScore (SNT) ({max_len} tokens)",
221 | ]
222 |
223 | logging.info("Evaluating UScore (WRD)")
224 | for iteration in range(1, remap_iterations + 1):
225 | logging.info(f"Remapping iteration {iteration}.")
226 | xmover.remap(
227 | mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False
228 | )
229 | for iteration in range(nmt_iterations):
230 | logging.info(f"NMT training iteration {iteration}.")
231 | xmover.train(
232 | train_src,
233 | train_tgt,
234 | suffix=suffix + f"-{remap_iterations}",
235 | iteration=iteration,
236 | overwrite=False,
237 | k=1,
238 | )
239 |
240 | pearson, spearman = xmover.correlation(eval_src, eval_system, eval_scores)
241 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
242 | results["pearson"].append(round(100 * pearson, 2))
243 | results["spearman"].append(round(100 * spearman, 2))
244 |
245 | logging.info("Evaluating UScore (SNT)")
246 | for iteration in range(1, contrast_iterations + 1):
247 | logging.info(f"Contrastive Learning iteration {iteration}.")
248 | contrast.suffix = f"{max_len}-{iteration}"
249 | contrast.train(train_src, train_tgt, overwrite=False)
250 |
251 | pearson, spearman = contrast.correlation(eval_src, eval_system, eval_scores)
252 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
253 | results["pearson"].append(round(100 * pearson, 2))
254 | results["spearman"].append(round(100 * spearman, 2))
255 |
256 | logging.info("Evaluating UScore (WRD) + UScore (SNT)")
257 | wmd_scores, contrast_scores = xmover.score(eval_src, eval_system), contrast.score(
258 | eval_src, eval_system
259 | )
260 | pearson, spearman = correlation(
261 | [0.6 * x + 0.4 * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores
262 | )
263 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
264 | results["pearson"].append(round(100 * pearson, 2))
265 | results["spearman"].append(round(100 * spearman, 2))
266 |
267 | return tabulate(results, headers="keys", showindex=index)
268 |
269 |
270 | logging.basicConfig(
271 | level=logging.INFO,
272 | datefmt="%m-%d %H:%M",
273 | format="%(asctime)s %(levelname)-8s %(message)s",
274 | )
275 | ref_datasets = (
276 | ("Newstest-2016", "scored", newstest2016),
277 | ("Newstest-2017", "scored-wmt17", newstest2017),
278 | ("MQM-Newstest-2020", "scored-mqm", mqm),
279 | )
280 | datasets = ref_datasets + (
281 | ("MLQE-PE", "scored-mlqe", mlqe),
282 | ("Eval4NLP-2021", "scored-eval4nlp", eval4nlp),
283 | )
284 | for dataset, identifier, pairs in datasets:
285 | for source_lang, target_lang in pairs:
286 | print(f"Evaluating {source_lang}-{target_lang} language direction on {dataset}")
287 | print(uscore_tests(source_lang, target_lang, identifier, max_len=30))
288 | print(uscore_tests(source_lang, target_lang, identifier, max_len=50))
289 | print(xmoverscore_tests(source_lang, target_lang, identifier, mapping="UMD"))
290 | print(xmoverscore_tests(source_lang, target_lang, identifier, mapping="CLP"))
291 | print(sentsim_tests(source_lang, target_lang, identifier, word_metric="BERTScore"))
292 | print(sentsim_tests(source_lang, target_lang, identifier, word_metric="WMD"))
293 | print(distilscore_tests(source_lang, target_lang, identifier))
294 | print(transquest_tests(source_lang, target_lang, identifier))
295 | print(comet_tests(source_lang, target_lang, identifier))
296 |
297 | for dataset, identifier, pairs in ref_datasets:
298 | for source_lang, target_lang in pairs:
299 | print(bleu_test(source_lang, target_lang, identifier))
300 |
--------------------------------------------------------------------------------
/experiments/contrast.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.contrastscore import ContrastScore
3 | from collections import defaultdict
4 | from tabulate import tabulate
5 | from metrics.utils.dataset import DatasetLoader
6 | import logging
7 |
8 | source_lang, target_lang = "de", "en"
9 | iterations = 10
10 |
11 | def contrastive_tests(max_len=30, model="xlm-roberta-base"):
12 | scorer = ContrastScore(model_name=model, source_language=source_lang, target_language=target_lang, parallelize=True)
13 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
14 | eval_src, eval_system, eval_scores = dataset.load("scored")
15 | parallel_src, parallel_tgt = dataset.load("parallel")
16 | results, index = defaultdict(list), list(range(iterations + 1))
17 |
18 | logging.info("Evaluating performance before training.")
19 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
20 | precision = scorer.precision(parallel_src, parallel_tgt)
21 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores)
22 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}")
23 | results["pearson"].append(round(100 * pearson, 2))
24 | results["spearman"].append(round(100 * spearman, 2))
25 | results["precision"].append(round(100 * precision, 2))
26 | results["rmse"].append(round(rmse, 2))
27 | results["mae"].append(round(mae, 2))
28 |
29 | mono_src, mono_tgt = dataset.load("monolingual-train")
30 |
31 | for iteration in range(1, iterations + 1):
32 | logging.info(f"Training iteration {iteration}.")
33 | scorer.suffix = f"{max_len}-{iteration}"
34 | scorer.train(mono_src, mono_tgt, overwrite=False)
35 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
36 | precision = scorer.precision(parallel_src, parallel_tgt)
37 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores)
38 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}")
39 | results["pearson"].append(round(100 * pearson, 2))
40 | results["spearman"].append(round(100 * spearman, 2))
41 | results["precision"].append(round(100 * precision, 2))
42 | results["rmse"].append(round(rmse, 2))
43 | results["mae"].append(round(mae, 2))
44 |
45 | return tabulate(results, headers="keys", showindex=index)
46 |
47 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
48 | print("Contrastive learning with XLM-R", contrastive_tests(max_len=30), contrastive_tests(max_len=50), sep="\n")
49 | print("Contrastive learning with mBERT", contrastive_tests(max_len=30, model="bert-base-multilingual-cased"), sep="\n")
50 |
--------------------------------------------------------------------------------
/experiments/distil.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.distilscore import DistilScore
3 | from collections import defaultdict
4 | from tabulate import tabulate
5 | from metrics.utils.dataset import DatasetLoader
6 | import logging
7 |
8 | source_lang, target_lang = "de", "en"
9 | iterations = 5
10 |
11 | def distil_tests(model="xlm-roberta-base"):
12 | scorer = DistilScore(student_model_name=model, source_language=source_lang, target_language=target_lang, suffix="1")
13 | dataset = DatasetLoader(source_lang, target_lang)
14 | eval_src, eval_system, eval_scores = dataset.load("scored")
15 | parallel_src, parallel_tgt = dataset.load("parallel")
16 | results, index = defaultdict(list), list(range(iterations + 1))
17 |
18 | logging.info("Evaluating performance before distilling.")
19 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
20 | precision = scorer.precision(parallel_src, parallel_tgt)
21 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores)
22 |     logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}")
23 | results["pearson"].append(round(100 * pearson, 2))
24 | results["spearman"].append(round(100 * spearman, 2))
25 | results["precision"].append(round(100 * precision, 2))
26 | results["rmse"].append(round(rmse, 2))
27 | results["mae"].append(round(mae, 2))
28 |
29 | parallel_src, parallel_tgt = dataset.load("parallel")
30 | mono_src, mono_tgt = dataset.load("monolingual-train")
31 |
32 | for iteration in range(1, iterations + 1):
33 | logging.info(f"Training iteration {iteration}.")
34 | scorer.suffix = str(iteration)
35 | scorer.train(mono_src, mono_tgt, dev_source_sents=parallel_src, dev_target_sents=parallel_tgt, overwrite=False)
36 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
37 | precision = scorer.precision(parallel_src, parallel_tgt)
38 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores)
39 |         logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}")
40 | results["pearson"].append(round(100 * pearson, 2))
41 | results["spearman"].append(round(100 * spearman, 2))
42 | results["precision"].append(round(100 * precision, 2))
43 | results["rmse"].append(round(rmse, 2))
44 | results["mae"].append(round(mae, 2))
45 |
46 | return tabulate(results, headers="keys", showindex=index)
47 |
48 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
49 | print("Results using XLM-R as student:", distil_tests(), sep="\n")
50 | print("Results using mBERT as student:", distil_tests(model="bert-base-multilingual-cased"), sep="\n")
51 |
--------------------------------------------------------------------------------
/experiments/ensemble.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore
3 | from metrics.contrastscore import ContrastScore
4 | from metrics.sentsim import SentSim
5 | from metrics.utils.dataset import DatasetLoader
6 | from collections import defaultdict
7 | from tabulate import tabulate
8 | from numpy import corrcoef, argsort
9 | import logging
10 |
11 | mlqe = [("en", "de"), ("en", "zh"), ("ru", "en"), ("ro", "en"), ("et", "en"), ("ne", "en"), ("si", "en")]
12 | lm_model = {"en": "gpt2", "ru": "sberbank-ai/rugpt3small_based_on_gpt2", "de": "dbmdz/german-gpt2",
13 | "zh": "uer/gpt2-chinese-cluecorpussmall"}
14 |
15 | remap_iterations = 1
16 | nmt_iterations = 1
17 | contrast_iterations = 6
18 |
19 | def correlation(model_scores, ref_scores):
20 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(model_scores).argsort()
21 | return corrcoef(ref_scores, model_scores)[0,1], corrcoef(ref_ranks, ranks)[0,1]
22 |
23 | def xmover_contrast_combine(source_lang, target_lang, max_len=30):
24 | xmover = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, lm_weights=[1, 0.1],
25 | nmt_weights=[0.5, 0.4], use_lm=source_lang!="ru", lm_model_name=lm_model[target_lang], translate_batch_size=4)
26 | contrast = ContrastScore(source_language=source_lang, target_language=target_lang, parallelize=True)
27 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
28 | mono_src, mono_tgt = dataset.load("monolingual-align")
29 | train_src, train_tgt = dataset.load("monolingual-train")
30 | para_src, para_tgt = dataset.load("nepali" if "ne" in [source_lang, target_lang] else "wikimatrix", 200000)
31 | suffix = f"{source_lang}-{target_lang}-awesome-wmd-{xmover.mapping}-monolingual-align-{xmover.k}-{xmover.remap_size}-{40000}-{max_len}"
32 |
33 | for iteration in range(1, remap_iterations + 1):
34 | logging.info(f"Remapping iteration {iteration}.")
35 | xmover.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False)
36 | if source_lang != "ru":
37 | for iteration in range(nmt_iterations):
38 | logging.info(f"NMT training iteration {iteration}.")
39 | xmover.train(train_src, train_tgt, suffix=suffix+f"-{remap_iterations}", iteration=iteration, overwrite=False, k=1)
40 | for iteration in range(1, contrast_iterations + 1):
41 | logging.info(f"Contrastive Learning iteration {iteration}.")
42 | contrast.suffix = f"{max_len}-{iteration}"
43 | contrast.train(train_src, train_tgt, overwrite=False)
44 |
45 | xmover.mapping = "CLP"
46 | xmover.remap(para_src, para_tgt, suffix=suffix.replace("UMD", "CLP") + f"-finetuned-200000", aligned=True, overwrite=False)
47 | if source_lang != "ru":
48 | xmover.train(para_src, para_tgt, suffix=suffix + f"-finetuned-200000", iteration=iteration, aligned=True,
49 | finetune=True, overwrite=False, k=1)
50 | contrast.suffix = f"{max_len}-finetuned-200000"
51 | contrast.train(para_src, para_tgt, aligned=True, finetune=True, overwrite=False)
52 |
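    # combine both metrics with the fixed 0.6/0.4 weighting also used for UScore in comparison.py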
53 | return lambda src, sys: [0.6 * x + 0.4 * y for x, y in zip(xmover.score(src, sys), contrast.score(src, sys))]
54 |
55 | def tests(source_lang, target_lang, dataset_name, max_len=30):
56 | xcontrast = xmover_contrast_combine(source_lang, target_lang, max_len)
57 | sentsim = SentSim()
58 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
59 | eval_src, eval_system, eval_scores = dataset.load(dataset_name)
60 | results, index = defaultdict(list), ["SentSim", f"XMover + Contrast ({max_len} tokens)", f"Ensemble ({max_len} tokens)"]
61 |
62 | for score in [sentsim.score, xcontrast, lambda src, sys:[0.5 * x + 0.5 * y for x, y in zip(xcontrast(src, sys), sentsim.score(src, sys))]]:
63 | pearson, spearman = correlation(score(eval_src, eval_system), eval_scores)
64 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
65 | results["pearson"].append(round(100 * pearson, 2))
66 | results["spearman"].append(round(100 * spearman, 2))
67 |
68 | return tabulate(results, headers="keys", showindex=index)
69 |
70 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
71 | for source_lang, target_lang in mlqe:
72 | print(f"Evaluating {source_lang}-{target_lang} language direction on MLQE-PE.")
73 | print(tests(source_lang, target_lang, "scored-mlqe", max_len=30))
74 | print(tests(source_lang, target_lang, "scored-mlqe", max_len=50))
75 |
--------------------------------------------------------------------------------
/experiments/finetune.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore
3 | from metrics.contrastscore import ContrastScore
4 | from metrics.utils.dataset import DatasetLoader
5 | from collections import defaultdict
6 | from tabulate import tabulate
7 | from numpy import corrcoef, argsort
8 | import logging
9 |
10 | mlqe = [("en", "de"), ("en", "zh"), ("ru", "en"), ("ro", "en"), ("et", "en"), ("ne", "en"), ("si", "en")]
11 | lm_model = {"en": "gpt2", "de": "dbmdz/german-gpt2", "zh": "uer/gpt2-chinese-cluecorpussmall"}
12 |
13 | remap_iterations = 1
14 | nmt_iterations = 1
15 | contrast_iterations = 6
16 |
17 | def correlation(model_scores, ref_scores):
18 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(model_scores).argsort()
19 | return corrcoef(ref_scores, model_scores)[0,1], corrcoef(ref_ranks, ranks)[0,1]
20 |
21 | def self_learning_tests(source_lang, target_lang, max_len=30, size=30000):
22 | xmover = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, lm_weights=[1, 0.1],
23 | nmt_weights=[0.5, 0.4], use_lm=True, lm_model_name=lm_model[target_lang])
24 | contrast = ContrastScore(source_language=source_lang, target_language=target_lang, parallelize=True)
25 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
26 | mono_src, mono_tgt = dataset.load("monolingual-align")
27 | train_src, train_tgt = dataset.load("monolingual-train")
28 | eval_src, eval_system, eval_scores = dataset.load("scored-mlqe")
29 | dataset.hard_limit = 500
30 | para_src, para_tgt = dataset.load("nepali" if "ne" in [source_lang, target_lang] else "wikimatrix", size)
31 | suffix = f"{source_lang}-{target_lang}-awesome-wmd-{xmover.mapping}-monolingual-align-{xmover.k}-{xmover.remap_size}-{40000}-{max_len}"
32 | results, index = defaultdict(list), [f"XMoverScore ({max_len} tokens)", f"Fine-tuned XMoverScore ({max_len} tokens)",
33 | f"ContrastScore ({max_len} tokens)", f"Fine-tuned ContrastScore ({max_len} tokens)",
34 | f"XMoverScore + ContrastScore ({max_len} tokens)"]
35 |
36 | logging.info("Evaluating XMoverScore")
37 | for iteration in range(1, remap_iterations + 1):
38 | logging.info(f"Remapping iteration {iteration}.")
39 | xmover.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False)
40 | for iteration in range(nmt_iterations):
41 | logging.info(f"NMT training iteration {iteration}.")
42 | xmover.train(train_src, train_tgt, suffix=suffix+f"-{remap_iterations}", iteration=iteration, overwrite=False, k=1)
43 |
44 | pearson, spearman = xmover.correlation(eval_src, eval_system, eval_scores)
45 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
46 | results["pearson"].append(round(100 * pearson, 2))
47 | results["spearman"].append(round(100 * spearman, 2))
48 |
49 | logging.info(f"Remapping on parallel data.")
50 | xmover.mapping = "CLP"
51 | xmover.remap(para_src, para_tgt, suffix=suffix.replace("UMD", "CLP") + f"-finetuned-{size}", aligned=True, overwrite=False)
52 | logging.info(f"NMT training on parallel data.")
53 | xmover.train(para_src, para_tgt, suffix=suffix + f"-finetuned-{size}", aligned=True, finetune=True, overwrite=False, k=1)
54 |
55 | pearson, spearman = xmover.correlation(eval_src, eval_system, eval_scores)
56 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
57 | results["pearson"].append(round(100 * pearson, 2))
58 | results["spearman"].append(round(100 * spearman, 2))
59 |
60 | logging.info("Evaluating ContrastScore")
61 | for iteration in range(1, contrast_iterations + 1):
62 | logging.info(f"Contrastive Learning iteration {iteration}.")
63 | contrast.suffix = f"{max_len}-{iteration}"
64 | contrast.train(train_src, train_tgt, overwrite=False)
65 |
66 | pearson, spearman = contrast.correlation(eval_src, eval_system, eval_scores)
67 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
68 | results["pearson"].append(round(100 * pearson, 2))
69 | results["spearman"].append(round(100 * spearman, 2))
70 |
71 | logging.info(f"Contrastive Learning on parallel data.")
72 | contrast.suffix = f"{max_len}-finetuned-{size}"
73 | contrast.train(para_src, para_tgt, aligned=True, finetune=True, overwrite=False)
74 |
75 | pearson, spearman = contrast.correlation(eval_src, eval_system, eval_scores)
76 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
77 | results["pearson"].append(round(100 * pearson, 2))
78 | results["spearman"].append(round(100 * spearman, 2))
79 |
80 | logging.info("Evaluating XMoverScore + ContrastScore")
81 | wmd_scores, contrast_scores = xmover.score(eval_src, eval_system), contrast.score(eval_src, eval_system)
82 | pearson, spearman = correlation([0.6 * x + 0.4 * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores)
83 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
84 | results["pearson"].append(round(100 * pearson, 2))
85 | results["spearman"].append(round(100 * spearman, 2))
86 |
87 | return tabulate(results, headers="keys", showindex=index)
88 |
89 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
90 | for size in (10000, 20000, 30000, 200000):
91 | for source_lang, target_lang in mlqe:
92 | print(f"Evaluating {source_lang}-{target_lang} language direction on MLQE-PE using {size} parallel sentences.")
93 | print(self_learning_tests(source_lang, target_lang, max_len=30, size=size))
94 | print(self_learning_tests(source_lang, target_lang, max_len=50, size=size))
95 |
--------------------------------------------------------------------------------
/experiments/lm.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore
3 | from collections import defaultdict
4 | from tabulate import tabulate
5 | from numpy import linspace
6 | from metrics.utils.dataset import DatasetLoader
7 | import logging
8 |
9 | source_lang, target_lang = "de", "en"
10 | iterations = 1
11 |
12 | def lm_nmt_tests(metric="wmd", max_len=50):
13 | assert target_lang == "en", "Target language has to be English for LM to work"
14 | results = defaultdict(list)
15 | scorer = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, use_lm=True, use_cosine=metric=="cosine")
16 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
17 | mono_src, mono_tgt = dataset.load("monolingual-align")
18 | eval_src, eval_system, eval_scores = dataset.load("scored")
19 | suffix = f"{source_lang}-{target_lang}-awesome-{metric}-{scorer.mapping}-monolingual-align-{scorer.k}-{scorer.remap_size}-{len(mono_src)}-{max_len}"
20 |
21 | for iteration in range(1, iterations + 1):
22 | logging.info(f"Remapping iteration {iteration}.")
23 | scorer.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False)
24 |
25 | logging.info("Training NMT system.")
26 | train_src, train_tgt = dataset.load("monolingual-train")
27 | scorer.train(train_src, train_tgt, suffix=suffix+f"-{iterations}", overwrite=False, k=5 if metric=="cosine" else 1)
28 |
29 |
30 | logging.info(f"Evaluating performance with NMT and language model.")
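    # grid search over LM and NMT weights; the remaining weight 1 - lm - nmt stays on the cross-lingual score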
31 | for lm_weight in linspace(0, 1, 11):
32 | for nmt_weight in linspace(0, 1, 11):
33 | if lm_weight + nmt_weight <= 1:
34 | scorer.nmt_weights = [1 - lm_weight - nmt_weight, nmt_weight]
35 | scorer.lm_weights = [1, lm_weight]
36 | pearson, _ = scorer.correlation(eval_src, eval_system, eval_scores)
37 | logging.info(f"NMT: {round(nmt_weight, 1)}, LM: {round(lm_weight, 1)}, Pearson: {pearson}")
38 | results[round(lm_weight, 1)].append(round(100 * pearson, 2))
39 | else:
40 | results[round(lm_weight, 1)].append("-")
41 |
42 | return suffix, tabulate(results, headers="keys", showindex=linspace(0, 1, 11))
43 |
44 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
45 | print(*lm_nmt_tests(), sep="\n")
46 |
--------------------------------------------------------------------------------
/experiments/nmt.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore
3 | from collections import defaultdict
4 | from tabulate import tabulate
5 | from numpy import linspace
6 | from metrics.utils.dataset import DatasetLoader
7 | import logging
8 |
9 | source_lang, target_lang = "de", "en"
10 | remap_iterations = 1
11 |
12 | def nmt_tests(metric="cosine", weights=[0.8, 0.2], max_len=30, nmt_iterations=1, back_translate=False):
13 | aligner = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, nmt_weights=weights, use_cosine=metric=="cosine")
14 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len)
15 | mono_src, mono_tgt = dataset.load("monolingual-align")
16 | train_src, train_tgt = dataset.load("monolingual-train")
17 | eval_src, eval_system, eval_scores = dataset.load("scored")
18 | langs = f"{target_lang}-{source_lang}" if back_translate else f"{source_lang}-{target_lang}"
19 | suffix = f"{langs}-awesome-{metric}-{aligner.mapping}-monolingual-align-{aligner.k}-{aligner.remap_size}-{len(mono_src)}-{max_len}"
20 |     results, index = defaultdict(list), list(range(remap_iterations + 1)) + [f"{remap_iterations} + NMT-{iteration}"
21 | for iteration in range(nmt_iterations)]
22 |
23 | logging.info("Evaluating performance before remapping.")
24 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores)
25 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores)
26 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}")
27 | results["pearson"].append(round(100 * pearson, 2))
28 | results["spearman"].append(round(100 * spearman, 2))
29 | results["rmse"].append(round(rmse, 2))
30 | results["mae"].append(round(mae, 2))
31 |
32 | for iteration in range(1, remap_iterations + 1):
33 | logging.info(f"Remapping iteration {iteration}.")
34 | aligner.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False)
35 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores)
36 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores)
37 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}")
38 | results["pearson"].append(round(100 * pearson, 2))
39 | results["spearman"].append(round(100 * spearman, 2))
40 | results["rmse"].append(round(rmse, 2))
41 | results["mae"].append(round(mae, 2))
42 |
43 |
44 | for iteration in range(nmt_iterations):
45 | aligner.train(train_src, train_tgt, suffix=suffix+f"-{remap_iterations}", iteration=iteration, overwrite=False,
46 | k=5 if metric=="cosine" else 1, back_translate=back_translate)
47 |
48 | logging.info("Evaluating performance with NMT model.")
49 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores)
50 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores)
51 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}")
52 | results["pearson"].append(round(100 * pearson, 2))
53 | results["spearman"].append(round(100 * spearman, 2))
54 | results["rmse"].append(round(rmse, 2))
55 | results["mae"].append(round(mae, 2))
56 |
57 | return suffix, tabulate(results, headers="keys", showindex=index)
58 |
59 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
60 | for weight in linspace(1, 0, 11):
61 | print(f"Using weight {weight} for cross-lingual XMoverScore and weight {1 - weight} for NMT system.")
62 | print(*nmt_tests(metric="cosine", weights=[weight, 1 - weight]), sep="\n")
63 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight]), sep="\n")
64 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], back_translate=True), sep="\n")
65 | for weight in linspace(1, 0, 11):
66 | print(f"Using weight {weight} for cross-lingual XMoverScore and weight {1 - weight} for NMT system.")
67 | print(*nmt_tests(metric="cosine", weights=[weight, 1 - weight], max_len=50), sep="\n")
68 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], max_len=50), sep="\n")
69 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], max_len=50, back_translate=True), sep="\n")
70 | for weight in linspace(1, 0, 11):
71 | print(f"Using weight {weight} for cross-lingual XMoverScore and weight {1 - weight} for NMT system.")
72 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], nmt_iterations=3), sep="\n")
73 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], nmt_iterations=3, max_len=50), sep="\n")
74 |
--------------------------------------------------------------------------------
/experiments/parallel.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.contrastscore import ContrastScore
3 | from metrics.distilscore import DistilScore
4 | from metrics.utils.dataset import DatasetLoader
5 | from collections import defaultdict
6 | from tabulate import tabulate
7 | import logging
8 |
9 | datasets = (("Newstest-2016", "scored", ("de", "en")), ("Newstest-2017", "scored-wmt17", ("de", "en")),
10 | ("MLQE-PE", "scored-mlqe", ("en", "de")), ("MQM-Newstest-2020", "scored-mqm", ("en", "de")))
11 |
12 | def parallel_tests(source_lang, target_lang, score_model=ContrastScore, eval_dataset="scored"):
13 | scorer = score_model(source_language=source_lang, target_language=target_lang, suffix="parallel")
14 | dataset = DatasetLoader(source_lang, target_lang, hard_limit=500)
15 | eval_src, eval_system, eval_scores = dataset.load(eval_dataset)
16 | parallel_src, parallel_tgt = dataset.load("parallel-train")
17 | results, index = defaultdict(list), ["Baseline", "Fine-tuned model"]
18 |
19 | if score_model == ContrastScore:
20 | scorer.parallelize = True
21 |
22 | logging.info("Evaluating performance before fine-tuning.")
23 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
24 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
25 | results["pearson"].append(round(100 * pearson, 2))
26 | results["spearman"].append(round(100 * spearman, 2))
27 |
28 | scorer.train(parallel_src, parallel_tgt, aligned=True, overwrite=False)
29 |
30 | logging.info(f"Evaluating performance after fine-tuning.")
31 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
32 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
33 | results["pearson"].append(round(100 * pearson, 2))
34 | results["spearman"].append(round(100 * spearman, 2))
35 |
36 | return tabulate(results, headers="keys", showindex=index)
37 |
38 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
39 | for dataset, identifier, pair in datasets:
40 | print(f"Evaluating {'-'.join(pair)} language direction on {dataset}")
41 |     print("Results using contrastive learning:", parallel_tests(*pair, score_model=ContrastScore, eval_dataset=identifier), sep="\n")
42 |     print("Results using knowledge distillation:", parallel_tests(*pair, score_model=DistilScore, eval_dataset=identifier), sep="\n")
43 |
--------------------------------------------------------------------------------
/experiments/quantity.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore
3 | from collections import defaultdict
4 | from tabulate import tabulate
5 | from metrics.utils.dataset import DatasetLoader
6 | import logging
7 |
8 | source_lang, target_lang = "de", "en"
9 |
10 | def remap_tests(mapping="UMD", amount=40000):
11 | scorer = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, mapping=mapping, alignment="fast", use_cosine=True)
12 | dataset = DatasetLoader(source_lang, target_lang)
13 | eval_src, eval_system, eval_scores = dataset.load("scored")
14 | suffix = f"{source_lang}-{target_lang}-fast-cosine-{mapping}-monolingual-align-{scorer.k}-{scorer.remap_size}-{amount}-30-1"
15 | results = defaultdict(list)
16 |
17 | scorer.remap(*dataset.load("monolingual-align", amount), suffix=suffix, overwrite=False)
18 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
19 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores)
20 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
21 | results["pearson"].append(round(100 * pearson, 2))
22 | results["spearman"].append(round(100 * spearman, 2))
23 | results["rmse"].append(round(rmse, 2))
24 | results["mae"].append(round(mae, 2))
25 |
26 | return suffix, tabulate(results, headers="keys")
27 |
28 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
29 | for amount in (80000, 40000, 20000, 10000, 5000, 2500, 2000):
30 | print(f"Using {100 * round(2000/amount, 3)}% of aligned sentences for training.")
31 | print(*remap_tests(mapping="UMD", amount=amount), sep="\n")
32 | print(*remap_tests(mapping="CLP", amount=amount), sep="\n")
33 |
--------------------------------------------------------------------------------
/experiments/remap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverBertAlignScore
3 | from collections import defaultdict
4 | from tabulate import tabulate
5 | from metrics.utils.dataset import DatasetLoader
6 | import logging
7 |
8 | source_lang, target_lang = "de", "en"
9 | iterations = 5
10 |
11 | def remap_tests(alignment="awesome", mapping="UMD", data="monolingual-align", metric="cosine"):
12 |     scorer = XMoverBertAlignScore(alignment=alignment, mapping=mapping, use_cosine=metric == "cosine")
13 | dataset = DatasetLoader(source_lang, target_lang)
14 | parallel_src, parallel_tgt = dataset.load("parallel")
15 | mono_src, mono_tgt = dataset.load(data)
16 | eval_src, eval_system, eval_scores = dataset.load("scored")
17 | suffix = f"{source_lang}-{target_lang}-{alignment}-{metric}-{mapping}-{data}-{scorer.k}-{scorer.remap_size}-{len(mono_src)}"
18 | results = defaultdict(list)
19 |
20 | logging.info("Evaluating performance before remapping.")
21 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
22 | precision = scorer.precision(parallel_src, parallel_tgt)
23 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores)
24 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}")
25 | results["pearson"].append(round(100 * pearson, 2))
26 | results["spearman"].append(round(100 * spearman, 2))
27 | results["precision"].append(round(100 * precision, 2))
28 | results["rmse"].append(round(rmse, 2))
29 | results["mae"].append(round(mae, 2))
30 |
31 | for iteration in range(1, iterations + 1):
32 | logging.info(f"Remapping iteration {iteration}.")
33 | scorer.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False)
34 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
35 | precision = scorer.precision(parallel_src, parallel_tgt)
36 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores)
37 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}")
38 | results["pearson"].append(round(100 * pearson, 2))
39 | results["spearman"].append(round(100 * spearman, 2))
40 | results["precision"].append(round(100 * precision, 2))
41 | results["rmse"].append(round(rmse, 2))
42 | results["mae"].append(round(mae, 2))
43 |
44 | return suffix, tabulate(results, headers="keys", showindex=True)
45 |
46 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
47 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="UMD"), sep="\n")
48 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="CLP"), sep="\n")
49 | print(*remap_tests(alignment="awesome-remap", data="monolingual-align", mapping="UMD"), sep="\n")
50 | print(*remap_tests(alignment="awesome-remap", data="monolingual-align", mapping="CLP"), sep="\n")
51 | print(*remap_tests(alignment="fast", data="monolingual-align", mapping="UMD"), sep="\n")
52 | print(*remap_tests(alignment="fast", data="monolingual-align", mapping="CLP"), sep="\n")
53 | print(*remap_tests(alignment="awesome", data="parallel-align", mapping="UMD"), sep="\n")
54 | print(*remap_tests(alignment="awesome", data="parallel-align", mapping="CLP"), sep="\n")
55 | print(*remap_tests(alignment="fast", data="parallel-align", mapping="UMD"), sep="\n")
56 | print(*remap_tests(alignment="fast", data="parallel-align", mapping="CLP"), sep="\n")
57 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="UMD", metric="wmd"), sep="\n")
58 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="CLP", metric="wmd"), sep="\n")
59 |
--------------------------------------------------------------------------------
/experiments/sentsim.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from collections import defaultdict
3 | import logging
4 |
5 | from tabulate import tabulate
6 |
7 | from metrics.sentsim import SentSim
8 | from metrics.utils.dataset import DatasetLoader
9 |
10 | mlqe = [
11 | ("en", "de"),
12 | ("en", "zh"),
13 | ("ru", "en"),
14 | ("ro", "en"),
15 | ("et", "en"),
16 | ("ne", "en"),
17 | ("si", "en"),
18 | ]
19 | newstest2017 = [
20 | ("cs", "en"),
21 | ("de", "en"),
22 | ("fi", "en"),
23 | ("lv", "en"),
24 | ("ru", "en"),
25 | ("tr", "en"),
26 | ("zh", "en"),
27 | ("en", "zh"),
28 | ("en", "ru"),
29 | ]
30 |
31 | # Try to reproduce the results of SentSim on both WMT-17 and WMT-20. The
32 | # WMT-20 correlations are computed against both human-annotated and MT model scores.
33 | def sentsim_reproduce(
34 | source_lang,
35 | target_lang,
36 | dataset_name,
37 | word_metric="BERTScore",
38 | use_mlqe_model_scores=False,
39 | ):
40 | scorer = SentSim(use_wmd=word_metric == "WMD")
41 | dataset = DatasetLoader(source_lang, target_lang)
42 | eval_src, eval_system, eval_scores = dataset.load(
43 | dataset_name, use_mlqe_model_scores=use_mlqe_model_scores
44 | )
45 | results, index = defaultdict(list), [f"SentSim ({word_metric})"]
46 |
47 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores)
48 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}")
49 | results["pearson"].append(round(100 * pearson, 2))
50 | results["spearman"].append(round(100 * spearman, 2))
51 |
52 | return tabulate(results, headers="keys", showindex=index)
53 |
54 |
55 | logging.basicConfig(
56 | level=logging.INFO,
57 | datefmt="%m-%d %H:%M",
58 | format="%(asctime)s %(levelname)-8s %(message)s",
59 | )
60 | datasets = (
61 | ("Newstest-2017", "scored-wmt17", newstest2017),
62 | ("MLQE-PE (Human)", "scored-mlqe", mlqe),
63 | ("MLQE-PE (Model)", "scored-mlqe", mlqe),
64 | )
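   | # evaluate both SentSim variants (BERTScore- and WMD-based word similarity)
   | # on every language direction of each dataset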
65 | for dataset, identifier, pairs in datasets:
66 | for source_lang, target_lang in pairs:
67 | print(f"Evaluating {source_lang}-{target_lang} language direction on {dataset}")
68 | print(
69 | sentsim_reproduce(
70 |                 source_lang, target_lang, identifier, "BERTScore", "Model" in dataset
71 | )
72 | )
73 | print(
74 | sentsim_reproduce(
75 |                 source_lang, target_lang, identifier, "WMD", "Model" in dataset
76 | )
77 | )
78 |
--------------------------------------------------------------------------------
/experiments/vecmap.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | from metrics.xmoverscore import XMoverVecMapAlignScore
3 | from metrics.vecmapscore import VecMapScore
4 | from collections import defaultdict
5 | from tabulate import tabulate
6 | from metrics.utils.dataset import DatasetLoader
7 | import logging
8 |
9 | source_lang, target_lang = "de", "en"
10 |
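   | # evaluate both VecMap-based metrics (XMoverVecMapAlignScore and VecMapScore)
   | # on the same language pair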
11 | def vecmap_tests(model=XMoverVecMapAlignScore):
12 | aligner = model(src_lang=source_lang, tgt_lang=target_lang)
13 | dataset = DatasetLoader(source_lang, target_lang)
14 | parallel_src, parallel_tgt = dataset.load("parallel")
15 | eval_src, eval_system, eval_scores = dataset.load("scored")
16 | results = defaultdict(list)
17 |
18 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores)
19 | precision = aligner.precision(parallel_src, parallel_tgt)
20 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores)
21 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}")
22 | results["pearson"].append(round(100 * pearson, 2))
23 | results["spearman"].append(round(100 * spearman, 2))
24 | results["precision"].append(round(100 * precision, 2))
25 | results["rmse"].append(round(rmse, 2))
26 | results["mae"].append(round(mae, 2))
27 |
28 | return f"{source_lang}-{target_lang}-vecmap", tabulate(results, headers="keys")
29 |
30 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s")
31 | print(*vecmap_tests(), sep="\n")
32 | print(*vecmap_tests(model=VecMapScore), sep="\n")
33 |
--------------------------------------------------------------------------------
/metrics/common.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from numpy import corrcoef, argsort
3 | from torch.nn.functional import mse_loss, l1_loss
4 | from torch import FloatTensor
5 |
6 | class CommonScore(ABC):
7 | @abstractmethod
8 | def align():
9 | """
10 | This method receives a list of sentences in the source language and a
11 | list of sentences in the target language as parameters and returns
12 |         a list of pseudo-aligned sentence pairs.
13 | """
14 |
15 | @abstractmethod
16 | def _embed():
17 | """
18 | This method receives a list of sentences in the source language and a
19 | list of sentences in the target language as parameters and returns
20 |         their embeddings, inverse document frequencies, tokens and padding
21 | masks.
22 | """
23 |
24 | @abstractmethod
25 | def score():
26 | """
27 | This method receives a list of sentences in the source language and a
28 | list of sentences in the target language as parameters, which are
29 | assumed to be aligned according to their index. For each sentence pair
30 | a similarity score is computed and the list of scores is returned.
31 | """
32 |
33 | def precision(self, source_sents, ref_sents):
34 | """
35 | This method receives a list of sentences in the source language and a
36 | list of sentences in the target language as parameters, which are
37 |         assumed to be aligned. The sentences are re-aligned through parallel
38 |         sentence matching, and the method returns the Precision @ 1 score, i.e.
39 |         the fraction of source sentences matched to their correct reference.
40 | """
41 | pairs, _ = self.align(source_sents, ref_sents)
42 | return sum([reference == predicted for reference, (_, predicted) in zip(ref_sents, pairs)]) / len(ref_sents)
43 |
44 | def correlation(self, source_sents, system_sents, ref_scores):
45 | """
46 |         This method receives a list of sentences in the source language, a
47 |         list of sentences in the target language (assumed to be aligned by
48 |         index) and reference scores as parameters. The method then returns
49 |         the Pearson correlation and the Spearman correlation between the
50 |         reference scores and the scores of the metric.
51 | """
52 | scores = self.score(source_sents, system_sents)
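   |         # Spearman's rho equals Pearson's r on rank-transformed scores;
   |         # argsort().argsort() converts the raw scores into their ranks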
53 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(scores).argsort()
54 | return corrcoef(ref_scores, scores)[0,1], corrcoef(ref_ranks, ranks)[0,1]
55 |
56 | def error(self, source_sents, system_sents, ref_scores):
57 | """
58 |         This method receives a list of sentences in the source language, a
59 |         list of sentences in the target language (assumed to be aligned by
60 |         index) and reference scores as parameters. The method then returns
61 |         the Root Mean Squared Error and the Mean Absolute Error between the
62 |         reference scores and the scores of the metric.
63 | """
64 | scores = self.score(source_sents, system_sents)
65 | rmse = mse_loss(FloatTensor(ref_scores), FloatTensor(scores)).sqrt().item()
66 | mae = l1_loss(FloatTensor(ref_scores), FloatTensor(scores)).item()
67 | return rmse, mae
68 |
--------------------------------------------------------------------------------
/metrics/contrastscore.py:
--------------------------------------------------------------------------------
1 | from sentence_transformers import SentenceTransformer, InputExample, models, util
2 | from torch.utils.data import DataLoader
3 | from os.path import join, isfile, basename
4 | from torch.cuda import device_count, is_available as cuda_is_available
5 | from torch.nn import CrossEntropyLoss, Module, DataParallel
6 | from torch.nn.functional import cosine_similarity
7 | from math import ceil
8 | from .utils.knn import ratio_margin_align
9 | from .common import CommonScore
10 | from .utils.env import DATADIR
11 | from .utils.wmd import word_mover_score
12 | from .utils.perplexity import lm_perplexity
13 | from nltk.metrics.distance import edit_distance
14 | from pathlib import Path
15 | import logging
16 | import torch
17 |
18 | class AdditiveMarginSoftmaxLoss(Module):
19 | """
20 | Contrastive learning loss function used by LaBSE and SimCSE.
21 | """
22 | def __init__(self, model, scale = 20.0, margin = 0.0, symmetric = True, similarity_fct = util.cos_sim):
23 | super().__init__()
24 | self.model = model
25 | self.scale = scale
26 | self.margin = margin
27 | self.symmetric = symmetric
28 | self.similarity_fct = similarity_fct
29 | self.cross_entropy_loss = CrossEntropyLoss()
30 |
31 | def additive_margin_softmax_loss(self, embeddings_a, embeddings_b):
32 | scores = self.similarity_fct(embeddings_a, embeddings_b)
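   |         # scores[i][j] holds sim(a_i, b_j); subtracting the margin from the
   |         # diagonal makes the positive pairs harder to classify correctly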
33 | scores.diagonal().subtract_(self.margin)
34 | labels = torch.tensor(range(len(scores)), dtype=torch.long, device=scores.device) # Example a[i] should match with b[i]
35 | return self.cross_entropy_loss(self.scale * scores, labels)
36 |
37 | def forward(self, sentence_features, _):
38 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features]
39 | assert len(reps) == 2, "Inputs should be source texts and translations"
40 | embeddings_a = reps[0]
41 | embeddings_b = reps[1]
42 |
43 | if self.symmetric:
44 | return self.additive_margin_softmax_loss(embeddings_a, embeddings_b) + self.additive_margin_softmax_loss(embeddings_b, embeddings_a)
45 | else:
46 | return self.additive_margin_softmax_loss(embeddings_a, embeddings_b)
47 |
48 | def get_config_dict(self):
49 | return {'scale': self.scale, 'margin': self.margin, 'symmetric': self.symmetric, 'similarity_fct': self.similarity_fct.__name__}
50 |
51 | class ContrastScore(CommonScore):
52 | def __init__(
53 | self,
54 | model_name="xlm-roberta-base",
55 | source_language="en",
56 | target_language="de",
57 | device="cuda" if cuda_is_available() else "cpu",
58 |         parallelize=False,
59 | train_batch_size=256,
60 | max_seq_length=None,
61 | num_epochs=1,
62 | knn_batch_size = 1000000,
63 | mine_batch_size = 5000000,
64 | train_size = 100000,
65 | k = 5,
66 | suffix = None
67 | ):
68 | self.model_name = model_name
69 | self.train_batch_size = train_batch_size
70 | self.max_seq_length = max_seq_length
71 | self.num_epochs = num_epochs
72 | self.device = device
73 | self.parallelize = parallelize
74 | self.knn_batch_size = knn_batch_size
75 | self.mine_batch_size = mine_batch_size
76 | self.train_size = train_size
77 | self.k = k
78 | self.cache_dir = join(DATADIR, "contrastive-learning",
79 | f"{'-'.join(sorted([source_language, target_language]))}-{basename(model_name)}")
80 | self.suffix = suffix
81 | self.model = self.load_model(model_name)
82 |
83 | def load_model(self, model_name):
84 | word_embedding_model = models.Transformer(model_name, max_seq_length=self.max_seq_length)
85 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
86 | return SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device)
87 |
88 | @property
89 | def path(self):
90 |         path = self.cache_dir + (f"-{self.suffix}" if self.suffix is not None else "")
91 | Path(path).mkdir(parents=True, exist_ok=True)
92 | return path
93 |
94 | def _embed(self, source_sents, target_sents):
95 | return (
96 | self.model.encode(source_sents, convert_to_tensor=True).cpu(),
97 | self.model.encode(target_sents, convert_to_tensor=True).cpu())
98 |
99 | def align(self, source_sents, target_sents):
100 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents)
101 |         indices, scores = ratio_margin_align(source_embeddings, target_embeddings, self.k,
102 |                                              self.knn_batch_size, self.device)
103 |
104 |         sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indices]
105 | return sent_pairs, scores
106 |
107 | def score(self, source_sents, target_sents):
108 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents)
109 | return cosine_similarity(source_embeddings, target_embeddings)
110 |
111 | def mine(self, source_sents, target_sents, mine_size, overwrite=True):
112 | logging.info("Mining pseudo parallel data.")
113 | file_path = join(self.path, "mined-sentence-pairs.txt")
114 | pairs, scores, batch, batch_size = list(), list(), 0, self.mine_batch_size
115 | if not isfile(file_path) or overwrite:
116 | while batch < len(source_sents):
117 | logging.info("Obtaining sentence embeddings.")
118 | batch_src, batch_tgt = source_sents[batch:batch + batch_size], target_sents[batch:batch + batch_size]
119 | source_embeddings, target_embeddings = self._embed(batch_src, batch_tgt)
120 | logging.info("Mining pseudo parallel data with Ratio Margin function.")
121 | batch_pairs, batch_scores = ratio_margin_align(source_embeddings, target_embeddings, self.k,
122 | self.knn_batch_size, self.device)
123 | del source_embeddings, target_embeddings
124 | pairs.extend([(src + batch, tgt + batch) for src, tgt in batch_pairs]), scores.extend(batch_scores)
125 | batch += batch_size
126 | with open(file_path, "wb") as f:
127 | idx = 0
128 | check_duplicates_set = set()
129 | for _, (src, tgt) in sorted(zip(scores, pairs), key=lambda tup: tup[0], reverse=True):
130 | src_sent, tgt_sent = source_sents[src], target_sents[tgt]
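   |                     # keep a pair only if the target is unseen so far and the
   |                     # normalized edit distance exceeds 0.5 (filters near-copies)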
131 | if tgt_sent not in check_duplicates_set and edit_distance(src_sent, tgt_sent) / max(len(src_sent), len(tgt_sent)) > 0.5:
132 | check_duplicates_set.add(tgt_sent)
133 | f.write(f"{src_sent}\t{tgt_sent}\n".encode())
134 | idx += 1
135 | if idx >= mine_size:
136 | break
137 |
138 | with open(file_path, "rb") as f:
139 | sents = list()
140 | for line in f:
141 | sents.append(line.decode().strip().split("\t"))
142 | return sents
143 |
144 | def train(self, source_sents, target_sents, aligned=False, finetune=False, overwrite=True):
145 | if not isfile(join(self.path, 'config.json')) or overwrite:
146 | # Convert train sentences to sentence pairs
147 | if aligned:
148 | train_data = [InputExample(texts=[s, t]) for s, t in zip(source_sents, target_sents)]
149 | else:
150 | train_data = [InputExample(texts=[s, t]) for s, t in self.mine(source_sents, target_sents, self.train_size,
151 | overwrite=overwrite)]
152 |
153 | # DataLoader to batch your data
154 | train_dataloader = DataLoader(train_data, batch_size=self.train_batch_size, shuffle=True)
155 |
156 |             if finetune:
157 |                 new_model = self.model  # keep fine-tuning the current model
158 |             else:
159 |                 # Train a new model from scratch
160 |                 del self.model
161 |                 new_model = self.load_model(self.model_name)
162 |
163 | # Use contrastive learning loss
164 | if self.parallelize and device_count() > 1:
165 | logging.info(f"Training on {device_count()} GPUs.")
166 | train_loss = AdditiveMarginSoftmaxLoss(DataParallel(new_model))
167 | else:
168 | train_loss = AdditiveMarginSoftmaxLoss(new_model)
169 |
170 | # Call the fit method
171 | warmup_steps = ceil(len(train_dataloader) * self.num_epochs * 0.1) # 10% of train data for warm-up
172 | new_model.fit(train_objectives=[(train_dataloader, train_loss)],
173 | epochs=self.num_epochs,
174 | warmup_steps=warmup_steps,
175 | optimizer_params={'lr': 5e-5}
176 | )
177 | new_model.save(self.path)
178 |
179 | self.model = SentenceTransformer(self.path, device=self.device)
180 |
181 | class XLMoverScore(ContrastScore):
182 | def __init__(
183 | self,
184 | embed_batch_size = 128,
185 | n_gram=1,
186 | suffix_filter=False,
187 | lm_model_name="gpt2",
188 | use_lm=False,
189 | lm_weights=[0.9, 0.1],
190 | **kwargs
191 | ):
192 | """
193 | embed_batch_size - batch size for embedding sentences during inference
194 | n_gram - n-gram size of word mover's distance
195 | suffix_filter - filter embeddings of word suffixes (original XLMoverScore
196 | does this, but it doesn't make sense for SentencePiece-based Models)
197 | """
198 | super().__init__(**kwargs)
199 | self.embed_batch_size = embed_batch_size
200 | self.n_gram = n_gram
201 | self.suffix_filter = suffix_filter
202 | self.lm_model_name = lm_model_name
203 | self.use_lm = use_lm
204 | self.lm_weights = lm_weights
205 |
206 |     # Overrides ContrastScore.score
207 | def score(self, source_sents, target_sents):
208 | embedding_model = self.model.eval().to(self.device)[0].auto_model
209 | tokenizer = self.model.tokenizer
210 |
211 | with torch.no_grad():
212 | src_ids, src_mask = tokenizer(source_sents, padding=True, truncation=True, return_tensors="pt").values()
213 | src_idf = src_mask.float()
214 | src_tokens = [[tokenizer.cls_token, *tokenizer.tokenize(sent), tokenizer.sep_token] for sent in source_sents]
215 | src_embeddings = list()
216 |
217 | tgt_ids, tgt_mask = tokenizer(target_sents, padding=True, truncation=True, return_tensors="pt").values()
218 | tgt_idf = tgt_mask.float()
219 | tgt_tokens = [[tokenizer.cls_token, *tokenizer.tokenize(sent), tokenizer.sep_token] for sent in target_sents]
220 | tgt_embeddings = list()
221 |
222 | for index in range(0, len(source_sents), self.embed_batch_size):
223 | batch_src_ids = src_ids[index: index + self.embed_batch_size].to(self.device)
224 | batch_src_mask = src_mask[index: index + self.embed_batch_size].to(self.device)
225 | src_embeddings.extend(embedding_model(input_ids=batch_src_ids, attention_mask=batch_src_mask)['last_hidden_state'].cpu())
226 |
227 | batch_tgt_ids = tgt_ids[index: index + self.embed_batch_size].to(self.device)
228 | batch_tgt_mask = tgt_mask[index: index + self.embed_batch_size].to(self.device)
229 | tgt_embeddings.extend(embedding_model(input_ids=batch_tgt_ids, attention_mask=batch_tgt_mask)['last_hidden_state'].cpu())
230 |
231 | wmd_scores = word_mover_score((torch.stack(src_embeddings), src_idf, src_tokens), (torch.stack(tgt_embeddings), tgt_idf, tgt_tokens),
232 | self.n_gram, True, self.suffix_filter)
233 |
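   |         # optionally interpolate the WMD scores with a target-side language
   |         # model score, weighted by lm_weights ([0.9, 0.1] by default)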
234 | if self.use_lm:
235 | lm_scores = lm_perplexity(target_sents, self.device, self.lm_model_name)
236 | return (self.lm_weights[0] * torch.tensor(wmd_scores) + self.lm_weights[1] * torch.tensor(lm_scores)).tolist()
237 | else:
238 | return wmd_scores
239 |
--------------------------------------------------------------------------------
/metrics/distilscore.py:
--------------------------------------------------------------------------------
1 | from sentence_transformers import SentenceTransformer, models, losses
2 | from sentence_transformers.evaluation import TranslationEvaluator, SequentialEvaluator
3 | from sentence_transformers.datasets import ParallelSentencesDataset
4 | from torch.utils.data import DataLoader
5 | from torch.cuda import is_available as cuda_is_available
6 | from torch.nn.functional import cosine_similarity
7 | from torch import from_numpy
8 | from .common import CommonScore
9 | from .utils.knn import ratio_margin_align
10 | from .utils.env import DATADIR
11 | from .utils.language import LangDetect
12 | from .utils.nmt import language2mBART
13 | from os.path import join, isfile, basename
14 | from nltk.metrics.distance import edit_distance
15 | from pathlib import Path
16 | from math import ceil
17 |
18 | import logging
19 | import numpy as np
20 |
21 | class DistilScore(CommonScore):
22 | def __init__(
23 | self,
24 | teacher_model_name="bert-base-nli-stsb-mean-tokens",
25 | student_model_name="xlm-roberta-base",
26 | source_language="en",
27 | target_language="de",
28 | device="cuda" if cuda_is_available() else "cpu",
29 | student_is_pretrained=False,
30 | train_batch_size=64, # Batch size for training
31 | inference_batch_size=64, # Batch size at inference
32 | num_epochs=10, # Train for x epochs
33 | knn_batch_size = 1000000,
34 | mine_batch_size = 5000000,
35 | train_size = 200000,
36 | k = 5,
37 | suffix = None
38 | ):
39 | self.teacher_model_name = teacher_model_name
40 | self.student_model_name = student_model_name
41 | self.target_language = target_language
42 | self.train_batch_size = train_batch_size
43 | self.inference_batch_size = inference_batch_size
44 | self.num_epochs = num_epochs
45 | self.device = device
46 | self.knn_batch_size = knn_batch_size
47 | self.mine_batch_size = mine_batch_size
48 | self.train_size = train_size
49 | self.k = k
50 | self.cache_dir = join(DATADIR, "distillation",
51 | f"{'-'.join(sorted([source_language, target_language]))}-{basename(teacher_model_name)}-{basename(student_model_name)}")
52 | self.suffix = suffix
53 | if student_is_pretrained:
54 | self.model = SentenceTransformer(student_model_name, device=self.device)
55 | else:
56 | self.model = self.load_student(student_model_name)
57 |
58 | def load_student(self, model_name):
59 | logging.info("Creating model from scratch")
60 | word_embedding_model = models.Transformer(model_name)
61 | # Apply mean pooling to get one fixed sized sentence vector
62 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
63 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device)
64 |
65 | # mBART also has a decoder but we are only interested in the encoder output. To make sure that
66 | # sentence_transformers use the encoder output we monkey patch the forward method. Don't do this at home kids.
67 | if "mbart" in model_name:
68 | mbart, detector = word_embedding_model.auto_model, LangDetect(cache_dir=DATADIR)
69 | mbart.forward = lambda **kv: type(mbart).forward(mbart, **kv)[-1:]
70 |
71 | def tokenize(text):
72 | model.tokenizer.src_lang = language2mBART[detector.detect(text)]
73 | return word_embedding_model.tokenize(text)
74 |
75 |             model.tokenize = tokenize
76 |
77 | return model
78 |
79 | @property
80 | def path(self):
81 |         path = self.cache_dir + (f"-{self.suffix}" if self.suffix is not None else "")
82 | Path(path).mkdir(parents=True, exist_ok=True)
83 | return path
84 |
85 | def _embed(self, source_sents, target_sents):
86 | return self.model.encode(source_sents), self.model.encode(target_sents)
87 |
88 | def align(self, source_sents, target_sents):
89 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents)
90 |         indices, scores = ratio_margin_align(from_numpy(source_embeddings), from_numpy(target_embeddings), self.k,
91 |             self.knn_batch_size, self.device)
92 |
93 |         sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indices]
94 | return sent_pairs, scores
95 |
96 | def score(self, source_sents, target_sents):
97 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents)
98 | return cosine_similarity(from_numpy(source_embeddings), from_numpy(target_embeddings))
99 |
100 | def mine(self, source_sents, target_sents, overwrite=True):
101 | logging.info("Mining pseudo parallel data.")
102 | file_path = join(self.path, "mined-sentence-pairs.txt")
103 | pairs, scores, batch, batch_size = list(), list(), 0, self.mine_batch_size
104 | if not isfile(file_path) or overwrite:
105 | while batch < len(source_sents):
106 | logging.info("Obtaining sentence embeddings.")
107 | batch_src, batch_tgt = source_sents[batch:batch + batch_size], target_sents[batch:batch + batch_size]
108 | source_embeddings, target_embeddings = self._embed(batch_src, batch_tgt)
109 | logging.info("Mining pseudo parallel data with Ratio Margin function.")
110 | batch_pairs, batch_scores = ratio_margin_align(from_numpy(source_embeddings),
111 | from_numpy(target_embeddings), self.k, self.knn_batch_size, self.device)
112 | del source_embeddings, target_embeddings
113 | pairs.extend([(src + batch, tgt + batch) for src, tgt in batch_pairs]), scores.extend(batch_scores)
114 | batch += batch_size
115 | with open(file_path, "wb") as f:
116 | idx = 0
117 | for _, (src, tgt) in sorted(zip(scores, pairs), key=lambda tup: tup[0], reverse=True):
118 | src_sent, tgt_sent = source_sents[src], target_sents[tgt]
119 | if edit_distance(src_sent, tgt_sent) / max(len(src_sent), len(tgt_sent)) > 0.5:
120 | f.write(f"{src_sent}\t{tgt_sent}\n".encode())
121 | idx += 1
122 | if idx >= self.train_size:
123 | break
124 | return file_path
125 |
126 | def train(self, source_sents, target_sents, dev_source_sents=None, dev_target_sents=None, aligned=False, overwrite=True):
127 | if not isfile(join(self.path, 'config.json')) or overwrite:
128 | # Train a new model to avoid overfitting
129 | new_model = self.load_student(self.student_model_name)
130 | logging.info("Loading teacher model and training data.")
131 | teacher_model = SentenceTransformer(self.teacher_model_name, device=self.device)
132 | train_data = ParallelSentencesDataset(student_model=new_model, teacher_model=teacher_model,
133 | batch_size=self.inference_batch_size, use_embedding_cache=True)
134 |
135 |         if self.target_language == "en": # the teacher embeds the source sentences, so make sure they are in English
136 | source_sents, target_sents = target_sents, source_sents
137 |
138 | if aligned:
139 | train_data.add_dataset(zip(source_sents, target_sents), max_sentences=self.train_size,
140 | max_sentence_length=None)
141 | else:
142 | train_data.load_data(self.mine(source_sents, target_sents, overwrite=overwrite),
143 | max_sentences=self.train_size, max_sentence_length=None)
144 |
145 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=self.train_batch_size)
146 | train_loss = losses.MSELoss(model=new_model)
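   |             # distillation objective: the student learns to reproduce the
   |             # teacher's sentence embeddings under an MSE loss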
147 |
148 | dev_trans_acc = None
149 | if dev_source_sents is not None and dev_target_sents is not None:
150 | # TranslationEvaluator computes the embeddings for all parallel sentences. It then checks if the
151 | # embedding of source[i] is the closest to target[i] out of all available target sentences
152 | dev_trans_acc = TranslationEvaluator(dev_source_sents, dev_target_sents, write_csv=False,
153 | batch_size=self.inference_batch_size)
154 |
155 | # Train the model
156 | logging.info("Fine-tuning student model.")
157 | warmup_steps = ceil(len(train_dataloader) * self.num_epochs * 0.1) # 10% of train data for warm-up
158 | new_model.fit(train_objectives=[(train_dataloader, train_loss)],
159 | evaluator=None if dev_trans_acc is None else SequentialEvaluator([dev_trans_acc], main_score_function=np.mean),
160 | epochs=self.num_epochs,
161 | warmup_steps=warmup_steps,
162 |                       optimizer_params={'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
163 | )
164 | new_model.save(self.path)
165 |
166 | self.model = SentenceTransformer(self.path, device=self.device)
167 |
--------------------------------------------------------------------------------
/metrics/marginscore.py:
--------------------------------------------------------------------------------
1 | from .common import CommonScore
2 | from .xmoverscore import BertEmbed
3 | from .utils.knn import ratio_margin_align
4 | from torch.nn.functional import cosine_similarity
5 | from torch.cuda import is_available as cuda_is_available
6 | from torch import sum as tsum
7 | import logging
8 |
9 | class RatioMarginAlign(CommonScore):
10 | def __init__(self, device, k, knn_batch_size):
11 | self.device = device
12 | self.k = k
13 | self.knn_batch_size = knn_batch_size
14 |
15 | def align(self, source_sents, target_sents):
16 | src_embeddings, _, _, src_mask, tgt_embeddings, _, _, tgt_mask = self._embed(
17 | source_sents, target_sents)
18 |
19 | logging.info("Computing scores with Ratio Margin algorithm.")
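   |         # mean-pool the token embeddings into sentence embeddings; the mask
   |         # excludes padding positions from the average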
20 | source_sent_embeddings = tsum(src_embeddings * src_mask, 1) / tsum(src_mask, 1)
21 | target_sent_embeddings = tsum(tgt_embeddings * tgt_mask, 1) / tsum(tgt_mask, 1)
22 |         indices, scores = ratio_margin_align(source_sent_embeddings, target_sent_embeddings, self.k,
23 |                                              self.knn_batch_size, self.device)
24 |
25 |         sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indices]
26 | return sent_pairs, scores
27 |
28 | def score(self, source_sents, target_sents):
29 | src_embeddings, _, _, src_mask, tgt_embeddings, _, _, tgt_mask = self._embed(source_sents, target_sents)
30 | source_sent_embeddings = tsum(src_embeddings * src_mask, 1) / tsum(src_mask, 1)
31 | target_sent_embeddings = tsum(tgt_embeddings * tgt_mask, 1) / tsum(tgt_mask, 1)
32 | scores = cosine_similarity(source_sent_embeddings, target_sent_embeddings)
33 | return scores
34 |
35 | class RatioMarginBertAlignScore(RatioMarginAlign, BertEmbed):
36 | def __init__(
37 | self,
38 | model_name="bert-base-multilingual-cased",
39 | mapping="UMD",
40 | device="cuda" if cuda_is_available() else "cpu",
41 | do_lower_case=False,
42 | alignment = "awesome",
43 | k = 20,
44 | remap_size = 2000,
45 | embed_batch_size = 128,
46 | knn_batch_size = 1000000
47 | ):
48 | RatioMarginAlign.__init__(self, device, k, knn_batch_size)
49 | BertEmbed.__init__(self, model_name, mapping, device, do_lower_case, remap_size, embed_batch_size, alignment)
50 |
--------------------------------------------------------------------------------
/metrics/sentsim.py:
--------------------------------------------------------------------------------
1 | from scipy.spatial.distance import euclidean
2 | from collections import defaultdict
3 | from itertools import product
4 | from sentence_transformers import SentenceTransformer
5 | from transformers import AutoTokenizer, AutoModel
6 | from torch.nn import CosineSimilarity
7 | from torch.cuda import is_available as cuda_is_available
8 | from datasets import load_metric
9 | from .utils.knn import ratio_margin_align
10 | from .utils.env import DATADIR
11 | from .common import CommonScore
12 | import torch
13 | import pulp
14 | import logging
15 | import numpy as np
16 |
17 | # This code is based on https://github.com/Rain9876/Unsupervised-crosslingual-Compound-Method-For-MT
18 | class SentSim(CommonScore):
19 | """
20 |     A wrapper around the original SentSim implementation. Be careful: the
21 |     models used here were fine-tuned on parallel sentences.
22 | """
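   |     # Minimal usage sketch, given lists of source sentences and candidate
   |     # translations (variable names are illustrative):
   |     #   scorer = SentSim(use_wmd=False)
   |     #   scores = scorer.score(source_sents, translations)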
23 | def __init__(
24 | self,
25 | wordemb_model="xlm-roberta-base",
26 | sentemb_model="xlm-r-bert-base-nli-stsb-mean-tokens",
27 | device="cuda" if cuda_is_available() else "cpu",
28 | use_wmd=False,
29 | knn_batch_size = 1000000,
30 | mine_batch_size = 5000000,
31 | k = 5,
32 | ):
33 | if use_wmd:
34 | self.tokenizer, self.word_model = self.get_WMD_Model(wordemb_model)
35 | self.layers = self.layer_processing(self.word_model)
36 | else:
37 | self.word_model = wordemb_model
38 | self.use_wmd = use_wmd
39 | self.sent_model = SentenceTransformer(sentemb_model, device=device)
40 | self.knn_batch_size = knn_batch_size
41 | self.mine_batch_size = mine_batch_size
42 | self.device = device
43 | self.k = k
44 |
45 | def _embed(self, source_sents, target_sents):
46 | return (
47 | self.sent_model.encode(source_sents, convert_to_tensor=True).cpu(),
48 | self.sent_model.encode(target_sents, convert_to_tensor=True).cpu())
49 |
50 | def align(self, source_sents, target_sents):
51 |         logging.warning("For now SentSim sentence alignment only leverages sentence embeddings.")
52 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents)
53 |         indices, scores = ratio_margin_align(source_embeddings, target_embeddings, self.k,
54 |                                              self.knn_batch_size, self.device)
55 |
56 |         sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indices]
57 | return sent_pairs, scores
58 |
59 | def score(self, source_sents, target_sents):
60 | cosine = self.getSentSimilarity(target_sents, source_sents)
61 | if self.use_wmd:
62 | wmd = self.compute_WMD(target_sents, source_sents, self.tokenizer, self.word_model)
63 | return self.combine_metrics(cosine, wmd, corr=[1, -1])
64 | else:
65 | bertscore = self.getBertScore(target_sents, source_sents, self.word_model)
66 | return self.combine_metrics(cosine, bertscore, corr=[1, 1])
67 |
68 | def combine_metrics(_, *args, **kwargs):
69 | assert len(args) == len(kwargs["corr"]) and len(args[0]) == len(args[1])
70 | output = []
71 |
72 | for i in range(len(args[0])):
73 | value = 0
74 | for sign, metric in zip(kwargs["corr"], args):
75 | assert metric[i] <= 1 and metric[i] >= 0
76 | if sign > 0:
77 | value += np.exp(metric[i])
78 | else:
79 | value += np.exp(1-metric[i])
80 | output.append(value)
81 |
82 | return output
83 |
84 | def getSentSimilarity(self, sents1, sents2):
85 | embed_sent1, embed_sent2 = self._embed(sents1, sents2)
86 | cos_sim = CosineSimilarity(dim=1)(embed_sent1,embed_sent2)
87 |         # Min-max normalization
88 |         cos_sim = (cos_sim - torch.min(cos_sim)) / (torch.max(cos_sim) - torch.min(cos_sim))
89 | return cos_sim.numpy()
90 |
91 | def getBertScore(self, sents1, sents2, model):
92 | bert_score_metric = load_metric('bertscore', keep_in_memory=True, cache_dir=DATADIR)
93 | bert_score_metric.add_batch(predictions=sents2, references=sents1)
94 | score = torch.tensor(bert_score_metric.compute(model_type=model, device=self.device)["f1"])
95 | # Normalized Bert Score F1
96 | norm_score = (score - torch.min(score)) / (torch.max(score) - torch.min(score))
97 | return norm_score.tolist()
98 |
99 | def compute_WMD(self, hypotheses, references, tokenizer, model, embed_type=False):
100 | wmd = []
101 |
102 | for reference, hypothesis in zip(references, hypotheses):
103 | wmd_tmp = self.word_mover_distance(reference, hypothesis, tokenizer, model, embed_type)
104 | wmd.append(wmd_tmp)
105 | # Normalize
106 | wmd = [(val-min(wmd))/(max(wmd)-min(wmd)) for val in wmd]
107 | return np.array(wmd)
108 |
109 | def word_mover_distance(self, sent1, sent2, tokenizer, model, embed_type, lpFile=None):
110 | sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding = self.embedding_processing(sent1, sent2,
111 | tokenizer, model, embed_type)
112 | prob = self.word_mover_distance_probspec(sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding, lpFile=lpFile)
113 | return pulp.value(prob.objective)
114 |
115 | def word_mover_distance_probspec(_, sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding, lpFile=None):
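   |         # WMD as a linear program: find a transport matrix T that moves the
   |         # token mass of sentence 1 onto sentence 2 at minimal total euclidean
   |         # cost, subject to the marginal constraints below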
116 | first_sent_buckets = {f"x{idx}": item[1] for idx, item in enumerate(sent1_buckets.items())}
117 | second_sent_buckets = {f"y{idx}": item[1] for idx, item in enumerate(sent2_buckets.items())}
118 |
119 | var_names = list(first_sent_buckets.keys()) + list(second_sent_buckets.keys())
120 | all_embedding = torch.cat([sent1_embedding, sent2_embedding])
121 | wordvecs = {token: embedding.detach().numpy() for token, embedding in zip(var_names, all_embedding)}
122 | assert len(var_names) == all_embedding.size(0)
123 |
124 | T = pulp.LpVariable.dicts('T_matrix', list(product(var_names, var_names)), lowBound=0)
125 | prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize)
126 | prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2])
127 | for token1, token2 in product(var_names, var_names)])
128 |         for token2 in second_sent_buckets:  # marginal constraints on sentence 2
129 |             prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets]) == second_sent_buckets[token2]
130 |         for token1 in first_sent_buckets:  # marginal constraints on sentence 1
131 |             prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets]) == first_sent_buckets[token1]
132 |         if lpFile is not None:
133 | prob.writeLP(lpFile)
134 |
135 | prob.solve(pulp.apis.PULP_CBC_CMD(msg=0))
136 | return prob
137 |
138 | def embedding_processing(self, sent1, sent2, tokenizer, model, embed_type=False):
139 | sent1_tokens, sent2_tokens = tokenizer.tokenize(sent1), tokenizer.tokenize(sent2)
140 |
141 | if embed_type:
142 | sent1_buckets, sent2_buckets = self.tokens_to_fracdict(sent1_tokens), self.tokens_to_fracdict(sent2_tokens)
143 | sent1_embedding = model.embeddings.word_embeddings(
144 | torch.tensor(tokenizer.convert_tokens_to_ids(list(sent1_buckets.keys()))))
145 | sent2_embedding = model.embeddings.word_embeddings(
146 | torch.tensor(tokenizer.convert_tokens_to_ids(list(sent2_buckets.keys()))))
147 | else:
148 | sent1_buckets = self.tokens_to_fracdict_contextual(sent1_tokens)
149 | sent2_buckets = self.tokens_to_fracdict_contextual(sent2_tokens)
150 | sent1_id = tokenizer(sent1,return_tensors="pt")
151 | sent2_id = tokenizer(sent2,return_tensors="pt")
152 |             # [-8:-7] selects layer 17 of RoBERTa-Large
153 |             # [-4:-3] selects layer 9 of XLM-RoBERTa-Base
154 | model(sent1_id['input_ids'])
155 | sent1_embedding = torch.mean(torch.stack(self.layers[-4:-3]).squeeze(1).permute(1,0,2), dim=1)
156 | model(sent2_id['input_ids'])
157 | sent2_embedding = torch.mean(torch.stack(self.layers[-4:-3]).squeeze(1).permute(1,0,2), dim=1)
158 | self.layers.clear()
159 |
160 | if sent1_embedding.size()[0] - 2 == len(sent1_tokens):
161 | sent1_embedding = sent1_embedding[1:-1,:] # Remove bos and eos tokens
162 | if sent2_embedding.size()[0] - 2 == len(sent2_tokens):
163 | sent2_embedding = sent2_embedding[1:-1,:] # Remove bos and eos tokens
164 |
165 | assert len(sent1_buckets) + len(sent2_buckets) == (sent1_embedding.size()[0] + sent2_embedding.size()[0])
166 | return sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding
167 |
168 | def tokens_to_fracdict_contextual(_, tokens):
169 | return {token: 1/len(tokens) for token in range(len(tokens))}
170 |
171 | def tokens_to_fracdict(_, tokens):
172 |         cntdict = defaultdict(int)
173 |
174 | for token in tokens:
175 | cntdict[token] += 1
176 | totalcnt = sum(cntdict.values())
177 | return {token: float(cnt)/totalcnt for token, cnt in cntdict.items()}
178 |
179 | def get_WMD_Model(_, name):
180 | tokenizer = AutoTokenizer.from_pretrained(name)
181 | model = AutoModel.from_pretrained(name, return_dict=True)
182 | # bert_model.embeddings.word_embeddings
183 | model.eval()
184 | return tokenizer, model
185 |
186 | def layer_processing(_, model):
187 | layers = []
188 |
189 | for i in model.encoder.layer:
190 | i.register_forward_hook(lambda *args: layers.append(args[2][0]))
191 |
192 | return layers
193 |
--------------------------------------------------------------------------------
/metrics/utils/embed.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch.nn.utils.rnn import pad_sequence
3 | from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
4 | from collections import defaultdict
5 | from os.path import join, isfile
6 | from shutil import copyfileobj
7 | from urllib.request import urlretrieve
8 | from gzip import open as gopen
9 | from .language import WordTokenizer
10 | from .vecmap.map_embeddings import vecmap
11 | from .env import DATADIR
12 |
13 | fasttext_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/"
14 |
15 | def padding(arr, pad_token, dtype=torch.long):
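   |     # right-pad variable-length sequences to a common length and return the
   |     # padded tensor together with the corresponding attention mask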
16 | lens = torch.LongTensor([len(a) for a in arr])
17 | max_len = lens.max().item()
18 | padded = torch.ones(len(arr), max_len, dtype=dtype) * pad_token
19 | mask = torch.zeros(len(arr), max_len, dtype=torch.long)
20 | for i, a in enumerate(arr):
21 | padded[i, :lens[i]] = torch.tensor(a, dtype=dtype)
22 | mask[i, :lens[i]] = 1
23 | return padded, mask
24 |
25 | def collate_idf(arr, tokenize, numericalize, max_len):
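   |     # tokenize, numericalize and pad a batch of sentences; the idf dict
   |     # defaults to 1, i.e. all tokens are weighted uniformly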
26 | tokens = [["[CLS]"] + tokenize(a)[:max_len] + ["[SEP]"] for a in arr]
27 | arr = [numericalize(a) for a in tokens]
28 | idf_dict = defaultdict(lambda: 1.)
29 | idf_weights = [[idf_dict[i] for i in a] for a in arr]
30 | pad_token = numericalize(["[PAD]"])[0]
31 | padded, mask = padding(arr, pad_token, dtype=torch.long)
32 | padded_idf, _ = padding(idf_weights, pad_token, dtype=torch.float)
33 |
34 | return padded, padded_idf, mask, tokens
35 |
36 | def bert_embed(all_sens, batch_size, model, tokenizer, device):
37 | if len(all_sens) == 0:
38 | return torch.empty(0, 0, 768), torch.empty(0, 0, 1), list(), torch.empty(0, 0, 1)
39 | padded_sens, padded_idf, mask, tokens = collate_idf(all_sens, tokenizer.tokenize, tokenizer.convert_tokens_to_ids,
40 | tokenizer.max_len_single_sentence)
41 | data = TensorDataset(padded_sens, mask)
42 | sampler = SequentialSampler(data)
43 | dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size)
44 | all_embeddings = torch.zeros((len(all_sens), mask.shape[1], model.config.hidden_size))
45 |
46 | model.eval()
47 | with torch.no_grad():
48 | for batch_index, (batch_padded_sens, batch_mask) in enumerate(dataloader):
49 | pos = batch_index * batch_size
50 | batch_padded_sens = batch_padded_sens.to(device)
51 | batch_mask = batch_mask.to(device)
52 | all_embeddings[pos:pos + len(batch_mask)] = model(batch_padded_sens, batch_mask)["last_hidden_state"].cpu()
53 | return all_embeddings, padded_idf, tokens, mask.unsqueeze(-1)
54 |
55 | def map_multilingual_embeddings(src_lang, tgt_lang, batch_size, device):
56 | src_emb = get_embeddings_file(src_lang)
57 | tgt_emb = get_embeddings_file(tgt_lang)
58 |
59 | arguments = ['--batch_size', str(batch_size), '--unsupervised', src_emb, tgt_emb]
60 | if "cuda" in device:
61 | arguments.insert(0, '--cuda')
62 | return vecmap(arguments)
63 |
64 | def get_embeddings_file(lang_id):
65 | filename = f"cc.{lang_id}.300.vec"
66 | gz_filename = filename + ".gz"
67 |
68 | if isfile(join(DATADIR, filename)):
69 | return join(DATADIR, filename)
70 |
71 | urlretrieve(join(fasttext_url, gz_filename), join(DATADIR, gz_filename))
72 |
73 | with gopen(join(DATADIR, gz_filename), 'rb') as f:
74 | with open(join(DATADIR, filename), 'wb') as f_out:
75 | copyfileobj(f, f_out)
76 |
77 | return join(DATADIR, filename)
78 |
79 | def vecmap_embed(all_sents, lang_dict, lang):
80 | tokens, idf_weights, embeddings = list(), list(), list()
81 | with WordTokenizer(lang) as tokenize:
82 | for sent in all_sents:
83 | tokens.append([word for word in tokenize(sent)])
84 | idf_weights.append([1] * len(tokens[-1]))
85 | embeddings.append(torch.stack([lang_dict[word] for word in tokens[-1]]))
86 |
87 | idf_weights, mask = padding(idf_weights, 0, dtype=torch.float)
88 | embeddings = pad_sequence(embeddings, batch_first=True)
89 |
90 | return embeddings, idf_weights, tokens, mask.unsqueeze(-1)
91 |
--------------------------------------------------------------------------------
/metrics/utils/env.py:
--------------------------------------------------------------------------------
1 | from os import getenv
2 | from os.path import join
3 | from pathlib import Path
4 |
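  | # cache directory resolution order: $METRICS_HOME, then
  | # $XDG_CACHE_HOME/metrics, then ~/.cache/metrics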
5 | DATADIR = getenv("METRICS_HOME", join(getenv("XDG_CACHE_HOME", join(Path.home(), ".cache")), "metrics"))
6 | Path(DATADIR).mkdir(parents=True, exist_ok=True)
7 |
--------------------------------------------------------------------------------
/metrics/utils/knn.py:
--------------------------------------------------------------------------------
1 | from faiss import IndexFlatL2, IndexFlatIP, index_cpu_to_all_gpus, normalize_L2
2 | import numpy as np
3 |
4 | # Adapted from https://github.com/pytorch/fairseq/blob/master/examples/criss/mining/mine.py
5 | def knn_sharded(source_data, target_data, k, batch_size, device, use_cosine=False):
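  |     # k-nearest-neighbour search sharded over source and target batches so
  |     # that the FAISS index never holds more than batch_size vectors at once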
6 | if use_cosine:
7 | normalize_L2(source_data)
8 | normalize_L2(target_data)
9 | sims = []
10 | inds = []
11 | dim = source_data.shape[-1]
12 | xfrom = 0
13 |
14 | for x_batch in np.array_split(source_data, np.ceil(len(source_data) / batch_size)):
15 | yfrom = 0
16 | bsims, binds = [], []
17 | for y_batch in np.array_split(target_data, np.ceil(len(target_data) / batch_size)):
18 | neighbor_size = min(k, y_batch.shape[0])
19 | idx = IndexFlatIP(dim) if use_cosine else IndexFlatL2(dim)
20 | if device != 'cpu':
21 | idx = index_cpu_to_all_gpus(idx)
22 | idx.add(y_batch)
23 | bsim, bind = idx.search(x_batch, neighbor_size)
24 |
25 | bsims.append(bsim)
26 | binds.append(bind + yfrom)
27 | yfrom += y_batch.shape[0]
28 | del idx
29 | del y_batch
30 | bsims = np.concatenate(bsims, axis=1)
31 | binds = np.concatenate(binds, axis=1)
32 | aux = np.argsort(-bsims, axis=1)
33 | sim_batch = np.zeros((x_batch.shape[0], k), dtype=np.float32)
34 | ind_batch = np.zeros((x_batch.shape[0], k), dtype=np.int64)
35 | for i in range(x_batch.shape[0]):
36 | for j in range(k):
37 | sim_batch[i, j] = bsims[i, aux[i, j]]
38 | ind_batch[i, j] = binds[i, aux[i, j]]
39 | sims.append(sim_batch)
40 | inds.append(ind_batch)
41 | xfrom += x_batch.shape[0]
42 | del x_batch
43 | sim = np.concatenate(sims, axis=0)
44 | ind = np.concatenate(inds, axis=0)
45 | return sim, ind
46 |
47 | def score_candidates(sim_mat, candidate_inds, fwd_mean, bwd_mean):
48 | scores = np.zeros(candidate_inds.shape)
49 | for i in range(scores.shape[0]):
50 | for j in range(scores.shape[1]):
51 | k = int(candidate_inds[i, j])
52 | scores[i, j] = sim_mat[i, j] / ((fwd_mean[i] + bwd_mean[k]) / 2)
53 | return scores
54 |
55 | def ratio_margin_align(source_data, target_data, k, batch_size, device):
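  |     # margin-based mining: score each candidate by its cosine similarity
  |     # divided by the mean similarity of its k nearest neighbours in both
  |     # directions, then keep the best-scoring target for every source sentence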
56 | src2tgt_sim, src2tgt_ind = knn_sharded(source_data.numpy(), target_data.numpy(), k, batch_size, device, True)
57 | tgt2src_sim, _ = knn_sharded(target_data.numpy(), source_data.numpy(), k, batch_size, device)
58 |
59 | src2tgt_mean = src2tgt_sim.mean(axis=1)
60 | tgt2src_mean = tgt2src_sim.mean(axis=1)
61 | fwd_scores = score_candidates(src2tgt_sim, src2tgt_ind, src2tgt_mean, tgt2src_mean)
62 | fwd_best = src2tgt_ind[np.arange(src2tgt_sim.shape[0]), fwd_scores.argmax(axis=1)]
63 |
64 | return np.insert(np.expand_dims(fwd_best, 1), 0, range(len(fwd_best)), 1), fwd_scores.max(axis=1)
65 |
66 | def wcd_align(source_data, target_data, k, batch_size, device):
67 |     squared_scores, indices = knn_sharded(source_data.numpy(), target_data.numpy(), k, batch_size, device)
68 |     return indices, np.sqrt(squared_scores)
69 |
70 | def cosine_align(source_data, target_data, k, batch_size, device):
71 |     scores, indices = knn_sharded(source_data.numpy(), target_data.numpy(), k, batch_size, device, True)
72 |     return indices, scores
73 |
--------------------------------------------------------------------------------
/metrics/utils/language.py:
--------------------------------------------------------------------------------
1 | from os.path import isfile, join
2 | from fasttext import FastText, load_model
3 | from urllib.request import urlretrieve
4 | from collections import defaultdict
5 | from tempfile import mkdtemp
6 | from mosestokenizer import MosesTokenizer, MosesSentenceSplitter
7 | from Nepali_nlp import Tokenizer
8 | from sinling import SinhalaTokenizer
9 | from jieba import cut
10 | from re import findall, U
11 |
12 | class LangDetect():
13 | url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/"
14 |
15 | def __init__(self, compress=False, cache_dir=mkdtemp()):
16 | # fixes https://github.com/facebookresearch/fastText/issues/1067 for the time being
17 | FastText.eprint = lambda _: None
18 | self.cache_dir = cache_dir
19 | self.model = self.load_model("lid.176.ftz" if compress else "lid.176.bin")
20 |
21 | def load_model(self, name):
22 | target_path = join(self.cache_dir, name)
23 | if not isfile(target_path):
24 | urlretrieve(join(self.url, name), target_path)
25 | return load_model(target_path)
26 |
27 | def detect(self, texts, return_score=False):
28 | texts = [texts] if isinstance(texts, str) else texts
29 | counter = defaultdict(float)
30 |
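  |         # sum per-language confidence over all texts and return the language
  |         # with the highest aggregate score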
31 | for text in texts:
32 | labels, scores = self.model.predict(text.strip())
33 | label = labels[0].removeprefix("__label__")
34 | score = min(float(scores[0]), 1.0)
35 | counter[label] += score
36 | label, score = sorted(counter.items(), key=lambda tup: tup[1])[-1]
37 | return (label, score) if return_score else label
38 |
39 |
40 | class WordTokenizer():
41 | def __init__(self, language):
42 | if language == "si":
43 | self.tokenize = SinhalaTokenizer().tokenize
44 | # since bn and hi are related to ne and use the same script we can use the ne tokenizer for all
45 | elif language in ["ne", "bn", "hi"]:
46 | self.tokenize = Tokenizer().word_tokenize
47 | elif language == "zh":
48 | self.tokenize = lambda sent: list(cut(sent))
49 | else:
50 | # zulu and xhosa follow english punctuation
51 | self.tokenize = MosesTokenizer("en" if language in ["zu", "xh"] else language)
52 |
53 | def __call__(self, sentence):
54 | return self.tokenize(sentence)
55 |
56 | def __enter__(self):
57 | return self.tokenize
58 |
59 | def __exit__(self, *_):
60 |         if isinstance(self.tokenize, MosesTokenizer):
61 | self.tokenize.close()
62 |
63 | def __del__(self):
64 |         if isinstance(self.tokenize, MosesTokenizer):
65 | self.tokenize.close()
66 |
67 | class SentenceSplitter():
68 | def __init__(self, language):
69 | if language in ["si"]:
70 | self.split = lambda sents: SinhalaTokenizer().split_sentences(" ".join(sents))
71 | elif language in ["ne", "bn", "hi"]:
72 | self.split = lambda sents: Tokenizer().sentence_tokenize(" ".join(sents))
73 | elif language == "zh":
74 | self.split = lambda sent: self._split_chinese(sent)
75 | else:
76 | self.split = MosesSentenceSplitter("en" if language in ["zu", "xh"] else language, False)
77 |
78 | # taken from https://stackoverflow.com/a/45274695, modified regex of
79 | # http://aclweb.org/anthology/Y/Y11/Y11-1038.pdf
80 | def _split_chinese(_, sentences):
81 |         return [sent.strip() for sent in findall(r'[^!?。\.\!\?]+[!?。\.\!\?]?', "".join(sentences), flags=U)]
82 |
83 | def __call__(self, sentence):
84 | return self.split(sentence)
85 |
86 | def __enter__(self):
87 | return self.split
88 |
89 | def __exit__(self, *_):
90 |         if isinstance(self.split, MosesSentenceSplitter):
91 | self.split.close()
92 |
93 | def __del__(self):
94 |         if isinstance(self.split, MosesSentenceSplitter):
95 | self.split.close()
96 |
--------------------------------------------------------------------------------
/metrics/utils/nmt.py:
--------------------------------------------------------------------------------
1 | # Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | """
15 | Fine-tuning the library models for sequence to sequence.
16 | """
17 | # Adapted, based on https://github.com/huggingface/transformers/blob/v4.5.1/examples/seq2seq/run_translation.py
18 |
19 | import logging
20 | import os
21 | from dataclasses import dataclass, field
22 | from typing import Optional
23 | from torch.utils.data import DataLoader
24 | from .env import DATADIR
25 |
26 | from datasets import load_dataset
27 |
28 | from transformers import (
29 | AutoConfig,
30 | AutoModelForSeq2SeqLM,
31 | AutoTokenizer,
32 | DataCollatorForSeq2Seq,
33 | HfArgumentParser,
34 | MBartTokenizer,
35 | MBartTokenizerFast,
36 | MBart50Tokenizer,
37 | MBart50TokenizerFast,
38 | Seq2SeqTrainer,
39 | Seq2SeqTrainingArguments,
40 | default_data_collator,
41 | set_seed,
42 | )
43 | from transformers.trainer_utils import get_last_checkpoint
44 | from transformers.utils import check_min_version
45 |
46 |
47 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
48 | check_min_version("4.5.0")
49 |
50 | logger = logging.getLogger(__name__)
51 |
52 | language2mBART = {
53 | "ar": "ar_AR", "cs": "cs_CZ", "de": "de_DE", "en": "en_XX", "es": "es_XX",
54 | "et": "et_EE", "fi": "fi_FI", "fr": "fr_XX", "gu": "gu_IN", "hi": "hi_IN",
55 | "it": "it_IT", "ja": "ja_XX", "kk": "kk_KZ", "ko": "ko_KR", "lt": "lt_LT",
56 | "lv": "lv_LV", "my": "my_MM", "ne": "ne_NP", "nl": "nl_XX", "ro": "ro_RO",
57 | "ru": "ru_RU", "si": "si_LK", "tr": "tr_TR", "vi": "vi_VN", "zh": "zh_CN" }
58 |
59 | language2mBART50 = language2mBART | {
60 | "af": "af_ZA", "az": "az_AZ", "bn": "bn_IN", "fa": "fa_IR", "he": "he_IL",
61 | "hr": "hr_HR", "id": "id_ID", "ka": "ka_GE", "km": "km_KH", "mk": "mk_MK",
62 | "ml": "ml_IN", "mn": "mn_MN", "mr": "mr_IN", "pl": "pl_PL", "ps": "ps_AF",
63 | "pt": "pt_XX", "sv": "sv_SE", "sw": "sw_KE", "ta": "ta_IN", "te": "te_IN",
64 | "th": "th_TH", "tl": "tl_XX", "uk": "uk_UA", "ur": "ur_PK", "xh": "xh_ZA",
65 | "gl": "gl_ES", "sl": "sl_SI", "zu": "xh_ZA"} # zulu and xhosa are related, so it should be fine
66 |
67 | @dataclass
68 | class ModelArguments:
69 | """
70 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from.
71 | """
72 |
73 | model_name_or_path: str = field(
74 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
75 | )
76 | cache_dir: Optional[str] = field(
77 | default=None,
78 | metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"},
79 | )
80 | use_fast_tokenizer: bool = field(
81 | default=True,
82 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
83 | )
84 |
85 |
86 | @dataclass
87 | class DataTrainingArguments:
88 | """
89 | Arguments pertaining to what data we are going to input our model for training and eval.
90 | """
91 |
92 | source_lang: str = field(default=None, metadata={"help": "Source language id for translation."})
93 | target_lang: str = field(default=None, metadata={"help": "Target language id for translation."})
94 |
95 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a jsonlines)."})
96 | overwrite_cache: bool = field(
97 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
98 | )
99 | preprocessing_num_workers: Optional[int] = field(
100 | default=None,
101 | metadata={"help": "The number of processes to use for the preprocessing."},
102 | )
103 | max_source_length: Optional[int] = field(
104 | default=1024,
105 | metadata={
106 | "help": "The maximum total input sequence length after tokenization. Sequences longer "
107 | "than this will be truncated, sequences shorter will be padded."
108 | },
109 | )
110 | max_target_length: Optional[int] = field(
111 | default=128,
112 | metadata={
113 | "help": "The maximum total sequence length for target text after tokenization. Sequences longer "
114 | "than this will be truncated, sequences shorter will be padded."
115 | },
116 | )
117 | pad_to_max_length: bool = field(
118 | default=False,
119 | metadata={
120 | "help": "Whether to pad all samples to model maximum sentence length. "
121 | "If False, will pad the samples dynamically when batching to the maximum length in the batch. More "
122 | "efficient on GPU but very bad for TPU."
123 | },
124 | )
125 | max_train_samples: Optional[int] = field(
126 | default=None,
127 | metadata={
128 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
129 | "value if set."
130 | },
131 | )
132 | num_beams: Optional[int] = field(
133 | default=None,
134 | metadata={
135 | "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, "
136 | "which is used during ``evaluate`` and ``predict``."
137 | },
138 | )
139 | ignore_pad_token_for_loss: bool = field(
140 | default=True,
141 | metadata={
142 | "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not."
143 | },
144 | )
145 | source_prefix: Optional[str] = field(
146 | default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."}
147 | )
148 |
149 | def __post_init__(self):
150 | if self.train_file is None:
151 | raise ValueError("Need a training file.")
152 | elif self.source_lang is None or self.target_lang is None:
153 | raise ValueError("Need to specify the source language and the target language.")
154 |
155 | if self.train_file is not None:
156 | extension = self.train_file.split(".")[-1]
157 | assert extension == "json", "`train_file` should be a json file."
158 |
159 | def load_model_and_tokenizer(model_name_or_path, source_lang, target_lang, use_fast_tokenizer, cache_dir):
160 | # Load pretrained model and tokenizer
161 | config = AutoConfig.from_pretrained(
162 | model_name_or_path,
163 | cache_dir=cache_dir,
164 | )
165 | tokenizer = AutoTokenizer.from_pretrained(
166 | model_name_or_path,
167 | cache_dir=cache_dir,
168 | use_fast=use_fast_tokenizer,
169 | )
170 | model = AutoModelForSeq2SeqLM.from_pretrained(
171 | model_name_or_path,
172 | from_tf=bool(".ckpt" in model_name_or_path),
173 | config=config,
174 | cache_dir=cache_dir,
175 | )
176 |
177 | # Set decoder_start_token_id
178 | if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)):
179 | if isinstance(tokenizer, MBartTokenizer):
180 | model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang]
181 | else:
182 | model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_lang)
183 |
184 | if model.config.decoder_start_token_id is None:
185 | raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined")
186 |
187 | # For translation we set the codes of our source and target languages (only useful for mBART, the others will
188 | # ignore those attributes).
189 | if isinstance(tokenizer, (MBartTokenizer, MBart50Tokenizer, MBartTokenizerFast, MBart50TokenizerFast)):
190 | tokenizer.src_lang = source_lang
191 | tokenizer.tgt_lang = target_lang
192 |
193 | # For multilingual translation model mBART-50 we need to force the target
194 | # language token as the first generated token.
195 | if isinstance(tokenizer, (MBart50Tokenizer, MBart50TokenizerFast)):
196 | model.config.forced_bos_token_id = tokenizer.lang_code_to_id[target_lang]
197 |
198 | return model, tokenizer
199 |
200 | def _train(args=None):
201 | # See all possible arguments in src/transformers/training_args.py
202 | # or by passing the --help flag to this script.
203 | # We now keep distinct sets of args, for a cleaner separation of concerns.
204 |
205 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments))
206 | model_args, data_args, training_args = parser.parse_args_into_dataclasses(args)
207 |
208 | # Detecting last checkpoint.
209 | last_checkpoint = None
210 | if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir:
211 | last_checkpoint = get_last_checkpoint(training_args.output_dir)
212 |
213 |
214 | if os.path.isfile(os.path.join(training_args.output_dir, 'config.json')):
215 | logger.info(
216 | f"Output directory ({training_args.output_dir}) exists already and is not empty. "
217 | "Skipping training and returning pretrained models."
218 | )
219 | return load_model_and_tokenizer(training_args.output_dir, data_args.source_lang,
220 | data_args.target_lang, model_args.use_fast_tokenizer, model_args.cache_dir)
221 | elif last_checkpoint is not None:
222 | logger.info(
223 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
224 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
225 | )
226 |
227 | # Set seed before initializing model.
228 | set_seed(training_args.seed)
229 |
230 | # For translation, only JSON files are supported, with one field named "translation" containing two keys for the
231 | # source and target languages (unless you adapt what follows).
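   |     # e.g. a JSON lines file where each record looks like (hypothetical data):
   |     #   {"translation": {"de": "Ein Haus.", "en": "A house."}}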
232 | data_files = {}
233 | data_files["train"] = data_args.train_file
234 | extension = data_args.train_file.split(".")[-1]
235 | datasets = load_dataset(extension, data_files=data_files, download_mode="force_redownload")
236 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
237 | # https://huggingface.co/docs/datasets/loading_datasets.html
238 |
239 | model, tokenizer = load_model_and_tokenizer(model_args.model_name_or_path, data_args.source_lang,
240 | data_args.target_lang, model_args.use_fast_tokenizer, model_args.cache_dir)
241 |
242 | # Get the language codes for input/target.
243 | source_lang = data_args.source_lang.split("_")[0]
244 | target_lang = data_args.target_lang.split("_")[0]
245 |
246 | # Temporarily set max_target_length for training.
247 | max_target_length = data_args.max_target_length
248 | padding = "max_length" if data_args.pad_to_max_length else False
249 |
250 | if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
251 |         logger.warning(
252 |             "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for "
253 |             f"`{model.__class__.__name__}`. This will lead to the loss being calculated twice and will take up more memory."
254 |         )
255 |
256 | def preprocess_function(examples):
257 | prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
258 | inputs = [ex[source_lang] for ex in examples["translation"]]
259 | targets = [ex[target_lang] for ex in examples["translation"]]
260 | inputs = [prefix + inp for inp in inputs]
261 | model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
262 |
263 | # Setup the tokenizer for targets
264 | with tokenizer.as_target_tokenizer():
265 | labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
266 |
267 | # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
268 | # padding in the loss.
269 | if padding == "max_length" and data_args.ignore_pad_token_for_loss:
270 | labels["input_ids"] = [
271 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
272 | ]
273 |
274 | model_inputs["labels"] = labels["input_ids"]
275 | return model_inputs
276 |
277 | train_dataset = datasets["train"]
278 | if data_args.max_train_samples is not None:
279 | train_dataset = train_dataset.select(range(data_args.max_train_samples))
280 | train_dataset = train_dataset.map(
281 | preprocess_function,
282 | batched=True,
283 | num_proc=data_args.preprocessing_num_workers,
284 | remove_columns=datasets["train"].column_names,
285 | load_from_cache_file=not data_args.overwrite_cache,
286 | )
287 |
288 | # Data collator
289 | label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
290 | if data_args.pad_to_max_length:
291 | data_collator = default_data_collator
292 | else:
293 | data_collator = DataCollatorForSeq2Seq(
294 | tokenizer,
295 | model=model,
296 | label_pad_token_id=label_pad_token_id,
297 | pad_to_multiple_of=8 if training_args.fp16 else None,
298 | )
299 | # Initialize our Trainer
300 | trainer = Seq2SeqTrainer(
301 | model=model,
302 | args=training_args,
303 | train_dataset=train_dataset,
304 | tokenizer=tokenizer,
305 | data_collator=data_collator,
306 | )
307 |
308 | # Training
309 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint)
310 | trainer.save_model(training_args.output_dir) # Saves the tokenizer too
311 |
312 | metrics = train_result.metrics
313 | max_train_samples = (
314 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset)
315 | )
316 | metrics["train_samples"] = min(max_train_samples, len(train_dataset))
317 |
318 | trainer.log_metrics("train", metrics)
319 | trainer.save_state()
320 |
321 | return load_model_and_tokenizer(training_args.output_dir, data_args.source_lang,
322 | data_args.target_lang, model_args.use_fast_tokenizer, model_args.cache_dir)
323 |
324 | def train(model, source_lang, target_lang, dataset, overwrite, suffix, name=None):
325 | if "mbart" in model:
326 | lookup = language2mBART50 if "50" in model else language2mBART
327 | source_lang = lookup[source_lang]
328 | target_lang = lookup[target_lang]
329 | args = [
330 | "--model_name_or_path", model,
331 | "--cache_dir", os.path.join(DATADIR, "translation", name or os.path.basename(model), suffix, "cache"),
332 | "--output_dir", os.path.join(DATADIR, "translation", name or os.path.basename(model), suffix, "output"),
333 | "--source_lang", source_lang,
334 | "--target_lang", target_lang,
335 | "--train_file", dataset,
336 | "--report_to", "none",
337 | "--save_strategy", "epoch",
338 | "--per_device_train_batch_size", "4", "--do_train"]
339 | if overwrite:
340 | args.append("--overwrite_output_dir")
341 |
342 | return _train(args)
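   |
   | # usage sketch (hypothetical paths; for mBART models the plain language codes are
   | # looked up in language2mBART/language2mBART50, e.g. "de" -> "de_DE"):
   | # model, tokenizer = train("facebook/mbart-large-50", "de", "en",
   | #                          "pseudo-parallel.json", overwrite=False, suffix="de-en")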
343 |
344 | def translate(model, tokenizer, sentences, batch_size, device):
345 | translated = list()
346 | for batch in DataLoader(sentences, batch_size=batch_size):
347 | inputs = tokenizer(batch, return_tensors="pt", padding=True)
348 | inputs = {k: v.to(device) for k, v in inputs.items()}
349 | translated_tokens = model.generate(**inputs, decoder_start_token_id=model.config.decoder_start_token_id)
350 | translated.extend(tokenizer.batch_decode(translated_tokens.cpu(), skip_special_tokens=True))
351 | return translated
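   |
   | # e.g. (assuming a CUDA device is available):
   | # hypotheses = translate(model.to("cuda"), tokenizer, ["Ein Test."], batch_size=8, device="cuda")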
352 |
353 | if __name__ == "__main__":
354 | _train()
355 |
--------------------------------------------------------------------------------
/metrics/utils/perplexity.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 | from torch import tensor
3 |
4 | def lm_perplexity(hyps, device, name="gpt2"):
5 | if name is None:
6 | return [0] * len(hyps)
7 |
8 | # Some models need a special tokenizer, like chinese gpt2, see here:
9 | # https://huggingface.co/ckiplab/gpt2-base-chinese
10 | model_name, tokenizer_name = (name, name) if isinstance(name, str) else name
11 |
12 |     model = AutoModelForCausalLM.from_pretrained(model_name).to(device)  # AutoModelWithLMHead is deprecated
13 | tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
14 |
15 | scores = list()
16 | model.eval()
17 | for hyp in hyps:
18 | tokenize_input = tokenizer.tokenize(hyp)
19 |
20 | if len(tokenize_input) <= 1:
21 | scores.append(0)
22 | else:
23 |             if len(tokenize_input) > 1024:  # truncate to the context window (1024 tokens for GPT-2)
24 | tokenize_input = tokenize_input[:1024]
25 |
26 | input_ids = tensor([tokenizer.convert_tokens_to_ids(tokenize_input)]).to(device)
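   |             # the first output element is the mean negative log-likelihood of the
   |             # sequence, so its negation serves as a fluency score (higher is better)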
27 | score = model(input_ids, labels=input_ids)[0]
28 | scores.append(-score.item())
29 |
30 | return scores
31 |
--------------------------------------------------------------------------------
/metrics/utils/remap.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from itertools import chain
4 | from subprocess import check_output, DEVNULL
5 | from tempfile import NamedTemporaryFile as TempFile
6 | from simalign import SentenceAligner
7 | from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
8 | from .env import DATADIR
9 |
10 | def convert_sent_to_input(sents, tokenizer, max_seq_length):
11 | input_ids = []
12 | mask = []
13 | for sent in sents:
14 | ids = tokenizer.convert_tokens_to_ids(sent)
15 |         mask.append([1] * (len(ids) + 2) + [0] * (max_seq_length - len(ids)))  # +2 for [CLS] and [SEP]
16 |         input_ids.append([101] + ids + [102] + [0] * (max_seq_length - len(ids)))  # 101/102 are BERT's [CLS]/[SEP] ids
17 | return torch.tensor(input_ids, dtype=torch.long), torch.tensor(mask, dtype=torch.long)
18 |
19 | def convert_words_to_bpe(sent_pairs, tokenizer):
20 | bpe_para, bpe_table = [], []
21 |
22 | for (src_sent, tgt_sent) in sent_pairs:
23 | src_bpe_table, tgt_bpe_table = [], []
24 | src_sent_bpe, tgt_sent_bpe = [], []
25 |
26 | for word in src_sent:
27 | token = tokenizer.tokenize(word)
28 | word2bpe_map = []
29 | for i in range(len(token)):
30 | word2bpe_map.append(len(src_sent_bpe)+i)
31 | src_sent_bpe.extend(token)
32 | src_bpe_table.append(word2bpe_map)
33 |
34 | for word in tgt_sent:
35 | token = tokenizer.tokenize(word)
36 | word2bpe_map = []
37 | for i in range(len(token)):
38 | word2bpe_map.append(len(tgt_sent_bpe)+i)
39 | tgt_sent_bpe.extend(token)
40 | tgt_bpe_table.append(word2bpe_map)
41 |
42 | bpe_para.append([src_sent_bpe, tgt_sent_bpe])
43 | bpe_table.append([src_bpe_table, tgt_bpe_table])
44 |
45 | return bpe_para, bpe_table
46 |
47 |
48 | def get_aligned_features_avgbpe(sent_pairs, align_pairs, model,
49 | tokenizer, batch_size, device, layer=12, max_seq_length=175):
50 | bpe_para, bpe_table = convert_words_to_bpe(sent_pairs, tokenizer)
51 |
52 | # filter long/empty sentences
53 | fltr_src_bpe, fltr_tgt_bpe, fltr_align_pairs, fltr_bpe_table, align_cnt = [], [], [], [], 0
54 | for cnt, (src, tgt) in enumerate(bpe_para):
55 | if len(src) <= max_seq_length and len(tgt) <= max_seq_length and len(src) > 0 and len(tgt) > 0:
56 | fltr_src_bpe.append(src)
57 | fltr_tgt_bpe.append(tgt)
58 | fltr_align_pairs.append(align_pairs[cnt])
59 | fltr_bpe_table.append(bpe_table[cnt])
60 | align_cnt += len(align_pairs[cnt])
61 |
62 | src_input, src_mask = convert_sent_to_input(fltr_src_bpe, tokenizer, max_seq_length)
63 | tgt_input, tgt_mask = convert_sent_to_input(fltr_tgt_bpe, tokenizer, max_seq_length)
64 |
65 | src_data = TensorDataset(src_input, src_mask)
66 | src_sampler = SequentialSampler(src_data)
67 | src_dataloader = DataLoader(src_data, sampler=src_sampler, batch_size=batch_size)
68 |
69 | tgt_data = TensorDataset(tgt_input, tgt_mask)
70 | tgt_sampler = SequentialSampler(tgt_data)
71 | tgt_dataloader = DataLoader(tgt_data, sampler=tgt_sampler, batch_size=batch_size)
72 |
73 | src_embed = []
74 | tgt_embed = []
75 |
76 | model.eval()
77 | with torch.no_grad():
78 | for batch in src_dataloader:
79 | input_ids, input_mask = batch
80 | input_ids = input_ids.to(device)
81 | input_mask = input_mask.to(device)
82 |
83 | hidden_state = model(input_ids, attention_mask=input_mask)["hidden_states"][layer]
84 | src_embed.append(hidden_state[:,1:].cpu().numpy()) # remove CLS
85 |
86 | with torch.no_grad():
87 | for batch in tgt_dataloader:
88 | input_ids, input_mask = batch
89 | input_ids = input_ids.to(device)
90 | input_mask = input_mask.to(device)
91 |
92 | hidden_state = model(input_ids, attention_mask=input_mask)["hidden_states"][layer]
93 | tgt_embed.append(hidden_state[:,1:].cpu().numpy())
94 |
95 | src_embed = np.concatenate(src_embed)
96 | tgt_embed = np.concatenate(tgt_embed)
97 |
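   |     # average subword embeddings into word vectors and collect one row per aligned word pair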
98 | feature_size = src_embed.shape[2]
99 | cnt, src_matrix, tgt_matrix = 0, np.zeros((align_cnt, feature_size)), np.zeros((align_cnt, feature_size))
100 | for i, pairs in enumerate(fltr_align_pairs):
101 | for a in pairs:
102 |             if len(fltr_bpe_table[i][0][a[0]]) > 0 and len(fltr_bpe_table[i][1][a[1]]) > 0:  # both words must map to at least one subword
103 | src_word_avg_embed = np.zeros((1, feature_size))
104 |
105 | for j in fltr_bpe_table[i][0][a[0]]:
106 | src_word_avg_embed += src_embed[i][j,:]
107 | src_matrix[cnt,:] = src_word_avg_embed / len(fltr_bpe_table[i][0][a[0]])
108 |
109 | tgt_word_avg_embed = np.zeros((1, feature_size))
110 | for j in fltr_bpe_table[i][1][a[1]]:
111 | tgt_word_avg_embed += tgt_embed[i][j,:]
112 |
113 | tgt_matrix[cnt,:] = tgt_word_avg_embed / len(fltr_bpe_table[i][1][a[1]])
114 | cnt += 1
115 |
116 | return src_matrix, tgt_matrix
117 |
118 | def fast_align(sent_pairs, tokenizer, size, max_seq_length=100):
119 | tokenized_pairs = list()
120 | for source_sent, target_sent in sent_pairs:
121 | sent1 = tokenizer.basic_tokenizer.tokenize(source_sent)
122 | sent2 = tokenizer.basic_tokenizer.tokenize(target_sent)
123 |
124 | if 0 < len(sent1) <= max_seq_length and 0 < len(sent2) <= max_seq_length:
125 | tokenized_pairs.append((sent1, sent2))
126 |
127 | if len(tokenized_pairs) >= size:
128 | break
129 |
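   |     # align in both directions with fast_align, then symmetrize with atools (grow-diag-final-and)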
130 | with TempFile(dir=DATADIR, buffering=0) as fwd_file, TempFile(dir=DATADIR, buffering=0) as bwd_file:
131 | for file_, data, flags in ((fwd_file, tokenized_pairs, "-dov"), (bwd_file, tokenized_pairs, "-dovr")):
132 | file_.write("\n".join([f'{" ".join(src)} ||| {" ".join(tgt)}'.lower() for src, tgt in data]).encode())
133 | asym_aligned = check_output(["fast_align", "-i", file_.name, flags], stderr=DEVNULL)
134 | file_.seek(0)
135 | file_.truncate()
136 | file_.write(asym_aligned)
137 |
138 | sym_aligned = check_output(["atools", "-i", fwd_file.name, "-j", bwd_file.name, "-c", "grow-diag-final-and"])
139 |
140 | sym_aligned = [[tuple(map(int, pair.split(b"-"))) for pair in pairs.split()] for pairs in sym_aligned.splitlines()]
141 | return tokenized_pairs, sym_aligned
142 |
143 | def awesome_align(sent_pairs, model, tokenizer, size, device, projection=None, max_seq_length=100):
144 |     tokenized_pairs, alignments = list(), list()
145 |     for src, tgt in sent_pairs:
146 | sent_src, sent_tgt = tokenizer.basic_tokenizer.tokenize(src), tokenizer.basic_tokenizer.tokenize(tgt)
147 | if 0 < len(sent_src) <= max_seq_length and 0 < len(sent_tgt) <= max_seq_length:
148 | token_src = [tokenizer.tokenize(word) for word in sent_src]
149 | token_tgt = [tokenizer.tokenize(word) for word in sent_tgt]
150 | wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src]
151 | wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt]
152 | ids_src = tokenizer.prepare_for_model(list(chain(*wid_src)), return_tensors='pt', truncation=True)['input_ids']
153 | ids_tgt = tokenizer.prepare_for_model(list(chain(*wid_tgt)), return_tensors='pt', truncation=True)['input_ids']
154 | sub2word_map_src = []
155 | for i, word_list in enumerate(token_src):
156 | sub2word_map_src.extend([i] * len(word_list))
157 | sub2word_map_tgt = []
158 | for i, word_list in enumerate(token_tgt):
159 | sub2word_map_tgt.extend([i] * len(word_list))
160 |
161 | # alignment
162 | align_layer = 8
163 | threshold = 1e-3
164 | model.eval()
165 | with torch.no_grad():
166 | out_src = model(ids_src.unsqueeze(0).to(device))["hidden_states"][align_layer]
167 | out_tgt = model(ids_tgt.unsqueeze(0).to(device))["hidden_states"][align_layer]
168 |
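   |                 # optionally remap the source embeddings: a 2-d projection is a CLP
   |                 # matrix, a 1-d projection is the UMD direction to be removed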
169 | if projection is not None:
170 | projection = projection.to(device)
171 | if projection.ndim == 2: # CLP
172 | out_src = torch.matmul(out_src, projection)
173 | else: # UMD
174 | out_src = out_src - (out_src * projection).sum(2, keepdim=True) * \
175 | projection.repeat(out_src.shape[0], out_src.shape[1], 1)
176 |
177 | dot_prod = torch.matmul(out_src[0, 1:-1].cpu(), out_tgt[0, 1:-1].transpose(-1, -2).cpu())
178 |
179 | softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod)
180 | softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod)
181 |
182 | softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold)
183 |
184 | align_subwords = torch.nonzero(softmax_inter, as_tuple=False)
185 | align_words = set()
186 | for i, j in align_subwords:
187 | align_words.add((sub2word_map_src[i], sub2word_map_tgt[j]))
188 |
189 | tokenized_pairs.append((sent_src, sent_tgt))
190 | alignments.append(list(align_words))
191 |
192 | if len(tokenized_pairs) >= size:
193 | break
194 |
195 | return tokenized_pairs, alignments
196 |
197 | def sim_align(sent_pairs, tokenizer, size, device, max_seq_length=100):
198 | tokenized_pairs, alignments = list(), list()
199 | aligner = SentenceAligner(matching_methods="i", token_type="word", device=device)
200 | for source_sent, target_sent in sent_pairs:
201 | sent1 = tokenizer.basic_tokenizer.tokenize(source_sent)
202 | sent2 = tokenizer.basic_tokenizer.tokenize(target_sent)
203 |
204 | if 0 < len(sent1) <= max_seq_length and 0 < len(sent2) <= max_seq_length:
205 | tokenized_pairs.append((sent1, sent2))
206 | alignments.append(aligner.get_word_aligns(sent1, sent2)["itermax"])
207 |
208 | if len(tokenized_pairs) >= size:
209 | break
210 |
211 | return tokenized_pairs, alignments
212 |
213 | def clp(x, z, orthogonal=True):  # cross-lingual linear projection from x onto z
214 |     if orthogonal:
215 |         u, _, vt = np.linalg.svd(z.T.dot(x))
216 |         w = vt.T.dot(u.T)  # orthogonal Procrustes solution minimizing ||x @ w - z||
217 |     else:
218 |         x_pseudoinv = np.linalg.inv(x.T.dot(x)).dot(x.T)
219 |         w = x_pseudoinv.dot(z)  # unconstrained least-squares solution of x @ w ≈ z
220 |     return torch.Tensor(w)
221 |
222 | def umd(x, z):  # dominant mismatch direction between the two embedding spaces (see awesome_align)
223 |     *_, v = np.linalg.svd(x - z)
224 |     v_b = v[0]  # first right-singular vector of the pairwise differences
225 |     return torch.Tensor(v_b)
226 |
--------------------------------------------------------------------------------
/metrics/utils/vecmap/.gitignore:
--------------------------------------------------------------------------------
1 | data/
2 | output/
3 | private/
4 |
5 | *~
6 | .DS_Store
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | .idea/
11 |
--------------------------------------------------------------------------------
/metrics/utils/vecmap/README.md:
--------------------------------------------------------------------------------
1 | VecMap (cross-lingual word embedding mappings)
2 | ==============
3 |
4 | This is an open source implementation of our framework to learn cross-lingual word embedding mappings, described in the following papers:
5 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2018. **[A robust self-learning method for fully unsupervised cross-lingual mappings of word embeddings](https://aclweb.org/anthology/P18-1073)**. In *Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*.
6 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2018. **[Generalizing and improving bilingual word embedding mappings with a multi-step framework of linear transformations](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16935/16781)**. In *Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18)*, pages 5012-5019.
7 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2017. **[Learning bilingual word embeddings with (almost) no bilingual data](https://aclweb.org/anthology/P17-1042)**. In *Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*, pages 451-462.
8 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2016. **[Learning principled bilingual mappings of word embeddings while preserving monolingual invariance](https://aclweb.org/anthology/D16-1250)**. In *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*, pages 2289-2294.
9 |
10 | The package includes a script to build cross-lingual word embeddings with or without parallel data as described in the papers, as well as evaluation tools for word translation induction, word similarity/relatedness and word analogy.
11 |
12 | If you use this software for academic research, [please cite the relevant paper(s)](#publications).
13 |
14 |
15 | Requirements
16 | --------
17 | - Python 3
18 | - NumPy
19 | - SciPy
20 | - CuPy (optional, only required for CUDA support)
21 |
22 |
23 | Usage
24 | --------
25 |
26 | In order to build your own cross-lingual word embeddings, you should first train monolingual word embeddings for each language using your favorite tool (e.g. [word2vec](https://github.com/tmikolov/word2vec) or [fasttext](https://github.com/facebookresearch/fastText)) and then map them to a common space with our software as described below. Having done that, you can evaluate the resulting cross-lingual embeddings using our included tools as discussed next.
27 |
28 | #### Mapping
29 |
30 | The mapping software offers 4 main modes with our recommended settings for different scenarios:
31 |
32 | - **Supervised** (recommended if you have a large training dictionary):
33 | ```
34 | python3 map_embeddings.py --supervised TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
35 | ```
36 | - **Semi-supervised** (recommended if you have a small seed dictionary):
37 | ```
38 | python3 map_embeddings.py --semi_supervised TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
39 | ```
40 | - **Identical** (recommended if you have no seed dictionary but can rely on identical words):
41 | ```
42 | python3 map_embeddings.py --identical SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
43 | ```
44 | - **Unsupervised** (recommended if you have no seed dictionary and do not want to rely on identical words):
45 | ```
46 | python3 map_embeddings.py --unsupervised SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
47 | ```
48 |
49 | `SRC.EMB` and `TRG.EMB` refer to the input monolingual embeddings, which should be in the word2vec text format, whereas `SRC_MAPPED.EMB` and `TRG_MAPPED.EMB` refer to the output cross-lingual embeddings. The training dictionary `TRAIN.DICT`, if any, should be given as a text file with one entry per line (source word + whitespace + target word).
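   |
   | For example, a German-English `TRAIN.DICT` might start like this (hypothetical entries):
   | ```
   | hund dog
   | haus house
   | ```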
50 |
51 | If you have an NVIDIA GPU, append the `--cuda` flag to the above commands to make things faster.
52 |
53 | For most users, the above settings should suffice. Choosing the right mode should be straightforward given the resources available: as a general rule, prefer the mode with the highest level of supervision your resources allow, and try different variants in case of doubt.
54 |
55 | In addition to these recommended modes, the software also offers additional options to adjust different aspects of the mapping method as described in the papers. While most users should not need to deal with those, you can learn more about them by running the tool with the `--help` flag. You can either use one of the recommended modes and modify a few options on top of it, or skip the recommended modes entirely and set all options yourself. In fact, if you dig into the code, you will see that the above modes simply set recommended defaults for all the different options.
56 |
57 | #### Evaluation
58 |
59 | You can evaluate your mapped embeddings in bilingual lexicon extraction (aka dictionary induction or word translation) as follows:
60 | ```
61 | python3 eval_translation.py SRC_MAPPED.EMB TRG_MAPPED.EMB -d TEST.DICT
62 | ```
63 | The above command uses standard nearest neighbor retrieval by default. For best results, it is recommended that you use CSLS retrieval instead:
64 | ```
65 | python3 eval_translation.py SRC_MAPPED.EMB TRG_MAPPED.EMB -d TEST.DICT --retrieval csls
66 | ```
67 | While better, CSLS is also significantly slower than nearest neighbor, so do not forget to append the `--cuda` flag to the above command if you have an NVIDIA GPU.
68 |
69 | In addition to bilingual lexicon extraction, you can also evaluate your mapped embeddings in cross-lingual word similarity as follows:
70 | ```
71 | python3 eval_similarity.py -l --backoff 0 SRC_MAPPED.EMB TRG_MAPPED.EMB -i TEST_SIMILARITY.TXT
72 | ```
73 |
74 | Finally, we also offer an evaluation tool for monolingual word analogies, which mimics the one included with word2vec but should run significantly faster:
75 | ```
76 | python3 eval_analogy.py -l SRC_MAPPED.EMB -i TEST_ANALOGIES.TXT -t 30000
77 | ```
78 |
79 |
80 | Dataset
81 | --------
82 | You can use the following script to download the main dataset used in our papers, which is an extension of that of [Dinu et al. (2014)](http://clic.cimec.unitn.it/~georgiana.dinu/down/):
83 | ```
84 | ./get_data.sh
85 | ```
86 |
87 |
88 | Reproducing results
89 | --------
90 |
91 | While we always recommend using the above settings for best results when working with your own embeddings, we also offer additional modes to replicate the systems from our different papers as follows:
92 | - **ACL 2018** (currently equivalent to the unsupervised mode):
93 | ```
94 | python3 map_embeddings.py --acl2018 SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
95 | ```
96 | - **AAAI 2018** (currently equivalent to the supervised mode, except for minor differences in re-weighting, normalization and dimensionality reduction):
97 | ```
98 | python3 map_embeddings.py --aaai2018 TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
99 | ```
100 | - **ACL 2017** (superseded by our ACL 2018 system; offers 2 modes depending on the initialization):
101 | ```
102 | python3 map_embeddings.py --acl2017 SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
103 | python3 map_embeddings.py --acl2017_seed TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
104 | ```
105 | - **EMNLP 2016** (superseded by our AAAI 2018 system):
106 | ```
107 | python3 map_embeddings.py --emnlp2016 TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
108 | ```
109 |
110 |
111 | FAQ
112 | --------
113 |
114 | ##### How long does training take?
115 |
116 | - The supervised mode (`--supervised`) should run in around 2 minutes on either CPU or GPU.
117 | - The rest of the recommended modes (`--semi_supervised`, `--identical` or `--unsupervised`) should run in around 5 hours on CPU, or 10 minutes on GPU (Titan Xp or similar).
118 |
119 |
120 | ##### This is running much slower for me! What can I do?
121 |
122 | 1. If you have a GPU, do not forget the `--cuda` flag.
123 | 2. Make sure that your NumPy installation is properly linked to BLAS/LAPACK. This is particularly important if you are working on CPU, as it can have a huge impact on performance if not properly set up.
124 | 3. There are different settings that affect the execution time of the algorithm and can thus be adjusted to make things faster: the batch size (`--batch_size`), the vocabulary cutoff (`--vocabulary_cutoff`), the stochastic dictionary induction settings (`--stochastic_initial`, `--stochastic_multiplier` and `--stochastic_interval`) and the convergence threshold (`--threshold`), among others. However, most of these settings will have a direct impact on the quality of the resulting embeddings, so you should not play with them unless you really know what you are doing.
125 |
126 |
127 | ##### Prior versions of this software included nice scripts to reproduce the exact same results reported in your papers. Why are those missing now?
128 |
129 | As the complexity of the software (and the number of publications/results to reproduce) increased, maintaining those scripts became very tedious. Moreover, with the inclusion of CUDA support and FP32 precision, minor numerical variations in the underlying computations, magnified by self-learning, made it infeasible to reproduce the exact same results on different platforms (e.g. the exact same command is likely to produce slightly different output on CPU and GPU). The effect on the final results is nonetheless negligible: the observed variations are around 0.1-0.2 accuracy points.
130 |
131 | Instead, we now provide an [easy interface to run all the systems proposed in our different papers](#reproducing-results). We think this might be even more useful than the previous approach: the most skeptical user should still be able to easily verify our results, while the same interface makes it simple to test our different systems on other datasets.
132 |
133 |
134 | ##### The ablation test in your ACL 2018 paper reports 0% accuracies for removing CSLS, but I am getting better results. Why is that?
135 |
136 | After publishing the paper, we discovered a bug in the code that was causing those 0% accuracies. Now that the bug is fixed, the effect of removing CSLS is not that dramatic, although it still has a large negative impact. At the same time, the effect of removing the bidirectional dictionary induction in that same ablation test is slightly smaller than reported.
137 |
138 |
139 | See also
140 | --------
141 |
142 | VecMap is a basic building block of [Monoses](https://github.com/artetxem/monoses), our Unsupervised Statistical Machine Translation system. You can use them in combination to train your own machine translation model from monolingual corpora alone.
143 |
144 |
145 | Publications
146 | --------
147 |
148 | If you use this software for academic research, please cite the relevant paper(s) as follows (in case of doubt, please cite the ACL 2018 paper, or the AAAI 2018 paper if you use the supervised mode):
149 | ```
150 | @inproceedings{artetxe2018acl,
151 | author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
152 | title = {A robust self-learning method for fully unsupervised cross-lingual mappings of word embeddings},
153 | booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
154 | year = {2018},
155 | pages = {789--798}
156 | }
157 |
158 | @inproceedings{artetxe2018aaai,
159 | author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
160 | title = {Generalizing and improving bilingual word embedding mappings with a multi-step framework of linear transformations},
161 | booktitle = {Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence},
162 | year = {2018},
163 | pages = {5012--5019}
164 | }
165 |
166 | @inproceedings{artetxe2017acl,
167 | author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
168 | title = {Learning bilingual word embeddings with (almost) no bilingual data},
169 | booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
170 | year = {2017},
171 | pages = {451--462}
172 | }
173 |
174 | @inproceedings{artetxe2016emnlp,
175 | author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
176 | title = {Learning principled bilingual mappings of word embeddings while preserving monolingual invariance},
177 | booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
178 | year = {2016},
179 | pages = {2289--2294}
180 | }
181 | ```
182 |
183 |
184 | License
185 | -------
186 |
187 | Copyright (C) 2016-2018, Mikel Artetxe
188 |
189 | Licensed under the terms of the GNU General Public License, either version 3 or (at your option) any later version. A full copy of the license can be found in LICENSE.txt.
190 |
--------------------------------------------------------------------------------
/metrics/utils/vecmap/cupy_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Mikel Artetxe