├── .gitignore
├── LICENSE
├── README.md
├── experiments
│   ├── README.md
│   ├── combine.py
│   ├── comparison.py
│   ├── contrast.py
│   ├── distil.py
│   ├── ensemble.py
│   ├── finetune.py
│   ├── lm.py
│   ├── nmt.py
│   ├── parallel.py
│   ├── quantity.py
│   ├── remap.py
│   ├── sentsim.py
│   └── vecmap.py
├── metrics
│   ├── common.py
│   ├── contrastscore.py
│   ├── distilscore.py
│   ├── marginscore.py
│   ├── sentsim.py
│   ├── utils
│   │   ├── dataset.py
│   │   ├── embed.py
│   │   ├── env.py
│   │   ├── knn.py
│   │   ├── language.py
│   │   ├── nmt.py
│   │   ├── perplexity.py
│   │   ├── remap.py
│   │   ├── vecmap
│   │   │   ├── .gitignore
│   │   │   ├── LICENSE.txt
│   │   │   ├── README.md
│   │   │   ├── cupy_utils.py
│   │   │   ├── embeddings.py
│   │   │   └── map_embeddings.py
│   │   └── wmd.py
│   ├── vecmapscore.py
│   └── xmoverscore
│       ├── __init__.py
│       ├── align.py
│       └── embed.py
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
98 | __pypackages__/
99 |
100 | # Celery stuff
101 | celerybeat-schedule
102 | celerybeat.pid
103 |
104 | # SageMath parsed files
105 | *.sage.py
106 |
107 | # Environments
108 | .env
109 | .venv
110 | env/
111 | venv/
112 | ENV/
113 | env.bak/
114 | venv.bak/
115 |
116 | # Spyder project settings
117 | .spyderproject
118 | .spyproject
119 |
120 | # Rope project settings
121 | .ropeproject
122 |
123 | # mkdocs documentation
124 | /site
125 |
126 | # mypy
127 | .mypy_cache/
128 | .dmypy.json
129 | dmypy.json
130 |
131 | # Pyre type checker
132 | .pyre/
133 |
134 | # pytype static type analyzer
135 | .pytype/
136 |
137 | # Cython debug symbols
138 | cython_debug/
139 |
140 | # files specific to this project
141 | data/
142 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Unsupervised Metrics: UScore & Friends
2 | [Unsupervised-Metrics](https://github.com/potamides/unsupervised-metrics) is a
3 | Python library which allows researchers and developers alike to experiment with
4 | state-of-the-art evaluation metrics for machine translation. The focus lies on
5 | reference-free, unsupervised metrics, which do not make use of supervision (parallel
6 | data, references, human scores) in any way. However, wrappers around some (weakly-)supervised metrics like
7 | [XMoverScore](https://aclanthology.org/2020.acl-main.151) and
8 | [SentSim](https://aclanthology.org/2021.naacl-main.252) are provided for
9 | convenience.
10 |
11 | ### Implemented Papers
12 |
13 | * [UScore: An Effective Approach to Fully Unsupervised Evaluation Metrics for Machine Translation](https://aclanthology.org/2023.eacl-main.27/)
14 | * [On the Limitations of Cross-lingual Encoders as Exposed by Reference-Free Machine Translation Evaluation](https://aclanthology.org/2020.acl-main.151)
15 | * [SentSim: Crosslingual Semantic Evaluation of Machine Translation](https://aclanthology.org/2021.naacl-main.252)
16 |
17 |
18 | ## Installation
19 | If you want to use this project as a library, you can install it as a regular
20 | package with [pip](https://pip.pypa.io/en/stable):
21 | ```sh
22 | pip install 'git+https://github.com/potamides/unsupervised-metrics.git#egg=metrics'
23 | ```
24 | If your goal is to run the included [experiments](experiments) (e.g. to
25 | replicate the results of UScore), clone the repository and install it in
26 | editable mode:
27 | ```sh
28 | git clone https://github.com/potamides/unsupervised-metrics
29 | pip install -e unsupervised-metrics[experiments]
30 | ```
31 | If you want to use [fast-align](https://github.com/clab/fast_align), follow its
32 | installation instructions and make sure that the `fast_align` and `atools` programs
33 | are on your `PATH`. This dependency is optional.
34 |
35 | ## Usage
36 |
37 | ### Train an existing metric
38 | One focus of this library is to make it easy to fine-tune existing
39 | state-of-the-art metrics for arbitrary language pairs and domains.
40 | A simple example is provided in the code block below. For more involved
41 | examples, and for ways to instantiate a pre-trained metric, take a look at
42 | the [experiments](experiments).
43 |
44 | ```python
45 | from metrics.contrastscore import ContrastScore
46 | from metrics.utils.dataset import DatasetLoader
47 |
48 | src_lang, tgt_lang = "de", "en"
49 |
50 | dataset = DatasetLoader(src_lang, tgt_lang)
51 | # instantiate ContrastScore and enable parallel training on multiple GPUs
52 | scorer = ContrastScore(source_language=src_lang, target_language=tgt_lang, parallelize=True)
53 | # train the underlying language model on pseudo-parallel sentence pairs
54 | scorer.train(*dataset.load("monolingual-train"))
55 |
56 | # print correlations with human judgments
57 | print("Pearson's r: {}, Spearman's ρ: {}".format(*scorer.correlation(*dataset.load("scored"))))
58 | ```
59 |
60 | ### Create your own metric
61 | This library can also be used as a framework to create new metrics, as
62 | demonstrated in the code block below. Existing metrics are defined in the
63 | [metrics](metrics) package, which could serve as a source of inspiration.
64 |
65 | ```python
66 | from metrics.common import CommonScore
67 |
68 | class MyOwnMetric(CommonScore):
69 |     def align(self, source_sents, target_sents):
70 |         """
71 |         This method receives a list of sentences in the source language and a
72 |         list of sentences in the target language as parameters and returns
73 |         a list of pseudo-aligned sentence pairs.
74 |         """
75 |
76 |     def _embed(self, source_sents, target_sents):
77 |         """
78 |         This method receives a list of sentences in the source language and a
79 |         list of sentences in the target language as parameters and returns
80 |         their embeddings, inverse document frequencies, tokens and padding
81 |         masks.
82 |         """
83 |
84 |     def score(self, source_sents, target_sents):
85 |         """
86 |         This method receives a list of sentences in the source language and a
87 |         list of sentences in the target language as parameters, which are
88 |         assumed to be aligned according to their index. For each sentence pair
89 |         a similarity score is computed and the list of scores is returned.
90 |         """
91 | ```
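Once the three methods are implemented, such a metric can be evaluated exactly like the built-in ones. A minimal sketch, reusing the `DatasetLoader` interface from the training example above and assuming `MyOwnMetric` can be instantiated without arguments:

```python
from metrics.utils.dataset import DatasetLoader

src_lang, tgt_lang = "de", "en"
dataset = DatasetLoader(src_lang, tgt_lang)
scorer = MyOwnMetric()

# print correlations with human judgments, as with any other metric
print("Pearson's r: {}, Spearman's ρ: {}".format(*scorer.correlation(*dataset.load("scored"))))
```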
90 | """ 91 | ``` 92 | 93 | ## Acknowledgments 94 | This library is based on the following projects: 95 | * [ACL20-Reference-Free-MT-Evaluation](https://github.com/AIPHES/ACL20-Reference-Free-MT-Evaluation) 96 | * [Unsupervised-crosslingual-Compound-Method-For-MT](https://github.com/Rain9876/Unsupervised-crosslingual-Compound-Method-For-MT) 97 | * [Seq2Seq examples](https://github.com/huggingface/transformers/tree/v4.5.1/examples/seq2seq) of [transformers](https://github.com/huggingface/transformers) 98 | * [VecMap](https://github.com/artetxem/vecmap) 99 | * [CRISS](https://github.com/pytorch/fairseq/tree/master/examples/criss) 100 | 101 | ## Citation 102 | If you like/use our work, please [cite](https://aclanthology.org/2023.eacl-main.27.bib) as follows: 103 | 104 | ```bibtex 105 | @inproceedings{belouadi-eger-2023-uscore, 106 | title = "{US}core: An Effective Approach to Fully Unsupervised Evaluation Metrics for Machine Translation", 107 | author = "Belouadi, Jonas and 108 | Eger, Steffen", 109 | booktitle = "Proceedings of the 17th Conference of the European Chapter of the Association for Computational Linguistics", 110 | month = may, 111 | year = "2023", 112 | address = "Dubrovnik, Croatia", 113 | publisher = "Association for Computational Linguistics", 114 | url = "https://aclanthology.org/2023.eacl-main.27", 115 | pages = "358--374", 116 | } 117 | ``` 118 | -------------------------------------------------------------------------------- /experiments/README.md: -------------------------------------------------------------------------------- 1 | # Experiments 2 | This directory contains experiments which were conducted during the 3 | master-thesis *Self-Learning for Unsupervised Evaluation Metrics* and published 4 | in the paper *UScore: An Effective Approach to Fully Unsupervised Evaluation 5 | Metrics for Machine Translation*. By default the experiments train the used 6 | models from scratch, since it is difficult to distribute all created model 7 | files due to storage limitations. If you need the original model files due to 8 | reproducability reasons, please contact the maintainers of this repository. 9 | Created files are cached in 10 | `${METRICS_HOME:-${XDG_CACHE_HOME:-~/.cache}/metrics}`, so training and 11 | pre-processing only happens once. Please be careful when interrupting a running 12 | process, as created files are not yet checked for their integrity. 13 | 14 | Also please bear in mind, that most models were trained on beefy workstations 15 | like the [NVIDIA DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100). 16 | The majority of experiments require considerably less resources, but in this 17 | case out-of-memory errors are to be expected. Model inference is of course less 18 | resource intensive. 19 | 20 | ## Included Experiments 21 | When coming from UScore, the `uscore_tests` in `comparison.py` contain 22 | most of the experiments present in the paper. Please note that the names used 23 | for the metrics are slightly different from the ones used in the paper. 24 | 25 | * `remap.py` Remap XMoverScore on pseudo-parallel sentences. 26 | * `quantity.py` Remap XMoverScore on pseudo-parallel sentences mined from different amounts of monolingual data. 27 | * `vecmap.py` Use XMoverScore and mean-pooling metrics with [VecMap](https://github.com/artetxem/vecmap) embeddings. 28 | * `nmt.py` Combine XMoverScore with an unsupervised NMT model. 29 | * `lm.py` Combine XMoverScore with an unsupervised NMT model and a language model of the target language. 
14 | Also, please bear in mind that most models were trained on beefy workstations
15 | like the [NVIDIA DGX A100](https://www.nvidia.com/en-us/data-center/dgx-a100).
16 | The majority of experiments require considerably fewer resources, but on weaker
17 | hardware out-of-memory errors are to be expected. Model inference is of course less
18 | resource-intensive.
19 |
20 | ## Included Experiments
21 | If you are coming from UScore, the `uscore_tests` function in `comparison.py` covers
22 | most of the experiments presented in the paper. Please note that the names used
23 | for the metrics are slightly different from the ones used in the paper. A sketch of how to run an experiment follows the list below.
24 |
25 | * `remap.py` Remap XMoverScore on pseudo-parallel sentences.
26 | * `quantity.py` Remap XMoverScore on pseudo-parallel sentences mined from different amounts of monolingual data.
27 | * `vecmap.py` Use XMoverScore and mean-pooling metrics with [VecMap](https://github.com/artetxem/vecmap) embeddings.
28 | * `nmt.py` Combine XMoverScore with an unsupervised NMT model.
29 | * `lm.py` Combine XMoverScore with an unsupervised NMT model and a language model of the target language.
30 | * `distil.py` Create distilled cross-lingual sentence embeddings using pseudo-parallel sentences.
31 | * `contrast.py` Create cross-lingual sentence embeddings using a contrastive learning objective.
32 | * `combine.py` Combine word embeddings with sentence embeddings in a single metric (XMoverScore + ContrastScore).
33 | * `comparison.py` Compare all self-learned metrics with strong baselines on multiple language directions and datasets.
34 | * `finetune.py` Fine-tune induced self-learned metrics on small parallel corpora.
35 | * `parallel.py` Create distilled and contrastive cross-lingual sentence embeddings only on parallel data.
36 | * `ensemble.py` Ensemble SentSim with XMoverScore + ContrastScore.
37 | * `sentsim.py` Try to reproduce the scores of the SentSim metrics.
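As mentioned above, every experiment is a self-contained script. Assuming the editable install with the `[experiments]` extra from the top-level README, running one could look like this (the output file name is just an example):

```sh
python experiments/comparison.py | tee comparison-results.txt
```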
{iteration}.") 49 | contrast.suffix = f"{max_len}-{iteration}" 50 | contrast.train(mono_src, mono_tgt, overwrite=False) 51 | 52 | wmd_scores = xmover.score(eval_src, eval_system) 53 | contrast_scores = contrast.score(eval_src, eval_system) 54 | 55 | for weight in linspace(1, 0, 11): 56 | pearson, spearman = correlation([weight * x + (1 - weight) * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores) 57 | rmse, mae = error([weight * x + (1 - weight) * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores) 58 | results["pearson"].append(round(100 * pearson, 2)) 59 | results["spearman"].append(round(100 * spearman, 2)) 60 | results["rmse"].append(round(rmse, 2)) 61 | results["mae"].append(round(mae, 2)) 62 | 63 | return tabulate(results, headers="keys", showindex=index) 64 | 65 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 66 | print("Maximum Sentence Length: 30", combine_tests(), sep="\n") 67 | print("Maximum Sentence Length: 50", combine_tests(max_len=50), sep="\n") 68 | -------------------------------------------------------------------------------- /experiments/comparison.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from collections import defaultdict 3 | from collections import defaultdict 4 | import logging 5 | 6 | from numpy import argsort, corrcoef 7 | from tabulate import tabulate 8 | 9 | from comet import download_model, load_from_checkpoint 10 | from datasets import load_metric 11 | from metrics.contrastscore import ContrastScore 12 | from metrics.distilscore import DistilScore 13 | from metrics.sentsim import SentSim 14 | from metrics.utils.dataset import DatasetLoader 15 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore, XMoverScore 16 | from torch.cuda import is_available as cuda_is_available 17 | from transquest.algo.sentence_level.monotransquest.run_model import MonoTransQuestModel 18 | 19 | newstest2016 = [ 20 | ("de", "en"), 21 | ("en", "ru"), 22 | ("ru", "en"), 23 | ("ro", "en"), 24 | ("cs", "en"), 25 | ("fi", "en"), 26 | ("tr", "en"), 27 | ] 28 | newstest2017 = [ 29 | ("cs", "en"), 30 | ("de", "en"), 31 | ("fi", "en"), 32 | ("lv", "en"), 33 | ("ru", "en"), 34 | ("tr", "en"), 35 | ("zh", "en"), 36 | ] 37 | mlqe = [ 38 | ("en", "de"), 39 | ("en", "zh"), 40 | ("ru", "en"), 41 | ("ro", "en"), 42 | ("et", "en"), 43 | ("ne", "en"), 44 | ("si", "en"), 45 | ] 46 | eval4nlp = [ 47 | (src, tgt) 48 | for src, tgt in [("de", "zh"), ("ru", "de")] 49 | if DatasetLoader(src, tgt).has_eval4nlp_access() 50 | ] 51 | mqm = [("en", "de"), ("zh", "en")] 52 | 53 | lm_model = defaultdict( 54 | lambda: None, 55 | en="gpt2", 56 | ru="sberbank-ai/rugpt3small_based_on_gpt2", 57 | de="dbmdz/german-gpt2", 58 | zh="uer/gpt2-chinese-cluecorpussmall", 59 | ) 60 | 61 | remap_iterations = 1 62 | nmt_iterations = 1 63 | contrast_iterations = 6 64 | 65 | 66 | def correlation(model_scores, ref_scores): 67 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(model_scores).argsort() 68 | return corrcoef(ref_scores, model_scores)[0, 1], corrcoef(ref_ranks, ranks)[0, 1] 69 | 70 | 71 | def comet_tests(source_lang, target_lang, dataset_name): 72 | model = load_from_checkpoint(download_model("wmt20-comet-qe-da")) 73 | dataset = DatasetLoader(source_lang, target_lang) 74 | results, index = defaultdict(list), ["Comet-QE"] 75 | 76 | eval_src, eval_system, eval_scores = dataset.load(dataset_name) 77 | data = [{"src": src, "mt": system} for src, system 
in zip(eval_src, eval_system)] 78 | scores, _ = model.predict(data, batch_size=8, gpus=cuda_is_available() and 1 or 0) 79 | 80 | pearson, spearman = correlation(scores, eval_scores) 81 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 82 | results["pearson"].append(round(100 * pearson, 2)) 83 | results["spearman"].append(round(100 * spearman, 2)) 84 | 85 | return tabulate(results, headers="keys", showindex=index) 86 | 87 | 88 | def transquest_tests(source_lang, target_lang, dataset_name): 89 | transquest_models = { 90 | ("ro", "en"): "TransQuest/monotransquest-da-ro_en-wiki", 91 | ("et", "en"): "TransQuest/monotransquest-da-et_en-wiki", 92 | ("ne", "en"): "TransQuest/monotransquest-da-ne_en-wiki", 93 | ("si", "en"): "TransQuest/monotransquest-da-si_en-wiki", 94 | ("ru", "en"): "TransQuest/monotransquest-da-ru_en-reddit_wikiquotes", 95 | ("en", "de"): "TransQuest/monotransquest-da-en_de-wiki", 96 | ("en", "zh"): "TransQuest/monotransquest-da-en_zh-wiki", 97 | ("en", None): "TransQuest/monotransquest-da-en_any", 98 | (None, "en"): "TransQuest/monotransquest-da-any_en", 99 | (None, None): "TransQuest/monotransquest-da-multilingual", 100 | } 101 | 102 | if (source_lang, target_lang) in transquest_models: 103 | model_name = transquest_models[(source_lang, target_lang)] 104 | elif (source_lang, None) in transquest_models: 105 | model_name = transquest_models[(source_lang, None)] 106 | elif (None, target_lang) in transquest_models: 107 | model_name = transquest_models[(None, target_lang)] 108 | else: 109 | model_name = transquest_models[(None, None)] 110 | 111 | model = MonoTransQuestModel( 112 | "xlmroberta", model_name, num_labels=1, use_cuda=cuda_is_available() 113 | ) 114 | dataset = DatasetLoader(source_lang, target_lang) 115 | eval_src, eval_system, eval_scores = dataset.load(dataset_name) 116 | results, index = defaultdict(list), ["MonoTransQuest"] 117 | scores, _ = model.predict(list(map(list, zip(eval_src, eval_system)))) 118 | 119 | pearson, spearman = correlation(scores, eval_scores) 120 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 121 | results["pearson"].append(round(100 * pearson, 2)) 122 | results["spearman"].append(round(100 * spearman, 2)) 123 | 124 | return tabulate(results, headers="keys", showindex=index) 125 | 126 | 127 | def bleu_test(source_lang, target_lang, dataset_name): 128 | metric = load_metric("sacrebleu") 129 | dataset = DatasetLoader(source_lang, target_lang, return_references=True) 130 | _, eval_ref, eval_system, eval_scores = dataset.load(dataset_name) 131 | results, index = defaultdict(list), ["BLEU"] 132 | 133 | scores = list() 134 | for system, ref in zip(eval_system, eval_ref): 135 | scores.append(metric.compute(predictions=[system], references=[[ref]])["score"]) 136 | pearson, spearman = correlation(scores, eval_scores) 137 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 138 | results["pearson"].append(round(100 * pearson, 2)) 139 | results["spearman"].append(round(100 * spearman, 2)) 140 | 141 | return tabulate(results, headers="keys", showindex=index) 142 | 143 | 144 | def xmoverscore_tests(source_lang, target_lang, dataset_name, mapping="UMD"): 145 | scorer = XMoverScore( 146 | mapping=mapping, use_lm=True, lm_model_name=lm_model[target_lang] 147 | ) 148 | dataset = DatasetLoader(source_lang, target_lang) 149 | eval_src, eval_system, eval_scores = dataset.load(dataset_name) 150 | results, index = defaultdict(list), [f"XMoverScore ({mapping})"] 151 | 152 | try: 153 | scorer.remap(source_lang, target_lang) 154 | except 
ValueError: 155 | results["pearson"].append("-") 156 | results["spearman"].append("-") 157 | else: 158 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 159 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 160 | results["pearson"].append(round(100 * pearson, 2)) 161 | results["spearman"].append(round(100 * spearman, 2)) 162 | 163 | return tabulate(results, headers="keys", showindex=index) 164 | 165 | 166 | def sentsim_tests(source_lang, target_lang, dataset_name, word_metric="BERTScore"): 167 | scorer = SentSim(use_wmd=word_metric == "WMD") 168 | dataset = DatasetLoader(source_lang, target_lang) 169 | eval_src, eval_system, eval_scores = dataset.load(dataset_name) 170 | results, index = defaultdict(list), [f"SentSim ({word_metric})"] 171 | 172 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 173 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 174 | results["pearson"].append(round(100 * pearson, 2)) 175 | results["spearman"].append(round(100 * spearman, 2)) 176 | 177 | return tabulate(results, headers="keys", showindex=index) 178 | 179 | 180 | def distilscore_tests(source_lang, target_lang, dataset_name): 181 | scorer = DistilScore( 182 | student_model_name="xlm-r-bert-base-nli-stsb-mean-tokens", 183 | source_language=source_lang, 184 | target_language=target_lang, 185 | student_is_pretrained=True, 186 | suffix="1", 187 | ) 188 | dataset = DatasetLoader(source_lang, target_lang) 189 | eval_src, eval_system, eval_scores = dataset.load(dataset_name) 190 | results, index = defaultdict(list), ["DistilScore"] 191 | 192 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 193 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 194 | results["pearson"].append(round(100 * pearson, 2)) 195 | results["spearman"].append(round(100 * spearman, 2)) 196 | 197 | return tabulate(results, headers="keys", showindex=index) 198 | 199 | 200 | def uscore_tests(source_lang, target_lang, dataset_name, max_len=30): 201 | xmover = XMoverNMTLMBertAlignScore( 202 | src_lang=source_lang, 203 | tgt_lang=target_lang, 204 | lm_weights=[1, 0.1], 205 | nmt_weights=[0.5, 0.4], 206 | use_lm=True, 207 | lm_model_name=lm_model[target_lang], 208 | ) 209 | contrast = ContrastScore( 210 | source_language=source_lang, target_language=target_lang, parallelize=True 211 | ) 212 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len) 213 | mono_src, mono_tgt = dataset.load("monolingual-align") 214 | train_src, train_tgt = dataset.load("monolingual-train") 215 | eval_src, eval_system, eval_scores = dataset.load(dataset_name) 216 | suffix = f"{source_lang}-{target_lang}-awesome-wmd-{xmover.mapping}-monolingual-align-{xmover.k}-{xmover.remap_size}-{40000}-{max_len}" 217 | results, index = defaultdict(list), [ 218 | f"UScore (WRD) ({max_len} tokens)", 219 | f"UScore (SNT) ({max_len} tokens)", 220 | f"UScore (WRD) + UScore (SNT) ({max_len} tokens)", 221 | ] 222 | 223 | logging.info("Evaluating UScore (WRD)") 224 | for iteration in range(1, remap_iterations + 1): 225 | logging.info(f"Remapping iteration {iteration}.") 226 | xmover.remap( 227 | mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False 228 | ) 229 | for iteration in range(nmt_iterations): 230 | logging.info(f"NMT training iteration {iteration}.") 231 | xmover.train( 232 | train_src, 233 | train_tgt, 234 | suffix=suffix + f"-{remap_iterations}", 235 | iteration=iteration, 236 | overwrite=False, 237 | k=1, 238 | ) 239 | 240 | 
pearson, spearman = xmover.correlation(eval_src, eval_system, eval_scores) 241 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 242 | results["pearson"].append(round(100 * pearson, 2)) 243 | results["spearman"].append(round(100 * spearman, 2)) 244 | 245 | logging.info("Evaluating UScore (SNT)") 246 | for iteration in range(1, contrast_iterations + 1): 247 | logging.info(f"Contrastive Learning iteration {iteration}.") 248 | contrast.suffix = f"{max_len}-{iteration}" 249 | contrast.train(train_src, train_tgt, overwrite=False) 250 | 251 | pearson, spearman = contrast.correlation(eval_src, eval_system, eval_scores) 252 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 253 | results["pearson"].append(round(100 * pearson, 2)) 254 | results["spearman"].append(round(100 * spearman, 2)) 255 | 256 | logging.info("Evaluating UScore (WRD) + UScore (SNT)") 257 | wmd_scores, contrast_scores = xmover.score(eval_src, eval_system), contrast.score( 258 | eval_src, eval_system 259 | ) 260 | pearson, spearman = correlation( 261 | [0.6 * x + 0.4 * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores 262 | ) 263 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 264 | results["pearson"].append(round(100 * pearson, 2)) 265 | results["spearman"].append(round(100 * spearman, 2)) 266 | 267 | return tabulate(results, headers="keys", showindex=index) 268 | 269 | 270 | logging.basicConfig( 271 | level=logging.INFO, 272 | datefmt="%m-%d %H:%M", 273 | format="%(asctime)s %(levelname)-8s %(message)s", 274 | ) 275 | ref_datasets = ( 276 | ("Newstest-2016", "scored", newstest2016), 277 | ("Newstest-2017", "scored-wmt17", newstest2017), 278 | ("MQM-Newstest-2020", "scored-mqm", mqm), 279 | ) 280 | datasets = ref_datasets + ( 281 | ("MLQE-PE", "scored-mlqe", mlqe), 282 | ("Eval4NLP-2021", "scored-eval4nlp", eval4nlp), 283 | ) 284 | for dataset, identifier, pairs in datasets: 285 | for source_lang, target_lang in pairs: 286 | print(f"Evaluating {source_lang}-{target_lang} language direction on {dataset}") 287 | print(uscore_tests(source_lang, target_lang, identifier, max_len=30)) 288 | print(uscore_tests(source_lang, target_lang, identifier, max_len=50)) 289 | print(xmoverscore_tests(source_lang, target_lang, identifier, mapping="UMD")) 290 | print(xmoverscore_tests(source_lang, target_lang, identifier, mapping="CLP")) 291 | print(sentsim_tests(source_lang, target_lang, identifier, word_metric="BERTScore")) 292 | print(sentsim_tests(source_lang, target_lang, identifier, word_metric="WMD")) 293 | print(distilscore_tests(source_lang, target_lang, identifier)) 294 | print(transquest_tests(source_lang, target_lang, identifier)) 295 | print(comet_tests(source_lang, target_lang, identifier)) 296 | 297 | for dataset, identifier, pairs in ref_datasets: 298 | for source_lang, target_lang in pairs: 299 | print(bleu_test(source_lang, target_lang, identifier)) 300 | -------------------------------------------------------------------------------- /experiments/contrast.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.contrastscore import ContrastScore 3 | from collections import defaultdict 4 | from tabulate import tabulate 5 | from metrics.utils.dataset import DatasetLoader 6 | import logging 7 | 8 | source_lang, target_lang = "de", "en" 9 | iterations = 10 10 | 11 | def contrastive_tests(max_len=30, model="xlm-roberta-base"): 12 | scorer = ContrastScore(model_name=model, source_language=source_lang, 
target_language=target_lang, parallelize=True) 13 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len) 14 | eval_src, eval_system, eval_scores = dataset.load("scored") 15 | parallel_src, parallel_tgt = dataset.load("parallel") 16 | results, index = defaultdict(list), list(range(iterations + 1)) 17 | 18 | logging.info("Evaluating performance before training.") 19 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 20 | precision = scorer.precision(parallel_src, parallel_tgt) 21 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores) 22 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}") 23 | results["pearson"].append(round(100 * pearson, 2)) 24 | results["spearman"].append(round(100 * spearman, 2)) 25 | results["precision"].append(round(100 * precision, 2)) 26 | results["rmse"].append(round(rmse, 2)) 27 | results["mae"].append(round(mae, 2)) 28 | 29 | mono_src, mono_tgt = dataset.load("monolingual-train") 30 | 31 | for iteration in range(1, iterations + 1): 32 | logging.info(f"Training iteration {iteration}.") 33 | scorer.suffix = f"{max_len}-{iteration}" 34 | scorer.train(mono_src, mono_tgt, overwrite=False) 35 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 36 | precision = scorer.precision(parallel_src, parallel_tgt) 37 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores) 38 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}") 39 | results["pearson"].append(round(100 * pearson, 2)) 40 | results["spearman"].append(round(100 * spearman, 2)) 41 | results["precision"].append(round(100 * precision, 2)) 42 | results["rmse"].append(round(rmse, 2)) 43 | results["mae"].append(round(mae, 2)) 44 | 45 | return tabulate(results, headers="keys", showindex=index) 46 | 47 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 48 | print("Contrastive learning with XLM-R", contrastive_tests(max_len=30), contrastive_tests(max_len=50), sep="\n") 49 | print("Contrastive learning with mBERT", contrastive_tests(max_len=30, model="bert-base-multilingual-cased"), sep="\n") 50 | -------------------------------------------------------------------------------- /experiments/distil.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.distilscore import DistilScore 3 | from collections import defaultdict 4 | from tabulate import tabulate 5 | from metrics.utils.dataset import DatasetLoader 6 | import logging 7 | 8 | source_lang, target_lang = "de", "en" 9 | iterations = 5 10 | 11 | def distil_tests(model="xlm-roberta-base"): 12 | scorer = DistilScore(student_model_name=model, source_language=source_lang, target_language=target_lang, suffix="1") 13 | dataset = DatasetLoader(source_lang, target_lang) 14 | eval_src, eval_system, eval_scores = dataset.load("scored") 15 | parallel_src, parallel_tgt = dataset.load("parallel") 16 | results, index = defaultdict(list), list(range(iterations + 1)) 17 | 18 | logging.info("Evaluating performance before distilling.") 19 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 20 | precision = scorer.precision(parallel_src, parallel_tgt) 21 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores) 22 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}") 23 | 
results["pearson"].append(round(100 * pearson, 2)) 24 | results["spearman"].append(round(100 * spearman, 2)) 25 | results["precision"].append(round(100 * precision, 2)) 26 | results["rmse"].append(round(rmse, 2)) 27 | results["mae"].append(round(mae, 2)) 28 | 29 | parallel_src, parallel_tgt = dataset.load("parallel") 30 | mono_src, mono_tgt = dataset.load("monolingual-train") 31 | 32 | for iteration in range(1, iterations + 1): 33 | logging.info(f"Training iteration {iteration}.") 34 | scorer.suffix = str(iteration) 35 | scorer.train(mono_src, mono_tgt, dev_source_sents=parallel_src, dev_target_sents=parallel_tgt, overwrite=False) 36 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 37 | precision = scorer.precision(parallel_src, parallel_tgt) 38 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores) 39 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}") 40 | results["pearson"].append(round(100 * pearson, 2)) 41 | results["spearman"].append(round(100 * spearman, 2)) 42 | results["precision"].append(round(100 * precision, 2)) 43 | results["rmse"].append(round(rmse, 2)) 44 | results["mae"].append(round(mae, 2)) 45 | 46 | return tabulate(results, headers="keys", showindex=index) 47 | 48 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 49 | print("Results using XLM-R as student:", distil_tests(), sep="\n") 50 | print("Results using mBERT as student:", distil_tests(model="bert-base-multilingual-cased"), sep="\n") 51 | -------------------------------------------------------------------------------- /experiments/ensemble.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore 3 | from metrics.contrastscore import ContrastScore 4 | from metrics.sentsim import SentSim 5 | from metrics.utils.dataset import DatasetLoader 6 | from collections import defaultdict 7 | from tabulate import tabulate 8 | from numpy import corrcoef, argsort 9 | import logging 10 | 11 | mlqe = [("en", "de"), ("en", "zh"), ("ru", "en"), ("ro", "en"), ("et", "en"), ("ne", "en"), ("si", "en")] 12 | lm_model = {"en": "gpt2", "ru": "sberbank-ai/rugpt3small_based_on_gpt2", "de": "dbmdz/german-gpt2", 13 | "zh": "uer/gpt2-chinese-cluecorpussmall"} 14 | 15 | remap_iterations = 1 16 | nmt_iterations = 1 17 | contrast_iterations = 6 18 | 19 | def correlation(model_scores, ref_scores): 20 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(model_scores).argsort() 21 | return corrcoef(ref_scores, model_scores)[0,1], corrcoef(ref_ranks, ranks)[0,1] 22 | 23 | def xmover_contrast_combine(source_lang, target_lang, max_len=30): 24 | xmover = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, lm_weights=[1, 0.1], 25 | nmt_weights=[0.5, 0.4], use_lm=source_lang!="ru", lm_model_name=lm_model[target_lang], translate_batch_size=4) 26 | contrast = ContrastScore(source_language=source_lang, target_language=target_lang, parallelize=True) 27 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len) 28 | mono_src, mono_tgt = dataset.load("monolingual-align") 29 | train_src, train_tgt = dataset.load("monolingual-train") 30 | para_src, para_tgt = dataset.load("nepali" if "ne" in [source_lang, target_lang] else "wikimatrix", 200000) 31 | suffix = 
f"{source_lang}-{target_lang}-awesome-wmd-{xmover.mapping}-monolingual-align-{xmover.k}-{xmover.remap_size}-{40000}-{max_len}" 32 | 33 | for iteration in range(1, remap_iterations + 1): 34 | logging.info(f"Remapping iteration {iteration}.") 35 | xmover.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False) 36 | if source_lang != "ru": 37 | for iteration in range(nmt_iterations): 38 | logging.info(f"NMT training iteration {iteration}.") 39 | xmover.train(train_src, train_tgt, suffix=suffix+f"-{remap_iterations}", iteration=iteration, overwrite=False, k=1) 40 | for iteration in range(1, contrast_iterations + 1): 41 | logging.info(f"Contrastive Learning iteration {iteration}.") 42 | contrast.suffix = f"{max_len}-{iteration}" 43 | contrast.train(train_src, train_tgt, overwrite=False) 44 | 45 | xmover.mapping = "CLP" 46 | xmover.remap(para_src, para_tgt, suffix=suffix.replace("UMD", "CLP") + f"-finetuned-200000", aligned=True, overwrite=False) 47 | if source_lang != "ru": 48 | xmover.train(para_src, para_tgt, suffix=suffix + f"-finetuned-200000", iteration=iteration, aligned=True, 49 | finetune=True, overwrite=False, k=1) 50 | contrast.suffix = f"{max_len}-finetuned-200000" 51 | contrast.train(para_src, para_tgt, aligned=True, finetune=True, overwrite=False) 52 | 53 | return lambda src, sys: [0.6 * x + 0.4 * y for x, y in zip(xmover.score(src, sys), contrast.score(src, sys))] 54 | 55 | def tests(source_lang, target_lang, dataset_name, max_len=30): 56 | xcontrast = xmover_contrast_combine(source_lang, target_lang, max_len) 57 | sentsim = SentSim() 58 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len) 59 | eval_src, eval_system, eval_scores = dataset.load(dataset_name) 60 | results, index = defaultdict(list), ["SentSim", f"XMover + Contrast ({max_len} tokens)", f"Ensemble ({max_len} tokens)"] 61 | 62 | for score in [sentsim.score, xcontrast, lambda src, sys:[0.5 * x + 0.5 * y for x, y in zip(xcontrast(src, sys), sentsim.score(src, sys))]]: 63 | pearson, spearman = correlation(score(eval_src, eval_system), eval_scores) 64 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 65 | results["pearson"].append(round(100 * pearson, 2)) 66 | results["spearman"].append(round(100 * spearman, 2)) 67 | 68 | return tabulate(results, headers="keys", showindex=index) 69 | 70 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 71 | for source_lang, target_lang in mlqe: 72 | print(f"Evaluating {source_lang}-{target_lang} language direction on MLQE-PE.") 73 | print(tests(source_lang, target_lang, "scored-mlqe", max_len=30)) 74 | print(tests(source_lang, target_lang, "scored-mlqe", max_len=50)) 75 | -------------------------------------------------------------------------------- /experiments/finetune.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore 3 | from metrics.contrastscore import ContrastScore 4 | from metrics.utils.dataset import DatasetLoader 5 | from collections import defaultdict 6 | from tabulate import tabulate 7 | from numpy import corrcoef, argsort 8 | import logging 9 | 10 | mlqe = [("en", "de"), ("en", "zh"), ("ru", "en"), ("ro", "en"), ("et", "en"), ("ne", "en"), ("si", "en")] 11 | lm_model = {"en": "gpt2", "de": "dbmdz/german-gpt2", "zh": "uer/gpt2-chinese-cluecorpussmall"} 12 | 13 | remap_iterations = 1 14 | nmt_iterations = 1 15 | contrast_iterations 
= 6 16 | 17 | def correlation(model_scores, ref_scores): 18 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(model_scores).argsort() 19 | return corrcoef(ref_scores, model_scores)[0,1], corrcoef(ref_ranks, ranks)[0,1] 20 | 21 | def self_learning_tests(source_lang, target_lang, max_len=30, size=30000): 22 | xmover = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, lm_weights=[1, 0.1], 23 | nmt_weights=[0.5, 0.4], use_lm=True, lm_model_name=lm_model[target_lang]) 24 | contrast = ContrastScore(source_language=source_lang, target_language=target_lang, parallelize=True) 25 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len) 26 | mono_src, mono_tgt = dataset.load("monolingual-align") 27 | train_src, train_tgt = dataset.load("monolingual-train") 28 | eval_src, eval_system, eval_scores = dataset.load("scored-mlqe") 29 | dataset.hard_limit = 500 30 | para_src, para_tgt = dataset.load("nepali" if "ne" in [source_lang, target_lang] else "wikimatrix", size) 31 | suffix = f"{source_lang}-{target_lang}-awesome-wmd-{xmover.mapping}-monolingual-align-{xmover.k}-{xmover.remap_size}-{40000}-{max_len}" 32 | results, index = defaultdict(list), [f"XMoverScore ({max_len} tokens)", f"Fine-tuned XMoverScore ({max_len} tokens)", 33 | f"ContrastScore ({max_len} tokens)", f"Fine-tuned ContrastScore ({max_len} tokens)", 34 | f"XMoverScore + ContrastScore ({max_len} tokens)"] 35 | 36 | logging.info("Evaluating XMoverScore") 37 | for iteration in range(1, remap_iterations + 1): 38 | logging.info(f"Remapping iteration {iteration}.") 39 | xmover.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False) 40 | for iteration in range(nmt_iterations): 41 | logging.info(f"NMT training iteration {iteration}.") 42 | xmover.train(train_src, train_tgt, suffix=suffix+f"-{remap_iterations}", iteration=iteration, overwrite=False, k=1) 43 | 44 | pearson, spearman = xmover.correlation(eval_src, eval_system, eval_scores) 45 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 46 | results["pearson"].append(round(100 * pearson, 2)) 47 | results["spearman"].append(round(100 * spearman, 2)) 48 | 49 | logging.info(f"Remapping on parallel data.") 50 | xmover.mapping = "CLP" 51 | xmover.remap(para_src, para_tgt, suffix=suffix.replace("UMD", "CLP") + f"-finetuned-{size}", aligned=True, overwrite=False) 52 | logging.info(f"NMT training on parallel data.") 53 | xmover.train(para_src, para_tgt, suffix=suffix + f"-finetuned-{size}", aligned=True, finetune=True, overwrite=False, k=1) 54 | 55 | pearson, spearman = xmover.correlation(eval_src, eval_system, eval_scores) 56 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 57 | results["pearson"].append(round(100 * pearson, 2)) 58 | results["spearman"].append(round(100 * spearman, 2)) 59 | 60 | logging.info("Evaluating ContrastScore") 61 | for iteration in range(1, contrast_iterations + 1): 62 | logging.info(f"Contrastive Learning iteration {iteration}.") 63 | contrast.suffix = f"{max_len}-{iteration}" 64 | contrast.train(train_src, train_tgt, overwrite=False) 65 | 66 | pearson, spearman = contrast.correlation(eval_src, eval_system, eval_scores) 67 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 68 | results["pearson"].append(round(100 * pearson, 2)) 69 | results["spearman"].append(round(100 * spearman, 2)) 70 | 71 | logging.info(f"Contrastive Learning on parallel data.") 72 | contrast.suffix = f"{max_len}-finetuned-{size}" 73 | contrast.train(para_src, para_tgt, aligned=True, finetune=True, 
overwrite=False) 74 | 75 | pearson, spearman = contrast.correlation(eval_src, eval_system, eval_scores) 76 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 77 | results["pearson"].append(round(100 * pearson, 2)) 78 | results["spearman"].append(round(100 * spearman, 2)) 79 | 80 | logging.info("Evaluating XMoverScore + ContrastScore") 81 | wmd_scores, contrast_scores = xmover.score(eval_src, eval_system), contrast.score(eval_src, eval_system) 82 | pearson, spearman = correlation([0.6 * x + 0.4 * y for x, y in zip(wmd_scores, contrast_scores)], eval_scores) 83 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 84 | results["pearson"].append(round(100 * pearson, 2)) 85 | results["spearman"].append(round(100 * spearman, 2)) 86 | 87 | return tabulate(results, headers="keys", showindex=index) 88 | 89 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 90 | for size in (10000, 20000, 30000, 200000): 91 | for source_lang, target_lang in mlqe: 92 | print(f"Evaluating {source_lang}-{target_lang} language direction on MLQE-PE using {size} parallel sentences.") 93 | print(self_learning_tests(source_lang, target_lang, max_len=30, size=size)) 94 | print(self_learning_tests(source_lang, target_lang, max_len=50, size=size)) 95 | -------------------------------------------------------------------------------- /experiments/lm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore 3 | from collections import defaultdict 4 | from tabulate import tabulate 5 | from numpy import linspace 6 | from metrics.utils.dataset import DatasetLoader 7 | import logging 8 | 9 | source_lang, target_lang = "de", "en" 10 | iterations = 1 11 | 12 | def lm_nmt_tests(metric="wmd", max_len=50): 13 | assert target_lang == "en", "Target language has to be English for LM to work" 14 | results = defaultdict(list) 15 | scorer = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, use_lm=True, use_cosine=metric=="cosine") 16 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len) 17 | mono_src, mono_tgt = dataset.load("monolingual-align") 18 | eval_src, eval_system, eval_scores = dataset.load("scored") 19 | suffix = f"{source_lang}-{target_lang}-awesome-{metric}-{scorer.mapping}-monolingual-align-{scorer.k}-{scorer.remap_size}-{len(mono_src)}-{max_len}" 20 | 21 | for iteration in range(1, iterations + 1): 22 | logging.info(f"Remapping iteration {iteration}.") 23 | scorer.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False) 24 | 25 | logging.info("Training NMT system.") 26 | train_src, train_tgt = dataset.load("monolingual-train") 27 | scorer.train(train_src, train_tgt, suffix=suffix+f"-{iterations}", overwrite=False, k=5 if metric=="cosine" else 1) 28 | 29 | 30 | logging.info(f"Evaluating performance with NMT and language model.") 31 | for lm_weight in linspace(0, 1, 11): 32 | for nmt_weight in linspace(0, 1, 11): 33 | if lm_weight + nmt_weight <= 1: 34 | scorer.nmt_weights = [1 - lm_weight - nmt_weight, nmt_weight] 35 | scorer.lm_weights = [1, lm_weight] 36 | pearson, _ = scorer.correlation(eval_src, eval_system, eval_scores) 37 | logging.info(f"NMT: {round(nmt_weight, 1)}, LM: {round(lm_weight, 1)}, Pearson: {pearson}") 38 | results[round(lm_weight, 1)].append(round(100 * pearson, 2)) 39 | else: 40 | results[round(lm_weight, 1)].append("-") 41 | 42 | return suffix, 
tabulate(results, headers="keys", showindex=linspace(0, 1, 11)) 43 | 44 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 45 | print(*lm_nmt_tests(), sep="\n") 46 | -------------------------------------------------------------------------------- /experiments/nmt.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore 3 | from collections import defaultdict 4 | from tabulate import tabulate 5 | from numpy import linspace 6 | from metrics.utils.dataset import DatasetLoader 7 | import logging 8 | 9 | source_lang, target_lang = "de", "en" 10 | remap_iterations = 1 11 | 12 | def nmt_tests(metric="cosine", weights=[0.8, 0.2], max_len=30, nmt_iterations=1, back_translate=False): 13 | aligner = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, nmt_weights=weights, use_cosine=metric=="cosine") 14 | dataset = DatasetLoader(source_lang, target_lang, max_monolingual_sent_len=max_len) 15 | mono_src, mono_tgt = dataset.load("monolingual-align") 16 | train_src, train_tgt = dataset.load("monolingual-train") 17 | eval_src, eval_system, eval_scores = dataset.load("scored") 18 | langs = f"{target_lang}-{source_lang}" if back_translate else f"{source_lang}-{target_lang}" 19 | suffix = f"{langs}-awesome-{metric}-{aligner.mapping}-monolingual-align-{aligner.k}-{aligner.remap_size}-{len(mono_src)}-{max_len}" 20 | results, index = defaultdict(list), list(range(remap_iterations + 1)) +[f"{remap_iterations} + NMT-{iteration}" 21 | for iteration in range(nmt_iterations)] 22 | 23 | logging.info("Evaluating performance before remapping.") 24 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores) 25 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores) 26 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}") 27 | results["pearson"].append(round(100 * pearson, 2)) 28 | results["spearman"].append(round(100 * spearman, 2)) 29 | results["rmse"].append(round(rmse, 2)) 30 | results["mae"].append(round(mae, 2)) 31 | 32 | for iteration in range(1, remap_iterations + 1): 33 | logging.info(f"Remapping iteration {iteration}.") 34 | aligner.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False) 35 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores) 36 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores) 37 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}") 38 | results["pearson"].append(round(100 * pearson, 2)) 39 | results["spearman"].append(round(100 * spearman, 2)) 40 | results["rmse"].append(round(rmse, 2)) 41 | results["mae"].append(round(mae, 2)) 42 | 43 | 44 | for iteration in range(nmt_iterations): 45 | aligner.train(train_src, train_tgt, suffix=suffix+f"-{remap_iterations}", iteration=iteration, overwrite=False, 46 | k=5 if metric=="cosine" else 1, back_translate=back_translate) 47 | 48 | logging.info("Evaluating performance with NMT model.") 49 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores) 50 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores) 51 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, RMSE: {rmse}, MAE: {mae}") 52 | results["pearson"].append(round(100 * pearson, 2)) 53 | results["spearman"].append(round(100 * spearman, 2)) 54 | results["rmse"].append(round(rmse, 2)) 55 | results["mae"].append(round(mae, 
2)) 56 | 57 | return suffix, tabulate(results, headers="keys", showindex=index) 58 | 59 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 60 | for weight in linspace(1, 0, 11): 61 | print(f"Using weight {weight} for cross-lingual XMoverScore and weight {1 - weight} for NMT system.") 62 | print(*nmt_tests(metric="cosine", weights=[weight, 1 - weight]), sep="\n") 63 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight]), sep="\n") 64 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], back_translate=True), sep="\n") 65 | for weight in linspace(1, 0, 11): 66 | print(f"Using weight {weight} for cross-lingual XMoverScore and weight {1 - weight} for NMT system.") 67 | print(*nmt_tests(metric="cosine", weights=[weight, 1 - weight], max_len=50), sep="\n") 68 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], max_len=50), sep="\n") 69 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], max_len=50, back_translate=True), sep="\n") 70 | for weight in linspace(1, 0, 11): 71 | print(f"Using weight {weight} for cross-lingual XMoverScore and weight {1 - weight} for NMT system.") 72 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], nmt_iterations=3), sep="\n") 73 | print(*nmt_tests(metric="wmd", weights=[weight, 1 - weight], nmt_iterations=3, max_len=50), sep="\n") 74 | -------------------------------------------------------------------------------- /experiments/parallel.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.contrastscore import ContrastScore 3 | from metrics.distilscore import DistilScore 4 | from metrics.utils.dataset import DatasetLoader 5 | from collections import defaultdict 6 | from tabulate import tabulate 7 | import logging 8 | 9 | datasets = (("Newstest-2016", "scored", ("de", "en")), ("Newstest-2017", "scored-wmt17", ("de", "en")), 10 | ("MLQE-PE", "scored-mlqe", ("en", "de")), ("MQM-Newstest-2020", "scored-mqm", ("en", "de"))) 11 | 12 | def distil_tests(source_lang, target_lang, score_model=ContrastScore, eval_dataset="scored"): 13 | scorer = score_model(source_language=source_lang, target_language=target_lang, suffix="parallel") 14 | dataset = DatasetLoader(source_lang, target_lang, hard_limit=500) 15 | eval_src, eval_system, eval_scores = dataset.load(eval_dataset) 16 | parallel_src, parallel_tgt = dataset.load("parallel-train") 17 | results, index = defaultdict(list), ["Baseline", "Fine-tuned model"] 18 | 19 | if score_model == ContrastScore: 20 | scorer.parallelize = True 21 | 22 | logging.info("Evaluating performance before fine-tuning.") 23 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 24 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 25 | results["pearson"].append(round(100 * pearson, 2)) 26 | results["spearman"].append(round(100 * spearman, 2)) 27 | 28 | scorer.train(parallel_src, parallel_tgt, aligned=True, overwrite=False) 29 | 30 | logging.info(f"Evaluating performance after fine-tuning.") 31 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 32 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 33 | results["pearson"].append(round(100 * pearson, 2)) 34 | results["spearman"].append(round(100 * spearman, 2)) 35 | 36 | return tabulate(results, headers="keys", showindex=index) 37 | 38 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 39 | 
for dataset, identifier, pair in datasets: 40 | print(f"Evaluating {'-'.join(pair)} language direction on {dataset}") 41 | print("Results using contrastive learning:", distil_tests(*pair, score_model=ContrastScore, eval_dataset=identifier), sep="\n") 42 | print("Results using knowledge distillation:", distil_tests(*pair, score_model=DistilScore, eval_dataset=identifier), sep="\n") 43 | -------------------------------------------------------------------------------- /experiments/quantity.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.xmoverscore import XMoverNMTLMBertAlignScore 3 | from collections import defaultdict 4 | from tabulate import tabulate 5 | from metrics.utils.dataset import DatasetLoader 6 | import logging 7 | 8 | source_lang, target_lang = "de", "en" 9 | 10 | def remap_tests(mapping="UMD", amount=40000): 11 | scorer = XMoverNMTLMBertAlignScore(src_lang=source_lang, tgt_lang=target_lang, mapping=mapping, alignment="fast", use_cosine=True) 12 | dataset = DatasetLoader(source_lang, target_lang) 13 | eval_src, eval_system, eval_scores = dataset.load("scored") 14 | suffix = f"{source_lang}-{target_lang}-fast-cosine-{mapping}-monolingual-align-{scorer.k}-{scorer.remap_size}-{amount}-30-1" 15 | results = defaultdict(list) 16 | 17 | scorer.remap(*dataset.load("monolingual-align", amount), suffix=suffix, overwrite=False) 18 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 19 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores) 20 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 21 | results["pearson"].append(round(100 * pearson, 2)) 22 | results["spearman"].append(round(100 * spearman, 2)) 23 | results["rmse"].append(round(rmse, 2)) 24 | results["mae"].append(round(mae, 2)) 25 | 26 | return suffix, tabulate(results, headers="keys") 27 | 28 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 29 | for amount in (80000, 40000, 20000, 10000, 5000, 2500, 2000): 30 | print(f"Using {100 * round(2000/amount, 3)}% of aligned sentences for training.") 31 | print(*remap_tests(mapping="UMD", amount=amount), sep="\n") 32 | print(*remap_tests(mapping="CLP", amount=amount), sep="\n") 33 | -------------------------------------------------------------------------------- /experiments/remap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.xmoverscore import XMoverBertAlignScore 3 | from collections import defaultdict 4 | from tabulate import tabulate 5 | from metrics.utils.dataset import DatasetLoader 6 | import logging 7 | 8 | source_lang, target_lang = "de", "en" 9 | iterations = 5 10 | 11 | def remap_tests(alignment="awesome", mapping="UMD", data="monolingual-align", metric="cosine"): 12 | scorer = XMoverBertAlignScore(alignment=alignment, mapping=mapping, use_cosine=True if metric == "cosine" else False) 13 | dataset = DatasetLoader(source_lang, target_lang) 14 | parallel_src, parallel_tgt = dataset.load("parallel") 15 | mono_src, mono_tgt = dataset.load(data) 16 | eval_src, eval_system, eval_scores = dataset.load("scored") 17 | suffix = f"{source_lang}-{target_lang}-{alignment}-{metric}-{mapping}-{data}-{scorer.k}-{scorer.remap_size}-{len(mono_src)}" 18 | results = defaultdict(list) 19 | 20 | logging.info("Evaluating performance before remapping.") 21 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 22 | 
precision = scorer.precision(parallel_src, parallel_tgt) 23 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores) 24 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}") 25 | results["pearson"].append(round(100 * pearson, 2)) 26 | results["spearman"].append(round(100 * spearman, 2)) 27 | results["precision"].append(round(100 * precision, 2)) 28 | results["rmse"].append(round(rmse, 2)) 29 | results["mae"].append(round(mae, 2)) 30 | 31 | for iteration in range(1, iterations + 1): 32 | logging.info(f"Remapping iteration {iteration}.") 33 | scorer.remap(mono_src, mono_tgt, suffix=suffix + f"-{iteration}", overwrite=False) 34 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 35 | precision = scorer.precision(parallel_src, parallel_tgt) 36 | rmse, mae = scorer.error(eval_src, eval_system, eval_scores) 37 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}") 38 | results["pearson"].append(round(100 * pearson, 2)) 39 | results["spearman"].append(round(100 * spearman, 2)) 40 | results["precision"].append(round(100 * precision, 2)) 41 | results["rmse"].append(round(rmse, 2)) 42 | results["mae"].append(round(mae, 2)) 43 | 44 | return suffix, tabulate(results, headers="keys", showindex=True) 45 | 46 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 47 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="UMD"), sep="\n") 48 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="CLP"), sep="\n") 49 | print(*remap_tests(alignment="awesome-remap", data="monolingual-align", mapping="UMD"), sep="\n") 50 | print(*remap_tests(alignment="awesome-remap", data="monolingual-align", mapping="CLP"), sep="\n") 51 | print(*remap_tests(alignment="fast", data="monolingual-align", mapping="UMD"), sep="\n") 52 | print(*remap_tests(alignment="fast", data="monolingual-align", mapping="CLP"), sep="\n") 53 | print(*remap_tests(alignment="awesome", data="parallel-align", mapping="UMD"), sep="\n") 54 | print(*remap_tests(alignment="awesome", data="parallel-align", mapping="CLP"), sep="\n") 55 | print(*remap_tests(alignment="fast", data="parallel-align", mapping="UMD"), sep="\n") 56 | print(*remap_tests(alignment="fast", data="parallel-align", mapping="CLP"), sep="\n") 57 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="UMD", metric="wmd"), sep="\n") 58 | print(*remap_tests(alignment="awesome", data="monolingual-align", mapping="CLP", metric="wmd"), sep="\n") 59 | -------------------------------------------------------------------------------- /experiments/sentsim.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from collections import defaultdict 3 | import logging 4 | 5 | from tabulate import tabulate 6 | 7 | from metrics.sentsim import SentSim 8 | from metrics.utils.dataset import DatasetLoader 9 | 10 | mlqe = [ 11 | ("en", "de"), 12 | ("en", "zh"), 13 | ("ru", "en"), 14 | ("ro", "en"), 15 | ("et", "en"), 16 | ("ne", "en"), 17 | ("si", "en"), 18 | ] 19 | newstest2017 = [ 20 | ("cs", "en"), 21 | ("de", "en"), 22 | ("fi", "en"), 23 | ("lv", "en"), 24 | ("ru", "en"), 25 | ("tr", "en"), 26 | ("zh", "en"), 27 | ("en", "zh"), 28 | ("en", "ru"), 29 | ] 30 | 31 | # Try to reproduce results of Sentsim on both WMT-17 and WMT-20. 
The scores for 32 | # WMT-20 are computed for both human-annotated scores and MT model scores. 33 | def sentsim_reproduce( 34 | source_lang, 35 | target_lang, 36 | dataset_name, 37 | word_metric="BERTScore", 38 | use_mlqe_model_scores=False, 39 | ): 40 | scorer = SentSim(use_wmd=word_metric == "WMD") 41 | dataset = DatasetLoader(source_lang, target_lang) 42 | eval_src, eval_system, eval_scores = dataset.load( 43 | dataset_name, use_mlqe_model_scores=use_mlqe_model_scores 44 | ) 45 | results, index = defaultdict(list), [f"SentSim ({word_metric})"] 46 | 47 | pearson, spearman = scorer.correlation(eval_src, eval_system, eval_scores) 48 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}") 49 | results["pearson"].append(round(100 * pearson, 2)) 50 | results["spearman"].append(round(100 * spearman, 2)) 51 | 52 | return tabulate(results, headers="keys", showindex=index) 53 | 54 | 55 | logging.basicConfig( 56 | level=logging.INFO, 57 | datefmt="%m-%d %H:%M", 58 | format="%(asctime)s %(levelname)-8s %(message)s", 59 | ) 60 | datasets = ( 61 | ("Newstest-2017", "scored-wmt17", newstest2017), 62 | ("MLQE-PE (Human)", "scored-mlqe", mlqe), 63 | ("MLQE-PE (Model)", "scored-mlqe", mlqe), 64 | ) 65 | for dataset, identifier, pairs in datasets: 66 | for source_lang, target_lang in pairs: 67 | print(f"Evaluating {source_lang}-{target_lang} language direction on {dataset}") 68 | print( 69 | sentsim_reproduce( 70 | source_lang, target_lang, identifier, "BERTScore", "Model" in dataset 71 | ) 72 | ) 73 | print( 74 | sentsim_reproduce( 75 | source_lang, target_lang, identifier, "WMD", "Model" in dataset 76 | ) 77 | ) 78 | -------------------------------------------------------------------------------- /experiments/vecmap.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from metrics.xmoverscore import XMoverVecMapAlignScore 3 | from metrics.vecmapscore import VecMapScore 4 | from collections import defaultdict 5 | from tabulate import tabulate 6 | from metrics.utils.dataset import DatasetLoader 7 | import logging 8 | 9 | source_lang, target_lang = "de", "en" 10 | 11 | def vecmap_tests(model=XMoverVecMapAlignScore): 12 | aligner = model(src_lang=source_lang, tgt_lang=target_lang) 13 | dataset = DatasetLoader(source_lang, target_lang) 14 | parallel_src, parallel_tgt = dataset.load("parallel") 15 | eval_src, eval_system, eval_scores = dataset.load("scored") 16 | results = defaultdict(list) 17 | 18 | pearson, spearman = aligner.correlation(eval_src, eval_system, eval_scores) 19 | precision = aligner.precision(parallel_src, parallel_tgt) 20 | rmse, mae = aligner.error(eval_src, eval_system, eval_scores) 21 | logging.info(f"Pearson: {pearson}, Spearman: {spearman}, Precision @ 1: {precision}, RMSE: {rmse}, MAE: {mae}") 22 | results["pearson"].append(round(100 * pearson, 2)) 23 | results["spearman"].append(round(100 * spearman, 2)) 24 | results["precision"].append(round(100 * precision, 2)) 25 | results["rmse"].append(round(rmse, 2)) 26 | results["mae"].append(round(mae, 2)) 27 | 28 | return f"{source_lang}-{target_lang}-vecmap", tabulate(results, headers="keys") 29 | 30 | logging.basicConfig(level=logging.INFO, datefmt="%m-%d %H:%M", format="%(asctime)s %(levelname)-8s %(message)s") 31 | print(*vecmap_tests(), sep="\n") 32 | print(*vecmap_tests(model=VecMapScore), sep="\n") 33 | -------------------------------------------------------------------------------- /metrics/common.py:
-------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from numpy import corrcoef, argsort 3 | from torch.nn.functional import mse_loss, l1_loss 4 | from torch import FloatTensor 5 | 6 | class CommonScore(ABC): 7 | @abstractmethod 8 | def align(self, source_sents, target_sents): 9 | """ 10 | This method receives a list of sentences in the source language and a 11 | list of sentences in the target language as parameters and returns 12 | a list of pseudo-aligned sentence pairs. 13 | """ 14 | 15 | @abstractmethod 16 | def _embed(self, source_sents, target_sents): 17 | """ 18 | This method receives a list of sentences in the source language and a 19 | list of sentences in the target language as parameters and returns 20 | their embeddings, inverse document frequencies, tokens and padding 21 | masks. 22 | """ 23 | 24 | @abstractmethod 25 | def score(self, source_sents, target_sents): 26 | """ 27 | This method receives a list of sentences in the source language and a 28 | list of sentences in the target language as parameters, which are 29 | assumed to be aligned according to their index. For each sentence pair 30 | a similarity score is computed and the list of scores is returned. 31 | """ 32 | 33 | def precision(self, source_sents, ref_sents): 34 | """ 35 | This method receives a list of sentences in the source language and a 36 | list of sentences in the target language as parameters, which are 37 | assumed to be aligned. The given alignment is then discarded and the 38 | sentences are re-aligned through parallel sentence matching. This method then returns 39 | the Precision @ 1 score. 40 | """ 41 | pairs, _ = self.align(source_sents, ref_sents) 42 | return sum([reference == predicted for reference, (_, predicted) in zip(ref_sents, pairs)]) / len(ref_sents) 43 | 44 | def correlation(self, source_sents, system_sents, ref_scores): 45 | """ 46 | This method receives a list of sentences in the source language, a 47 | list of sentences in the target language, which are 48 | assumed to be aligned, and reference scores as parameters. The method 49 | then returns the Pearson correlation and the Spearman correlation 50 | between the reference scores and the scores of the metric. 51 | """ 52 | scores = self.score(source_sents, system_sents) 53 | ref_ranks, ranks = argsort(ref_scores).argsort(), argsort(scores).argsort() # Spearman is the Pearson correlation of the rank variables 54 | return corrcoef(ref_scores, scores)[0,1], corrcoef(ref_ranks, ranks)[0,1] 55 | 56 | def error(self, source_sents, system_sents, ref_scores): 57 | """ 58 | This method receives a list of sentences in the source language, a 59 | list of sentences in the target language, which are 60 | assumed to be aligned, and reference scores as parameters. The method 61 | then returns the Root Mean Squared Error and the Mean Absolute Error 62 | between the reference scores and the scores of the metric.
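Concretely, if r are the reference scores and s the metric scores of n aligned pairs, the returned values correspond to RMSE = sqrt(sum_i (r_i - s_i)^2 / n) and MAE = sum_i |r_i - s_i| / n, matching the mean reductions of the mse_loss and l1_loss calls used below.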
63 | """ 64 | scores = self.score(source_sents, system_sents) 65 | rmse = mse_loss(FloatTensor(ref_scores), FloatTensor(scores)).sqrt().item() 66 | mae = l1_loss(FloatTensor(ref_scores), FloatTensor(scores)).item() 67 | return rmse, mae 68 | -------------------------------------------------------------------------------- /metrics/contrastscore.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, InputExample, models, util 2 | from torch.utils.data import DataLoader 3 | from os.path import join, isfile, basename 4 | from torch.cuda import device_count, is_available as cuda_is_available 5 | from torch.nn import CrossEntropyLoss, Module, DataParallel 6 | from torch.nn.functional import cosine_similarity 7 | from math import ceil 8 | from .utils.knn import ratio_margin_align 9 | from .common import CommonScore 10 | from .utils.env import DATADIR 11 | from .utils.wmd import word_mover_score 12 | from .utils.perplexity import lm_perplexity 13 | from nltk.metrics.distance import edit_distance 14 | from pathlib import Path 15 | import logging 16 | import torch 17 | 18 | class AdditiveMarginSoftmaxLoss(Module): 19 | """ 20 | Contrastive learning loss function used by LaBSE and SimCSE. 21 | """ 22 | def __init__(self, model, scale = 20.0, margin = 0.0, symmetric = True, similarity_fct = util.cos_sim): 23 | super().__init__() 24 | self.model = model 25 | self.scale = scale 26 | self.margin = margin 27 | self.symmetric = symmetric 28 | self.similarity_fct = similarity_fct 29 | self.cross_entropy_loss = CrossEntropyLoss() 30 | 31 | def additive_margin_softmax_loss(self, embeddings_a, embeddings_b): 32 | scores = self.similarity_fct(embeddings_a, embeddings_b) 33 | scores.diagonal().subtract_(self.margin) 34 | labels = torch.tensor(range(len(scores)), dtype=torch.long, device=scores.device) # Example a[i] should match with b[i] 35 | return self.cross_entropy_loss(self.scale * scores, labels) 36 | 37 | def forward(self, sentence_features, _): 38 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 39 | assert len(reps) == 2, "Inputs should be source texts and translations" 40 | embeddings_a = reps[0] 41 | embeddings_b = reps[1] 42 | 43 | if self.symmetric: 44 | return self.additive_margin_softmax_loss(embeddings_a, embeddings_b) + self.additive_margin_softmax_loss(embeddings_b, embeddings_a) 45 | else: 46 | return self.additive_margin_softmax_loss(embeddings_a, embeddings_b) 47 | 48 | def get_config_dict(self): 49 | return {'scale': self.scale, 'margin': self.margin, 'symmetric': self.symmetric, 'similarity_fct': self.similarity_fct.__name__} 50 | 51 | class ContrastScore(CommonScore): 52 | def __init__( 53 | self, 54 | model_name="xlm-roberta-base", 55 | source_language="en", 56 | target_language="de", 57 | device="cuda" if cuda_is_available() else "cpu", 58 | parallelize= False, 59 | train_batch_size=256, 60 | max_seq_length=None, 61 | num_epochs=1, 62 | knn_batch_size = 1000000, 63 | mine_batch_size = 5000000, 64 | train_size = 100000, 65 | k = 5, 66 | suffix = None 67 | ): 68 | self.model_name = model_name 69 | self.train_batch_size = train_batch_size 70 | self.max_seq_length = max_seq_length 71 | self.num_epochs = num_epochs 72 | self.device = device 73 | self.parallelize = parallelize 74 | self.knn_batch_size = knn_batch_size 75 | self.mine_batch_size = mine_batch_size 76 | self.train_size = train_size 77 | self.k = k 78 | self.cache_dir = join(DATADIR, 
"contrastive-learning", 79 | f"{'-'.join(sorted([source_language, target_language]))}-{basename(model_name)}") 80 | self.suffix = suffix 81 | self.model = self.load_model(model_name) 82 | 83 | def load_model(self, model_name): 84 | word_embedding_model = models.Transformer(model_name, max_seq_length=self.max_seq_length) 85 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 86 | return SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device) 87 | 88 | @property 89 | def path(self): 90 | path = self.cache_dir + f"-{self.suffix}" if self.suffix is not None else "" 91 | Path(path).mkdir(parents=True, exist_ok=True) 92 | return path 93 | 94 | def _embed(self, source_sents, target_sents): 95 | return ( 96 | self.model.encode(source_sents, convert_to_tensor=True).cpu(), 97 | self.model.encode(target_sents, convert_to_tensor=True).cpu()) 98 | 99 | def align(self, source_sents, target_sents): 100 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents) 101 | indeces, scores = ratio_margin_align(source_embeddings, target_embeddings, self.k, 102 | self.knn_batch_size, self.device) 103 | 104 | sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indeces] 105 | return sent_pairs, scores 106 | 107 | def score(self, source_sents, target_sents): 108 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents) 109 | return cosine_similarity(source_embeddings, target_embeddings) 110 | 111 | def mine(self, source_sents, target_sents, mine_size, overwrite=True): 112 | logging.info("Mining pseudo parallel data.") 113 | file_path = join(self.path, "mined-sentence-pairs.txt") 114 | pairs, scores, batch, batch_size = list(), list(), 0, self.mine_batch_size 115 | if not isfile(file_path) or overwrite: 116 | while batch < len(source_sents): 117 | logging.info("Obtaining sentence embeddings.") 118 | batch_src, batch_tgt = source_sents[batch:batch + batch_size], target_sents[batch:batch + batch_size] 119 | source_embeddings, target_embeddings = self._embed(batch_src, batch_tgt) 120 | logging.info("Mining pseudo parallel data with Ratio Margin function.") 121 | batch_pairs, batch_scores = ratio_margin_align(source_embeddings, target_embeddings, self.k, 122 | self.knn_batch_size, self.device) 123 | del source_embeddings, target_embeddings 124 | pairs.extend([(src + batch, tgt + batch) for src, tgt in batch_pairs]), scores.extend(batch_scores) 125 | batch += batch_size 126 | with open(file_path, "wb") as f: 127 | idx = 0 128 | check_duplicates_set = set() 129 | for _, (src, tgt) in sorted(zip(scores, pairs), key=lambda tup: tup[0], reverse=True): 130 | src_sent, tgt_sent = source_sents[src], target_sents[tgt] 131 | if tgt_sent not in check_duplicates_set and edit_distance(src_sent, tgt_sent) / max(len(src_sent), len(tgt_sent)) > 0.5: 132 | check_duplicates_set.add(tgt_sent) 133 | f.write(f"{src_sent}\t{tgt_sent}\n".encode()) 134 | idx += 1 135 | if idx >= mine_size: 136 | break 137 | 138 | with open(file_path, "rb") as f: 139 | sents = list() 140 | for line in f: 141 | sents.append(line.decode().strip().split("\t")) 142 | return sents 143 | 144 | def train(self, source_sents, target_sents, aligned=False, finetune=False, overwrite=True): 145 | if not isfile(join(self.path, 'config.json')) or overwrite: 146 | # Convert train sentences to sentence pairs 147 | if aligned: 148 | train_data = [InputExample(texts=[s, t]) for s, t in zip(source_sents, target_sents)] 149 | else: 150 | 
train_data = [InputExample(texts=[s, t]) for s, t in self.mine(source_sents, target_sents, self.train_size, 151 | overwrite=overwrite)] 152 | 153 | # DataLoader to batch the data 154 | train_dataloader = DataLoader(train_data, batch_size=self.train_batch_size, shuffle=True) 155 | 156 | if finetune: 157 | new_model = self.model # keep fine-tuning the current model 158 | # otherwise discard the old model and train a new one from scratch 159 | else: 160 | del self.model 161 | new_model = self.load_model(self.model_name) 162 | 163 | # Use contrastive learning loss 164 | if self.parallelize and device_count() > 1: 165 | logging.info(f"Training on {device_count()} GPUs.") 166 | train_loss = AdditiveMarginSoftmaxLoss(DataParallel(new_model)) 167 | else: 168 | train_loss = AdditiveMarginSoftmaxLoss(new_model) 169 | 170 | # Call the fit method 171 | warmup_steps = ceil(len(train_dataloader) * self.num_epochs * 0.1) # 10% of train data for warm-up 172 | new_model.fit(train_objectives=[(train_dataloader, train_loss)], 173 | epochs=self.num_epochs, 174 | warmup_steps=warmup_steps, 175 | optimizer_params={'lr': 5e-5} 176 | ) 177 | new_model.save(self.path) 178 | 179 | self.model = SentenceTransformer(self.path, device=self.device) 180 | 181 | class XLMoverScore(ContrastScore): 182 | def __init__( 183 | self, 184 | embed_batch_size = 128, 185 | n_gram=1, 186 | suffix_filter=False, 187 | lm_model_name="gpt2", 188 | use_lm=False, 189 | lm_weights=[0.9, 0.1], 190 | **kwargs 191 | ): 192 | """ 193 | embed_batch_size - batch size for embedding sentences during inference 194 | n_gram - n-gram size of word mover's distance 195 | suffix_filter - filter embeddings of word suffixes (the original XMoverScore 196 | does this, but it doesn't make sense for SentencePiece-based models) 197 | """ 198 | super().__init__(**kwargs) 199 | self.embed_batch_size = embed_batch_size 200 | self.n_gram = n_gram 201 | self.suffix_filter = suffix_filter 202 | self.lm_model_name = lm_model_name 203 | self.use_lm = use_lm 204 | self.lm_weights = lm_weights 205 | 206 | # Override 207 | def score(self, source_sents, target_sents): 208 | embedding_model = self.model.eval().to(self.device)[0].auto_model 209 | tokenizer = self.model.tokenizer 210 | 211 | with torch.no_grad(): 212 | src_ids, src_mask = tokenizer(source_sents, padding=True, truncation=True, return_tensors="pt").values() 213 | src_idf = src_mask.float() # uniform idf weights derived from the attention mask 214 | src_tokens = [[tokenizer.cls_token, *tokenizer.tokenize(sent), tokenizer.sep_token] for sent in source_sents] 215 | src_embeddings = list() 216 | 217 | tgt_ids, tgt_mask = tokenizer(target_sents, padding=True, truncation=True, return_tensors="pt").values() 218 | tgt_idf = tgt_mask.float() 219 | tgt_tokens = [[tokenizer.cls_token, *tokenizer.tokenize(sent), tokenizer.sep_token] for sent in target_sents] 220 | tgt_embeddings = list() 221 | 222 | for index in range(0, len(source_sents), self.embed_batch_size): 223 | batch_src_ids = src_ids[index: index + self.embed_batch_size].to(self.device) 224 | batch_src_mask = src_mask[index: index + self.embed_batch_size].to(self.device) 225 | src_embeddings.extend(embedding_model(input_ids=batch_src_ids, attention_mask=batch_src_mask)['last_hidden_state'].cpu()) 226 | 227 | batch_tgt_ids = tgt_ids[index: index + self.embed_batch_size].to(self.device) 228 | batch_tgt_mask = tgt_mask[index: index + self.embed_batch_size].to(self.device) 229 | tgt_embeddings.extend(embedding_model(input_ids=batch_tgt_ids, attention_mask=batch_tgt_mask)['last_hidden_state'].cpu()) 230 | 231 | wmd_scores = word_mover_score((torch.stack(src_embeddings), src_idf, src_tokens),
(torch.stack(tgt_embeddings), tgt_idf, tgt_tokens), 232 | self.n_gram, True, self.suffix_filter) 233 | 234 | if self.use_lm: 235 | lm_scores = lm_perplexity(target_sents, self.device, self.lm_model_name) 236 | return (self.lm_weights[0] * torch.tensor(wmd_scores) + self.lm_weights[1] * torch.tensor(lm_scores)).tolist() 237 | else: 238 | return wmd_scores 239 | -------------------------------------------------------------------------------- /metrics/distilscore.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers import SentenceTransformer, models, losses 2 | from sentence_transformers.evaluation import TranslationEvaluator, SequentialEvaluator 3 | from sentence_transformers.datasets import ParallelSentencesDataset 4 | from torch.utils.data import DataLoader 5 | from torch.cuda import is_available as cuda_is_available 6 | from torch.nn.functional import cosine_similarity 7 | from torch import from_numpy 8 | from .common import CommonScore 9 | from .utils.knn import ratio_margin_align 10 | from .utils.env import DATADIR 11 | from .utils.language import LangDetect 12 | from .utils.nmt import language2mBART 13 | from os.path import join, isfile, basename 14 | from nltk.metrics.distance import edit_distance 15 | from pathlib import Path 16 | from math import ceil 17 | 18 | import logging 19 | import numpy as np 20 | 21 | class DistilScore(CommonScore): 22 | def __init__( 23 | self, 24 | teacher_model_name="bert-base-nli-stsb-mean-tokens", 25 | student_model_name="xlm-roberta-base", 26 | source_language="en", 27 | target_language="de", 28 | device="cuda" if cuda_is_available() else "cpu", 29 | student_is_pretrained=False, 30 | train_batch_size=64, # Batch size for training 31 | inference_batch_size=64, # Batch size at inference 32 | num_epochs=10, # Train for x epochs 33 | knn_batch_size = 1000000, 34 | mine_batch_size = 5000000, 35 | train_size = 200000, 36 | k = 5, 37 | suffix = None 38 | ): 39 | self.teacher_model_name = teacher_model_name 40 | self.student_model_name = student_model_name 41 | self.target_language = target_language 42 | self.train_batch_size = train_batch_size 43 | self.inference_batch_size = inference_batch_size 44 | self.num_epochs = num_epochs 45 | self.device = device 46 | self.knn_batch_size = knn_batch_size 47 | self.mine_batch_size = mine_batch_size 48 | self.train_size = train_size 49 | self.k = k 50 | self.cache_dir = join(DATADIR, "distillation", 51 | f"{'-'.join(sorted([source_language, target_language]))}-{basename(teacher_model_name)}-{basename(student_model_name)}") 52 | self.suffix = suffix 53 | if student_is_pretrained: 54 | self.model = SentenceTransformer(student_model_name, device=self.device) 55 | else: 56 | self.model = self.load_student(student_model_name) 57 | 58 | def load_student(self, model_name): 59 | logging.info("Creating model from scratch") 60 | word_embedding_model = models.Transformer(model_name) 61 | # Apply mean pooling to get one fixed sized sentence vector 62 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 63 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model], device=self.device) 64 | 65 | # mBART also has a decoder but we are only interested in the encoder output. To make sure that 66 | # sentence_transformers use the encoder output we monkey patch the forward method. Don't do this at home kids. 
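# (Hedged note on the patch below: under the assumption that MBartModel.forward returns its outputs as a tuple whose last element is the encoder output, slicing with [-1:] keeps exactly that element, which sentence_transformers then picks up as the token embeddings. The detour through LangDetect is needed because mBART's tokenizer expects the source language of each text to be set via src_lang.)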
67 | if "mbart" in model_name: 68 | mbart, detector = word_embedding_model.auto_model, LangDetect(cache_dir=DATADIR) 69 | mbart.forward = lambda **kv: type(mbart).forward(mbart, **kv)[-1:] 70 | 71 | def tokenize(text): 72 | model.tokenizer.src_lang = language2mBART[detector.detect(text)] 73 | return word_embedding_model.tokenize(text) 74 | 75 | self.model.tokenize = tokenize 76 | 77 | return model 78 | 79 | @property 80 | def path(self): 81 | path = self.cache_dir + f"-{self.suffix}" if self.suffix is not None else "" 82 | Path(path).mkdir(parents=True, exist_ok=True) 83 | return path 84 | 85 | def _embed(self, source_sents, target_sents): 86 | return self.model.encode(source_sents), self.model.encode(target_sents) 87 | 88 | def align(self, source_sents, target_sents): 89 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents) 90 | indeces, scores = ratio_margin_align(from_numpy(source_embeddings), from_numpy(target_embeddings), self.k, 91 | self.knn_batch_size, self.device) 92 | 93 | sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indeces] 94 | return sent_pairs, scores 95 | 96 | def score(self, source_sents, target_sents): 97 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents) 98 | return cosine_similarity(from_numpy(source_embeddings), from_numpy(target_embeddings)) 99 | 100 | def mine(self, source_sents, target_sents, overwrite=True): 101 | logging.info("Mining pseudo parallel data.") 102 | file_path = join(self.path, "mined-sentence-pairs.txt") 103 | pairs, scores, batch, batch_size = list(), list(), 0, self.mine_batch_size 104 | if not isfile(file_path) or overwrite: 105 | while batch < len(source_sents): 106 | logging.info("Obtaining sentence embeddings.") 107 | batch_src, batch_tgt = source_sents[batch:batch + batch_size], target_sents[batch:batch + batch_size] 108 | source_embeddings, target_embeddings = self._embed(batch_src, batch_tgt) 109 | logging.info("Mining pseudo parallel data with Ratio Margin function.") 110 | batch_pairs, batch_scores = ratio_margin_align(from_numpy(source_embeddings), 111 | from_numpy(target_embeddings), self.k, self.knn_batch_size, self.device) 112 | del source_embeddings, target_embeddings 113 | pairs.extend([(src + batch, tgt + batch) for src, tgt in batch_pairs]), scores.extend(batch_scores) 114 | batch += batch_size 115 | with open(file_path, "wb") as f: 116 | idx = 0 117 | for _, (src, tgt) in sorted(zip(scores, pairs), key=lambda tup: tup[0], reverse=True): 118 | src_sent, tgt_sent = source_sents[src], target_sents[tgt] 119 | if edit_distance(src_sent, tgt_sent) / max(len(src_sent), len(tgt_sent)) > 0.5: 120 | f.write(f"{src_sent}\t{tgt_sent}\n".encode()) 121 | idx += 1 122 | if idx >= self.train_size: 123 | break 124 | return file_path 125 | 126 | def train(self, source_sents, target_sents, dev_source_sents=None, dev_target_sents=None, aligned=False, overwrite=True): 127 | if not isfile(join(self.path, 'config.json')) or overwrite: 128 | # Train a new model to avoid overfitting 129 | new_model = self.load_student(self.student_model_name) 130 | logging.info("Loading teacher model and training data.") 131 | teacher_model = SentenceTransformer(self.teacher_model_name, device=self.device) 132 | train_data = ParallelSentencesDataset(student_model=new_model, teacher_model=teacher_model, 133 | batch_size=self.inference_batch_size, use_embedding_cache=True) 134 | 135 | if self.target_language == "en": # since teacher embeds source sentences make sure they are in 
English 136 | source_sents, target_sents = target_sents, source_sents 137 | 138 | if aligned: 139 | train_data.add_dataset(zip(source_sents, target_sents), max_sentences=self.train_size, 140 | max_sentence_length=None) 141 | else: 142 | train_data.load_data(self.mine(source_sents, target_sents, overwrite=overwrite), 143 | max_sentences=self.train_size, max_sentence_length=None) 144 | 145 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=self.train_batch_size) 146 | train_loss = losses.MSELoss(model=new_model) 147 | 148 | dev_trans_acc = None 149 | if dev_source_sents is not None and dev_target_sents is not None: 150 | # TranslationEvaluator computes the embeddings for all parallel sentences. It then checks if the 151 | # embedding of source[i] is the closest to target[i] out of all available target sentences 152 | dev_trans_acc = TranslationEvaluator(dev_source_sents, dev_target_sents, write_csv=False, 153 | batch_size=self.inference_batch_size) 154 | 155 | # Train the model 156 | logging.info("Fine-tuning student model.") 157 | warmup_steps = ceil(len(train_dataloader) * self.num_epochs * 0.1) # 10% of train data for warm-up 158 | new_model.fit(train_objectives=[(train_dataloader, train_loss)], 159 | evaluator=None if dev_trans_acc is None else SequentialEvaluator([dev_trans_acc], main_score_function=np.mean), 160 | epochs=self.num_epochs, 161 | warmup_steps=warmup_steps, 162 | optimizer_params={'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False} 163 | ) 164 | new_model.save(self.path) 165 | 166 | self.model = SentenceTransformer(self.path, device=self.device) 167 | -------------------------------------------------------------------------------- /metrics/marginscore.py: -------------------------------------------------------------------------------- 1 | from .common import CommonScore 2 | from .xmoverscore import BertEmbed 3 | from .utils.knn import ratio_margin_align 4 | from torch.nn.functional import cosine_similarity 5 | from torch.cuda import is_available as cuda_is_available 6 | from torch import sum as tsum 7 | import logging 8 | 9 | class RatioMarginAlign(CommonScore): 10 | def __init__(self, device, k, knn_batch_size): 11 | self.device = device 12 | self.k = k 13 | self.knn_batch_size = knn_batch_size 14 | 15 | def align(self, source_sents, target_sents): 16 | src_embeddings, _, _, src_mask, tgt_embeddings, _, _, tgt_mask = self._embed( 17 | source_sents, target_sents) 18 | 19 | logging.info("Computing scores with Ratio Margin algorithm.") 20 | source_sent_embeddings = tsum(src_embeddings * src_mask, 1) / tsum(src_mask, 1) # mean-pool token embeddings into sentence embeddings 21 | target_sent_embeddings = tsum(tgt_embeddings * tgt_mask, 1) / tsum(tgt_mask, 1) 22 | indices, scores = ratio_margin_align(source_sent_embeddings, target_sent_embeddings, self.k, 23 | self.knn_batch_size, self.device) 24 | 25 | sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indices] 26 | return sent_pairs, scores 27 | 28 | def score(self, source_sents, target_sents): 29 | src_embeddings, _, _, src_mask, tgt_embeddings, _, _, tgt_mask = self._embed(source_sents, target_sents) 30 | source_sent_embeddings = tsum(src_embeddings * src_mask, 1) / tsum(src_mask, 1) 31 | target_sent_embeddings = tsum(tgt_embeddings * tgt_mask, 1) / tsum(tgt_mask, 1) 32 | scores = cosine_similarity(source_sent_embeddings, target_sent_embeddings) 33 | return scores 34 | 35 | class RatioMarginBertAlignScore(RatioMarginAlign, BertEmbed): 36 | def __init__( 37 | self, 38 | model_name="bert-base-multilingual-cased", 39 |
mapping="UMD", 40 | device="cuda" if cuda_is_available() else "cpu", 41 | do_lower_case=False, 42 | alignment = "awesome", 43 | k = 20, 44 | remap_size = 2000, 45 | embed_batch_size = 128, 46 | knn_batch_size = 1000000 47 | ): 48 | RatioMarginAlign.__init__(self, device, k, knn_batch_size) 49 | BertEmbed.__init__(self, model_name, mapping, device, do_lower_case, remap_size, embed_batch_size, alignment) 50 | -------------------------------------------------------------------------------- /metrics/sentsim.py: -------------------------------------------------------------------------------- 1 | from scipy.spatial.distance import euclidean 2 | from collections import defaultdict 3 | from itertools import product 4 | from sentence_transformers import SentenceTransformer 5 | from transformers import AutoTokenizer, AutoModel 6 | from torch.nn import CosineSimilarity 7 | from torch.cuda import is_available as cuda_is_available 8 | from datasets import load_metric 9 | from .utils.knn import ratio_margin_align 10 | from .utils.env import DATADIR 11 | from .common import CommonScore 12 | import torch 13 | import pulp 14 | import logging 15 | import numpy as np 16 | 17 | # This code is based on https://github.com/Rain9876/Unsupervised-crosslingual-Compound-Method-For-MT 18 | class SentSim(CommonScore): 19 | """ 20 | A wrapper around the original SentSim implementation. Be careful, the used 21 | models were fine-tuned on parallel sentences. 22 | """ 23 | def __init__( 24 | self, 25 | wordemb_model="xlm-roberta-base", 26 | sentemb_model="xlm-r-bert-base-nli-stsb-mean-tokens", 27 | device="cuda" if cuda_is_available() else "cpu", 28 | use_wmd=False, 29 | knn_batch_size = 1000000, 30 | mine_batch_size = 5000000, 31 | k = 5, 32 | ): 33 | if use_wmd: 34 | self.tokenizer, self.word_model = self.get_WMD_Model(wordemb_model) 35 | self.layers = self.layer_processing(self.word_model) 36 | else: 37 | self.word_model = wordemb_model 38 | self.use_wmd = use_wmd 39 | self.sent_model = SentenceTransformer(sentemb_model, device=device) 40 | self.knn_batch_size = knn_batch_size 41 | self.mine_batch_size = mine_batch_size 42 | self.device = device 43 | self.k = k 44 | 45 | def _embed(self, source_sents, target_sents): 46 | return ( 47 | self.sent_model.encode(source_sents, convert_to_tensor=True).cpu(), 48 | self.sent_model.encode(target_sents, convert_to_tensor=True).cpu()) 49 | 50 | def align(self, source_sents, target_sents): 51 | logging.warn("For now SentSim sentence alignment only leverages sentence embeddings.") 52 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents) 53 | indeces, scores = ratio_margin_align(source_embeddings, target_embeddings, self.k, 54 | self.knn_batch_size, self.device) 55 | 56 | sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indeces] 57 | return sent_pairs, scores 58 | 59 | def score(self, source_sents, target_sents): 60 | cosine = self.getSentSimilarity(target_sents, source_sents) 61 | if self.use_wmd: 62 | wmd = self.compute_WMD(target_sents, source_sents, self.tokenizer, self.word_model) 63 | return self.combine_metrics(cosine, wmd, corr=[1, -1]) 64 | else: 65 | bertscore = self.getBertScore(target_sents, source_sents, self.word_model) 66 | return self.combine_metrics(cosine, bertscore, corr=[1, 1]) 67 | 68 | def combine_metrics(_, *args, **kwargs): 69 | assert len(args) == len(kwargs["corr"]) and len(args[0]) == len(args[1]) 70 | output = [] 71 | 72 | for i in range(len(args[0])): 73 | value = 0 74 | for sign, metric in 
zip(kwargs["corr"], args): 75 | assert metric[i] <= 1 and metric[i] >= 0 76 | if sign > 0: 77 | value += np.exp(metric[i]) 78 | else: 79 | value += np.exp(1-metric[i]) 80 | output.append(value) 81 | 82 | return output 83 | 84 | def getSentSimilarity(self, sents1, sents2): 85 | embed_sent1, embed_sent2 = self._embed(sents1, sents2) 86 | cos_sim = CosineSimilarity(dim=1)(embed_sent1,embed_sent2) 87 | # Normalized 88 | cos_sim = (cos_sim -torch.min(cos_sim))/ (torch.max(cos_sim)-torch.min(cos_sim)) 89 | return cos_sim.numpy() 90 | 91 | def getBertScore(self, sents1, sents2, model): 92 | bert_score_metric = load_metric('bertscore', keep_in_memory=True, cache_dir=DATADIR) 93 | bert_score_metric.add_batch(predictions=sents2, references=sents1) 94 | score = torch.tensor(bert_score_metric.compute(model_type=model, device=self.device)["f1"]) 95 | # Normalized Bert Score F1 96 | norm_score = (score - torch.min(score)) / (torch.max(score) - torch.min(score)) 97 | return norm_score.tolist() 98 | 99 | def compute_WMD(self, hypotheses, references, tokenizer, model, embed_type=False): 100 | wmd = [] 101 | 102 | for reference, hypothesis in zip(references, hypotheses): 103 | wmd_tmp = self.word_mover_distance(reference, hypothesis, tokenizer, model, embed_type) 104 | wmd.append(wmd_tmp) 105 | # Normalize 106 | wmd = [(val-min(wmd))/(max(wmd)-min(wmd)) for val in wmd] 107 | return np.array(wmd) 108 | 109 | def word_mover_distance(self, sent1, sent2, tokenizer, model, embed_type, lpFile=None): 110 | sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding = self.embedding_processing(sent1, sent2, 111 | tokenizer, model, embed_type) 112 | prob = self.word_mover_distance_probspec(sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding, lpFile=lpFile) 113 | return pulp.value(prob.objective) 114 | 115 | def word_mover_distance_probspec(_, sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding, lpFile=None): 116 | first_sent_buckets = {f"x{idx}": item[1] for idx, item in enumerate(sent1_buckets.items())} 117 | second_sent_buckets = {f"y{idx}": item[1] for idx, item in enumerate(sent2_buckets.items())} 118 | 119 | var_names = list(first_sent_buckets.keys()) + list(second_sent_buckets.keys()) 120 | all_embedding = torch.cat([sent1_embedding, sent2_embedding]) 121 | wordvecs = {token: embedding.detach().numpy() for token, embedding in zip(var_names, all_embedding)} 122 | assert len(var_names) == all_embedding.size(0) 123 | 124 | T = pulp.LpVariable.dicts('T_matrix', list(product(var_names, var_names)), lowBound=0) 125 | prob = pulp.LpProblem('WMD', sense=pulp.LpMinimize) 126 | prob += pulp.lpSum([T[token1, token2]*euclidean(wordvecs[token1], wordvecs[token2]) 127 | for token1, token2 in product(var_names, var_names)]) 128 | for token2 in second_sent_buckets: #constrains 129 | prob += pulp.lpSum([T[token1, token2] for token1 in first_sent_buckets])==second_sent_buckets[token2] 130 | for token1 in first_sent_buckets: #constrains 131 | prob += pulp.lpSum([T[token1, token2] for token2 in second_sent_buckets])==first_sent_buckets[token1] 132 | if lpFile!=None: 133 | prob.writeLP(lpFile) 134 | 135 | prob.solve(pulp.apis.PULP_CBC_CMD(msg=0)) 136 | return prob 137 | 138 | def embedding_processing(self, sent1, sent2, tokenizer, model, embed_type=False): 139 | sent1_tokens, sent2_tokens = tokenizer.tokenize(sent1), tokenizer.tokenize(sent2) 140 | 141 | if embed_type: 142 | sent1_buckets, sent2_buckets = self.tokens_to_fracdict(sent1_tokens), self.tokens_to_fracdict(sent2_tokens) 143 | sent1_embedding = 
model.embeddings.word_embeddings( 144 | torch.tensor(tokenizer.convert_tokens_to_ids(list(sent1_buckets.keys())))) 145 | sent2_embedding = model.embeddings.word_embeddings( 146 | torch.tensor(tokenizer.convert_tokens_to_ids(list(sent2_buckets.keys())))) 147 | else: 148 | sent1_buckets = self.tokens_to_fracdict_contextual(sent1_tokens) 149 | sent2_buckets = self.tokens_to_fracdict_contextual(sent2_tokens) 150 | sent1_id = tokenizer(sent1,return_tensors="pt") 151 | sent2_id = tokenizer(sent2,return_tensors="pt") 152 | # [-8:-7] indicates Roberta-Large layer 17 153 | # [-4:-3] indicates XLM Roberta-Base layer 9 154 | model(sent1_id['input_ids']) # the forward pass fills self.layers via the hooks registered in layer_processing 155 | sent1_embedding = torch.mean(torch.stack(self.layers[-4:-3]).squeeze(1).permute(1,0,2), dim=1) 156 | model(sent2_id['input_ids']) 157 | sent2_embedding = torch.mean(torch.stack(self.layers[-4:-3]).squeeze(1).permute(1,0,2), dim=1) 158 | self.layers.clear() 159 | 160 | if sent1_embedding.size()[0] - 2 == len(sent1_tokens): 161 | sent1_embedding = sent1_embedding[1:-1,:] # Remove bos and eos tokens 162 | if sent2_embedding.size()[0] - 2 == len(sent2_tokens): 163 | sent2_embedding = sent2_embedding[1:-1,:] # Remove bos and eos tokens 164 | 165 | assert len(sent1_buckets) + len(sent2_buckets) == (sent1_embedding.size()[0] + sent2_embedding.size()[0]) 166 | return sent1_buckets, sent2_buckets, sent1_embedding, sent2_embedding 167 | 168 | def tokens_to_fracdict_contextual(_, tokens): 169 | return {token: 1/len(tokens) for token in range(len(tokens))} 170 | 171 | def tokens_to_fracdict(_, tokens): 172 | cntdict = defaultdict(int) 173 | 174 | for token in tokens: 175 | cntdict[token] += 1 176 | totalcnt = sum(cntdict.values()) 177 | return {token: float(cnt)/totalcnt for token, cnt in cntdict.items()} 178 | 179 | def get_WMD_Model(_, name): 180 | tokenizer = AutoTokenizer.from_pretrained(name) 181 | model = AutoModel.from_pretrained(name, return_dict=True) 182 | # for embed_type=True the static input embeddings (model.embeddings.word_embeddings) are used instead of hidden states 183 | model.eval() 184 | return tokenizer, model 185 | 186 | def layer_processing(_, model): 187 | layers = [] 188 | 189 | for i in model.encoder.layer: 190 | i.register_forward_hook(lambda *args: layers.append(args[2][0])) # hook receives (module, input, output); output[0] are the hidden states 191 | 192 | return layers 193 | -------------------------------------------------------------------------------- /metrics/utils/embed.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.nn.utils.rnn import pad_sequence 3 | from torch.utils.data import DataLoader, SequentialSampler, TensorDataset 4 | from collections import defaultdict 5 | from os.path import join, isfile 6 | from shutil import copyfileobj 7 | from urllib.request import urlretrieve 8 | from gzip import open as gopen 9 | from .language import WordTokenizer 10 | from .vecmap.map_embeddings import vecmap 11 | from .env import DATADIR 12 | 13 | fasttext_url = "https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/" 14 | 15 | def padding(arr, pad_token, dtype=torch.long): 16 | lens = torch.LongTensor([len(a) for a in arr]) 17 | max_len = lens.max().item() 18 | padded = torch.ones(len(arr), max_len, dtype=dtype) * pad_token 19 | mask = torch.zeros(len(arr), max_len, dtype=torch.long) 20 | for i, a in enumerate(arr): 21 | padded[i, :lens[i]] = torch.tensor(a, dtype=dtype) 22 | mask[i, :lens[i]] = 1 23 | return padded, mask 24 | 25 | def collate_idf(arr, tokenize, numericalize, max_len): 26 | tokens = [["[CLS]"] + tokenize(a)[:max_len] + ["[SEP]"] for a in arr] 27 | arr = [numericalize(a) for a in tokens] 28
| idf_dict = defaultdict(lambda: 1.) # uniform idf weights; no corpus statistics are collected 29 | idf_weights = [[idf_dict[i] for i in a] for a in arr] 30 | pad_token = numericalize(["[PAD]"])[0] 31 | padded, mask = padding(arr, pad_token, dtype=torch.long) 32 | padded_idf, _ = padding(idf_weights, pad_token, dtype=torch.float) 33 | 34 | return padded, padded_idf, mask, tokens 35 | 36 | def bert_embed(all_sens, batch_size, model, tokenizer, device): 37 | if len(all_sens) == 0: 38 | return torch.empty(0, 0, 768), torch.empty(0, 0, 1), list(), torch.empty(0, 0, 1) 39 | padded_sens, padded_idf, mask, tokens = collate_idf(all_sens, tokenizer.tokenize, tokenizer.convert_tokens_to_ids, 40 | tokenizer.max_len_single_sentence) 41 | data = TensorDataset(padded_sens, mask) 42 | sampler = SequentialSampler(data) 43 | dataloader = DataLoader(data, sampler=sampler, batch_size=batch_size) 44 | all_embeddings = torch.zeros((len(all_sens), mask.shape[1], model.config.hidden_size)) 45 | 46 | model.eval() 47 | with torch.no_grad(): 48 | for batch_index, (batch_padded_sens, batch_mask) in enumerate(dataloader): 49 | pos = batch_index * batch_size 50 | batch_padded_sens = batch_padded_sens.to(device) 51 | batch_mask = batch_mask.to(device) 52 | all_embeddings[pos:pos + len(batch_mask)] = model(batch_padded_sens, batch_mask)["last_hidden_state"].cpu() 53 | return all_embeddings, padded_idf, tokens, mask.unsqueeze(-1) 54 | 55 | def map_multilingual_embeddings(src_lang, tgt_lang, batch_size, device): 56 | src_emb = get_embeddings_file(src_lang) 57 | tgt_emb = get_embeddings_file(tgt_lang) 58 | 59 | arguments = ['--batch_size', str(batch_size), '--unsupervised', src_emb, tgt_emb] 60 | if "cuda" in device: 61 | arguments.insert(0, '--cuda') 62 | return vecmap(arguments) 63 | 64 | def get_embeddings_file(lang_id): 65 | filename = f"cc.{lang_id}.300.vec" 66 | gz_filename = filename + ".gz" 67 | 68 | if isfile(join(DATADIR, filename)): 69 | return join(DATADIR, filename) 70 | 71 | urlretrieve(join(fasttext_url, gz_filename), join(DATADIR, gz_filename)) 72 | 73 | with gopen(join(DATADIR, gz_filename), 'rb') as f: 74 | with open(join(DATADIR, filename), 'wb') as f_out: 75 | copyfileobj(f, f_out) 76 | 77 | return join(DATADIR, filename) 78 | 79 | def vecmap_embed(all_sents, lang_dict, lang): 80 | tokens, idf_weights, embeddings = list(), list(), list() 81 | with WordTokenizer(lang) as tokenize: 82 | for sent in all_sents: 83 | tokens.append([word for word in tokenize(sent)]) 84 | idf_weights.append([1] * len(tokens[-1])) 85 | embeddings.append(torch.stack([lang_dict[word] for word in tokens[-1]])) 86 | 87 | idf_weights, mask = padding(idf_weights, 0, dtype=torch.float) 88 | embeddings = pad_sequence(embeddings, batch_first=True) 89 | 90 | return embeddings, idf_weights, tokens, mask.unsqueeze(-1) 91 | -------------------------------------------------------------------------------- /metrics/utils/env.py: -------------------------------------------------------------------------------- 1 | from os import getenv 2 | from os.path import join 3 | from pathlib import Path 4 | 5 | DATADIR = getenv("METRICS_HOME", join(getenv("XDG_CACHE_HOME", join(Path.home(), ".cache")), "metrics")) 6 | Path(DATADIR).mkdir(parents=True, exist_ok=True) 7 | -------------------------------------------------------------------------------- /metrics/utils/knn.py: -------------------------------------------------------------------------------- 1 | from faiss import IndexFlatL2, IndexFlatIP, index_cpu_to_all_gpus, normalize_L2 2 | import numpy as np 3 | 4 | # Adapted from
https://github.com/pytorch/fairseq/blob/master/examples/criss/mining/mine.py 5 | def knn_sharded(source_data, target_data, k, batch_size, device, use_cosine=False): 6 | if use_cosine: 7 | normalize_L2(source_data) 8 | normalize_L2(target_data) 9 | sims = [] 10 | inds = [] 11 | dim = source_data.shape[-1] 12 | xfrom = 0 13 | 14 | for x_batch in np.array_split(source_data, np.ceil(len(source_data) / batch_size)): 15 | yfrom = 0 16 | bsims, binds = [], [] 17 | for y_batch in np.array_split(target_data, np.ceil(len(target_data) / batch_size)): 18 | neighbor_size = min(k, y_batch.shape[0]) 19 | idx = IndexFlatIP(dim) if use_cosine else IndexFlatL2(dim) 20 | if device != 'cpu': 21 | idx = index_cpu_to_all_gpus(idx) 22 | idx.add(y_batch) 23 | bsim, bind = idx.search(x_batch, neighbor_size) 24 | 25 | bsims.append(bsim) 26 | binds.append(bind + yfrom) 27 | yfrom += y_batch.shape[0] 28 | del idx 29 | del y_batch 30 | bsims = np.concatenate(bsims, axis=1) 31 | binds = np.concatenate(binds, axis=1) 32 | aux = np.argsort(-bsims, axis=1) 33 | sim_batch = np.zeros((x_batch.shape[0], k), dtype=np.float32) 34 | ind_batch = np.zeros((x_batch.shape[0], k), dtype=np.int64) 35 | for i in range(x_batch.shape[0]): 36 | for j in range(k): 37 | sim_batch[i, j] = bsims[i, aux[i, j]] 38 | ind_batch[i, j] = binds[i, aux[i, j]] 39 | sims.append(sim_batch) 40 | inds.append(ind_batch) 41 | xfrom += x_batch.shape[0] 42 | del x_batch 43 | sim = np.concatenate(sims, axis=0) 44 | ind = np.concatenate(inds, axis=0) 45 | return sim, ind 46 | 47 | def score_candidates(sim_mat, candidate_inds, fwd_mean, bwd_mean): 48 | scores = np.zeros(candidate_inds.shape) 49 | for i in range(scores.shape[0]): 50 | for j in range(scores.shape[1]): 51 | k = int(candidate_inds[i, j]) 52 | scores[i, j] = sim_mat[i, j] / ((fwd_mean[i] + bwd_mean[k]) / 2) # ratio margin: similarity divided by the average of both k-NN means 53 | return scores 54 | 55 | def ratio_margin_align(source_data, target_data, k, batch_size, device): 56 | src2tgt_sim, src2tgt_ind = knn_sharded(source_data.numpy(), target_data.numpy(), k, batch_size, device, True) 57 | tgt2src_sim, _ = knn_sharded(target_data.numpy(), source_data.numpy(), k, batch_size, device, True) # use cosine in both directions so the margin denominator is consistent 58 | 59 | src2tgt_mean = src2tgt_sim.mean(axis=1) 60 | tgt2src_mean = tgt2src_sim.mean(axis=1) 61 | fwd_scores = score_candidates(src2tgt_sim, src2tgt_ind, src2tgt_mean, tgt2src_mean) 62 | fwd_best = src2tgt_ind[np.arange(src2tgt_sim.shape[0]), fwd_scores.argmax(axis=1)] 63 | 64 | return np.insert(np.expand_dims(fwd_best, 1), 0, range(len(fwd_best)), 1), fwd_scores.max(axis=1) 65 | 66 | def wcd_align(source_data, target_data, k, batch_size, device): 67 | squared_scores, indices = knn_sharded(source_data.numpy(), target_data.numpy(), k, batch_size, device) 68 | return indices, np.sqrt(squared_scores) # IndexFlatL2 returns squared distances 69 | 70 | def cosine_align(source_data, target_data, k, batch_size, device): 71 | scores, indices = knn_sharded(source_data.numpy(), target_data.numpy(), k, batch_size, device, True) 72 | return indices, scores 73 | -------------------------------------------------------------------------------- /metrics/utils/language.py: -------------------------------------------------------------------------------- 1 | from os.path import isfile, join 2 | from fasttext import FastText, load_model 3 | from urllib.request import urlretrieve 4 | from collections import defaultdict 5 | from tempfile import mkdtemp 6 | from mosestokenizer import MosesTokenizer, MosesSentenceSplitter 7 | from Nepali_nlp import Tokenizer 8 | from sinling import SinhalaTokenizer 9 | from jieba import cut 10 | from re import
findall, U 11 | 12 | class LangDetect(): 13 | url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/" 14 | 15 | def __init__(self, compress=False, cache_dir=mkdtemp()): 16 | # fixes https://github.com/facebookresearch/fastText/issues/1067 for the time being 17 | FastText.eprint = lambda _: None 18 | self.cache_dir = cache_dir 19 | self.model = self.load_model("lid.176.ftz" if compress else "lid.176.bin") 20 | 21 | def load_model(self, name): 22 | target_path = join(self.cache_dir, name) 23 | if not isfile(target_path): 24 | urlretrieve(join(self.url, name), target_path) 25 | return load_model(target_path) 26 | 27 | def detect(self, texts, return_score=False): 28 | texts = [texts] if isinstance(texts, str) else texts 29 | counter = defaultdict(float) 30 | 31 | for text in texts: 32 | labels, scores = self.model.predict(text.strip()) 33 | label = labels[0].removeprefix("__label__") 34 | score = min(float(scores[0]), 1.0) 35 | counter[label] += score 36 | label, score = sorted(counter.items(), key=lambda tup: tup[1])[-1] 37 | return (label, score) if return_score else label 38 | 39 | 40 | class WordTokenizer(): 41 | def __init__(self, language): 42 | if language == "si": 43 | self.tokenize = SinhalaTokenizer().tokenize 44 | # since bn and hi are related to ne and use the same script we can use the ne tokenizer for all 45 | elif language in ["ne", "bn", "hi"]: 46 | self.tokenize = Tokenizer().word_tokenize 47 | elif language == "zh": 48 | self.tokenize = lambda sent: list(cut(sent)) 49 | else: 50 | # zulu and xhosa follow english punctuation 51 | self.tokenize = MosesTokenizer("en" if language in ["zu", "xh"] else language) 52 | 53 | def __call__(self, sentence): 54 | return self.tokenize(sentence) 55 | 56 | def __enter__(self): 57 | return self.tokenize 58 | 59 | def __exit__(self, *_): 60 | if type(self.tokenize) == MosesTokenizer: 61 | self.tokenize.close() 62 | 63 | def __del__(self): 64 | if type(self.tokenize) == MosesTokenizer: 65 | self.tokenize.close() 66 | 67 | class SentenceSplitter(): 68 | def __init__(self, language): 69 | if language in ["si"]: 70 | self.split = lambda sents: SinhalaTokenizer().split_sentences(" ".join(sents)) 71 | elif language in ["ne", "bn", "hi"]: 72 | self.split = lambda sents: Tokenizer().sentence_tokenize(" ".join(sents)) 73 | elif language == "zh": 74 | self.split = lambda sent: self._split_chinese(sent) 75 | else: 76 | self.split = MosesSentenceSplitter("en" if language in ["zu", "xh"] else language, False) 77 | 78 | # taken from https://stackoverflow.com/a/45274695, modified regex of 79 | # http://aclweb.org/anthology/Y/Y11/Y11-1038.pdf 80 | def _split_chinese(_, sentences): 81 | return [sent.strip() for sent in findall(u'[^!?。\.\!\?]+[!?。\.\!\?]?', "".join(sentences), flags=U)] 82 | 83 | def __call__(self, sentence): 84 | return self.split(sentence) 85 | 86 | def __enter__(self): 87 | return self.split 88 | 89 | def __exit__(self, *_): 90 | if type(self.split) == MosesSentenceSplitter: 91 | self.split.close() 92 | 93 | def __del__(self): 94 | if type(self.split) == MosesSentenceSplitter: 95 | self.split.close() 96 | -------------------------------------------------------------------------------- /metrics/utils/nmt.py: -------------------------------------------------------------------------------- 1 | # Copyright The HuggingFace Team and The HuggingFace Inc. team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Fine-tuning the library models for sequence to sequence. 16 | """ 17 | # Adapted, based on https://github.com/huggingface/transformers/blob/v4.5.1/examples/seq2seq/run_translation.py 18 | 19 | import logging 20 | import os 21 | from dataclasses import dataclass, field 22 | from typing import Optional 23 | from torch.utils.data import DataLoader 24 | from .env import DATADIR 25 | 26 | from datasets import load_dataset 27 | 28 | from transformers import ( 29 | AutoConfig, 30 | AutoModelForSeq2SeqLM, 31 | AutoTokenizer, 32 | DataCollatorForSeq2Seq, 33 | HfArgumentParser, 34 | MBartTokenizer, 35 | MBartTokenizerFast, 36 | MBart50Tokenizer, 37 | MBart50TokenizerFast, 38 | Seq2SeqTrainer, 39 | Seq2SeqTrainingArguments, 40 | default_data_collator, 41 | set_seed, 42 | ) 43 | from transformers.trainer_utils import get_last_checkpoint 44 | from transformers.utils import check_min_version 45 | 46 | 47 | # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 48 | check_min_version("4.5.0") 49 | 50 | logger = logging.getLogger(__name__) 51 | 52 | language2mBART = { 53 | "ar": "ar_AR", "cs": "cs_CZ", "de": "de_DE", "en": "en_XX", "es": "es_XX", 54 | "et": "et_EE", "fi": "fi_FI", "fr": "fr_XX", "gu": "gu_IN", "hi": "hi_IN", 55 | "it": "it_IT", "ja": "ja_XX", "kk": "kk_KZ", "ko": "ko_KR", "lt": "lt_LT", 56 | "lv": "lv_LV", "my": "my_MM", "ne": "ne_NP", "nl": "nl_XX", "ro": "ro_RO", 57 | "ru": "ru_RU", "si": "si_LK", "tr": "tr_TR", "vi": "vi_VN", "zh": "zh_CN" } 58 | 59 | language2mBART50 = language2mBART | { 60 | "af": "af_ZA", "az": "az_AZ", "bn": "bn_IN", "fa": "fa_IR", "he": "he_IL", 61 | "hr": "hr_HR", "id": "id_ID", "ka": "ka_GE", "km": "km_KH", "mk": "mk_MK", 62 | "ml": "ml_IN", "mn": "mn_MN", "mr": "mr_IN", "pl": "pl_PL", "ps": "ps_AF", 63 | "pt": "pt_XX", "sv": "sv_SE", "sw": "sw_KE", "ta": "ta_IN", "te": "te_IN", 64 | "th": "th_TH", "tl": "tl_XX", "uk": "uk_UA", "ur": "ur_PK", "xh": "xh_ZA", 65 | "gl": "gl_ES", "sl": "sl_SI", "zu": "xh_ZA"} # zulu and xhosa are related, so it should be fine 66 | 67 | @dataclass 68 | class ModelArguments: 69 | """ 70 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune from. 71 | """ 72 | 73 | model_name_or_path: str = field( 74 | metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"} 75 | ) 76 | cache_dir: Optional[str] = field( 77 | default=None, 78 | metadata={"help": "Where to store the pretrained models downloaded from huggingface.co"}, 79 | ) 80 | use_fast_tokenizer: bool = field( 81 | default=True, 82 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 83 | ) 84 | 85 | 86 | @dataclass 87 | class DataTrainingArguments: 88 | """ 89 | Arguments pertaining to what data we are going to input our model for training and eval. 
90 | """ 91 | 92 | source_lang: str = field(default=None, metadata={"help": "Source language id for translation."}) 93 | target_lang: str = field(default=None, metadata={"help": "Target language id for translation."}) 94 | 95 | train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a jsonlines)."}) 96 | overwrite_cache: bool = field( 97 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 98 | ) 99 | preprocessing_num_workers: Optional[int] = field( 100 | default=None, 101 | metadata={"help": "The number of processes to use for the preprocessing."}, 102 | ) 103 | max_source_length: Optional[int] = field( 104 | default=1024, 105 | metadata={ 106 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 107 | "than this will be truncated, sequences shorter will be padded." 108 | }, 109 | ) 110 | max_target_length: Optional[int] = field( 111 | default=128, 112 | metadata={ 113 | "help": "The maximum total sequence length for target text after tokenization. Sequences longer " 114 | "than this will be truncated, sequences shorter will be padded." 115 | }, 116 | ) 117 | pad_to_max_length: bool = field( 118 | default=False, 119 | metadata={ 120 | "help": "Whether to pad all samples to model maximum sentence length. " 121 | "If False, will pad the samples dynamically when batching to the maximum length in the batch. More " 122 | "efficient on GPU but very bad for TPU." 123 | }, 124 | ) 125 | max_train_samples: Optional[int] = field( 126 | default=None, 127 | metadata={ 128 | "help": "For debugging purposes or quicker training, truncate the number of training examples to this " 129 | "value if set." 130 | }, 131 | ) 132 | num_beams: Optional[int] = field( 133 | default=None, 134 | metadata={ 135 | "help": "Number of beams to use for evaluation. This argument will be passed to ``model.generate``, " 136 | "which is used during ``evaluate`` and ``predict``." 137 | }, 138 | ) 139 | ignore_pad_token_for_loss: bool = field( 140 | default=True, 141 | metadata={ 142 | "help": "Whether to ignore the tokens corresponding to padded labels in the loss computation or not." 143 | }, 144 | ) 145 | source_prefix: Optional[str] = field( 146 | default=None, metadata={"help": "A prefix to add before every source text (useful for T5 models)."} 147 | ) 148 | 149 | def __post_init__(self): 150 | if self.train_file is None: 151 | raise ValueError("Need a training file.") 152 | elif self.source_lang is None or self.target_lang is None: 153 | raise ValueError("Need to specify the source language and the target language.") 154 | 155 | if self.train_file is not None: 156 | extension = self.train_file.split(".")[-1] 157 | assert extension == "json", "`train_file` should be a json file." 
158 | 159 | def load_model_and_tokenizer(model_name_or_path, source_lang, target_lang, use_fast_tokenizer, cache_dir): 160 | # Load pretrained model and tokenizer 161 | config = AutoConfig.from_pretrained( 162 | model_name_or_path, 163 | cache_dir=cache_dir, 164 | ) 165 | tokenizer = AutoTokenizer.from_pretrained( 166 | model_name_or_path, 167 | cache_dir=cache_dir, 168 | use_fast=use_fast_tokenizer, 169 | ) 170 | model = AutoModelForSeq2SeqLM.from_pretrained( 171 | model_name_or_path, 172 | from_tf=bool(".ckpt" in model_name_or_path), 173 | config=config, 174 | cache_dir=cache_dir, 175 | ) 176 | 177 | # Set decoder_start_token_id 178 | if model.config.decoder_start_token_id is None and isinstance(tokenizer, (MBartTokenizer, MBartTokenizerFast)): 179 | if isinstance(tokenizer, MBartTokenizer): 180 | model.config.decoder_start_token_id = tokenizer.lang_code_to_id[target_lang] 181 | else: 182 | model.config.decoder_start_token_id = tokenizer.convert_tokens_to_ids(target_lang) 183 | 184 | if model.config.decoder_start_token_id is None: 185 | raise ValueError("Make sure that `config.decoder_start_token_id` is correctly defined") 186 | 187 | # For translation we set the codes of our source and target languages (only useful for mBART, the others will 188 | # ignore those attributes). 189 | if isinstance(tokenizer, (MBartTokenizer, MBart50Tokenizer, MBartTokenizerFast, MBart50TokenizerFast)): 190 | tokenizer.src_lang = source_lang 191 | tokenizer.tgt_lang = target_lang 192 | 193 | # For multilingual translation model mBART-50 we need to force the target 194 | # language token as the first generated token. 195 | if isinstance(tokenizer, (MBart50Tokenizer, MBart50TokenizerFast)): 196 | model.config.forced_bos_token_id = tokenizer.lang_code_to_id[target_lang] 197 | 198 | return model, tokenizer 199 | 200 | def _train(args=None): 201 | # See all possible arguments in src/transformers/training_args.py 202 | # or by passing the --help flag to this script. 203 | # We now keep distinct sets of args, for a cleaner separation of concerns. 204 | 205 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, Seq2SeqTrainingArguments)) 206 | model_args, data_args, training_args = parser.parse_args_into_dataclasses(args) 207 | 208 | # Detecting last checkpoint. 209 | last_checkpoint = None 210 | if os.path.isdir(training_args.output_dir) and not training_args.overwrite_output_dir: 211 | last_checkpoint = get_last_checkpoint(training_args.output_dir) 212 | 213 | 214 | if os.path.isfile(os.path.join(training_args.output_dir, 'config.json')): 215 | logger.info( 216 | f"Output directory ({training_args.output_dir}) exists already and is not empty. " 217 | "Skipping training and returning pretrained models." 218 | ) 219 | return load_model_and_tokenizer(training_args.output_dir, data_args.source_lang, 220 | data_args.target_lang, model_args.use_fast_tokenizer, model_args.cache_dir) 221 | elif last_checkpoint is not None: 222 | logger.info( 223 | f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change " 224 | "the `--output_dir` or add `--overwrite_output_dir` to train from scratch." 225 | ) 226 | 227 | # Set seed before initializing model. 228 | set_seed(training_args.seed) 229 | 230 | # For translation, only JSON files are supported, with one field named "translation" containing two keys for the 231 | # source and target languages (unless you adapt what follows). 
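    # For illustration (an editorial addition, not from the upstream script):
    # one line of such a jsonlines training file for a de->en run looks like
    #   {"translation": {"de": "Danke.", "en": "Thank you."}}
    # where the keys must match the plain language codes derived from
    # --source_lang/--target_lang below.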
232 |     data_files = {}
233 |     data_files["train"] = data_args.train_file
234 |     extension = data_args.train_file.split(".")[-1]
235 |     datasets = load_dataset(extension, data_files=data_files, download_mode="force_redownload")
236 |     # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
237 |     # https://huggingface.co/docs/datasets/loading_datasets.html
238 | 
239 |     model, tokenizer = load_model_and_tokenizer(model_args.model_name_or_path, data_args.source_lang,
240 |         data_args.target_lang, model_args.use_fast_tokenizer, model_args.cache_dir)
241 | 
242 |     # Get the language codes for input/target.
243 |     source_lang = data_args.source_lang.split("_")[0]
244 |     target_lang = data_args.target_lang.split("_")[0]
245 | 
246 |     # Temporarily set max_target_length for training.
247 |     max_target_length = data_args.max_target_length
248 |     padding = "max_length" if data_args.pad_to_max_length else False
249 | 
250 |     if training_args.label_smoothing_factor > 0 and not hasattr(model, "prepare_decoder_input_ids_from_labels"):
251 |         logger.warning(
252 |             "label_smoothing is enabled but the `prepare_decoder_input_ids_from_labels` method is not defined for "
253 |             f"`{model.__class__.__name__}`. This will lead to the loss being calculated twice and will take up more memory."
254 |         )
255 | 
256 |     def preprocess_function(examples):
257 |         prefix = data_args.source_prefix if data_args.source_prefix is not None else ""
258 |         inputs = [ex[source_lang] for ex in examples["translation"]]
259 |         targets = [ex[target_lang] for ex in examples["translation"]]
260 |         inputs = [prefix + inp for inp in inputs]
261 |         model_inputs = tokenizer(inputs, max_length=data_args.max_source_length, padding=padding, truncation=True)
262 | 
263 |         # Setup the tokenizer for targets
264 |         with tokenizer.as_target_tokenizer():
265 |             labels = tokenizer(targets, max_length=max_target_length, padding=padding, truncation=True)
266 | 
267 |         # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
268 |         # padding in the loss.
269 | if padding == "max_length" and data_args.ignore_pad_token_for_loss: 270 | labels["input_ids"] = [ 271 | [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"] 272 | ] 273 | 274 | model_inputs["labels"] = labels["input_ids"] 275 | return model_inputs 276 | 277 | train_dataset = datasets["train"] 278 | if data_args.max_train_samples is not None: 279 | train_dataset = train_dataset.select(range(data_args.max_train_samples)) 280 | train_dataset = train_dataset.map( 281 | preprocess_function, 282 | batched=True, 283 | num_proc=data_args.preprocessing_num_workers, 284 | remove_columns=datasets["train"].column_names, 285 | load_from_cache_file=not data_args.overwrite_cache, 286 | ) 287 | 288 | # Data collator 289 | label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id 290 | if data_args.pad_to_max_length: 291 | data_collator = default_data_collator 292 | else: 293 | data_collator = DataCollatorForSeq2Seq( 294 | tokenizer, 295 | model=model, 296 | label_pad_token_id=label_pad_token_id, 297 | pad_to_multiple_of=8 if training_args.fp16 else None, 298 | ) 299 | # Initialize our Trainer 300 | trainer = Seq2SeqTrainer( 301 | model=model, 302 | args=training_args, 303 | train_dataset=train_dataset, 304 | tokenizer=tokenizer, 305 | data_collator=data_collator, 306 | ) 307 | 308 | # Training 309 | train_result = trainer.train(resume_from_checkpoint=last_checkpoint) 310 | trainer.save_model(training_args.output_dir) # Saves the tokenizer too 311 | 312 | metrics = train_result.metrics 313 | max_train_samples = ( 314 | data_args.max_train_samples if data_args.max_train_samples is not None else len(train_dataset) 315 | ) 316 | metrics["train_samples"] = min(max_train_samples, len(train_dataset)) 317 | 318 | trainer.log_metrics("train", metrics) 319 | trainer.save_state() 320 | 321 | return load_model_and_tokenizer(training_args.output_dir, data_args.source_lang, 322 | data_args.target_lang, model_args.use_fast_tokenizer, model_args.cache_dir) 323 | 324 | def train(model, source_lang, target_lang, dataset, overwrite, suffix, name=None): 325 | if "mbart" in model: 326 | lookup = language2mBART50 if "50" in model else language2mBART 327 | source_lang = lookup[source_lang] 328 | target_lang = lookup[target_lang] 329 | args = [ 330 | "--model_name_or_path", model, 331 | "--cache_dir", os.path.join(DATADIR, "translation", name or os.path.basename(model), suffix, "cache"), 332 | "--output_dir", os.path.join(DATADIR, "translation", name or os.path.basename(model), suffix, "output"), 333 | "--source_lang", source_lang, 334 | "--target_lang", target_lang, 335 | "--train_file", dataset, 336 | "--report_to", "none", 337 | "--save_strategy", "epoch", 338 | "--per_device_train_batch_size", "4", "--do_train"] 339 | if overwrite: 340 | args.append("--overwrite_output_dir") 341 | 342 | return _train(args) 343 | 344 | def translate(model, tokenizer, sentences, batch_size, device): 345 | translated = list() 346 | for batch in DataLoader(sentences, batch_size=batch_size): 347 | inputs = tokenizer(batch, return_tensors="pt", padding=True) 348 | inputs = {k: v.to(device) for k, v in inputs.items()} 349 | translated_tokens = model.generate(**inputs, decoder_start_token_id=model.config.decoder_start_token_id) 350 | translated.extend(tokenizer.batch_decode(translated_tokens.cpu(), skip_special_tokens=True)) 351 | return translated 352 | 353 | if __name__ == "__main__": 354 | _train() 355 | 
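# Usage sketch (an editorial addition, not part of the original module). The
# checkpoint name, dataset path and language pair are illustrative placeholders:
#
#   from metrics.utils.nmt import train, translate
#
#   model, tokenizer = train("facebook/mbart-large-50", "de", "en",
#                            dataset="pseudo_parallel.json",  # jsonlines file
#                            overwrite=False, suffix="de-en")
#   hyps = translate(model.to("cuda"), tokenizer,
#                    ["Ein kleiner Test."], batch_size=8, device="cuda")
#
# Since "mbart" and "50" both appear in the model name, train() maps "de"/"en"
# to the mBART-50 codes "de_DE"/"en_XX" via language2mBART50 before fine-tuning.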
--------------------------------------------------------------------------------
/metrics/utils/perplexity.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoModelForCausalLM, AutoTokenizer
2 | from torch import tensor
3 | 
4 | def lm_perplexity(hyps, device, name="gpt2"):
5 |     if name is None:
6 |         return [0] * len(hyps)
7 | 
8 |     # Some models need a special tokenizer, like Chinese GPT-2, see here:
9 |     # https://huggingface.co/ckiplab/gpt2-base-chinese
10 |     model_name, tokenizer_name = (name, name) if isinstance(name, str) else name
11 | 
12 |     model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
13 |     tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
14 | 
15 |     scores = list()
16 |     model.eval()
17 |     for hyp in hyps:
18 |         tokenize_input = tokenizer.tokenize(hyp)
19 | 
20 |         if len(tokenize_input) <= 1:
21 |             scores.append(0)
22 |         else:
23 |             # GPT-2 has a context window of 1024 tokens, so longer inputs are truncated.
24 |             if len(tokenize_input) > 1024:
25 |                 tokenize_input = tokenize_input[:1024]
26 | 
27 |             input_ids = tensor([tokenizer.convert_tokens_to_ids(tokenize_input)]).to(device)
28 |             # The first model output is the mean cross-entropy loss; its negation is the
29 |             # average log-likelihood, so higher scores indicate more fluent hypotheses.
30 |             score = model(input_ids, labels=input_ids)[0]
31 |             scores.append(-score.item())
32 | 
33 |     return scores
--------------------------------------------------------------------------------
/metrics/utils/remap.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | from itertools import chain
4 | from subprocess import check_output, DEVNULL
5 | from tempfile import NamedTemporaryFile as TempFile
6 | from simalign import SentenceAligner
7 | from torch.utils.data import DataLoader, SequentialSampler, TensorDataset
8 | from .env import DATADIR
9 | 
10 | def convert_sent_to_input(sents, tokenizer, max_seq_length):
11 |     input_ids = []
12 |     mask = []
13 |     for sent in sents:
14 |         ids = tokenizer.convert_tokens_to_ids(sent)
15 |         mask.append([1] * (len(ids) + 2) + [0] * (max_seq_length - len(ids)))
16 |         input_ids.append([101] + ids + [102] + [0] * (max_seq_length - len(ids)))  # 101/102 are BERT's [CLS]/[SEP] ids
17 |     return torch.tensor(input_ids, dtype=torch.long), torch.tensor(mask, dtype=torch.long)
18 | 
19 | def convert_words_to_bpe(sent_pairs, tokenizer):
20 |     bpe_para, bpe_table = [], []
21 | 
22 |     for (src_sent, tgt_sent) in sent_pairs:
23 |         src_bpe_table, tgt_bpe_table = [], []
24 |         src_sent_bpe, tgt_sent_bpe = [], []
25 | 
26 |         for word in src_sent:
27 |             token = tokenizer.tokenize(word)
28 |             word2bpe_map = []
29 |             for i in range(len(token)):
30 |                 word2bpe_map.append(len(src_sent_bpe)+i)
31 |             src_sent_bpe.extend(token)
32 |             src_bpe_table.append(word2bpe_map)
33 | 
34 |         for word in tgt_sent:
35 |             token = tokenizer.tokenize(word)
36 |             word2bpe_map = []
37 |             for i in range(len(token)):
38 |                 word2bpe_map.append(len(tgt_sent_bpe)+i)
39 |             tgt_sent_bpe.extend(token)
40 |             tgt_bpe_table.append(word2bpe_map)
41 | 
42 |         bpe_para.append([src_sent_bpe, tgt_sent_bpe])
43 |         bpe_table.append([src_bpe_table, tgt_bpe_table])
44 | 
45 |     return bpe_para, bpe_table
46 | 
47 | 
48 | def get_aligned_features_avgbpe(sent_pairs, align_pairs, model,
49 |         tokenizer, batch_size, device, layer=12, max_seq_length=175):
50 |     bpe_para, bpe_table = convert_words_to_bpe(sent_pairs, tokenizer)
51 | 
52 |     # filter long/empty sentences
53 |     fltr_src_bpe, fltr_tgt_bpe, fltr_align_pairs, fltr_bpe_table, align_cnt = [], [], [], [], 0
54 |     for cnt, (src, tgt) in enumerate(bpe_para):
55 |         if len(src) <= max_seq_length and len(tgt) <= max_seq_length and len(src) > 0 and len(tgt) > 0:
56 |             fltr_src_bpe.append(src)
57 | 
fltr_tgt_bpe.append(tgt) 58 | fltr_align_pairs.append(align_pairs[cnt]) 59 | fltr_bpe_table.append(bpe_table[cnt]) 60 | align_cnt += len(align_pairs[cnt]) 61 | 62 | src_input, src_mask = convert_sent_to_input(fltr_src_bpe, tokenizer, max_seq_length) 63 | tgt_input, tgt_mask = convert_sent_to_input(fltr_tgt_bpe, tokenizer, max_seq_length) 64 | 65 | src_data = TensorDataset(src_input, src_mask) 66 | src_sampler = SequentialSampler(src_data) 67 | src_dataloader = DataLoader(src_data, sampler=src_sampler, batch_size=batch_size) 68 | 69 | tgt_data = TensorDataset(tgt_input, tgt_mask) 70 | tgt_sampler = SequentialSampler(tgt_data) 71 | tgt_dataloader = DataLoader(tgt_data, sampler=tgt_sampler, batch_size=batch_size) 72 | 73 | src_embed = [] 74 | tgt_embed = [] 75 | 76 | model.eval() 77 | with torch.no_grad(): 78 | for batch in src_dataloader: 79 | input_ids, input_mask = batch 80 | input_ids = input_ids.to(device) 81 | input_mask = input_mask.to(device) 82 | 83 | hidden_state = model(input_ids, attention_mask=input_mask)["hidden_states"][layer] 84 | src_embed.append(hidden_state[:,1:].cpu().numpy()) # remove CLS 85 | 86 | with torch.no_grad(): 87 | for batch in tgt_dataloader: 88 | input_ids, input_mask = batch 89 | input_ids = input_ids.to(device) 90 | input_mask = input_mask.to(device) 91 | 92 | hidden_state = model(input_ids, attention_mask=input_mask)["hidden_states"][layer] 93 | tgt_embed.append(hidden_state[:,1:].cpu().numpy()) 94 | 95 | src_embed = np.concatenate(src_embed) 96 | tgt_embed = np.concatenate(tgt_embed) 97 | 98 | feature_size = src_embed.shape[2] 99 | cnt, src_matrix, tgt_matrix = 0, np.zeros((align_cnt, feature_size)), np.zeros((align_cnt, feature_size)) 100 | for i, pairs in enumerate(fltr_align_pairs): 101 | for a in pairs: 102 | if len(fltr_bpe_table[i][0][a[0]]) > 0 and len(fltr_bpe_table[i][1][a[1]]) > 0: # token alignment (0,0) 103 | src_word_avg_embed = np.zeros((1, feature_size)) 104 | 105 | for j in fltr_bpe_table[i][0][a[0]]: 106 | src_word_avg_embed += src_embed[i][j,:] 107 | src_matrix[cnt,:] = src_word_avg_embed / len(fltr_bpe_table[i][0][a[0]]) 108 | 109 | tgt_word_avg_embed = np.zeros((1, feature_size)) 110 | for j in fltr_bpe_table[i][1][a[1]]: 111 | tgt_word_avg_embed += tgt_embed[i][j,:] 112 | 113 | tgt_matrix[cnt,:] = tgt_word_avg_embed / len(fltr_bpe_table[i][1][a[1]]) 114 | cnt += 1 115 | 116 | return src_matrix, tgt_matrix 117 | 118 | def fast_align(sent_pairs, tokenizer, size, max_seq_length=100): 119 | tokenized_pairs = list() 120 | for source_sent, target_sent in sent_pairs: 121 | sent1 = tokenizer.basic_tokenizer.tokenize(source_sent) 122 | sent2 = tokenizer.basic_tokenizer.tokenize(target_sent) 123 | 124 | if 0 < len(sent1) <= max_seq_length and 0 < len(sent2) <= max_seq_length: 125 | tokenized_pairs.append((sent1, sent2)) 126 | 127 | if len(tokenized_pairs) >= size: 128 | break 129 | 130 | with TempFile(dir=DATADIR, buffering=0) as fwd_file, TempFile(dir=DATADIR, buffering=0) as bwd_file: 131 | for file_, data, flags in ((fwd_file, tokenized_pairs, "-dov"), (bwd_file, tokenized_pairs, "-dovr")): 132 | file_.write("\n".join([f'{" ".join(src)} ||| {" ".join(tgt)}'.lower() for src, tgt in data]).encode()) 133 | asym_aligned = check_output(["fast_align", "-i", file_.name, flags], stderr=DEVNULL) 134 | file_.seek(0) 135 | file_.truncate() 136 | file_.write(asym_aligned) 137 | 138 | sym_aligned = check_output(["atools", "-i", fwd_file.name, "-j", bwd_file.name, "-c", "grow-diag-final-and"]) 139 | 140 | sym_aligned = [[tuple(map(int, pair.split(b"-"))) 
for pair in pairs.split()] for pairs in sym_aligned.splitlines()] 141 | return tokenized_pairs, sym_aligned 142 | 143 | def awesome_align(sentpairs, model, tokenizer, size, device, projection=None, max_seq_length=100): 144 | tokenized_pairs, alignments = list(), list() 145 | for src, tgt in sentpairs: 146 | sent_src, sent_tgt = tokenizer.basic_tokenizer.tokenize(src), tokenizer.basic_tokenizer.tokenize(tgt) 147 | if 0 < len(sent_src) <= max_seq_length and 0 < len(sent_tgt) <= max_seq_length: 148 | token_src = [tokenizer.tokenize(word) for word in sent_src] 149 | token_tgt = [tokenizer.tokenize(word) for word in sent_tgt] 150 | wid_src = [tokenizer.convert_tokens_to_ids(x) for x in token_src] 151 | wid_tgt = [tokenizer.convert_tokens_to_ids(x) for x in token_tgt] 152 | ids_src = tokenizer.prepare_for_model(list(chain(*wid_src)), return_tensors='pt', truncation=True)['input_ids'] 153 | ids_tgt = tokenizer.prepare_for_model(list(chain(*wid_tgt)), return_tensors='pt', truncation=True)['input_ids'] 154 | sub2word_map_src = [] 155 | for i, word_list in enumerate(token_src): 156 | sub2word_map_src.extend([i] * len(word_list)) 157 | sub2word_map_tgt = [] 158 | for i, word_list in enumerate(token_tgt): 159 | sub2word_map_tgt.extend([i] * len(word_list)) 160 | 161 | # alignment 162 | align_layer = 8 163 | threshold = 1e-3 164 | model.eval() 165 | with torch.no_grad(): 166 | out_src = model(ids_src.unsqueeze(0).to(device))["hidden_states"][align_layer] 167 | out_tgt = model(ids_tgt.unsqueeze(0).to(device))["hidden_states"][align_layer] 168 | 169 | if projection is not None: 170 | projection = projection.to(device) 171 | if projection.ndim == 2: # CLP 172 | out_src = torch.matmul(out_src, projection) 173 | else: # UMD 174 | out_src = out_src - (out_src * projection).sum(2, keepdim=True) * \ 175 | projection.repeat(out_src.shape[0], out_src.shape[1], 1) 176 | 177 | dot_prod = torch.matmul(out_src[0, 1:-1].cpu(), out_tgt[0, 1:-1].transpose(-1, -2).cpu()) 178 | 179 | softmax_srctgt = torch.nn.Softmax(dim=-1)(dot_prod) 180 | softmax_tgtsrc = torch.nn.Softmax(dim=-2)(dot_prod) 181 | 182 | softmax_inter = (softmax_srctgt > threshold)*(softmax_tgtsrc > threshold) 183 | 184 | align_subwords = torch.nonzero(softmax_inter, as_tuple=False) 185 | align_words = set() 186 | for i, j in align_subwords: 187 | align_words.add((sub2word_map_src[i], sub2word_map_tgt[j])) 188 | 189 | tokenized_pairs.append((sent_src, sent_tgt)) 190 | alignments.append(list(align_words)) 191 | 192 | if len(tokenized_pairs) >= size: 193 | break 194 | 195 | return tokenized_pairs, alignments 196 | 197 | def sim_align(sent_pairs, tokenizer, size, device, max_seq_length=100): 198 | tokenized_pairs, alignments = list(), list() 199 | aligner = SentenceAligner(matching_methods="i", token_type="word", device=device) 200 | for source_sent, target_sent in sent_pairs: 201 | sent1 = tokenizer.basic_tokenizer.tokenize(source_sent) 202 | sent2 = tokenizer.basic_tokenizer.tokenize(target_sent) 203 | 204 | if 0 < len(sent1) <= max_seq_length and 0 < len(sent2) <= max_seq_length: 205 | tokenized_pairs.append((sent1, sent2)) 206 | alignments.append(aligner.get_word_aligns(sent1, sent2)["itermax"]) 207 | 208 | if len(tokenized_pairs) >= size: 209 | break 210 | 211 | return tokenized_pairs, alignments 212 | 213 | def clp(x, z, orthogonal=True): 214 | if orthogonal: 215 | u, _, vt = np.linalg.svd(z.T.dot(x)) 216 | w = vt.T.dot(u.T) 217 | else: 218 | x_pseudoinv = np.linalg.inv(x.T.dot(x)).dot(x.T) 219 | w = x_pseudoinv.dot(z) 220 | return torch.Tensor(w) 221 
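# Usage sketch (an editorial addition): the projections learned by clp() above
# and umd() below are the ones consumed by awesome_align() earlier in this
# file, which applies a 2-D matrix as CLP and a 1-D direction vector as UMD.
# `model`, `tokenizer`, `sent_pairs` and `device` are assumed to exist:
#
#   pairs, aligns = fast_align(sent_pairs, tokenizer, size=2000)
#   src_mat, tgt_mat = get_aligned_features_avgbpe(pairs, aligns, model,
#       tokenizer, batch_size=16, device=device)
#   projection = clp(src_mat, tgt_mat)    # or: umd(src_mat, tgt_mat)
#   _, refined = awesome_align(sent_pairs, model, tokenizer, size=2000,
#                              device=device, projection=projection)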
| 222 | def umd(x, z): 223 | *_, v = np.linalg.svd(x - z) 224 | v_b = v[0] 225 | return torch.Tensor(v_b) 226 | -------------------------------------------------------------------------------- /metrics/utils/vecmap/.gitignore: -------------------------------------------------------------------------------- 1 | data/ 2 | output/ 3 | private/ 4 | 5 | *~ 6 | .DS_Store 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | .idea/ 11 | -------------------------------------------------------------------------------- /metrics/utils/vecmap/README.md: -------------------------------------------------------------------------------- 1 | VecMap (cross-lingual word embedding mappings) 2 | ============== 3 | 4 | This is an open source implementation of our framework to learn cross-lingual word embedding mappings, described in the following papers: 5 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2018. **[A robust self-learning method for fully unsupervised cross-lingual mappings of word embeddings](https://aclweb.org/anthology/P18-1073)**. In *Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*. 6 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2018. **[Generalizing and improving bilingual word embedding mappings with a multi-step framework of linear transformations](https://www.aaai.org/ocs/index.php/AAAI/AAAI18/paper/view/16935/16781)**. In *Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence (AAAI-18)*, pages 5012-5019. 7 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2017. **[Learning bilingual word embeddings with (almost) no bilingual data](https://aclweb.org/anthology/P17-1042)**. In *Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)*, pages 451-462. 8 | - Mikel Artetxe, Gorka Labaka, and Eneko Agirre. 2016. **[Learning principled bilingual mappings of word embeddings while preserving monolingual invariance](https://aclweb.org/anthology/D16-1250)**. In *Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing*, pages 2289-2294. 9 | 10 | The package includes a script to build cross-lingual word embeddings with or without parallel data as described in the papers, as well as evaluation tools in word translation induction, word similarity/relatedness and word analogy. 11 | 12 | If you use this software for academic research, [please cite the relevant paper(s)](#publications). 13 | 14 | 15 | Requirements 16 | -------- 17 | - Python 3 18 | - NumPy 19 | - SciPy 20 | - CuPy (optional, only required for CUDA support) 21 | 22 | 23 | Usage 24 | -------- 25 | 26 | In order to build your own cross-lingual word embeddings, you should first train monolingual word embeddings for each language using your favorite tool (e.g. [word2vec](https://github.com/tmikolov/word2vec) or [fasttext](https://github.com/facebookresearch/fastText)) and then map them to a common space with our software as described below. Having done that, you can evaluate the resulting cross-lingual embeddings using our included tools as discussed next. 
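For example, assuming a fastText binary is available (the corpus paths and the choice of the `--unsupervised` mode below are illustrative), a minimal end-to-end pipeline could look like this:
```
./fasttext skipgram -input corpus.en.txt -output emb.en
./fasttext skipgram -input corpus.de.txt -output emb.de
python3 map_embeddings.py --unsupervised emb.en.vec emb.de.vec EN_MAPPED.EMB DE_MAPPED.EMB
```
The `.vec` files produced by fastText are already in the word2vec text format expected by `map_embeddings.py`.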
27 | 
28 | #### Mapping
29 | 
30 | The mapping software offers four main modes with our recommended settings for different scenarios:
31 | 
32 | - **Supervised** (recommended if you have a large training dictionary):
33 | ```
34 | python3 map_embeddings.py --supervised TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
35 | ```
36 | - **Semi-supervised** (recommended if you have a small seed dictionary):
37 | ```
38 | python3 map_embeddings.py --semi_supervised TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
39 | ```
40 | - **Identical** (recommended if you have no seed dictionary but can rely on identical words):
41 | ```
42 | python3 map_embeddings.py --identical SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
43 | ```
44 | - **Unsupervised** (recommended if you have no seed dictionary and do not want to rely on identical words):
45 | ```
46 | python3 map_embeddings.py --unsupervised SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
47 | ```
48 | 
49 | `SRC.EMB` and `TRG.EMB` refer to the input monolingual embeddings, which should be in the word2vec text format, whereas `SRC_MAPPED.EMB` and `TRG_MAPPED.EMB` refer to the output cross-lingual embeddings. The training dictionary `TRAIN.DICT`, if any, should be given as a text file with one entry per line (source word + whitespace + target word).
50 | 
51 | If you have an NVIDIA GPU, append the `--cuda` flag to the above commands to make things faster.
52 | 
53 | For most users, the above settings should suffice. Choosing the right mode should be straightforward depending on the resources available: as a general rule, you should prefer the mode with the highest supervision for the resources you have, although it is advisable to try different variants in case of doubt.
54 | 
55 | In addition to these recommended modes, the software also offers additional options to adjust different aspects of the mapping method as described in the papers. While most users should not need to deal with those, you can learn more about them by running the tool with the `--help` flag. You can either use one of the recommended modes and modify a few options on top of it, or skip the recommended modes entirely and set all options yourself. In fact, if you dig into the code, you will see that the above modes simply set recommended defaults for all the different options.
56 | 
57 | #### Evaluation
58 | 
59 | You can evaluate your mapped embeddings in bilingual lexicon extraction (aka dictionary induction or word translation) as follows:
60 | ```
61 | python3 eval_translation.py SRC_MAPPED.EMB TRG_MAPPED.EMB -d TEST.DICT
62 | ```
63 | The above command uses standard nearest neighbor retrieval by default. For best results, it is recommended that you use CSLS retrieval instead:
64 | ```
65 | python3 eval_translation.py SRC_MAPPED.EMB TRG_MAPPED.EMB -d TEST.DICT --retrieval csls
66 | ```
67 | While better, CSLS is also significantly slower than nearest neighbor retrieval, so do not forget to append the `--cuda` flag to the above command if you have an NVIDIA GPU.
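As with `TRAIN.DICT`, the test dictionary is a plain text file with one whitespace-separated entry per line; an illustrative English-German fragment:
```
dog hund
house haus
```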
68 | 
69 | In addition to bilingual lexicon extraction, you can also evaluate your mapped embeddings in cross-lingual word similarity as follows:
70 | ```
71 | python3 eval_similarity.py -l --backoff 0 SRC_MAPPED.EMB TRG_MAPPED.EMB -i TEST_SIMILARITY.TXT
72 | ```
73 | 
74 | Finally, we also offer an evaluation tool for monolingual word analogies, which mimics the one included with word2vec but should run significantly faster:
75 | ```
76 | python3 eval_analogy.py -l SRC_MAPPED.EMB -i TEST_ANALOGIES.TXT -t 30000
77 | ```
78 | 
79 | 
80 | Dataset
81 | --------
82 | You can use the following script to download the main dataset used in our papers, which is an extension of that of [Dinu et al. (2014)](http://clic.cimec.unitn.it/~georgiana.dinu/down/):
83 | ```
84 | ./get_data.sh
85 | ```
86 | 
87 | 
88 | Reproducing results
89 | --------
90 | 
91 | While we always recommend using the above settings for best results when working with your own embeddings, we also offer additional modes to replicate the systems from our different papers as follows:
92 | - **ACL 2018** (currently equivalent to the unsupervised mode):
93 | ```
94 | python3 map_embeddings.py --acl2018 SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
95 | ```
96 | - **AAAI 2018** (currently equivalent to the supervised mode, except for minor differences in re-weighting, normalization and dimensionality reduction):
97 | ```
98 | python3 map_embeddings.py --aaai2018 TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
99 | ```
100 | - **ACL 2017** (superseded by our ACL 2018 system; offers two modes depending on the initialization):
101 | ```
102 | python3 map_embeddings.py --acl2017 SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
103 | python3 map_embeddings.py --acl2017_seed TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
104 | ```
105 | - **EMNLP 2016** (superseded by our AAAI 2018 system):
106 | ```
107 | python3 map_embeddings.py --emnlp2016 TRAIN.DICT SRC.EMB TRG.EMB SRC_MAPPED.EMB TRG_MAPPED.EMB
108 | ```
109 | 
110 | 
111 | FAQ
112 | --------
113 | 
114 | ##### How long does training take?
115 | 
116 | - The supervised mode (`--supervised`) should run in around 2 minutes on either CPU or GPU.
117 | - The rest of the recommended modes (either `--semi_supervised`, `--identical` or `--unsupervised`) should run in around 5 hours on CPU, or 10 minutes on GPU (Titan Xp or similar).
118 | 
119 | 
120 | ##### This is running much slower for me! What can I do?
121 | 
122 | 1. If you have a GPU, do not forget the `--cuda` flag.
123 | 2. Make sure that your NumPy installation is properly linked to BLAS/LAPACK. This is particularly important if you are working on CPU, as it can have a huge impact on performance if not properly set up.
124 | 3. There are different settings that affect the execution time of the algorithm and can thus be adjusted to make things faster: the batch size (`--batch_size`), the vocabulary cutoff (`--vocabulary_cutoff`), the stochastic dictionary induction settings (`--stochastic_initial`, `--stochastic_multiplier` and `--stochastic_interval`) and the convergence threshold (`--threshold`), among others. However, most of these settings will have a direct impact on the quality of the resulting embeddings, so you should not play with them unless you really know what you are doing.
125 | 
126 | 
127 | ##### Prior versions of this software included nice scripts to reproduce the exact same results reported in your papers. Why are those missing now?
128 | 
129 | As the complexity of the software (and the number of publications/results to reproduce) increased, maintaining those nice scripts became very tedious. Moreover, with the inclusion of CUDA support and FP32 precision, reproducing the exact same results across platforms became infeasible due to minor numerical variations in the underlying computations, which are magnified by self-learning (e.g. the exact same command is likely to produce slightly different outputs on CPU and GPU). While the effect on the final results is negligible (the observed variations are around 0.1-0.2 accuracy points), exact replication across platforms is no longer possible.
130 | 
131 | Instead of that, we now provide an [easy interface to run all the systems proposed in our different papers](#reproducing-results). We think that this might be even more useful than the previous approach: the most skeptical user should still be able to easily verify our results, while we also provide a simple interface to test our different systems on other datasets.
132 | 
133 | 
134 | ##### The ablation test in your ACL 2018 paper reports 0% accuracies for removing CSLS, but I am getting better results. Why is that?
135 | 
136 | After publishing the paper, we discovered a bug in the code that was causing those 0% accuracies. Now that the bug is fixed, the effect of removing CSLS is not that dramatic, although it still has a big negative impact. At the same time, the effect of removing the bidirectional dictionary induction in that same ablation test is slightly smaller.
137 | 
138 | 
139 | See also
140 | --------
141 | 
142 | VecMap is a basic building block of [Monoses](https://github.com/artetxem/monoses), our Unsupervised Statistical Machine Translation system. You can use them in combination to train your own machine translation model from monolingual corpora alone.
143 | 
144 | 
145 | Publications
146 | --------
147 | 
148 | If you use this software for academic research, please cite the relevant paper(s) as follows (in case of doubt, please cite the ACL 2018 paper, or the AAAI 2018 paper if you use the supervised mode):
149 | ```
150 | @inproceedings{artetxe2018acl,
151 |   author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
152 |   title = {A robust self-learning method for fully unsupervised cross-lingual mappings of word embeddings},
153 |   booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
154 |   year = {2018},
155 |   pages = {789--798}
156 | }
157 | 
158 | @inproceedings{artetxe2018aaai,
159 |   author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
160 |   title = {Generalizing and improving bilingual word embedding mappings with a multi-step framework of linear transformations},
161 |   booktitle = {Proceedings of the Thirty-Second AAAI Conference on Artificial Intelligence},
162 |   year = {2018},
163 |   pages = {5012--5019}
164 | }
165 | 
166 | @inproceedings{artetxe2017acl,
167 |   author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
168 |   title = {Learning bilingual word embeddings with (almost) no bilingual data},
169 |   booktitle = {Proceedings of the 55th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
170 |   year = {2017},
171 |   pages = {451--462}
172 | }
173 | 
174 | @inproceedings{artetxe2016emnlp,
175 |   author = {Artetxe, Mikel and Labaka, Gorka and Agirre, Eneko},
176 |   title = {Learning principled bilingual mappings of word embeddings while preserving monolingual invariance},
177 |   booktitle = {Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing},
178 |   year = {2016},
179 |   pages = {2289--2294}
180 | }
181 | ```
182 | 
183 | 
184 | License
185 | -------
186 | 
187 | Copyright (C) 2016-2018, Mikel Artetxe
188 | 
189 | Licensed under the terms of the GNU General Public License, either version 3 or (at your option) any later version. A full copy of the license can be found in LICENSE.txt.
--------------------------------------------------------------------------------
/metrics/utils/vecmap/cupy_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2018 Mikel Artetxe
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | import numpy
17 | 
18 | try:
19 |     import cupy
20 | except ImportError:
21 |     cupy = None
22 | 
23 | 
24 | def supports_cupy():
25 |     return cupy is not None
26 | 
27 | 
28 | def get_cupy():
29 |     return cupy
30 | 
31 | 
32 | def get_array_module(x):
33 |     if cupy is not None:
34 |         return cupy.get_array_module(x)
35 |     else:
36 |         return numpy
37 | 
38 | 
39 | def asnumpy(x):
40 |     if cupy is not None:
41 |         return cupy.asnumpy(x)
42 |     else:
43 |         return numpy.asarray(x)
--------------------------------------------------------------------------------
/metrics/utils/vecmap/embeddings.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2016-2018 Mikel Artetxe
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | from .cupy_utils import *
17 | 
18 | import numpy as np
19 | 
20 | 
21 | def read(file, threshold=0, vocabulary=None, dtype='float'):
22 |     header = file.readline().split(' ')
23 |     count = int(header[0]) if threshold <= 0 else min(threshold, int(header[0]))
24 |     dim = int(header[1])
25 |     words = []
26 |     matrix = np.empty((count, dim), dtype=dtype) if vocabulary is None else []
27 |     for i in range(count):
28 |         word, vec = file.readline().split(' ', 1)
29 |         if vocabulary is None:
30 |             words.append(word)
31 |             matrix[i] = np.fromstring(vec, sep=' ', dtype=dtype)
32 |         elif word in vocabulary:
33 |             words.append(word)
34 |             matrix.append(np.fromstring(vec, sep=' ', dtype=dtype))
35 |     return (words, matrix) if vocabulary is None else (words, np.array(matrix, dtype=dtype))
36 | 
37 | 
38 | def write(words, matrix, file):
39 |     m = asnumpy(matrix)
40 |     print('%d %d' % m.shape, file=file)
41 |     for i in range(len(words)):
42 |         print(words[i] + ' ' + ' '.join(['%.6g' % x for x in m[i]]), file=file)
43 | 
44 | 
45 | def length_normalize(matrix):
46 |     xp = get_array_module(matrix)
47 |     norms = xp.sqrt(xp.sum(matrix**2, axis=1))
48 |     norms[norms == 0] = 1
49 |     matrix /= norms[:, xp.newaxis]
50 | 
51 | 
52 | def mean_center(matrix):
53 |     xp = get_array_module(matrix)
54 |     avg = xp.mean(matrix, axis=0)
55 |     matrix -= avg
56 | 
57 | 
58 | def length_normalize_dimensionwise(matrix):
59 |     xp = get_array_module(matrix)
60 |     norms = xp.sqrt(xp.sum(matrix**2, axis=0))
61 |     norms[norms == 0] = 1
62 |     matrix /= norms
63 | 
64 | 
65 | def mean_center_embeddingwise(matrix):
66 |     xp = get_array_module(matrix)
67 |     avg = xp.mean(matrix, axis=1)
68 |     matrix -= avg[:, xp.newaxis]
69 | 
70 | 
71 | def normalize(matrix, actions):
72 |     for action in actions:
73 |         if action == 'unit':
74 |             length_normalize(matrix)
75 |         elif action == 'center':
76 |             mean_center(matrix)
77 |         elif action == 'unitdim':
78 |             length_normalize_dimensionwise(matrix)
79 |         elif action == 'centeremb':
80 |             mean_center_embeddingwise(matrix)
--------------------------------------------------------------------------------
/metrics/utils/vecmap/map_embeddings.py:
--------------------------------------------------------------------------------
1 | # Copyright (C) 2016-2018 Mikel Artetxe
2 | #
3 | # This program is free software: you can redistribute it and/or modify
4 | # it under the terms of the GNU General Public License as published by
5 | # the Free Software Foundation, either version 3 of the License, or
6 | # (at your option) any later version.
7 | #
8 | # This program is distributed in the hope that it will be useful,
9 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
10 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 | # GNU General Public License for more details.
12 | #
13 | # You should have received a copy of the GNU General Public License
14 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
15 | 
16 | from . import embeddings
17 | from .cupy_utils import *
18 | from torch import zeros, from_numpy
19 | from collections import defaultdict
20 | 
21 | import argparse
22 | import collections
23 | import numpy as np
24 | import re
25 | import sys
26 | import time
27 | 
28 | 
29 | def dropout(m, p):
30 |     if p <= 0.0:
31 |         return m
32 |     else:
33 |         xp = get_array_module(m)
34 |         mask = xp.random.rand(*m.shape) >= p
35 |         return m*mask
36 | 
37 | 
38 | def topk_mean(m, k, inplace=False):  # TODO Assuming that axis is 1
39 |     xp = get_array_module(m)
40 |     n = m.shape[0]
41 |     ans = xp.zeros(n, dtype=m.dtype)
42 |     if k <= 0:
43 |         return ans
44 |     if not inplace:
45 |         m = xp.array(m)
46 |     ind0 = xp.arange(n)
47 |     ind1 = xp.empty(n, dtype=int)
48 |     minimum = m.min()
49 |     for i in range(k):
50 |         m.argmax(axis=1, out=ind1)
51 |         ans += m[ind0, ind1]
52 |         m[ind0, ind1] = minimum
53 |     return ans / k
54 | 
55 | 
56 | def vecmap(cmd_args=None):
57 |     # Parse command line arguments
58 |     parser = argparse.ArgumentParser(description='Map word embeddings in two languages into a shared space')
59 |     parser.add_argument('src_input', help='the input source embeddings')
60 |     parser.add_argument('trg_input', help='the input target embeddings')
61 |     parser.add_argument('--encoding', default='utf-8', help='the character encoding for input/output (defaults to utf-8)')
62 |     parser.add_argument('--precision', choices=['fp16', 'fp32', 'fp64'], default='fp32', help='the floating-point precision (defaults to fp32)')
63 |     parser.add_argument('--cuda', action='store_true', help='use cuda (requires cupy)')
64 |     parser.add_argument('--batch_size', default=10000, type=int, help='batch size (defaults to 10000); does not affect results, larger is usually faster but uses more memory')
65 |     parser.add_argument('--seed', type=int, default=0, help='the random seed (defaults to 0)')
66 | 
67 |     recommended_group = parser.add_argument_group('recommended settings', 'Recommended settings for different scenarios')
68 |     recommended_type = recommended_group.add_mutually_exclusive_group()
69 |     recommended_type.add_argument('--supervised', metavar='DICTIONARY', help='recommended if you have a large training dictionary')
70 |     recommended_type.add_argument('--semi_supervised', metavar='DICTIONARY', help='recommended if you have a small seed dictionary')
71 |     recommended_type.add_argument('--identical', action='store_true', help='recommended if you have no seed dictionary but can rely on identical words')
72 |     recommended_type.add_argument('--unsupervised', action='store_true', help='recommended if you have no seed dictionary and do not want to rely on identical words')
73 |     recommended_type.add_argument('--acl2018', action='store_true', help='reproduce our ACL 2018 system')
74 | 
recommended_type.add_argument('--aaai2018', metavar='DICTIONARY', help='reproduce our AAAI 2018 system') 75 | recommended_type.add_argument('--acl2017', action='store_true', help='reproduce our ACL 2017 system with numeral initialization') 76 | recommended_type.add_argument('--acl2017_seed', metavar='DICTIONARY', help='reproduce our ACL 2017 system with a seed dictionary') 77 | recommended_type.add_argument('--emnlp2016', metavar='DICTIONARY', help='reproduce our EMNLP 2016 system') 78 | 79 | init_group = parser.add_argument_group('advanced initialization arguments', 'Advanced initialization arguments') 80 | init_type = init_group.add_mutually_exclusive_group() 81 | init_type.add_argument('-d', '--init_dictionary', default=sys.stdin.fileno(), metavar='DICTIONARY', help='the training dictionary file (defaults to stdin)') 82 | init_type.add_argument('--init_identical', action='store_true', help='use identical words as the seed dictionary') 83 | init_type.add_argument('--init_numerals', action='store_true', help='use latin numerals (i.e. words matching [0-9]+) as the seed dictionary') 84 | init_type.add_argument('--init_unsupervised', action='store_true', help='use unsupervised initialization') 85 | init_group.add_argument('--unsupervised_vocab', type=int, default=0, help='restrict the vocabulary to the top k entries for unsupervised initialization') 86 | 87 | mapping_group = parser.add_argument_group('advanced mapping arguments', 'Advanced embedding mapping arguments') 88 | mapping_group.add_argument('--normalize', choices=['unit', 'center', 'unitdim', 'centeremb', 'none'], nargs='*', default=[], help='the normalization actions to perform in order') 89 | mapping_group.add_argument('--whiten', action='store_true', help='whiten the embeddings') 90 | mapping_group.add_argument('--src_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the source language embeddings') 91 | mapping_group.add_argument('--trg_reweight', type=float, default=0, nargs='?', const=1, help='re-weight the target language embeddings') 92 | mapping_group.add_argument('--src_dewhiten', choices=['src', 'trg'], help='de-whiten the source language embeddings') 93 | mapping_group.add_argument('--trg_dewhiten', choices=['src', 'trg'], help='de-whiten the target language embeddings') 94 | mapping_group.add_argument('--dim_reduction', type=int, default=0, help='apply dimensionality reduction') 95 | mapping_type = mapping_group.add_mutually_exclusive_group() 96 | mapping_type.add_argument('-c', '--orthogonal', action='store_true', help='use orthogonal constrained mapping') 97 | mapping_type.add_argument('-u', '--unconstrained', action='store_true', help='use unconstrained mapping') 98 | 99 | self_learning_group = parser.add_argument_group('advanced self-learning arguments', 'Advanced arguments for self-learning') 100 | self_learning_group.add_argument('--self_learning', action='store_true', help='enable self-learning') 101 | self_learning_group.add_argument('--vocabulary_cutoff', type=int, default=0, help='restrict the vocabulary to the top k entries') 102 | self_learning_group.add_argument('--direction', choices=['forward', 'backward', 'union'], default='union', help='the direction for dictionary induction (defaults to union)') 103 | self_learning_group.add_argument('--csls', type=int, nargs='?', default=0, const=10, metavar='NEIGHBORHOOD_SIZE', dest='csls_neighborhood', help='use CSLS for dictionary induction') 104 | self_learning_group.add_argument('--threshold', default=0.000001, type=float, help='the 
convergence threshold (defaults to 0.000001)') 105 | self_learning_group.add_argument('--validation', default=None, metavar='DICTIONARY', help='a dictionary file for validation at each iteration') 106 | self_learning_group.add_argument('--stochastic_initial', default=0.1, type=float, help='initial keep probability stochastic dictionary induction (defaults to 0.1)') 107 | self_learning_group.add_argument('--stochastic_multiplier', default=2.0, type=float, help='stochastic dictionary induction multiplier (defaults to 2.0)') 108 | self_learning_group.add_argument('--stochastic_interval', default=50, type=int, help='stochastic dictionary induction interval (defaults to 50)') 109 | self_learning_group.add_argument('--log', help='write to a log file in tsv format at each iteration') 110 | self_learning_group.add_argument('-v', '--verbose', action='store_true', help='write log information to stderr at each iteration') 111 | args = parser.parse_args(cmd_args) 112 | 113 | if args.supervised is not None: 114 | parser.set_defaults(init_dictionary=args.supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) 115 | if args.semi_supervised is not None: 116 | parser.set_defaults(init_dictionary=args.semi_supervised, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) 117 | if args.identical: 118 | parser.set_defaults(init_identical=True, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) 119 | if args.unsupervised or args.acl2018: 120 | parser.set_defaults(init_unsupervised=True, unsupervised_vocab=4000, normalize=['unit', 'center', 'unit'], whiten=True, src_reweight=0.5, trg_reweight=0.5, src_dewhiten='src', trg_dewhiten='trg', self_learning=True, vocabulary_cutoff=20000, csls_neighborhood=10) 121 | if args.aaai2018: 122 | parser.set_defaults(init_dictionary=args.aaai2018, normalize=['unit', 'center'], whiten=True, trg_reweight=1, src_dewhiten='src', trg_dewhiten='trg', batch_size=1000) 123 | if args.acl2017: 124 | parser.set_defaults(init_numerals=True, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) 125 | if args.acl2017_seed: 126 | parser.set_defaults(init_dictionary=args.acl2017_seed, orthogonal=True, normalize=['unit', 'center'], self_learning=True, direction='forward', stochastic_initial=1.0, stochastic_interval=1, batch_size=1000) 127 | if args.emnlp2016: 128 | parser.set_defaults(init_dictionary=args.emnlp2016, orthogonal=True, normalize=['unit', 'center'], batch_size=1000) 129 | args = parser.parse_args(cmd_args) 130 | 131 | # Check command line arguments 132 | if (args.src_dewhiten is not None or args.trg_dewhiten is not None) and not args.whiten: 133 | print('ERROR: De-whitening requires whitening first', file=sys.stderr) 134 | sys.exit(-1) 135 | 136 | # Choose the right dtype for the desired precision 137 | if args.precision == 'fp16': 138 | dtype = 'float16' 139 | elif args.precision == 'fp32': 140 | dtype = 'float32' 141 | elif args.precision == 'fp64': 142 | dtype = 'float64' 143 | 144 | # Read input embeddings 145 | srcfile = open(args.src_input, encoding=args.encoding, errors='surrogateescape') 146 | trgfile 
= open(args.trg_input, encoding=args.encoding, errors='surrogateescape') 147 | src_words, x = embeddings.read(srcfile, dtype=dtype) 148 | trg_words, z = embeddings.read(trgfile, dtype=dtype) 149 | 150 | # NumPy/CuPy management 151 | if args.cuda: 152 | if not supports_cupy(): 153 | print('ERROR: Install CuPy for CUDA support', file=sys.stderr) 154 | sys.exit(-1) 155 | xp = get_cupy() 156 | x = xp.asarray(x) 157 | z = xp.asarray(z) 158 | else: 159 | xp = np 160 | xp.random.seed(args.seed) 161 | 162 | # Build word to index map 163 | src_word2ind = {word: i for i, word in enumerate(src_words)} 164 | trg_word2ind = {word: i for i, word in enumerate(trg_words)} 165 | 166 | # STEP 0: Normalization 167 | embeddings.normalize(x, args.normalize) 168 | embeddings.normalize(z, args.normalize) 169 | 170 | # Build the seed dictionary 171 | src_indices = [] 172 | trg_indices = [] 173 | if args.init_unsupervised: 174 | sim_size = min(x.shape[0], z.shape[0]) if args.unsupervised_vocab <= 0 else min(x.shape[0], z.shape[0], args.unsupervised_vocab) 175 | u, s, vt = xp.linalg.svd(x[:sim_size], full_matrices=False) 176 | xsim = (u*s).dot(u.T) 177 | u, s, vt = xp.linalg.svd(z[:sim_size], full_matrices=False) 178 | zsim = (u*s).dot(u.T) 179 | del u, s, vt 180 | xsim.sort(axis=1) 181 | zsim.sort(axis=1) 182 | embeddings.normalize(xsim, args.normalize) 183 | embeddings.normalize(zsim, args.normalize) 184 | sim = xsim.dot(zsim.T) 185 | if args.csls_neighborhood > 0: 186 | knn_sim_fwd = topk_mean(sim, k=args.csls_neighborhood) 187 | knn_sim_bwd = topk_mean(sim.T, k=args.csls_neighborhood) 188 | sim -= knn_sim_fwd[:, xp.newaxis]/2 + knn_sim_bwd/2 189 | if args.direction == 'forward': 190 | src_indices = xp.arange(sim_size) 191 | trg_indices = sim.argmax(axis=1) 192 | elif args.direction == 'backward': 193 | src_indices = sim.argmax(axis=0) 194 | trg_indices = xp.arange(sim_size) 195 | elif args.direction == 'union': 196 | src_indices = xp.concatenate((xp.arange(sim_size), sim.argmax(axis=0))) 197 | trg_indices = xp.concatenate((sim.argmax(axis=1), xp.arange(sim_size))) 198 | del xsim, zsim, sim 199 | elif args.init_numerals: 200 | numeral_regex = re.compile('^[0-9]+$') 201 | src_numerals = {word for word in src_words if numeral_regex.match(word) is not None} 202 | trg_numerals = {word for word in trg_words if numeral_regex.match(word) is not None} 203 | numerals = src_numerals.intersection(trg_numerals) 204 | for word in numerals: 205 | src_indices.append(src_word2ind[word]) 206 | trg_indices.append(trg_word2ind[word]) 207 | elif args.init_identical: 208 | identical = set(src_words).intersection(set(trg_words)) 209 | for word in identical: 210 | src_indices.append(src_word2ind[word]) 211 | trg_indices.append(trg_word2ind[word]) 212 | else: 213 | f = open(args.init_dictionary, encoding=args.encoding, errors='surrogateescape') 214 | for line in f: 215 | src, trg = line.split() 216 | try: 217 | src_ind = src_word2ind[src] 218 | trg_ind = trg_word2ind[trg] 219 | src_indices.append(src_ind) 220 | trg_indices.append(trg_ind) 221 | except KeyError: 222 | print('WARNING: OOV dictionary entry ({0} - {1})'.format(src, trg), file=sys.stderr) 223 | 224 | # Read validation dictionary 225 | if args.validation is not None: 226 | f = open(args.validation, encoding=args.encoding, errors='surrogateescape') 227 | validation = collections.defaultdict(set) 228 | oov = set() 229 | vocab = set() 230 | for line in f: 231 | src, trg = line.split() 232 | try: 233 | src_ind = src_word2ind[src] 234 | trg_ind = trg_word2ind[trg] 235 | 
validation[src_ind].add(trg_ind) 236 | vocab.add(src) 237 | except KeyError: 238 | oov.add(src) 239 | oov -= vocab # If one of the translation options is in the vocabulary, then the entry is not an oov 240 | validation_coverage = len(validation) / (len(validation) + len(oov)) 241 | 242 | # Create log file 243 | if args.log: 244 | log = open(args.log, mode='w', encoding=args.encoding, errors='surrogateescape') 245 | 246 | # Allocate memory 247 | xw = xp.empty_like(x) 248 | zw = xp.empty_like(z) 249 | src_size = x.shape[0] if args.vocabulary_cutoff <= 0 else min(x.shape[0], args.vocabulary_cutoff) 250 | trg_size = z.shape[0] if args.vocabulary_cutoff <= 0 else min(z.shape[0], args.vocabulary_cutoff) 251 | simfwd = xp.empty((args.batch_size, trg_size), dtype=dtype) 252 | simbwd = xp.empty((args.batch_size, src_size), dtype=dtype) 253 | if args.validation is not None: 254 | simval = xp.empty((len(validation.keys()), z.shape[0]), dtype=dtype) 255 | 256 | best_sim_forward = xp.full(src_size, -100, dtype=dtype) 257 | src_indices_forward = xp.arange(src_size) 258 | trg_indices_forward = xp.zeros(src_size, dtype=int) 259 | best_sim_backward = xp.full(trg_size, -100, dtype=dtype) 260 | src_indices_backward = xp.zeros(trg_size, dtype=int) 261 | trg_indices_backward = xp.arange(trg_size) 262 | knn_sim_fwd = xp.zeros(src_size, dtype=dtype) 263 | knn_sim_bwd = xp.zeros(trg_size, dtype=dtype) 264 | 265 | # Training loop 266 | best_objective = objective = -100. 267 | it = 1 268 | last_improvement = 0 269 | keep_prob = args.stochastic_initial 270 | t = time.time() 271 | end = not args.self_learning 272 | while True: 273 | 274 | # Increase the keep probability if we have not improve in args.stochastic_interval iterations 275 | if it - last_improvement > args.stochastic_interval: 276 | if keep_prob >= 1.0: 277 | end = True 278 | keep_prob = min(1.0, args.stochastic_multiplier*keep_prob) 279 | last_improvement = it 280 | 281 | # Update the embedding mapping 282 | if args.orthogonal or not end: # orthogonal mapping 283 | u, s, vt = xp.linalg.svd(z[trg_indices].T.dot(x[src_indices])) 284 | w = vt.T.dot(u.T) 285 | x.dot(w, out=xw) 286 | zw[:] = z 287 | elif args.unconstrained: # unconstrained mapping 288 | x_pseudoinv = xp.linalg.inv(x[src_indices].T.dot(x[src_indices])).dot(x[src_indices].T) 289 | w = x_pseudoinv.dot(z[trg_indices]) 290 | x.dot(w, out=xw) 291 | zw[:] = z 292 | else: # advanced mapping 293 | 294 | # TODO xw.dot(wx2, out=xw) and alike not working 295 | xw[:] = x 296 | zw[:] = z 297 | 298 | # STEP 1: Whitening 299 | def whitening_transformation(m): 300 | u, s, vt = xp.linalg.svd(m, full_matrices=False) 301 | return vt.T.dot(xp.diag(1/s)).dot(vt) 302 | if args.whiten: 303 | wx1 = whitening_transformation(xw[src_indices]) 304 | wz1 = whitening_transformation(zw[trg_indices]) 305 | xw = xw.dot(wx1) 306 | zw = zw.dot(wz1) 307 | 308 | # STEP 2: Orthogonal mapping 309 | wx2, s, wz2_t = xp.linalg.svd(xw[src_indices].T.dot(zw[trg_indices])) 310 | wz2 = wz2_t.T 311 | xw = xw.dot(wx2) 312 | zw = zw.dot(wz2) 313 | 314 | # STEP 3: Re-weighting 315 | xw *= s**args.src_reweight 316 | zw *= s**args.trg_reweight 317 | 318 | # STEP 4: De-whitening 319 | if args.src_dewhiten == 'src': 320 | xw = xw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) 321 | elif args.src_dewhiten == 'trg': 322 | xw = xw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) 323 | if args.trg_dewhiten == 'src': 324 | zw = zw.dot(wx2.T.dot(xp.linalg.inv(wx1)).dot(wx2)) 325 | elif args.trg_dewhiten == 'trg': 326 | zw = 
zw.dot(wz2.T.dot(xp.linalg.inv(wz1)).dot(wz2)) 327 | 328 | # STEP 5: Dimensionality reduction 329 | if args.dim_reduction > 0: 330 | xw = xw[:, :args.dim_reduction] 331 | zw = zw[:, :args.dim_reduction] 332 | 333 | # Self-learning 334 | if end: 335 | break 336 | else: 337 | # Update the training dictionary 338 | if args.direction in ('forward', 'union'): 339 | if args.csls_neighborhood > 0: 340 | for i in range(0, trg_size, simbwd.shape[0]): 341 | j = min(i + simbwd.shape[0], trg_size) 342 | zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) 343 | knn_sim_bwd[i:j] = topk_mean(simbwd[:j-i], k=args.csls_neighborhood, inplace=True) 344 | for i in range(0, src_size, simfwd.shape[0]): 345 | j = min(i + simfwd.shape[0], src_size) 346 | xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) 347 | simfwd[:j-i].max(axis=1, out=best_sim_forward[i:j]) 348 | simfwd[:j-i] -= knn_sim_bwd/2 # Equivalent to the real CSLS scores for NN 349 | dropout(simfwd[:j-i], 1 - keep_prob).argmax(axis=1, out=trg_indices_forward[i:j]) 350 | if args.direction in ('backward', 'union'): 351 | if args.csls_neighborhood > 0: 352 | for i in range(0, src_size, simfwd.shape[0]): 353 | j = min(i + simfwd.shape[0], src_size) 354 | xw[i:j].dot(zw[:trg_size].T, out=simfwd[:j-i]) 355 | knn_sim_fwd[i:j] = topk_mean(simfwd[:j-i], k=args.csls_neighborhood, inplace=True) 356 | for i in range(0, trg_size, simbwd.shape[0]): 357 | j = min(i + simbwd.shape[0], trg_size) 358 | zw[i:j].dot(xw[:src_size].T, out=simbwd[:j-i]) 359 | simbwd[:j-i].max(axis=1, out=best_sim_backward[i:j]) 360 | simbwd[:j-i] -= knn_sim_fwd/2 # Equivalent to the real CSLS scores for NN 361 | dropout(simbwd[:j-i], 1 - keep_prob).argmax(axis=1, out=src_indices_backward[i:j]) 362 | if args.direction == 'forward': 363 | src_indices = src_indices_forward 364 | trg_indices = trg_indices_forward 365 | elif args.direction == 'backward': 366 | src_indices = src_indices_backward 367 | trg_indices = trg_indices_backward 368 | elif args.direction == 'union': 369 | src_indices = xp.concatenate((src_indices_forward, src_indices_backward)) 370 | trg_indices = xp.concatenate((trg_indices_forward, trg_indices_backward)) 371 | 372 | # Objective function evaluation 373 | if args.direction == 'forward': 374 | objective = xp.mean(best_sim_forward).tolist() 375 | elif args.direction == 'backward': 376 | objective = xp.mean(best_sim_backward).tolist() 377 | elif args.direction == 'union': 378 | objective = (xp.mean(best_sim_forward) + xp.mean(best_sim_backward)).tolist() / 2 379 | if objective - best_objective >= args.threshold: 380 | last_improvement = it 381 | best_objective = objective 382 | 383 | # Accuracy and similarity evaluation in validation 384 | if args.validation is not None: 385 | src = list(validation.keys()) 386 | xw[src].dot(zw.T, out=simval) 387 | nn = asnumpy(simval.argmax(axis=1)) 388 | accuracy = np.mean([1 if nn[i] in validation[src[i]] else 0 for i in range(len(src))]) 389 | similarity = np.mean([max([simval[i, j].tolist() for j in validation[src[i]]]) for i in range(len(src))]) 390 | 391 | # Logging 392 | duration = time.time() - t 393 | if args.verbose: 394 | print(file=sys.stderr) 395 | print('ITERATION {0} ({1:.2f}s)'.format(it, duration), file=sys.stderr) 396 | print('\t- Objective: {0:9.4f}%'.format(100 * objective), file=sys.stderr) 397 | print('\t- Drop probability: {0:9.4f}%'.format(100 - 100*keep_prob), file=sys.stderr) 398 | if args.validation is not None: 399 | print('\t- Val. similarity: {0:9.4f}%'.format(100 * similarity), file=sys.stderr) 400 | print('\t- Val. 
accuracy: {0:9.4f}%'.format(100 * accuracy), file=sys.stderr) 401 | print('\t- Val. coverage: {0:9.4f}%'.format(100 * validation_coverage), file=sys.stderr) 402 | sys.stderr.flush() 403 | if args.log is not None: 404 | val = '{0:.6f}\t{1:.6f}\t{2:.6f}'.format( 405 | 100 * similarity, 100 * accuracy, 100 * validation_coverage) if args.validation is not None else '' 406 | print('{0}\t{1:.6f}\t{2}\t{3:.6f}'.format(it, 100 * objective, val, duration), file=log) 407 | log.flush() 408 | 409 | t = time.time() 410 | it += 1 411 | 412 | src_dict, tgt_dict = defaultdict(lambda: zeros(300)), defaultdict(lambda: zeros(300)) 413 | src_dict.update(zip(src_words, from_numpy(asnumpy(xw)))) 414 | tgt_dict.update(zip(trg_words, from_numpy(asnumpy(zw)))) 415 | return src_dict, tgt_dict 416 | -------------------------------------------------------------------------------- /metrics/utils/wmd.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from torch.nn.functional import cosine_similarity 4 | import string 5 | from pyemd import emd 6 | 7 | def pairwise_distances(x, y=None): 8 | x_norm = (x**2).sum(1).view(-1, 1) 9 | y_norm = (y**2).sum(1).view(1, -1) 10 | y_t = torch.transpose(y, 0, 1) 11 | dist = x_norm + y_norm - 2.0 * torch.mm(x, y_t) 12 | return torch.clamp(dist, 0.0, np.inf) 13 | 14 | def slide_window(input_, w=3, o=1): 15 | if input_.size - w + 1 <= 0: 16 | w = input_.size 17 | sh = (input_.size - w + 1, w) 18 | st = input_.strides * 2 19 | view = np.lib.stride_tricks.as_strided(input_, strides = st, shape = sh)[0::o] 20 | return view.copy().tolist() 21 | 22 | def _safe_divide(numerator, denominator): 23 | return numerator / (denominator + 1e-30) 24 | 25 | def load_ngram(tokens, embedding, idf, n_gram, suffix_filter=True): 26 | new_a = [] 27 | new_idf = [] 28 | ids = [k for k, w in enumerate(tokens) if not suffix_filter or (w not in set(string.punctuation) and '##' not in w)] 29 | 30 | slide_wins = slide_window(np.array(ids), w=n_gram) 31 | for slide_win in slide_wins: 32 | new_idf.append(idf[slide_win].sum().item()) 33 | scale = _safe_divide(idf[slide_win], idf[slide_win].sum(0)).unsqueeze(-1) 34 | tmp = (scale * embedding[slide_win]).sum(0) 35 | new_a.append(tmp) 36 | new_a = torch.stack(new_a, 0) 37 | return new_a, new_idf 38 | 39 | def compute_score(src_embedding_ngrams, src_idf_ngrams, tgt_embedding_ngrams, tgt_idf_ngrams, use_cosine=False): 40 | embeddings = torch.cat([src_embedding_ngrams, tgt_embedding_ngrams], 0) 41 | embeddings.div_(torch.norm(embeddings, dim=-1).unsqueeze(-1) + 1e-30) 42 | if use_cosine: 43 | distance_matrix = 1 - cosine_similarity(embeddings[:, None, :], embeddings[None, :, :], dim=2) 44 | else: 45 | distance_matrix = pairwise_distances(embeddings, embeddings) 46 | 47 | c1 = np.zeros(len(src_idf_ngrams) + len(tgt_idf_ngrams)) 48 | c2 = np.zeros_like(c1) 49 | 50 | c1[:len(src_idf_ngrams)] = src_idf_ngrams 51 | c2[-len(tgt_idf_ngrams):] = tgt_idf_ngrams 52 | 53 | return -emd(_safe_divide(c1, np.sum(c1)), _safe_divide(c2, np.sum(c2)), distance_matrix.double().numpy()) 54 | 55 | def word_mover_align(source_data, target_data, n_gram, candidates=None, use_cosine=False, suffix_filter=True): 56 | src_embedding_ngrams, src_idf_ngrams = list(), list() 57 | for embedding, idf, tokens in zip(*source_data): 58 | embedding_ngrams, idf_ngrams = load_ngram(tokens, embedding, idf, n_gram, suffix_filter) 59 | src_embedding_ngrams.append(embedding_ngrams) 60 | src_idf_ngrams.append(idf_ngrams) 61 | 62 | 
tgt_embedding_ngrams, tgt_idf_ngrams = list(), list() 63 | for embedding, idf, tokens in zip(*target_data): 64 | embedding_ngrams, idf_ngrams = load_ngram(tokens, embedding, idf, n_gram, suffix_filter) 65 | tgt_embedding_ngrams.append(embedding_ngrams) 66 | tgt_idf_ngrams.append(idf_ngrams) 67 | 68 | pairs, scores = list(), list() 69 | for src_index in range(len(src_embedding_ngrams)): 70 | best_score = float("-inf") 71 | best_tgt_index = -1 72 | # use only the nearest neighbors, when they are provided 73 | for tgt_index in range(len(tgt_embedding_ngrams)) if candidates is None else candidates[src_index]: 74 | batch_src_embedding_ngrams = src_embedding_ngrams[src_index] 75 | batch_src_idf_ngrams = src_idf_ngrams[src_index] 76 | batch_tgt_embedding_ngrams = tgt_embedding_ngrams[tgt_index] 77 | batch_tgt_idf_ngrams = tgt_idf_ngrams[tgt_index] 78 | score = compute_score(batch_src_embedding_ngrams, batch_src_idf_ngrams, 79 | batch_tgt_embedding_ngrams, batch_tgt_idf_ngrams, use_cosine) 80 | if score > best_score: 81 | best_score = score 82 | best_tgt_index = tgt_index 83 | 84 | pairs.append((src_index, best_tgt_index)) 85 | scores.append(best_score) 86 | 87 | return pairs, scores 88 | 89 | def word_mover_score(source_data, target_data, n_gram, use_cosine=False, suffix_filter=True): 90 | src_embedding_ngrams, src_idf_ngrams = list(), list() 91 | for embedding, idf, tokens in zip(*source_data): 92 | embedding_ngrams, idf_ngrams = load_ngram(tokens, embedding, idf, n_gram, suffix_filter) 93 | src_embedding_ngrams.append(embedding_ngrams) 94 | src_idf_ngrams.append(idf_ngrams) 95 | 96 | tgt_embedding_ngrams, tgt_idf_ngrams = list(), list() 97 | for embedding, idf, tokens in zip(*target_data): 98 | embedding_ngrams, idf_ngrams = load_ngram(tokens, embedding, idf, n_gram, suffix_filter) 99 | tgt_embedding_ngrams.append(embedding_ngrams) 100 | tgt_idf_ngrams.append(idf_ngrams) 101 | 102 | scores = list() 103 | for data in zip(src_embedding_ngrams, src_idf_ngrams, tgt_embedding_ngrams, tgt_idf_ngrams): 104 | scores.append(compute_score(*data, use_cosine)) 105 | 106 | return scores 107 | -------------------------------------------------------------------------------- /metrics/vecmapscore.py: -------------------------------------------------------------------------------- 1 | from .utils.embed import vecmap_embed, map_multilingual_embeddings 2 | from .utils.knn import ratio_margin_align 3 | from torch.nn.functional import cosine_similarity 4 | from .common import CommonScore 5 | from torch.cuda import is_available as cuda_is_available 6 | import logging, torch 7 | 8 | class VecMapScore(CommonScore): 9 | def __init__( 10 | self, 11 | device="cuda" if cuda_is_available() else "cpu", 12 | src_lang="en", 13 | tgt_lang="de", 14 | batch_size=5000, 15 | knn_batch_size = 1000000, 16 | k = 5 17 | ): 18 | self.device = device 19 | self.src_lang = src_lang 20 | self.tgt_lang = tgt_lang 21 | self.batch_size = batch_size 22 | self.knn_batch_size = knn_batch_size 23 | self.k = k 24 | self.src_dict = None 25 | self.tgt_dict = None 26 | 27 | def _embed(self, source_sents, target_sents): 28 | if self.src_dict is None or self.tgt_dict is None: 29 | logging.info("Obtaining cross-lingual word embedding mappings from fasttext embeddings.") 30 | self.src_dict, self.tgt_dict = map_multilingual_embeddings(self.src_lang, self.tgt_lang, 31 | self.batch_size, self.device) 32 | 33 | src_embeddings, *_, src_mask = vecmap_embed(source_sents, self.src_dict, self.src_lang) 34 | tgt_embeddings, *_, tgt_mask = vecmap_embed(target_sents, 
self.tgt_dict, self.tgt_lang) 35 | source_sent_embeddings = torch.sum(src_embeddings * src_mask, 1) / torch.sum(src_mask, 1) 36 | target_sent_embeddings = torch.sum(tgt_embeddings * tgt_mask, 1) / torch.sum(tgt_mask, 1) 37 | 38 | return source_sent_embeddings, target_sent_embeddings 39 | 40 | def align(self, source_sents, target_sents): 41 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents) 42 | indices, scores = ratio_margin_align(source_embeddings, target_embeddings, self.k, 43 | self.knn_batch_size, self.device) 44 | 45 | sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in indices] 46 | return sent_pairs, scores 47 | 48 | def score(self, source_sents, target_sents): 49 | source_embeddings, target_embeddings = self._embed(source_sents, target_sents) 50 | return cosine_similarity(source_embeddings, target_embeddings) 51 | -------------------------------------------------------------------------------- /metrics/xmoverscore/__init__.py: -------------------------------------------------------------------------------- 1 | from .embed import * 2 | from .align import * 3 | from torch.cuda import is_available as cuda_is_available 4 | 5 | class XMoverBertAlignScore(XMoverAlign, BertRemap): 6 | def __init__( 7 | self, 8 | model_name="bert-base-multilingual-cased", 9 | mapping="UMD", 10 | device="cuda" if cuda_is_available() else "cpu", 11 | do_lower_case=False, 12 | use_cosine = False, 13 | alignment = "awesome", 14 | k = 20, 15 | n_gram = 1, 16 | remap_size = 2000, 17 | embed_batch_size = 128, 18 | knn_batch_size = 1000000, 19 | align_batch_size = 5000 20 | ): 21 | logging.info("Using device \"%s\" for computations.", device) 22 | XMoverAlign.__init__(self, device, k, n_gram, knn_batch_size, use_cosine, align_batch_size) 23 | BertRemap.__init__(self, model_name, None, mapping, device, do_lower_case, remap_size, embed_batch_size, alignment) 24 | 25 | class XMoverVecMapAlignScore(XMoverAlign, VecMapEmbed): 26 | def __init__( 27 | self, 28 | device="cuda" if cuda_is_available() else "cpu", 29 | use_cosine = False, 30 | k = 20, 31 | n_gram = 1, 32 | knn_batch_size = 1000000, 33 | src_lang = "de", 34 | tgt_lang = "en", 35 | batch_size = 5000, 36 | align_batch_size = 5000 37 | ): 38 | logging.info("Using device \"%s\" for computations.", device) 39 | XMoverAlign.__init__(self, device, k, n_gram, knn_batch_size, use_cosine, align_batch_size) 40 | VecMapEmbed.__init__(self, device, src_lang, tgt_lang, batch_size) 41 | 42 | class XMoverNMTBertAlignScore(XMoverNMTAlign, BertRemap): 43 | def __init__( 44 | self, 45 | device="cuda" if cuda_is_available() else "cpu", 46 | use_cosine = False, 47 | alignment = "awesome", 48 | k = 20, 49 | n_gram = 1, 50 | knn_batch_size = 1000000, 51 | train_size = 200000, 52 | align_batch_size = 5000, 53 | mine_batch_size = 5000000, 54 | src_lang = "de", 55 | tgt_lang = "en", 56 | model_name="bert-base-multilingual-cased", 57 | monolingual_model_name=None, 58 | mt_model_name="facebook/mbart-large-cc25", 59 | mapping="UMD", 60 | do_lower_case=False, 61 | remap_size = 2000, 62 | embed_batch_size = 128, 63 | translate_batch_size = 16, 64 | nmt_weights = [0.8, 0.2], 65 | ): 66 | logging.info("Using device \"%s\" for computations.", device) 67 | XMoverNMTAlign.__init__(self, device, k, n_gram, knn_batch_size, train_size, align_batch_size, src_lang, 68 | tgt_lang, mt_model_name, translate_batch_size, nmt_weights, use_cosine, mine_batch_size) 69 | BertRemap.__init__(self, model_name, monolingual_model_name, mapping, 
device, do_lower_case, remap_size, 70 | embed_batch_size, alignment) 71 | 72 | class XMoverNMTLMBertAlignScore(XMoverNMTLMAlign, BertRemap): 73 | def __init__( 74 | self, 75 | device="cuda" if cuda_is_available() else "cpu", 76 | use_cosine = False, 77 | use_lm = False, 78 | alignment = "awesome", 79 | k = 20, 80 | n_gram = 1, 81 | knn_batch_size = 1000000, 82 | train_size = 200000, 83 | align_batch_size = 5000, 84 | mine_batch_size = 5000000, 85 | lm_weights = [1, 0.1], 86 | nmt_weights = [0.8, 0.2], 87 | src_lang = "de", 88 | tgt_lang = "en", 89 | model_name="bert-base-multilingual-cased", 90 | monolingual_model_name=None, 91 | mt_model_name="facebook/mbart-large-cc25", 92 | lm_model_name="gpt2", 93 | mapping="UMD", 94 | do_lower_case=False, 95 | remap_size = 2000, 96 | embed_batch_size = 128, 97 | translate_batch_size = 16, 98 | ): 99 | logging.info("Using device \"%s\" for computations.", device) 100 | XMoverNMTLMAlign.__init__(self, device, k, n_gram, knn_batch_size, train_size, align_batch_size, src_lang, tgt_lang, 101 | mt_model_name, translate_batch_size, nmt_weights, use_cosine, mine_batch_size, use_lm, lm_weights, lm_model_name) 102 | BertRemap.__init__(self, model_name, monolingual_model_name, mapping, device, do_lower_case, remap_size, 103 | embed_batch_size, alignment) 104 | 105 | class XMoverScore(XMoverLMAlign, BertRemapPretrained): 106 | """ 107 | The original XMoverScore implementation. Be careful, remapping matrices 108 | were trained on parallel data! Provided out of convenience to compare the 109 | performance of self-learning remapping approaches to the supervised 110 | original. 111 | """ 112 | def __init__( 113 | self, 114 | model_name="bert-base-multilingual-cased", 115 | lm_model_name="gpt2", 116 | mapping="UMD", 117 | device="cuda" if cuda_is_available() else "cpu", 118 | do_lower_case=False, 119 | use_cosine = False, 120 | use_lm = False, 121 | k = 20, 122 | n_gram = 1, 123 | embed_batch_size = 128, 124 | knn_batch_size = 1000000, 125 | align_batch_size = 5000, 126 | lm_weights = [1, 0.1] 127 | ): 128 | logging.info("Using device \"%s\" for computations.", device) 129 | XMoverLMAlign.__init__(self, device, k, n_gram, knn_batch_size, use_cosine, align_batch_size, use_lm, 130 | lm_weights, lm_model_name) 131 | BertRemapPretrained.__init__(self, model_name, None, mapping, device, do_lower_case, embed_batch_size) 132 | -------------------------------------------------------------------------------- /metrics/xmoverscore/align.py: -------------------------------------------------------------------------------- 1 | from ..utils.wmd import word_mover_align, word_mover_score 2 | from ..utils.knn import wcd_align, ratio_margin_align, cosine_align 3 | from ..utils.nmt import train, translate 4 | from ..utils.perplexity import lm_perplexity 5 | from ..utils.env import DATADIR 6 | from ..common import CommonScore 7 | from os.path import isfile, join, basename 8 | from json import dumps 9 | from math import ceil 10 | from numpy import arange, array 11 | from nltk.metrics.distance import edit_distance 12 | from datasets import load_dataset 13 | from shutil import copyfile 14 | import logging 15 | import torch 16 | 17 | class XMoverAlign(CommonScore): 18 | def __init__(self, device, k, n_gram, knn_batch_size, use_cosine, align_batch_size): 19 | self.device = device 20 | self.k = k 21 | self.n_gram = n_gram 22 | self.knn_batch_size = knn_batch_size 23 | self.use_cosine = use_cosine 24 | self.align_batch_size = align_batch_size 25 | 26 | def _mean_pool_embed(self, source_sents, 
target_sents): 27 | source_sent_embeddings, target_sent_embeddings, idx = None, None, 0 28 | while idx < max(len(source_sents), len(target_sents)): 29 | src_embeddings, _, _, src_mask, tgt_embeddings, _, _, tgt_mask = self._embed( 30 | source_sents[idx:idx + self.align_batch_size], target_sents[idx:idx + self.align_batch_size]) 31 | if source_sent_embeddings is None and target_sent_embeddings is None: 32 | source_sent_embeddings = torch.empty(len(source_sents), src_embeddings.shape[-1]) 33 | target_sent_embeddings = torch.empty(len(target_sents), tgt_embeddings.shape[-1]) 34 | source_sent_embeddings[idx:idx + len(src_embeddings)] = torch.sum(src_embeddings * src_mask, 1) / torch.sum(src_mask, 1) 35 | target_sent_embeddings[idx:idx + len(tgt_embeddings)] = torch.sum(tgt_embeddings * tgt_mask, 1) / torch.sum(tgt_mask, 1) 36 | idx += self.align_batch_size 37 | 38 | return source_sent_embeddings, target_sent_embeddings 39 | 40 | def _memory_efficient_word_mover_align(self, source_sents, target_sents, candidates): 41 | pairs, scores, idx, k = list(), list(), 0, candidates.shape[1] 42 | batch_size = ceil(self.align_batch_size / k) 43 | while idx < len(source_sents): 44 | src_embeddings, src_idf, src_tokens, _, tgt_embeddings, tgt_idf, tgt_tokens, _ = self._embed( 45 | source_sents[idx:idx + batch_size], 46 | [target_sents[candidate] for candidate in candidates[idx:idx + batch_size].flatten()]) 47 | batch_pairs, batch_scores = word_mover_align((src_embeddings, src_idf, src_tokens), 48 | (tgt_embeddings, tgt_idf, tgt_tokens), self.n_gram, 49 | arange(len(src_embeddings) * k).reshape(len(src_embeddings), k)) 50 | pairs.extend([(src + idx, candidates[idx:idx + batch_size].flatten()[tgt]) for src, tgt in batch_pairs]) 51 | scores.extend(batch_scores) 52 | idx += batch_size 53 | return pairs, scores 54 | 55 | def align(self, source_sents, target_sents): 56 | candidates = None 57 | logging.info("Obtaining sentence embeddings.") 58 | source_sent_embeddings, target_sent_embeddings = self._mean_pool_embed(source_sents, target_sents) 59 | logging.info("Searching for nearest neighbors.") 60 | if self.use_cosine: 61 | candidates, _ = cosine_align(source_sent_embeddings, target_sent_embeddings, self.k, 62 | self.knn_batch_size, self.device) 63 | else: 64 | candidates, _ = wcd_align(source_sent_embeddings, target_sent_embeddings, self.k, 65 | self.knn_batch_size, self.device) 66 | 67 | logging.info("Filtering best nearest neighbors with Word Mover's Distance.") 68 | pairs, scores = self._memory_efficient_word_mover_align(source_sents, target_sents, candidates) 69 | sent_pairs = [(source_sents[src_idx], target_sents[tgt_idx]) for src_idx, tgt_idx in pairs] 70 | return sent_pairs, scores 71 | 72 | def score(self, source_sents, target_sents, same_language=False): 73 | src_embeddings, src_idf, src_tokens, _, tgt_embeddings, tgt_idf, tgt_tokens, _ = self._embed(source_sents, 74 | target_sents, same_language) 75 | scores = word_mover_score((src_embeddings, src_idf, src_tokens), (tgt_embeddings, tgt_idf, tgt_tokens), 76 | self.n_gram) 77 | return scores 78 | 79 | class XMoverLMAlign(XMoverAlign): 80 | """ 81 | Extends the XMoverScore-based sentence aligner with an additional language model. 
82 | """ 83 | 84 | def __init__(self, device, k, n_gram, knn_batch_size, use_cosine, align_batch_size, use_lm, lm_weights, lm_model_name): 85 | super().__init__(device, k, n_gram, knn_batch_size, use_cosine, align_batch_size) 86 | self.device = device 87 | self.use_lm = use_lm 88 | self.lm_weights = lm_weights 89 | self.lm_model_name = lm_model_name 90 | 91 | #Override 92 | def score(self, source_sents, target_sents): 93 | """ 94 | Compute WMD scores and combine results with perplexity of GPT2 language 95 | model. This only makes sense when the hypotheses are in English. 96 | """ 97 | wmd_scores = super().score(source_sents, target_sents) 98 | if self.use_lm: 99 | lm_scores = lm_perplexity(target_sents, self.device, self.lm_model_name) 100 | return (self.lm_weights[0] * array(wmd_scores) + self.lm_weights[1] * array(lm_scores)).tolist() 101 | else: 102 | return wmd_scores 103 | 104 | class XMoverNMTAlign(XMoverAlign): 105 | """ 106 | Able to mine pseudo parallel data to train an NMT model, whose translations are then scored and combined with the original metric. 107 | """ 108 | 109 | def __init__(self, device, k, n_gram, knn_batch_size, train_size, align_batch_size, src_lang, tgt_lang, 110 | mt_model_name, translate_batch_size, nmt_weights, use_cosine, mine_batch_size): 111 | super().__init__(device, k, n_gram, knn_batch_size, use_cosine, align_batch_size) 112 | self.train_size = train_size 113 | self.knn_batch_size = knn_batch_size 114 | self.src_lang = src_lang 115 | self.tgt_lang = tgt_lang 116 | self.mt_model_name = mt_model_name 117 | self.translate_batch_size = translate_batch_size 118 | self.nmt_weights = nmt_weights 119 | self.mt_model = None 120 | self.mt_tokenizer = None 121 | self.use_cosine = use_cosine 122 | self.mine_batch_size = mine_batch_size 123 | self.back_translate = False 124 | 125 | #Override 126 | def score(self, source_sents, target_sents): 127 | scores = super().score(source_sents, target_sents) 128 | if self.mt_model is None or self.mt_tokenizer is None: 129 | return scores 130 | else: 131 | if self.back_translate: 132 | mt_scores = super().score(source_sents, self.translate(target_sents), True) 133 | else: 134 | mt_scores = super().score(self.translate(source_sents), target_sents, True) 135 | return [self.nmt_weights[0] * score + self.nmt_weights[1] * mt_score for score, mt_score in zip(scores, mt_scores)] 136 | 137 | def train(self, source_sents, target_sents, suffix="data", iteration=1, aligned=False, finetune=False, overwrite=True, 138 | back_translate=False, k=None): 139 | mine_file, batch, batch_size = join(DATADIR, "translation", f"mined-{suffix}.json"), 0, self.mine_batch_size 140 | pairs, scores = list(), list() 141 | self.back_translate = back_translate 142 | 143 | if self.back_translate: 144 | logging.info("Training in back-translation mode, swapping source_sents and target_sents.") 145 | source_sents, target_sents = target_sents, source_sents 146 | src_lang, tgt_lang = self.tgt_lang, self.src_lang 147 | else: 148 | src_lang, tgt_lang = self.src_lang, self.tgt_lang 149 | 150 | if (not isfile(mine_file) or overwrite) and not aligned: 151 | while batch < len(source_sents): 152 | logging.info("Obtaining sentence embeddings.") 153 | batch_src, batch_tgt = source_sents[batch:batch + batch_size], target_sents[batch:batch + batch_size] 154 | source_sent_embeddings, target_sent_embeddings = self._mean_pool_embed(batch_src, batch_tgt) 155 | if self.use_cosine: 156 | logging.info("Mining pseudo parallel data with Ratio Margin function.") 157 | batch_pairs, batch_scores = ratio_margin_align(source_sent_embeddings, 
target_sent_embeddings, 158 | self.k if k is None else k, self.knn_batch_size, self.device) 159 | else: 160 | logging.info("Mining pseudo parallel data using Word Centroid Distance.") 161 | candidates, _ = wcd_align(source_sent_embeddings, target_sent_embeddings, self.k if k is None else k, 162 | self.knn_batch_size, self.device) 163 | logging.info("Computing exact Word Mover's Distances for candidates.") 164 | batch_pairs, batch_scores = self._memory_efficient_word_mover_align(batch_src, batch_tgt, candidates) 165 | del source_sent_embeddings, target_sent_embeddings 166 | pairs.extend([(src + batch, tgt + batch) for src, tgt in batch_pairs]), scores.extend(batch_scores) 167 | batch += batch_size 168 | with open(mine_file, "wb") as f: 169 | idx = 0 170 | for _, (src, tgt) in sorted(zip(scores, pairs), key=lambda tup: tup[0], reverse=True): 171 | src_sent, tgt_sent = source_sents[src], target_sents[tgt] 172 | if edit_distance(src_sent, tgt_sent) / max(len(src_sent), len(tgt_sent)) > 0.5: 173 | line = { "translation": { src_lang: src_sent, tgt_lang: tgt_sent} } 174 | f.write(dumps(line, ensure_ascii=False).encode() + b"\n") 175 | idx += 1 176 | if idx >= self.train_size: 177 | break 178 | elif (not isfile(mine_file) or overwrite) and aligned: 179 | with open(mine_file, "wb") as f: 180 | for src_sent, tgt_sent in zip(source_sents, target_sents): 181 | line = { "translation": { src_lang: src_sent, tgt_lang: tgt_sent} } 182 | f.write(dumps(line, ensure_ascii=False).encode() + b"\n") 183 | 184 | if finetune: 185 | if self.mt_model is not None and self.mt_tokenizer is not None: 186 | self.mt_model, self.mt_tokenizer = train(self.mt_model.name_or_path, src_lang, tgt_lang, mine_file, 187 | overwrite, suffix, name=basename(self.mt_model_name)) 188 | else: 189 | raise ValueError("Wanted to finetune existing model but none was found.") 190 | elif self.mt_model is not None and self.mt_tokenizer is not None: 191 | logging.info("Training MT model with translated and pseudo parallel data.") 192 | datasets = load_dataset("json", data_files=mine_file) 193 | translation_file = join(DATADIR, "translation", f"translated-{suffix}-{iteration}.json") 194 | sents = list(set(source_sents).difference([entry["translation"][src_lang] for entry in datasets['train']])) 195 | 196 | if not isfile(translation_file) or overwrite: 197 | copyfile(mine_file, translation_file) 198 | with open(translation_file, "ab") as f: 199 | for src, tgt in zip(sents, self.translate(sents[:self.train_size])): 200 | line = { "translation": { src_lang: src, tgt_lang: tgt} } 201 | f.write(dumps(line, ensure_ascii=False).encode() + b"\n") 202 | 203 | self.mt_model, self.mt_tokenizer = train(self.mt_model_name, src_lang, tgt_lang, translation_file, 204 | overwrite, f"{suffix}-{iteration}") 205 | 206 | else: 207 | logging.info("Training MT model with pseudo parallel data.") 208 | self.mt_model, self.mt_tokenizer = train(self.mt_model_name, src_lang, tgt_lang, mine_file, 209 | overwrite, suffix) 210 | 211 | self.mt_model.to(self.device) 212 | 213 | def translate(self, sentences): 214 | logging.info(f"Translating sentences into {'source' if self.back_translate else 'target'} language.") 215 | return translate(self.mt_model, self.mt_tokenizer, sentences, self.translate_batch_size, self.device) 216 | 217 | class XMoverNMTLMAlign(XMoverNMTAlign): 218 | """ 219 | Combine NMT and LM XMoverScore extensions. 
220 | """ 221 | 222 | def __init__(self, device, k, n_gram, knn_batch_size, train_size, align_batch_size, src_lang, tgt_lang, mt_model_name, 223 | translate_batch_size, nmt_weights, use_cosine, mine_batch_size, use_lm, lm_weights, lm_model_name): 224 | super().__init__(device, k, n_gram, knn_batch_size, train_size, align_batch_size, src_lang, tgt_lang, 225 | mt_model_name, translate_batch_size, nmt_weights, use_cosine, mine_batch_size) 226 | self.device = device 227 | self.use_lm = use_lm 228 | self.lm_weights = lm_weights 229 | self.lm_model_name = lm_model_name 230 | 231 | #Override 232 | def score(self, source_sents, target_sents): 233 | """ 234 | Compute WMD scores on hypotheses and pseudo translations and combine 235 | results with perplexity of GPT2 language model. This only makes sense 236 | when the hypotheses are in English. 237 | """ 238 | nmt_scores = super().score(source_sents, target_sents) 239 | if self.use_lm: 240 | lm_scores = lm_perplexity(target_sents, self.device, self.lm_model_name) 241 | return (self.lm_weights[0] * array(nmt_scores) + self.lm_weights[1] * array(lm_scores)).tolist() 242 | else: 243 | return nmt_scores 244 | -------------------------------------------------------------------------------- /metrics/xmoverscore/embed.py: -------------------------------------------------------------------------------- 1 | from transformers import BertModel, BertTokenizer, BertConfig 2 | from ..utils.embed import bert_embed, vecmap_embed, map_multilingual_embeddings 3 | from ..utils.remap import fast_align, awesome_align, sim_align, get_aligned_features_avgbpe, clp, umd 4 | from ..utils.env import DATADIR 5 | from ..common import CommonScore 6 | from os.path import isfile, join 7 | from nltk.metrics.distance import edit_distance 8 | from numpy import load 9 | from io import BytesIO 10 | from functools import cached_property 11 | from urllib.request import urlopen 12 | from urllib.error import URLError 13 | import logging 14 | import torch 15 | 16 | class BertEmbed(CommonScore): 17 | def __init__(self, model_name, monolingual_model_name, mapping, device, do_lower_case, embed_batch_size): 18 | self.model_name = model_name 19 | self.monolingual_model_name = monolingual_model_name if monolingual_model_name else model_name 20 | self.do_lower_case = do_lower_case 21 | self.device = device 22 | self.mapping = mapping 23 | self.embed_batch_size = embed_batch_size 24 | self.projection = None 25 | 26 | @cached_property 27 | def model(self): 28 | config = BertConfig.from_pretrained(self.model_name, output_hidden_states=True) 29 | return BertModel.from_pretrained(self.model_name, config=config).to(self.device) 30 | 31 | @cached_property 32 | def monolingual_model(self): 33 | if self.monolingual_model_name != self.model_name: 34 | config = BertConfig.from_pretrained(self.monolingual_model_name, output_hidden_states=True) 35 | return BertModel.from_pretrained(self.monolingual_model_name, config=config).to(self.device) 36 | else: 37 | return self.model 38 | 39 | @cached_property 40 | def tokenizer(self): 41 | return BertTokenizer.from_pretrained(self.model_name, do_lower_case=self.do_lower_case) 42 | 43 | @cached_property 44 | def monolingual_tokenzier(self): 45 | if self.monolingual_model_name != self.model_name: 46 | return BertTokenizer.from_pretrained(self.monolingual_model_name, do_lower_case=self.do_lower_case) 47 | else: 48 | return self.tokenizer 49 | 50 | def _embed(self, source_sents, target_sents, same_language=False): 51 | model, tokenizer = (self.monolingual_model, 
self.monolingual_tokenzier) if same_language else (self.model, self.tokenizer) 52 | src_embeddings, src_idf, src_tokens, src_mask = bert_embed(source_sents, self.embed_batch_size, model, 53 | tokenizer, self.device) 54 | tgt_embeddings, tgt_idf, tgt_tokens, tgt_mask = bert_embed(target_sents, self.embed_batch_size, model, 55 | tokenizer, self.device) 56 | 57 | if self.projection is not None and not same_language: 58 | if self.mapping == 'CLP': 59 | src_embeddings = torch.matmul(src_embeddings, self.projection) 60 | else: 61 | src_embeddings = src_embeddings - (src_embeddings * self.projection).sum(2, keepdim=True) * \ 62 | self.projection.repeat(src_embeddings.shape[0], src_embeddings.shape[1], 1) 63 | 64 | return src_embeddings, src_idf, src_tokens, src_mask, tgt_embeddings, tgt_idf, tgt_tokens, tgt_mask 65 | 66 | class BertRemap(BertEmbed): 67 | def __init__(self, model_name, monolingual_model_name, mapping, device, do_lower_case, remap_size, embed_batch_size, alignment): 68 | super().__init__(model_name, monolingual_model_name, mapping, device, do_lower_case, embed_batch_size) 69 | self.remap_size = remap_size 70 | self.alignment = alignment 71 | 72 | def remap(self, source_sents, target_sents, suffix="tensor", aligned=False, overwrite=True, new_mapping=None): 73 | file_path, mapping = join(DATADIR, f"projection-{suffix}.pt"), new_mapping or self.mapping 74 | if not isfile(file_path) or overwrite: 75 | logging.info(f'Computing projection tensor for {mapping} remapping method.') 76 | sorted_sent_pairs = list() 77 | if aligned: 78 | sorted_sent_pairs.extend(zip(source_sents, target_sents)) 79 | else: 80 | sent_pairs, scores = self.align(source_sents, target_sents) 81 | for _, (src_sent, tgt_sent) in sorted(zip(scores, sent_pairs), key=lambda tup: tup[0], reverse=True): 82 | if edit_distance(src_sent, tgt_sent) / max(len(src_sent), len(tgt_sent)) > 0.5: 83 | sorted_sent_pairs.append((src_sent, tgt_sent)) 84 | if self.alignment == "fast": 85 | tokenized_pairs, align_pairs = fast_align(sorted_sent_pairs, self.tokenizer, self.remap_size) 86 | elif self.alignment == "sim": 87 | tokenized_pairs, align_pairs = sim_align(sorted_sent_pairs, self.tokenizer, self.remap_size, self.device) 88 | else: # awesome 89 | tokenized_pairs, align_pairs = awesome_align(sorted_sent_pairs, self.model, self.tokenizer, 90 | self.remap_size, self.device) 91 | if self.alignment.endswith("remap"): # awesome-remap 92 | src_matrix, tgt_matrix = get_aligned_features_avgbpe(tokenized_pairs, align_pairs, 93 | self.model, self.tokenizer, self.embed_batch_size, self.device, 8) 94 | tokenized_pairs, align_pairs = awesome_align(sorted_sent_pairs, self.model, self.tokenizer, 95 | self.remap_size, self.device, 96 | clp(src_matrix, tgt_matrix) if mapping == "CLP" else umd(src_matrix, tgt_matrix)) 97 | src_matrix, tgt_matrix = get_aligned_features_avgbpe(tokenized_pairs, align_pairs, 98 | self.model, self.tokenizer, self.embed_batch_size, self.device) 99 | 100 | logging.info(f"Using {len(src_matrix)} aligned word pairs to compute projection tensor.") 101 | if mapping == "CLP": 102 | self.projection = clp(src_matrix, tgt_matrix) 103 | else: 104 | self.projection = umd(src_matrix, tgt_matrix) 105 | torch.save(self.projection, file_path) 106 | else: 107 | logging.info(f'Loading {mapping} projection tensor from disk.') 108 | self.projection = torch.load(file_path) 109 | if new_mapping: 110 | self.mapping = new_mapping 111 | 112 | class BertRemapPretrained(BertEmbed): 113 | """ 114 | Obtains pretrained remapping matrices from original 
XMoverScore repository. 115 | """ 116 | 117 | commit = "73ef48058f8e47e0d99434b7c75a9ceb6f253d94" 118 | path = "mapping/layer-12/{}.{}-{}.2k.12.{}" 119 | url = f"https://github.com/potamides/ACL20-Reference-Free-MT-Evaluation/raw/{commit}/{path}" 120 | 121 | def remap(self, source_lang, target_lang): 122 | for corpus in ["europarl-v7", "flores-v1", "un-v1", "wikimedia-v20210402", "wikimatrix-v1", "multi-cc-aligned-v1.1"]: 123 | try: 124 | if self.mapping == "CLP": 125 | download = urlopen(self.url.format(corpus, source_lang, target_lang, "BAM")).read() 126 | self.projection = torch.tensor(load(BytesIO(download)), dtype=torch.float32) 127 | else: 128 | download = urlopen(self.url.format(corpus, source_lang, target_lang, "GBDD")).read() 129 | self.projection = torch.tensor(load(BytesIO(download))[0], dtype=torch.float32) 130 | break 131 | except URLError as e: 132 | if getattr(e, "status", None) == 404: # only HTTPErrors carry a status code 133 | pass # mapping not available for this corpus, try the next one 134 | else: 135 | raise ValueError("Language direction does not exist!") 136 | 137 | class VecMapEmbed(CommonScore): 138 | def __init__(self, device, src_lang, tgt_lang, batch_size): 139 | self.device = device 140 | self.src_lang = src_lang 141 | self.tgt_lang = tgt_lang 142 | self.batch_size = batch_size 143 | self.src_dict = None 144 | self.tgt_dict = None 145 | 146 | def _embed(self, source_sents, target_sents, same_language=False): 147 | if self.src_dict is None or self.tgt_dict is None: 148 | logging.info("Obtaining cross-lingual word embedding mappings from fasttext embeddings.") 149 | self.src_dict, self.tgt_dict = map_multilingual_embeddings(self.src_lang, self.tgt_lang, 150 | self.batch_size, self.device) 151 | src_embeddings, src_idf, src_tokens, src_mask = vecmap_embed(source_sents, 152 | *((self.tgt_dict, self.tgt_lang) if same_language else (self.src_dict, self.src_lang))) 153 | tgt_embeddings, tgt_idf, tgt_tokens, tgt_mask = vecmap_embed(target_sents, self.tgt_dict, self.tgt_lang) 154 | 155 | return src_embeddings, src_idf, src_tokens, src_mask, tgt_embeddings, tgt_idf, tgt_tokens, tgt_mask 156 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_namespace_packages 2 | 3 | setup( 4 | name="unsupervised-metrics", 5 | version="1.0", 6 | description="Self-Learning for Unsupervised Metrics", 7 | keywords=[ 8 | "Unsupervised", 9 | "Metrics", 10 | "Quality Estimation", 11 | "Machine Translation", 12 | "NLP", 13 | "Deep Learning", 14 | ], 15 | url="https://github.com/potamides/unsupervised-metrics", 16 | author="Jonas Belouadi", 17 | author_email="potamides@posteo.net", 18 | packages=find_namespace_packages(include=["metrics*"]), 19 | install_requires=[ 20 | "tqdm==4.49.0", 21 | "fasttext==0.9.2", 22 | "faiss-gpu==1.6.5", 23 | "pyemd==0.5.1", 24 | "torch==1.9.0", 25 | "sentence-transformers==1.2.0", 26 | "transformers==4.10.3", 27 | "datasets==2.0.0", 28 | "nltk>=3.4.5", 29 | "sentencepiece==0.1.96", 30 | "mosestokenizer==1.1.0", 31 | "jieba==0.42.1", 32 | "sinling==0.3.6", 33 | "Nepali_nlp @ https://github.com/potamides/Nepali_nlp/archive/d3d078ed50c8224f290d772f7b895354d0cb0266.zip", 34 | "simalign @ https://github.com/cisnlp/simalign/archive/refs/tags/v0.2.zip", 35 | "mt_metrics_eval @ https://github.com/google-research/mt-metrics-eval/archive/refs/heads/main.zip", 36 | "PuLP==2.4", 37 | "bert-score==0.3.9", 38 | "tabulate==0.8.9", 39 | "gdown==3.13.0", 40 | ], 41 | extras_require={ 42 | "vecmap-cuda": ["cupy"], 43 | "experiments": 
[ 44 | "unbabel-comet", 45 | "transquest", 46 | "sacrebleu", 47 | ], 48 | }, 49 | python_requires=">=3.9.0", 50 | zip_safe=False, 51 | ) 52 | --------------------------------------------------------------------------------
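As a closing illustration of how the classes dumped above fit together, the following is a minimal usage sketch. It is not part of the repository itself: it assumes the package was installed with `pip install .` from the repository root (see setup.py above), and the German/English sentence lists are hypothetical placeholders. Only classes and methods that actually appear in the files above (VecMapScore from metrics/vecmapscore.py, XMoverScore from metrics/xmoverscore/__init__.py) are used.

# Minimal usage sketch; the sentence lists below are hypothetical placeholders.
from metrics.vecmapscore import VecMapScore
from metrics.xmoverscore import XMoverScore

source_sents = ["Der schnelle braune Fuchs springt.", "Katzen schlafen gern."]  # hypothetical input
target_sents = ["The quick brown fox jumps.", "Cats like to sleep."]            # hypothetical input

# VecMapScore mean-pools cross-lingual fasttext word embeddings and scores
# each sentence pair with cosine similarity (metrics/vecmapscore.py).
vecmap_metric = VecMapScore(src_lang="de", tgt_lang="en")
print(vecmap_metric.score(source_sents, target_sents))

# XMoverScore is the original, weakly supervised wrapper: remap() downloads a
# pretrained projection tensor for the given language direction
# (BertRemapPretrained in metrics/xmoverscore/embed.py), after which score()
# computes Word Mover's Distances, combined with GPT-2 perplexity via
# lm_weights when use_lm=True (XMoverLMAlign in metrics/xmoverscore/align.py).
xmover_metric = XMoverScore(mapping="UMD", use_lm=True)
xmover_metric.remap("de", "en")
print(xmover_metric.score(source_sents, target_sents))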