├── tests ├── __init__.py ├── unit │ ├── __init__.py │ ├── encoders │ │ ├── __init__.py │ │ ├── test_bert.py │ │ └── test_xlmr.py │ └── test_download_utils.py ├── integration │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── test_ranking_metric.py │ │ ├── test_regression_metric.py │ │ └── test_referenceless_regression.py │ └── modules │ │ └── test_feedforward.py └── data │ ├── __init__.py │ └── test_ranking_data.csv ├── comet ├── models │ ├── ranking │ │ ├── __init__.py │ │ ├── wmt_kendall.py │ │ └── ranking_metric.py │ ├── regression │ │ ├── __init__.py │ │ ├── referenceless.py │ │ ├── regression_metric_hybrid.py │ │ └── regression_metric.py │ ├── pooling_utils.py │ ├── __init__.py │ └── base.py ├── modules │ ├── __init__.py │ ├── bottleneck.py │ ├── losses.py │ ├── feedforward.py │ └── layerwise_attention.py ├── encoders │ ├── __init__.py │ ├── xlmr.py │ ├── base.py │ └── bert.py ├── __init__.py ├── cli │ ├── train.py │ ├── score.py │ └── compare.py └── download_utils.py ├── requirements.txt ├── configs ├── early_stopping.yaml ├── model_checkpoint.yaml ├── models │ ├── regression_metric_comet_kl.yaml │ ├── regression_metric_comet_heteroscedastic.yaml │ ├── regression_metric_comet_dup.yaml │ ├── regression_metric_comet_plain.yaml │ └── regression_metric_dup_256bottleneck.yaml └── trainer.yaml ├── pyproject.toml ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /comet/models/ranking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /comet/models/regression/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DATA_PATH = os.path.abspath(__file__) 4 | DATA_PATH = os.path.dirname(DATA_PATH) 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece==0.1.96 2 | pandas==1.1.5 3 | transformers==4.8.2 4 | pytorch-lightning==1.3.5 5 | jsonargparse==3.13.1 6 | -------------------------------------------------------------------------------- /comet/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .feedforward import FeedForward 3 | from .bottleneck import 
Bottleneck 4 | from .layerwise_attention import LayerwiseAttention 5 | from .losses import HeteroscedasticLoss, HeteroscedasticLossv2, HeteroApproxLoss, HeteroApproxLossv2, SquaredLoss, KLLoss 6 | -------------------------------------------------------------------------------- /configs/early_stopping.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping 2 | init_args: 3 | monitor: val_pearson 4 | min_delta: 0. 5 | patience: 3 6 | verbose: False 7 | mode: max 8 | strict: True 9 | check_finite: True 10 | stopping_threshold: null 11 | divergence_threshold: null 12 | check_on_train_epoch_end: False 13 | -------------------------------------------------------------------------------- /configs/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 2 | init_args: 3 | dirpath: null 4 | filename: null 5 | monitor: val_pearson 6 | verbose: True 7 | save_last: False 8 | save_top_k: 2 9 | save_weights_only: True 10 | mode: max 11 | auto_insert_metric_name: True 12 | every_n_train_steps: null 13 | every_n_val_epochs: 1 14 | -------------------------------------------------------------------------------- /comet/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from .bert import BERTEncoder 15 | from .xlmr import XLMREncoder 16 | 17 | str2encoder = {"BERT": BERTEncoder, "XLM-RoBERTa": XLMREncoder} 18 | -------------------------------------------------------------------------------- /tests/unit/test_download_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import os 4 | import shutil 5 | from tests.data import DATA_PATH 6 | from comet.download_utils import download_model 7 | from comet.models import load_from_checkpoint 8 | 9 | 10 | class TestDownloadModel(unittest.TestCase): 11 | @classmethod 12 | def tearDownClass(cls): 13 | shutil.rmtree(os.path.join(DATA_PATH, "wmt21-cometinho-da")) 14 | 15 | def test_download_from_s3(self): 16 | data_path = download_model("wmt21-cometinho-da", saving_directory=DATA_PATH) 17 | self.assertTrue( 18 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/hparams.yaml")) 19 | ) 20 | self.assertTrue( 21 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/checkpoints/")) 22 | ) 23 | load_from_checkpoint(data_path) 24 | -------------------------------------------------------------------------------- /comet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2020 Unbabel 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import logging 18 | 19 | from .download_utils import download_model 20 | from .models import load_from_checkpoint 21 | 22 | logging.basicConfig(level=logging.INFO, format="%(message)s") 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | __version__ = "1.0.0rc4" 27 | __copyright__ = "2020-2021 Unbabel. All rights reserved." 
28 | -------------------------------------------------------------------------------- /configs/models/regression_metric_comet_kl.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: data/balanced/scores-1719.csv 18 | validation_data: data/balanced/scores-1719.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 0 24 | data_portion: 1.0 25 | loss: kl 26 | feature_size: 0 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /configs/models/regression_metric_comet_heteroscedastic.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: data/balanced/scores-1719.csv 18 | validation_data: data/balanced/scores-1719.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 0 24 | data_portion: 1.0 25 | loss: hts 26 | feature_size: 0 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /configs/models/regression_metric_comet_dup.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: data/balanced/scores-2020_train_errors_cmtfeat.csv 18 | validation_data: data/balanced/scores-2020_val_errors_cmtfeat.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 256 24 | data_portion: 1.0 25 | loss: hts_approx 26 | feature_size: 1 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml 
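Note: the `loss` field in these configs selects the training objective; the uncertainty-aware options (`hts`, `hts_approx`, `kl`) appear to correspond to the loss modules defined in comet/modules/losses.py, although the exact config-string-to-class wiring lives in the model code, which is not part of this listing. A minimal sketch of a heteroscedastic-style objective using HeteroscedasticLoss on invented dummy tensors:

import torch
from comet.modules import HeteroscedasticLoss

# Dummy per-segment predictions: a quality mean and a standard deviation,
# plus gold scores (all values are illustrative only).
mu = torch.tensor([0.62, 0.40, 0.85])
std = torch.tensor([0.10, 0.25, 0.05])
target = torch.tensor([0.70, 0.35, 0.80])

# Gaussian NLL-style objective: the squared error is down-weighted where the
# predicted variance is large, while the log-variance term penalises
# overly large variances.
loss = HeteroscedasticLoss()(mu, std, target)
print(loss.item())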
-------------------------------------------------------------------------------- /configs/models/regression_metric_comet_plain.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-1719.csv 18 | validation_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-1719.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 0 24 | data_portion: 1.0 25 | loss: mse 26 | feature_size: 0 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /configs/models/regression_metric_dup_256bottleneck.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-2020_train_errors_cmtfeat.csv 18 | validation_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-2020_val_errors_cmtfeat.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 256 24 | data_portion: 1.0 25 | loss: hts_approx 26 | feature_size: 1 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /tests/unit/encoders/test_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from comet.encoders.bert import BERTEncoder 5 | 6 | 7 | class TestBERTEncoder(unittest.TestCase): 8 | 9 | bert = BERTEncoder.from_pretrained("google/bert_uncased_L-2_H-128_A-2") 10 | 11 | def test_num_layers(self): 12 | self.assertEqual(self.bert.num_layers, 3) 13 | 14 | def test_output_units(self): 15 | self.assertEqual(self.bert.output_units, 128) 16 | 17 | def test_max_positions(self): 18 | self.assertEqual(self.bert.max_positions, 512) 19 | 20 | def test_prepare_sample(self): 21 | sample = ["hello world, welcome to COMET!", "This is a batch"] 22 | model_input = self.bert.prepare_sample(sample) 23 | self.assertIn("input_ids", model_input) 24 | self.assertIn("attention_mask", model_input) 25 | 26 | def test_forward(self): 27 | sample = ["hello world, welcome to COMET!", "This is a batch"] 28 | model_input = 
self.bert.prepare_sample(sample) 29 | model_output = self.bert(**model_input) 30 | self.assertIn("wordemb", model_output) 31 | self.assertIn("sentemb", model_output) 32 | self.assertIn("all_layers", model_output) 33 | self.assertIn("attention_mask", model_output) 34 | -------------------------------------------------------------------------------- /tests/unit/encoders/test_xlmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from comet.encoders.xlmr import XLMREncoder 5 | 6 | 7 | class TestXLMREncoder(unittest.TestCase): 8 | 9 | xlmr = XLMREncoder.from_pretrained("Unbabel/xlm-roberta-comet-small") 10 | 11 | def test_num_layers(self): 12 | self.assertEqual(self.xlmr.num_layers, 7) 13 | 14 | def test_output_units(self): 15 | self.assertEqual(self.xlmr.output_units, 384) 16 | 17 | def test_max_positions(self): 18 | self.assertEqual(self.xlmr.max_positions, 514) 19 | 20 | def test_prepare_sample(self): 21 | sample = ["hello world, welcome to COMET!", "This is a batch"] 22 | model_input = self.xlmr.prepare_sample(sample) 23 | self.assertIn("input_ids", model_input) 24 | self.assertIn("attention_mask", model_input) 25 | 26 | def test_forward(self): 27 | sample = ["hello world, welcome to COMET!", "This is a batch"] 28 | model_input = self.xlmr.prepare_sample(sample) 29 | model_output = self.xlmr(**model_input) 30 | self.assertIn("wordemb", model_output) 31 | self.assertIn("sentemb", model_output) 32 | self.assertIn("all_layers", model_output) 33 | self.assertIn("attention_mask", model_output) 34 | -------------------------------------------------------------------------------- /configs/trainer.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.trainer.trainer.Trainer 2 | init_args: 3 | accelerator: null 4 | accumulate_grad_batches: 2 5 | amp_backend: native 6 | amp_level: O0 7 | auto_lr_find: False 8 | auto_scale_batch_size: False 9 | auto_select_gpus: False 10 | benchmark: False 11 | check_val_every_n_epoch: 1 12 | default_root_dir: null 13 | deterministic: True 14 | fast_dev_run: False 15 | flush_logs_every_n_steps: 100 16 | gpus: 1 17 | gradient_clip_val: 1.0 18 | gradient_clip_algorithm: norm 19 | limit_train_batches: 1.0 20 | limit_val_batches: 1.0 21 | limit_test_batches: 1.0 22 | limit_predict_batches: 1.0 23 | log_gpu_memory: null 24 | log_every_n_steps: 10 25 | prepare_data_per_node: True 26 | process_position: 0 27 | progress_bar_refresh_rate: null 28 | profiler: null 29 | overfit_batches: 0.0 30 | plugins: null 31 | precision: 32 32 | max_epochs: 3 33 | min_epochs: 1 34 | max_steps: null 35 | min_steps: null 36 | max_time: null 37 | num_nodes: 1 38 | num_processes: 1 39 | num_sanity_val_steps: 10 40 | reload_dataloaders_every_epoch: False 41 | replace_sampler_ddp: True 42 | resume_from_checkpoint: null 43 | sync_batchnorm: False 44 | terminate_on_nan: False 45 | tpu_cores: null 46 | track_grad_norm: -1 47 | val_check_interval: 1.0 48 | weights_summary: top 49 | move_metrics_to_cpu: True 50 | multiple_trainloader_mode: max_size_cycle 51 | stochastic_weight_avg: True -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "unbabel-comet" 3 | version = "1.0.0rc4" 4 | description = "High-quality Machine Translation Evaluation" 5 | authors = ["Ricardo Rei, Craig Stewart, Catarina 
Farinha, Alon Lavie"] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | homepage = "https://github.com/Unbabel/COMET" 9 | repository = "https://github.com/Unbabel/COMET" 10 | documentation = "https://unbabel.github.io/COMET/html/index.html" 11 | keywords = [ 12 | "Machine Translation", 13 | "Evaluation", 14 | "Unbabel", 15 | "COMET" 16 | ] 17 | classifiers = [ 18 | 'Development Status :: 4 - Beta', 19 | 'Environment :: Console', 20 | 'Intended Audience :: Science/Research', 21 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 22 | ] 23 | packages = [ 24 | {include = "comet"}, 25 | ] 26 | include = [ 27 | "LICENSE", 28 | "pyproject.toml", 29 | "CONTRIBUTING.md" 30 | ] 31 | 32 | [tool.poetry.scripts] 33 | comet-train = 'comet.cli.train:train_command' 34 | comet-score = 'comet.cli.score:score_command' 35 | comet-compare = 'comet.cli.compare:compare_command' 36 | 37 | [tool.poetry.dependencies] 38 | python = "^3.6.1" 39 | sentencepiece = "^0.1.96" 40 | pandas = "1.1.5" 41 | transformers = "^4.8.2" 42 | pytorch-lightning = "1.3.5" 43 | jsonargparse = "3.13.1" 44 | torch = "1.6.0" 45 | torchmetrics = "0.5" 46 | 47 | [tool.poetry.dev-dependencies] 48 | sphinx-markdown-tables = "0.0.15" 49 | coverage = "^5.5" 50 | scikit-learn = "0.24" 51 | scipy = "1.5.4" 52 | 53 | [build-system] 54 | requires = ["poetry-core>=1.0.0"] 55 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /comet/models/ranking/wmt_kendall.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | WMT Kendall Tau 18 | ==================== 19 | Kendall Tau like formulation used to measure agreement between relative ranks 20 | produced by humans and relative ranks produced by metrics. 
21 | """ 22 | import torch 23 | from torchmetrics import Metric 24 | 25 | 26 | class WMTKendall(Metric): 27 | def __init__(self, dist_sync_on_step=False, prefix=""): 28 | super().__init__(dist_sync_on_step=dist_sync_on_step) 29 | self.add_state("concordance", default=torch.tensor(0), dist_reduce_fx="sum") 30 | self.add_state("discordance", default=torch.tensor(0), dist_reduce_fx="sum") 31 | self.prefix = prefix 32 | 33 | def update(self, distance_pos: torch.Tensor, distance_neg: torch.Tensor): 34 | assert distance_pos.shape == distance_neg.shape 35 | self.concordance = torch.sum((distance_pos < distance_neg).float()) 36 | self.discordance = torch.sum((distance_pos >= distance_neg).float()) 37 | 38 | def compute(self): 39 | return { 40 | self.prefix 41 | + "_kendall": (self.concordance - self.discordance) 42 | / (self.concordance + self.discordance) 43 | } 44 | -------------------------------------------------------------------------------- /comet/modules/bottleneck.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | r""" 15 | Bottleneck Layer 16 | ============== 17 | Bottleneck module to be used with customised features 18 | """ 19 | 20 | from typing import List, Optional 21 | 22 | import torch 23 | from torch import nn 24 | 25 | 26 | class Bottleneck(nn.Module): 27 | """ 28 | Bottleneck layer. 29 | 30 | :param in_dim: Number input features. 31 | :param out_dim: Number of output features. Default is just a score. 32 | :param hidden_sizes: List with hidden layer sizes. 33 | :param activations: Name of the activation function to be used in the hidden layers. 34 | :param final_activation: Name of the final activation function if any. 35 | :param dropout: dropout to be used in the hidden layers. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | in_dim: int, 41 | hidden_sizes: List[int] = [3072, 256], 42 | activations: str = "Sigmoid", 43 | dropout: float = 0.1, 44 | ) -> None: 45 | super().__init__() 46 | modules = [] 47 | modules.append(nn.Linear(in_dim, hidden_sizes[0])) 48 | modules.append(self.build_activation(activations)) 49 | modules.append(nn.Dropout(dropout)) 50 | for i in range(1, len(hidden_sizes)): 51 | modules.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i])) 52 | modules.append(self.build_activation(activations)) 53 | modules.append(nn.Dropout(dropout)) 54 | 55 | self.ff = nn.Sequential(*modules) 56 | 57 | def build_activation(self, activation: str) -> nn.Module: 58 | if hasattr(nn, activation): 59 | return getattr(nn, activation)() 60 | 61 | def forward(self, in_features: torch.Tensor) -> torch.Tensor: 62 | return self.ff(in_features) 63 | -------------------------------------------------------------------------------- /comet/encoders/xlmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | XLM-RoBERTa Encoder 17 | ============== 18 | Pretrained XLM-RoBERTa encoder from Hugging Face. 19 | """ 20 | from typing import Dict 21 | 22 | import torch 23 | from comet.encoders.base import Encoder 24 | from comet.encoders.bert import BERTEncoder 25 | from transformers import XLMRobertaModel, XLMRobertaTokenizer 26 | 27 | 28 | class XLMREncoder(BERTEncoder): 29 | """XLM-RoBERTA Encoder encoder. 30 | 31 | :param pretrained_model: Pretrained model from hugging face. 32 | """ 33 | 34 | def __init__(self, pretrained_model: str) -> None: 35 | super(Encoder, self).__init__() 36 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_model) 37 | self.model = XLMRobertaModel.from_pretrained( 38 | pretrained_model, add_pooling_layer=False 39 | ) 40 | self.model.encoder.output_hidden_states = True 41 | 42 | @classmethod 43 | def from_pretrained(cls, pretrained_model: str) -> Encoder: 44 | """Function that loads a pretrained encoder from Hugging Face. 45 | :param pretrained_model: Name of the pretrain model to be loaded. 
46 | 47 | :return: Encoder model 48 | """ 49 | return XLMREncoder(pretrained_model) 50 | 51 | def forward( 52 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs 53 | ) -> Dict[str, torch.Tensor]: 54 | last_hidden_states, _, all_layers = self.model( 55 | input_ids=input_ids, 56 | attention_mask=attention_mask, 57 | output_hidden_states=True, 58 | return_dict=False, 59 | ) 60 | return { 61 | "sentemb": last_hidden_states[:, 0, :], 62 | "wordemb": last_hidden_states, 63 | "all_layers": all_layers, 64 | "attention_mask": attention_mask, 65 | } 66 | -------------------------------------------------------------------------------- /comet/models/pooling_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | 18 | def average_pooling( 19 | tokens: torch.Tensor, 20 | embeddings: torch.Tensor, 21 | mask: torch.Tensor, 22 | padding_index: int, 23 | ) -> torch.Tensor: 24 | """Average pooling function. 25 | :param tokens: Word ids [batch_size x seq_length] 26 | :param embeddings: Word embeddings [batch_size x seq_length x hidden_size] 27 | :param mask: Padding mask [batch_size x seq_length] 28 | :param padding_index: Padding value. 29 | """ 30 | wordemb = mask_fill(0.0, tokens, embeddings, padding_index) 31 | sentemb = torch.sum(wordemb, 1) 32 | sum_mask = mask.unsqueeze(-1).expand(embeddings.size()).float().sum(1) 33 | return sentemb / sum_mask 34 | 35 | 36 | def max_pooling( 37 | tokens: torch.Tensor, embeddings: torch.Tensor, padding_index: int 38 | ) -> torch.Tensor: 39 | """Max pooling function. 40 | :param tokens: Word ids [batch_size x seq_length] 41 | :param embeddings: Word embeddings [batch_size x seq_length x hidden_size] 42 | :param padding_index: Padding value. 43 | """ 44 | return mask_fill(float("-inf"), tokens, embeddings, padding_index).max(dim=1)[0] 45 | 46 | 47 | def mask_fill( 48 | fill_value: float, 49 | tokens: torch.Tensor, 50 | embeddings: torch.Tensor, 51 | padding_index: int, 52 | ) -> torch.Tensor: 53 | """ 54 | Function that masks embeddings representing padded elements. 55 | :param fill_value: the value to fill the embeddings belonging to padded tokens. 56 | :param tokens: The input sequences [bsz x seq_len]. 57 | :param embeddings: word embeddings [bsz x seq_len x hiddens]. 58 | :param padding_index: Index of the padding token. 
59 | """ 60 | padding_mask = tokens.eq(padding_index).unsqueeze(-1) 61 | return embeddings.float().masked_fill_(padding_mask, fill_value).type_as(embeddings) 62 | -------------------------------------------------------------------------------- /comet/modules/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class HeteroscedasticLoss(nn.Module): 5 | 6 | def forward(self, mu: torch.Tensor, std: torch.Tensor, target: torch.Tensor): 7 | sigma = std**2 8 | log1 = 0.5 * torch.neg(torch.log(sigma)).exp() 9 | mse = (target - mu)**2 10 | log2 = 0.5 * torch.log(sigma) 11 | return torch.sum(log1*mse+log2) 12 | 13 | 14 | class HeteroscedasticLossv2(nn.Module): 15 | 16 | def forward(self, mu: torch.Tensor, std: torch.Tensor, target: torch.Tensor): 17 | sigma = std 18 | log1 = 0.5 * torch.neg(torch.log(sigma)).exp() 19 | mse = (target - mu)**2 20 | log2 = 0.5 * torch.log(sigma) 21 | return torch.sum(log1*mse+log2) 22 | 23 | #Heteroscedastic inspired loss for error/uncertainty prediction 24 | class HeteroApproxLoss(nn.Module): 25 | 26 | def forward(self, pred: torch.Tensor, target: torch.Tensor): 27 | sigma = pred**2 28 | l1 = 0.5 * torch.neg(torch.log(sigma)).exp() 29 | l2 = 0.5 * torch.log(sigma) 30 | mse = target**2 31 | #return torch.mean(0.5*pred**(-2)*(target**2)+(0.5*torch.log(pred**2))) 32 | return torch.sum(l1*mse+l2) 33 | 34 | #Heteroscedastic inspired loss for error/uncertainty prediction 35 | class HeteroApproxLossv2(nn.Module): 36 | 37 | def forward(self, pred: torch.Tensor, target: torch.Tensor): 38 | sigma = pred 39 | l1 = 0.5 * torch.neg(torch.log(sigma)).exp() 40 | l2 = 0.5 * torch.log(sigma) 41 | mse = target**2 42 | #return torch.mean(0.5*pred**(-2)*(target**2)+(0.5*torch.log(pred**2))) 43 | return torch.sum(l1*mse+l2) 44 | 45 | class SquaredLoss(nn.Module): 46 | def forward(self, pred: torch.Tensor, target: torch.Tensor): 47 | mse = (target**2-pred**2)**2 48 | 49 | #return torch.mean(0.5*pred**(-2)*(target**2)+(0.5*torch.log(pred**2))) 50 | return torch.mean(mse) 51 | 52 | 53 | class KLLoss(nn.Module): 54 | #based on Daan's idea 55 | def forward(self, mu: torch.Tensor, sigma: torch.Tensor, target_mu: torch.Tensor, target_std: torch.Tensor): 56 | 57 | # Add fudge factor to variance to avoid large KL values 58 | # (value of 1e-2 just turned out to work - 1e-3 already 59 | # occasionally caused loss > 1000) 60 | std1 = target_std 61 | std2 = sigma 62 | mean1 = target_mu 63 | mean2 = mu 64 | 65 | kl = torch.log(torch.abs(std2)/torch.abs(std1)) + (std1**2 + (mean1 - mean2)**2)/(2*std2**2) - 0.5 66 | 67 | return kl.mean() 68 | 69 | 70 | -------------------------------------------------------------------------------- /tests/integration/models/test_ranking_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import multiprocessing 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import torch 8 | from comet.models import RankingMetric 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestRankingMetric(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 
| 24 | def test_training(self): 25 | seed_everything(12) 26 | trainer = Trainer( 27 | gpus=0, 28 | max_epochs=4, 29 | deterministic=True, 30 | checkpoint_callback=True, 31 | default_root_dir=DATA_PATH, 32 | logger=False, 33 | weights_summary=None, 34 | progress_bar_refresh_rate=0, 35 | ) 36 | model = RankingMetric( 37 | encoder_model="BERT", 38 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 39 | train_data=os.path.join(DATA_PATH, "test_ranking_data.csv"), 40 | validation_data=os.path.join(DATA_PATH, "test_ranking_data.csv"), 41 | layerwise_decay=0.95, 42 | batch_size=32, 43 | learning_rate=1e-04, 44 | encoder_learning_rate=1e-04, 45 | ) 46 | trainer.fit(model) 47 | self.assertTrue( 48 | os.path.exists( 49 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=15.ckpt") 50 | ) 51 | ) 52 | saved_model = RankingMetric.load_from_checkpoint( 53 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=15.ckpt") 54 | ) 55 | dataset = saved_model.read_csv( 56 | os.path.join(DATA_PATH, "test_regression_data.csv"), regression=True 57 | ) 58 | y = [s["score"] for s in dataset] 59 | dataloader = DataLoader( 60 | dataset=dataset, 61 | batch_size=256, 62 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 63 | num_workers=multiprocessing.cpu_count(), 64 | ) 65 | y_hat = ( 66 | torch.cat( 67 | trainer.predict(dataloaders=dataloader, return_predictions=True), dim=0 68 | ) 69 | .cpu() 70 | .tolist() 71 | ) 72 | # This shouldn't break! 73 | pearsonr(y_hat, y)[0] 74 | -------------------------------------------------------------------------------- /comet/modules/feedforward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | Feed Forward 17 | ============== 18 | Feed Forward Neural Network module that can be used for classification or regression 19 | """ 20 | from typing import List, Optional 21 | 22 | import torch 23 | from torch import nn 24 | 25 | 26 | class FeedForward(nn.Module): 27 | """ 28 | Feed Forward Neural Network. 29 | 30 | :param in_dim: Number input features. 31 | :param out_dim: Number of output features. Default is just a score. 32 | :param hidden_sizes: List with hidden layer sizes. 33 | :param activations: Name of the activation function to be used in the hidden layers. 34 | :param final_activation: Name of the final activation function if any. 35 | :param dropout: dropout to be used in the hidden layers. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | in_dim: int, 41 | out_dim: int = 1, 42 | hidden_sizes: List[int] = [3072, 768], 43 | activations: str = "Sigmoid", 44 | final_activation: Optional[str] = None, 45 | dropout: float = 0.1, 46 | ) -> None: 47 | super().__init__() 48 | modules = [] 49 | modules.append(nn.Linear(in_dim, hidden_sizes[0])) 50 | modules.append(self.build_activation(activations)) 51 | modules.append(nn.Dropout(dropout)) 52 | 53 | for i in range(1, len(hidden_sizes)): 54 | modules.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i])) 55 | modules.append(self.build_activation(activations)) 56 | modules.append(nn.Dropout(dropout)) 57 | 58 | modules.append(nn.Linear(hidden_sizes[-1], int(out_dim))) 59 | if final_activation is not None: 60 | modules.append(self.build_activation(final_activation)) 61 | 62 | self.ff = nn.Sequential(*modules) 63 | 64 | def build_activation(self, activation: str) -> nn.Module: 65 | if hasattr(nn, activation): 66 | return getattr(nn, activation)() 67 | 68 | def forward(self, in_features: torch.Tensor) -> torch.Tensor: 69 | return self.ff(in_features) 70 | -------------------------------------------------------------------------------- /tests/integration/models/test_regression_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import multiprocessing 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import torch 8 | from comet.models import RegressionMetric 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestRegressionMetric(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 | 24 | def test_training(self): 25 | seed_everything(12) 26 | trainer = Trainer( 27 | gpus=0, 28 | max_epochs=10, 29 | deterministic=True, 30 | checkpoint_callback=True, 31 | default_root_dir=DATA_PATH, 32 | logger=False, 33 | weights_summary=None, 34 | progress_bar_refresh_rate=0, 35 | ) 36 | model = RegressionMetric( 37 | encoder_model="BERT", 38 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 39 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 40 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 41 | hidden_sizes=[384], 42 | layerwise_decay=0.95, 43 | batch_size=32, 44 | learning_rate=1e-04, 45 | encoder_learning_rate=1e-04, 46 | ) 47 | trainer.fit(model) 48 | self.assertTrue( 49 | os.path.exists( 50 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 51 | ) 52 | ) 53 | 54 | saved_model = RegressionMetric.load_from_checkpoint( 55 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 56 | ) 57 | dataset = saved_model.read_csv( 58 | os.path.join(DATA_PATH, "test_regression_data.csv") 59 | ) 60 | y = [s["score"] for s in dataset] 61 | dataloader = DataLoader( 62 | dataset=dataset, 63 | batch_size=256, 64 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 65 | num_workers=multiprocessing.cpu_count(), 66 | ) 67 | y_hat = ( 68 | torch.cat( 69 | trainer.predict(dataloaders=dataloader, return_predictions=True), dim=0 70 | ) 71 | .cpu() 72 | .tolist() 73 | ) 74 | self.assertAlmostEqual(pearsonr(y_hat, y)[0], 0.8, places=1) 75 | 
-------------------------------------------------------------------------------- /tests/integration/models/test_referenceless_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import multiprocessing 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import torch 8 | from comet.models import ReferencelessRegression 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestReferencelessRegression(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 | 24 | def test_training(self): 25 | 26 | seed_everything(12) 27 | trainer = Trainer( 28 | gpus=0, 29 | max_epochs=10, 30 | deterministic=True, 31 | checkpoint_callback=True, 32 | default_root_dir=DATA_PATH, 33 | logger=False, 34 | weights_summary=None, 35 | progress_bar_refresh_rate=0, 36 | ) 37 | model = ReferencelessRegression( 38 | encoder_model="BERT", 39 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 40 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 41 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 42 | hidden_sizes=[256], 43 | layerwise_decay=0.95, 44 | batch_size=32, 45 | learning_rate=1e-04, 46 | encoder_learning_rate=1e-04, 47 | ) 48 | trainer.fit(model) 49 | self.assertTrue( 50 | os.path.exists( 51 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 52 | ) 53 | ) 54 | 55 | saved_model = ReferencelessRegression.load_from_checkpoint( 56 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 57 | ) 58 | dataset = saved_model.read_csv( 59 | os.path.join(DATA_PATH, "test_regression_data.csv") 60 | ) 61 | y = [s["score"] for s in dataset] 62 | dataloader = DataLoader( 63 | dataset=dataset, 64 | batch_size=256, 65 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 66 | num_workers=multiprocessing.cpu_count(), 67 | ) 68 | y_hat = ( 69 | torch.cat( 70 | trainer.predict(dataloaders=dataloader, return_predictions=True), dim=0 71 | ) 72 | .cpu() 73 | .tolist() 74 | ) 75 | self.assertAlmostEqual(pearsonr(y_hat, y)[0], 0.8, places=1) 76 | -------------------------------------------------------------------------------- /comet/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2020 Unbabel 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .regression.regression_metric import RegressionMetric 18 | from .ranking.ranking_metric import RankingMetric 19 | from .regression.referenceless import ReferencelessRegression 20 | from .base import CometModel 21 | 22 | import os 23 | import yaml 24 | 25 | str2model = { 26 | "referenceless_regression_metric": ReferencelessRegression, 27 | "regression_metric": RegressionMetric, 28 | "ranking_metric": RankingMetric, 29 | } 30 | 31 | available_metrics = { 32 | "emnlp20-comet-rank": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/emnlp20-comet-rank.tar.gz", 33 | "wmt20-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-da.tar.gz", 34 | "wmt20-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-qe-da.tar.gz", 35 | "wmt21-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-da.tar.gz", 36 | "wmt21-comet-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-mqm.tar.gz", 37 | # "wmt21-cometinho-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-mqm.tar.gz", 38 | "wmt21-cometinho-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-da.tar.gz", 39 | # "wmt21-comet-qe-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/refless-wmt21-comet-mqm.tar.gz", 40 | # "wmt21-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/refless-wmt21-comet-da.tar.gz", 41 | } 42 | 43 | 44 | def load_from_checkpoint(checkpoint_path: str) -> CometModel: 45 | """Loads models from a checkpoint path. 46 | :param checkpoint_path: Path to a model checkpoint. 47 | 48 | :return: Returns a COMET model. 49 | """ 50 | if not os.path.exists(checkpoint_path): 51 | raise Exception(f"Invalid checkpoint path: {checkpoint_path}") 52 | 53 | hparams_file = "/".join(checkpoint_path.split("/")[:-2] + ["hparams.yaml"]) 54 | if os.path.exists(hparams_file): 55 | with open(hparams_file) as yaml_file: 56 | hparams = yaml.load(yaml_file.read(), Loader=yaml.FullLoader) 57 | model_class = str2model[hparams["class_identifier"]] 58 | model = model_class.load_from_checkpoint(checkpoint_path, **hparams) 59 | return model 60 | else: 61 | raise Exception("hparams.yaml file is missing!") 62 | -------------------------------------------------------------------------------- /comet/encoders/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | r""" 15 | Encoder Model base 16 | ==================== 17 | Module defining the common interface between all pretrained encoder models. 
18 | """ 19 | import abc 20 | from typing import Dict, List 21 | 22 | import torch 23 | import torch.nn as nn 24 | 25 | 26 | class Encoder(nn.Module, metaclass=abc.ABCMeta): 27 | """Base class for an encoder model.""" 28 | 29 | @property 30 | @abc.abstractmethod 31 | def output_units(self): 32 | """Max number of tokens the encoder handles.""" 33 | pass 34 | 35 | @property 36 | @abc.abstractmethod 37 | def max_positions(self): 38 | """Max number of tokens the encoder handles.""" 39 | pass 40 | 41 | @property 42 | @abc.abstractmethod 43 | def num_layers(self): 44 | """Number of model layers available.""" 45 | pass 46 | 47 | @classmethod 48 | @abc.abstractmethod 49 | def from_pretrained(cls, pretrained_model): 50 | """Function that loads a pretrained encoder and the respective tokenizer. 51 | 52 | :return: Encoder model 53 | """ 54 | raise NotImplementedError 55 | 56 | def prepare_sample(self, sample: List[str]) -> Dict[str, torch.Tensor]: 57 | """Receives a list of strings and applies tokenization and vectorization. 58 | 59 | :param sample: List with text segments to be tokenized and padded. 60 | 61 | :return: Dictionary with HF model inputs. 62 | """ 63 | tokenizer_output = self.tokenizer( 64 | sample, 65 | return_tensors="pt", 66 | padding=True, 67 | truncation=True, 68 | max_length=self.max_positions - 2, 69 | ) 70 | return tokenizer_output 71 | 72 | def freeze(self) -> None: 73 | """Frezees the entire encoder.""" 74 | for param in self.parameters(): 75 | param.requires_grad = False 76 | 77 | def unfreeze(self) -> None: 78 | """Unfrezees the entire encoder.""" 79 | for param in self.parameters(): 80 | param.requires_grad = True 81 | 82 | @abc.abstractmethod 83 | def freeze_embeddings(self) -> None: 84 | """Frezees the embedding layer.""" 85 | pass 86 | 87 | @abc.abstractmethod 88 | def layerwise_lr(self, lr: float, decay: float): 89 | """ 90 | :param lr: Learning rate for the highest encoder layer. 91 | :param decay: decay percentage for the lower layers. 92 | 93 | :return: List of model parameters with layer-wise decay learning rate 94 | """ 95 | pass 96 | 97 | @abc.abstractmethod 98 | def forward( 99 | self, tokens: torch.Tensor, lengths: torch.Tensor 100 | ) -> Dict[str, torch.Tensor]: 101 | pass 102 | -------------------------------------------------------------------------------- /comet/encoders/bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | BERT Encoder 17 | ============== 18 | Pretrained BERT encoder from Hugging Face. 19 | """ 20 | from typing import Dict 21 | 22 | import torch 23 | from comet.encoders.base import Encoder 24 | from transformers import AutoModel, AutoTokenizer 25 | 26 | 27 | class BERTEncoder(Encoder): 28 | """BERT encoder. 29 | 30 | :param pretrained_model: Pretrained model from hugging face. 
31 | """ 32 | 33 | def __init__(self, pretrained_model: str) -> None: 34 | super().__init__() 35 | self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True) 36 | self.model = AutoModel.from_pretrained(pretrained_model) 37 | self.model.encoder.output_hidden_states = True 38 | 39 | @property 40 | def output_units(self): 41 | """Max number of tokens the encoder handles.""" 42 | return self.model.config.hidden_size 43 | 44 | @property 45 | def max_positions(self): 46 | """Max number of tokens the encoder handles.""" 47 | return self.model.config.max_position_embeddings 48 | 49 | @property 50 | def num_layers(self): 51 | """Number of model layers available.""" 52 | return self.model.config.num_hidden_layers + 1 53 | 54 | @classmethod 55 | def from_pretrained(cls, pretrained_model: str) -> Encoder: 56 | """Function that loads a pretrained encoder from Hugging Face. 57 | :param pretrained_model: Name of the pretrain model to be loaded. 58 | 59 | :return: Encoder model 60 | """ 61 | return BERTEncoder(pretrained_model) 62 | 63 | def freeze_embeddings(self) -> None: 64 | """Frezees the embedding layer.""" 65 | for param in self.model.embeddings.parameters(): 66 | param.requires_grad = False 67 | 68 | def layerwise_lr(self, lr: float, decay: float): 69 | """ 70 | :param lr: Learning rate for the highest encoder layer. 71 | :param decay: decay percentage for the lower layers. 72 | 73 | :return: List of model parameters with layer-wise decay learning rate 74 | """ 75 | # Embedding Layer 76 | opt_parameters = [ 77 | { 78 | "params": self.model.embeddings.parameters(), 79 | "lr": lr * decay ** (self.num_layers), 80 | } 81 | ] 82 | # All layers 83 | opt_parameters += [ 84 | { 85 | "params": self.model.encoder.layer[i].parameters(), 86 | "lr": lr * decay ** i, 87 | } 88 | for i in range(self.num_layers - 2, 0, -1) 89 | ] 90 | return opt_parameters 91 | 92 | def forward( 93 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs 94 | ) -> Dict[str, torch.Tensor]: 95 | last_hidden_states, pooler_output, all_layers = self.model( 96 | input_ids=input_ids, 97 | attention_mask=attention_mask, 98 | output_hidden_states=True, 99 | return_dict=False, 100 | ) 101 | return { 102 | "sentemb": pooler_output, 103 | "wordemb": last_hidden_states, 104 | "all_layers": all_layers, 105 | "attention_mask": attention_mask, 106 | } 107 | -------------------------------------------------------------------------------- /comet/cli/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | 16 | Command for training new Metrics. 
17 | ================================= 18 | 19 | e.g: 20 | ``` 21 | comet-train --cfg configs/models/regression_metric.yaml 22 | ``` 23 | 24 | For more details run the following command: 25 | ``` 26 | comet-train --help 27 | ``` 28 | """ 29 | import json 30 | 31 | from comet.models import ( 32 | CometModel, 33 | RankingMetric, 34 | ReferencelessRegression, 35 | RegressionMetric, 36 | ) 37 | from jsonargparse import ActionConfigFile, ArgumentParser, namespace_to_dict 38 | from pytorch_lightning import seed_everything 39 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint 40 | from pytorch_lightning.trainer.trainer import Trainer 41 | 42 | 43 | def train_command() -> None: 44 | parser = ArgumentParser(description="Command for training COMET models.") 45 | parser.add_argument( 46 | "--seed_everything", 47 | type=int, 48 | default=12, 49 | help="Training Seed.", 50 | ) 51 | parser.add_argument("--cfg", action=ActionConfigFile) 52 | parser.add_class_arguments(CometModel, "model") 53 | parser.add_subclass_arguments(RegressionMetric, "regression_metric") 54 | parser.add_subclass_arguments( 55 | ReferencelessRegression, "referenceless_regression_metric" 56 | ) 57 | parser.add_subclass_arguments(RankingMetric, "ranking_metric") 58 | parser.add_subclass_arguments(EarlyStopping, "early_stopping") 59 | parser.add_subclass_arguments(ModelCheckpoint, "model_checkpoint") 60 | parser.add_subclass_arguments(Trainer, "trainer") 61 | cfg = parser.parse_args() 62 | seed_everything(cfg.seed_everything) 63 | 64 | checkpoint_callback = ModelCheckpoint( 65 | **namespace_to_dict(cfg.model_checkpoint.init_args) 66 | ) 67 | early_stop_callback = EarlyStopping( 68 | **namespace_to_dict(cfg.early_stopping.init_args) 69 | ) 70 | trainer_args = namespace_to_dict(cfg.trainer.init_args) 71 | trainer_args["callbacks"] = [early_stop_callback, checkpoint_callback] 72 | print("TRAINER ARGUMENTS: ") 73 | print(json.dumps(trainer_args, indent=4, default=lambda x: x.__dict__)) 74 | trainer = Trainer(**trainer_args) 75 | 76 | print("MODEL ARGUMENTS: ") 77 | if cfg.regression_metric is not None: 78 | print( 79 | json.dumps( 80 | cfg.regression_metric.init_args, indent=4, default=lambda x: x.__dict__ 81 | ) 82 | ) 83 | model = RegressionMetric(**namespace_to_dict(cfg.regression_metric.init_args)) 84 | elif cfg.referenceless_regression_metric is not None: 85 | print( 86 | json.dumps( 87 | cfg.referenceless_regression_metric.init_args, 88 | indent=4, 89 | default=lambda x: x.__dict__, 90 | ) 91 | ) 92 | model = ReferencelessRegression( 93 | **namespace_to_dict(cfg.referenceless_regression_metric.init_args) 94 | ) 95 | elif cfg.ranking_metric is not None: 96 | print( 97 | json.dumps( 98 | cfg.ranking_metric.init_args, indent=4, default=lambda x: x.__dict__ 99 | ) 100 | ) 101 | model = RankingMetric(**namespace_to_dict(cfg.ranking_metric.init_args)) 102 | else: 103 | raise Exception("Model configurations missing!") 104 | 105 | trainer.fit(model) 106 | -------------------------------------------------------------------------------- /tests/integration/modules/test_feedforward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | import torch 5 | from sklearn.datasets import load_digits 6 | from sklearn.model_selection import train_test_split 7 | from torch import nn 8 | 9 | from comet.modules.feedforward import FeedForward 10 | from pytorch_lightning import seed_everything 11 | 12 | 13 | class TestFeedForward(unittest.TestCase): 14 
| def test_MNIST(self): 15 | seed_everything(3) 16 | """ 17 | STEP 1: LOADING DATASET 18 | """ 19 | images, labels = load_digits(return_X_y=True) 20 | images = [torch.Tensor(images[i, :]) for i in range(images.shape[0])] 21 | labels = torch.tensor(labels, dtype=torch.long) 22 | 23 | train_images, test_images, train_labels, test_labels = train_test_split( 24 | images, labels, test_size=0.2, random_state=42 25 | ) 26 | 27 | train_dataset = list(zip(train_images, train_labels)) 28 | test_dataset = list(zip(test_images, test_labels)) 29 | 30 | """ 31 | STEP 2: MAKING DATASET ITERABLE 32 | """ 33 | batch_size = 256 34 | n_iters = 80 35 | num_epochs = n_iters / (len(train_dataset) / batch_size) 36 | num_epochs = int(num_epochs) 37 | 38 | train_loader = torch.utils.data.DataLoader( 39 | dataset=train_dataset, batch_size=batch_size, shuffle=True 40 | ) 41 | 42 | test_loader = torch.utils.data.DataLoader( 43 | dataset=test_dataset, batch_size=batch_size, shuffle=False 44 | ) 45 | 46 | """ 47 | STEP 3: INSTANTIATE MODEL CLASS 48 | """ 49 | model = FeedForward( 50 | in_dim=8 * 8, 51 | out_dim=10, 52 | hidden_sizes=[100], 53 | activations="Tanh", 54 | ) 55 | 56 | """ 57 | STEP 4: INSTANTIATE LOSS CLASS 58 | """ 59 | criterion = nn.CrossEntropyLoss() 60 | 61 | """ 62 | STEP 5: INSTANTIATE OPTIMIZER CLASS 63 | """ 64 | learning_rate = 0.1 65 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 66 | 67 | """ 68 | STEP 7: TRAIN THE MODEL 69 | """ 70 | iter = 0 71 | for epoch in range(num_epochs): 72 | for i, (images, labels) in enumerate(train_loader): 73 | # Load images with gradient accumulation capabilities 74 | images = images.view(-1, 8 * 8).requires_grad_() 75 | 76 | # Clear gradients w.r.t. parameters 77 | optimizer.zero_grad() 78 | 79 | # Forward pass to get output/logits 80 | outputs = model(images) 81 | 82 | # Calculate Loss: softmax --> cross entropy loss 83 | loss = criterion(outputs, labels) 84 | 85 | # Getting gradients w.r.t. parameters 86 | loss.backward() 87 | 88 | # Updating parameters 89 | optimizer.step() 90 | 91 | iter += 1 92 | 93 | if iter % 10 == 0: 94 | # Calculate Accuracy 95 | correct = 0 96 | total = 0 97 | # Iterate through test dataset 98 | for images, labels in test_loader: 99 | # Load images with gradient accumulation capabilities 100 | images = images.view(-1, 8 * 8).requires_grad_() 101 | 102 | # Forward pass only to get logits/output 103 | outputs = model(images) 104 | 105 | # Get predictions from the maximum value 106 | _, predicted = torch.max(outputs.data, 1) 107 | 108 | # Total number of labels 109 | total += labels.size(0) 110 | 111 | # Total correct predictions 112 | correct += (predicted == labels).sum() 113 | 114 | accuracy = 100 * correct // total 115 | self.assertGreaterEqual(accuracy, 95) 116 | -------------------------------------------------------------------------------- /comet/modules/layerwise_attention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | Layer-Wise Attention Mechanism 17 | ================================ 18 | Computes a parameterised scalar mixture of N tensors, 19 | `mixture = gamma * sum(s_k * tensor_k)` 20 | where `s = softmax(w)`, with `w` and `gamma` scalar parameters. 21 | 22 | If `layer_norm=True` then apply layer normalization. 23 | 24 | If `dropout > 0`, then for each scalar weight, adjust its softmax 25 | weight mass to 0 with the dropout probability (i.e., setting the 26 | unnormalized weight to -inf). This effectively should redistribute 27 | dropped probability mass to all other weights. 28 | 29 | Original implementation: 30 | - https://github.com/Hyperparticle/udify 31 | """ 32 | from typing import List, Optional 33 | 34 | import torch 35 | from torch.nn import Parameter, ParameterList 36 | 37 | 38 | class LayerwiseAttention(torch.nn.Module): 39 | def __init__( 40 | self, 41 | num_layers: int, 42 | layer_norm: bool = False, 43 | layer_weights: Optional[List[int]] = None, 44 | dropout: float = None, 45 | ) -> None: 46 | super(LayerwiseAttention, self).__init__() 47 | self.num_layers = num_layers 48 | self.layer_norm = layer_norm 49 | self.dropout = dropout 50 | 51 | if layer_weights is None: 52 | layer_weights = [0.0] * num_layers 53 | elif len(layer_weights) != num_layers: 54 | raise Exception( 55 | "Length of layer_weights {} differs \ 56 | from num_layers {}".format( 57 | layer_weights, num_layers 58 | ) 59 | ) 60 | 61 | self.scalar_parameters = ParameterList( 62 | [ 63 | Parameter( 64 | torch.FloatTensor([layer_weights[i]]), 65 | requires_grad=True, 66 | ) 67 | for i in range(num_layers) 68 | ] 69 | ) 70 | 71 | self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=True) 72 | 73 | if self.dropout: 74 | dropout_mask = torch.zeros(len(self.scalar_parameters)) 75 | dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(-1e20) 76 | self.register_buffer("dropout_mask", dropout_mask) 77 | self.register_buffer("dropout_fill", dropout_fill) 78 | 79 | def forward( 80 | self, 81 | tensors: List[torch.Tensor], # pylint: disable=arguments-differ 82 | mask: torch.Tensor = None, 83 | ) -> torch.Tensor: 84 | 85 | if len(tensors) != self.num_layers: 86 | raise Exception( 87 | "{} tensors were passed, but the module was initialized to \ 88 | mix {} tensors.".format( 89 | len(tensors), self.num_layers 90 | ) 91 | ) 92 | 93 | def _layer_norm(tensor, broadcast_mask, num_elements_not_masked): 94 | tensor_masked = tensor * broadcast_mask 95 | mean = torch.sum(tensor_masked) / num_elements_not_masked 96 | variance = ( 97 | torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) 98 | / num_elements_not_masked 99 | ) 100 | return (tensor - mean) / torch.sqrt(variance + 1e-12) 101 | 102 | # BUG: Pytorch bug fix when Parameters are not well copied across GPUs 103 | # https://github.com/pytorch/pytorch/issues/36035 104 | if len([parameter for parameter in self.scalar_parameters]) != self.num_layers: 105 | weights = torch.tensor(self.weights, device=tensors[0].device) 106 | gamma = torch.tensor(self.gamma_value, device=tensors[0].device) 107 | else: 108 | weights = torch.cat([parameter for parameter in self.scalar_parameters]) 109 | gamma = self.gamma 110 | 111 | if self.training and self.dropout: 112 | weights = torch.where( 113 | self.dropout_mask.uniform_() > self.dropout, weights, self.dropout_fill 114 | ) 115 | 116 | normed_weights = torch.nn.functional.softmax(weights, dim=0) 117 | 
normed_weights = torch.split(normed_weights, split_size_or_sections=1) 118 | 119 | if not self.layer_norm: 120 | pieces = [] 121 | for weight, tensor in zip(normed_weights, tensors): 122 | pieces.append(weight * tensor) 123 | return gamma * sum(pieces) 124 | 125 | else: 126 | mask_float = mask.float() 127 | broadcast_mask = mask_float.unsqueeze(-1) 128 | input_dim = tensors[0].size(-1) 129 | num_elements_not_masked = torch.sum(mask_float) * input_dim 130 | 131 | pieces = [] 132 | for weight, tensor in zip(normed_weights, tensors): 133 | pieces.append( 134 | weight 135 | * _layer_norm(tensor, broadcast_mask, num_elements_not_masked) 136 | ) 137 | return gamma * sum(pieces) 138 | -------------------------------------------------------------------------------- /comet/cli/score.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Command for scoring MT systems. 16 | =============================== 17 | 18 | optional arguments: 19 | -h, --help Show this help message and exit. 20 | -s SOURCES, --sources SOURCES 21 | (required, type: Path_fr) 22 | -t TRANSLATIONS, --translations TRANSLATIONS 23 | (required, type: Path_fr) 24 | -r REFERENCES, --references REFERENCES 25 | (required, type: Path_fr) 26 | --to_json TO_JSON (type: Union[bool, str], default: False) 27 | --model MODEL (type: Union[str, Path_fr], default: wmt21-large-estimator) 28 | --batch_size BATCH_SIZE 29 | (type: int, default: 32) 30 | --gpus GPUS (type: int, default: 1) 31 | 32 | """ 33 | import json 34 | from typing import Union 35 | 36 | from comet.download_utils import download_model 37 | from comet.models import available_metrics, load_from_checkpoint 38 | from comet.modules import HeteroscedasticLoss, HeteroApproxLoss 39 | from jsonargparse import ArgumentParser 40 | from jsonargparse.typing import Path_fr 41 | from pytorch_lightning import seed_everything 42 | 43 | 44 | def score_command() -> None: 45 | parser = ArgumentParser(description="Command for scoring MT systems.") 46 | parser.add_argument("-s", "--sources", type=Path_fr, required=True) 47 | parser.add_argument("-t", "--translations", type=Path_fr, required=True) 48 | parser.add_argument("-r", "--references", type=Path_fr) 49 | parser.add_argument("-f", "--features", type=Path_fr, help="Path to additional features for predictor (optional)") 50 | parser.add_argument("--batch_size", type=int, default=8) 51 | parser.add_argument("--gpus", type=int, default=1) 52 | parser.add_argument( 53 | "--to_json", 54 | type=Union[bool, str], 55 | default=False, 56 | help="Exports results to a json file.", 57 | ) 58 | parser.add_argument( 59 | "--model", 60 | type=Union[str, Path_fr], 61 | required=False, 62 | default="wmt20-comet-da", 63 | #choices=available_metrics.keys(), 64 | help="COMET model to be used.", 65 | ) 66 | parser.add_argument( 67 | "--mc_dropout", 68 | type=Union[bool, int], 69 | default=False, 70 | 
help="Number of inference runs for each sample in MC Dropout.", 71 | ) 72 | parser.add_argument( 73 | "--refless", 74 | type=bool, 75 | default=False, 76 | help="flag for heteroschedastic loss", 77 | ) 78 | parser.add_argument( 79 | "--seed_everything", 80 | help="Prediction seed.", 81 | type=int, 82 | default=12, 83 | ) 84 | cfg = parser.parse_args() 85 | seed_everything(cfg.seed_everything) 86 | 87 | if (cfg.references is None) and ("refless" not in cfg.model) and (not cfg.refless): 88 | parser.error("{} requires -r/--references.".format(cfg.model)) 89 | 90 | model_path = ( 91 | download_model(cfg.model) if cfg.model in available_metrics else cfg.model 92 | ) 93 | model = load_from_checkpoint(model_path) 94 | model.eval() 95 | 96 | with open(cfg.sources()) as fp: 97 | sources = [line.strip() for line in fp.readlines()] 98 | 99 | with open(cfg.translations()) as fp: 100 | translations = [line.strip() for line in fp.readlines()] 101 | 102 | if cfg.features is not None : 103 | with open(cfg.features()) as fp: 104 | features = [(line.strip().split(',')) for line in fp.readlines()] 105 | features = list(map(list, zip(*features))) 106 | features = [[float(i) for i in f] for f in features] 107 | 108 | 109 | 110 | if "refless" in cfg.model or cfg.refless: 111 | if cfg.features is not None : 112 | data = {"src": sources, "mt": translations} 113 | for i,f in enumerate(features): 114 | data['f'+str(i+1)]=f 115 | else: 116 | data = {"src": sources, "mt": translations} 117 | else: 118 | with open(cfg.references()) as fp: 119 | references = [line.strip() for line in fp.readlines()] 120 | if cfg.features is not None : 121 | data = {"src": sources, "mt": translations, "ref": references} 122 | for i,f in enumerate(features): 123 | data['f'+str(i+1)]=f 124 | else: 125 | data = {"src": sources, "mt": translations, "ref": references} 126 | 127 | data = [dict(zip(data, t)) for t in zip(*data.values())] 128 | if cfg.mc_dropout: 129 | if isinstance(model.loss, HeteroscedasticLoss): 130 | mean_scores, std_scores, hts_mean, hts_std, sys_score = model.predict( 131 | data, cfg.batch_size, cfg.gpus, cfg.mc_dropout) 132 | else: 133 | mean_scores, std_scores, sys_score = model.predict( 134 | data, cfg.batch_size, cfg.gpus, cfg.mc_dropout) 135 | for i, (mean, std, sample) in enumerate(zip(mean_scores, std_scores, data)): 136 | print("Segment {}\tscore: {:.4f}\tvariance: {:.4f}".format(i, mean, std)) 137 | sample["COMET score"] = mean 138 | sample["COMET variance"] = std 139 | if isinstance(model.loss, HeteroscedasticLoss): 140 | sample["Heteroscedastic score"] = hts_mean 141 | sample["Heteroscedastic variance"] = hts_std 142 | 143 | print("System score: {:.4f}".format(sys_score)) 144 | if isinstance(cfg.to_json, str): 145 | with open(cfg.to_json, "w") as outfile: 146 | json.dump(data, outfile, ensure_ascii=False, indent=4) 147 | print("Predictions saved in: {}.".format(cfg.to_json)) 148 | 149 | else: 150 | if isinstance(model.loss, HeteroscedasticLoss): 151 | predictions, hts, sys_score = model.predict(data, cfg.batch_size, cfg.gpus) 152 | else: 153 | predictions, sys_score = model.predict(data, cfg.batch_size, cfg.gpus) 154 | for i, (score, sample) in enumerate(zip(predictions, data)): 155 | print("Segment {}\tscore: {:.4f}".format(i, score)) 156 | sample["COMET score"] = score 157 | if isinstance(model.loss, HeteroscedasticLoss): 158 | sample["Heteroscedastic score"] = hts[i] 159 | 160 | print("System score: {:.4f}".format(sys_score)) 161 | if isinstance(cfg.to_json, str): 162 | with open(cfg.to_json, "w") as 
outfile: 163 | json.dump(data, outfile, ensure_ascii=False, indent=4) 164 | print("Predictions saved in: {}.".format(cfg.to_json)) 165 | -------------------------------------------------------------------------------- /comet/cli/compare.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Command for comparing two MT systems. 16 | ====================================== 17 | 18 | optional arguments: 19 | -h, --help Show this help message and exit. 20 | -s SOURCES, --sources SOURCES 21 | (required, type: Path_fr) 22 | -x SYSTEM_X, --system_x SYSTEM_X 23 | (required, type: Path_fr) 24 | -y SYSTEM_Y, --system_y SYSTEM_Y 25 | (required, type: Path_fr) 26 | -r REFERENCES, --references REFERENCES 27 | (type: Path_fr, default: null) 28 | --batch_size BATCH_SIZE 29 | (type: int, default: 8) 30 | --gpus GPUS (type: int, default: 1) 31 | --num_splits NUM_SPLITS 32 | Number of random partitions used in Bootstrap resampling. (type: int, default: 300) 33 | --sample_ratio SAMPLE_RATIO 34 | Percentage of the testset to use in each bootstrap resampling partition. (type: float, default: 0.4) 35 | --to_json TO_JSON Exports results to a json file. (type: Union[bool, str], default: False) 36 | --model {emnlp20-comet-rank,wmt20-comet-da,wmt20-comet-qe-da,wmt21-cometinho-da} 37 | COMET model to be used. (type: Union[str, Path_fr], default: wmt20-comet-da) 38 | --seed_everything SEED_EVERYTHING 39 | Prediction seed. 
(type: int, default: 12) 40 | 41 | """ 42 | 43 | import json 44 | from typing import Union 45 | 46 | import numpy as np 47 | from comet.download_utils import download_model 48 | from comet.models import available_metrics, load_from_checkpoint 49 | from jsonargparse import ArgumentParser 50 | from jsonargparse.typing import Path_fr 51 | from pytorch_lightning import seed_everything 52 | 53 | 54 | def compare_command() -> None: 55 | parser = ArgumentParser(description="Command for comparing two MT systems.") 56 | parser.add_argument("-s", "--sources", type=Path_fr, required=True) 57 | parser.add_argument("-x", "--system_x", type=Path_fr, required=True) 58 | parser.add_argument("-y", "--system_y", type=Path_fr, required=True) 59 | parser.add_argument("-r", "--references", type=Path_fr) 60 | parser.add_argument("--batch_size", type=int, default=8) 61 | parser.add_argument("--gpus", type=int, default=1) 62 | parser.add_argument( 63 | "--num_splits", 64 | type=int, 65 | default=300, 66 | help="Number of random partitions used in Bootstrap resampling.", 67 | ) 68 | parser.add_argument( 69 | "--sample_ratio", 70 | type=float, 71 | default=0.4, 72 | help="Percentage of the testset to use in each bootstrap resampling partition.", 73 | ) 74 | parser.add_argument( 75 | "--to_json", 76 | type=Union[bool, str], 77 | default=False, 78 | help="Exports results to a json file.", 79 | ) 80 | parser.add_argument( 81 | "--model", 82 | type=Union[str, Path_fr], 83 | required=False, 84 | default="wmt20-comet-da", 85 | choices=available_metrics.keys(), 86 | help="COMET model to be used.", 87 | ) 88 | parser.add_argument( 89 | "--seed_everything", 90 | help="Prediction seed.", 91 | type=int, 92 | default=12, 93 | ) 94 | cfg = parser.parse_args() 95 | seed_everything(cfg.seed_everything) 96 | 97 | if (cfg.references is None) and ("refless" not in cfg.model): 98 | parser.error("{} requires -r/--references.".format(cfg.model)) 99 | 100 | model_path = ( 101 | download_model(cfg.model) if cfg.model in available_metrics else cfg.model 102 | ) 103 | model = load_from_checkpoint(model_path) 104 | model.eval() 105 | 106 | with open(cfg.sources()) as fp: 107 | sources = [line.strip() for line in fp.readlines()] 108 | 109 | with open(cfg.system_x()) as fp: 110 | system_x = [line.strip() for line in fp.readlines()] 111 | 112 | with open(cfg.system_y()) as fp: 113 | system_y = [line.strip() for line in fp.readlines()] 114 | 115 | if "refless" in cfg.model: 116 | system_x = {"src": sources, "mt": system_x} 117 | system_y = {"src": sources, "mt": system_y} 118 | else: 119 | with open(cfg.references()) as fp: 120 | references = [line.strip() for line in fp.readlines()] 121 | system_x = {"src": sources, "mt": system_x, "ref": references} 122 | system_y = {"src": sources, "mt": system_y, "ref": references} 123 | 124 | system_x = [dict(zip(system_x, t)) for t in zip(*system_x.values())] 125 | system_y = [dict(zip(system_y, t)) for t in zip(*system_y.values())] 126 | 127 | x_seg_scores, _ = model.predict(system_x, cfg.batch_size, cfg.gpus) 128 | y_seg_scores, _ = model.predict(system_y, cfg.batch_size, cfg.gpus) 129 | 130 | data = [] 131 | for i, (x_score, y_score) in enumerate(zip(x_seg_scores, y_seg_scores)): 132 | print( 133 | "Segment {}\tsystem_x score: {:.4f}\tsystem_y score: {:.4f}".format( 134 | i, x_score, y_score 135 | ) 136 | ) 137 | data.append( 138 | { 139 | "src": system_x[0]["src"], 140 | "system_x": {"mt": system_x[0]["mt"], "score": x_score}, 141 | "system_y": {"mt": system_y[0]["mt"], "score": y_score}, 142 | "ref": 
system_y[0]["ref"], 143 | } 144 | ) 145 | 146 | n = len(sources) 147 | ids = list(range(n)) 148 | sample_size = max(int(n * cfg.sample_ratio), 1) 149 | 150 | x_sys_scores, y_sys_scores = [], [] 151 | win_count = [0, 0, 0] 152 | for _ in range(cfg.num_splits): 153 | # Subsample the gold and system outputs (with replacement) 154 | subsample_ids = np.random.choice(ids, size=sample_size, replace=True) 155 | subsample_x_scr = sum([x_seg_scores[i] for i in subsample_ids]) / sample_size 156 | subsample_y_scr = sum([y_seg_scores[i] for i in subsample_ids]) / sample_size 157 | 158 | if subsample_x_scr > subsample_y_scr: 159 | win_count[0] += 1 160 | elif subsample_y_scr > subsample_x_scr: 161 | win_count[1] += 1 162 | else: 163 | win_count[2] += 1 164 | 165 | x_sys_scores.append(subsample_x_scr) 166 | y_sys_scores.append(subsample_y_scr) 167 | 168 | data.insert( 169 | 0, 170 | { 171 | "x-mean": np.mean(np.array(x_sys_scores)), 172 | "y-mean": np.mean(np.array(y_sys_scores)), 173 | "ties (%)": win_count[2] / sum(win_count), 174 | "x_wins (%)": win_count[0] / sum(win_count), 175 | "y_wins (%)": win_count[1] / sum(win_count), 176 | }, 177 | ) 178 | for k, v in data[0].items(): 179 | print("{}:\t{:.4f}".format(k, v)) 180 | 181 | if isinstance(cfg.to_json, str): 182 | with open(cfg.to_json, "w") as outfile: 183 | json.dump(data, outfile, ensure_ascii=False, indent=4) 184 | print("Predictions saved in: {}.".format(cfg.to_json)) 185 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uncertainties_MT_eval 2 | Code and data for the paper: [Disentangling Uncertainty in Machine Translation Evaluation](https://arxiv.org/pdf/2204.06546.pdf) 3 | 4 | 5 | ## Quick Installation 6 | 7 | We are using Python 3.8. 8 | 9 | Detailed usage examples and instructions for the COMET metric can be found in the [Full Documentation](https://unbabel.github.io/COMET/html/index.html). 10 | 11 | To develop locally: 12 | ```bash 13 | git clone https://github.com/deep-spin/uncertainties_MT_eval.git 14 | pip install -r requirements.txt 15 | pip install -e . 16 | ``` 17 | 18 | ## TL;DR 19 | 20 | This repository is an extension of the original COMET metric, providing different options to enhance it with uncertainty predictors. It includes code for **heteroscedastic losses (HTS and KL)**, as well as the option to use the same architecture for **direct uncertainty prediction (DUP)**. 21 | We used COMET v1.0 as the basis for this extension. 22 | 23 | ## Important commands 24 | 25 | - To train a new metric use: 26 | 27 | ```bash 28 | comet-train --cfg configs/models/model_config.yaml 29 | ``` 30 | 31 | - To score a triplet of a source file, translation file and reference file with a trained metric and obtain predictions use: 32 | 33 | ```bash 34 | comet-score --model <path/to/checkpoint> -s src.txt -t mt.txt -r ref.txt 35 | ``` 36 | 37 | ## Description of configurations and command options 38 | 39 | ### COMET configuration 40 | To train a plain COMET model on your data without using the uncertainty-related code, use the configuration file: 41 | [uncertainties_MT_eval/configs/models/regression_metric_comet_plain.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_plain.yaml) 42 | 43 | This model will use an MSE loss and will produce a single output for each segment, corresponding to the predicted **quality score**.
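For orientation, the sketch below shows roughly what such a training configuration could contain. It follows the `class_path`/`init_args` layout used by the other YAML files in `configs/` and the `regression_metric` section expected by `comet-train`; the field names mirror the `RegressionMetric` constructor, but the concrete values are illustrative assumptions rather than the contents of `regression_metric_comet_plain.yaml` (a full config additionally carries sections such as `trainer`, `early_stopping` and `model_checkpoint`):

```yaml
regression_metric:
  class_path: comet.models.RegressionMetric
  init_args:
    pretrained_model: xlm-roberta-base
    pool: avg
    layer: mix
    loss: mse           # plain COMET: a single quality score per segment
    feature_size: 0     # no extra input features, so no bottleneck layer is used
    batch_size: 4
    train_data: path/to/train.csv        # csv with columns: src, mt, ref, score
    validation_data: path/to/dev.csv
```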
44 | 45 | ### COMET with MC Dropout configuration 46 | 47 | Once you have trained a COMET model (of any kind) you can apply MC Dropout during inference by passing the ```--mc_dropout``` option to ```comet-score``` with the desired number *N* of stochastic forward runs, as follows: 48 | 49 | ```bash 50 | comet-score --model <path/to/checkpoint> -s src.txt -t mt.txt -r ref.txt --mc_dropout N 51 | ``` 52 | 53 | 54 | This option can be used with models trained using any of the three loss options: hts, kl, mse. 55 | 56 | If the option is used with a model trained with the MSE loss, then the model will produce a second output for each segment, corresponding to the variance/uncertainty of that segment's quality score prediction. 57 | 58 | If the option is used in combination with either of the two heteroscedastic losses, the model will generate four outputs for each segment in total: 59 | 1. The predicted quality score 60 | 2. The estimated variance for the quality score 61 | 3. The predicted aleatoric uncertainty 62 | 4. The estimated variance of the aleatoric uncertainty 63 | 64 | Then the total uncertainty value for the segment can be calculated as indicated in Eq. 4 in the paper. 65 | 66 | 67 | >Note that we used N=100 for all experiments in the paper. To reproduce other related works, this number might have to be reduced. 68 | 69 | ### COMET with aleatoric uncertainty predictions 70 | 71 | There are two options to train COMET with aleatoric uncertainty prediction. 72 | 73 | 1. Heteroscedastic uncertainty (HTS), which can be used with any labelled dataset. It only requires setting the loss to "hts" in the configuration file; see [uncertainties_MT_eval/configs/models/regression_metric_comet_heteroscedastic.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_heteroscedastic.yaml) as an example. 74 | 75 | 2. KL-divergence minimisation based uncertainty (KL). Training a model with the KL setup requires access to labelled data with multiple annotators per segment, providing either (a) multiple human judgements per segment, or (b) the standard deviation of the annotator scores per segment. See file [uncertainties_MT_eval/data/mqm2020/mqm.train.z_score.csv](uncertainties_MT_eval/data/mqm2020/mqm.train.z_score.csv) as an example. 76 | To train a model on this data, set the loss to "kl" in the configuration file. See [uncertainties_MT_eval/configs/models/regression_metric_comet_kl.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_kl.yaml) 77 | 78 | 79 | ### COMET-based direct uncertainty prediction (COMET-DUP) 80 | 81 | It is possible to train a COMET model to predict the uncertainty of a given prediction (casting uncertainty as the error/distance to the human judgement), henceforth referred to as COMET-DUP. 82 | 83 | #### **Training Setup:** 84 | 85 | To train a COMET-DUP model it is necessary to: 86 | 87 | - Have access to human judgements $q^*$ on a train dataset $\mathcal{D}$ 88 | - Run an MT Evaluation or MT Quality Estimation model to obtain quality predictions $\hat{q}$ over $\mathcal{D}$ 89 | - Calculate $\epsilon = |q^*-\hat{q}|$ for $\mathcal{D}$ 90 | - Use $\epsilon$ as the target for the uncertainty-predicting COMET, instead of the human quality judgements, which are the default target 91 | 92 | Provide the training data in a csv file using a column **f1** that holds the values for the predicted quality scores $\hat{q}$ and a column **score** that contains the computed $\epsilon$ (target) for each instance; a minimal preparation sketch is shown below.
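As a concrete illustration of the recipe above, here is a minimal, hypothetical sketch that builds such a csv with pandas. The input file name and the column names holding the human judgements and the base model's predictions are assumptions made for the example; only the resulting `src`, `mt`, `ref`, `f1` and `score` columns are what the COMET-DUP data loader expects:

```python
import pandas as pd

# Assumed input: a csv with src/mt/ref plus a human judgement ("human_score", q*)
# and a base metric's quality prediction ("predicted_score", q_hat) per segment.
df = pd.read_csv("annotated_with_predictions.csv")

# The base prediction q_hat is exposed to the model as feature "f1" ...
df["f1"] = df["predicted_score"]

# ... while epsilon = |q* - q_hat| becomes the regression target ("score").
df["score"] = (df["human_score"] - df["predicted_score"]).abs()

df[["src", "mt", "ref", "f1", "score"]].to_csv("dup_train.csv", index=False)
```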
93 | 94 | #### **Losses** 95 | 96 | Once $\epsilon$ has been calculated, three different losses can be used for COMET-DUP training: 97 | 98 | 1. Typical MSE loss: $\mathcal{L}^\mathrm{E}_{\mathrm{ABS}}(\hat{\epsilon}; \epsilon^*) = (\epsilon^* - \hat{\epsilon})^2$\ 99 | Specify loss: "mse" in the yaml configuration file to use it 100 | 2. MSE loss with squared values: 101 | $\mathcal{L}^\mathrm{E}_{\mathrm{SQ}}(\hat{\epsilon}; \epsilon^*) = ((\epsilon^*)^2 - \hat{\epsilon}^2)^2$ 102 | Specify loss: "squared" in the yaml configuration file to use it 103 | 3. Heteroscedastic approximation loss: 104 | $\mathcal{L}^\mathrm{E}_{\mathrm{HTS}}(\hat{\epsilon}; \epsilon^*) = \frac{(\epsilon^*)^2}{2 \hat{\epsilon}^2} + \frac{1}{2}\log(\hat{\epsilon}^2)$ 105 | Specify loss: "hts_approx" in the yaml configuration file to use it 106 | 107 | #### **Bottleneck**: 108 | Unlike COMET, COMET-DUP uses a bottleneck layer to compress the sentence representations before the initial quality predictions $\hat{q}$ are concatenated as additional input features. You need to specify the size of the bottleneck layer in the configuration file. 109 | Recommended value: 256 110 | 111 | 112 | #### **Full Train Configuration**: 113 | For an example of a configuration file to train COMET-DUP with $\mathcal{L}^\mathrm{E}_{\mathrm{HTS}}$ see the file [uncertainties_MT_eval/configs/models/regression_metric_comet_dup.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_dup.yaml) 114 | 115 | 116 | #### **Inference** 117 | 118 | For inference with COMET-DUP use the same inference command (`comet-score`) used for the other COMET models, providing a trained COMET-DUP model in the `--model` option; an example invocation is sketched below. Remember that the output in this case will be uncertainty scores instead of quality scores. 119 | 120 |
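A hypothetical invocation could look as follows; the checkpoint path and `features.txt` are placeholders. The `-f/--features` option of `comet-score` expects one line per segment with comma-separated feature values, which for a COMET-DUP model trained with a single extra feature is just the base metric's quality prediction **f1**:

```bash
comet-score --model path/to/dup_checkpoint.ckpt \
    -s src.txt -t mt.txt -r ref.txt \
    -f features.txt
```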
121 |
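As a rough reference for the three objectives listed under **Losses** above, the following PyTorch sketch spells the formulas out directly. It is only an illustration of the equations; the implementations actually used during training live in `comet/modules/losses.py` and may differ in details such as reduction or numerical safeguards:

```python
import torch

def dup_mse(eps_hat: torch.Tensor, eps_star: torch.Tensor) -> torch.Tensor:
    # loss: "mse"  ->  (eps* - eps_hat)^2
    return ((eps_star - eps_hat) ** 2).mean()

def dup_squared(eps_hat: torch.Tensor, eps_star: torch.Tensor) -> torch.Tensor:
    # loss: "squared"  ->  ((eps*)^2 - eps_hat^2)^2
    return ((eps_star ** 2 - eps_hat ** 2) ** 2).mean()

def dup_hts_approx(eps_hat: torch.Tensor, eps_star: torch.Tensor) -> torch.Tensor:
    # loss: "hts_approx"  ->  (eps*)^2 / (2 * eps_hat^2) + 0.5 * log(eps_hat^2)
    return (eps_star ** 2 / (2 * eps_hat ** 2) + 0.5 * torch.log(eps_hat ** 2)).mean()
```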
122 | 123 | *** 124 | 125 | ## Related Publications 126 | 127 | - [Better Uncertainty Quantification for Machine Translation Evaluation](https://arxiv.org/pdf/2204.06546.pdf) 128 | 129 | - [Uncertainty-Aware Machine Translation Evaluation](https://aclanthology.org/2021.findings-emnlp.330/) 130 | 131 | - [IST-Unbabel 2021 Submission for the Quality Estimation Shared Task](https://aclanthology.org/2021.wmt-1.102/) 132 | 133 | - [Are References Really Needed? Unbabel-IST 2021 Submission for the Metrics Shared Task](http://statmt.org/wmt21/pdf/2021.wmt-1.111.pdf) 134 | 135 | - [COMET - Deploying a New State-of-the-art MT Evaluation Metric in Production](https://www.aclweb.org/anthology/2020.amta-user.4) 136 | 137 | - [Unbabel's Participation in the WMT20 Metrics Shared Task](https://aclanthology.org/2020.wmt-1.101/) 138 | 139 | - [COMET: A Neural Framework for MT Evaluation](https://www.aclweb.org/anthology/2020.emnlp-main.213) 140 | -------------------------------------------------------------------------------- /comet/download_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | import os 18 | import subprocess 19 | import urllib.request 20 | import zipfile 21 | from typing import List 22 | from urllib.parse import urlparse 23 | 24 | from tqdm import tqdm 25 | 26 | from comet.models import available_metrics 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def get_cache_folder(): 32 | if "HOME" in os.environ: 33 | cache_directory = os.environ["HOME"] + "/.cache/torch/unbabel_comet/" 34 | if not os.path.exists(cache_directory): 35 | os.makedirs(cache_directory) 36 | return cache_directory 37 | else: 38 | raise Exception("HOME environment variable is not defined.") 39 | 40 | 41 | def _reporthook(t): 42 | """``reporthook`` to use with ``urllib.request`` that prints the 43 | process of the download. 44 | 45 | Uses ``tqdm`` for progress bar. 46 | 47 | **Reference:** 48 | https://github.com/tqdm/tqdm 49 | 50 | """ 51 | last_b = [0] 52 | 53 | def inner(b: int = 1, bsize: int = 1, tsize: int = None): 54 | """ 55 | :param b: Number of blocks just transferred [default: 1]. 56 | :param bsize: Size of each block (in tqdm units) [default: 1]. 57 | :param tsize: Total size (in tqdm units). 58 | If [default: None] remains unchanged. 59 | """ 60 | if tsize is not None: 61 | t.total = tsize 62 | t.update((b - last_b[0]) * bsize) 63 | last_b[0] = b 64 | 65 | return inner 66 | 67 | 68 | def _maybe_extract(compressed_filename: str, directory: str, extension: str = None): 69 | """Extract a compressed file to ``directory``. 70 | 71 | :param compressed_filename: Compressed file. 72 | :param directory: Extract to directory. 73 | :param extension: Extension of the file; Otherwise, attempts to 74 | extract extension from the filename. 
75 | """ 76 | logger.info("Extracting {}".format(compressed_filename)) 77 | 78 | if extension is None: 79 | basename = os.path.basename(compressed_filename) 80 | extension = basename.split(".", 1)[1] 81 | 82 | if "zip" in extension: 83 | with zipfile.ZipFile(compressed_filename, "r") as zip_: 84 | zip_.extractall(directory) 85 | 86 | elif "tar.gz" in extension or "tgz" in extension: 87 | # `tar` is much faster than python's `tarfile` implementation 88 | with open(os.devnull, "w") as devnull: 89 | subprocess.call( 90 | ["tar", "-C", directory, "-zxvf", compressed_filename], stdout=devnull 91 | ) 92 | 93 | elif "tar" in extension: 94 | with open(os.devnull, "w") as devnull: 95 | subprocess.call( 96 | ["tar", "-C", directory, "-xvf", compressed_filename], stdout=devnull 97 | ) 98 | 99 | logger.info("Extracted {}".format(compressed_filename)) 100 | 101 | 102 | def _get_filename_from_url(url): 103 | """Return a filename from a URL 104 | 105 | Args: 106 | url (str): URL to extract filename from 107 | 108 | Returns: 109 | (str): Filename in URL 110 | """ 111 | parse = urlparse(url) 112 | return os.path.basename(parse.path) 113 | 114 | 115 | def _check_download(*filepaths): 116 | """Check if the downloaded files are found. 117 | 118 | Args: 119 | filepaths (list of str): Check if these filepaths exist 120 | 121 | Returns: 122 | (bool): Returns True if all filepaths exist 123 | """ 124 | return all([os.path.isfile(filepath) for filepath in filepaths]) 125 | 126 | 127 | def download_file_maybe_extract( 128 | url: str, 129 | directory: str, 130 | filename: str = None, 131 | extension: str = None, 132 | check_files: List[str] = [], 133 | ): 134 | """Download the file at ``url`` to ``directory``. 135 | Extract to ``directory`` if tar or zip. 136 | 137 | :param url: Url of file (str or Path). 138 | :param directory: Directory to download to. 139 | :param filename: Name of the file to download; Otherwise, a filename is extracted 140 | from the url. 141 | :param extension: Extension of the file; Otherwise, attempts to extract extension 142 | from the filename. 143 | :param check_files: Check if these files exist, ensuring the download 144 | succeeded. If these files exist before the download, the download is skipped. 145 | 146 | :return: Filename of download file. 147 | """ 148 | if filename is None: 149 | filename = _get_filename_from_url(url) 150 | 151 | directory = str(directory) 152 | filepath = os.path.join(directory, filename) 153 | check_files = [os.path.join(directory, str(f)) for f in check_files] 154 | 155 | if len(check_files) > 0 and _check_download(*check_files): 156 | return filepath 157 | 158 | if not os.path.isdir(directory): 159 | os.makedirs(directory) 160 | 161 | logger.info("Downloading {}".format(filename)) 162 | 163 | # Download 164 | with tqdm(unit="B", unit_scale=True, miniters=1, desc=filename) as t: 165 | urllib.request.urlretrieve(url, filename=filepath, reporthook=_reporthook(t)) 166 | 167 | _maybe_extract( 168 | compressed_filename=filepath, directory=directory, extension=extension 169 | ) 170 | 171 | if not _check_download(*check_files): 172 | raise ValueError("[DOWNLOAD FAILED] `*check_files` not found") 173 | 174 | return filepath 175 | 176 | 177 | def download_model(model: str, saving_directory: str = None) -> str: 178 | """ 179 | Function that loads pretrained models from AWS. 180 | 181 | :param model: Name of the model to be loaded. 182 | :param saving_directory: RELATIVE path to the saving folder (must end with /). 183 | 184 | Return: 185 | - Path to model checkpoint. 
186 | """ 187 | 188 | if saving_directory is None: 189 | saving_directory = get_cache_folder() 190 | 191 | if not saving_directory.endswith("/"): 192 | saving_directory += "/" 193 | 194 | if not os.path.exists(saving_directory): 195 | os.makedirs(saving_directory) 196 | 197 | if os.path.isdir(saving_directory + model): 198 | logger.info(f"{model} is already in cache.") 199 | if not model.endswith("/"): 200 | model += "/" 201 | 202 | elif model not in available_metrics.keys(): 203 | raise Exception( 204 | f"{model} is not in the `availale_metrics` or is a valid checkpoint folder." 205 | ) 206 | 207 | elif available_metrics[model].startswith("https://"): 208 | download_file_maybe_extract( 209 | available_metrics[model], directory=saving_directory 210 | ) 211 | 212 | else: 213 | raise Exception("Invalid model name!") 214 | 215 | # CLEAN Cache 216 | if os.path.exists(saving_directory + model + ".zip"): 217 | os.remove(saving_directory + model + ".zip") 218 | if os.path.exists(saving_directory + model + ".tar.gz"): 219 | os.remove(saving_directory + model + ".tar.gz") 220 | if os.path.exists(saving_directory + model + ".tar"): 221 | os.remove(saving_directory + model + ".tar") 222 | 223 | checkpoints_folder = saving_directory + model + "/checkpoints" 224 | checkpoints = [ 225 | file for file in os.listdir(checkpoints_folder) if file.endswith(".ckpt") 226 | ] 227 | checkpoint = checkpoints[-1] 228 | checkpoint_path = checkpoints_folder + "/" + checkpoint 229 | return checkpoint_path 230 | -------------------------------------------------------------------------------- /comet/models/regression/referenceless.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | ReferencelessRegression 18 | ======================== 19 | Referenceless Regression Metric that learns to predict a quality assessment by 20 | looking at source and translation. 21 | """ 22 | from typing import Dict, List, Optional, Tuple, Union 23 | 24 | import pandas as pd 25 | import torch 26 | from comet.models.regression.regression_metric import RegressionMetric 27 | from comet.modules import FeedForward, Bottleneck 28 | 29 | 30 | class ReferencelessRegression(RegressionMetric): 31 | """ReferencelessRegression: 32 | 33 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 34 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 35 | :param keep_encoder_frozen: freezes entire encoder. 36 | :param optimizer: Optimizer used during training. 37 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 38 | :param learning_rate: Learning rate used to fine-tune the top layers. 39 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 40 | :param encoder_model: Encoder model to be used. 
41 | :param pretrained_model: Pretrained model from Hugging Face. 42 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 43 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 44 | :param dropout: Dropout used in the top-layers. 45 | :param batch_size: Batch size used during training. 46 | :param train_data: Path to a csv file containing the training data. 47 | :param validation_data: Path to a csv file containing the validation data. 48 | :param hidden_sizes: Hidden sizes for the Feed Forward regression. 49 | :param activations: Feed Forward activation function. 50 | :param load_weights_from_checkpoint: Path to a checkpoint file. 51 | """ 52 | 53 | def __init__( 54 | self, 55 | nr_frozen_epochs: Union[float, int] = 0.3, 56 | keep_embeddings_frozen: bool = False, 57 | keep_encoder_frozen: bool = False, 58 | optimizer: str = "AdamW", 59 | encoder_learning_rate: float = 1e-05, 60 | learning_rate: float = 3e-05, 61 | layerwise_decay: float = 0.95, 62 | encoder_model: str = "XLM-RoBERTa", 63 | pretrained_model: str = "xlm-roberta-base", 64 | pool: str = "avg", 65 | layer: Union[str, int] = "mix", 66 | dropout: float = 0.1, 67 | batch_size: int = 4, 68 | train_data: Optional[str] = None, 69 | validation_data: Optional[str] = None, 70 | hidden_sizes_bottleneck: List[int] = [1536, 256], 71 | hidden_sizes: List[int] = [768], 72 | activations: str = "Tanh", 73 | final_activation: Optional[str] = None, 74 | load_weights_from_checkpoint: Optional[str] = None, 75 | loss: Optional[str]="mse", 76 | data_portion: Optional[float]=1.0, 77 | feature_size: Optional[int] = 0 78 | ) -> None: 79 | super(RegressionMetric, self).__init__( 80 | nr_frozen_epochs, 81 | keep_embeddings_frozen, 82 | keep_encoder_frozen, 83 | optimizer, 84 | encoder_learning_rate, 85 | learning_rate, 86 | layerwise_decay, 87 | encoder_model, 88 | pretrained_model, 89 | pool, 90 | layer, 91 | dropout, 92 | batch_size, 93 | train_data, 94 | validation_data, 95 | load_weights_from_checkpoint, 96 | "referenceless_regression_metric", 97 | ) 98 | self.save_hyperparameters() 99 | 100 | self.bottleneck = Bottleneck( 101 | in_dim=self.encoder.output_units * 4 , 102 | hidden_sizes = self.hparams.hidden_sizes_bottleneck, 103 | activations=self.hparams.activations, 104 | dropout=self.hparams.dropout, 105 | ) 106 | 107 | self.estimator = FeedForward( 108 | in_dim=self.hparams.hidden_sizes_bottleneck[-1] + self.hparams.feature_size, 109 | out_dim = 2 if self.hparams.loss in ["var", "hts"] else 1, 110 | hidden_sizes=self.hparams.hidden_sizes, 111 | activations=self.hparams.activations, 112 | dropout=self.hparams.dropout, 113 | final_activation=self.hparams.final_activation, 114 | ) 115 | 116 | def prepare_sample( 117 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = False, data_portion: float = 1.0, 118 | ) -> Union[ 119 | Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor] 120 | ]: 121 | """ 122 | Function that prepares a sample to input the model. 123 | 124 | :param sample: list of dictionaries. 125 | :param inference: If set to true prepares only the model inputs. 126 | 127 | :returns: Tuple with 2 dictionaries (model inputs and targets). 128 | If `inference=True` returns only the model inputs. 
129 | """ 130 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 131 | src_inputs = self.encoder.prepare_sample(sample["src"]) 132 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 133 | 134 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 135 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 136 | if self.hparams.feature_size>0: 137 | feats = [] 138 | for feat in sample: 139 | if feat.startswith("f"): 140 | feats.append(sample[feat]) 141 | #print(len(feats)) 142 | feature_tensor = torch.as_tensor(feats, dtype=torch.float) 143 | #print(feature_tensor.shape) 144 | #print('------------------') 145 | features = {"custom_features": feature_tensor.T} 146 | 147 | 148 | else: 149 | features = {"custom_features": torch.Tensor()} 150 | 151 | inputs = {**src_inputs, **mt_inputs, **features} 152 | 153 | if inference: 154 | return inputs 155 | 156 | targets = {"score": torch.tensor(sample["score"], dtype=torch.float)} 157 | return inputs, targets 158 | 159 | def forward( 160 | self, 161 | src_input_ids: torch.tensor, 162 | src_attention_mask: torch.tensor, 163 | mt_input_ids: torch.tensor, 164 | mt_attention_mask: torch.tensor, 165 | custom_features: torch.tensor, 166 | **kwargs 167 | ) -> Dict[str, torch.Tensor]: 168 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 169 | mt_sentemb = self.get_sentence_embedding(mt_input_ids, mt_attention_mask) 170 | 171 | diff_src = torch.abs(mt_sentemb - src_sentemb) 172 | prod_src = mt_sentemb * src_sentemb 173 | 174 | embedded_sequences = torch.cat( 175 | (mt_sentemb, src_sentemb, prod_src, diff_src), dim=1 176 | ) 177 | 178 | if self.hparams.feature_size>0: 179 | #custom_features=torch.unsqueeze(custom_features, 1) 180 | #print(embedded_sequences.shape) 181 | #print(f1.shape) 182 | 183 | bottleneck = self.bottleneck(embedded_sequences) 184 | #print(bottleneck.shape) 185 | #print(custom_features.shape) 186 | seq_feats = torch.cat((bottleneck,custom_features),dim=1) 187 | #print(seq_feats.shape) 188 | 189 | score = self.estimator(seq_feats) 190 | else: 191 | bottleneck = self.bottleneck(embedded_sequences) 192 | score = self.estimator(bottleneck) 193 | if self.hparams.loss in ["var","hts"]: 194 | return {"score": score[:,0], "variance": score[:,1]} 195 | 196 | return {"score": score} 197 | 198 | def read_csv(self, path: str) -> List[dict]: 199 | """Reads a comma separated value file. 200 | 201 | :param path: path to a csv file. 202 | 203 | :return: List of records as dictionaries 204 | """ 205 | feats=[] 206 | df = pd.read_csv(path) 207 | flen = self.hparams.feature_size 208 | columns = ["src", "mt", "score"] 209 | for i in range(flen): 210 | fstring='f'+str(i+1) 211 | print('feature added: '+str(fstring)) 212 | columns.append(fstring) 213 | feats.append(fstring) 214 | df = df[columns] 215 | 216 | df["src"] = df["src"].astype(str) 217 | df["mt"] = df["mt"].astype(str) 218 | df["score"] = df["score"].astype(float) 219 | for feat in feats: 220 | df[feat] = df[feat].astype(float) 221 | return df.to_dict("records") 222 | -------------------------------------------------------------------------------- /comet/models/regression/regression_metric_hybrid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | RegressionMetric 18 | ======================== 19 | Regression Metric that learns to predict a quality assessment by looking 20 | at source, translation and reference. 21 | """ 22 | from typing import Dict, List, Optional, Tuple, Union 23 | 24 | import pandas as pd 25 | import torch 26 | from comet.models.base import CometModel 27 | from comet.modules import FeedForward, Bottleneck 28 | from torchmetrics import MetricCollection, PearsonCorrcoef, SpearmanCorrcoef 29 | from transformers import AdamW 30 | import random 31 | 32 | class RegressionMetric(CometModel): 33 | """RegressionMetric: 34 | 35 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 36 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 37 | :param keep_encoder_frozen: freezes entire encoder. 38 | :param optimizer: Optimizer used during training. 39 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 40 | :param learning_rate: Learning rate used to fine-tune the top layers. 41 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 42 | :param encoder_model: Encoder model to be used. 43 | :param pretrained_model: Pretrained model from Hugging Face. 44 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 45 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 46 | :param dropout: Dropout used in the top-layers. 47 | :param batch_size: Batch size used during training. 48 | :param train_data: Path to a csv file containing the training data. 49 | :param validation_data: Path to a csv file containing the validation data. 50 | :param hidden_sizes: Hidden sizes for the Feed Forward regression. 51 | :param activations: Feed Forward activation function. 52 | :param load_weights_from_checkpoint: Path to a checkpoint file. 
53 | """ 54 | 55 | def __init__( 56 | self, 57 | nr_frozen_epochs: Union[float, int] = 0.3, 58 | keep_embeddings_frozen: bool = False, 59 | keep_encoder_frozen: bool = False, 60 | optimizer: str = "AdamW", 61 | encoder_learning_rate: float = 1e-05, 62 | learning_rate: float = 3e-05, 63 | layerwise_decay: float = 0.95, 64 | encoder_model: str = "XLM-RoBERTa", 65 | pretrained_model: str = "xlm-roberta-base", 66 | pool: str = "avg", 67 | layer: Union[str, int] = "mix", 68 | dropout: float = 0.1, 69 | batch_size: int = 4, 70 | train_data: Optional[str] = None, 71 | validation_data: Optional[str] = None, 72 | hidden_sizes_bottleneck: List[int] = [2304, 256], 73 | hidden_sizes: List[int] = [768], 74 | activations: str = "Tanh", 75 | final_activation: Optional[str] = None, 76 | load_weights_from_checkpoint: Optional[str] = None, 77 | loss: Optional[str]="mse", 78 | data_portion: Optional[float] = 1.0, 79 | feature_size: Optional[int] = 0 80 | ) -> None: 81 | super().__init__( 82 | nr_frozen_epochs, 83 | keep_embeddings_frozen, 84 | keep_encoder_frozen, 85 | optimizer, 86 | encoder_learning_rate, 87 | learning_rate, 88 | layerwise_decay, 89 | encoder_model, 90 | pretrained_model, 91 | pool, 92 | layer, 93 | dropout, 94 | batch_size, 95 | train_data, 96 | validation_data, 97 | load_weights_from_checkpoint, 98 | "regression_metric", 99 | ) 100 | self.save_hyperparameters() 101 | self.bottleneck = Bottleneck( 102 | in_dim=self.encoder.output_units * 6 , 103 | hidden_sizes = [self.hparams.hidden_sizes[0],self.hparams.hidden_sizes_bottleneck[-1]], 104 | activations=self.hparams.activations, 105 | dropout=self.hparams.dropout, 106 | ) 107 | 108 | self.estimator = FeedForward( 109 | in_dim=self.hparams.hidden_sizes_bottleneck[-1] + self.hparams.feature_size, 110 | out_dim = 2 if self.hparams.loss in ["var", "hts"] else 1, 111 | hidden_sizes=[self.hparams.hidden_sizes[-1]], 112 | activations=self.hparams.activations, 113 | dropout=self.hparams.dropout, 114 | final_activation=self.hparams.final_activation, 115 | ) 116 | 117 | def init_metrics(self): 118 | metrics = MetricCollection( 119 | {"spearman": SpearmanCorrcoef(), "pearson": PearsonCorrcoef()} 120 | ) 121 | self.train_metrics = metrics.clone(prefix="train_") 122 | self.val_metrics = metrics.clone(prefix="val_") 123 | 124 | def configure_optimizers( 125 | self, 126 | ) -> Tuple[List[torch.optim.Optimizer], List[torch.optim.lr_scheduler.LambdaLR]]: 127 | """Sets the optimizers to be used during training.""" 128 | layer_parameters = self.encoder.layerwise_lr( 129 | self.hparams.encoder_learning_rate, self.hparams.layerwise_decay 130 | ) 131 | top_layers_parameters = [ 132 | {"params": self.estimator.parameters() , "lr": self.hparams.learning_rate} 133 | ] 134 | bott_layers_parameters = [ 135 | {"params": self.bottleneck.parameters() , "lr": self.hparams.learning_rate} 136 | ] 137 | if self.layerwise_attention: 138 | layerwise_attn_params = [ 139 | { 140 | "params": self.layerwise_attention.parameters(), 141 | "lr": self.hparams.learning_rate, 142 | } 143 | ] 144 | params = layer_parameters + top_layers_parameters + bott_layers_parameters + layerwise_attn_params 145 | else: 146 | params = layer_parameters + top_layers_parameters + bott_layers_parameters 147 | 148 | optimizer = AdamW( 149 | params, 150 | lr=self.hparams.learning_rate, 151 | correct_bias=True, 152 | ) 153 | # scheduler = self._build_scheduler(optimizer) 154 | return [optimizer], [] 155 | 156 | def prepare_sample( 157 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = 
False, data_portion: float = 1.0, 158 | ) -> Union[ 159 | Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor] 160 | ]: 161 | """ 162 | Function that prepares a sample to input the model. 163 | 164 | :param sample: list of dictionaries. 165 | :param inference: If set to true prepares only the model inputs. 166 | 167 | :returns: Tuple with 2 dictionaries (model inputs and targets). 168 | If `inference=True` returns only the model inputs. 169 | """ 170 | #print(sample[0]) 171 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 172 | src_inputs = self.encoder.prepare_sample(sample["src"]) 173 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 174 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 175 | 176 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 177 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 178 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 179 | if self.hparams.feature_size>0: 180 | feats = [] 181 | for feat in sample: 182 | if feat.startswith("f"): 183 | feats.append(sample[feat]) 184 | #print(len(feats)) 185 | feature_tensor = torch.as_tensor(feats, dtype=torch.float) 186 | #print(feature_tensor.shape) 187 | #print('------------------') 188 | features = {"custom_features": feature_tensor.T} 189 | 190 | 191 | else: 192 | features = {"custom_features": torch.Tensor()} 193 | 194 | inputs = {**src_inputs, **mt_inputs, **ref_inputs, **features} 195 | if inference: 196 | return inputs 197 | 198 | targets = {"score": torch.tensor(sample["score"], dtype=torch.float)} 199 | return inputs, targets 200 | 201 | def forward( 202 | self, 203 | src_input_ids: torch.tensor, 204 | src_attention_mask: torch.tensor, 205 | mt_input_ids: torch.tensor, 206 | mt_attention_mask: torch.tensor, 207 | ref_input_ids: torch.tensor, 208 | ref_attention_mask: torch.tensor, 209 | custom_features: torch.tensor, 210 | **kwargs 211 | ) -> Dict[str, torch.Tensor]: 212 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 213 | mt_sentemb = self.get_sentence_embedding(mt_input_ids, mt_attention_mask) 214 | ref_sentemb = self.get_sentence_embedding(ref_input_ids, ref_attention_mask) 215 | 216 | diff_ref = torch.abs(mt_sentemb - ref_sentemb) 217 | diff_src = torch.abs(mt_sentemb - src_sentemb) 218 | 219 | prod_ref = mt_sentemb * ref_sentemb 220 | prod_src = mt_sentemb * src_sentemb 221 | 222 | embedded_sequences = torch.cat( 223 | (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src), 224 | dim=1, 225 | ) 226 | if self.hparams.feature_size>0: 227 | #custom_features=torch.unsqueeze(custom_features, 1) 228 | #print(embedded_sequences.shape) 229 | #print(f1.shape) 230 | 231 | bottleneck = self.bottleneck(embedded_sequences) 232 | #print(bottleneck.shape) 233 | #print(custom_features.shape) 234 | seq_feats = torch.cat((bottleneck,custom_features),dim=1) 235 | #print(seq_feats.shape) 236 | 237 | score = self.estimator(seq_feats) 238 | else: 239 | bottleneck = self.bottleneck(embedded_sequences) 240 | score = self.estimator(bottleneck) 241 | if self.hparams.loss in ["var","hts"]: 242 | return {"score": score[:,0], "variance": score[:,1]} 243 | 244 | return {"score": score} 245 | 246 | def read_csv(self, path: str) -> List[dict]: 247 | """Reads a comma separated value file. 248 | 249 | :param path: path to a csv file. 
250 | 251 | :return: List of records as dictionaries 252 | """ 253 | feats=[] 254 | df = pd.read_csv(path) 255 | flen = self.hparams.feature_size 256 | columns = ["src", "mt", "ref", "score"] 257 | for i in range(flen): 258 | fstring='f'+str(i+1) 259 | print('feature added: '+str(fstring)) 260 | columns.append(fstring) 261 | feats.append(fstring) 262 | df = df[columns] 263 | df["src"] = df["src"].astype(str) 264 | df["mt"] = df["mt"].astype(str) 265 | df["ref"] = df["ref"].astype(str) 266 | df["score"] = df["score"].astype(float) 267 | for feat in feats: 268 | df[feat] = df[feat].astype(float) 269 | return df.to_dict("records") 270 | -------------------------------------------------------------------------------- /comet/models/regression/regression_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | RegressionMetric 18 | ======================== 19 | Regression Metric that learns to predict a quality assessment by looking 20 | at source, translation and reference. 21 | """ 22 | from typing import Dict, List, Optional, Tuple, Union 23 | 24 | import pandas as pd 25 | import torch 26 | from comet.models.base import CometModel 27 | from comet.modules import FeedForward, Bottleneck 28 | from torchmetrics import MetricCollection, PearsonCorrcoef, SpearmanCorrcoef 29 | from transformers import AdamW 30 | import random 31 | 32 | class RegressionMetric(CometModel): 33 | """RegressionMetric: 34 | 35 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 36 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 37 | :param keep_encoder_frozen: freezes entire encoder. 38 | :param optimizer: Optimizer used during training. 39 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 40 | :param learning_rate: Learning rate used to fine-tune the top layers. 41 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 42 | :param encoder_model: Encoder model to be used. 43 | :param pretrained_model: Pretrained model from Hugging Face. 44 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 45 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 46 | :param dropout: Dropout used in the top-layers. 47 | :param batch_size: Batch size used during training. 48 | :param train_data: Path to a csv file containing the training data. 49 | :param validation_data: Path to a csv file containing the validation data. 50 | :param hidden_sizes: Hidden sizes for the Feed Forward regression. 51 | :param activations: Feed Forward activation function. 52 | :param load_weights_from_checkpoint: Path to a checkpoint file. 
53 | """ 54 | 55 | def __init__( 56 | self, 57 | nr_frozen_epochs: Union[float, int] = 0.3, 58 | keep_embeddings_frozen: bool = False, 59 | keep_encoder_frozen: bool = False, 60 | optimizer: str = "AdamW", 61 | encoder_learning_rate: float = 1e-05, 62 | learning_rate: float = 3e-05, 63 | layerwise_decay: float = 0.95, 64 | encoder_model: str = "XLM-RoBERTa", 65 | pretrained_model: str = "xlm-roberta-base", 66 | pool: str = "avg", 67 | layer: Union[str, int] = "mix", 68 | dropout: float = 0.1, 69 | batch_size: int = 4, 70 | train_data: Optional[str] = None, 71 | validation_data: Optional[str] = None, 72 | hidden_sizes_bottleneck: List[int] = [2304, 256], 73 | hidden_sizes: List[int] = [768], 74 | activations: str = "Tanh", 75 | final_activation: Optional[str] = None, 76 | load_weights_from_checkpoint: Optional[str] = None, 77 | loss: Optional[str]="mse", 78 | feature_size: Optional[int] = 0 79 | ) -> None: 80 | super().__init__( 81 | nr_frozen_epochs, 82 | keep_embeddings_frozen, 83 | keep_encoder_frozen, 84 | optimizer, 85 | encoder_learning_rate, 86 | learning_rate, 87 | layerwise_decay, 88 | encoder_model, 89 | pretrained_model, 90 | pool, 91 | layer, 92 | dropout, 93 | batch_size, 94 | train_data, 95 | validation_data, 96 | load_weights_from_checkpoint, 97 | "regression_metric", 98 | ) 99 | self.save_hyperparameters() 100 | 101 | if self.hparams.feature_size > 0: 102 | self.bottleneck = Bottleneck( 103 | in_dim=self.encoder.output_units * 6, 104 | hidden_sizes = [self.hparams.hidden_sizes[0],self.hparams.hidden_sizes_bottleneck[-1]], 105 | activations=self.hparams.activations, 106 | dropout=self.hparams.dropout, 107 | ) 108 | 109 | self.estimator = FeedForward( 110 | in_dim=self.hparams.hidden_sizes_bottleneck[-1] + self.hparams.feature_size, 111 | out_dim = 2 if self.hparams.loss in ["kl", "hts"] else 1, 112 | hidden_sizes=[self.hparams.hidden_sizes[-1]], 113 | activations=self.hparams.activations, 114 | dropout=self.hparams.dropout, 115 | final_activation=self.hparams.final_activation, 116 | ) 117 | else: 118 | self.estimator = FeedForward( 119 | in_dim=self.encoder.output_units * 6, 120 | hidden_sizes=self.hparams.hidden_sizes, 121 | activations=self.hparams.activations, 122 | dropout=self.hparams.dropout, 123 | final_activation=self.hparams.final_activation, 124 | out_dim=2 if self.hparams.loss in ["kl", "hts"] else 1, 125 | ) 126 | 127 | def init_metrics(self): 128 | metrics = MetricCollection( 129 | {"spearman": SpearmanCorrcoef(), "pearson": PearsonCorrcoef()} 130 | ) 131 | self.train_metrics = metrics.clone(prefix="train_") 132 | self.val_metrics = metrics.clone(prefix="val_") 133 | 134 | def configure_optimizers( 135 | self, 136 | ) -> Tuple[List[torch.optim.Optimizer], List[torch.optim.lr_scheduler.LambdaLR]]: 137 | """Sets the optimizers to be used during training.""" 138 | layer_parameters = self.encoder.layerwise_lr( 139 | self.hparams.encoder_learning_rate, self.hparams.layerwise_decay 140 | ) 141 | top_layers_parameters = [ 142 | {"params": self.estimator.parameters() , "lr": self.hparams.learning_rate} 143 | ] 144 | if self.hparams.feature_size>0: 145 | bott_layers_parameters = [ 146 | {"params": self.bottleneck.parameters() , "lr": self.hparams.learning_rate} 147 | ] 148 | if self.layerwise_attention: 149 | layerwise_attn_params = [ 150 | { 151 | "params": self.layerwise_attention.parameters(), 152 | "lr": self.hparams.learning_rate, 153 | } 154 | ] 155 | params = layer_parameters + top_layers_parameters + layerwise_attn_params 156 | else: 157 | params = 
layer_parameters + top_layers_parameters 158 | if self.hparams.feature_size > 0: 159 | params += bott_layers_parameters 160 | 161 | optimizer = AdamW( 162 | params, 163 | lr=self.hparams.learning_rate, 164 | correct_bias=True, 165 | ) 166 | # scheduler = self._build_scheduler(optimizer) 167 | return [optimizer], [] 168 | 169 | def prepare_sample( 170 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = False, 171 | ) -> Union[ 172 | Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor] 173 | ]: 174 | """ 175 | Function that prepares a sample to input the model. 176 | 177 | :param sample: list of dictionaries. 178 | :param inference: If set to true prepares only the model inputs. 179 | 180 | :returns: Tuple with 2 dictionaries (model inputs and targets). 181 | If `inference=True` returns only the model inputs. 182 | """ 183 | #print(sample[0]) 184 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 185 | src_inputs = self.encoder.prepare_sample(sample["src"]) 186 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 187 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 188 | 189 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 190 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 191 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 192 | if self.hparams.feature_size>0: 193 | feats = [] 194 | for feat in sample: 195 | if feat.startswith("f"): 196 | feats.append(sample[feat]) 197 | feature_tensor = torch.as_tensor(feats, dtype=torch.float) 198 | features = {"custom_features": feature_tensor.T} 199 | else: 200 | features = {"custom_features": torch.Tensor()} 201 | 202 | inputs = {**src_inputs, **mt_inputs, **ref_inputs, **features} 203 | if inference: 204 | return inputs 205 | 206 | targets = {"score": torch.tensor(sample["score"], dtype=torch.float)} 207 | return inputs, targets 208 | 209 | def forward( 210 | self, 211 | src_input_ids: torch.tensor, 212 | src_attention_mask: torch.tensor, 213 | mt_input_ids: torch.tensor, 214 | mt_attention_mask: torch.tensor, 215 | ref_input_ids: torch.tensor, 216 | ref_attention_mask: torch.tensor, 217 | custom_features: torch.tensor, 218 | **kwargs 219 | ) -> Dict[str, torch.Tensor]: 220 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 221 | mt_sentemb = self.get_sentence_embedding(mt_input_ids, mt_attention_mask) 222 | ref_sentemb = self.get_sentence_embedding(ref_input_ids, ref_attention_mask) 223 | 224 | diff_ref = torch.abs(mt_sentemb - ref_sentemb) 225 | diff_src = torch.abs(mt_sentemb - src_sentemb) 226 | 227 | prod_ref = mt_sentemb * ref_sentemb 228 | prod_src = mt_sentemb * src_sentemb 229 | 230 | embedded_sequences = torch.cat( 231 | (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src), 232 | dim=1, 233 | ) 234 | if self.hparams.feature_size>0: 235 | bottleneck = self.bottleneck(embedded_sequences) 236 | seq_feats = torch.cat((bottleneck,custom_features),dim=1) 237 | score = self.estimator(seq_feats) 238 | else: 239 | score = self.estimator(embedded_sequences) 240 | 241 | if self.hparams.loss in ["kl","hts"]: 242 | return {"score": score[:,0], "variance": score[:,1]} 243 | return {"score": score} 244 | 245 | def read_csv(self, path: str) -> List[dict]: 246 | """Reads a comma separated value file. 247 | :param path: path to a csv file. 
248 | :return: List of records as dictionaries 249 | """ 250 | feats=[] 251 | df = pd.read_csv(path) 252 | flen = self.hparams.feature_size 253 | columns = ["src", "mt", "ref", "score"] 254 | if self.hparams.loss == 'kl': 255 | columns.append("std") 256 | for i in range(flen): 257 | fstring='f'+str(i+1) 258 | print('feature added: '+str(fstring)) 259 | columns.append(fstring) 260 | feats.append(fstring) 261 | df = df[columns] 262 | df["src"] = df["src"].astype(str) 263 | df["mt"] = df["mt"].astype(str) 264 | df["ref"] = df["ref"].astype(str) 265 | df["score"] = df["score"].astype(float) 266 | for feat in feats: 267 | df[feat] = df[feat].astype(float) 268 | return df.to_dict("records") 269 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /comet/models/ranking/ranking_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | Ranking Metric 18 | ==================== 19 | Translation Ranking metric was introduced by 20 | [Rei, et al. 2020](https://aclanthology.org/2020.emnlp-main.213/) 21 | and it is trained on top of Direct Assessment Relative Ranks (DARR) to encode 22 | `good` translations closer to the anchors (source & reference) than `worse` 23 | translations. 24 | """ 25 | from typing import Dict, List, Optional, Tuple, Union 26 | 27 | import pandas as pd 28 | import torch 29 | import torch.nn.functional as F 30 | from comet.models.base import CometModel 31 | from transformers import AdamW 32 | 33 | from .wmt_kendall import WMTKendall 34 | 35 | 36 | class RankingMetric(CometModel): 37 | """RankingMetric 38 | 39 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 40 | :param keep_embeddings_frozen: Keeps the encoder frozen during training. 41 | :param optimizer: Optimizer used during training. 42 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 43 | :param learning_rate: Learning rate used to fine-tune the top layers. 44 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 45 | :param encoder_model: Encoder model to be used. 46 | :param pretrained_model: Pretrained model from Hugging Face. 47 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 48 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 49 | :param dropout: Dropout used in the top-layers. 50 | :param batch_size: Batch size used during training. 51 | :param train_data: Path to a csv file containing the training data. 52 | :param validation_data: Path to a csv file containing the validation data. 53 | :param load_weights_from_checkpoint: Path to a checkpoint file. 
54 | """ 55 | 56 | def __init__( 57 | self, 58 | nr_frozen_epochs: Union[float, int] = 0.05, 59 | keep_embeddings_frozen: bool = False, 60 | optimizer: str = "AdamW", 61 | encoder_learning_rate: float = 1e-05, 62 | learning_rate: float = 3e-05, 63 | layerwise_decay: float = 0.95, 64 | encoder_model: str = "XLM-RoBERTa", 65 | pretrained_model: str = "xlm-roberta-base", 66 | pool: str = "avg", 67 | layer: Union[str, int] = "mix", 68 | dropout: float = 0.1, 69 | batch_size: int = 8, 70 | train_data: Optional[str] = None, 71 | validation_data: Optional[str] = None, 72 | load_weights_from_checkpoint: Optional[str] = None, 73 | ) -> None: 74 | super().__init__( 75 | nr_frozen_epochs, 76 | keep_embeddings_frozen, 77 | optimizer, 78 | encoder_learning_rate, 79 | learning_rate, 80 | layerwise_decay, 81 | encoder_model, 82 | pretrained_model, 83 | pool, 84 | layer, 85 | dropout, 86 | batch_size, 87 | train_data, 88 | validation_data, 89 | load_weights_from_checkpoint, 90 | "ranking_metric", 91 | ) 92 | self.save_hyperparameters() 93 | 94 | def init_metrics(self): 95 | self.train_metrics = WMTKendall(prefix="train") 96 | self.val_metrics = WMTKendall(prefix="val") 97 | 98 | @property 99 | def loss(self): 100 | return torch.nn.TripletMarginLoss(margin=1.0, p=2) 101 | 102 | def configure_optimizers( 103 | self, 104 | ) -> Tuple[List[torch.optim.Optimizer], List[torch.optim.lr_scheduler.LambdaLR]]: 105 | """Sets the optimizers to be used during training.""" 106 | layer_parameters = self.encoder.layerwise_lr( 107 | self.hparams.encoder_learning_rate, self.hparams.layerwise_decay 108 | ) 109 | if self.layerwise_attention: 110 | layerwise_attn_params = [ 111 | { 112 | "params": self.layerwise_attention.parameters(), 113 | "lr": self.hparams.learning_rate, 114 | } 115 | ] 116 | params = layer_parameters + layerwise_attn_params 117 | else: 118 | params = layer_parameters 119 | 120 | optimizer = AdamW( 121 | params, 122 | lr=self.hparams.learning_rate, 123 | correct_bias=True, 124 | ) 125 | # scheduler = self._build_scheduler(optimizer) 126 | return [optimizer], [] 127 | 128 | def prepare_sample( 129 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = False 130 | ) -> Dict[str, torch.Tensor]: 131 | 132 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 133 | 134 | if inference: 135 | src_inputs = self.encoder.prepare_sample(sample["src"]) 136 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 137 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 138 | 139 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 140 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 141 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 142 | 143 | return {**ref_inputs, **src_inputs, **mt_inputs} 144 | 145 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 146 | src_inputs = self.encoder.prepare_sample(sample["src"]) 147 | pos_inputs = self.encoder.prepare_sample(sample["pos"]) 148 | neg_inputs = self.encoder.prepare_sample(sample["neg"]) 149 | 150 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 151 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 152 | pos_inputs = {"pos_" + k: v for k, v in pos_inputs.items()} 153 | neg_inputs = {"neg_" + k: v for k, v in neg_inputs.items()} 154 | 155 | return {**ref_inputs, **src_inputs, **pos_inputs, **neg_inputs} 156 | 157 | def forward( 158 | self, 159 | src_input_ids: torch.tensor, 160 | ref_input_ids: torch.tensor, 161 | pos_input_ids: torch.tensor, 162 | neg_input_ids: torch.tensor, 163 
| src_attention_mask: torch.tensor, 164 | ref_attention_mask: torch.tensor, 165 | pos_attention_mask: torch.tensor, 166 | neg_attention_mask: torch.tensor, 167 | **kwargs 168 | ) -> Dict[str, torch.Tensor]: 169 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 170 | ref_sentemb = self.get_sentence_embedding(ref_input_ids, ref_attention_mask) 171 | pos_sentemb = self.get_sentence_embedding(pos_input_ids, pos_attention_mask) 172 | neg_sentemb = self.get_sentence_embedding(neg_input_ids, neg_attention_mask) 173 | 174 | loss = self.loss(src_sentemb, pos_sentemb, neg_sentemb) + self.loss( 175 | ref_sentemb, pos_sentemb, neg_sentemb 176 | ) 177 | 178 | distance_src_pos = F.pairwise_distance(pos_sentemb, src_sentemb) 179 | distance_ref_pos = F.pairwise_distance(pos_sentemb, ref_sentemb) 180 | # Harmonic mean between anchors and the positive example 181 | distance_pos = (2 * distance_src_pos * distance_ref_pos) / ( 182 | distance_src_pos + distance_ref_pos 183 | ) 184 | 185 | # Harmonic mean between anchors and the negative example 186 | distance_src_neg = F.pairwise_distance(neg_sentemb, src_sentemb) 187 | distance_ref_neg = F.pairwise_distance(neg_sentemb, ref_sentemb) 188 | distance_neg = (2 * distance_src_neg * distance_ref_neg) / ( 189 | distance_src_neg + distance_ref_neg 190 | ) 191 | 192 | return { 193 | "loss": loss, 194 | "distance_pos": distance_pos, 195 | "distance_neg": distance_neg, 196 | } 197 | 198 | def read_csv(self, path: str, regression: bool = False) -> List[dict]: 199 | """Reads a comma separated value file. 200 | 201 | :param path: path to a csv file. 202 | 203 | :return: List of records as dictionaries 204 | """ 205 | df = pd.read_csv(path) 206 | 207 | if regression: 208 | df = df[["src", "mt", "ref", "score"]] 209 | df["src"] = df["src"].astype(str) 210 | df["mt"] = df["mt"].astype(str) 211 | df["ref"] = df["ref"].astype(str) 212 | df["score"] = df["score"].astype(float) 213 | return df.to_dict("records") 214 | 215 | df = df[["src", "pos", "neg", "ref"]] 216 | df["src"] = df["src"].astype(str) 217 | df["pos"] = df["pos"].astype(str) 218 | df["neg"] = df["neg"].astype(str) 219 | df["ref"] = df["ref"].astype(str) 220 | return df.to_dict("records") 221 | 222 | def training_step( 223 | self, 224 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 225 | batch_nb: int, 226 | ) -> Dict[str, torch.Tensor]: 227 | """ 228 | Runs one training step. 229 | This usually consists in the forward function followed by the loss function. 230 | 231 | :param batch: The output of your prepare_sample function. 232 | :param batch_nb: Integer displaying which batch this is. 233 | 234 | :returns: dictionary containing the loss and the metrics to be added to the 235 | lightning logger. 236 | """ 237 | batch_prediction = self.forward(**batch) 238 | loss_value = batch_prediction["loss"] 239 | 240 | if ( 241 | self.nr_frozen_epochs < 1.0 242 | and self.nr_frozen_epochs > 0.0 243 | and batch_nb > self.epoch_total_steps * self.nr_frozen_epochs 244 | ): 245 | self.unfreeze_encoder() 246 | self._frozen = False 247 | 248 | self.log("train_loss", loss_value, on_step=True, on_epoch=True) 249 | return loss_value 250 | 251 | def validation_step( 252 | self, 253 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 254 | batch_nb: int, 255 | dataloader_idx: int, 256 | ) -> Dict[str, torch.Tensor]: 257 | """ 258 | Similar to the training step but with the model in eval mode. 259 | 260 | :param batch: The output of your prepare_sample function. 
261 | :param batch_nb: Integer displaying which batch this is. 262 | :param dataloader_idx: Integer displaying which dataloader this is. 263 | 264 | :returns: dictionary passed to the validation_end function. 265 | """ 266 | batch_prediction = self.forward(**batch) 267 | loss_value = batch_prediction["loss"] 268 | self.log("val_loss", loss_value, on_step=True, on_epoch=True) 269 | 270 | # TODO: REMOVE if condition after torchmetrics bug fix 271 | if dataloader_idx == 0: 272 | self.train_metrics.update( 273 | batch_prediction["distance_pos"], batch_prediction["distance_neg"] 274 | ) 275 | elif dataloader_idx == 1: 276 | self.val_metrics.update( 277 | batch_prediction["distance_pos"], batch_prediction["distance_neg"] 278 | ) 279 | 280 | def predict_step( 281 | self, 282 | batch: Dict[str, torch.Tensor], 283 | batch_idx: int, 284 | dataloader_idx: Optional[int], 285 | ) -> List[float]: 286 | src_sentemb = self.get_sentence_embedding( 287 | batch["src_input_ids"], batch["src_attention_mask"] 288 | ) 289 | ref_sentemb = self.get_sentence_embedding( 290 | batch["ref_input_ids"], batch["ref_attention_mask"] 291 | ) 292 | mt_sentemb = self.get_sentence_embedding( 293 | batch["mt_input_ids"], batch["mt_attention_mask"] 294 | ) 295 | 296 | src_distance = F.pairwise_distance(mt_sentemb, src_sentemb) 297 | ref_distance = F.pairwise_distance(mt_sentemb, ref_sentemb) 298 | 299 | distances = (2 * ref_distance * src_distance) / (ref_distance + src_distance) 300 | return torch.ones_like(distances) / (1 + distances) 301 | -------------------------------------------------------------------------------- /comet/models/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | CometModel 17 | ======================== 18 | Abstract Model class that implements some of the Pytorch Lightning logic. 19 | Extend this class to create new model and metrics within COMET. 20 | """ 21 | import abc 22 | import logging 23 | import multiprocessing 24 | import sys 25 | from os import path 26 | from typing import Dict, List, Optional, Tuple, Union 27 | import random 28 | import numpy as np 29 | import pytorch_lightning as ptl 30 | import torch 31 | from comet.encoders import str2encoder 32 | from comet.modules import LayerwiseAttention, HeteroscedasticLoss, HeteroscedasticLossv2, HeteroApproxLoss, HeteroApproxLossv2, SquaredLoss 33 | from torch import nn 34 | from torch.utils.data import DataLoader, RandomSampler, Subset 35 | from pytorch_lightning.loggers import TensorBoardLogger 36 | from tqdm import tqdm 37 | 38 | from .pooling_utils import average_pooling, max_pooling 39 | 40 | logger = logging.getLogger(__name__) 41 | 42 | 43 | class CometModel(ptl.LightningModule, metaclass=abc.ABCMeta): 44 | """CometModel: 45 | 46 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 
47 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 48 | :param keep_encoder_frozen: freezes entire encoder. 49 | :param optimizer: Optimizer used during training. 50 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 51 | :param learning_rate: Learning rate used to fine-tune the top layers. 52 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 53 | :param encoder_model: Encoder model to be used. 54 | :param pretrained_model: Pretrained model from Hugging Face. 55 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 56 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 57 | :param dropout: Dropout used in the top-layers. 58 | :param batch_size: Batch size used during training. 59 | :param train_data: Path to a csv file containing the training data. 60 | :param validation_data: Path to a csv file containing the validation data. 61 | :param load_weights_from_checkpoint: Path to a checkpoint file. 62 | :param class_identifier: subclass identifier. 63 | """ 64 | 65 | def __init__( 66 | self, 67 | nr_frozen_epochs: Union[float, int] = 0.3, 68 | keep_embeddings_frozen: bool = False, 69 | keep_encoder_frozen: bool = False, 70 | optimizer: str = "AdamW", 71 | encoder_learning_rate: float = 1e-05, 72 | learning_rate: float = 3e-05, 73 | layerwise_decay: float = 0.95, 74 | encoder_model: str = "XLM-RoBERTa", 75 | pretrained_model: str = "xlm-roberta-large", 76 | pool: str = "avg", 77 | layer: Union[str, int] = "mix", 78 | dropout: float = 0.1, 79 | batch_size: int = 4, 80 | train_data: Optional[str] = None, 81 | validation_data: Optional[str] = None, 82 | load_weights_from_checkpoint: Optional[str] = None, 83 | class_identifier: Optional[str] = None, 84 | loss: Optional[str]="mse", 85 | data_portion: Optional[float] = 1.0, 86 | feature_size: Optional[int] = 0 87 | ) -> None: 88 | super().__init__() 89 | self.save_hyperparameters( 90 | ignore=["train_data", "validation_data", "load_weights_from_checkpoint"] 91 | ) 92 | self.encoder = str2encoder[self.hparams.encoder_model].from_pretrained( 93 | self.hparams.pretrained_model 94 | ) 95 | self.epoch_nr = 0 96 | if self.hparams.layer == "mix": 97 | self.layerwise_attention = LayerwiseAttention( 98 | num_layers=self.encoder.num_layers, 99 | dropout=self.hparams.dropout, 100 | layer_norm=True, 101 | ) 102 | else: 103 | self.layerwise_attention = None 104 | 105 | if self.hparams.nr_frozen_epochs > 0: 106 | self._frozen = True 107 | self.freeze_encoder() 108 | else: 109 | self._frozen = False 110 | if self.hparams.keep_encoder_frozen: 111 | self._frozen = True 112 | self.freeze_encoder() 113 | 114 | if self.hparams.keep_embeddings_frozen: 115 | self.encoder.freeze_embeddings() 116 | 117 | self.nr_frozen_epochs = self.hparams.nr_frozen_epochs 118 | 119 | if load_weights_from_checkpoint is not None: 120 | if path.exists(load_weights_from_checkpoint): 121 | self.load_weights(load_weights_from_checkpoint) 122 | else: 123 | logger.warning(f"Path {load_weights_from_checkpoint} does not exist!") 124 | 125 | self.mc_dropout = False # Flag used to control usage of MC Dropout 126 | 127 | def set_mc_dropout(self, value: bool): 128 | self.mc_dropout = value 129 | 130 | def load_weights(self, checkpoint: str) -> None: 131 | """Function that loads the weights from a given checkpoint file. 132 | Note: 133 | If the checkpoint model architecture is different then `self`, only 134 | the common parts will be loaded. 
135 | 136 | :param checkpoint: Path to the checkpoint containing the weights to be loaded. 137 | """ 138 | logger.info(f"Loading weights from {checkpoint}.") 139 | checkpoint = torch.load(checkpoint, map_location=lambda storage, loc: storage) 140 | pretrained_dict = checkpoint["state_dict"] 141 | model_dict = self.state_dict() 142 | # 1. filter out unnecessary keys 143 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 144 | # 2. overwrite entries in the existing state dict 145 | model_dict.update(pretrained_dict) 146 | # 3. load the new state dict 147 | self.load_state_dict(model_dict) 148 | 149 | @abc.abstractmethod 150 | def read_csv(self): 151 | pass 152 | 153 | @abc.abstractmethod 154 | def prepare_sample( 155 | self, sample: List[Dict[str, Union[str, float]]], *args, **kwargs 156 | ): 157 | pass 158 | 159 | @abc.abstractmethod 160 | def configure_optimizers(self): 161 | pass 162 | 163 | @abc.abstractmethod 164 | def init_metrics(self) -> None: 165 | pass 166 | 167 | @abc.abstractmethod 168 | def forward(self, *args, **kwargs) -> Dict[str, torch.Tensor]: 169 | pass 170 | 171 | def freeze_encoder(self) -> None: 172 | logger.info("Encoder model frozen.") 173 | self.encoder.freeze() 174 | 175 | @property 176 | def loss(self) -> None: 177 | if self.hparams.loss in ["var","hts"]: 178 | return HeteroscedasticLoss() 179 | elif self.hparams.loss in ["var2","hts2"]: 180 | return HeteroscedasticLossv2() 181 | elif self.hparams.loss in ["var_approx","hts_approx"]: 182 | return HeteroApproxLoss() 183 | elif self.hparams.loss in ["var_approx2","hts_approx2"]: 184 | return HeteroApproxLossv2() 185 | elif self.hparams.loss in ["squared"]: 186 | return SquaredLoss() 187 | return nn.MSELoss() 188 | 189 | def compute_loss( 190 | self, predictions: Dict[str, torch.Tensor], targets: Dict[str, torch.Tensor] 191 | ) -> torch.Tensor: 192 | if self.hparams.loss in ["var","hts"]: 193 | return self.loss(predictions["score"].view(-1), predictions["variance"].view(-1) , targets["score"]) 194 | 195 | return self.loss(predictions["score"].view(-1), targets["score"]) 196 | 197 | def unfreeze_encoder(self) -> None: 198 | if self._frozen: 199 | if self.trainer.is_global_zero: 200 | logger.info("Encoder model fine-tuning") 201 | 202 | self.encoder.unfreeze() 203 | self._frozen = False 204 | if self.hparams.keep_embeddings_frozen: 205 | self.encoder.freeze_embeddings() 206 | 207 | def on_train_epoch_end(self) -> None: 208 | """Hook used to unfreeze encoder during training.""" 209 | self.epoch_nr += 1 210 | if self.epoch_nr >= self.nr_frozen_epochs and self._frozen and not self.hparams.keep_encoder_frozen: 211 | self.unfreeze_encoder() 212 | self._frozen = False 213 | 214 | def get_sentence_embedding( 215 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor 216 | ) -> torch.Tensor: 217 | """Function that extracts sentence embeddings for 218 | a single sentence. 219 | 220 | :param tokens: sequences [batch_size x seq_len] 221 | :param lengths: lengths [batch_size] 222 | 223 | :return: torch.Tensor [batch_size x hidden_size] 224 | """ 225 | encoder_out = self.encoder(input_ids, attention_mask) 226 | if self.layerwise_attention: 227 | # HACK: LayerNorm is applied at the MiniBatch. This means that for big batch sizes the variance 228 | # and norm within the batch will create small differences in the final score 229 | # If we are predicting we split the data into equal size batches to minimize this variance. 
230 | if not self.training: 231 | n_splits = len(torch.split(encoder_out["all_layers"][-1], 8)) 232 | embeddings = [] 233 | for split in range(n_splits): 234 | all_layers = [] 235 | for layer in range(len(encoder_out["all_layers"])): 236 | layer_embs = torch.split(encoder_out["all_layers"][layer], 8) 237 | all_layers.append(layer_embs[split]) 238 | split_attn = torch.split(attention_mask, 8)[split] 239 | embeddings.append(self.layerwise_attention(all_layers, split_attn)) 240 | embeddings = torch.cat(embeddings, dim=0) 241 | else: 242 | embeddings = self.layerwise_attention( 243 | encoder_out["all_layers"], attention_mask 244 | ) 245 | 246 | elif self.hparams.layer >= 0 and self.hparams.layer < self.encoder.num_layers: 247 | embeddings = encoder_out["all_layers"][self.hparams.layer] 248 | 249 | else: 250 | raise Exception("Invalid model layer {}.".format(self.hparams.layer)) 251 | 252 | if self.hparams.pool == "default": 253 | sentemb = encoder_out["sentemb"] 254 | 255 | elif self.hparams.pool == "max": 256 | sentemb = max_pooling( 257 | input_ids, embeddings, self.encoder.tokenizer.pad_token_id 258 | ) 259 | 260 | elif self.hparams.pool == "avg": 261 | sentemb = average_pooling( 262 | input_ids, 263 | embeddings, 264 | attention_mask, 265 | self.encoder.tokenizer.pad_token_id, 266 | ) 267 | 268 | elif self.hparams.pool == "cls": 269 | sentemb = embeddings[:, 0, :] 270 | 271 | else: 272 | raise Exception("Invalid pooling technique.") 273 | 274 | return sentemb 275 | 276 | def training_step( 277 | self, 278 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 279 | batch_nb: int, 280 | ) -> torch.Tensor: 281 | """ 282 | Runs one training step and logs the training loss. 283 | 284 | :param batch: The output of your prepare_sample function. 285 | :param batch_nb: Integer displaying which batch this is. 286 | 287 | :returns: Loss value 288 | """ 289 | batch_input, batch_target = batch 290 | batch_prediction = self.forward(**batch_input) 291 | #if not self.lossalternate: 292 | loss_value = self.compute_loss(batch_prediction, batch_target) 293 | 294 | if ( 295 | self.nr_frozen_epochs < 1.0 296 | and self.nr_frozen_epochs > 0.0 297 | and batch_nb > self.epoch_total_steps * self.nr_frozen_epochs 298 | and not self.hparams.keep_encoder_frozen 299 | ): 300 | self.unfreeze_encoder() 301 | self._frozen = False 302 | 303 | self.log("train_loss", loss_value, on_step=True, on_epoch=True) 304 | return loss_value 305 | 306 | def validation_step( 307 | self, 308 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 309 | batch_nb: int, 310 | dataloader_idx: int, 311 | ) -> torch.Tensor: 312 | """ 313 | Runs one validation step and logs metrics. 314 | 315 | :param batch: The output of your prepare_sample function. 316 | :param batch_nb: Integer displaying which batch this is. 317 | :param dataloader_idx: Integer displaying which dataloader this is. 
318 | """ 319 | batch_input, batch_target = batch 320 | batch_prediction = self.forward(**batch_input) 321 | loss_value = self.compute_loss(batch_prediction, batch_target) 322 | 323 | self.log("val_loss", loss_value, on_step=True, on_epoch=True) 324 | 325 | # TODO: REMOVE if condition after torchmetrics bug fix 326 | if batch_prediction["score"].view(-1).size() != torch.Size([1]): 327 | if dataloader_idx == 0: 328 | self.train_metrics.update( 329 | batch_prediction["score"].view(-1), batch_target["score"] 330 | ) 331 | elif dataloader_idx == 1: 332 | self.val_metrics.update( 333 | batch_prediction["score"].view(-1), batch_target["score"] 334 | ) 335 | #print(loss_value) 336 | return loss_value 337 | 338 | def on_predict_start(self) -> None: 339 | """Called when predict begins.""" 340 | if self.mc_dropout: 341 | self.train() 342 | else: 343 | self.eval() 344 | 345 | def predict_step( 346 | self, 347 | batch: Dict[str, torch.Tensor], 348 | batch_idx: Optional[int] = None, 349 | dataloader_idx: Optional[int] = None, 350 | ) -> torch.Tensor: 351 | """ 352 | Runs one prediction step and returns the predicted values. 353 | 354 | :param batch: The output of your prepare_sample function. 355 | :param batch_nb: Integer displaying which batch this is. 356 | :param dataloader_idx: Integer displaying which dataloader this is. 357 | """ 358 | if self.mc_dropout: 359 | #print(self.loss) 360 | #print(isinstance(self.loss, HeteroscedasticLoss)) 361 | #mcd_outputs = torch.stack( 362 | # [self(**batch)["score"].view(-1) for _ in range(self.mc_dropout)] 363 | #) 364 | mcd_outputs = torch.empty(size=(self.mc_dropout, 2)) 365 | hts_outputs = torch.empty(size=(self.mc_dropout, 2)) 366 | 367 | # mcd_outputs = torch.empty(size=(self.mc_dropout, self.hparams.batch_size)) 368 | # hts_outputs = torch.empty(size=(self.mc_dropout, self.hparams.batch_size)) 369 | for i in range(self.mc_dropout): 370 | outputs = self(**batch) 371 | 372 | mcd_outputs[i,:] = outputs["score"].view(-1) 373 | if isinstance(self.loss, HeteroscedasticLoss): 374 | hts_outputs[i]=outputs["variance"].view(-1) 375 | 376 | mcd_mean = mcd_outputs.mean(dim=0) 377 | mcd_std = mcd_outputs.std(dim=0) 378 | #print(mcd_mean) 379 | if isinstance(self.loss, HeteroscedasticLoss): 380 | hts_mean = hts_outputs.mean(dim=0) 381 | hts_std = hts_outputs.std(dim=0) 382 | return mcd_mean, mcd_std, hts_mean, hts_std 383 | return mcd_mean, mcd_std 384 | 385 | output = self(**batch) 386 | if isinstance(self.loss, HeteroscedasticLoss): 387 | return output["score"].view(-1), output["variance"].view(-1) 388 | return output["score"].view(-1) 389 | 390 | def validation_epoch_end(self, outputs, *args, **kwargs) -> None: 391 | """ " Computes and logs metrics.""" 392 | #print(outputs) 393 | avg_loss = torch.stack([x[0] for x in outputs]).mean() 394 | self.logger.experiment.add_scalar('validation_loss',avg_loss, self.current_epoch) 395 | self.log_dict(self.train_metrics.compute(), prog_bar=True) 396 | self.log_dict(self.val_metrics.compute(), prog_bar=True) 397 | self.train_metrics.reset() 398 | self.val_metrics.reset() 399 | 400 | def setup(self, stage) -> None: 401 | """Data preparation function called before training by Lightning. 
402 | 403 | :param stage: either 'fit', 'validate', 'test', or 'predict' 404 | """ 405 | if stage in (None, "fit"): 406 | self.train_dataset = self.read_csv(self.hparams.train_data) 407 | if self.hparams.data_portion < 1.0: 408 | print(len(self.train_dataset)) 409 | length = len(self.train_dataset) 410 | data_size = int(self.hparams.data_portion*length) 411 | self.train_dataset = list(random.sample(self.train_dataset, data_size)) 412 | print(len(self.train_dataset)) 413 | self.validation_dataset = self.read_csv(self.hparams.validation_data) 414 | 415 | self.epoch_total_steps = len(self.train_dataset) // ( 416 | self.hparams.batch_size * max(1, self.trainer.num_gpus) 417 | ) 418 | self.total_steps = self.epoch_total_steps * float(self.trainer.max_epochs) 419 | 420 | # Always validate the model with 2k examples to control overfit. 421 | train_subset = np.random.choice(a=len(self.train_dataset), size=2000) 422 | self.train_subset = Subset(self.train_dataset, train_subset) 423 | self.init_metrics() 424 | 425 | def train_dataloader(self) -> DataLoader: 426 | """Function that loads the train set.""" 427 | return DataLoader( 428 | dataset=self.train_dataset, 429 | sampler=RandomSampler(self.train_dataset), 430 | batch_size=self.hparams.batch_size, 431 | collate_fn=lambda x: self.prepare_sample(x, inference=False, data_portion=self.hparams.data_portion), 432 | num_workers=multiprocessing.cpu_count(), 433 | ) 434 | 435 | def val_dataloader(self) -> DataLoader: 436 | """Function that loads the validation set.""" 437 | return [ 438 | DataLoader( 439 | dataset=self.train_subset, 440 | batch_size=self.hparams.batch_size, 441 | collate_fn=lambda x: self.prepare_sample(x, inference=False, data_portion=self.hparams.data_portion), 442 | num_workers=min(8, multiprocessing.cpu_count()), 443 | ), 444 | DataLoader( 445 | dataset=self.validation_dataset, 446 | batch_size=self.hparams.batch_size, 447 | collate_fn=self.prepare_sample, 448 | num_workers=min(8, multiprocessing.cpu_count()), 449 | ), 450 | ] 451 | 452 | def predict( 453 | self, 454 | samples: List[Dict[str, str]], 455 | batch_size: int = 8, 456 | gpus: int = 1, 457 | mc_dropout: Union[int, bool] = False, 458 | ) -> Union[Tuple[List[float], float], Tuple[List[float], List[float], float]]: 459 | """Function that receives a list of samples (dictionaries with translations, sources and/or references) 460 | and returns segment level scores and a system level score. If `mc_dropout` is set, it also returns for each 461 | segment score, a confidence value. 462 | 463 | :param samples: List with dictionaries with source, translations and/or references. 464 | :param batch_size: Batch size used during inference. 465 | :gpus: Number of GPUs to be used. 466 | 467 | :return: List with segment-level scores and a system-score or segment-level scores, segment-level 468 | confidence and a system-score. 
469 | """ 470 | 471 | class PredictProgressBar(ptl.callbacks.ProgressBar): 472 | """Default Lightning Progress bar writes to stdout, we replace stdout with stderr""" 473 | 474 | def init_predict_tqdm(self) -> tqdm: 475 | bar = tqdm( 476 | desc="Predicting", 477 | initial=self.train_batch_idx, 478 | position=(2 * self.process_position), 479 | disable=self.is_disabled, 480 | leave=True, 481 | dynamic_ncols=True, 482 | file=sys.stderr, 483 | smoothing=0, 484 | ) 485 | return bar 486 | 487 | # HACK: Workaround pytorch bug that prevents ParameterList to be used in DP 488 | # https://github.com/pytorch/pytorch/issues/36035 489 | if self.layerwise_attention is not None and gpus > 1: 490 | self.layerwise_attention.gamma_value = float( 491 | self.layerwise_attention.gamma[0] 492 | ) 493 | self.layerwise_attention.weights = [ 494 | float(parameter[0]) 495 | for parameter in self.layerwise_attention.scalar_parameters 496 | ] 497 | 498 | self.eval() 499 | dataloader = DataLoader( 500 | dataset=samples, 501 | batch_size=batch_size, 502 | collate_fn=lambda x: self.prepare_sample(x, inference=True), 503 | num_workers=multiprocessing.cpu_count(), 504 | ) 505 | 506 | prog_bar = PredictProgressBar() 507 | #tb_logger = TensorBoardLogger("tb_logs", name="DEUP_logger") 508 | trainer = ptl.Trainer( 509 | gpus=gpus, 510 | deterministic=True, 511 | logger=False, 512 | callbacks=[prog_bar], 513 | accelerator="dp" if gpus > 1 else None, 514 | ) 515 | 516 | if mc_dropout: 517 | self.set_mc_dropout(mc_dropout) 518 | predictions = trainer.predict( 519 | self, dataloaders=dataloader, return_predictions=True 520 | ) 521 | mean_scores = [out[0] for out in predictions] 522 | std_scores = [out[1] for out in predictions] 523 | mean_scores = torch.cat(mean_scores, dim=0).tolist() 524 | std_scores = torch.cat(std_scores, dim=0).tolist() 525 | 526 | if isinstance(self.loss, HeteroscedasticLoss): 527 | hts_scores = [out[2] for out in predictions] 528 | hts_std_scores = [out[3] for out in predictions] 529 | hts_scores = torch.cat(hts_scores, dim=0).tolist() 530 | hts_std_scores = torch.cat(hts_std_scores, dim=0).tolist() 531 | return mean_scores, std_scores, hts_scores, hts_std_scores, sum(mean_scores) / len(mean_scores) 532 | 533 | return mean_scores, std_scores, sum(mean_scores) / len(mean_scores) 534 | 535 | else: 536 | predictions = trainer.predict( 537 | self, dataloaders=dataloader, return_predictions=True 538 | ) 539 | 540 | if isinstance(self.loss, HeteroscedasticLoss): 541 | #print(predictions) 542 | mean_scores = [out[0] for out in predictions] 543 | 544 | hts_scores = [out[1] for out in predictions] 545 | #print(hts_scores) 546 | #print(len(predictions)) 547 | 548 | 549 | quality_predictions = torch.cat(mean_scores, dim=0).tolist() 550 | variance_predictions = torch.cat(hts_scores, dim=0).tolist() 551 | #print(variance_predictions) 552 | #print(len(variance_predictions)) 553 | 554 | return quality_predictions, variance_predictions, sum(quality_predictions) / len(quality_predictions) 555 | else: 556 | predictions = torch.cat(predictions, dim=0).tolist() 557 | return predictions, sum(predictions) / len(predictions) 558 | -------------------------------------------------------------------------------- /tests/data/test_ranking_data.csv: -------------------------------------------------------------------------------- 1 | src,ref,pos,neg 2 | "In terms of the attack, Christopheros is convinced that the unknown man must have confused him with someone else.","In terms of the attack, Christopheros is convinced that the unknown 
man must have confused him with someone else.","As for the attack, Christopheros is sure the unknown man must have mistaken him for someone else.","As for the attack, Christopheros is confident that an unknown man had to confuse him with someone." 3 | "I'm happy with how it's going so far.""","I'm happy with how it's going so far.""","I'm happy with the result so far.""","With the result I am satisfied. """ 4 | "Joining the trio in the field are former Masters winner Ian Woosnam and Senior major champions Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty.","Joining the trio in the field are former Masters winner Ian Woosnam and Senior major champions Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty.","For a trio of players on the field connects also a former winner of the Masters, Ian Woosnam and the winners of the senior tournament Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty..","The three players will also be joined on the field by former Masters winner Ian Woosnam and senior tournament winners Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty." 5 | "Orr and former Ryder Cup captain Sam Torrance are among six Scots in the field, the others being Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond.","Orr and former Ryder Cup captain Sam Torrance are among six Scots in the field, the others being Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond.","ORR and former Ryder Cup captain Sam Torrance are among six Scottish players on the pitch; the others are Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond.","ORR and former Ryder Cup captain Sam Torrance are among the six Scottish players on the field; Others are Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond." 6 | """You want to do well; but you do feel that extra pressure.","""You want to do well; but you do feel that extra pressure.","""You want to be successful and feel great pressure.","""You want to do well while feeling a lot of pressure." 7 | """You want to do well; but you do feel that extra pressure.","""You want to do well; but you do feel that extra pressure.","""You want to be able to thrive and you feel a lot of pressure.","""You want to do well while feeling a lot of pressure." 
8 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in Indonesia's land of gold,Poverty and death in indonesian country gold 9 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in the Indonesian country of gold,Poverty and death in indonesian country gold 10 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in indonesian country gold 11 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in the Indonesian country of gold,Poverty and death in indonesian country gold 12 | "Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.","Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.",The Foundation for Responsible Robotics (FRR) recently released a statement on the development of sex robots within the next five to 10 years.,Organization Foundation for Responsible Robotics (FRR) recently issued a statement about the development of sex robots within the next five to ten years. 13 | "Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.","Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.",The Foundation for Responsible Robotics (FRR) recently issued a statement on the development of sex robots within the next five to ten years.,Organization Foundation for Responsible Robotics (FRR) recently issued a statement about the development of sex robots within the next five to ten years. 14 | """This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and we will bring this change of green card issuance rules to residents of other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers." 15 | """This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers, and we will do this by changing the rules for issuing green cards to residents of other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers." 
16 | """This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, increase wages and will save billions of dollars for tax payers. You can achieve it by changing the rules on the issuing of green cards to residents of other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers." 17 | The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and are supported by the president.,New rules are proposed by the Republicans and the President.,The new rules propose the republicans and the president. 18 | The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and the president.,The new rules propose the republicans and the president. 19 | The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and the president.,The new rules propose the republicans and the president. 20 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The bill will be submitted to Congress soon.,The bill will soon be submitted to Congress. 21 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The bill will be submitted to Congress soon.,The bill will soon be submitted to Congress. 22 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will be submitted to Congress soon. 23 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The bill will be submitted to Congress soon.,The bill will soon be submitted to Congress. 24 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will soon be submitted to Congress. 25 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will soon be submitted to Congress. 26 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will soon be submitted to Congress. 
27 | "Freedom of speech, dissent and discourse lie at the very foundation of our nation.","Freedom of speech, dissent and discourse lie at the very foundation of our nation.","Freedom of speech, disagreement and intimidation lies in the very foundations of our nation.","Freedom of expression, dissent and the intimidation lies in the very foundation of our nation." 28 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader must accept it.,And the real leader has to accept that. 29 | And true leadership means accepting that.,And true leadership means accepting that.,And a real leader has to accept that.,And the real leader has to accept that. 30 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader has to accept it.,And the real leader has to accept that. 31 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader must accept it.,And a true leader must take. 32 | And true leadership means accepting that.,And true leadership means accepting that.,And a real leader has to accept that.,And a true leader must take. 33 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader has to accept it.,And a true leader must take. 34 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 35 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 36 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 37 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 38 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In the times of Washington, the low price of postage had virtually the same impact as the internet.","In the Times of Washington, the low postage price had practically the same impact as the Internet." 39 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In the days of Washington, the low cost of postage had virtually the same impact as the Internet.","In Washington, the low cost of postage had virtually the same impact as the Internet." 40 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington, the low cost of postage had virtually the same impact as the Internet.","In the Times of Washington, the low postage price had practically the same impact as the Internet." 
41 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In the days of Washington, the low cost of postage had virtually the same impact as the Internet.","In the Times of Washington, the low postage price had practically the same impact as the Internet." 42 | "The number of newspapers quadrupled between 1776 and 1800, and anonymous letter writers hammered his leadership.","The number of newspapers quadrupled between 1776 and 1800, and anonymous letter writers hammered his leadership.","Between 1776 and 1800, the number of daily newspapers quadrupled and his leader's position was questioned by many anonymous writers.","Between the years 1776 and 1800, the number of daily newspapers quadrupled and his position as leader Zpochybňovalo many anonymous writers." 43 | "So blocking people who come to the governor's page - which is a public forum, labeled as official and administered by staff members paid public tax dollars - is unnecessary and ultimately dangerous.","So blocking people who come to the governor's page - which is a public forum, labeled as official and administered by staff members paid public tax dollars - is unnecessary and ultimately dangerous.",Blocking people who visit the Guvernérovu page – which is a public forum designated as official and managed by its employees paid from taxpayers ' money – is useless and ultimately dangerous.,"Blocking people closing on the governor's page - which is a public forum, identified as official and managed by his staff paid from taxpayers ""money - is unnecessary and ultimately dangerous." 44 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview for The Washington Post, Hogan's spokeswoman Amelia Chasse defended the governor, saying that blocking the comments went from a position of modelling contributions.","In an interview with Washington Post Hogan's spokeswoman Amelia Chasse, the governor argued that comment blocking was from post-moderation posts." 45 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview for The Washington Post, Hogan's spokeswoman Amelia Chasse defended the governor, saying that blocking the comments went from a position of modelling contributions." 
46 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview for The Washington Post, Hogan's spokeswoman Amelia Chasse defended the governor, saying that blocking the comments went from a position of modelling contributions.","In an interview with the Washington Post Hoganova, the spokesman Amelia Chasseová defended the governor, saying that the blocking of comments took place from the post moderation." 47 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview with Washington Post Hogan's spokeswoman Amelia Chasse, the governor argued that comment blocking was from post-moderation posts." 48 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post Hoganova spokesperson Amelia Chasseová defended the governor, saying that the blocking of the comments was from a position of moderation of the contributions.","In an interview with Washington Post Hogan's spokeswoman Amelia Chasse, the governor argued that comment blocking was from post-moderation posts." 49 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview with the Washington Post Hoganova spokesperson Amelia Chasseová defended the governor, saying that the blocking of the comments was from a position of moderation of the contributions." 
50 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post Hoganova spokesperson Amelia Chasseová defended the governor, saying that the blocking of the comments was from a position of moderation of the contributions.","In an interview with the Washington Post Hoganova, the spokesman Amelia Chasseová defended the governor, saying that the blocking of comments took place from the post moderation." 51 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview with the Washington Post Hoganova, the spokesman Amelia Chasseová defended the governor, saying that the blocking of comments took place from the post moderation." 52 | The Post talked to some of the real people blocked by Hogan.,The Post talked to some of the real people blocked by Hogan.,The Washington Post interviewed some real people who had their posts deleted.,The Washington Post interviewed several real people to whom the posts were deleted. 53 | The Post talked to some of the real people blocked by Hogan.,The Post talked to some of the real people blocked by Hogan.,The Washington Post interviewed some real people who had their posts deleted.,The Washington Post interviewed several real people have been deleted posts. 54 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As everyone reported, their comments were polite and considerate, not profane.","As everyone said, their comments were polite and considerate, not blatant." 55 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As everyone reported, their comments were polite and considerate, not profane.","As they all said, their comments were polite and considerate, not blasphemous." 56 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As you all have said, their comments were polite and respectful, no profane.","As everyone said, their comments were polite and considerate, not blatant." 57 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As you all have said, their comments were polite and respectful, no profane.","As they all said, their comments were polite and considerate, not blasphemous." 
58 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","In his comment, the pastor cited the Bible, appealing to Hogan's Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 59 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","In his commentary, the pastor cited the Bible, appealing to Hogan's Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 60 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his commentary, appealing to Hogan's Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 61 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The Pastor quoted the Bible in his commentary, apelujíc the Hoganovu Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 62 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Bowie's attorney Lakshmi Sarma Ramani, Md. asked about hate crimes, however, her comment was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 63 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Lawyer Bowie Lakshmi Sarma Ramani, MD asked for hate crimes, but her comment was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 64 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Bowie Lakshmi's attorney, Sarma Ramani, Md., inquired about hate crimes, but her comment was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 65 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","The lawyer of Bowie Lakshmi Sarma Ramaniová, Md. Asked about hate crimes, but her commentary was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 
66 | "Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook will send more potentially fake articles to control third parties and display findings under original posts in an attempt to fight against the so-called ""fake news"", said the largest social network on Thursday.","Facebook will send more potentially fake articles to third-party control and display findings under the original posts in an effort to fight so-called ""fake news,"" said the largest social network on Thursday." 67 | "Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook will send a more potentially false articles for inspection to third parties and to display findings under the original posts in an attempt to fight against the so-called ""fake news"", said the largest social network on Thursday.","Facebook will send more potentially fake articles to third-party control and display findings under the original posts in an effort to fight so-called ""fake news,"" said the largest social network on Thursday." 68 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe everything will be all right.,I believe that all will be well. 69 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe everything will be all right.,I believe everything will be fine. 70 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe all will be well.,I believe that all will be well. 71 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe all will be well.,I believe everything will be fine. 72 | "We have a plan in place.""","We have a plan in place.""","It's all planned. ""","Everything is scheduled. """ 73 | "Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is beautiful, I will miss, says goalie Sedláček and looks for engagement.","Riga is beautiful, I will miss him, says the goalie Sedlacek and looking for engagement." 74 | "Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is beautiful, I will miss, says goalkeeper Sedláček and looking for engagement.","Riga is beautiful, I will miss him, says the goalie Sedlacek and looking for engagement." 
75 | The World Health Organization considers the preparation to be slightly toxic.,The World Health Organization considers the preparation to be slightly toxic.,The world health organization considers the preparation as moderately toxic.,World Health Organization WHO considers the preparation to be slightly toxic.
76 | The World Health Organization considers the preparation to be slightly toxic.,The World Health Organization considers the preparation to be slightly toxic.,The world health organization considers the preparation as moderately toxic.,The WHO considers the preparation to be slightly toxic.
77 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
78 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
79 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
80 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
81 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether you get an engagement at the start of a new year.,It depends on whether he gets an engagement at the start of the new year.
82 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether he gets an engagement before the start of the new year.,It depends on whether he gets an engagement at the start of the new year.
83 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether the new year starts gaining engagement.,It depends on whether he gets an engagement at the start of the new year.
84 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether you get an engagement at the start of a new year.,It depends on whether the start of the new year gets engagement.
85 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether he gets an engagement before the start of the new year.,It depends on whether the start of the new year gets engagement.
86 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether the new year starts gaining engagement.,It depends on whether the start of the new year gets engagement.
87 | I was learning.,I was learning.,I was learning.,I was taught to.
88 | I was learning.,I was learning.,I learned.,I was taught to.
89 | I was learning.,I was learning.,I was learning.,I was taught to.
90 | I was learning.,I was learning.,I was learning.,I was taught to.
91 | "It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina, but ""more like a shower gel or an egg or a futuristic object.","Doesn't look like an anatomical replica of the vagina, but ""more like a shower gel or as an egg or futuristic object." 92 | "It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina, but ""more like a shower gel or an egg or a futuristic object.","Doesn't look like an anatomical replica of the vagina, but ""more like a shower gel or as an egg or futuristic object." 93 | "It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It does not look like an anatomical imitation of the vagina, but ""more like a shower gel or as an egg or futuristic object.","Doesn't look like an anatomical replica of the vagina, but ""more like a shower gel or as an egg or futuristic object." 94 | "On the other hand, the category of penetrative vibrators was a complete washout,","On the other hand, the category of penetrative vibrators was a complete washout,","On the contrary, a certain dropper is a category of stunning vibrators.","On the contrary, a certain parachute is a category of loading vibrators." 95 | at least for us.,at least for us.,At least then with us.,At least with us. 96 | at least for us.,at least for us.,At least then with us.,At least with us. 97 | at least for us.,at least for us.,At least then with us.,At least with us. 98 | at least for us.,at least for us.,At least then with us.,At least in the us. 99 | Correspondents say the growing strength of the Taliban and the group known as so-called Islamic State (IS) in Qarabagh is a major source of concern to Nato forces based in nearby Bagram.,Correspondents say the growing strength of the Taliban and the group known as so-called Islamic State (IS) in Qarabagh is a major source of concern to Nato forces based in nearby Bagram.,"According to the reports, the growing power of the Taliban and the group known as Islamic State (IS) in the Qarabagh area is a major source of concern for NATO-allied forces in nearby Bagram.",According to the observations of the rapporteurs is the growing power of the Taliban and the group known as the Islamic state (IS) in the area of Qarabagh an important source of concern for NATO allied forces at the nearby Bagram. 100 | All measures have been implemented.,All measures have been implemented.,All measures have been taken.,It was taken to all measures. 101 | All measures have been implemented.,All measures have been implemented.,All measures have been taken.,It was taken to all measures. 102 | --------------------------------------------------------------------------------