├── tests ├── __init__.py ├── unit │ ├── __init__.py │ ├── encoders │ │ ├── __init__.py │ │ ├── test_bert.py │ │ └── test_xlmr.py │ └── test_download_utils.py ├── integration │ ├── __init__.py │ ├── models │ │ ├── __init__.py │ │ ├── test_ranking_metric.py │ │ ├── test_regression_metric.py │ │ └── test_referenceless_regression.py │ └── modules │ │ └── test_feedforward.py └── data │ ├── __init__.py │ └── test_ranking_data.csv ├── comet ├── models │ ├── ranking │ │ ├── __init__.py │ │ ├── wmt_kendall.py │ │ └── ranking_metric.py │ ├── regression │ │ ├── __init__.py │ │ ├── referenceless.py │ │ ├── regression_metric_hybrid.py │ │ └── regression_metric.py │ ├── pooling_utils.py │ ├── __init__.py │ └── base.py ├── modules │ ├── __init__.py │ ├── bottleneck.py │ ├── losses.py │ ├── feedforward.py │ └── layerwise_attention.py ├── encoders │ ├── __init__.py │ ├── xlmr.py │ ├── base.py │ └── bert.py ├── __init__.py ├── cli │ ├── train.py │ ├── score.py │ └── compare.py └── download_utils.py ├── requirements.txt ├── configs ├── early_stopping.yaml ├── model_checkpoint.yaml ├── models │ ├── regression_metric_comet_kl.yaml │ ├── regression_metric_comet_heteroscedastic.yaml │ ├── regression_metric_comet_dup.yaml │ ├── regression_metric_comet_plain.yaml │ └── regression_metric_dup_256bottleneck.yaml └── trainer.yaml ├── pyproject.toml ├── README.md └── LICENSE /tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /comet/models/ranking/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/unit/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /comet/models/regression/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/integration/models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/data/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | DATA_PATH = os.path.abspath(__file__) 4 | DATA_PATH = os.path.dirname(DATA_PATH) 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | sentencepiece==0.1.96 2 | pandas==1.1.5 3 | transformers==4.8.2 4 | pytorch-lightning==1.3.5 5 | jsonargparse==3.13.1 6 | -------------------------------------------------------------------------------- /comet/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | from .feedforward import FeedForward 3 | from .bottleneck import 
Bottleneck 4 | from .layerwise_attention import LayerwiseAttention 5 | from .losses import HeteroscedasticLoss, HeteroscedasticLossv2, HeteroApproxLoss, HeteroApproxLossv2, SquaredLoss, KLLoss 6 | -------------------------------------------------------------------------------- /configs/early_stopping.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.callbacks.early_stopping.EarlyStopping 2 | init_args: 3 | monitor: val_pearson 4 | min_delta: 0. 5 | patience: 3 6 | verbose: False 7 | mode: max 8 | strict: True 9 | check_finite: True 10 | stopping_threshold: null 11 | divergence_threshold: null 12 | check_on_train_epoch_end: False 13 | -------------------------------------------------------------------------------- /configs/model_checkpoint.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.callbacks.model_checkpoint.ModelCheckpoint 2 | init_args: 3 | dirpath: null 4 | filename: null 5 | monitor: val_pearson 6 | verbose: True 7 | save_last: False 8 | save_top_k: 2 9 | save_weights_only: True 10 | mode: max 11 | auto_insert_metric_name: True 12 | every_n_train_steps: null 13 | every_n_val_epochs: 1 14 | -------------------------------------------------------------------------------- /comet/encoders/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from .bert import BERTEncoder 15 | from .xlmr import XLMREncoder 16 | 17 | str2encoder = {"BERT": BERTEncoder, "XLM-RoBERTa": XLMREncoder} 18 | -------------------------------------------------------------------------------- /tests/unit/test_download_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | import os 4 | import shutil 5 | from tests.data import DATA_PATH 6 | from comet.download_utils import download_model 7 | from comet.models import load_from_checkpoint 8 | 9 | 10 | class TestDownloadModel(unittest.TestCase): 11 | @classmethod 12 | def tearDownClass(cls): 13 | shutil.rmtree(os.path.join(DATA_PATH, "wmt21-cometinho-da")) 14 | 15 | def test_download_from_s3(self): 16 | data_path = download_model("wmt21-cometinho-da", saving_directory=DATA_PATH) 17 | self.assertTrue( 18 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/hparams.yaml")) 19 | ) 20 | self.assertTrue( 21 | os.path.exists(os.path.join(DATA_PATH, "wmt21-cometinho-da/checkpoints/")) 22 | ) 23 | load_from_checkpoint(data_path) 24 | -------------------------------------------------------------------------------- /comet/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2020 Unbabel 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | import logging 18 | 19 | from .download_utils import download_model 20 | from .models import load_from_checkpoint 21 | 22 | logging.basicConfig(level=logging.INFO, format="%(message)s") 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | __version__ = "1.0.0rc4" 27 | __copyright__ = "2020-2021 Unbabel. All rights reserved." 
28 | -------------------------------------------------------------------------------- /configs/models/regression_metric_comet_kl.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: data/balanced/scores-1719.csv 18 | validation_data: data/balanced/scores-1719.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 0 24 | data_portion: 1.0 25 | loss: kl 26 | feature_size: 0 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /configs/models/regression_metric_comet_heteroscedastic.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: data/balanced/scores-1719.csv 18 | validation_data: data/balanced/scores-1719.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 0 24 | data_portion: 1.0 25 | loss: hts 26 | feature_size: 0 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /configs/models/regression_metric_comet_dup.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: data/balanced/scores-2020_train_errors_cmtfeat.csv 18 | validation_data: data/balanced/scores-2020_val_errors_cmtfeat.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 256 24 | data_portion: 1.0 25 | loss: hts_approx 26 | feature_size: 1 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml 
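Note: the `loss` field in these configs selects the training objective; the uncertainty-aware options (`hts`, `hts_approx`, `kl`) appear to correspond to the loss modules defined in comet/modules/losses.py, although the exact config-string-to-class wiring lives in the model code, which is not part of this listing. A minimal sketch of a heteroscedastic-style objective using HeteroscedasticLoss on invented dummy tensors:

import torch
from comet.modules import HeteroscedasticLoss

# Dummy per-segment predictions: a quality mean and a standard deviation,
# plus gold scores (all values are illustrative only).
mu = torch.tensor([0.62, 0.40, 0.85])
std = torch.tensor([0.10, 0.25, 0.05])
target = torch.tensor([0.70, 0.35, 0.80])

# Gaussian NLL-style objective: the squared error is down-weighted where the
# predicted variance is large, while the log-variance term penalises
# overly large variances.
loss = HeteroscedasticLoss()(mu, std, target)
print(loss.item())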
-------------------------------------------------------------------------------- /configs/models/regression_metric_comet_plain.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-1719.csv 18 | validation_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-1719.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 0 24 | data_portion: 1.0 25 | loss: mse 26 | feature_size: 0 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /configs/models/regression_metric_dup_256bottleneck.yaml: -------------------------------------------------------------------------------- 1 | regression_metric: 2 | class_path: comet.models.RegressionMetric 3 | init_args: 4 | nr_frozen_epochs: 0.3 5 | keep_embeddings_frozen: True 6 | keep_encoder_frozen: False 7 | optimizer: AdamW 8 | encoder_learning_rate: 1.0e-05 9 | learning_rate: 3.1e-05 10 | layerwise_decay: 0.95 11 | encoder_model: XLM-RoBERTa 12 | pretrained_model: xlm-roberta-large 13 | pool: avg 14 | layer: mix 15 | dropout: 0.15 16 | batch_size: 4 17 | train_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-2020_train_errors_cmtfeat.csv 18 | validation_data: /home/czerva/MT_QE/v2COMET/COMET_up/data/balanced/scores-2020_val_errors_cmtfeat.csv 19 | hidden_sizes: 20 | - 3072 21 | - 1024 22 | hidden_sizes_bottleneck: 23 | - 256 24 | data_portion: 1.0 25 | loss: hts_approx 26 | feature_size: 1 27 | #load_weights_from_checkpoint: /home/czerva/MT_QE/v2COMET/COMET_up/lightning_logs/version_80/checkpoints/epoch=1-step=28143.ckpt 28 | 29 | trainer: ../trainer.yaml 30 | early_stopping: ../early_stopping.yaml 31 | model_checkpoint: ../model_checkpoint.yaml -------------------------------------------------------------------------------- /tests/unit/encoders/test_bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from comet.encoders.bert import BERTEncoder 5 | 6 | 7 | class TestBERTEncoder(unittest.TestCase): 8 | 9 | bert = BERTEncoder.from_pretrained("google/bert_uncased_L-2_H-128_A-2") 10 | 11 | def test_num_layers(self): 12 | self.assertEqual(self.bert.num_layers, 3) 13 | 14 | def test_output_units(self): 15 | self.assertEqual(self.bert.output_units, 128) 16 | 17 | def test_max_positions(self): 18 | self.assertEqual(self.bert.max_positions, 512) 19 | 20 | def test_prepare_sample(self): 21 | sample = ["hello world, welcome to COMET!", "This is a batch"] 22 | model_input = self.bert.prepare_sample(sample) 23 | self.assertIn("input_ids", model_input) 24 | self.assertIn("attention_mask", model_input) 25 | 26 | def test_forward(self): 27 | sample = ["hello world, welcome to COMET!", "This is a batch"] 28 | model_input = 
self.bert.prepare_sample(sample) 29 | model_output = self.bert(**model_input) 30 | self.assertIn("wordemb", model_output) 31 | self.assertIn("sentemb", model_output) 32 | self.assertIn("all_layers", model_output) 33 | self.assertIn("attention_mask", model_output) 34 | -------------------------------------------------------------------------------- /tests/unit/encoders/test_xlmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | from comet.encoders.xlmr import XLMREncoder 5 | 6 | 7 | class TestXLMREncoder(unittest.TestCase): 8 | 9 | xlmr = XLMREncoder.from_pretrained("Unbabel/xlm-roberta-comet-small") 10 | 11 | def test_num_layers(self): 12 | self.assertEqual(self.xlmr.num_layers, 7) 13 | 14 | def test_output_units(self): 15 | self.assertEqual(self.xlmr.output_units, 384) 16 | 17 | def test_max_positions(self): 18 | self.assertEqual(self.xlmr.max_positions, 514) 19 | 20 | def test_prepare_sample(self): 21 | sample = ["hello world, welcome to COMET!", "This is a batch"] 22 | model_input = self.xlmr.prepare_sample(sample) 23 | self.assertIn("input_ids", model_input) 24 | self.assertIn("attention_mask", model_input) 25 | 26 | def test_forward(self): 27 | sample = ["hello world, welcome to COMET!", "This is a batch"] 28 | model_input = self.xlmr.prepare_sample(sample) 29 | model_output = self.xlmr(**model_input) 30 | self.assertIn("wordemb", model_output) 31 | self.assertIn("sentemb", model_output) 32 | self.assertIn("all_layers", model_output) 33 | self.assertIn("attention_mask", model_output) 34 | -------------------------------------------------------------------------------- /configs/trainer.yaml: -------------------------------------------------------------------------------- 1 | class_path: pytorch_lightning.trainer.trainer.Trainer 2 | init_args: 3 | accelerator: null 4 | accumulate_grad_batches: 2 5 | amp_backend: native 6 | amp_level: O0 7 | auto_lr_find: False 8 | auto_scale_batch_size: False 9 | auto_select_gpus: False 10 | benchmark: False 11 | check_val_every_n_epoch: 1 12 | default_root_dir: null 13 | deterministic: True 14 | fast_dev_run: False 15 | flush_logs_every_n_steps: 100 16 | gpus: 1 17 | gradient_clip_val: 1.0 18 | gradient_clip_algorithm: norm 19 | limit_train_batches: 1.0 20 | limit_val_batches: 1.0 21 | limit_test_batches: 1.0 22 | limit_predict_batches: 1.0 23 | log_gpu_memory: null 24 | log_every_n_steps: 10 25 | prepare_data_per_node: True 26 | process_position: 0 27 | progress_bar_refresh_rate: null 28 | profiler: null 29 | overfit_batches: 0.0 30 | plugins: null 31 | precision: 32 32 | max_epochs: 3 33 | min_epochs: 1 34 | max_steps: null 35 | min_steps: null 36 | max_time: null 37 | num_nodes: 1 38 | num_processes: 1 39 | num_sanity_val_steps: 10 40 | reload_dataloaders_every_epoch: False 41 | replace_sampler_ddp: True 42 | resume_from_checkpoint: null 43 | sync_batchnorm: False 44 | terminate_on_nan: False 45 | tpu_cores: null 46 | track_grad_norm: -1 47 | val_check_interval: 1.0 48 | weights_summary: top 49 | move_metrics_to_cpu: True 50 | multiple_trainloader_mode: max_size_cycle 51 | stochastic_weight_avg: True -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "unbabel-comet" 3 | version = "1.0.0rc4" 4 | description = "High-quality Machine Translation Evaluation" 5 | authors = ["Ricardo Rei, Craig Stewart, Catarina 
Farinha, Alon Lavie"] 6 | license = "Apache-2.0" 7 | readme = "README.md" 8 | homepage = "https://github.com/Unbabel/COMET" 9 | repository = "https://github.com/Unbabel/COMET" 10 | documentation = "https://unbabel.github.io/COMET/html/index.html" 11 | keywords = [ 12 | "Machine Translation", 13 | "Evaluation", 14 | "Unbabel", 15 | "COMET" 16 | ] 17 | classifiers = [ 18 | 'Development Status :: 4 - Beta', 19 | 'Environment :: Console', 20 | 'Intended Audience :: Science/Research', 21 | 'Topic :: Scientific/Engineering :: Artificial Intelligence', 22 | ] 23 | packages = [ 24 | {include = "comet"}, 25 | ] 26 | include = [ 27 | "LICENSE", 28 | "pyproject.toml", 29 | "CONTRIBUTING.md" 30 | ] 31 | 32 | [tool.poetry.scripts] 33 | comet-train = 'comet.cli.train:train_command' 34 | comet-score = 'comet.cli.score:score_command' 35 | comet-compare = 'comet.cli.compare:compare_command' 36 | 37 | [tool.poetry.dependencies] 38 | python = "^3.6.1" 39 | sentencepiece = "^0.1.96" 40 | pandas = "1.1.5" 41 | transformers = "^4.8.2" 42 | pytorch-lightning = "1.3.5" 43 | jsonargparse = "3.13.1" 44 | torch = "1.6.0" 45 | torchmetrics = "0.5" 46 | 47 | [tool.poetry.dev-dependencies] 48 | sphinx-markdown-tables = "0.0.15" 49 | coverage = "^5.5" 50 | scikit-learn = "0.24" 51 | scipy = "1.5.4" 52 | 53 | [build-system] 54 | requires = ["poetry-core>=1.0.0"] 55 | build-backend = "poetry.core.masonry.api" -------------------------------------------------------------------------------- /comet/models/ranking/wmt_kendall.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | WMT Kendall Tau 18 | ==================== 19 | Kendall Tau like formulation used to measure agreement between relative ranks 20 | produced by humans and relative ranks produced by metrics. 
21 | """ 22 | import torch 23 | from torchmetrics import Metric 24 | 25 | 26 | class WMTKendall(Metric): 27 | def __init__(self, dist_sync_on_step=False, prefix=""): 28 | super().__init__(dist_sync_on_step=dist_sync_on_step) 29 | self.add_state("concordance", default=torch.tensor(0), dist_reduce_fx="sum") 30 | self.add_state("discordance", default=torch.tensor(0), dist_reduce_fx="sum") 31 | self.prefix = prefix 32 | 33 | def update(self, distance_pos: torch.Tensor, distance_neg: torch.Tensor): 34 | assert distance_pos.shape == distance_neg.shape 35 | self.concordance = torch.sum((distance_pos < distance_neg).float()) 36 | self.discordance = torch.sum((distance_pos >= distance_neg).float()) 37 | 38 | def compute(self): 39 | return { 40 | self.prefix 41 | + "_kendall": (self.concordance - self.discordance) 42 | / (self.concordance + self.discordance) 43 | } 44 | -------------------------------------------------------------------------------- /comet/modules/bottleneck.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | r""" 15 | Bottleneck Layer 16 | ============== 17 | Bottleneck module to be used with customised features 18 | """ 19 | 20 | from typing import List, Optional 21 | 22 | import torch 23 | from torch import nn 24 | 25 | 26 | class Bottleneck(nn.Module): 27 | """ 28 | Bottleneck layer. 29 | 30 | :param in_dim: Number input features. 31 | :param out_dim: Number of output features. Default is just a score. 32 | :param hidden_sizes: List with hidden layer sizes. 33 | :param activations: Name of the activation function to be used in the hidden layers. 34 | :param final_activation: Name of the final activation function if any. 35 | :param dropout: dropout to be used in the hidden layers. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | in_dim: int, 41 | hidden_sizes: List[int] = [3072, 256], 42 | activations: str = "Sigmoid", 43 | dropout: float = 0.1, 44 | ) -> None: 45 | super().__init__() 46 | modules = [] 47 | modules.append(nn.Linear(in_dim, hidden_sizes[0])) 48 | modules.append(self.build_activation(activations)) 49 | modules.append(nn.Dropout(dropout)) 50 | for i in range(1, len(hidden_sizes)): 51 | modules.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i])) 52 | modules.append(self.build_activation(activations)) 53 | modules.append(nn.Dropout(dropout)) 54 | 55 | self.ff = nn.Sequential(*modules) 56 | 57 | def build_activation(self, activation: str) -> nn.Module: 58 | if hasattr(nn, activation): 59 | return getattr(nn, activation)() 60 | 61 | def forward(self, in_features: torch.Tensor) -> torch.Tensor: 62 | return self.ff(in_features) 63 | -------------------------------------------------------------------------------- /comet/encoders/xlmr.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | XLM-RoBERTa Encoder 17 | ============== 18 | Pretrained XLM-RoBERTa encoder from Hugging Face. 19 | """ 20 | from typing import Dict 21 | 22 | import torch 23 | from comet.encoders.base import Encoder 24 | from comet.encoders.bert import BERTEncoder 25 | from transformers import XLMRobertaModel, XLMRobertaTokenizer 26 | 27 | 28 | class XLMREncoder(BERTEncoder): 29 | """XLM-RoBERTA Encoder encoder. 30 | 31 | :param pretrained_model: Pretrained model from hugging face. 32 | """ 33 | 34 | def __init__(self, pretrained_model: str) -> None: 35 | super(Encoder, self).__init__() 36 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(pretrained_model) 37 | self.model = XLMRobertaModel.from_pretrained( 38 | pretrained_model, add_pooling_layer=False 39 | ) 40 | self.model.encoder.output_hidden_states = True 41 | 42 | @classmethod 43 | def from_pretrained(cls, pretrained_model: str) -> Encoder: 44 | """Function that loads a pretrained encoder from Hugging Face. 45 | :param pretrained_model: Name of the pretrain model to be loaded. 
46 | 47 | :return: Encoder model 48 | """ 49 | return XLMREncoder(pretrained_model) 50 | 51 | def forward( 52 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs 53 | ) -> Dict[str, torch.Tensor]: 54 | last_hidden_states, _, all_layers = self.model( 55 | input_ids=input_ids, 56 | attention_mask=attention_mask, 57 | output_hidden_states=True, 58 | return_dict=False, 59 | ) 60 | return { 61 | "sentemb": last_hidden_states[:, 0, :], 62 | "wordemb": last_hidden_states, 63 | "all_layers": all_layers, 64 | "attention_mask": attention_mask, 65 | } 66 | -------------------------------------------------------------------------------- /comet/models/pooling_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | import torch 16 | 17 | 18 | def average_pooling( 19 | tokens: torch.Tensor, 20 | embeddings: torch.Tensor, 21 | mask: torch.Tensor, 22 | padding_index: int, 23 | ) -> torch.Tensor: 24 | """Average pooling function. 25 | :param tokens: Word ids [batch_size x seq_length] 26 | :param embeddings: Word embeddings [batch_size x seq_length x hidden_size] 27 | :param mask: Padding mask [batch_size x seq_length] 28 | :param padding_index: Padding value. 29 | """ 30 | wordemb = mask_fill(0.0, tokens, embeddings, padding_index) 31 | sentemb = torch.sum(wordemb, 1) 32 | sum_mask = mask.unsqueeze(-1).expand(embeddings.size()).float().sum(1) 33 | return sentemb / sum_mask 34 | 35 | 36 | def max_pooling( 37 | tokens: torch.Tensor, embeddings: torch.Tensor, padding_index: int 38 | ) -> torch.Tensor: 39 | """Max pooling function. 40 | :param tokens: Word ids [batch_size x seq_length] 41 | :param embeddings: Word embeddings [batch_size x seq_length x hidden_size] 42 | :param padding_index: Padding value. 43 | """ 44 | return mask_fill(float("-inf"), tokens, embeddings, padding_index).max(dim=1)[0] 45 | 46 | 47 | def mask_fill( 48 | fill_value: float, 49 | tokens: torch.Tensor, 50 | embeddings: torch.Tensor, 51 | padding_index: int, 52 | ) -> torch.Tensor: 53 | """ 54 | Function that masks embeddings representing padded elements. 55 | :param fill_value: the value to fill the embeddings belonging to padded tokens. 56 | :param tokens: The input sequences [bsz x seq_len]. 57 | :param embeddings: word embeddings [bsz x seq_len x hiddens]. 58 | :param padding_index: Index of the padding token. 
59 | """ 60 | padding_mask = tokens.eq(padding_index).unsqueeze(-1) 61 | return embeddings.float().masked_fill_(padding_mask, fill_value).type_as(embeddings) 62 | -------------------------------------------------------------------------------- /comet/modules/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class HeteroscedasticLoss(nn.Module): 5 | 6 | def forward(self, mu: torch.Tensor, std: torch.Tensor, target: torch.Tensor): 7 | sigma = std**2 8 | log1 = 0.5 * torch.neg(torch.log(sigma)).exp() 9 | mse = (target - mu)**2 10 | log2 = 0.5 * torch.log(sigma) 11 | return torch.sum(log1*mse+log2) 12 | 13 | 14 | class HeteroscedasticLossv2(nn.Module): 15 | 16 | def forward(self, mu: torch.Tensor, std: torch.Tensor, target: torch.Tensor): 17 | sigma = std 18 | log1 = 0.5 * torch.neg(torch.log(sigma)).exp() 19 | mse = (target - mu)**2 20 | log2 = 0.5 * torch.log(sigma) 21 | return torch.sum(log1*mse+log2) 22 | 23 | #Heteroscedastic inspired loss for error/uncertainty prediction 24 | class HeteroApproxLoss(nn.Module): 25 | 26 | def forward(self, pred: torch.Tensor, target: torch.Tensor): 27 | sigma = pred**2 28 | l1 = 0.5 * torch.neg(torch.log(sigma)).exp() 29 | l2 = 0.5 * torch.log(sigma) 30 | mse = target**2 31 | #return torch.mean(0.5*pred**(-2)*(target**2)+(0.5*torch.log(pred**2))) 32 | return torch.sum(l1*mse+l2) 33 | 34 | #Heteroscedastic inspired loss for error/uncertainty prediction 35 | class HeteroApproxLossv2(nn.Module): 36 | 37 | def forward(self, pred: torch.Tensor, target: torch.Tensor): 38 | sigma = pred 39 | l1 = 0.5 * torch.neg(torch.log(sigma)).exp() 40 | l2 = 0.5 * torch.log(sigma) 41 | mse = target**2 42 | #return torch.mean(0.5*pred**(-2)*(target**2)+(0.5*torch.log(pred**2))) 43 | return torch.sum(l1*mse+l2) 44 | 45 | class SquaredLoss(nn.Module): 46 | def forward(self, pred: torch.Tensor, target: torch.Tensor): 47 | mse = (target**2-pred**2)**2 48 | 49 | #return torch.mean(0.5*pred**(-2)*(target**2)+(0.5*torch.log(pred**2))) 50 | return torch.mean(mse) 51 | 52 | 53 | class KLLoss(nn.Module): 54 | #based on Daan's idea 55 | def forward(self, mu: torch.Tensor, sigma: torch.Tensor, target_mu: torch.Tensor, target_std: torch.Tensor): 56 | 57 | # Add fudge factor to variance to avoid large KL values 58 | # (value of 1e-2 just turned out to work - 1e-3 already 59 | # occasionally caused loss > 1000) 60 | std1 = target_std 61 | std2 = sigma 62 | mean1 = target_mu 63 | mean2 = mu 64 | 65 | kl = torch.log(torch.abs(std2)/torch.abs(std1)) + (std1**2 + (mean1 - mean2)**2)/(2*std2**2) - 0.5 66 | 67 | return kl.mean() 68 | 69 | 70 | -------------------------------------------------------------------------------- /tests/integration/models/test_ranking_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import multiprocessing 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import torch 8 | from comet.models import RankingMetric 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestRankingMetric(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 
| 24 | def test_training(self): 25 | seed_everything(12) 26 | trainer = Trainer( 27 | gpus=0, 28 | max_epochs=4, 29 | deterministic=True, 30 | checkpoint_callback=True, 31 | default_root_dir=DATA_PATH, 32 | logger=False, 33 | weights_summary=None, 34 | progress_bar_refresh_rate=0, 35 | ) 36 | model = RankingMetric( 37 | encoder_model="BERT", 38 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 39 | train_data=os.path.join(DATA_PATH, "test_ranking_data.csv"), 40 | validation_data=os.path.join(DATA_PATH, "test_ranking_data.csv"), 41 | layerwise_decay=0.95, 42 | batch_size=32, 43 | learning_rate=1e-04, 44 | encoder_learning_rate=1e-04, 45 | ) 46 | trainer.fit(model) 47 | self.assertTrue( 48 | os.path.exists( 49 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=15.ckpt") 50 | ) 51 | ) 52 | saved_model = RankingMetric.load_from_checkpoint( 53 | os.path.join(DATA_PATH, "checkpoints", "epoch=3-step=15.ckpt") 54 | ) 55 | dataset = saved_model.read_csv( 56 | os.path.join(DATA_PATH, "test_regression_data.csv"), regression=True 57 | ) 58 | y = [s["score"] for s in dataset] 59 | dataloader = DataLoader( 60 | dataset=dataset, 61 | batch_size=256, 62 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 63 | num_workers=multiprocessing.cpu_count(), 64 | ) 65 | y_hat = ( 66 | torch.cat( 67 | trainer.predict(dataloaders=dataloader, return_predictions=True), dim=0 68 | ) 69 | .cpu() 70 | .tolist() 71 | ) 72 | # This shouldn't break! 73 | pearsonr(y_hat, y)[0] 74 | -------------------------------------------------------------------------------- /comet/modules/feedforward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | Feed Forward 17 | ============== 18 | Feed Forward Neural Network module that can be used for classification or regression 19 | """ 20 | from typing import List, Optional 21 | 22 | import torch 23 | from torch import nn 24 | 25 | 26 | class FeedForward(nn.Module): 27 | """ 28 | Feed Forward Neural Network. 29 | 30 | :param in_dim: Number input features. 31 | :param out_dim: Number of output features. Default is just a score. 32 | :param hidden_sizes: List with hidden layer sizes. 33 | :param activations: Name of the activation function to be used in the hidden layers. 34 | :param final_activation: Name of the final activation function if any. 35 | :param dropout: dropout to be used in the hidden layers. 
36 | """ 37 | 38 | def __init__( 39 | self, 40 | in_dim: int, 41 | out_dim: int = 1, 42 | hidden_sizes: List[int] = [3072, 768], 43 | activations: str = "Sigmoid", 44 | final_activation: Optional[str] = None, 45 | dropout: float = 0.1, 46 | ) -> None: 47 | super().__init__() 48 | modules = [] 49 | modules.append(nn.Linear(in_dim, hidden_sizes[0])) 50 | modules.append(self.build_activation(activations)) 51 | modules.append(nn.Dropout(dropout)) 52 | 53 | for i in range(1, len(hidden_sizes)): 54 | modules.append(nn.Linear(hidden_sizes[i - 1], hidden_sizes[i])) 55 | modules.append(self.build_activation(activations)) 56 | modules.append(nn.Dropout(dropout)) 57 | 58 | modules.append(nn.Linear(hidden_sizes[-1], int(out_dim))) 59 | if final_activation is not None: 60 | modules.append(self.build_activation(final_activation)) 61 | 62 | self.ff = nn.Sequential(*modules) 63 | 64 | def build_activation(self, activation: str) -> nn.Module: 65 | if hasattr(nn, activation): 66 | return getattr(nn, activation)() 67 | 68 | def forward(self, in_features: torch.Tensor) -> torch.Tensor: 69 | return self.ff(in_features) 70 | -------------------------------------------------------------------------------- /tests/integration/models/test_regression_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import multiprocessing 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import torch 8 | from comet.models import RegressionMetric 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestRegressionMetric(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 | 24 | def test_training(self): 25 | seed_everything(12) 26 | trainer = Trainer( 27 | gpus=0, 28 | max_epochs=10, 29 | deterministic=True, 30 | checkpoint_callback=True, 31 | default_root_dir=DATA_PATH, 32 | logger=False, 33 | weights_summary=None, 34 | progress_bar_refresh_rate=0, 35 | ) 36 | model = RegressionMetric( 37 | encoder_model="BERT", 38 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 39 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 40 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 41 | hidden_sizes=[384], 42 | layerwise_decay=0.95, 43 | batch_size=32, 44 | learning_rate=1e-04, 45 | encoder_learning_rate=1e-04, 46 | ) 47 | trainer.fit(model) 48 | self.assertTrue( 49 | os.path.exists( 50 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 51 | ) 52 | ) 53 | 54 | saved_model = RegressionMetric.load_from_checkpoint( 55 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 56 | ) 57 | dataset = saved_model.read_csv( 58 | os.path.join(DATA_PATH, "test_regression_data.csv") 59 | ) 60 | y = [s["score"] for s in dataset] 61 | dataloader = DataLoader( 62 | dataset=dataset, 63 | batch_size=256, 64 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 65 | num_workers=multiprocessing.cpu_count(), 66 | ) 67 | y_hat = ( 68 | torch.cat( 69 | trainer.predict(dataloaders=dataloader, return_predictions=True), dim=0 70 | ) 71 | .cpu() 72 | .tolist() 73 | ) 74 | self.assertAlmostEqual(pearsonr(y_hat, y)[0], 0.8, places=1) 75 | 
-------------------------------------------------------------------------------- /tests/integration/models/test_referenceless_regression.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import multiprocessing 3 | import os 4 | import shutil 5 | import unittest 6 | 7 | import torch 8 | from comet.models import ReferencelessRegression 9 | from pytorch_lightning import seed_everything 10 | from pytorch_lightning.trainer.trainer import Trainer 11 | from scipy.stats import pearsonr 12 | from tests.data import DATA_PATH 13 | from torch.utils.data import DataLoader 14 | 15 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 16 | os.environ["OMP_NUM_THREADS"] = "1" 17 | 18 | 19 | class TestReferencelessRegression(unittest.TestCase): 20 | @classmethod 21 | def tearDownClass(cls): 22 | shutil.rmtree(os.path.join(DATA_PATH, "checkpoints")) 23 | 24 | def test_training(self): 25 | 26 | seed_everything(12) 27 | trainer = Trainer( 28 | gpus=0, 29 | max_epochs=10, 30 | deterministic=True, 31 | checkpoint_callback=True, 32 | default_root_dir=DATA_PATH, 33 | logger=False, 34 | weights_summary=None, 35 | progress_bar_refresh_rate=0, 36 | ) 37 | model = ReferencelessRegression( 38 | encoder_model="BERT", 39 | pretrained_model="google/bert_uncased_L-2_H-128_A-2", 40 | train_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 41 | validation_data=os.path.join(DATA_PATH, "test_regression_data.csv"), 42 | hidden_sizes=[256], 43 | layerwise_decay=0.95, 44 | batch_size=32, 45 | learning_rate=1e-04, 46 | encoder_learning_rate=1e-04, 47 | ) 48 | trainer.fit(model) 49 | self.assertTrue( 50 | os.path.exists( 51 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 52 | ) 53 | ) 54 | 55 | saved_model = ReferencelessRegression.load_from_checkpoint( 56 | os.path.join(DATA_PATH, "checkpoints", "epoch=9-step=159.ckpt") 57 | ) 58 | dataset = saved_model.read_csv( 59 | os.path.join(DATA_PATH, "test_regression_data.csv") 60 | ) 61 | y = [s["score"] for s in dataset] 62 | dataloader = DataLoader( 63 | dataset=dataset, 64 | batch_size=256, 65 | collate_fn=lambda x: saved_model.prepare_sample(x, inference=True), 66 | num_workers=multiprocessing.cpu_count(), 67 | ) 68 | y_hat = ( 69 | torch.cat( 70 | trainer.predict(dataloaders=dataloader, return_predictions=True), dim=0 71 | ) 72 | .cpu() 73 | .tolist() 74 | ) 75 | self.assertAlmostEqual(pearsonr(y_hat, y)[0], 0.8, places=1) 76 | -------------------------------------------------------------------------------- /comet/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | # -*- coding: utf-8 -*- 3 | # Copyright (C) 2020 Unbabel 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | 17 | from .regression.regression_metric import RegressionMetric 18 | from .ranking.ranking_metric import RankingMetric 19 | from .regression.referenceless import ReferencelessRegression 20 | from .base import CometModel 21 | 22 | import os 23 | import yaml 24 | 25 | str2model = { 26 | "referenceless_regression_metric": ReferencelessRegression, 27 | "regression_metric": RegressionMetric, 28 | "ranking_metric": RankingMetric, 29 | } 30 | 31 | available_metrics = { 32 | "emnlp20-comet-rank": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/emnlp20-comet-rank.tar.gz", 33 | "wmt20-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-da.tar.gz", 34 | "wmt20-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt20/wmt20-comet-qe-da.tar.gz", 35 | "wmt21-comet-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-da.tar.gz", 36 | "wmt21-comet-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-comet-mqm.tar.gz", 37 | # "wmt21-cometinho-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-mqm.tar.gz", 38 | "wmt21-cometinho-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/wmt21-cometinho-da.tar.gz", 39 | # "wmt21-comet-qe-mqm": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/refless-wmt21-comet-mqm.tar.gz", 40 | # "wmt21-comet-qe-da": "https://unbabel-experimental-models.s3.amazonaws.com/comet/wmt21/refless-wmt21-comet-da.tar.gz", 41 | } 42 | 43 | 44 | def load_from_checkpoint(checkpoint_path: str) -> CometModel: 45 | """Loads models from a checkpoint path. 46 | :param checkpoint_path: Path to a model checkpoint. 47 | 48 | :return: Returns a COMET model. 49 | """ 50 | if not os.path.exists(checkpoint_path): 51 | raise Exception(f"Invalid checkpoint path: {checkpoint_path}") 52 | 53 | hparams_file = "/".join(checkpoint_path.split("/")[:-2] + ["hparams.yaml"]) 54 | if os.path.exists(hparams_file): 55 | with open(hparams_file) as yaml_file: 56 | hparams = yaml.load(yaml_file.read(), Loader=yaml.FullLoader) 57 | model_class = str2model[hparams["class_identifier"]] 58 | model = model_class.load_from_checkpoint(checkpoint_path, **hparams) 59 | return model 60 | else: 61 | raise Exception("hparams.yaml file is missing!") 62 | -------------------------------------------------------------------------------- /comet/encoders/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | r""" 15 | Encoder Model base 16 | ==================== 17 | Module defining the common interface between all pretrained encoder models. 
18 | """ 19 | import abc 20 | from typing import Dict, List 21 | 22 | import torch 23 | import torch.nn as nn 24 | 25 | 26 | class Encoder(nn.Module, metaclass=abc.ABCMeta): 27 | """Base class for an encoder model.""" 28 | 29 | @property 30 | @abc.abstractmethod 31 | def output_units(self): 32 | """Max number of tokens the encoder handles.""" 33 | pass 34 | 35 | @property 36 | @abc.abstractmethod 37 | def max_positions(self): 38 | """Max number of tokens the encoder handles.""" 39 | pass 40 | 41 | @property 42 | @abc.abstractmethod 43 | def num_layers(self): 44 | """Number of model layers available.""" 45 | pass 46 | 47 | @classmethod 48 | @abc.abstractmethod 49 | def from_pretrained(cls, pretrained_model): 50 | """Function that loads a pretrained encoder and the respective tokenizer. 51 | 52 | :return: Encoder model 53 | """ 54 | raise NotImplementedError 55 | 56 | def prepare_sample(self, sample: List[str]) -> Dict[str, torch.Tensor]: 57 | """Receives a list of strings and applies tokenization and vectorization. 58 | 59 | :param sample: List with text segments to be tokenized and padded. 60 | 61 | :return: Dictionary with HF model inputs. 62 | """ 63 | tokenizer_output = self.tokenizer( 64 | sample, 65 | return_tensors="pt", 66 | padding=True, 67 | truncation=True, 68 | max_length=self.max_positions - 2, 69 | ) 70 | return tokenizer_output 71 | 72 | def freeze(self) -> None: 73 | """Frezees the entire encoder.""" 74 | for param in self.parameters(): 75 | param.requires_grad = False 76 | 77 | def unfreeze(self) -> None: 78 | """Unfrezees the entire encoder.""" 79 | for param in self.parameters(): 80 | param.requires_grad = True 81 | 82 | @abc.abstractmethod 83 | def freeze_embeddings(self) -> None: 84 | """Frezees the embedding layer.""" 85 | pass 86 | 87 | @abc.abstractmethod 88 | def layerwise_lr(self, lr: float, decay: float): 89 | """ 90 | :param lr: Learning rate for the highest encoder layer. 91 | :param decay: decay percentage for the lower layers. 92 | 93 | :return: List of model parameters with layer-wise decay learning rate 94 | """ 95 | pass 96 | 97 | @abc.abstractmethod 98 | def forward( 99 | self, tokens: torch.Tensor, lengths: torch.Tensor 100 | ) -> Dict[str, torch.Tensor]: 101 | pass 102 | -------------------------------------------------------------------------------- /comet/encoders/bert.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | BERT Encoder 17 | ============== 18 | Pretrained BERT encoder from Hugging Face. 19 | """ 20 | from typing import Dict 21 | 22 | import torch 23 | from comet.encoders.base import Encoder 24 | from transformers import AutoModel, AutoTokenizer 25 | 26 | 27 | class BERTEncoder(Encoder): 28 | """BERT encoder. 29 | 30 | :param pretrained_model: Pretrained model from hugging face. 
31 | """ 32 | 33 | def __init__(self, pretrained_model: str) -> None: 34 | super().__init__() 35 | self.tokenizer = AutoTokenizer.from_pretrained(pretrained_model, use_fast=True) 36 | self.model = AutoModel.from_pretrained(pretrained_model) 37 | self.model.encoder.output_hidden_states = True 38 | 39 | @property 40 | def output_units(self): 41 | """Max number of tokens the encoder handles.""" 42 | return self.model.config.hidden_size 43 | 44 | @property 45 | def max_positions(self): 46 | """Max number of tokens the encoder handles.""" 47 | return self.model.config.max_position_embeddings 48 | 49 | @property 50 | def num_layers(self): 51 | """Number of model layers available.""" 52 | return self.model.config.num_hidden_layers + 1 53 | 54 | @classmethod 55 | def from_pretrained(cls, pretrained_model: str) -> Encoder: 56 | """Function that loads a pretrained encoder from Hugging Face. 57 | :param pretrained_model: Name of the pretrain model to be loaded. 58 | 59 | :return: Encoder model 60 | """ 61 | return BERTEncoder(pretrained_model) 62 | 63 | def freeze_embeddings(self) -> None: 64 | """Frezees the embedding layer.""" 65 | for param in self.model.embeddings.parameters(): 66 | param.requires_grad = False 67 | 68 | def layerwise_lr(self, lr: float, decay: float): 69 | """ 70 | :param lr: Learning rate for the highest encoder layer. 71 | :param decay: decay percentage for the lower layers. 72 | 73 | :return: List of model parameters with layer-wise decay learning rate 74 | """ 75 | # Embedding Layer 76 | opt_parameters = [ 77 | { 78 | "params": self.model.embeddings.parameters(), 79 | "lr": lr * decay ** (self.num_layers), 80 | } 81 | ] 82 | # All layers 83 | opt_parameters += [ 84 | { 85 | "params": self.model.encoder.layer[i].parameters(), 86 | "lr": lr * decay ** i, 87 | } 88 | for i in range(self.num_layers - 2, 0, -1) 89 | ] 90 | return opt_parameters 91 | 92 | def forward( 93 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor, **kwargs 94 | ) -> Dict[str, torch.Tensor]: 95 | last_hidden_states, pooler_output, all_layers = self.model( 96 | input_ids=input_ids, 97 | attention_mask=attention_mask, 98 | output_hidden_states=True, 99 | return_dict=False, 100 | ) 101 | return { 102 | "sentemb": pooler_output, 103 | "wordemb": last_hidden_states, 104 | "all_layers": all_layers, 105 | "attention_mask": attention_mask, 106 | } 107 | -------------------------------------------------------------------------------- /comet/cli/train.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | 16 | Command for training new Metrics. 
17 | ================================= 18 | 19 | e.g: 20 | ``` 21 | comet-train --cfg configs/models/regression_metric.yaml 22 | ``` 23 | 24 | For more details run the following command: 25 | ``` 26 | comet-train --help 27 | ``` 28 | """ 29 | import json 30 | 31 | from comet.models import ( 32 | CometModel, 33 | RankingMetric, 34 | ReferencelessRegression, 35 | RegressionMetric, 36 | ) 37 | from jsonargparse import ActionConfigFile, ArgumentParser, namespace_to_dict 38 | from pytorch_lightning import seed_everything 39 | from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint 40 | from pytorch_lightning.trainer.trainer import Trainer 41 | 42 | 43 | def train_command() -> None: 44 | parser = ArgumentParser(description="Command for training COMET models.") 45 | parser.add_argument( 46 | "--seed_everything", 47 | type=int, 48 | default=12, 49 | help="Training Seed.", 50 | ) 51 | parser.add_argument("--cfg", action=ActionConfigFile) 52 | parser.add_class_arguments(CometModel, "model") 53 | parser.add_subclass_arguments(RegressionMetric, "regression_metric") 54 | parser.add_subclass_arguments( 55 | ReferencelessRegression, "referenceless_regression_metric" 56 | ) 57 | parser.add_subclass_arguments(RankingMetric, "ranking_metric") 58 | parser.add_subclass_arguments(EarlyStopping, "early_stopping") 59 | parser.add_subclass_arguments(ModelCheckpoint, "model_checkpoint") 60 | parser.add_subclass_arguments(Trainer, "trainer") 61 | cfg = parser.parse_args() 62 | seed_everything(cfg.seed_everything) 63 | 64 | checkpoint_callback = ModelCheckpoint( 65 | **namespace_to_dict(cfg.model_checkpoint.init_args) 66 | ) 67 | early_stop_callback = EarlyStopping( 68 | **namespace_to_dict(cfg.early_stopping.init_args) 69 | ) 70 | trainer_args = namespace_to_dict(cfg.trainer.init_args) 71 | trainer_args["callbacks"] = [early_stop_callback, checkpoint_callback] 72 | print("TRAINER ARGUMENTS: ") 73 | print(json.dumps(trainer_args, indent=4, default=lambda x: x.__dict__)) 74 | trainer = Trainer(**trainer_args) 75 | 76 | print("MODEL ARGUMENTS: ") 77 | if cfg.regression_metric is not None: 78 | print( 79 | json.dumps( 80 | cfg.regression_metric.init_args, indent=4, default=lambda x: x.__dict__ 81 | ) 82 | ) 83 | model = RegressionMetric(**namespace_to_dict(cfg.regression_metric.init_args)) 84 | elif cfg.referenceless_regression_metric is not None: 85 | print( 86 | json.dumps( 87 | cfg.referenceless_regression_metric.init_args, 88 | indent=4, 89 | default=lambda x: x.__dict__, 90 | ) 91 | ) 92 | model = ReferencelessRegression( 93 | **namespace_to_dict(cfg.referenceless_regression_metric.init_args) 94 | ) 95 | elif cfg.ranking_metric is not None: 96 | print( 97 | json.dumps( 98 | cfg.ranking_metric.init_args, indent=4, default=lambda x: x.__dict__ 99 | ) 100 | ) 101 | model = RankingMetric(**namespace_to_dict(cfg.ranking_metric.init_args)) 102 | else: 103 | raise Exception("Model configurations missing!") 104 | 105 | trainer.fit(model) 106 | -------------------------------------------------------------------------------- /tests/integration/modules/test_feedforward.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unittest 3 | 4 | import torch 5 | from sklearn.datasets import load_digits 6 | from sklearn.model_selection import train_test_split 7 | from torch import nn 8 | 9 | from comet.modules.feedforward import FeedForward 10 | from pytorch_lightning import seed_everything 11 | 12 | 13 | class TestFeedForward(unittest.TestCase): 14 
| def test_MNIST(self): 15 | seed_everything(3) 16 | """ 17 | STEP 1: LOADING DATASET 18 | """ 19 | images, labels = load_digits(return_X_y=True) 20 | images = [torch.Tensor(images[i, :]) for i in range(images.shape[0])] 21 | labels = torch.tensor(labels, dtype=torch.long) 22 | 23 | train_images, test_images, train_labels, test_labels = train_test_split( 24 | images, labels, test_size=0.2, random_state=42 25 | ) 26 | 27 | train_dataset = list(zip(train_images, train_labels)) 28 | test_dataset = list(zip(test_images, test_labels)) 29 | 30 | """ 31 | STEP 2: MAKING DATASET ITERABLE 32 | """ 33 | batch_size = 256 34 | n_iters = 80 35 | num_epochs = n_iters / (len(train_dataset) / batch_size) 36 | num_epochs = int(num_epochs) 37 | 38 | train_loader = torch.utils.data.DataLoader( 39 | dataset=train_dataset, batch_size=batch_size, shuffle=True 40 | ) 41 | 42 | test_loader = torch.utils.data.DataLoader( 43 | dataset=test_dataset, batch_size=batch_size, shuffle=False 44 | ) 45 | 46 | """ 47 | STEP 3: INSTANTIATE MODEL CLASS 48 | """ 49 | model = FeedForward( 50 | in_dim=8 * 8, 51 | out_dim=10, 52 | hidden_sizes=[100], 53 | activations="Tanh", 54 | ) 55 | 56 | """ 57 | STEP 4: INSTANTIATE LOSS CLASS 58 | """ 59 | criterion = nn.CrossEntropyLoss() 60 | 61 | """ 62 | STEP 5: INSTANTIATE OPTIMIZER CLASS 63 | """ 64 | learning_rate = 0.1 65 | optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) 66 | 67 | """ 68 | STEP 7: TRAIN THE MODEL 69 | """ 70 | iter = 0 71 | for epoch in range(num_epochs): 72 | for i, (images, labels) in enumerate(train_loader): 73 | # Load images with gradient accumulation capabilities 74 | images = images.view(-1, 8 * 8).requires_grad_() 75 | 76 | # Clear gradients w.r.t. parameters 77 | optimizer.zero_grad() 78 | 79 | # Forward pass to get output/logits 80 | outputs = model(images) 81 | 82 | # Calculate Loss: softmax --> cross entropy loss 83 | loss = criterion(outputs, labels) 84 | 85 | # Getting gradients w.r.t. parameters 86 | loss.backward() 87 | 88 | # Updating parameters 89 | optimizer.step() 90 | 91 | iter += 1 92 | 93 | if iter % 10 == 0: 94 | # Calculate Accuracy 95 | correct = 0 96 | total = 0 97 | # Iterate through test dataset 98 | for images, labels in test_loader: 99 | # Load images with gradient accumulation capabilities 100 | images = images.view(-1, 8 * 8).requires_grad_() 101 | 102 | # Forward pass only to get logits/output 103 | outputs = model(images) 104 | 105 | # Get predictions from the maximum value 106 | _, predicted = torch.max(outputs.data, 1) 107 | 108 | # Total number of labels 109 | total += labels.size(0) 110 | 111 | # Total correct predictions 112 | correct += (predicted == labels).sum() 113 | 114 | accuracy = 100 * correct // total 115 | self.assertGreaterEqual(accuracy, 95) 116 | -------------------------------------------------------------------------------- /comet/modules/layerwise_attention.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | Layer-Wise Attention Mechanism 17 | ================================ 18 | Computes a parameterised scalar mixture of N tensors, 19 | `mixture = gamma * sum(s_k * tensor_k)` 20 | where `s = softmax(w)`, with `w` and `gamma` scalar parameters. 21 | 22 | If `layer_norm=True` then apply layer normalization. 23 | 24 | If `dropout > 0`, then for each scalar weight, adjust its softmax 25 | weight mass to 0 with the dropout probability (i.e., setting the 26 | unnormalized weight to -inf). This effectively should redistribute 27 | dropped probability mass to all other weights. 28 | 29 | Original implementation: 30 | - https://github.com/Hyperparticle/udify 31 | """ 32 | from typing import List, Optional 33 | 34 | import torch 35 | from torch.nn import Parameter, ParameterList 36 | 37 | 38 | class LayerwiseAttention(torch.nn.Module): 39 | def __init__( 40 | self, 41 | num_layers: int, 42 | layer_norm: bool = False, 43 | layer_weights: Optional[List[int]] = None, 44 | dropout: float = None, 45 | ) -> None: 46 | super(LayerwiseAttention, self).__init__() 47 | self.num_layers = num_layers 48 | self.layer_norm = layer_norm 49 | self.dropout = dropout 50 | 51 | if layer_weights is None: 52 | layer_weights = [0.0] * num_layers 53 | elif len(layer_weights) != num_layers: 54 | raise Exception( 55 | "Length of layer_weights {} differs \ 56 | from num_layers {}".format( 57 | layer_weights, num_layers 58 | ) 59 | ) 60 | 61 | self.scalar_parameters = ParameterList( 62 | [ 63 | Parameter( 64 | torch.FloatTensor([layer_weights[i]]), 65 | requires_grad=True, 66 | ) 67 | for i in range(num_layers) 68 | ] 69 | ) 70 | 71 | self.gamma = Parameter(torch.FloatTensor([1.0]), requires_grad=True) 72 | 73 | if self.dropout: 74 | dropout_mask = torch.zeros(len(self.scalar_parameters)) 75 | dropout_fill = torch.empty(len(self.scalar_parameters)).fill_(-1e20) 76 | self.register_buffer("dropout_mask", dropout_mask) 77 | self.register_buffer("dropout_fill", dropout_fill) 78 | 79 | def forward( 80 | self, 81 | tensors: List[torch.Tensor], # pylint: disable=arguments-differ 82 | mask: torch.Tensor = None, 83 | ) -> torch.Tensor: 84 | 85 | if len(tensors) != self.num_layers: 86 | raise Exception( 87 | "{} tensors were passed, but the module was initialized to \ 88 | mix {} tensors.".format( 89 | len(tensors), self.num_layers 90 | ) 91 | ) 92 | 93 | def _layer_norm(tensor, broadcast_mask, num_elements_not_masked): 94 | tensor_masked = tensor * broadcast_mask 95 | mean = torch.sum(tensor_masked) / num_elements_not_masked 96 | variance = ( 97 | torch.sum(((tensor_masked - mean) * broadcast_mask) ** 2) 98 | / num_elements_not_masked 99 | ) 100 | return (tensor - mean) / torch.sqrt(variance + 1e-12) 101 | 102 | # BUG: Pytorch bug fix when Parameters are not well copied across GPUs 103 | # https://github.com/pytorch/pytorch/issues/36035 104 | if len([parameter for parameter in self.scalar_parameters]) != self.num_layers: 105 | weights = torch.tensor(self.weights, device=tensors[0].device) 106 | gamma = torch.tensor(self.gamma_value, device=tensors[0].device) 107 | else: 108 | weights = torch.cat([parameter for parameter in self.scalar_parameters]) 109 | gamma = self.gamma 110 | 111 | if self.training and self.dropout: 112 | weights = torch.where( 113 | self.dropout_mask.uniform_() > self.dropout, weights, self.dropout_fill 114 | ) 115 | 116 | normed_weights = torch.nn.functional.softmax(weights, dim=0) 117 | 
normed_weights = torch.split(normed_weights, split_size_or_sections=1) 118 | 119 | if not self.layer_norm: 120 | pieces = [] 121 | for weight, tensor in zip(normed_weights, tensors): 122 | pieces.append(weight * tensor) 123 | return gamma * sum(pieces) 124 | 125 | else: 126 | mask_float = mask.float() 127 | broadcast_mask = mask_float.unsqueeze(-1) 128 | input_dim = tensors[0].size(-1) 129 | num_elements_not_masked = torch.sum(mask_float) * input_dim 130 | 131 | pieces = [] 132 | for weight, tensor in zip(normed_weights, tensors): 133 | pieces.append( 134 | weight 135 | * _layer_norm(tensor, broadcast_mask, num_elements_not_masked) 136 | ) 137 | return gamma * sum(pieces) 138 | -------------------------------------------------------------------------------- /comet/cli/score.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Command for scoring MT systems. 16 | =============================== 17 | 18 | optional arguments: 19 | -h, --help Show this help message and exit. 20 | -s SOURCES, --sources SOURCES 21 | (required, type: Path_fr) 22 | -t TRANSLATIONS, --translations TRANSLATIONS 23 | (required, type: Path_fr) 24 | -r REFERENCES, --references REFERENCES 25 | (required, type: Path_fr) 26 | --to_json TO_JSON (type: Union[bool, str], default: False) 27 | --model MODEL (type: Union[str, Path_fr], default: wmt21-large-estimator) 28 | --batch_size BATCH_SIZE 29 | (type: int, default: 32) 30 | --gpus GPUS (type: int, default: 1) 31 | 32 | """ 33 | import json 34 | from typing import Union 35 | 36 | from comet.download_utils import download_model 37 | from comet.models import available_metrics, load_from_checkpoint 38 | from comet.modules import HeteroscedasticLoss, HeteroApproxLoss 39 | from jsonargparse import ArgumentParser 40 | from jsonargparse.typing import Path_fr 41 | from pytorch_lightning import seed_everything 42 | 43 | 44 | def score_command() -> None: 45 | parser = ArgumentParser(description="Command for scoring MT systems.") 46 | parser.add_argument("-s", "--sources", type=Path_fr, required=True) 47 | parser.add_argument("-t", "--translations", type=Path_fr, required=True) 48 | parser.add_argument("-r", "--references", type=Path_fr) 49 | parser.add_argument("-f", "--features", type=Path_fr, help="Path to additional features for predictor (optional)") 50 | parser.add_argument("--batch_size", type=int, default=8) 51 | parser.add_argument("--gpus", type=int, default=1) 52 | parser.add_argument( 53 | "--to_json", 54 | type=Union[bool, str], 55 | default=False, 56 | help="Exports results to a json file.", 57 | ) 58 | parser.add_argument( 59 | "--model", 60 | type=Union[str, Path_fr], 61 | required=False, 62 | default="wmt20-comet-da", 63 | #choices=available_metrics.keys(), 64 | help="COMET model to be used.", 65 | ) 66 | parser.add_argument( 67 | "--mc_dropout", 68 | type=Union[bool, int], 69 | default=False, 70 | 
help="Number of inference runs for each sample in MC Dropout.", 71 | ) 72 | parser.add_argument( 73 | "--refless", 74 | type=bool, 75 | default=False, 76 | help="flag for heteroschedastic loss", 77 | ) 78 | parser.add_argument( 79 | "--seed_everything", 80 | help="Prediction seed.", 81 | type=int, 82 | default=12, 83 | ) 84 | cfg = parser.parse_args() 85 | seed_everything(cfg.seed_everything) 86 | 87 | if (cfg.references is None) and ("refless" not in cfg.model) and (not cfg.refless): 88 | parser.error("{} requires -r/--references.".format(cfg.model)) 89 | 90 | model_path = ( 91 | download_model(cfg.model) if cfg.model in available_metrics else cfg.model 92 | ) 93 | model = load_from_checkpoint(model_path) 94 | model.eval() 95 | 96 | with open(cfg.sources()) as fp: 97 | sources = [line.strip() for line in fp.readlines()] 98 | 99 | with open(cfg.translations()) as fp: 100 | translations = [line.strip() for line in fp.readlines()] 101 | 102 | if cfg.features is not None : 103 | with open(cfg.features()) as fp: 104 | features = [(line.strip().split(',')) for line in fp.readlines()] 105 | features = list(map(list, zip(*features))) 106 | features = [[float(i) for i in f] for f in features] 107 | 108 | 109 | 110 | if "refless" in cfg.model or cfg.refless: 111 | if cfg.features is not None : 112 | data = {"src": sources, "mt": translations} 113 | for i,f in enumerate(features): 114 | data['f'+str(i+1)]=f 115 | else: 116 | data = {"src": sources, "mt": translations} 117 | else: 118 | with open(cfg.references()) as fp: 119 | references = [line.strip() for line in fp.readlines()] 120 | if cfg.features is not None : 121 | data = {"src": sources, "mt": translations, "ref": references} 122 | for i,f in enumerate(features): 123 | data['f'+str(i+1)]=f 124 | else: 125 | data = {"src": sources, "mt": translations, "ref": references} 126 | 127 | data = [dict(zip(data, t)) for t in zip(*data.values())] 128 | if cfg.mc_dropout: 129 | if isinstance(model.loss, HeteroscedasticLoss): 130 | mean_scores, std_scores, hts_mean, hts_std, sys_score = model.predict( 131 | data, cfg.batch_size, cfg.gpus, cfg.mc_dropout) 132 | else: 133 | mean_scores, std_scores, sys_score = model.predict( 134 | data, cfg.batch_size, cfg.gpus, cfg.mc_dropout) 135 | for i, (mean, std, sample) in enumerate(zip(mean_scores, std_scores, data)): 136 | print("Segment {}\tscore: {:.4f}\tvariance: {:.4f}".format(i, mean, std)) 137 | sample["COMET score"] = mean 138 | sample["COMET variance"] = std 139 | if isinstance(model.loss, HeteroscedasticLoss): 140 | sample["Heteroscedastic score"] = hts_mean 141 | sample["Heteroscedastic variance"] = hts_std 142 | 143 | print("System score: {:.4f}".format(sys_score)) 144 | if isinstance(cfg.to_json, str): 145 | with open(cfg.to_json, "w") as outfile: 146 | json.dump(data, outfile, ensure_ascii=False, indent=4) 147 | print("Predictions saved in: {}.".format(cfg.to_json)) 148 | 149 | else: 150 | if isinstance(model.loss, HeteroscedasticLoss): 151 | predictions, hts, sys_score = model.predict(data, cfg.batch_size, cfg.gpus) 152 | else: 153 | predictions, sys_score = model.predict(data, cfg.batch_size, cfg.gpus) 154 | for i, (score, sample) in enumerate(zip(predictions, data)): 155 | print("Segment {}\tscore: {:.4f}".format(i, score)) 156 | sample["COMET score"] = score 157 | if isinstance(model.loss, HeteroscedasticLoss): 158 | sample["Heteroscedastic score"] = hts[i] 159 | 160 | print("System score: {:.4f}".format(sys_score)) 161 | if isinstance(cfg.to_json, str): 162 | with open(cfg.to_json, "w") as 
outfile: 163 | json.dump(data, outfile, ensure_ascii=False, indent=4) 164 | print("Predictions saved in: {}.".format(cfg.to_json)) 165 | -------------------------------------------------------------------------------- /comet/cli/compare.py: -------------------------------------------------------------------------------- 1 | # Copyright (C) 2020 Unbabel 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | """ 15 | Command for comparing two MT systems. 16 | ====================================== 17 | 18 | optional arguments: 19 | -h, --help Show this help message and exit. 20 | -s SOURCES, --sources SOURCES 21 | (required, type: Path_fr) 22 | -x SYSTEM_X, --system_x SYSTEM_X 23 | (required, type: Path_fr) 24 | -y SYSTEM_Y, --system_y SYSTEM_Y 25 | (required, type: Path_fr) 26 | -r REFERENCES, --references REFERENCES 27 | (type: Path_fr, default: null) 28 | --batch_size BATCH_SIZE 29 | (type: int, default: 8) 30 | --gpus GPUS (type: int, default: 1) 31 | --num_splits NUM_SPLITS 32 | Number of random partitions used in Bootstrap resampling. (type: int, default: 300) 33 | --sample_ratio SAMPLE_RATIO 34 | Percentage of the testset to use in each bootstrap resampling partition. (type: float, default: 0.4) 35 | --to_json TO_JSON Exports results to a json file. (type: Union[bool, str], default: False) 36 | --model {emnlp20-comet-rank,wmt20-comet-da,wmt20-comet-qe-da,wmt21-cometinho-da} 37 | COMET model to be used. (type: Union[str, Path_fr], default: wmt20-comet-da) 38 | --seed_everything SEED_EVERYTHING 39 | Prediction seed. 
(type: int, default: 12) 40 | 41 | """ 42 | 43 | import json 44 | from typing import Union 45 | 46 | import numpy as np 47 | from comet.download_utils import download_model 48 | from comet.models import available_metrics, load_from_checkpoint 49 | from jsonargparse import ArgumentParser 50 | from jsonargparse.typing import Path_fr 51 | from pytorch_lightning import seed_everything 52 | 53 | 54 | def compare_command() -> None: 55 | parser = ArgumentParser(description="Command for comparing two MT systems.") 56 | parser.add_argument("-s", "--sources", type=Path_fr, required=True) 57 | parser.add_argument("-x", "--system_x", type=Path_fr, required=True) 58 | parser.add_argument("-y", "--system_y", type=Path_fr, required=True) 59 | parser.add_argument("-r", "--references", type=Path_fr) 60 | parser.add_argument("--batch_size", type=int, default=8) 61 | parser.add_argument("--gpus", type=int, default=1) 62 | parser.add_argument( 63 | "--num_splits", 64 | type=int, 65 | default=300, 66 | help="Number of random partitions used in Bootstrap resampling.", 67 | ) 68 | parser.add_argument( 69 | "--sample_ratio", 70 | type=float, 71 | default=0.4, 72 | help="Percentage of the testset to use in each bootstrap resampling partition.", 73 | ) 74 | parser.add_argument( 75 | "--to_json", 76 | type=Union[bool, str], 77 | default=False, 78 | help="Exports results to a json file.", 79 | ) 80 | parser.add_argument( 81 | "--model", 82 | type=Union[str, Path_fr], 83 | required=False, 84 | default="wmt20-comet-da", 85 | choices=available_metrics.keys(), 86 | help="COMET model to be used.", 87 | ) 88 | parser.add_argument( 89 | "--seed_everything", 90 | help="Prediction seed.", 91 | type=int, 92 | default=12, 93 | ) 94 | cfg = parser.parse_args() 95 | seed_everything(cfg.seed_everything) 96 | 97 | if (cfg.references is None) and ("refless" not in cfg.model): 98 | parser.error("{} requires -r/--references.".format(cfg.model)) 99 | 100 | model_path = ( 101 | download_model(cfg.model) if cfg.model in available_metrics else cfg.model 102 | ) 103 | model = load_from_checkpoint(model_path) 104 | model.eval() 105 | 106 | with open(cfg.sources()) as fp: 107 | sources = [line.strip() for line in fp.readlines()] 108 | 109 | with open(cfg.system_x()) as fp: 110 | system_x = [line.strip() for line in fp.readlines()] 111 | 112 | with open(cfg.system_y()) as fp: 113 | system_y = [line.strip() for line in fp.readlines()] 114 | 115 | if "refless" in cfg.model: 116 | system_x = {"src": sources, "mt": system_x} 117 | system_y = {"src": sources, "mt": system_y} 118 | else: 119 | with open(cfg.references()) as fp: 120 | references = [line.strip() for line in fp.readlines()] 121 | system_x = {"src": sources, "mt": system_x, "ref": references} 122 | system_y = {"src": sources, "mt": system_y, "ref": references} 123 | 124 | system_x = [dict(zip(system_x, t)) for t in zip(*system_x.values())] 125 | system_y = [dict(zip(system_y, t)) for t in zip(*system_y.values())] 126 | 127 | x_seg_scores, _ = model.predict(system_x, cfg.batch_size, cfg.gpus) 128 | y_seg_scores, _ = model.predict(system_y, cfg.batch_size, cfg.gpus) 129 | 130 | data = [] 131 | for i, (x_score, y_score) in enumerate(zip(x_seg_scores, y_seg_scores)): 132 | print( 133 | "Segment {}\tsystem_x score: {:.4f}\tsystem_y score: {:.4f}".format( 134 | i, x_score, y_score 135 | ) 136 | ) 137 | data.append( 138 | { 139 | "src": system_x[0]["src"], 140 | "system_x": {"mt": system_x[0]["mt"], "score": x_score}, 141 | "system_y": {"mt": system_y[0]["mt"], "score": y_score}, 142 | "ref": 
system_y[0]["ref"], 143 | } 144 | ) 145 | 146 | n = len(sources) 147 | ids = list(range(n)) 148 | sample_size = max(int(n * cfg.sample_ratio), 1) 149 | 150 | x_sys_scores, y_sys_scores = [], [] 151 | win_count = [0, 0, 0] 152 | for _ in range(cfg.num_splits): 153 | # Subsample the gold and system outputs (with replacement) 154 | subsample_ids = np.random.choice(ids, size=sample_size, replace=True) 155 | subsample_x_scr = sum([x_seg_scores[i] for i in subsample_ids]) / sample_size 156 | subsample_y_scr = sum([y_seg_scores[i] for i in subsample_ids]) / sample_size 157 | 158 | if subsample_x_scr > subsample_y_scr: 159 | win_count[0] += 1 160 | elif subsample_y_scr > subsample_x_scr: 161 | win_count[1] += 1 162 | else: 163 | win_count[2] += 1 164 | 165 | x_sys_scores.append(subsample_x_scr) 166 | y_sys_scores.append(subsample_y_scr) 167 | 168 | data.insert( 169 | 0, 170 | { 171 | "x-mean": np.mean(np.array(x_sys_scores)), 172 | "y-mean": np.mean(np.array(y_sys_scores)), 173 | "ties (%)": win_count[2] / sum(win_count), 174 | "x_wins (%)": win_count[0] / sum(win_count), 175 | "y_wins (%)": win_count[1] / sum(win_count), 176 | }, 177 | ) 178 | for k, v in data[0].items(): 179 | print("{}:\t{:.4f}".format(k, v)) 180 | 181 | if isinstance(cfg.to_json, str): 182 | with open(cfg.to_json, "w") as outfile: 183 | json.dump(data, outfile, ensure_ascii=False, indent=4) 184 | print("Predictions saved in: {}.".format(cfg.to_json)) 185 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # uncertainties_MT_eval 2 | Code and data for the paper: [Disentangling Uncertainty in Machine Translation Evaluation](https://arxiv.org/pdf/2204.06546.pdf) 3 | 4 | 5 | ## Quick Installation 6 | 7 | We are using Python 3.8. 8 | 9 | Detailed usage examples and instructions for the COMET metric can be found in the [Full Documentation](https://unbabel.github.io/COMET/html/index.html). 10 | 11 | To develop locally: 12 | ```bash 13 | git clone https://github.com/deep-spin/uncertainties_MT_eval.git 14 | pip install -r requirements.txt 15 | pip install -e . 16 | ``` 17 | 18 | ## TL;DR 19 | 20 | This repository is an extension of the original COMET metric, providing different options to enhance it with uncertainty predictors. It includes code for **heteroscedastic losses (HTS and KL)**, as well as the option to use the same architecture for **direct uncertainty prediction (DUP)**. 21 | We used COMET v1.0 as the basis for this extension. 22 | 23 | ## Important commands 24 | 25 | - To train a new metric use: 26 | 27 | ```bash 28 | comet-train --cfg configs/models/model_config.yaml 29 | ``` 30 | 31 | - To score a triplet of a source file, translation file and reference file with a trained metric and obtain predictions use: 32 | 33 | ```bash 34 | comet-score --model <path/to/checkpoint> -s src.txt -t mt.txt -r ref.txt 35 | ``` 36 | 37 | ## Description of configurations and command options 38 | 39 | ### COMET configuration 40 | To train a plain COMET model on your data without using the uncertainty-related code, use the configuration file: 41 | [uncertainties_MT_eval/configs/models/regression_metric_comet_plain.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_plain.yaml) 42 | 43 | This model will use an MSE loss and will produce a single output for each segment, corresponding to the predicted **quality score**.
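For orientation, the sketch below shows roughly what such a training configuration could contain. It follows the `class_path`/`init_args` layout used by the other YAML files in `configs/` and the `regression_metric` section expected by `comet-train`; the field names mirror the `RegressionMetric` constructor, but the concrete values are illustrative assumptions rather than the contents of `regression_metric_comet_plain.yaml` (a full config additionally carries sections such as `trainer`, `early_stopping` and `model_checkpoint`):

```yaml
regression_metric:
  class_path: comet.models.RegressionMetric
  init_args:
    pretrained_model: xlm-roberta-base
    pool: avg
    layer: mix
    loss: mse           # plain COMET: a single quality score per segment
    feature_size: 0     # no extra input features, so no bottleneck layer is used
    batch_size: 4
    train_data: path/to/train.csv        # csv with columns: src, mt, ref, score
    validation_data: path/to/dev.csv
```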
44 | 45 | ### COMET with MC Dropout configuration 46 | 47 | Once you have trained a COMET model (of any kind) you can apply MC Dropout during inference by passing the ```--mc_dropout``` option to ```comet-score``` with the desired number *N* of stochastic forward runs, as follows: 48 | 49 | ```bash 50 | comet-score --model <path/to/checkpoint> -s src.txt -t mt.txt -r ref.txt --mc_dropout N 51 | ``` 52 | 53 | 54 | This option can be used with models trained using any of the three loss options: hts, kl, mse. 55 | 56 | If the option is used with a model trained with the MSE loss, then the model will produce a second output for each segment, corresponding to the variance/uncertainty of that segment's quality score prediction. 57 | 58 | If the option is used in combination with either of the two heteroscedastic losses, the model will generate four outputs for each segment in total: 59 | 1. The predicted quality score 60 | 2. The estimated variance for the quality score 61 | 3. The predicted aleatoric uncertainty 62 | 4. The estimated variance of the aleatoric uncertainty 63 | 64 | Then the total uncertainty value for the segment can be calculated as indicated in Eq. 4 in the paper. 65 | 66 | 67 | >Note that we used N=100 for all experiments in the paper. To reproduce other related works, this number might have to be reduced. 68 | 69 | ### COMET with aleatoric uncertainty predictions 70 | 71 | There are two options to train COMET with aleatoric uncertainty prediction. 72 | 73 | 1. Heteroscedastic uncertainty (HTS), which can be used with any labelled dataset. It only requires setting the loss to "hts" in the configuration file; see [uncertainties_MT_eval/configs/models/regression_metric_comet_heteroscedastic.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_heteroscedastic.yaml) as an example. 74 | 75 | 2. KL-divergence minimisation based uncertainty (KL). Training a model with the KL setup requires access to labelled data with multiple annotators per segment, providing either (a) multiple human judgements per segment, or (b) the standard deviation of the annotator scores per segment. See file [uncertainties_MT_eval/data/mqm2020/mqm.train.z_score.csv](uncertainties_MT_eval/data/mqm2020/mqm.train.z_score.csv) as an example. 76 | To train a model on this data, set the loss to "kl" in the configuration file. See [uncertainties_MT_eval/configs/models/regression_metric_comet_kl.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_kl.yaml) 77 | 78 | 79 | ### COMET-based direct uncertainty prediction (COMET-DUP) 80 | 81 | It is possible to train a COMET model to predict the uncertainty of a given prediction (casting uncertainty as the error/distance to the human judgement), henceforth referred to as COMET-DUP. 82 | 83 | #### **Training Setup:** 84 | 85 | To train a COMET-DUP model it is necessary to: 86 | 87 | - Have access to human judgements $q^*$ on a train dataset $\mathcal{D}$ 88 | - Run an MT Evaluation or MT Quality Estimation model to obtain quality predictions $\hat{q}$ over $\mathcal{D}$ 89 | - Calculate $\epsilon = |q^*-\hat{q}|$ for $\mathcal{D}$ 90 | - Use $\epsilon$ as the target for the uncertainty-predicting COMET, instead of the human quality judgements, which are the default target 91 | 92 | Provide the training data in a csv file using a column **f1** that holds the values for the predicted quality scores $\hat{q}$ and a column **score** that contains the computed $\epsilon$ (target) for each instance; a minimal preparation sketch is shown below.
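As a concrete illustration of the recipe above, here is a minimal, hypothetical sketch that builds such a csv with pandas. The input file name and the column names holding the human judgements and the base model's predictions are assumptions made for the example; only the resulting `src`, `mt`, `ref`, `f1` and `score` columns are what the COMET-DUP data loader expects:

```python
import pandas as pd

# Assumed input: a csv with src/mt/ref plus a human judgement ("human_score", q*)
# and a base metric's quality prediction ("predicted_score", q_hat) per segment.
df = pd.read_csv("annotated_with_predictions.csv")

# The base prediction q_hat is exposed to the model as feature "f1" ...
df["f1"] = df["predicted_score"]

# ... while epsilon = |q* - q_hat| becomes the regression target ("score").
df["score"] = (df["human_score"] - df["predicted_score"]).abs()

df[["src", "mt", "ref", "f1", "score"]].to_csv("dup_train.csv", index=False)
```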
93 | 94 | #### **Losses** 95 | 96 | Once $\epsilon$ has been calculated, three different losses can be used for COMET-DUP training: 97 | 98 | 1. Typical MSE loss: $\mathcal{L}^\mathrm{E}_{\mathrm{ABS}}(\hat{\epsilon}; \epsilon^*) = (\epsilon^* - \hat{\epsilon})^2$\ 99 | Specify loss: "mse" in the yaml configuration file to use it 100 | 2. MSE loss with squared values: 101 | $\mathcal{L}^\mathrm{E}_{\mathrm{SQ}}(\hat{\epsilon}; \epsilon^*) = ((\epsilon^*)^2 - \hat{\epsilon}^2)^2$ 102 | Specify loss: "squared" in the yaml configuration file to use it 103 | 3. Heteroscedastic approximation loss: 104 | $\mathcal{L}^\mathrm{E}_{\mathrm{HTS}}(\hat{\epsilon}; \epsilon^*) = \frac{(\epsilon^*)^2}{2 \hat{\epsilon}^2} + \frac{1}{2}\log(\hat{\epsilon}^2)$ 105 | Specify loss: "hts_approx" in the yaml configuration file to use it 106 | 107 | #### **Bottleneck**: 108 | Unlike COMET, COMET-DUP uses a bottleneck layer to compress the sentence representations before the initial quality predictions $\hat{q}$ are concatenated as additional input features. You need to specify the size of the bottleneck layer in the configuration file. 109 | Recommended value: 256 110 | 111 | 112 | #### **Full Train Configuration**: 113 | For an example of a configuration file to train COMET-DUP with $\mathcal{L}^\mathrm{E}_{\mathrm{HTS}}$ see the file [uncertainties_MT_eval/configs/models/regression_metric_comet_dup.yaml](../uncertainties_MT_eval/configs/models/regression_metric_comet_dup.yaml) 114 | 115 | 116 | #### **Inference** 117 | 118 | For inference with COMET-DUP use the same inference command (`comet-score`) used for the other COMET models, providing a trained COMET-DUP model in the `--model` option; an example invocation is sketched below. Remember that the output in this case will be uncertainty scores instead of quality scores. 119 | 120 |
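A hypothetical invocation could look as follows; the checkpoint path and `features.txt` are placeholders. The `-f/--features` option of `comet-score` expects one line per segment with comma-separated feature values, which for a COMET-DUP model trained with a single extra feature is just the base metric's quality prediction **f1**:

```bash
comet-score --model path/to/dup_checkpoint.ckpt \
    -s src.txt -t mt.txt -r ref.txt \
    -f features.txt
```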
121 |
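As a rough reference for the three objectives listed under **Losses** above, the following PyTorch sketch spells the formulas out directly. It is only an illustration of the equations; the implementations actually used during training live in `comet/modules/losses.py` and may differ in details such as reduction or numerical safeguards:

```python
import torch

def dup_mse(eps_hat: torch.Tensor, eps_star: torch.Tensor) -> torch.Tensor:
    # loss: "mse"  ->  (eps* - eps_hat)^2
    return ((eps_star - eps_hat) ** 2).mean()

def dup_squared(eps_hat: torch.Tensor, eps_star: torch.Tensor) -> torch.Tensor:
    # loss: "squared"  ->  ((eps*)^2 - eps_hat^2)^2
    return ((eps_star ** 2 - eps_hat ** 2) ** 2).mean()

def dup_hts_approx(eps_hat: torch.Tensor, eps_star: torch.Tensor) -> torch.Tensor:
    # loss: "hts_approx"  ->  (eps*)^2 / (2 * eps_hat^2) + 0.5 * log(eps_hat^2)
    return (eps_star ** 2 / (2 * eps_hat ** 2) + 0.5 * torch.log(eps_hat ** 2)).mean()
```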
122 | 123 | *** 124 | 125 | ## Related Publications 126 | 127 | - [Better Uncertainty Quantification for Machine Translation Evaluation](https://arxiv.org/pdf/2204.06546.pdf) 128 | 129 | - [Uncertainty-Aware Machine Translation Evaluation](https://aclanthology.org/2021.findings-emnlp.330/) 130 | 131 | - [IST-Unbabel 2021 Submission for the Quality Estimation Shared Task](https://aclanthology.org/2021.wmt-1.102/) 132 | 133 | - [Are References Really Needed? Unbabel-IST 2021 Submission for the Metrics Shared Task](http://statmt.org/wmt21/pdf/2021.wmt-1.111.pdf) 134 | 135 | - [COMET - Deploying a New State-of-the-art MT Evaluation Metric in Production](https://www.aclweb.org/anthology/2020.amta-user.4) 136 | 137 | - [Unbabel's Participation in the WMT20 Metrics Shared Task](https://aclanthology.org/2020.wmt-1.101/) 138 | 139 | - [COMET: A Neural Framework for MT Evaluation](https://www.aclweb.org/anthology/2020.emnlp-main.213) 140 | -------------------------------------------------------------------------------- /comet/download_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | import logging 17 | import os 18 | import subprocess 19 | import urllib.request 20 | import zipfile 21 | from typing import List 22 | from urllib.parse import urlparse 23 | 24 | from tqdm import tqdm 25 | 26 | from comet.models import available_metrics 27 | 28 | logger = logging.getLogger(__name__) 29 | 30 | 31 | def get_cache_folder(): 32 | if "HOME" in os.environ: 33 | cache_directory = os.environ["HOME"] + "/.cache/torch/unbabel_comet/" 34 | if not os.path.exists(cache_directory): 35 | os.makedirs(cache_directory) 36 | return cache_directory 37 | else: 38 | raise Exception("HOME environment variable is not defined.") 39 | 40 | 41 | def _reporthook(t): 42 | """``reporthook`` to use with ``urllib.request`` that prints the 43 | process of the download. 44 | 45 | Uses ``tqdm`` for progress bar. 46 | 47 | **Reference:** 48 | https://github.com/tqdm/tqdm 49 | 50 | """ 51 | last_b = [0] 52 | 53 | def inner(b: int = 1, bsize: int = 1, tsize: int = None): 54 | """ 55 | :param b: Number of blocks just transferred [default: 1]. 56 | :param bsize: Size of each block (in tqdm units) [default: 1]. 57 | :param tsize: Total size (in tqdm units). 58 | If [default: None] remains unchanged. 59 | """ 60 | if tsize is not None: 61 | t.total = tsize 62 | t.update((b - last_b[0]) * bsize) 63 | last_b[0] = b 64 | 65 | return inner 66 | 67 | 68 | def _maybe_extract(compressed_filename: str, directory: str, extension: str = None): 69 | """Extract a compressed file to ``directory``. 70 | 71 | :param compressed_filename: Compressed file. 72 | :param directory: Extract to directory. 73 | :param extension: Extension of the file; Otherwise, attempts to 74 | extract extension from the filename. 
75 | """ 76 | logger.info("Extracting {}".format(compressed_filename)) 77 | 78 | if extension is None: 79 | basename = os.path.basename(compressed_filename) 80 | extension = basename.split(".", 1)[1] 81 | 82 | if "zip" in extension: 83 | with zipfile.ZipFile(compressed_filename, "r") as zip_: 84 | zip_.extractall(directory) 85 | 86 | elif "tar.gz" in extension or "tgz" in extension: 87 | # `tar` is much faster than python's `tarfile` implementation 88 | with open(os.devnull, "w") as devnull: 89 | subprocess.call( 90 | ["tar", "-C", directory, "-zxvf", compressed_filename], stdout=devnull 91 | ) 92 | 93 | elif "tar" in extension: 94 | with open(os.devnull, "w") as devnull: 95 | subprocess.call( 96 | ["tar", "-C", directory, "-xvf", compressed_filename], stdout=devnull 97 | ) 98 | 99 | logger.info("Extracted {}".format(compressed_filename)) 100 | 101 | 102 | def _get_filename_from_url(url): 103 | """Return a filename from a URL 104 | 105 | Args: 106 | url (str): URL to extract filename from 107 | 108 | Returns: 109 | (str): Filename in URL 110 | """ 111 | parse = urlparse(url) 112 | return os.path.basename(parse.path) 113 | 114 | 115 | def _check_download(*filepaths): 116 | """Check if the downloaded files are found. 117 | 118 | Args: 119 | filepaths (list of str): Check if these filepaths exist 120 | 121 | Returns: 122 | (bool): Returns True if all filepaths exist 123 | """ 124 | return all([os.path.isfile(filepath) for filepath in filepaths]) 125 | 126 | 127 | def download_file_maybe_extract( 128 | url: str, 129 | directory: str, 130 | filename: str = None, 131 | extension: str = None, 132 | check_files: List[str] = [], 133 | ): 134 | """Download the file at ``url`` to ``directory``. 135 | Extract to ``directory`` if tar or zip. 136 | 137 | :param url: Url of file (str or Path). 138 | :param directory: Directory to download to. 139 | :param filename: Name of the file to download; Otherwise, a filename is extracted 140 | from the url. 141 | :param extension: Extension of the file; Otherwise, attempts to extract extension 142 | from the filename. 143 | :param check_files: Check if these files exist, ensuring the download 144 | succeeded. If these files exist before the download, the download is skipped. 145 | 146 | :return: Filename of download file. 147 | """ 148 | if filename is None: 149 | filename = _get_filename_from_url(url) 150 | 151 | directory = str(directory) 152 | filepath = os.path.join(directory, filename) 153 | check_files = [os.path.join(directory, str(f)) for f in check_files] 154 | 155 | if len(check_files) > 0 and _check_download(*check_files): 156 | return filepath 157 | 158 | if not os.path.isdir(directory): 159 | os.makedirs(directory) 160 | 161 | logger.info("Downloading {}".format(filename)) 162 | 163 | # Download 164 | with tqdm(unit="B", unit_scale=True, miniters=1, desc=filename) as t: 165 | urllib.request.urlretrieve(url, filename=filepath, reporthook=_reporthook(t)) 166 | 167 | _maybe_extract( 168 | compressed_filename=filepath, directory=directory, extension=extension 169 | ) 170 | 171 | if not _check_download(*check_files): 172 | raise ValueError("[DOWNLOAD FAILED] `*check_files` not found") 173 | 174 | return filepath 175 | 176 | 177 | def download_model(model: str, saving_directory: str = None) -> str: 178 | """ 179 | Function that loads pretrained models from AWS. 180 | 181 | :param model: Name of the model to be loaded. 182 | :param saving_directory: RELATIVE path to the saving folder (must end with /). 183 | 184 | Return: 185 | - Path to model checkpoint. 
186 | """ 187 | 188 | if saving_directory is None: 189 | saving_directory = get_cache_folder() 190 | 191 | if not saving_directory.endswith("/"): 192 | saving_directory += "/" 193 | 194 | if not os.path.exists(saving_directory): 195 | os.makedirs(saving_directory) 196 | 197 | if os.path.isdir(saving_directory + model): 198 | logger.info(f"{model} is already in cache.") 199 | if not model.endswith("/"): 200 | model += "/" 201 | 202 | elif model not in available_metrics.keys(): 203 | raise Exception( 204 | f"{model} is not in the `availale_metrics` or is a valid checkpoint folder." 205 | ) 206 | 207 | elif available_metrics[model].startswith("https://"): 208 | download_file_maybe_extract( 209 | available_metrics[model], directory=saving_directory 210 | ) 211 | 212 | else: 213 | raise Exception("Invalid model name!") 214 | 215 | # CLEAN Cache 216 | if os.path.exists(saving_directory + model + ".zip"): 217 | os.remove(saving_directory + model + ".zip") 218 | if os.path.exists(saving_directory + model + ".tar.gz"): 219 | os.remove(saving_directory + model + ".tar.gz") 220 | if os.path.exists(saving_directory + model + ".tar"): 221 | os.remove(saving_directory + model + ".tar") 222 | 223 | checkpoints_folder = saving_directory + model + "/checkpoints" 224 | checkpoints = [ 225 | file for file in os.listdir(checkpoints_folder) if file.endswith(".ckpt") 226 | ] 227 | checkpoint = checkpoints[-1] 228 | checkpoint_path = checkpoints_folder + "/" + checkpoint 229 | return checkpoint_path 230 | -------------------------------------------------------------------------------- /comet/models/regression/referenceless.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | ReferencelessRegression 18 | ======================== 19 | Referenceless Regression Metric that learns to predict a quality assessment by 20 | looking at source and translation. 21 | """ 22 | from typing import Dict, List, Optional, Tuple, Union 23 | 24 | import pandas as pd 25 | import torch 26 | from comet.models.regression.regression_metric import RegressionMetric 27 | from comet.modules import FeedForward, Bottleneck 28 | 29 | 30 | class ReferencelessRegression(RegressionMetric): 31 | """ReferencelessRegression: 32 | 33 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 34 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 35 | :param keep_encoder_frozen: freezes entire encoder. 36 | :param optimizer: Optimizer used during training. 37 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 38 | :param learning_rate: Learning rate used to fine-tune the top layers. 39 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 40 | :param encoder_model: Encoder model to be used. 
41 | :param pretrained_model: Pretrained model from Hugging Face. 42 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 43 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 44 | :param dropout: Dropout used in the top-layers. 45 | :param batch_size: Batch size used during training. 46 | :param train_data: Path to a csv file containing the training data. 47 | :param validation_data: Path to a csv file containing the validation data. 48 | :param hidden_sizes: Hidden sizes for the Feed Forward regression. 49 | :param activations: Feed Forward activation function. 50 | :param load_weights_from_checkpoint: Path to a checkpoint file. 51 | """ 52 | 53 | def __init__( 54 | self, 55 | nr_frozen_epochs: Union[float, int] = 0.3, 56 | keep_embeddings_frozen: bool = False, 57 | keep_encoder_frozen: bool = False, 58 | optimizer: str = "AdamW", 59 | encoder_learning_rate: float = 1e-05, 60 | learning_rate: float = 3e-05, 61 | layerwise_decay: float = 0.95, 62 | encoder_model: str = "XLM-RoBERTa", 63 | pretrained_model: str = "xlm-roberta-base", 64 | pool: str = "avg", 65 | layer: Union[str, int] = "mix", 66 | dropout: float = 0.1, 67 | batch_size: int = 4, 68 | train_data: Optional[str] = None, 69 | validation_data: Optional[str] = None, 70 | hidden_sizes_bottleneck: List[int] = [1536, 256], 71 | hidden_sizes: List[int] = [768], 72 | activations: str = "Tanh", 73 | final_activation: Optional[str] = None, 74 | load_weights_from_checkpoint: Optional[str] = None, 75 | loss: Optional[str]="mse", 76 | data_portion: Optional[float]=1.0, 77 | feature_size: Optional[int] = 0 78 | ) -> None: 79 | super(RegressionMetric, self).__init__( 80 | nr_frozen_epochs, 81 | keep_embeddings_frozen, 82 | keep_encoder_frozen, 83 | optimizer, 84 | encoder_learning_rate, 85 | learning_rate, 86 | layerwise_decay, 87 | encoder_model, 88 | pretrained_model, 89 | pool, 90 | layer, 91 | dropout, 92 | batch_size, 93 | train_data, 94 | validation_data, 95 | load_weights_from_checkpoint, 96 | "referenceless_regression_metric", 97 | ) 98 | self.save_hyperparameters() 99 | 100 | self.bottleneck = Bottleneck( 101 | in_dim=self.encoder.output_units * 4 , 102 | hidden_sizes = self.hparams.hidden_sizes_bottleneck, 103 | activations=self.hparams.activations, 104 | dropout=self.hparams.dropout, 105 | ) 106 | 107 | self.estimator = FeedForward( 108 | in_dim=self.hparams.hidden_sizes_bottleneck[-1] + self.hparams.feature_size, 109 | out_dim = 2 if self.hparams.loss in ["var", "hts"] else 1, 110 | hidden_sizes=self.hparams.hidden_sizes, 111 | activations=self.hparams.activations, 112 | dropout=self.hparams.dropout, 113 | final_activation=self.hparams.final_activation, 114 | ) 115 | 116 | def prepare_sample( 117 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = False, data_portion: float = 1.0, 118 | ) -> Union[ 119 | Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor] 120 | ]: 121 | """ 122 | Function that prepares a sample to input the model. 123 | 124 | :param sample: list of dictionaries. 125 | :param inference: If set to true prepares only the model inputs. 126 | 127 | :returns: Tuple with 2 dictionaries (model inputs and targets). 128 | If `inference=True` returns only the model inputs. 
129 | """ 130 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 131 | src_inputs = self.encoder.prepare_sample(sample["src"]) 132 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 133 | 134 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 135 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 136 | if self.hparams.feature_size>0: 137 | feats = [] 138 | for feat in sample: 139 | if feat.startswith("f"): 140 | feats.append(sample[feat]) 141 | #print(len(feats)) 142 | feature_tensor = torch.as_tensor(feats, dtype=torch.float) 143 | #print(feature_tensor.shape) 144 | #print('------------------') 145 | features = {"custom_features": feature_tensor.T} 146 | 147 | 148 | else: 149 | features = {"custom_features": torch.Tensor()} 150 | 151 | inputs = {**src_inputs, **mt_inputs, **features} 152 | 153 | if inference: 154 | return inputs 155 | 156 | targets = {"score": torch.tensor(sample["score"], dtype=torch.float)} 157 | return inputs, targets 158 | 159 | def forward( 160 | self, 161 | src_input_ids: torch.tensor, 162 | src_attention_mask: torch.tensor, 163 | mt_input_ids: torch.tensor, 164 | mt_attention_mask: torch.tensor, 165 | custom_features: torch.tensor, 166 | **kwargs 167 | ) -> Dict[str, torch.Tensor]: 168 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 169 | mt_sentemb = self.get_sentence_embedding(mt_input_ids, mt_attention_mask) 170 | 171 | diff_src = torch.abs(mt_sentemb - src_sentemb) 172 | prod_src = mt_sentemb * src_sentemb 173 | 174 | embedded_sequences = torch.cat( 175 | (mt_sentemb, src_sentemb, prod_src, diff_src), dim=1 176 | ) 177 | 178 | if self.hparams.feature_size>0: 179 | #custom_features=torch.unsqueeze(custom_features, 1) 180 | #print(embedded_sequences.shape) 181 | #print(f1.shape) 182 | 183 | bottleneck = self.bottleneck(embedded_sequences) 184 | #print(bottleneck.shape) 185 | #print(custom_features.shape) 186 | seq_feats = torch.cat((bottleneck,custom_features),dim=1) 187 | #print(seq_feats.shape) 188 | 189 | score = self.estimator(seq_feats) 190 | else: 191 | bottleneck = self.bottleneck(embedded_sequences) 192 | score = self.estimator(bottleneck) 193 | if self.hparams.loss in ["var","hts"]: 194 | return {"score": score[:,0], "variance": score[:,1]} 195 | 196 | return {"score": score} 197 | 198 | def read_csv(self, path: str) -> List[dict]: 199 | """Reads a comma separated value file. 200 | 201 | :param path: path to a csv file. 202 | 203 | :return: List of records as dictionaries 204 | """ 205 | feats=[] 206 | df = pd.read_csv(path) 207 | flen = self.hparams.feature_size 208 | columns = ["src", "mt", "score"] 209 | for i in range(flen): 210 | fstring='f'+str(i+1) 211 | print('feature added: '+str(fstring)) 212 | columns.append(fstring) 213 | feats.append(fstring) 214 | df = df[columns] 215 | 216 | df["src"] = df["src"].astype(str) 217 | df["mt"] = df["mt"].astype(str) 218 | df["score"] = df["score"].astype(float) 219 | for feat in feats: 220 | df[feat] = df[feat].astype(float) 221 | return df.to_dict("records") 222 | -------------------------------------------------------------------------------- /comet/models/regression/regression_metric_hybrid.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | RegressionMetric 18 | ======================== 19 | Regression Metric that learns to predict a quality assessment by looking 20 | at source, translation and reference. 21 | """ 22 | from typing import Dict, List, Optional, Tuple, Union 23 | 24 | import pandas as pd 25 | import torch 26 | from comet.models.base import CometModel 27 | from comet.modules import FeedForward, Bottleneck 28 | from torchmetrics import MetricCollection, PearsonCorrcoef, SpearmanCorrcoef 29 | from transformers import AdamW 30 | import random 31 | 32 | class RegressionMetric(CometModel): 33 | """RegressionMetric: 34 | 35 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 36 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 37 | :param keep_encoder_frozen: freezes entire encoder. 38 | :param optimizer: Optimizer used during training. 39 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 40 | :param learning_rate: Learning rate used to fine-tune the top layers. 41 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 42 | :param encoder_model: Encoder model to be used. 43 | :param pretrained_model: Pretrained model from Hugging Face. 44 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 45 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 46 | :param dropout: Dropout used in the top-layers. 47 | :param batch_size: Batch size used during training. 48 | :param train_data: Path to a csv file containing the training data. 49 | :param validation_data: Path to a csv file containing the validation data. 50 | :param hidden_sizes: Hidden sizes for the Feed Forward regression. 51 | :param activations: Feed Forward activation function. 52 | :param load_weights_from_checkpoint: Path to a checkpoint file. 
53 | """ 54 | 55 | def __init__( 56 | self, 57 | nr_frozen_epochs: Union[float, int] = 0.3, 58 | keep_embeddings_frozen: bool = False, 59 | keep_encoder_frozen: bool = False, 60 | optimizer: str = "AdamW", 61 | encoder_learning_rate: float = 1e-05, 62 | learning_rate: float = 3e-05, 63 | layerwise_decay: float = 0.95, 64 | encoder_model: str = "XLM-RoBERTa", 65 | pretrained_model: str = "xlm-roberta-base", 66 | pool: str = "avg", 67 | layer: Union[str, int] = "mix", 68 | dropout: float = 0.1, 69 | batch_size: int = 4, 70 | train_data: Optional[str] = None, 71 | validation_data: Optional[str] = None, 72 | hidden_sizes_bottleneck: List[int] = [2304, 256], 73 | hidden_sizes: List[int] = [768], 74 | activations: str = "Tanh", 75 | final_activation: Optional[str] = None, 76 | load_weights_from_checkpoint: Optional[str] = None, 77 | loss: Optional[str]="mse", 78 | data_portion: Optional[float] = 1.0, 79 | feature_size: Optional[int] = 0 80 | ) -> None: 81 | super().__init__( 82 | nr_frozen_epochs, 83 | keep_embeddings_frozen, 84 | keep_encoder_frozen, 85 | optimizer, 86 | encoder_learning_rate, 87 | learning_rate, 88 | layerwise_decay, 89 | encoder_model, 90 | pretrained_model, 91 | pool, 92 | layer, 93 | dropout, 94 | batch_size, 95 | train_data, 96 | validation_data, 97 | load_weights_from_checkpoint, 98 | "regression_metric", 99 | ) 100 | self.save_hyperparameters() 101 | self.bottleneck = Bottleneck( 102 | in_dim=self.encoder.output_units * 6 , 103 | hidden_sizes = [self.hparams.hidden_sizes[0],self.hparams.hidden_sizes_bottleneck[-1]], 104 | activations=self.hparams.activations, 105 | dropout=self.hparams.dropout, 106 | ) 107 | 108 | self.estimator = FeedForward( 109 | in_dim=self.hparams.hidden_sizes_bottleneck[-1] + self.hparams.feature_size, 110 | out_dim = 2 if self.hparams.loss in ["var", "hts"] else 1, 111 | hidden_sizes=[self.hparams.hidden_sizes[-1]], 112 | activations=self.hparams.activations, 113 | dropout=self.hparams.dropout, 114 | final_activation=self.hparams.final_activation, 115 | ) 116 | 117 | def init_metrics(self): 118 | metrics = MetricCollection( 119 | {"spearman": SpearmanCorrcoef(), "pearson": PearsonCorrcoef()} 120 | ) 121 | self.train_metrics = metrics.clone(prefix="train_") 122 | self.val_metrics = metrics.clone(prefix="val_") 123 | 124 | def configure_optimizers( 125 | self, 126 | ) -> Tuple[List[torch.optim.Optimizer], List[torch.optim.lr_scheduler.LambdaLR]]: 127 | """Sets the optimizers to be used during training.""" 128 | layer_parameters = self.encoder.layerwise_lr( 129 | self.hparams.encoder_learning_rate, self.hparams.layerwise_decay 130 | ) 131 | top_layers_parameters = [ 132 | {"params": self.estimator.parameters() , "lr": self.hparams.learning_rate} 133 | ] 134 | bott_layers_parameters = [ 135 | {"params": self.bottleneck.parameters() , "lr": self.hparams.learning_rate} 136 | ] 137 | if self.layerwise_attention: 138 | layerwise_attn_params = [ 139 | { 140 | "params": self.layerwise_attention.parameters(), 141 | "lr": self.hparams.learning_rate, 142 | } 143 | ] 144 | params = layer_parameters + top_layers_parameters + bott_layers_parameters + layerwise_attn_params 145 | else: 146 | params = layer_parameters + top_layers_parameters + bott_layers_parameters 147 | 148 | optimizer = AdamW( 149 | params, 150 | lr=self.hparams.learning_rate, 151 | correct_bias=True, 152 | ) 153 | # scheduler = self._build_scheduler(optimizer) 154 | return [optimizer], [] 155 | 156 | def prepare_sample( 157 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = 
False, data_portion: float = 1.0, 158 | ) -> Union[ 159 | Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor] 160 | ]: 161 | """ 162 | Function that prepares a sample to input the model. 163 | 164 | :param sample: list of dictionaries. 165 | :param inference: If set to true prepares only the model inputs. 166 | 167 | :returns: Tuple with 2 dictionaries (model inputs and targets). 168 | If `inference=True` returns only the model inputs. 169 | """ 170 | #print(sample[0]) 171 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 172 | src_inputs = self.encoder.prepare_sample(sample["src"]) 173 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 174 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 175 | 176 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 177 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 178 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 179 | if self.hparams.feature_size>0: 180 | feats = [] 181 | for feat in sample: 182 | if feat.startswith("f"): 183 | feats.append(sample[feat]) 184 | #print(len(feats)) 185 | feature_tensor = torch.as_tensor(feats, dtype=torch.float) 186 | #print(feature_tensor.shape) 187 | #print('------------------') 188 | features = {"custom_features": feature_tensor.T} 189 | 190 | 191 | else: 192 | features = {"custom_features": torch.Tensor()} 193 | 194 | inputs = {**src_inputs, **mt_inputs, **ref_inputs, **features} 195 | if inference: 196 | return inputs 197 | 198 | targets = {"score": torch.tensor(sample["score"], dtype=torch.float)} 199 | return inputs, targets 200 | 201 | def forward( 202 | self, 203 | src_input_ids: torch.tensor, 204 | src_attention_mask: torch.tensor, 205 | mt_input_ids: torch.tensor, 206 | mt_attention_mask: torch.tensor, 207 | ref_input_ids: torch.tensor, 208 | ref_attention_mask: torch.tensor, 209 | custom_features: torch.tensor, 210 | **kwargs 211 | ) -> Dict[str, torch.Tensor]: 212 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 213 | mt_sentemb = self.get_sentence_embedding(mt_input_ids, mt_attention_mask) 214 | ref_sentemb = self.get_sentence_embedding(ref_input_ids, ref_attention_mask) 215 | 216 | diff_ref = torch.abs(mt_sentemb - ref_sentemb) 217 | diff_src = torch.abs(mt_sentemb - src_sentemb) 218 | 219 | prod_ref = mt_sentemb * ref_sentemb 220 | prod_src = mt_sentemb * src_sentemb 221 | 222 | embedded_sequences = torch.cat( 223 | (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src), 224 | dim=1, 225 | ) 226 | if self.hparams.feature_size>0: 227 | #custom_features=torch.unsqueeze(custom_features, 1) 228 | #print(embedded_sequences.shape) 229 | #print(f1.shape) 230 | 231 | bottleneck = self.bottleneck(embedded_sequences) 232 | #print(bottleneck.shape) 233 | #print(custom_features.shape) 234 | seq_feats = torch.cat((bottleneck,custom_features),dim=1) 235 | #print(seq_feats.shape) 236 | 237 | score = self.estimator(seq_feats) 238 | else: 239 | bottleneck = self.bottleneck(embedded_sequences) 240 | score = self.estimator(bottleneck) 241 | if self.hparams.loss in ["var","hts"]: 242 | return {"score": score[:,0], "variance": score[:,1]} 243 | 244 | return {"score": score} 245 | 246 | def read_csv(self, path: str) -> List[dict]: 247 | """Reads a comma separated value file. 248 | 249 | :param path: path to a csv file. 
250 | 251 | :return: List of records as dictionaries 252 | """ 253 | feats=[] 254 | df = pd.read_csv(path) 255 | flen = self.hparams.feature_size 256 | columns = ["src", "mt", "ref", "score"] 257 | for i in range(flen): 258 | fstring='f'+str(i+1) 259 | print('feature added: '+str(fstring)) 260 | columns.append(fstring) 261 | feats.append(fstring) 262 | df = df[columns] 263 | df["src"] = df["src"].astype(str) 264 | df["mt"] = df["mt"].astype(str) 265 | df["ref"] = df["ref"].astype(str) 266 | df["score"] = df["score"].astype(float) 267 | for feat in feats: 268 | df[feat] = df[feat].astype(float) 269 | return df.to_dict("records") 270 | -------------------------------------------------------------------------------- /comet/models/regression/regression_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | RegressionMetric 18 | ======================== 19 | Regression Metric that learns to predict a quality assessment by looking 20 | at source, translation and reference. 21 | """ 22 | from typing import Dict, List, Optional, Tuple, Union 23 | 24 | import pandas as pd 25 | import torch 26 | from comet.models.base import CometModel 27 | from comet.modules import FeedForward, Bottleneck 28 | from torchmetrics import MetricCollection, PearsonCorrcoef, SpearmanCorrcoef 29 | from transformers import AdamW 30 | import random 31 | 32 | class RegressionMetric(CometModel): 33 | """RegressionMetric: 34 | 35 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 36 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 37 | :param keep_encoder_frozen: freezes entire encoder. 38 | :param optimizer: Optimizer used during training. 39 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 40 | :param learning_rate: Learning rate used to fine-tune the top layers. 41 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 42 | :param encoder_model: Encoder model to be used. 43 | :param pretrained_model: Pretrained model from Hugging Face. 44 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 45 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 46 | :param dropout: Dropout used in the top-layers. 47 | :param batch_size: Batch size used during training. 48 | :param train_data: Path to a csv file containing the training data. 49 | :param validation_data: Path to a csv file containing the validation data. 50 | :param hidden_sizes: Hidden sizes for the Feed Forward regression. 51 | :param activations: Feed Forward activation function. 52 | :param load_weights_from_checkpoint: Path to a checkpoint file. 
53 | """ 54 | 55 | def __init__( 56 | self, 57 | nr_frozen_epochs: Union[float, int] = 0.3, 58 | keep_embeddings_frozen: bool = False, 59 | keep_encoder_frozen: bool = False, 60 | optimizer: str = "AdamW", 61 | encoder_learning_rate: float = 1e-05, 62 | learning_rate: float = 3e-05, 63 | layerwise_decay: float = 0.95, 64 | encoder_model: str = "XLM-RoBERTa", 65 | pretrained_model: str = "xlm-roberta-base", 66 | pool: str = "avg", 67 | layer: Union[str, int] = "mix", 68 | dropout: float = 0.1, 69 | batch_size: int = 4, 70 | train_data: Optional[str] = None, 71 | validation_data: Optional[str] = None, 72 | hidden_sizes_bottleneck: List[int] = [2304, 256], 73 | hidden_sizes: List[int] = [768], 74 | activations: str = "Tanh", 75 | final_activation: Optional[str] = None, 76 | load_weights_from_checkpoint: Optional[str] = None, 77 | loss: Optional[str]="mse", 78 | feature_size: Optional[int] = 0 79 | ) -> None: 80 | super().__init__( 81 | nr_frozen_epochs, 82 | keep_embeddings_frozen, 83 | keep_encoder_frozen, 84 | optimizer, 85 | encoder_learning_rate, 86 | learning_rate, 87 | layerwise_decay, 88 | encoder_model, 89 | pretrained_model, 90 | pool, 91 | layer, 92 | dropout, 93 | batch_size, 94 | train_data, 95 | validation_data, 96 | load_weights_from_checkpoint, 97 | "regression_metric", 98 | ) 99 | self.save_hyperparameters() 100 | 101 | if self.hparams.feature_size > 0: 102 | self.bottleneck = Bottleneck( 103 | in_dim=self.encoder.output_units * 6, 104 | hidden_sizes = [self.hparams.hidden_sizes[0],self.hparams.hidden_sizes_bottleneck[-1]], 105 | activations=self.hparams.activations, 106 | dropout=self.hparams.dropout, 107 | ) 108 | 109 | self.estimator = FeedForward( 110 | in_dim=self.hparams.hidden_sizes_bottleneck[-1] + self.hparams.feature_size, 111 | out_dim = 2 if self.hparams.loss in ["kl", "hts"] else 1, 112 | hidden_sizes=[self.hparams.hidden_sizes[-1]], 113 | activations=self.hparams.activations, 114 | dropout=self.hparams.dropout, 115 | final_activation=self.hparams.final_activation, 116 | ) 117 | else: 118 | self.estimator = FeedForward( 119 | in_dim=self.encoder.output_units * 6, 120 | hidden_sizes=self.hparams.hidden_sizes, 121 | activations=self.hparams.activations, 122 | dropout=self.hparams.dropout, 123 | final_activation=self.hparams.final_activation, 124 | out_dim=2 if self.hparams.loss in ["kl", "hts"] else 1, 125 | ) 126 | 127 | def init_metrics(self): 128 | metrics = MetricCollection( 129 | {"spearman": SpearmanCorrcoef(), "pearson": PearsonCorrcoef()} 130 | ) 131 | self.train_metrics = metrics.clone(prefix="train_") 132 | self.val_metrics = metrics.clone(prefix="val_") 133 | 134 | def configure_optimizers( 135 | self, 136 | ) -> Tuple[List[torch.optim.Optimizer], List[torch.optim.lr_scheduler.LambdaLR]]: 137 | """Sets the optimizers to be used during training.""" 138 | layer_parameters = self.encoder.layerwise_lr( 139 | self.hparams.encoder_learning_rate, self.hparams.layerwise_decay 140 | ) 141 | top_layers_parameters = [ 142 | {"params": self.estimator.parameters() , "lr": self.hparams.learning_rate} 143 | ] 144 | if self.hparams.feature_size>0: 145 | bott_layers_parameters = [ 146 | {"params": self.bottleneck.parameters() , "lr": self.hparams.learning_rate} 147 | ] 148 | if self.layerwise_attention: 149 | layerwise_attn_params = [ 150 | { 151 | "params": self.layerwise_attention.parameters(), 152 | "lr": self.hparams.learning_rate, 153 | } 154 | ] 155 | params = layer_parameters + top_layers_parameters + layerwise_attn_params 156 | else: 157 | params = 
layer_parameters + top_layers_parameters 158 | if self.hparams.feature_size > 0: 159 | params += bott_layers_parameters 160 | 161 | optimizer = AdamW( 162 | params, 163 | lr=self.hparams.learning_rate, 164 | correct_bias=True, 165 | ) 166 | # scheduler = self._build_scheduler(optimizer) 167 | return [optimizer], [] 168 | 169 | def prepare_sample( 170 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = False, 171 | ) -> Union[ 172 | Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], Dict[str, torch.Tensor] 173 | ]: 174 | """ 175 | Function that prepares a sample to input the model. 176 | 177 | :param sample: list of dictionaries. 178 | :param inference: If set to true prepares only the model inputs. 179 | 180 | :returns: Tuple with 2 dictionaries (model inputs and targets). 181 | If `inference=True` returns only the model inputs. 182 | """ 183 | #print(sample[0]) 184 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 185 | src_inputs = self.encoder.prepare_sample(sample["src"]) 186 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 187 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 188 | 189 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 190 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 191 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 192 | if self.hparams.feature_size>0: 193 | feats = [] 194 | for feat in sample: 195 | if feat.startswith("f"): 196 | feats.append(sample[feat]) 197 | feature_tensor = torch.as_tensor(feats, dtype=torch.float) 198 | features = {"custom_features": feature_tensor.T} 199 | else: 200 | features = {"custom_features": torch.Tensor()} 201 | 202 | inputs = {**src_inputs, **mt_inputs, **ref_inputs, **features} 203 | if inference: 204 | return inputs 205 | 206 | targets = {"score": torch.tensor(sample["score"], dtype=torch.float)} 207 | return inputs, targets 208 | 209 | def forward( 210 | self, 211 | src_input_ids: torch.tensor, 212 | src_attention_mask: torch.tensor, 213 | mt_input_ids: torch.tensor, 214 | mt_attention_mask: torch.tensor, 215 | ref_input_ids: torch.tensor, 216 | ref_attention_mask: torch.tensor, 217 | custom_features: torch.tensor, 218 | **kwargs 219 | ) -> Dict[str, torch.Tensor]: 220 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 221 | mt_sentemb = self.get_sentence_embedding(mt_input_ids, mt_attention_mask) 222 | ref_sentemb = self.get_sentence_embedding(ref_input_ids, ref_attention_mask) 223 | 224 | diff_ref = torch.abs(mt_sentemb - ref_sentemb) 225 | diff_src = torch.abs(mt_sentemb - src_sentemb) 226 | 227 | prod_ref = mt_sentemb * ref_sentemb 228 | prod_src = mt_sentemb * src_sentemb 229 | 230 | embedded_sequences = torch.cat( 231 | (mt_sentemb, ref_sentemb, prod_ref, diff_ref, prod_src, diff_src), 232 | dim=1, 233 | ) 234 | if self.hparams.feature_size>0: 235 | bottleneck = self.bottleneck(embedded_sequences) 236 | seq_feats = torch.cat((bottleneck,custom_features),dim=1) 237 | score = self.estimator(seq_feats) 238 | else: 239 | score = self.estimator(embedded_sequences) 240 | 241 | if self.hparams.loss in ["kl","hts"]: 242 | return {"score": score[:,0], "variance": score[:,1]} 243 | return {"score": score} 244 | 245 | def read_csv(self, path: str) -> List[dict]: 246 | """Reads a comma separated value file. 247 | :param path: path to a csv file. 
248 | :return: List of records as dictionaries 249 | """ 250 | feats=[] 251 | df = pd.read_csv(path) 252 | flen = self.hparams.feature_size 253 | columns = ["src", "mt", "ref", "score"] 254 | if self.hparams.loss == 'kl': 255 | columns.append("std") 256 | for i in range(flen): 257 | fstring='f'+str(i+1) 258 | print('feature added: '+str(fstring)) 259 | columns.append(fstring) 260 | feats.append(fstring) 261 | df = df[columns] 262 | df["src"] = df["src"].astype(str) 263 | df["mt"] = df["mt"].astype(str) 264 | df["ref"] = df["ref"].astype(str) 265 | df["score"] = df["score"].astype(float) 266 | for feat in feats: 267 | df[feat] = df[feat].astype(float) 268 | return df.to_dict("records") 269 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. 
For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /comet/models/ranking/ranking_metric.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 
6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | 16 | r""" 17 | Ranking Metric 18 | ==================== 19 | Translation Ranking metric was introduced by 20 | [Rei, et al. 2020](https://aclanthology.org/2020.emnlp-main.213/) 21 | and it is trained on top of Direct Assessment Relative Ranks (DARR) to encode 22 | `good` translations closer to the anchors (source & reference) than `worse` 23 | translations. 24 | """ 25 | from typing import Dict, List, Optional, Tuple, Union 26 | 27 | import pandas as pd 28 | import torch 29 | import torch.nn.functional as F 30 | from comet.models.base import CometModel 31 | from transformers import AdamW 32 | 33 | from .wmt_kendall import WMTKendall 34 | 35 | 36 | class RankingMetric(CometModel): 37 | """RankingMetric 38 | 39 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 40 | :param keep_embeddings_frozen: Keeps the encoder frozen during training. 41 | :param optimizer: Optimizer used during training. 42 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 43 | :param learning_rate: Learning rate used to fine-tune the top layers. 44 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 45 | :param encoder_model: Encoder model to be used. 46 | :param pretrained_model: Pretrained model from Hugging Face. 47 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 48 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 49 | :param dropout: Dropout used in the top-layers. 50 | :param batch_size: Batch size used during training. 51 | :param train_data: Path to a csv file containing the training data. 52 | :param validation_data: Path to a csv file containing the validation data. 53 | :param load_weights_from_checkpoint: Path to a checkpoint file. 
54 | """ 55 | 56 | def __init__( 57 | self, 58 | nr_frozen_epochs: Union[float, int] = 0.05, 59 | keep_embeddings_frozen: bool = False, 60 | optimizer: str = "AdamW", 61 | encoder_learning_rate: float = 1e-05, 62 | learning_rate: float = 3e-05, 63 | layerwise_decay: float = 0.95, 64 | encoder_model: str = "XLM-RoBERTa", 65 | pretrained_model: str = "xlm-roberta-base", 66 | pool: str = "avg", 67 | layer: Union[str, int] = "mix", 68 | dropout: float = 0.1, 69 | batch_size: int = 8, 70 | train_data: Optional[str] = None, 71 | validation_data: Optional[str] = None, 72 | load_weights_from_checkpoint: Optional[str] = None, 73 | ) -> None: 74 | super().__init__( 75 | nr_frozen_epochs, 76 | keep_embeddings_frozen, 77 | optimizer, 78 | encoder_learning_rate, 79 | learning_rate, 80 | layerwise_decay, 81 | encoder_model, 82 | pretrained_model, 83 | pool, 84 | layer, 85 | dropout, 86 | batch_size, 87 | train_data, 88 | validation_data, 89 | load_weights_from_checkpoint, 90 | "ranking_metric", 91 | ) 92 | self.save_hyperparameters() 93 | 94 | def init_metrics(self): 95 | self.train_metrics = WMTKendall(prefix="train") 96 | self.val_metrics = WMTKendall(prefix="val") 97 | 98 | @property 99 | def loss(self): 100 | return torch.nn.TripletMarginLoss(margin=1.0, p=2) 101 | 102 | def configure_optimizers( 103 | self, 104 | ) -> Tuple[List[torch.optim.Optimizer], List[torch.optim.lr_scheduler.LambdaLR]]: 105 | """Sets the optimizers to be used during training.""" 106 | layer_parameters = self.encoder.layerwise_lr( 107 | self.hparams.encoder_learning_rate, self.hparams.layerwise_decay 108 | ) 109 | if self.layerwise_attention: 110 | layerwise_attn_params = [ 111 | { 112 | "params": self.layerwise_attention.parameters(), 113 | "lr": self.hparams.learning_rate, 114 | } 115 | ] 116 | params = layer_parameters + layerwise_attn_params 117 | else: 118 | params = layer_parameters 119 | 120 | optimizer = AdamW( 121 | params, 122 | lr=self.hparams.learning_rate, 123 | correct_bias=True, 124 | ) 125 | # scheduler = self._build_scheduler(optimizer) 126 | return [optimizer], [] 127 | 128 | def prepare_sample( 129 | self, sample: List[Dict[str, Union[str, float]]], inference: bool = False 130 | ) -> Dict[str, torch.Tensor]: 131 | 132 | sample = {k: [dic[k] for dic in sample] for k in sample[0]} 133 | 134 | if inference: 135 | src_inputs = self.encoder.prepare_sample(sample["src"]) 136 | mt_inputs = self.encoder.prepare_sample(sample["mt"]) 137 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 138 | 139 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 140 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 141 | mt_inputs = {"mt_" + k: v for k, v in mt_inputs.items()} 142 | 143 | return {**ref_inputs, **src_inputs, **mt_inputs} 144 | 145 | ref_inputs = self.encoder.prepare_sample(sample["ref"]) 146 | src_inputs = self.encoder.prepare_sample(sample["src"]) 147 | pos_inputs = self.encoder.prepare_sample(sample["pos"]) 148 | neg_inputs = self.encoder.prepare_sample(sample["neg"]) 149 | 150 | ref_inputs = {"ref_" + k: v for k, v in ref_inputs.items()} 151 | src_inputs = {"src_" + k: v for k, v in src_inputs.items()} 152 | pos_inputs = {"pos_" + k: v for k, v in pos_inputs.items()} 153 | neg_inputs = {"neg_" + k: v for k, v in neg_inputs.items()} 154 | 155 | return {**ref_inputs, **src_inputs, **pos_inputs, **neg_inputs} 156 | 157 | def forward( 158 | self, 159 | src_input_ids: torch.tensor, 160 | ref_input_ids: torch.tensor, 161 | pos_input_ids: torch.tensor, 162 | neg_input_ids: torch.tensor, 163 
| src_attention_mask: torch.tensor, 164 | ref_attention_mask: torch.tensor, 165 | pos_attention_mask: torch.tensor, 166 | neg_attention_mask: torch.tensor, 167 | **kwargs 168 | ) -> Dict[str, torch.Tensor]: 169 | src_sentemb = self.get_sentence_embedding(src_input_ids, src_attention_mask) 170 | ref_sentemb = self.get_sentence_embedding(ref_input_ids, ref_attention_mask) 171 | pos_sentemb = self.get_sentence_embedding(pos_input_ids, pos_attention_mask) 172 | neg_sentemb = self.get_sentence_embedding(neg_input_ids, neg_attention_mask) 173 | 174 | loss = self.loss(src_sentemb, pos_sentemb, neg_sentemb) + self.loss( 175 | ref_sentemb, pos_sentemb, neg_sentemb 176 | ) 177 | 178 | distance_src_pos = F.pairwise_distance(pos_sentemb, src_sentemb) 179 | distance_ref_pos = F.pairwise_distance(pos_sentemb, ref_sentemb) 180 | # Harmonic mean between anchors and the positive example 181 | distance_pos = (2 * distance_src_pos * distance_ref_pos) / ( 182 | distance_src_pos + distance_ref_pos 183 | ) 184 | 185 | # Harmonic mean between anchors and the negative example 186 | distance_src_neg = F.pairwise_distance(neg_sentemb, src_sentemb) 187 | distance_ref_neg = F.pairwise_distance(neg_sentemb, ref_sentemb) 188 | distance_neg = (2 * distance_src_neg * distance_ref_neg) / ( 189 | distance_src_neg + distance_ref_neg 190 | ) 191 | 192 | return { 193 | "loss": loss, 194 | "distance_pos": distance_pos, 195 | "distance_neg": distance_neg, 196 | } 197 | 198 | def read_csv(self, path: str, regression: bool = False) -> List[dict]: 199 | """Reads a comma separated value file. 200 | 201 | :param path: path to a csv file. 202 | 203 | :return: List of records as dictionaries 204 | """ 205 | df = pd.read_csv(path) 206 | 207 | if regression: 208 | df = df[["src", "mt", "ref", "score"]] 209 | df["src"] = df["src"].astype(str) 210 | df["mt"] = df["mt"].astype(str) 211 | df["ref"] = df["ref"].astype(str) 212 | df["score"] = df["score"].astype(float) 213 | return df.to_dict("records") 214 | 215 | df = df[["src", "pos", "neg", "ref"]] 216 | df["src"] = df["src"].astype(str) 217 | df["pos"] = df["pos"].astype(str) 218 | df["neg"] = df["neg"].astype(str) 219 | df["ref"] = df["ref"].astype(str) 220 | return df.to_dict("records") 221 | 222 | def training_step( 223 | self, 224 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 225 | batch_nb: int, 226 | ) -> Dict[str, torch.Tensor]: 227 | """ 228 | Runs one training step. 229 | This usually consists in the forward function followed by the loss function. 230 | 231 | :param batch: The output of your prepare_sample function. 232 | :param batch_nb: Integer displaying which batch this is. 233 | 234 | :returns: dictionary containing the loss and the metrics to be added to the 235 | lightning logger. 236 | """ 237 | batch_prediction = self.forward(**batch) 238 | loss_value = batch_prediction["loss"] 239 | 240 | if ( 241 | self.nr_frozen_epochs < 1.0 242 | and self.nr_frozen_epochs > 0.0 243 | and batch_nb > self.epoch_total_steps * self.nr_frozen_epochs 244 | ): 245 | self.unfreeze_encoder() 246 | self._frozen = False 247 | 248 | self.log("train_loss", loss_value, on_step=True, on_epoch=True) 249 | return loss_value 250 | 251 | def validation_step( 252 | self, 253 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 254 | batch_nb: int, 255 | dataloader_idx: int, 256 | ) -> Dict[str, torch.Tensor]: 257 | """ 258 | Similar to the training step but with the model in eval mode. 259 | 260 | :param batch: The output of your prepare_sample function. 
261 | :param batch_nb: Integer displaying which batch this is. 262 | :param dataloader_idx: Integer displaying which dataloader this is. 263 | 264 | :returns: dictionary passed to the validation_end function. 265 | """ 266 | batch_prediction = self.forward(**batch) 267 | loss_value = batch_prediction["loss"] 268 | self.log("val_loss", loss_value, on_step=True, on_epoch=True) 269 | 270 | # TODO: REMOVE if condition after torchmetrics bug fix 271 | if dataloader_idx == 0: 272 | self.train_metrics.update( 273 | batch_prediction["distance_pos"], batch_prediction["distance_neg"] 274 | ) 275 | elif dataloader_idx == 1: 276 | self.val_metrics.update( 277 | batch_prediction["distance_pos"], batch_prediction["distance_neg"] 278 | ) 279 | 280 | def predict_step( 281 | self, 282 | batch: Dict[str, torch.Tensor], 283 | batch_idx: int, 284 | dataloader_idx: Optional[int], 285 | ) -> List[float]: 286 | src_sentemb = self.get_sentence_embedding( 287 | batch["src_input_ids"], batch["src_attention_mask"] 288 | ) 289 | ref_sentemb = self.get_sentence_embedding( 290 | batch["ref_input_ids"], batch["ref_attention_mask"] 291 | ) 292 | mt_sentemb = self.get_sentence_embedding( 293 | batch["mt_input_ids"], batch["mt_attention_mask"] 294 | ) 295 | 296 | src_distance = F.pairwise_distance(mt_sentemb, src_sentemb) 297 | ref_distance = F.pairwise_distance(mt_sentemb, ref_sentemb) 298 | 299 | distances = (2 * ref_distance * src_distance) / (ref_distance + src_distance) 300 | return torch.ones_like(distances) / (1 + distances) 301 | -------------------------------------------------------------------------------- /comet/models/base.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # Copyright (C) 2020 Unbabel 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | r""" 16 | CometModel 17 | ======================== 18 | Abstract Model class that implements some of the Pytorch Lightning logic. 19 | Extend this class to create new model and metrics within COMET. 20 | """ 21 | import abc 22 | import logging 23 | import multiprocessing 24 | import sys 25 | from os import path 26 | from typing import Dict, List, Optional, Tuple, Union 27 | import random 28 | import numpy as np 29 | import pytorch_lightning as ptl 30 | import torch 31 | from comet.encoders import str2encoder 32 | from comet.modules import LayerwiseAttention, HeteroscedasticLoss, HeteroscedasticLossv2, HeteroApproxLoss, HeteroApproxLossv2, SquaredLoss 33 | from torch import nn 34 | from torch.utils.data import DataLoader, RandomSampler, Subset 35 | from pytorch_lightning.loggers import TensorBoardLogger 36 | from tqdm import tqdm 37 | 38 | from .pooling_utils import average_pooling, max_pooling 39 | 40 | logger = logging.getLogger(__name__) 41 | 42 | 43 | class CometModel(ptl.LightningModule, metaclass=abc.ABCMeta): 44 | """CometModel: 45 | 46 | :param nr_frozen_epochs: Number of epochs (% of epoch) that the encoder is frozen. 
47 | :param keep_embeddings_frozen: Keeps the embeddings frozen during training. 48 | :param keep_encoder_frozen: freezes entire encoder. 49 | :param optimizer: Optimizer used during training. 50 | :param encoder_learning_rate: Learning rate used to fine-tune the encoder model. 51 | :param learning_rate: Learning rate used to fine-tune the top layers. 52 | :param layerwise_decay: Learning rate % decay from top-to-bottom encoder layers. 53 | :param encoder_model: Encoder model to be used. 54 | :param pretrained_model: Pretrained model from Hugging Face. 55 | :param pool: Pooling strategy to derive a sentence embedding ['cls', 'max', 'avg']. 56 | :param layer: Encoder layer to be used ('mix' for pooling info from all layers.) 57 | :param dropout: Dropout used in the top-layers. 58 | :param batch_size: Batch size used during training. 59 | :param train_data: Path to a csv file containing the training data. 60 | :param validation_data: Path to a csv file containing the validation data. 61 | :param load_weights_from_checkpoint: Path to a checkpoint file. 62 | :param class_identifier: subclass identifier. 63 | """ 64 | 65 | def __init__( 66 | self, 67 | nr_frozen_epochs: Union[float, int] = 0.3, 68 | keep_embeddings_frozen: bool = False, 69 | keep_encoder_frozen: bool = False, 70 | optimizer: str = "AdamW", 71 | encoder_learning_rate: float = 1e-05, 72 | learning_rate: float = 3e-05, 73 | layerwise_decay: float = 0.95, 74 | encoder_model: str = "XLM-RoBERTa", 75 | pretrained_model: str = "xlm-roberta-large", 76 | pool: str = "avg", 77 | layer: Union[str, int] = "mix", 78 | dropout: float = 0.1, 79 | batch_size: int = 4, 80 | train_data: Optional[str] = None, 81 | validation_data: Optional[str] = None, 82 | load_weights_from_checkpoint: Optional[str] = None, 83 | class_identifier: Optional[str] = None, 84 | loss: Optional[str]="mse", 85 | data_portion: Optional[float] = 1.0, 86 | feature_size: Optional[int] = 0 87 | ) -> None: 88 | super().__init__() 89 | self.save_hyperparameters( 90 | ignore=["train_data", "validation_data", "load_weights_from_checkpoint"] 91 | ) 92 | self.encoder = str2encoder[self.hparams.encoder_model].from_pretrained( 93 | self.hparams.pretrained_model 94 | ) 95 | self.epoch_nr = 0 96 | if self.hparams.layer == "mix": 97 | self.layerwise_attention = LayerwiseAttention( 98 | num_layers=self.encoder.num_layers, 99 | dropout=self.hparams.dropout, 100 | layer_norm=True, 101 | ) 102 | else: 103 | self.layerwise_attention = None 104 | 105 | if self.hparams.nr_frozen_epochs > 0: 106 | self._frozen = True 107 | self.freeze_encoder() 108 | else: 109 | self._frozen = False 110 | if self.hparams.keep_encoder_frozen: 111 | self._frozen = True 112 | self.freeze_encoder() 113 | 114 | if self.hparams.keep_embeddings_frozen: 115 | self.encoder.freeze_embeddings() 116 | 117 | self.nr_frozen_epochs = self.hparams.nr_frozen_epochs 118 | 119 | if load_weights_from_checkpoint is not None: 120 | if path.exists(load_weights_from_checkpoint): 121 | self.load_weights(load_weights_from_checkpoint) 122 | else: 123 | logger.warning(f"Path {load_weights_from_checkpoint} does not exist!") 124 | 125 | self.mc_dropout = False # Flag used to control usage of MC Dropout 126 | 127 | def set_mc_dropout(self, value: bool): 128 | self.mc_dropout = value 129 | 130 | def load_weights(self, checkpoint: str) -> None: 131 | """Function that loads the weights from a given checkpoint file. 132 | Note: 133 | If the checkpoint model architecture is different then `self`, only 134 | the common parts will be loaded. 
135 | 136 | :param checkpoint: Path to the checkpoint containing the weights to be loaded. 137 | """ 138 | logger.info(f"Loading weights from {checkpoint}.") 139 | checkpoint = torch.load(checkpoint, map_location=lambda storage, loc: storage) 140 | pretrained_dict = checkpoint["state_dict"] 141 | model_dict = self.state_dict() 142 | # 1. filter out unnecessary keys 143 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 144 | # 2. overwrite entries in the existing state dict 145 | model_dict.update(pretrained_dict) 146 | # 3. load the new state dict 147 | self.load_state_dict(model_dict) 148 | 149 | @abc.abstractmethod 150 | def read_csv(self): 151 | pass 152 | 153 | @abc.abstractmethod 154 | def prepare_sample( 155 | self, sample: List[Dict[str, Union[str, float]]], *args, **kwargs 156 | ): 157 | pass 158 | 159 | @abc.abstractmethod 160 | def configure_optimizers(self): 161 | pass 162 | 163 | @abc.abstractmethod 164 | def init_metrics(self) -> None: 165 | pass 166 | 167 | @abc.abstractmethod 168 | def forward(self, *args, **kwargs) -> Dict[str, torch.Tensor]: 169 | pass 170 | 171 | def freeze_encoder(self) -> None: 172 | logger.info("Encoder model frozen.") 173 | self.encoder.freeze() 174 | 175 | @property 176 | def loss(self) -> None: 177 | if self.hparams.loss in ["var","hts"]: 178 | return HeteroscedasticLoss() 179 | elif self.hparams.loss in ["var2","hts2"]: 180 | return HeteroscedasticLossv2() 181 | elif self.hparams.loss in ["var_approx","hts_approx"]: 182 | return HeteroApproxLoss() 183 | elif self.hparams.loss in ["var_approx2","hts_approx2"]: 184 | return HeteroApproxLossv2() 185 | elif self.hparams.loss in ["squared"]: 186 | return SquaredLoss() 187 | return nn.MSELoss() 188 | 189 | def compute_loss( 190 | self, predictions: Dict[str, torch.Tensor], targets: Dict[str, torch.Tensor] 191 | ) -> torch.Tensor: 192 | if self.hparams.loss in ["var","hts"]: 193 | return self.loss(predictions["score"].view(-1), predictions["variance"].view(-1) , targets["score"]) 194 | 195 | return self.loss(predictions["score"].view(-1), targets["score"]) 196 | 197 | def unfreeze_encoder(self) -> None: 198 | if self._frozen: 199 | if self.trainer.is_global_zero: 200 | logger.info("Encoder model fine-tuning") 201 | 202 | self.encoder.unfreeze() 203 | self._frozen = False 204 | if self.hparams.keep_embeddings_frozen: 205 | self.encoder.freeze_embeddings() 206 | 207 | def on_train_epoch_end(self) -> None: 208 | """Hook used to unfreeze encoder during training.""" 209 | self.epoch_nr += 1 210 | if self.epoch_nr >= self.nr_frozen_epochs and self._frozen and not self.hparams.keep_encoder_frozen: 211 | self.unfreeze_encoder() 212 | self._frozen = False 213 | 214 | def get_sentence_embedding( 215 | self, input_ids: torch.Tensor, attention_mask: torch.Tensor 216 | ) -> torch.Tensor: 217 | """Function that extracts sentence embeddings for 218 | a single sentence. 219 | 220 | :param tokens: sequences [batch_size x seq_len] 221 | :param lengths: lengths [batch_size] 222 | 223 | :return: torch.Tensor [batch_size x hidden_size] 224 | """ 225 | encoder_out = self.encoder(input_ids, attention_mask) 226 | if self.layerwise_attention: 227 | # HACK: LayerNorm is applied at the MiniBatch. This means that for big batch sizes the variance 228 | # and norm within the batch will create small differences in the final score 229 | # If we are predicting we split the data into equal size batches to minimize this variance. 
230 | if not self.training: 231 | n_splits = len(torch.split(encoder_out["all_layers"][-1], 8)) 232 | embeddings = [] 233 | for split in range(n_splits): 234 | all_layers = [] 235 | for layer in range(len(encoder_out["all_layers"])): 236 | layer_embs = torch.split(encoder_out["all_layers"][layer], 8) 237 | all_layers.append(layer_embs[split]) 238 | split_attn = torch.split(attention_mask, 8)[split] 239 | embeddings.append(self.layerwise_attention(all_layers, split_attn)) 240 | embeddings = torch.cat(embeddings, dim=0) 241 | else: 242 | embeddings = self.layerwise_attention( 243 | encoder_out["all_layers"], attention_mask 244 | ) 245 | 246 | elif self.hparams.layer >= 0 and self.hparams.layer < self.encoder.num_layers: 247 | embeddings = encoder_out["all_layers"][self.hparams.layer] 248 | 249 | else: 250 | raise Exception("Invalid model layer {}.".format(self.hparams.layer)) 251 | 252 | if self.hparams.pool == "default": 253 | sentemb = encoder_out["sentemb"] 254 | 255 | elif self.hparams.pool == "max": 256 | sentemb = max_pooling( 257 | input_ids, embeddings, self.encoder.tokenizer.pad_token_id 258 | ) 259 | 260 | elif self.hparams.pool == "avg": 261 | sentemb = average_pooling( 262 | input_ids, 263 | embeddings, 264 | attention_mask, 265 | self.encoder.tokenizer.pad_token_id, 266 | ) 267 | 268 | elif self.hparams.pool == "cls": 269 | sentemb = embeddings[:, 0, :] 270 | 271 | else: 272 | raise Exception("Invalid pooling technique.") 273 | 274 | return sentemb 275 | 276 | def training_step( 277 | self, 278 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 279 | batch_nb: int, 280 | ) -> torch.Tensor: 281 | """ 282 | Runs one training step and logs the training loss. 283 | 284 | :param batch: The output of your prepare_sample function. 285 | :param batch_nb: Integer displaying which batch this is. 286 | 287 | :returns: Loss value 288 | """ 289 | batch_input, batch_target = batch 290 | batch_prediction = self.forward(**batch_input) 291 | #if not self.lossalternate: 292 | loss_value = self.compute_loss(batch_prediction, batch_target) 293 | 294 | if ( 295 | self.nr_frozen_epochs < 1.0 296 | and self.nr_frozen_epochs > 0.0 297 | and batch_nb > self.epoch_total_steps * self.nr_frozen_epochs 298 | and not self.hparams.keep_encoder_frozen 299 | ): 300 | self.unfreeze_encoder() 301 | self._frozen = False 302 | 303 | self.log("train_loss", loss_value, on_step=True, on_epoch=True) 304 | return loss_value 305 | 306 | def validation_step( 307 | self, 308 | batch: Tuple[Dict[str, torch.Tensor], Dict[str, torch.Tensor]], 309 | batch_nb: int, 310 | dataloader_idx: int, 311 | ) -> torch.Tensor: 312 | """ 313 | Runs one validation step and logs metrics. 314 | 315 | :param batch: The output of your prepare_sample function. 316 | :param batch_nb: Integer displaying which batch this is. 317 | :param dataloader_idx: Integer displaying which dataloader this is. 
318 | """ 319 | batch_input, batch_target = batch 320 | batch_prediction = self.forward(**batch_input) 321 | loss_value = self.compute_loss(batch_prediction, batch_target) 322 | 323 | self.log("val_loss", loss_value, on_step=True, on_epoch=True) 324 | 325 | # TODO: REMOVE if condition after torchmetrics bug fix 326 | if batch_prediction["score"].view(-1).size() != torch.Size([1]): 327 | if dataloader_idx == 0: 328 | self.train_metrics.update( 329 | batch_prediction["score"].view(-1), batch_target["score"] 330 | ) 331 | elif dataloader_idx == 1: 332 | self.val_metrics.update( 333 | batch_prediction["score"].view(-1), batch_target["score"] 334 | ) 335 | #print(loss_value) 336 | return loss_value 337 | 338 | def on_predict_start(self) -> None: 339 | """Called when predict begins.""" 340 | if self.mc_dropout: 341 | self.train() 342 | else: 343 | self.eval() 344 | 345 | def predict_step( 346 | self, 347 | batch: Dict[str, torch.Tensor], 348 | batch_idx: Optional[int] = None, 349 | dataloader_idx: Optional[int] = None, 350 | ) -> torch.Tensor: 351 | """ 352 | Runs one prediction step and returns the predicted values. 353 | 354 | :param batch: The output of your prepare_sample function. 355 | :param batch_nb: Integer displaying which batch this is. 356 | :param dataloader_idx: Integer displaying which dataloader this is. 357 | """ 358 | if self.mc_dropout: 359 | #print(self.loss) 360 | #print(isinstance(self.loss, HeteroscedasticLoss)) 361 | #mcd_outputs = torch.stack( 362 | # [self(**batch)["score"].view(-1) for _ in range(self.mc_dropout)] 363 | #) 364 | mcd_outputs = torch.empty(size=(self.mc_dropout, 2)) 365 | hts_outputs = torch.empty(size=(self.mc_dropout, 2)) 366 | 367 | # mcd_outputs = torch.empty(size=(self.mc_dropout, self.hparams.batch_size)) 368 | # hts_outputs = torch.empty(size=(self.mc_dropout, self.hparams.batch_size)) 369 | for i in range(self.mc_dropout): 370 | outputs = self(**batch) 371 | 372 | mcd_outputs[i,:] = outputs["score"].view(-1) 373 | if isinstance(self.loss, HeteroscedasticLoss): 374 | hts_outputs[i]=outputs["variance"].view(-1) 375 | 376 | mcd_mean = mcd_outputs.mean(dim=0) 377 | mcd_std = mcd_outputs.std(dim=0) 378 | #print(mcd_mean) 379 | if isinstance(self.loss, HeteroscedasticLoss): 380 | hts_mean = hts_outputs.mean(dim=0) 381 | hts_std = hts_outputs.std(dim=0) 382 | return mcd_mean, mcd_std, hts_mean, hts_std 383 | return mcd_mean, mcd_std 384 | 385 | output = self(**batch) 386 | if isinstance(self.loss, HeteroscedasticLoss): 387 | return output["score"].view(-1), output["variance"].view(-1) 388 | return output["score"].view(-1) 389 | 390 | def validation_epoch_end(self, outputs, *args, **kwargs) -> None: 391 | """ " Computes and logs metrics.""" 392 | #print(outputs) 393 | avg_loss = torch.stack([x[0] for x in outputs]).mean() 394 | self.logger.experiment.add_scalar('validation_loss',avg_loss, self.current_epoch) 395 | self.log_dict(self.train_metrics.compute(), prog_bar=True) 396 | self.log_dict(self.val_metrics.compute(), prog_bar=True) 397 | self.train_metrics.reset() 398 | self.val_metrics.reset() 399 | 400 | def setup(self, stage) -> None: 401 | """Data preparation function called before training by Lightning. 
402 | 403 | :param stage: either 'fit', 'validate', 'test', or 'predict' 404 | """ 405 | if stage in (None, "fit"): 406 | self.train_dataset = self.read_csv(self.hparams.train_data) 407 | if self.hparams.data_portion < 1.0: 408 | print(len(self.train_dataset)) 409 | length = len(self.train_dataset) 410 | data_size = int(self.hparams.data_portion*length) 411 | self.train_dataset = list(random.sample(self.train_dataset, data_size)) 412 | print(len(self.train_dataset)) 413 | self.validation_dataset = self.read_csv(self.hparams.validation_data) 414 | 415 | self.epoch_total_steps = len(self.train_dataset) // ( 416 | self.hparams.batch_size * max(1, self.trainer.num_gpus) 417 | ) 418 | self.total_steps = self.epoch_total_steps * float(self.trainer.max_epochs) 419 | 420 | # Always validate the model with 2k examples to control overfit. 421 | train_subset = np.random.choice(a=len(self.train_dataset), size=2000) 422 | self.train_subset = Subset(self.train_dataset, train_subset) 423 | self.init_metrics() 424 | 425 | def train_dataloader(self) -> DataLoader: 426 | """Function that loads the train set.""" 427 | return DataLoader( 428 | dataset=self.train_dataset, 429 | sampler=RandomSampler(self.train_dataset), 430 | batch_size=self.hparams.batch_size, 431 | collate_fn=lambda x: self.prepare_sample(x, inference=False, data_portion=self.hparams.data_portion), 432 | num_workers=multiprocessing.cpu_count(), 433 | ) 434 | 435 | def val_dataloader(self) -> DataLoader: 436 | """Function that loads the validation set.""" 437 | return [ 438 | DataLoader( 439 | dataset=self.train_subset, 440 | batch_size=self.hparams.batch_size, 441 | collate_fn=lambda x: self.prepare_sample(x, inference=False, data_portion=self.hparams.data_portion), 442 | num_workers=min(8, multiprocessing.cpu_count()), 443 | ), 444 | DataLoader( 445 | dataset=self.validation_dataset, 446 | batch_size=self.hparams.batch_size, 447 | collate_fn=self.prepare_sample, 448 | num_workers=min(8, multiprocessing.cpu_count()), 449 | ), 450 | ] 451 | 452 | def predict( 453 | self, 454 | samples: List[Dict[str, str]], 455 | batch_size: int = 8, 456 | gpus: int = 1, 457 | mc_dropout: Union[int, bool] = False, 458 | ) -> Union[Tuple[List[float], float], Tuple[List[float], List[float], float]]: 459 | """Function that receives a list of samples (dictionaries with translations, sources and/or references) 460 | and returns segment level scores and a system level score. If `mc_dropout` is set, it also returns for each 461 | segment score, a confidence value. 462 | 463 | :param samples: List with dictionaries with source, translations and/or references. 464 | :param batch_size: Batch size used during inference. 465 | :gpus: Number of GPUs to be used. 466 | 467 | :return: List with segment-level scores and a system-score or segment-level scores, segment-level 468 | confidence and a system-score. 
469 | """ 470 | 471 | class PredictProgressBar(ptl.callbacks.ProgressBar): 472 | """Default Lightning Progress bar writes to stdout, we replace stdout with stderr""" 473 | 474 | def init_predict_tqdm(self) -> tqdm: 475 | bar = tqdm( 476 | desc="Predicting", 477 | initial=self.train_batch_idx, 478 | position=(2 * self.process_position), 479 | disable=self.is_disabled, 480 | leave=True, 481 | dynamic_ncols=True, 482 | file=sys.stderr, 483 | smoothing=0, 484 | ) 485 | return bar 486 | 487 | # HACK: Workaround pytorch bug that prevents ParameterList to be used in DP 488 | # https://github.com/pytorch/pytorch/issues/36035 489 | if self.layerwise_attention is not None and gpus > 1: 490 | self.layerwise_attention.gamma_value = float( 491 | self.layerwise_attention.gamma[0] 492 | ) 493 | self.layerwise_attention.weights = [ 494 | float(parameter[0]) 495 | for parameter in self.layerwise_attention.scalar_parameters 496 | ] 497 | 498 | self.eval() 499 | dataloader = DataLoader( 500 | dataset=samples, 501 | batch_size=batch_size, 502 | collate_fn=lambda x: self.prepare_sample(x, inference=True), 503 | num_workers=multiprocessing.cpu_count(), 504 | ) 505 | 506 | prog_bar = PredictProgressBar() 507 | #tb_logger = TensorBoardLogger("tb_logs", name="DEUP_logger") 508 | trainer = ptl.Trainer( 509 | gpus=gpus, 510 | deterministic=True, 511 | logger=False, 512 | callbacks=[prog_bar], 513 | accelerator="dp" if gpus > 1 else None, 514 | ) 515 | 516 | if mc_dropout: 517 | self.set_mc_dropout(mc_dropout) 518 | predictions = trainer.predict( 519 | self, dataloaders=dataloader, return_predictions=True 520 | ) 521 | mean_scores = [out[0] for out in predictions] 522 | std_scores = [out[1] for out in predictions] 523 | mean_scores = torch.cat(mean_scores, dim=0).tolist() 524 | std_scores = torch.cat(std_scores, dim=0).tolist() 525 | 526 | if isinstance(self.loss, HeteroscedasticLoss): 527 | hts_scores = [out[2] for out in predictions] 528 | hts_std_scores = [out[3] for out in predictions] 529 | hts_scores = torch.cat(hts_scores, dim=0).tolist() 530 | hts_std_scores = torch.cat(hts_std_scores, dim=0).tolist() 531 | return mean_scores, std_scores, hts_scores, hts_std_scores, sum(mean_scores) / len(mean_scores) 532 | 533 | return mean_scores, std_scores, sum(mean_scores) / len(mean_scores) 534 | 535 | else: 536 | predictions = trainer.predict( 537 | self, dataloaders=dataloader, return_predictions=True 538 | ) 539 | 540 | if isinstance(self.loss, HeteroscedasticLoss): 541 | #print(predictions) 542 | mean_scores = [out[0] for out in predictions] 543 | 544 | hts_scores = [out[1] for out in predictions] 545 | #print(hts_scores) 546 | #print(len(predictions)) 547 | 548 | 549 | quality_predictions = torch.cat(mean_scores, dim=0).tolist() 550 | variance_predictions = torch.cat(hts_scores, dim=0).tolist() 551 | #print(variance_predictions) 552 | #print(len(variance_predictions)) 553 | 554 | return quality_predictions, variance_predictions, sum(quality_predictions) / len(quality_predictions) 555 | else: 556 | predictions = torch.cat(predictions, dim=0).tolist() 557 | return predictions, sum(predictions) / len(predictions) 558 | -------------------------------------------------------------------------------- /tests/data/test_ranking_data.csv: -------------------------------------------------------------------------------- 1 | src,ref,pos,neg 2 | "In terms of the attack, Christopheros is convinced that the unknown man must have confused him with someone else.","In terms of the attack, Christopheros is convinced that the unknown 
man must have confused him with someone else.","As for the attack, Christopheros is sure the unknown man must have mistaken him for someone else.","As for the attack, Christopheros is confident that an unknown man had to confuse him with someone." 3 | "I'm happy with how it's going so far.""","I'm happy with how it's going so far.""","I'm happy with the result so far.""","With the result I am satisfied. """ 4 | "Joining the trio in the field are former Masters winner Ian Woosnam and Senior major champions Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty.","Joining the trio in the field are former Masters winner Ian Woosnam and Senior major champions Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty.","For a trio of players on the field connects also a former winner of the Masters, Ian Woosnam and the winners of the senior tournament Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty..","The three players will also be joined on the field by former Masters winner Ian Woosnam and senior tournament winners Paul Broadhurst, Roger Chapman, Mark James and Mark McNulty." 5 | "Orr and former Ryder Cup captain Sam Torrance are among six Scots in the field, the others being Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond.","Orr and former Ryder Cup captain Sam Torrance are among six Scots in the field, the others being Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond.","ORR and former Ryder Cup captain Sam Torrance are among six Scottish players on the pitch; the others are Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond.","ORR and former Ryder Cup captain Sam Torrance are among the six Scottish players on the field; Others are Andrew Oldcorn, Stephen McAllister, Bill Longmuir and Ross Drummond." 6 | """You want to do well; but you do feel that extra pressure.","""You want to do well; but you do feel that extra pressure.","""You want to be successful and feel great pressure.","""You want to do well while feeling a lot of pressure." 7 | """You want to do well; but you do feel that extra pressure.","""You want to do well; but you do feel that extra pressure.","""You want to be able to thrive and you feel a lot of pressure.","""You want to do well while feeling a lot of pressure." 
8 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in Indonesia's land of gold,Poverty and death in indonesian country gold 9 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in the Indonesian country of gold,Poverty and death in indonesian country gold 10 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in indonesian country gold 11 | Poverty and Death in Indonesia's Land of Gold,Poverty and Death in Indonesia's Land of Gold,Poverty and death in the Indonesian country of gold,Poverty and death in indonesian country gold 12 | "Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.","Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.",The Foundation for Responsible Robotics (FRR) recently released a statement on the development of sex robots within the next five to 10 years.,Organization Foundation for Responsible Robotics (FRR) recently issued a statement about the development of sex robots within the next five to ten years. 13 | "Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.","Recently, the Foundation for Responsible Robotics (FRR) released a report looking at the development of sex robots over the next five to 10 years.",The Foundation for Responsible Robotics (FRR) recently issued a statement on the development of sex robots within the next five to ten years.,Organization Foundation for Responsible Robotics (FRR) recently issued a statement about the development of sex robots within the next five to ten years. 14 | """This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and we will bring this change of green card issuance rules to residents of other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers." 15 | """This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers, and we will do this by changing the rules for issuing green cards to residents of other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers." 
16 | """This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, raise wages and save billions of dollars for taxpayers, and will do this by changing the way the US issues Green Cards to citizens from other countries.","""This law will reduce poverty, increase wages and will save billions of dollars for tax payers. You can achieve it by changing the rules on the issuing of green cards to residents of other countries.","""This bill will reduce poverty, raise wages and save billions of dollars for taxpayers." 17 | The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and are supported by the president.,New rules are proposed by the Republicans and the President.,The new rules propose the republicans and the president. 18 | The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and the president.,The new rules propose the republicans and the president. 19 | The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and are supported by the president.,The new rules are proposed by the Republicans and the president.,The new rules propose the republicans and the president. 20 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The bill will be submitted to Congress soon.,The bill will soon be submitted to Congress. 21 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The bill will be submitted to Congress soon.,The bill will soon be submitted to Congress. 22 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will be submitted to Congress soon. 23 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The bill will be submitted to Congress soon.,The bill will soon be submitted to Congress. 24 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will soon be submitted to Congress. 25 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will soon be submitted to Congress. 26 | The draft law is due to be discussed in the two chambers of Congress.,The draft law is due to be discussed in the two chambers of Congress.,The draft law will be soon submitted to Congress.,The bill will soon be submitted to Congress. 
27 | "Freedom of speech, dissent and discourse lie at the very foundation of our nation.","Freedom of speech, dissent and discourse lie at the very foundation of our nation.","Freedom of speech, disagreement and intimidation lies in the very foundations of our nation.","Freedom of expression, dissent and the intimidation lies in the very foundation of our nation." 28 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader must accept it.,And the real leader has to accept that. 29 | And true leadership means accepting that.,And true leadership means accepting that.,And a real leader has to accept that.,And the real leader has to accept that. 30 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader has to accept it.,And the real leader has to accept that. 31 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader must accept it.,And a true leader must take. 32 | And true leadership means accepting that.,And true leadership means accepting that.,And a real leader has to accept that.,And a true leader must take. 33 | And true leadership means accepting that.,And true leadership means accepting that.,And the real leader has to accept it.,And a true leader must take. 34 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 35 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 36 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 37 | "And even back then, Washington had anonymous trolls.","And even back then, Washington had anonymous trolls.","And even in those times, Washington had to deal with anonymous trolls.","And even in those days, Washington had to deal with anonymous trolls." 38 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In the times of Washington, the low price of postage had virtually the same impact as the internet.","In the Times of Washington, the low postage price had practically the same impact as the Internet." 39 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In the days of Washington, the low cost of postage had virtually the same impact as the Internet.","In Washington, the low cost of postage had virtually the same impact as the Internet." 40 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington, the low cost of postage had virtually the same impact as the Internet.","In the Times of Washington, the low postage price had practically the same impact as the Internet." 
41 | "In Washington's time, the era of affordable postage had an impact much like the Internet.","In Washington's time, the era of affordable postage had an impact much like the Internet.","In the days of Washington, the low cost of postage had virtually the same impact as the Internet.","In the Times of Washington, the low postage price had practically the same impact as the Internet." 42 | "The number of newspapers quadrupled between 1776 and 1800, and anonymous letter writers hammered his leadership.","The number of newspapers quadrupled between 1776 and 1800, and anonymous letter writers hammered his leadership.","Between 1776 and 1800, the number of daily newspapers quadrupled and his leader's position was questioned by many anonymous writers.","Between the years 1776 and 1800, the number of daily newspapers quadrupled and his position as leader Zpochybňovalo many anonymous writers." 43 | "So blocking people who come to the governor's page - which is a public forum, labeled as official and administered by staff members paid public tax dollars - is unnecessary and ultimately dangerous.","So blocking people who come to the governor's page - which is a public forum, labeled as official and administered by staff members paid public tax dollars - is unnecessary and ultimately dangerous.",Blocking people who visit the Guvernérovu page – which is a public forum designated as official and managed by its employees paid from taxpayers ' money – is useless and ultimately dangerous.,"Blocking people closing on the governor's page - which is a public forum, identified as official and managed by his staff paid from taxpayers ""money - is unnecessary and ultimately dangerous." 44 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview for The Washington Post, Hogan's spokeswoman Amelia Chasse defended the governor, saying that blocking the comments went from a position of modelling contributions.","In an interview with Washington Post Hogan's spokeswoman Amelia Chasse, the governor argued that comment blocking was from post-moderation posts." 45 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview for The Washington Post, Hogan's spokeswoman Amelia Chasse defended the governor, saying that blocking the comments went from a position of modelling contributions." 
46 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview for The Washington Post, Hogan's spokeswoman Amelia Chasse defended the governor, saying that blocking the comments went from a position of modelling contributions.","In an interview with the Washington Post Hoganova, the spokesman Amelia Chasseová defended the governor, saying that the blocking of comments took place from the post moderation." 47 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview with Washington Post Hogan's spokeswoman Amelia Chasse, the governor argued that comment blocking was from post-moderation posts." 48 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post Hoganova spokesperson Amelia Chasseová defended the governor, saying that the blocking of the comments was from a position of moderation of the contributions.","In an interview with Washington Post Hogan's spokeswoman Amelia Chasse, the governor argued that comment blocking was from post-moderation posts." 49 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview with the Washington Post Hoganova spokesperson Amelia Chasseová defended the governor, saying that the blocking of the comments was from a position of moderation of the contributions." 
50 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post Hoganova spokesperson Amelia Chasseová defended the governor, saying that the blocking of the comments was from a position of moderation of the contributions.","In an interview with the Washington Post Hoganova, the spokesman Amelia Chasseová defended the governor, saying that the blocking of comments took place from the post moderation." 51 | "In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with The Washington Post, Hogan spokeswoman Amelia Chasse defended the governor's actions, arguing that blocking the comments was nothing more than moderating them.","In an interview with the Washington Post, Hogan's spokeswoman, Amelia Chasse, defended the governor, saying the blocking of comments was done from a position of moderating posts.","In an interview with the Washington Post Hoganova, the spokesman Amelia Chasseová defended the governor, saying that the blocking of comments took place from the post moderation." 52 | The Post talked to some of the real people blocked by Hogan.,The Post talked to some of the real people blocked by Hogan.,The Washington Post interviewed some real people who had their posts deleted.,The Washington Post interviewed several real people to whom the posts were deleted. 53 | The Post talked to some of the real people blocked by Hogan.,The Post talked to some of the real people blocked by Hogan.,The Washington Post interviewed some real people who had their posts deleted.,The Washington Post interviewed several real people have been deleted posts. 54 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As everyone reported, their comments were polite and considerate, not profane.","As everyone said, their comments were polite and considerate, not blatant." 55 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As everyone reported, their comments were polite and considerate, not profane.","As they all said, their comments were polite and considerate, not blasphemous." 56 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As you all have said, their comments were polite and respectful, no profane.","As everyone said, their comments were polite and considerate, not blatant." 57 | "They all said that their comments were respectful, thoughtful and not profane.","They all said that their comments were respectful, thoughtful and not profane.","As you all have said, their comments were polite and respectful, no profane.","As they all said, their comments were polite and considerate, not blasphemous." 
58 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","In his comment, the pastor cited the Bible, appealing to Hogan's Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 59 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","In his commentary, the pastor cited the Bible, appealing to Hogan's Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 60 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his commentary, appealing to Hogan's Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 61 | "The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The pastor quoted the Bible in his post, appealing to Hogan's Catholic faith.","The Pastor quoted the Bible in his commentary, apelujíc the Hoganovu Catholic faith.","The Pastor, in his commentary quoted from the Bible, apelujíc on Hoganovu the catholic faith." 62 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Bowie's attorney Lakshmi Sarma Ramani, Md. asked about hate crimes, however, her comment was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 63 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Lawyer Bowie Lakshmi Sarma Ramani, MD asked for hate crimes, but her comment was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 64 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Bowie Lakshmi's attorney, Sarma Ramani, Md., inquired about hate crimes, but her comment was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 65 | "Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","Attorney Lakshmi Sarma Ramani of Bowie, Md., wasn't hateful, but she asked about hate crimes.","The lawyer of Bowie Lakshmi Sarma Ramaniová, Md. Asked about hate crimes, but her commentary was not hateful.","Attorney company Bowie Lakshmi Sarma Ramaniová, Md. asked on hate crimes, her comment was not hateful." 
66 | "Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook will send more potentially fake articles to control third parties and display findings under original posts in an attempt to fight against the so-called ""fake news"", said the largest social network on Thursday.","Facebook will send more potentially fake articles to third-party control and display findings under the original posts in an effort to fight so-called ""fake news,"" said the largest social network on Thursday." 67 | "Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook is to send more potential hoax articles to third-party fact checkers and show their findings below the original post, the world's largest online social network said on Thursday as it tries to fight so-called fake news.","Facebook will send a more potentially false articles for inspection to third parties and to display findings under the original posts in an attempt to fight against the so-called ""fake news"", said the largest social network on Thursday.","Facebook will send more potentially fake articles to third-party control and display findings under the original posts in an effort to fight so-called ""fake news,"" said the largest social network on Thursday." 68 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe everything will be all right.,I believe that all will be well. 69 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe everything will be all right.,I believe everything will be fine. 70 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe all will be well.,I believe that all will be well. 71 | I'm not going to worry too much about it.,I'm not going to worry too much about it.,I believe all will be well.,I believe everything will be fine. 72 | "We have a plan in place.""","We have a plan in place.""","It's all planned. ""","Everything is scheduled. """ 73 | "Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is beautiful, I will miss, says goalie Sedláček and looks for engagement.","Riga is beautiful, I will miss him, says the goalie Sedlacek and looking for engagement." 74 | "Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is amazing, I'm going to miss, said goalkeeper Sedláček as he looks for a new engagement.","Riga is beautiful, I will miss, says goalkeeper Sedláček and looking for engagement.","Riga is beautiful, I will miss him, says the goalie Sedlacek and looking for engagement." 
75 | The World Health Organization considers the preparation to be slightly toxic.,The World Health Organization considers the preparation to be slightly toxic.,The world health organization considers the preparation as moderately toxic.,World Health Organization WHO considers the preparation to be slightly toxic.
76 | The World Health Organization considers the preparation to be slightly toxic.,The World Health Organization considers the preparation to be slightly toxic.,The world health organization considers the preparation as moderately toxic.,The WHO considers the preparation to be slightly toxic.
77 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
78 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
79 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
80 | How were the four years you spent in Latvia?,How were the four years you spent in Latvia?,What were the four years you lived in Latvia?,What were the four years you spent in Latvia?
81 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether you get an engagement at the start of a new year.,It depends on whether he gets an engagement at the start of the new year.
82 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether he gets an engagement before the start of the new year.,It depends on whether he gets an engagement at the start of the new year.
83 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether the new year starts gaining engagement.,It depends on whether he gets an engagement at the start of the new year.
84 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether you get an engagement at the start of a new year.,It depends on whether the start of the new year gets engagement.
85 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether he gets an engagement before the start of the new year.,It depends on whether the start of the new year gets engagement.
86 | It depends on whether he finds an engagement before the start of the new year.,It depends on whether he finds an engagement before the start of the new year.,It depends on whether the new year starts gaining engagement.,It depends on whether the start of the new year gets engagement.
87 | I was learning.,I was learning.,I was learning.,I was taught to.
88 | I was learning.,I was learning.,I learned.,I was taught to.
89 | I was learning.,I was learning.,I was learning.,I was taught to.
90 | I was learning.,I was learning.,I was learning.,I was taught to.
91 | "It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina, but ""more like a shower gel or an egg or a futuristic object.","Doesn't look like an anatomical replica of the vagina, but ""more like a shower gel or as an egg or futuristic object." 92 | "It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina, but ""more like a shower gel or an egg or a futuristic object.","Doesn't look like an anatomical replica of the vagina, but ""more like a shower gel or as an egg or futuristic object." 93 | "It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It doesn't look like an anatomical imitation of a vagina but ""more like a shower gel or an egg, or some kind of futuristic object.","It does not look like an anatomical imitation of the vagina, but ""more like a shower gel or as an egg or futuristic object.","Doesn't look like an anatomical replica of the vagina, but ""more like a shower gel or as an egg or futuristic object." 94 | "On the other hand, the category of penetrative vibrators was a complete washout,","On the other hand, the category of penetrative vibrators was a complete washout,","On the contrary, a certain dropper is a category of stunning vibrators.","On the contrary, a certain parachute is a category of loading vibrators." 95 | at least for us.,at least for us.,At least then with us.,At least with us. 96 | at least for us.,at least for us.,At least then with us.,At least with us. 97 | at least for us.,at least for us.,At least then with us.,At least with us. 98 | at least for us.,at least for us.,At least then with us.,At least in the us. 99 | Correspondents say the growing strength of the Taliban and the group known as so-called Islamic State (IS) in Qarabagh is a major source of concern to Nato forces based in nearby Bagram.,Correspondents say the growing strength of the Taliban and the group known as so-called Islamic State (IS) in Qarabagh is a major source of concern to Nato forces based in nearby Bagram.,"According to the reports, the growing power of the Taliban and the group known as Islamic State (IS) in the Qarabagh area is a major source of concern for NATO-allied forces in nearby Bagram.",According to the observations of the rapporteurs is the growing power of the Taliban and the group known as the Islamic state (IS) in the area of Qarabagh an important source of concern for NATO allied forces at the nearby Bagram. 100 | All measures have been implemented.,All measures have been implemented.,All measures have been taken.,It was taken to all measures. 101 | All measures have been implemented.,All measures have been implemented.,All measures have been taken.,It was taken to all measures. 102 | --------------------------------------------------------------------------------