├── .DS_Store ├── LICENSE ├── README.md ├── evaluation ├── .DS_Store ├── evaluation_inference_speed.py ├── evaluation_multilingual.py ├── evaluation_stsbenchmark.py ├── evaluation_stsbenchmark_sbert-wk.py ├── evaluation_translation_matching.py └── stsbenchmark.zip ├── index.rst ├── requirements.txt ├── sentence_transformers ├── LoggingHandler.py ├── SentenceTransformer.py ├── __init__.py ├── __pycache__ │ ├── LoggingHandler.cpython-36.pyc │ ├── LoggingHandler.cpython-38.pyc │ ├── SentenceTransformer.cpython-36.pyc │ ├── SentenceTransformer.cpython-38.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-38.pyc │ ├── util.cpython-36.pyc │ └── util.cpython-38.pyc ├── cross_encoder │ ├── CrossEncoder.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── CrossEncoder.cpython-36.pyc │ │ ├── CrossEncoder.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc │ └── evaluation │ │ ├── CEBinaryAccuracyEvaluator.py │ │ ├── CEBinaryClassificationEvaluator.py │ │ ├── CECorrelationEvaluator.py │ │ ├── CERerankingEvaluator.py │ │ ├── CESoftmaxAccuracyEvaluator.py │ │ └── __init__.py ├── datasets │ ├── DenoisingAutoEncoderDataset.py │ ├── NoDuplicatesDataLoader.py │ ├── ParallelSentencesDataset.py │ ├── SentenceLabelDataset.py │ ├── SentencesDataset.py │ ├── __init__.py │ └── __pycache__ │ │ ├── DenoisingAutoEncoderDataset.cpython-36.pyc │ │ ├── DenoisingAutoEncoderDataset.cpython-38.pyc │ │ ├── NoDuplicatesDataLoader.cpython-36.pyc │ │ ├── NoDuplicatesDataLoader.cpython-38.pyc │ │ ├── ParallelSentencesDataset.cpython-36.pyc │ │ ├── ParallelSentencesDataset.cpython-38.pyc │ │ ├── SentenceLabelDataset.cpython-36.pyc │ │ ├── SentenceLabelDataset.cpython-38.pyc │ │ ├── SentencesDataset.cpython-36.pyc │ │ ├── SentencesDataset.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── evaluation │ ├── BinaryClassificationEvaluator.py │ ├── EmbeddingSimilarityEvaluator.py │ ├── InformationRetrievalEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── MSEEvaluator.py │ ├── MSEEvaluatorFromDataFrame.py │ ├── ParaphraseMiningEvaluator.py │ ├── RerankingEvaluator.py │ ├── SentenceEvaluator.py │ ├── SequentialEvaluator.py │ ├── SimilarityFunction.py │ ├── TranslationEvaluator.py │ ├── TripletEvaluator.py │ ├── __init__.py │ └── __pycache__ │ │ ├── BinaryClassificationEvaluator.cpython-36.pyc │ │ ├── BinaryClassificationEvaluator.cpython-38.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-36.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-38.pyc │ │ ├── InformationRetrievalEvaluator.cpython-36.pyc │ │ ├── InformationRetrievalEvaluator.cpython-38.pyc │ │ ├── LabelAccuracyEvaluator.cpython-36.pyc │ │ ├── LabelAccuracyEvaluator.cpython-38.pyc │ │ ├── MSEEvaluator.cpython-36.pyc │ │ ├── MSEEvaluator.cpython-38.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-36.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-38.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-36.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-38.pyc │ │ ├── RerankingEvaluator.cpython-36.pyc │ │ ├── RerankingEvaluator.cpython-38.pyc │ │ ├── SentenceEvaluator.cpython-36.pyc │ │ ├── SentenceEvaluator.cpython-38.pyc │ │ ├── SequentialEvaluator.cpython-36.pyc │ │ ├── SequentialEvaluator.cpython-38.pyc │ │ ├── SimilarityFunction.cpython-36.pyc │ │ ├── SimilarityFunction.cpython-38.pyc │ │ ├── TranslationEvaluator.cpython-36.pyc │ │ ├── TranslationEvaluator.cpython-38.pyc │ │ ├── TripletEvaluator.cpython-36.pyc │ │ ├── TripletEvaluator.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── losses │ 
├── BYOLoss.py │ ├── BatchAllTripletLoss.py │ ├── BatchHardSoftMarginTripletLoss.py │ ├── BatchHardTripletLoss.py │ ├── BatchSemiHardTripletLoss.py │ ├── ContrastiveLoss.py │ ├── ContrastiveTensionLoss.py │ ├── CosineSimilarityLoss.py │ ├── DenoisingAutoEncoderLoss.py │ ├── MSELoss.py │ ├── MegaBatchMarginLoss.py │ ├── MultipleNegativesRankingLoss.py │ ├── OnlineContrastiveLoss.py │ ├── SoftmaxLoss.py │ ├── TripletLoss.py │ ├── __init__.py │ └── __pycache__ │ │ ├── BYOLoss.cpython-36.pyc │ │ ├── BYOLoss.cpython-38.pyc │ │ ├── BatchAllTripletLoss.cpython-36.pyc │ │ ├── BatchAllTripletLoss.cpython-38.pyc │ │ ├── BatchHardSoftMarginTripletLoss.cpython-36.pyc │ │ ├── BatchHardSoftMarginTripletLoss.cpython-38.pyc │ │ ├── BatchHardTripletLoss.cpython-36.pyc │ │ ├── BatchHardTripletLoss.cpython-38.pyc │ │ ├── BatchSemiHardTripletLoss.cpython-36.pyc │ │ ├── BatchSemiHardTripletLoss.cpython-38.pyc │ │ ├── ContrastiveLoss.cpython-36.pyc │ │ ├── ContrastiveLoss.cpython-38.pyc │ │ ├── ContrastiveTensionLoss.cpython-36.pyc │ │ ├── ContrastiveTensionLoss.cpython-38.pyc │ │ ├── CosineSimilarityLoss.cpython-36.pyc │ │ ├── CosineSimilarityLoss.cpython-38.pyc │ │ ├── DenoisingAutoEncoderLoss.cpython-36.pyc │ │ ├── DenoisingAutoEncoderLoss.cpython-38.pyc │ │ ├── MSELoss.cpython-36.pyc │ │ ├── MSELoss.cpython-38.pyc │ │ ├── MegaBatchMarginLoss.cpython-36.pyc │ │ ├── MegaBatchMarginLoss.cpython-38.pyc │ │ ├── MultipleNegativesRankingLoss.cpython-36.pyc │ │ ├── MultipleNegativesRankingLoss.cpython-38.pyc │ │ ├── OnlineContrastiveLoss.cpython-36.pyc │ │ ├── OnlineContrastiveLoss.cpython-38.pyc │ │ ├── SoftmaxLoss.cpython-36.pyc │ │ ├── SoftmaxLoss.cpython-38.pyc │ │ ├── TripletLoss.cpython-36.pyc │ │ ├── TripletLoss.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── models │ ├── ALBERT.py │ ├── Asym.py │ ├── BERT.py │ ├── BoW.py │ ├── CLIPModel.py │ ├── CNN.py │ ├── CamemBERT.py │ ├── Dense.py │ ├── DistilBERT.py │ ├── LSTM.py │ ├── LayerNorm.py │ ├── Normalize.py │ ├── Pooling.py │ ├── RoBERTa.py │ ├── T5.py │ ├── Transformer.py │ ├── WKPooling.py │ ├── WeightedLayerPooling.py │ ├── WordEmbeddings.py │ ├── WordWeights.py │ ├── XLMRoBERTa.py │ ├── XLNet.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── Asym.cpython-36.pyc │ │ ├── Asym.cpython-38.pyc │ │ ├── BERT.cpython-36.pyc │ │ ├── BERT.cpython-38.pyc │ │ ├── BoW.cpython-36.pyc │ │ ├── BoW.cpython-38.pyc │ │ ├── CNN.cpython-36.pyc │ │ ├── CNN.cpython-38.pyc │ │ ├── Dense.cpython-36.pyc │ │ ├── Dense.cpython-38.pyc │ │ ├── DistilBERT.cpython-36.pyc │ │ ├── LSTM.cpython-36.pyc │ │ ├── LSTM.cpython-38.pyc │ │ ├── LayerNorm.cpython-36.pyc │ │ ├── LayerNorm.cpython-38.pyc │ │ ├── Normalize.cpython-36.pyc │ │ ├── Normalize.cpython-38.pyc │ │ ├── Pooling.cpython-36.pyc │ │ ├── Pooling.cpython-38.pyc │ │ ├── Transformer.cpython-36.pyc │ │ ├── Transformer.cpython-38.pyc │ │ ├── WKPooling.cpython-36.pyc │ │ ├── WKPooling.cpython-38.pyc │ │ ├── WeightedLayerPooling.cpython-36.pyc │ │ ├── WeightedLayerPooling.cpython-38.pyc │ │ ├── WordEmbeddings.cpython-36.pyc │ │ ├── WordEmbeddings.cpython-38.pyc │ │ ├── WordWeights.cpython-36.pyc │ │ ├── WordWeights.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc │ └── tokenizer │ │ ├── PhraseTokenizer.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ ├── WhitespaceTokenizer.cpython-36.pyc │ │ ├── WhitespaceTokenizer.cpython-38.pyc │ │ ├── WordTokenizer.cpython-36.pyc │ │ ├── WordTokenizer.cpython-38.pyc │ │ ├── 
__init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── readers │ ├── InputExample.py │ ├── LabelSentenceReader.py │ ├── NLIDataReader.py │ ├── PairedFilesReader.py │ ├── STSDataReader.py │ ├── TripletReader.py │ ├── __init__.py │ └── __pycache__ │ │ ├── InputExample.cpython-36.pyc │ │ ├── InputExample.cpython-38.pyc │ │ ├── LabelSentenceReader.cpython-36.pyc │ │ ├── LabelSentenceReader.cpython-38.pyc │ │ ├── NLIDataReader.cpython-36.pyc │ │ ├── NLIDataReader.cpython-38.pyc │ │ ├── STSDataReader.cpython-36.pyc │ │ ├── STSDataReader.cpython-38.pyc │ │ ├── TripletReader.cpython-36.pyc │ │ ├── TripletReader.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc └── util.py ├── setup.cfg ├── setup.py └── training ├── .DS_Store ├── data ├── .DS_Store └── back_translated_nli.txt.zip ├── multilingual_tuning.py ├── supervised_tuning.py └── unsupervised_tuning.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Install first 2 | 3 | ```` 4 | pip3 install -e . 5 | ```` 6 | 7 | ## Training 8 | 9 | ```` 10 | python3 training/unsupervised_tuning.py 11 | python3 training/supervised_tuning.py 12 | python3 training/multilingual_tuning.py 13 | ```` 14 | 15 | The multilingual NLI corpus can be downloaded from here (https://drive.google.com/file/d/19O2NArJz_RlVNNGRbBnnWxNMW-7HaFZ8/view?usp=sharing) 16 | 17 | ## pretrained Model 18 | Our pretrained model can be downloaded from here (https://drive.google.com/drive/folders/1fURXl4fGTGJ55PQF_Gr4Wr8ds2Qwa7U5?usp=sharing) 19 | 20 | 21 | ## Acknowledgements 22 | 23 | Codes are adapted from the repos of the EMNLP19 paper [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://github.com/UKPLab/sentence-transformers) 24 | -------------------------------------------------------------------------------- /evaluation/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/evaluation/.DS_Store -------------------------------------------------------------------------------- /evaluation/evaluation_inference_speed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples measures the inference speed of a certain model 3 | 4 | Usage: 5 | python evaluation_inference_speed.py 6 | OR 7 | python evaluation_inference_speed.py model_name 8 | """ 9 | from sentence_transformers import SentenceTransformer, util 10 | import sys 11 | import os 12 | import time 13 | import torch 14 | import gzip 15 | import csv 16 | 17 | #Limit torch to 4 threads 18 | torch.set_num_threads(4) 19 | 20 | 21 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens' 22 | 23 | # Load a named sentence model (based on BERT). This will download the model from our server. 
24 | # Alternatively, you can also pass a filepath to SentenceTransformer() 25 | model = SentenceTransformer(model_name) 26 | 27 | 28 | nli_dataset_path = 'datasets/AllNLI.tsv.gz' 29 | sentences = set() 30 | max_sentences = 100000 31 | 32 | 33 | #Download datasets if needed 34 | if not os.path.exists(nli_dataset_path): 35 | util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path) 36 | 37 | with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn: 38 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 39 | for row in reader: 40 | sentences.add(row['sentence1']) 41 | if len(sentences) >= max_sentences: 42 | break 43 | 44 | sentences = list(sentences) 45 | print("Model Name:", model_name) 46 | print("Number of sentences:", len(sentences)) 47 | 48 | for i in range(3): 49 | print("Run", i) 50 | start_time = time.time() 51 | emb = model.encode(sentences, num_workers=2, batch_size=32) 52 | end_time = time.time() 53 | diff_time = end_time - start_time 54 | print("Done after {:.2f} seconds".format(diff_time)) 55 | print("Speed: {:.2f} sentences / second".format(len(sentences) / diff_time)) 56 | print("=====") -------------------------------------------------------------------------------- /evaluation/evaluation_multilingual.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, evaluation 3 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator 4 | from sentence_transformers.readers import STSBenchmarkDataReader 5 | import logging 6 | import sys 7 | import os 8 | import torch 9 | import numpy as np 10 | import zipfile 11 | import io 12 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 13 | 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | model_name = 'stsb-xlm-r-multilingual' 23 | # Load a named sentence model (based on BERT). This will download the model from our server. 
24 | # Alternatively, you can also pass a filepath to SentenceTransformer() 25 | model = SentenceTransformer(model_name) 26 | 27 | 28 | # Read the dataset 29 | source_languages = ['en'] 30 | target_languages = ['en', 'de', 'es', 'fr', 'ar', 'tr'] 31 | sts_corpus = "../training/data/STS2017-extended.zip" 32 | 33 | logging.info("Read STS test dataset") 34 | ##### Read cross-lingual Semantic Textual Similarity (STS) data #### 35 | all_languages = list(set(list(source_languages)+list(target_languages))) 36 | sts_data = {} 37 | evaluators = [] 38 | #Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data 39 | with zipfile.ZipFile(sts_corpus) as zip: 40 | filelist = zip.namelist() 41 | for i in range(len(all_languages)): 42 | for j in range(i, len(all_languages)): 43 | lang1 = all_languages[i] 44 | lang2 = all_languages[j] 45 | filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2) 46 | if filepath not in filelist: 47 | lang1, lang2 = lang2, lang1 48 | filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2) 49 | 50 | if filepath in filelist: 51 | filename = os.path.basename(filepath) 52 | sts_data[filename] = {'sentences1': [], 'sentences2': [], 'scores': []} 53 | 54 | fIn = zip.open(filepath) 55 | for line in io.TextIOWrapper(fIn, 'utf8'): 56 | sent1, sent2, score = line.strip().split("\t") 57 | score = float(score) 58 | sts_data[filename]['sentences1'].append(sent1) 59 | sts_data[filename]['sentences2'].append(sent2) 60 | sts_data[filename]['scores'].append(score) 61 | 62 | # model = SentenceTransformer(model_save_path) 63 | for filename, data in sts_data.items(): 64 | test_evaluator = EmbeddingSimilarityEvaluator(data['sentences1'], data['sentences2'], data['scores'], batch_size=16, name=filename, show_progress_bar=False) 65 | test_evaluator(model) 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /evaluation/evaluation_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples loads a pre-trained model and evaluates it on the STSbenchmark dataset 3 | 4 | Usage: 5 | python evaluation_stsbenchmark.py 6 | OR 7 | python evaluation_stsbenchmark.py model_name 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, evaluation 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import sys 15 | import os 16 | import torch 17 | import numpy as np 18 | 19 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 20 | 21 | #Limit torch to 4 threads 22 | torch.set_num_threads(4) 23 | 24 | #### Just some code to print debug information to stdout 25 | logging.basicConfig(format='%(asctime)s - %(message)s', 26 | datefmt='%Y-%m-%d %H:%M:%S', 27 | level=logging.INFO, 28 | handlers=[LoggingHandler()]) 29 | #### /print debug information to stdout 30 | model_name = '../training/output/BSL_tuning-bert-base-nli-mean-tokens-64-2021-08-31_21-19-14' 31 | # Load a named sentence model (based on BERT). This will download the model from our server. 
32 | # Alternatively, you can also pass a filepath to SentenceTransformer() 33 | model = SentenceTransformer(model_name) 34 | 35 | sts_corpus = "stsbenchmark/" 36 | target_eval_files = set(['sts','sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r']) 37 | 38 | evaluators = [] #evaluators is a list of the different evaluator classes we call periodically 39 | sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, sts_corpus)) 40 | for target in target_eval_files: 41 | output_filename_eval = os.path.join(script_folder_path,sts_corpus + target + "-test.csv") 42 | evaluators.append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target)) 43 | 44 | evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)) 45 | model.evaluate(evaluator) 46 | -------------------------------------------------------------------------------- /evaluation/evaluation_stsbenchmark_sbert-wk.py: -------------------------------------------------------------------------------- 1 | """ 2 | Performs the pooling described in the paper: 3 | SBERT-WK: A Sentence Embedding Method by Dissecting BERT-based Word Models, 2020, https://arxiv.org/abs/2002.06652 4 | 5 | Note: WKPooling improves the performance only for certain models. Further, WKPooling requires QR-decomposition, 6 | for which there is so far no efficient implementation in PyTorch for GPUs (see https://github.com/pytorch/pytorch/issues/22573). 7 | Hence, WKPooling runs on the CPU, which makes it rather inefficient. 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, LoggingHandler, models 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import torch 15 | 16 | #Limit torch to 4 threads, as this example runs on the CPU 17 | torch.set_num_threads(4) 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | 27 | #1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true 28 | word_embedding_model = models.Transformer('bert-base-uncased', model_args={'output_hidden_states': True}) 29 | 30 | #2) Add WKPooling 31 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 32 | 33 | #3) Create a sentence transformer model to glue both models together 34 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 35 | 36 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 37 | evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples("sts-test.csv")) 38 | 39 | model.evaluate(evaluator) 40 | -------------------------------------------------------------------------------- /evaluation/evaluation_translation_matching.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a tab-separated file (.tsv) with parallel sentences, where the second column is the translation of the sentence in the first column, for example, in the format: 3 | src1 trg1 4 | src2 trg2 5 | ... 6 | 7 | where trg_i is the translation of src_i. 
8 | 9 | Given src_i, the TranslationEvaluator checks which trg_j has the highest similarity using cosine similarity. If i == j, we assume 10 | a match, i.e., the correct translation has been found for src_i out of all possible target sentences. 11 | 12 | It then computes an accuracy over all possible source sentences src_i. Equivalently, it computes also the accuracy for the other direction. 13 | 14 | A high accuracy score indicates that the model is able to find the correct translation out of a large pool with sentences. 15 | 16 | Usage: 17 | python [model_name_or_path] [parallel-file1] [parallel-file2] ... 18 | 19 | For example: 20 | python distiluse-base-multilingual-cased TED2020-en-de.tsv.gz 21 | 22 | See the training_multilingual/get_parallel_data_...py scripts for getting parallel sentence data from different sources 23 | """ 24 | 25 | from sentence_transformers import SentenceTransformer, evaluation, LoggingHandler 26 | import sys 27 | import gzip 28 | import os 29 | import logging 30 | 31 | 32 | logging.basicConfig(format='%(asctime)s - %(message)s', 33 | datefmt='%Y-%m-%d %H:%M:%S', 34 | level=logging.INFO, 35 | handlers=[LoggingHandler()]) 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | model_name = sys.argv[1] 40 | filepaths = sys.argv[2:] 41 | inference_batch_size = 32 42 | 43 | model = SentenceTransformer(model_name) 44 | 45 | 46 | for filepath in filepaths: 47 | src_sentences = [] 48 | trg_sentences = [] 49 | with gzip.open(filepath, 'rt', encoding='utf8') if filepath.endswith('.gz') else open(filepath, 'r', encoding='utf8') as fIn: 50 | for line in fIn: 51 | splits = line.strip().split('\t') 52 | if len(splits) >= 2: 53 | src_sentences.append(splits[0]) 54 | trg_sentences.append(splits[1]) 55 | 56 | logger.info(os.path.basename(filepath)+": "+str(len(src_sentences))+" sentence pairs") 57 | dev_trans_acc = evaluation.TranslationEvaluator(src_sentences, trg_sentences, name=os.path.basename(filepath), batch_size=inference_batch_size) 58 | dev_trans_acc(model) 59 | 60 | 61 | -------------------------------------------------------------------------------- /evaluation/stsbenchmark.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/evaluation/stsbenchmark.zip -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=3.1.0,<5.0.0 2 | tqdm 3 | torch>=1.6.0 4 | numpy 5 | scikit-learn 6 | scipy 7 | nltk 8 | sentencepiece 9 | -------------------------------------------------------------------------------- /sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) 17 | 18 | 19 | def install_logger( 20 | given_logger, level = logging.WARNING, fmt="%(levelname)s:%(name)s:%(message)s" 21 | ): 22 | """ Configures the given logger; format, logging level, style, etc """ 23 | import coloredlogs 24 | 25 | def add_notice_log_level(): 26 | """ Creates a new 'notice' logging level """ 27 | # inspired 
by: 28 | # https://stackoverflow.com/questions/2183233/how-to-add-a-custom-loglevel-to-pythons-logging-facility 29 | NOTICE_LEVEL_NUM = 25 30 | logging.addLevelName(NOTICE_LEVEL_NUM, "NOTICE") 31 | 32 | def notice(self, message, *args, **kws): 33 | if self.isEnabledFor(NOTICE_LEVEL_NUM): 34 | self._log(NOTICE_LEVEL_NUM, message, args, **kws) 35 | 36 | logging.Logger.notice = notice 37 | 38 | # Add an extra logging level above INFO and below WARNING 39 | add_notice_log_level() 40 | 41 | # More style info at: 42 | # https://coloredlogs.readthedocs.io/en/latest/api.html 43 | field_styles = coloredlogs.DEFAULT_FIELD_STYLES.copy() 44 | field_styles["asctime"] = {} 45 | level_styles = coloredlogs.DEFAULT_LEVEL_STYLES.copy() 46 | level_styles["debug"] = {"color": "white", "faint": True} 47 | level_styles["notice"] = {"color": "cyan", "bold": True} 48 | 49 | coloredlogs.install( 50 | logger=given_logger, 51 | level=level, 52 | use_chroot=False, 53 | fmt=fmt, 54 | level_styles=level_styles, 55 | field_styles=field_styles, 56 | ) 57 | -------------------------------------------------------------------------------- /sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.0" 2 | __DOWNLOAD_SERVER__ = 'http://sbert.net/models/' 3 | from .datasets import SentencesDataset, ParallelSentencesDataset 4 | from .LoggingHandler import LoggingHandler 5 | from .SentenceTransformer import SentenceTransformer 6 | from .readers import InputExample 7 | from .cross_encoder.CrossEncoder import CrossEncoder 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/LoggingHandler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/LoggingHandler.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/LoggingHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/LoggingHandler.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/SentenceTransformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/SentenceTransformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/SentenceTransformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/SentenceTransformer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/__init__.cpython-36.pyc 
-------------------------------------------------------------------------------- /sentence_transformers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .CrossEncoder import CrossEncoder -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CEBinaryAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | from typing import List 5 | from ... import InputExample 6 | import numpy as np 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CEBinaryAccuracyEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. 
14 | 15 | It is designed for CrossEncoders with a single output. It measures the 16 | accuracy of the predicted class vs. the gold labels. It uses a fixed threshold to determine the label (0 vs 1). 17 | 18 | See CEBinaryClassificationEvaluator for an evaluator that automatically determines the optimal threshold. 19 | """ 20 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str='', threshold: float = 0.5, write_csv: bool = True): 21 | self.sentence_pairs = sentence_pairs 22 | self.labels = labels 23 | self.name = name 24 | self.threshold = threshold 25 | 26 | self.csv_file = "CEBinaryAccuracyEvaluator" + ("_" + name if name else '') + "_results.csv" 27 | self.csv_headers = ["epoch", "steps", "Accuracy"] 28 | self.write_csv = write_csv 29 | 30 | @classmethod 31 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 32 | sentence_pairs = [] 33 | labels = [] 34 | 35 | for example in examples: 36 | sentence_pairs.append(example.texts) 37 | labels.append(example.label) 38 | return cls(sentence_pairs, labels, **kwargs) 39 | 40 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | logger.info("CEBinaryAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 50 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 51 | pred_labels = pred_scores > self.threshold 52 | 53 | assert len(pred_labels) == len(self.labels) 54 | 55 | acc = np.sum(pred_labels == self.labels) / len(self.labels) 56 | 57 | logger.info("Accuracy: {:.2f}".format(acc*100)) 58 | 59 | if output_path is not None and self.write_csv: 60 | csv_path = os.path.join(output_path, self.csv_file) 61 | output_file_exists = os.path.isfile(csv_path) 62 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 63 | writer = csv.writer(f) 64 | if not output_file_exists: 65 | writer.writerow(self.csv_headers) 66 | 67 | writer.writerow([epoch, steps, acc]) 68 | 69 | return acc 70 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from sklearn.metrics import average_precision_score 3 | from typing import List 4 | import numpy as np 5 | import os 6 | import csv 7 | 8 | from ... import InputExample 9 | from ...evaluation import BinaryClassificationEvaluator 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class CEBinaryClassificationEvaluator: 15 | """ 16 | This evaluator can be used with the CrossEncoder class. 
Given sentence pairs and binary labels (0 and 1), 17 | it compute the average precision and the best possible f1 score 18 | """ 19 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str='', write_csv: bool = True): 20 | assert len(sentence_pairs) == len(labels) 21 | for label in labels: 22 | assert (label == 0 or label == 1) 23 | 24 | self.sentence_pairs = sentence_pairs 25 | self.labels = np.asarray(labels) 26 | self.name = name 27 | 28 | self.csv_file = "CEBinaryClassificationEvaluator" + ("_" + name if name else '') + "_results.csv" 29 | self.csv_headers = ["epoch", "steps", "Accuracy", "Accuracy_Threshold", "F1", "F1_Threshold", "Precision", "Recall", "Average_Precision"] 30 | self.write_csv = write_csv 31 | 32 | @classmethod 33 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 34 | sentence_pairs = [] 35 | labels = [] 36 | 37 | for example in examples: 38 | sentence_pairs.append(example.texts) 39 | labels.append(example.label) 40 | return cls(sentence_pairs, labels, **kwargs) 41 | 42 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logger.info("CEBinaryClassificationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 52 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 53 | 54 | acc, acc_threshold = BinaryClassificationEvaluator.find_best_acc_and_threshold(pred_scores, self.labels, True) 55 | f1, precision, recall, f1_threshold = BinaryClassificationEvaluator.find_best_f1_and_threshold(pred_scores, self.labels, True) 56 | ap = average_precision_score(self.labels, pred_scores) 57 | 58 | logger.info("Accuracy: {:.2f}\t(Threshold: {:.4f})".format(acc * 100, acc_threshold)) 59 | logger.info("F1: {:.2f}\t(Threshold: {:.4f})".format(f1 * 100, f1_threshold)) 60 | logger.info("Precision: {:.2f}".format(precision * 100)) 61 | logger.info("Recall: {:.2f}".format(recall * 100)) 62 | logger.info("Average Precision: {:.2f}\n".format(ap * 100)) 63 | 64 | if output_path is not None and self.write_csv: 65 | csv_path = os.path.join(output_path, self.csv_file) 66 | output_file_exists = os.path.isfile(csv_path) 67 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 68 | writer = csv.writer(f) 69 | if not output_file_exists: 70 | writer.writerow(self.csv_headers) 71 | 72 | writer.writerow([epoch, steps, acc, acc_threshold, f1, f1_threshold, precision, recall, ap]) 73 | 74 | 75 | return ap 76 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from scipy.stats import pearsonr, spearmanr 3 | from typing import List 4 | import os 5 | import csv 6 | from ... import InputExample 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CECorrelationEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. Given sentence pairs and continuous scores, 14 | it compute the pearson & spearman correlation between the predicted score for the sentence pair 15 | and the gold score. 
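Example (a minimal, hypothetical sketch; the base model name and the sentence pairs below are illustrative assumptions, not data from this repository)::

    from sentence_transformers import InputExample
    from sentence_transformers.cross_encoder import CrossEncoder
    from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

    dev_examples = [
        InputExample(texts=["A man is eating food.", "A man eats something."], label=0.9),
        InputExample(texts=["A man is eating food.", "A plane is landing."], label=0.1),
    ]
    evaluator = CECorrelationEvaluator.from_input_examples(dev_examples, name="sts-dev")
    model = CrossEncoder("bert-base-uncased", num_labels=1)  # assumed base checkpoint
    spearman = evaluator(model)  # returns the Spearman correlation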
16 | """ 17 | def __init__(self, sentence_pairs: List[List[str]], scores: List[float], name: str='', write_csv: bool = True): 18 | self.sentence_pairs = sentence_pairs 19 | self.scores = scores 20 | self.name = name 21 | 22 | self.csv_file = "CECorrelationEvaluator" + ("_" + name if name else '') + "_results.csv" 23 | self.csv_headers = ["epoch", "steps", "Pearson_Correlation", "Spearman_Correlation"] 24 | self.write_csv = write_csv 25 | 26 | @classmethod 27 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 28 | sentence_pairs = [] 29 | scores = [] 30 | 31 | for example in examples: 32 | sentence_pairs.append(example.texts) 33 | scores.append(example.label) 34 | return cls(sentence_pairs, scores, **kwargs) 35 | 36 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 37 | if epoch != -1: 38 | if steps == -1: 39 | out_txt = " after epoch {}:".format(epoch) 40 | else: 41 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 42 | else: 43 | out_txt = ":" 44 | 45 | logger.info("CECorrelationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 46 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 47 | 48 | 49 | eval_pearson, _ = pearsonr(self.scores, pred_scores) 50 | eval_spearman, _ = spearmanr(self.scores, pred_scores) 51 | 52 | logger.info("Correlation:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson, eval_spearman)) 53 | 54 | if output_path is not None and self.write_csv: 55 | csv_path = os.path.join(output_path, self.csv_file) 56 | output_file_exists = os.path.isfile(csv_path) 57 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 58 | writer = csv.writer(f) 59 | if not output_file_exists: 60 | writer.writerow(self.csv_headers) 61 | 62 | writer.writerow([epoch, steps, eval_pearson, eval_spearman]) 63 | 64 | return eval_spearman 65 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CERerankingEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import os 4 | import csv 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | class CERerankingEvaluator: 9 | """ 10 | This class evaluates a CrossEncoder model for the task of re-ranking. 11 | 12 | Given a query and a list of documents, it computes the score [query, doc_i] for all possible 13 | documents and sorts them in decreasing order. Then, MRR@10 is compute to measure the quality of the ranking. 14 | 15 | :param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query, 16 | positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents. 
17 | """ 18 | def __init__(self, samples, mrr_at_k: int = 10, name: str = '', write_csv: bool = True): 19 | self.samples = samples 20 | self.name = name 21 | self.mrr_at_k = mrr_at_k 22 | 23 | if isinstance(self.samples, dict): 24 | self.samples = list(self.samples.values()) 25 | 26 | self.csv_file = "CERerankingEvaluator" + ("_" + name if name else '') + "_results.csv" 27 | self.csv_headers = ["epoch", "steps", "MRR@{}".format(mrr_at_k)] 28 | self.write_csv = write_csv 29 | 30 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 31 | if epoch != -1: 32 | if steps == -1: 33 | out_txt = " after epoch {}:".format(epoch) 34 | else: 35 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 36 | else: 37 | out_txt = ":" 38 | 39 | logger.info("CERerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 40 | 41 | all_mrr_scores = [] 42 | num_queries = 0 43 | num_positives = [] 44 | num_negatives = [] 45 | for instance in self.samples: 46 | query = instance['query'] 47 | positive = list(instance['positive']) 48 | negative = list(instance['negative']) 49 | docs = positive + negative 50 | is_relevant = [True]*len(positive) + [False]*len(negative) 51 | 52 | if len(positive) == 0 or len(negative) == 0: 53 | continue 54 | 55 | num_queries += 1 56 | num_positives.append(len(positive)) 57 | num_negatives.append(len(negative)) 58 | 59 | model_input = [[query, doc] for doc in docs] 60 | pred_scores = model.predict(model_input, convert_to_numpy=True, show_progress_bar=False) 61 | pred_scores_argsort = np.argsort(-pred_scores) #Sort in decreasing order 62 | 63 | mrr_score = 0 64 | for rank, index in enumerate(pred_scores_argsort[0:self.mrr_at_k]): 65 | if is_relevant[index]: 66 | mrr_score = 1 / (rank+1) 67 | break 68 | 69 | all_mrr_scores.append(mrr_score) 70 | 71 | mean_mrr = np.mean(all_mrr_scores) 72 | logger.info("Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(num_queries, np.min(num_positives), np.mean(num_positives), np.max(num_positives), np.min(num_negatives), np.mean(num_negatives), np.max(num_negatives))) 73 | logger.info("MRR@{}: {:.2f}".format(self.mrr_at_k, mean_mrr*100)) 74 | 75 | if output_path is not None and self.write_csv: 76 | csv_path = os.path.join(output_path, self.csv_file) 77 | output_file_exists = os.path.isfile(csv_path) 78 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 79 | writer = csv.writer(f) 80 | if not output_file_exists: 81 | writer.writerow(self.csv_headers) 82 | 83 | writer.writerow([epoch, steps, mean_mrr]) 84 | 85 | return mean_mrr -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | from typing import List 5 | from ... import InputExample 6 | import numpy as np 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CESoftmaxAccuracyEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. 14 | 15 | It is designed for CrossEncoders with 2 or more outputs. It measure the 16 | accuracy of the predict class vs. the gold labels. 
17 | """ 18 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str='', write_csv: bool = True): 19 | self.sentence_pairs = sentence_pairs 20 | self.labels = labels 21 | self.name = name 22 | 23 | self.csv_file = "CESoftmaxAccuracyEvaluator" + ("_" + name if name else '') + "_results.csv" 24 | self.csv_headers = ["epoch", "steps", "Accuracy"] 25 | self.write_csv = write_csv 26 | 27 | @classmethod 28 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 29 | sentence_pairs = [] 30 | labels = [] 31 | 32 | for example in examples: 33 | sentence_pairs.append(example.texts) 34 | labels.append(example.label) 35 | return cls(sentence_pairs, labels, **kwargs) 36 | 37 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 38 | if epoch != -1: 39 | if steps == -1: 40 | out_txt = " after epoch {}:".format(epoch) 41 | else: 42 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 43 | else: 44 | out_txt = ":" 45 | 46 | logger.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 47 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 48 | pred_labels = np.argmax(pred_scores, axis=1) 49 | 50 | assert len(pred_labels) == len(self.labels) 51 | 52 | acc = np.sum(pred_labels == self.labels) / len(self.labels) 53 | 54 | logger.info("Accuracy: {:.2f}".format(acc*100)) 55 | 56 | if output_path is not None and self.write_csv: 57 | csv_path = os.path.join(output_path, self.csv_file) 58 | output_file_exists = os.path.isfile(csv_path) 59 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 60 | writer = csv.writer(f) 61 | if not output_file_exists: 62 | writer.writerow(self.csv_headers) 63 | 64 | writer.writerow([epoch, steps, acc]) 65 | 66 | return acc 67 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .CEBinaryAccuracyEvaluator import CEBinaryAccuracyEvaluator 2 | from .CEBinaryClassificationEvaluator import CEBinaryClassificationEvaluator 3 | from .CECorrelationEvaluator import CECorrelationEvaluator 4 | from .CESoftmaxAccuracyEvaluator import CESoftmaxAccuracyEvaluator 5 | from .CERerankingEvaluator import CERerankingEvaluator 6 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/DenoisingAutoEncoderDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | from ..readers.InputExample import InputExample 4 | import numpy as np 5 | import nltk 6 | from nltk.tokenize.treebank import TreebankWordDetokenizer 7 | 8 | class DenoisingAutoEncoderDataset(Dataset): 9 | """ 10 | The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence] 11 | It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the 12 | sentence without noise. 13 | 14 | :param sentences: A list of sentences 15 | :param noise_fn: A noise function: Given a string, it returns a string with noise, e.g. 
deleted words 16 | """ 17 | def __init__(self, sentences: List[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)): 18 | self.sentences = sentences 19 | self.noise_fn = noise_fn 20 | 21 | 22 | def __getitem__(self, item): 23 | sent = self.sentences[item] 24 | return InputExample(texts=[self.noise_fn(sent), sent]) 25 | 26 | 27 | def __len__(self): 28 | return len(self.sentences) 29 | 30 | # Deletion noise. 31 | @staticmethod 32 | def delete(text, del_ratio=0.6): 33 | words = nltk.word_tokenize(text) 34 | n = len(words) 35 | if n == 0: 36 | return text 37 | 38 | keep_or_not = np.random.rand(n) > del_ratio 39 | if sum(keep_or_not) == 0: 40 | keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains 41 | words_processed = TreebankWordDetokenizer().detokenize(np.array(words)[keep_or_not]) 42 | return words_processed -------------------------------------------------------------------------------- /sentence_transformers/datasets/NoDuplicatesDataLoader.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | class NoDuplicatesDataLoader: 5 | 6 | def __init__(self, train_examples, batch_size): 7 | """ 8 | A special data loader to be used with MultipleNegativesRankingLoss. 9 | The data loader ensures that there are no duplicate sentences within the same batch 10 | """ 11 | self.batch_size = batch_size 12 | self.data_pointer = 0 13 | self.collate_fn = None 14 | self.train_examples = train_examples 15 | random.shuffle(self.train_examples) 16 | 17 | def __iter__(self): 18 | for _ in range(self.__len__()): 19 | batch = [] 20 | texts_in_batch = set() 21 | 22 | while len(batch) < self.batch_size: 23 | example = self.train_examples[self.data_pointer] 24 | 25 | valid_example = True 26 | for text in example.texts: 27 | if text.strip().lower() in texts_in_batch: 28 | valid_example = False 29 | break 30 | 31 | if valid_example: 32 | batch.append(example) 33 | for text in example.texts: 34 | texts_in_batch.add(text.strip().lower()) 35 | 36 | self.data_pointer += 1 37 | if self.data_pointer >= len(self.train_examples): 38 | self.data_pointer = 0 39 | random.shuffle(self.train_examples) 40 | 41 | yield self.collate_fn(batch) if self.collate_fn is not None else batch 42 | 43 | def __len__(self): 44 | return math.floor(len(self.train_examples) / self.batch_size) -------------------------------------------------------------------------------- /sentence_transformers/datasets/SentenceLabelDataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | from torch.utils.data import IterableDataset 5 | import numpy as np 6 | from typing import List 7 | from ..readers import InputExample 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class SentenceLabelDataset(IterableDataset): 13 | """ 14 | This dataset can be used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS which requires 15 | multiple examples with the same label in a batch. 16 | 17 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 18 | 19 | Labels with fewer than n unique samples are ignored. 20 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 21 | 22 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 23 | by the samples drawn per label. 
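Example (a minimal, hypothetical sketch; the sentences, labels and loss are illustrative, and `model` is assumed to be an existing SentenceTransformer)::

    from torch.utils.data import DataLoader
    from sentence_transformers import InputExample, losses

    examples = [InputExample(texts=['sentence {}'.format(i)], label=i % 4) for i in range(40)]
    dataset = SentenceLabelDataset(examples, samples_per_label=2)
    # batch_size should be a multiple of samples_per_label; model.fit() sets the collate_fn
    loader = DataLoader(dataset, batch_size=8, drop_last=True)
    loss = losses.BatchHardTripletLoss(model=model)
    model.fit(train_objectives=[(loader, loss)], epochs=1)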
24 | """ 25 | def __init__(self, examples: List[InputExample], samples_per_label: int = 2, with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 28 | 29 | :param examples: 30 | a list with InputExamples 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label. Batch size should be a multiple of samples_per_label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__() 39 | 40 | self.samples_per_label = samples_per_label 41 | 42 | #Group examples by label 43 | label2ex = {} 44 | for example in examples: 45 | if example.label not in label2ex: 46 | label2ex[example.label] = [] 47 | label2ex[example.label].append(example) 48 | 49 | #Include only labels with at least 2 examples 50 | self.grouped_inputs = [] 51 | self.groups_right_border = [] 52 | num_labels = 0 53 | 54 | for label, label_examples in label2ex.items(): 55 | if len(label_examples) >= self.samples_per_label: 56 | self.grouped_inputs.extend(label_examples) 57 | self.groups_right_border.append(len(self.grouped_inputs)) # At which position does this label group / bucket end? 58 | num_labels += 1 59 | 60 | self.label_range = np.arange(num_labels) 61 | self.with_replacement = with_replacement 62 | np.random.shuffle(self.label_range) 63 | 64 | logger.info("SentenceLabelDataset: {} examples, from which {} examples could be used (those labels appeared at least {} times). {} different labels found.".format(len(examples), len(self.grouped_inputs), self.samples_per_label, num_labels )) 65 | 66 | def __iter__(self): 67 | label_idx = 0 68 | count = 0 69 | already_seen = {} 70 | while count < len(self.grouped_inputs): 71 | label = self.label_range[label_idx] 72 | if label not in already_seen: 73 | already_seen[label] = set() 74 | 75 | left_border = 0 if label == 0 else self.groups_right_border[label-1] 76 | right_border = self.groups_right_border[label] 77 | 78 | if self.with_replacement: 79 | selection = np.arange(left_border, right_border) 80 | else: 81 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 82 | 83 | if len(selection) >= self.samples_per_label: 84 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 85 | count += 1 86 | already_seen[label].add(element_idx) 87 | yield self.grouped_inputs[element_idx] 88 | 89 | label_idx += 1 90 | if label_idx >= len(self.label_range): 91 | label_idx = 0 92 | already_seen = {} 93 | np.random.shuffle(self.label_range) 94 | 95 | def __len__(self): 96 | return len(self.grouped_inputs) -------------------------------------------------------------------------------- /sentence_transformers/datasets/SentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | import torch 4 | from .. import SentenceTransformer 5 | from ..readers.InputExample import InputExample 6 | 7 | class SentencesDataset(Dataset): 8 | """ 9 | DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset 10 | and then passing it to the DataLoader, you can pass the list of InputExamples directly to the dataset loader. 
11 | """ 12 | def __init__(self, 13 | examples: List[InputExample], 14 | model: SentenceTransformer 15 | ): 16 | self.examples = examples 17 | 18 | 19 | def __getitem__(self, item): 20 | return self.examples[item] 21 | 22 | 23 | def __len__(self): 24 | return len(self.examples) 25 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .DenoisingAutoEncoderDataset import DenoisingAutoEncoderDataset 2 | from .NoDuplicatesDataLoader import NoDuplicatesDataLoader 3 | from .ParallelSentencesDataset import ParallelSentencesDataset 4 | from .SentencesDataset import SentencesDataset 5 | from .SentenceLabelDataset import SentenceLabelDataset 6 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from ..util import batch_to_device 6 | import os 7 | import csv 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class LabelAccuracyEvaluator(SentenceEvaluator): 13 | """ 14 | Evaluate a model based on its accuracy on a labeled dataset 15 | 16 | This requires a model with LossFunction.SOFTMAX 17 | 18 | The results are written in a CSV. If a CSV already exists, then values are appended. 
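Example (a minimal, hypothetical sketch; `model`, `train_loss` and `dev_examples` are assumed to already exist: a SentenceTransformer, a losses.SoftmaxLoss instance and a list of labeled InputExamples, respectively)::

    from torch.utils.data import DataLoader

    dev_dataloader = DataLoader(dev_examples, shuffle=False, batch_size=16)
    evaluator = LabelAccuracyEvaluator(dev_dataloader, name='nli-dev', softmax_model=train_loss)
    accuracy = evaluator(model)  # the collate_fn is set inside the call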
19 | """ 20 | 21 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None, write_csv: bool = True): 22 | """ 23 | Constructs an evaluator for the given dataset 24 | 25 | :param dataloader: 26 | the data for the evaluation 27 | """ 28 | self.dataloader = dataloader 29 | self.name = name 30 | self.softmax_model = softmax_model 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.write_csv = write_csv 36 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 37 | self.csv_headers = ["epoch", "steps", "accuracy"] 38 | 39 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 40 | model.eval() 41 | total = 0 42 | correct = 0 43 | 44 | if epoch != -1: 45 | if steps == -1: 46 | out_txt = " after epoch {}:".format(epoch) 47 | else: 48 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 49 | else: 50 | out_txt = ":" 51 | 52 | logger.info("Evaluation on the "+self.name+" dataset"+out_txt) 53 | self.dataloader.collate_fn = model.smart_batching_collate 54 | for step, batch in enumerate(self.dataloader): 55 | features, label_ids = batch 56 | for idx in range(len(features)): 57 | features[idx] = batch_to_device(features[idx], model.device) 58 | label_ids = label_ids.to(model.device) 59 | with torch.no_grad(): 60 | _, prediction = self.softmax_model(features, labels=None) 61 | 62 | total += prediction.size(0) 63 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 64 | accuracy = correct/total 65 | 66 | logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 67 | 68 | if output_path is not None and self.write_csv: 69 | csv_path = os.path.join(output_path, self.csv_file) 70 | if not os.path.isfile(csv_path): 71 | with open(csv_path, mode="w", encoding="utf-8") as f: 72 | writer = csv.writer(f) 73 | writer.writerow(self.csv_headers) 74 | writer.writerow([epoch, steps, accuracy]) 75 | else: 76 | with open(csv_path, mode="a", encoding="utf-8") as f: 77 | writer = csv.writer(f) 78 | writer.writerow([epoch, steps, accuracy]) 79 | 80 | return accuracy 81 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | import numpy as np 3 | import logging 4 | import os 5 | import csv 6 | from typing import List 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class MSEEvaluator(SentenceEvaluator): 12 | """ 13 | Computes the mean squared error (x100) between the computed sentence embedding 14 | and some target sentence embedding. 15 | 16 | The MSE is computed between ||teacher.encode(source_sentences) - student.encode(target_sentences)||. 17 | 18 | For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English 19 | and target_sentences are in a different language like German, Chinese, Spanish... 20 | 21 | :param source_sentences: Source sentences are embedded with the teacher model 22 | :param target_sentences: Target sentences are ambedding with the student model. 
23 | :param show_progress_bar: Show progress bar when computing embeddings 24 | :param batch_size: Batch size to compute sentence embeddings 25 | :param name: Name of the evaluator 26 | :param write_csv: Write results to CSV file 27 | """ 28 | def __init__(self, source_sentences: List[str], target_sentences: List[str], teacher_model = None, show_progress_bar: bool = False, batch_size: int = 32, name: str = '', write_csv: bool = True): 29 | self.source_embeddings = teacher_model.encode(source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True) 30 | 31 | self.target_sentences = target_sentences 32 | self.show_progress_bar = show_progress_bar 33 | self.batch_size = batch_size 34 | self.name = name 35 | 36 | self.csv_file = "mse_evaluation_" + name + "_results.csv" 37 | self.csv_headers = ["epoch", "steps", "MSE"] 38 | self.write_csv = write_csv 39 | 40 | def __call__(self, model, output_path, epoch = -1, steps = -1): 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | target_embeddings = model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=True) 50 | 51 | mse = ((self.source_embeddings - target_embeddings)**2).mean() 52 | mse *= 100 53 | 54 | logger.info("MSE evaluation (lower = better) on "+self.name+" dataset"+out_txt) 55 | logger.info("MSE (*100):\t{:4f}".format(mse)) 56 | 57 | if output_path is not None and self.write_csv: 58 | csv_path = os.path.join(output_path, self.csv_file) 59 | output_file_exists = os.path.isfile(csv_path) 60 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 61 | writer = csv.writer(f) 62 | if not output_file_exists: 63 | writer.writerow(self.csv_headers) 64 | 65 | writer.writerow([epoch, steps, mse]) 66 | 67 | return -mse #Return negative score as SentenceTransformers maximizes the performance 68 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | from sentence_transformers.util import batch_to_device 3 | from sentence_transformers import SentenceTransformer 4 | from typing import List, Tuple, Dict 5 | import torch 6 | import numpy as np 7 | import logging 8 | import os 9 | import csv 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class MSEEvaluatorFromDataFrame(SentenceEvaluator): 16 | """ 17 | Computes the mean squared error (x100) between the computed sentence embedding 18 | and some target sentence embedding. 19 | :param dataframe: 20 | It must have the following format. Rows contains different, parallel sentences. Columns are the respective language codes 21 | [{'en': 'My sentence', 'es': 'Sentence in Spanisch', 'fr': 'Sentence in French'...}, 22 | {'en': 'My second sentence', ....] 23 | :param combinations: 24 | Must be of the format [('en', 'es'), ('en', 'fr'), ...] 25 | First entry in a tuple is the source language. The sentence in the respective language will be fetched from the dataframe and passed to the teacher model. 26 | Second entry in a tuple the the target language. 
Sentence will be fetched from the dataframe and passed to the student model 27 | """ 28 | def __init__(self, dataframe: List[Dict[str, str]], teacher_model: SentenceTransformer, combinations: List[Tuple[str, str]], batch_size: int = 8, name='', write_csv: bool = True): 29 | 30 | self.combinations = combinations 31 | self.name = name 32 | self.batch_size = batch_size 33 | 34 | 35 | if name: 36 | name = "_"+name 37 | 38 | self.csv_file = "mse_evaluation" + name + "_results.csv" 39 | self.csv_headers = ["epoch", "steps"] 40 | self.write_csv = write_csv 41 | self.data = {} 42 | 43 | logger.info("Compute teacher embeddings") 44 | all_source_sentences = set() 45 | for src_lang, trg_lang in self.combinations: 46 | src_sentences = [] 47 | trg_sentences = [] 48 | 49 | for row in dataframe: 50 | if row[src_lang].strip() != "" and row[trg_lang].strip() != "": 51 | all_source_sentences.add(row[src_lang]) 52 | src_sentences.append(row[src_lang]) 53 | trg_sentences.append(row[trg_lang]) 54 | 55 | self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences) 56 | self.csv_headers.append("{}-{}".format(src_lang, trg_lang)) 57 | 58 | all_source_sentences = list(all_source_sentences) 59 | all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size) 60 | self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)} 61 | 62 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1): 63 | model.eval() 64 | 65 | mse_scores = [] 66 | for src_lang, trg_lang in self.combinations: 67 | src_sentences, trg_sentences = self.data[(src_lang, trg_lang)] 68 | 69 | src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences]) 70 | trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size)) 71 | 72 | mse = ((src_embeddings - trg_embeddings) ** 2).mean() 73 | mse *= 100 74 | mse_scores.append(mse) 75 | 76 | logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang)) 77 | logger.info("MSE (*100):\t{:4f}".format(mse)) 78 | 79 | if output_path is not None and self.write_csv: 80 | csv_path = os.path.join(output_path, self.csv_file) 81 | output_file_exists = os.path.isfile(csv_path) 82 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 83 | writer = csv.writer(f) 84 | if not output_file_exists: 85 | writer.writerow(self.csv_headers) 86 | 87 | writer.writerow([epoch, steps]+mse_scores) 88 | 89 | return -np.mean(mse_scores) #Return negative score as SentenceTransformers maximizes the performance 90 | 91 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/RerankingEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | import numpy as np 4 | import os 5 | import csv 6 | from ..util import cos_sim, dot_score 7 | import torch 8 | from sklearn.metrics import average_precision_score 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class RerankingEvaluator(SentenceEvaluator): 13 | """ 14 | This class evaluates a SentenceTransformer model for the task of re-ranking. 15 | 16 | Given a query and a list of documents, it computes the score [query, doc_i] for all possible 17 | documents and sorts them in decreasing order. Then, MRR@10 and MAP is compute to measure the quality of the ranking. 
18 | 19 | :param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query, 20 | positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents. 21 | """ 22 | def __init__(self, samples, mrr_at_k: int = 10, name: str = '', write_csv: bool = True, similarity_fct=cos_sim): 23 | self.samples = samples 24 | self.name = name 25 | self.mrr_at_k = mrr_at_k 26 | self.similarity_fct = cos_sim 27 | 28 | if isinstance(self.samples, dict): 29 | self.samples = list(self.samples.values()) 30 | 31 | 32 | self.csv_file = "RerankingEvaluator" + ("_" + name if name else '') + "_results.csv" 33 | self.csv_headers = ["epoch", "steps", "MAP", "MRR@{}".format(mrr_at_k)] 34 | self.write_csv = write_csv 35 | 36 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 37 | if epoch != -1: 38 | if steps == -1: 39 | out_txt = " after epoch {}:".format(epoch) 40 | else: 41 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 42 | else: 43 | out_txt = ":" 44 | 45 | logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 46 | 47 | all_mrr_scores = [] 48 | all_ap_scores = [] 49 | 50 | num_queries = 0 51 | num_positives = [] 52 | num_negatives = [] 53 | for instance in self.samples: 54 | query = instance['query'] 55 | positive = list(instance['positive']) 56 | negative = list(instance['negative']) 57 | docs = positive + negative 58 | is_relevant = [True]*len(positive) + [False]*len(negative) 59 | 60 | if len(positive) == 0 or len(negative) == 0: 61 | continue 62 | 63 | num_queries += 1 64 | num_positives.append(len(positive)) 65 | num_negatives.append(len(negative)) 66 | 67 | query_emb = model.encode(query, convert_to_tensor=True, show_progress_bar=False) 68 | docs_emb = model.encode(docs, convert_to_tensor=True, show_progress_bar=False) 69 | 70 | pred_scores = self.similarity_fct(query_emb, docs_emb) 71 | if len(pred_scores.shape) > 1: 72 | pred_scores = pred_scores[0] 73 | 74 | pred_scores_argsort = torch.argsort(-pred_scores) #Sort in decreasing order 75 | 76 | #Compute MRR score 77 | mrr_score = 0 78 | for rank, index in enumerate(pred_scores_argsort[0:self.mrr_at_k]): 79 | if is_relevant[index]: 80 | mrr_score = 1 / (rank+1) 81 | break 82 | all_mrr_scores.append(mrr_score) 83 | 84 | # Compute AP 85 | all_ap_scores.append(average_precision_score(is_relevant, pred_scores.cpu().tolist())) 86 | 87 | mean_ap = np.mean(all_ap_scores) 88 | mean_mrr = np.mean(all_mrr_scores) 89 | 90 | logger.info("Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(num_queries, np.min(num_positives), np.mean(num_positives), np.max(num_positives), np.min(num_negatives), np.mean(num_negatives), np.max(num_negatives))) 91 | logger.info("MAP: {:.2f}".format(mean_ap * 100)) 92 | logger.info("MRR@{}: {:.2f}".format(self.mrr_at_k, mean_mrr*100)) 93 | 94 | if output_path is not None and self.write_csv: 95 | csv_path = os.path.join(output_path, self.csv_file) 96 | output_file_exists = os.path.isfile(csv_path) 97 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 98 | writer = csv.writer(f) 99 | if not output_file_exists: 100 | writer.writerow(self.csv_headers) 101 | 102 | writer.writerow([epoch, steps, mean_ap, mean_mrr]) 103 | 104 | return mean_ap -------------------------------------------------------------------------------- 
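A minimal usage sketch for the RerankingEvaluator above; the toy samples and the model checkpoint name are illustrative placeholders, not files or data shipped in this repository:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import RerankingEvaluator

# Each sample holds a query plus lists of relevant and irrelevant documents,
# matching the format documented in the class docstring.
samples = [
    {
        "query": "capital of France",
        "positive": ["Paris is the capital of France."],
        "negative": ["Berlin is the capital of Germany.", "The Rhine is a river in Europe."],
    },
    {
        "query": "largest planet in the solar system",
        "positive": ["Jupiter is the largest planet in the solar system."],
        "negative": ["Mars is often called the red planet."],
    },
]

model = SentenceTransformer("distilbert-base-nli-mean-tokens")
evaluator = RerankingEvaluator(samples, mrr_at_k=10, name="toy-rerank")

# Returns mean AP; MAP and MRR@10 are also logged and appended to the CSV in output_path.
mean_ap = evaluator(model, output_path=".")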
/sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/TranslationEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | from ..util import pytorch_cos_sim 4 | import os 5 | import csv 6 | import numpy as np 7 | import scipy.spatial 8 | from typing import List 9 | import torch 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class TranslationEvaluator(SentenceEvaluator): 15 | """ 16 | Given two sets of sentences in different languages, e.g. (en_1, en_2, en_3...) and (fr_1, fr_2, fr_3, ...), 17 | and assuming that fr_i is the translation of en_i. 18 | Checks if vec(en_i) has the highest similarity to vec(fr_i). 
Computes the accurarcy in both directions 19 | """ 20 | def __init__(self, source_sentences: List[str], target_sentences: List[str], show_progress_bar: bool = False, batch_size: int = 16, name: str = '', print_wrong_matches: bool = False, write_csv: bool = True): 21 | """ 22 | Constructs an evaluator based for the dataset 23 | 24 | The labels need to indicate the similarity between the sentences. 25 | 26 | :param source_sentences: 27 | List of sentences in source language 28 | :param target_sentences: 29 | List of sentences in target language 30 | :param print_wrong_matches: 31 | Prints incorrect matches 32 | :param write_csv: 33 | Write results to CSV file 34 | """ 35 | self.source_sentences = source_sentences 36 | self.target_sentences = target_sentences 37 | self.name = name 38 | self.batch_size = batch_size 39 | self.show_progress_bar = show_progress_bar 40 | self.print_wrong_matches = print_wrong_matches 41 | 42 | assert len(self.source_sentences) == len(self.target_sentences) 43 | 44 | if name: 45 | name = "_"+name 46 | 47 | self.csv_file = "translation_evaluation"+name+"_results.csv" 48 | self.csv_headers = ["epoch", "steps", "src2trg", "trg2src"] 49 | self.write_csv = write_csv 50 | 51 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 52 | if epoch != -1: 53 | if steps == -1: 54 | out_txt = " after epoch {}:".format(epoch) 55 | else: 56 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 57 | else: 58 | out_txt = ":" 59 | 60 | logger.info("Evaluating translation matching Accuracy on "+self.name+" dataset"+out_txt) 61 | 62 | embeddings1 = torch.stack(model.encode(self.source_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 63 | embeddings2 = torch.stack(model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 64 | 65 | 66 | cos_sims = pytorch_cos_sim(embeddings1, embeddings2).detach().cpu().numpy() 67 | 68 | correct_src2trg = 0 69 | correct_trg2src = 0 70 | 71 | for i in range(len(cos_sims)): 72 | max_idx = np.argmax(cos_sims[i]) 73 | 74 | if i == max_idx: 75 | correct_src2trg += 1 76 | elif self.print_wrong_matches: 77 | print("i:", i, "j:", max_idx, "INCORRECT" if i != max_idx else "CORRECT") 78 | print("Src:", self.source_sentences[i]) 79 | print("Trg:", self.target_sentences[max_idx]) 80 | print("Argmax score:", cos_sims[i][max_idx], "vs. 
correct score:", cos_sims[i][i]) 81 | 82 | results = zip(range(len(cos_sims[i])), cos_sims[i]) 83 | results = sorted(results, key=lambda x: x[1], reverse=True) 84 | for idx, score in results[0:5]: 85 | print("\t", idx, "(Score: %.4f)" % (score), self.target_sentences[idx]) 86 | 87 | 88 | 89 | cos_sims = cos_sims.T 90 | for i in range(len(cos_sims)): 91 | max_idx = np.argmax(cos_sims[i]) 92 | if i == max_idx: 93 | correct_trg2src += 1 94 | 95 | acc_src2trg = correct_src2trg / len(cos_sims) 96 | acc_trg2src = correct_trg2src / len(cos_sims) 97 | 98 | logger.info("Accuracy src2trg: {:.2f}".format(acc_src2trg*100)) 99 | logger.info("Accuracy trg2src: {:.2f}".format(acc_trg2src*100)) 100 | 101 | if output_path is not None and self.write_csv: 102 | csv_path = os.path.join(output_path, self.csv_file) 103 | output_file_exists = os.path.isfile(csv_path) 104 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 105 | writer = csv.writer(f) 106 | if not output_file_exists: 107 | writer.writerow(self.csv_headers) 108 | 109 | writer.writerow([epoch, steps, acc_src2trg, acc_trg2src]) 110 | 111 | return (acc_src2trg+acc_trg2src)/2 112 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | from .BinaryClassificationEvaluator import BinaryClassificationEvaluator 4 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 5 | from .InformationRetrievalEvaluator import InformationRetrievalEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .MSEEvaluator import MSEEvaluator 8 | from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame 9 | from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator 10 | from .SequentialEvaluator import SequentialEvaluator 11 | from .TranslationEvaluator import TranslationEvaluator 12 | from .TripletEvaluator import TripletEvaluator 13 | from .RerankingEvaluator import RerankingEvaluator -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc 
-------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- 
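Before moving on to the loss functions, a short sketch of how the evaluators above compose; the sentence pairs and checkpoint name are illustrative placeholders, and SequentialEvaluator simply runs each sub-evaluator in turn (returning the last score by default):

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import TranslationEvaluator, SequentialEvaluator

en_sentences = ["A cat sits on the mat.", "The weather is nice today."]
de_sentences = ["Eine Katze sitzt auf der Matte.", "Das Wetter ist heute schoen."]

# Accuracy of matching en_i <-> de_i in both directions (see TranslationEvaluator above).
translation_eval = TranslationEvaluator(en_sentences, de_sentences, name="en-de")

# SequentialEvaluator forwards to every sub-evaluator and reduces the scores with
# main_score_function; the default keeps the score of the last evaluator.
evaluator = SequentialEvaluator([translation_eval])

model = SentenceTransformer("distilbert-base-nli-mean-tokens")
score = evaluator(model, output_path=".")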
/sentence_transformers/losses/BYOLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from ..SentenceTransformer import SentenceTransformer 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | import logging 10 | import math 11 | from functools import wraps 12 | import copy 13 | import random 14 | 15 | 16 | class EMA(): 17 | def __init__(self, beta): 18 | super().__init__() 19 | self.beta = beta 20 | 21 | def update_average(self, old, new): 22 | if old is None: 23 | return new 24 | return old * self.beta + (1 - self.beta) * new 25 | 26 | def update_moving_average(ema_updater, ma_model, current_model): 27 | for current_params, ma_params in zip(current_model.parameters(), ma_model.parameters()): 28 | old_weight, up_weight = ma_params.data, current_params.data 29 | ma_params.data = ema_updater.update_average(old_weight, up_weight) 30 | 31 | # MLP for predictor 32 | class MLP(nn.Module): 33 | def __init__(self, dim, projection_size, hidden_size): 34 | super().__init__() 35 | self.net = nn.Sequential( 36 | nn.Linear(dim, hidden_size), 37 | nn.BatchNorm1d(hidden_size), 38 | nn.ReLU(), 39 | nn.Linear(hidden_size, hidden_size), 40 | nn.ReLU(), 41 | nn.Linear(hidden_size, projection_size) 42 | ) 43 | 44 | def forward(self, x): 45 | return self.net(x) 46 | 47 | 48 | # loss fn 49 | def loss_fn(x, y): 50 | x = F.normalize(x, dim=-1, p=2) 51 | y = F.normalize(y, dim=-1, p=2) 52 | return 2 - 2 * (x * y).sum(dim=-1) 53 | 54 | 55 | 56 | class BYOLoss(nn.Module): 57 | def __init__(self, 58 | model: SentenceTransformer, 59 | sentence_embedding_dimension: int, 60 | moving_average_decay: float): 61 | super(BYOLoss, self).__init__() 62 | self.online_encoder = model 63 | self.online_predictor_1 = MLP(sentence_embedding_dimension, sentence_embedding_dimension, 10 * sentence_embedding_dimension) 64 | self.online_predictor_2 = MLP(sentence_embedding_dimension, sentence_embedding_dimension, 10 * sentence_embedding_dimension) 65 | self.online_predictor_3 = MLP(sentence_embedding_dimension, sentence_embedding_dimension, 10 * sentence_embedding_dimension) 66 | self.target_encoder = copy.deepcopy(self.online_encoder) 67 | self.target_ema_updater = EMA(moving_average_decay) 68 | 69 | def update_moving_average(self): 70 | assert self.target_encoder is not None, 'target encoder has not been created yet' 71 | update_moving_average(self.target_ema_updater, self.target_encoder, self.online_encoder) 72 | 73 | 74 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 75 | 76 | target_sentence_features = copy.deepcopy(sentence_features) 77 | rep_one, rep_two = [self.online_encoder(sentence_feature) for sentence_feature in sentence_features] 78 | online_pred_one, online_pred_two = rep_one['sentence_embedding'], rep_two['sentence_embedding'] 79 | online_pred_one, online_pred_two = self.online_predictor_1(online_pred_one), self.online_predictor_1(online_pred_two) 80 | online_pred_one, online_pred_two = self.online_predictor_2(online_pred_one), self.online_predictor_2(online_pred_two) 81 | online_pred_one, online_pred_two = self.online_predictor_3(online_pred_one), self.online_predictor_3(online_pred_two) 82 | 83 | with torch.no_grad(): 84 | 85 | target_one, target_two = [self.target_encoder(sentence_feature) for sentence_feature in target_sentence_features] 86 | target_proj_one, target_proj_two = 
target_one['sentence_embedding'], target_two['sentence_embedding'] 87 | 88 | loss_one = loss_fn(online_pred_one, target_proj_two.detach()) 89 | loss_two = loss_fn(online_pred_two, target_proj_one.detach()) 90 | 91 | loss = loss_one + loss_two 92 | 93 | return loss.mean() 94 | 95 | -------------------------------------------------------------------------------- /sentence_transformers/losses/BatchAllTripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from .BatchHardTripletLoss import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class BatchAllTripletLoss(nn.Module): 9 | """ 10 | BatchAllTripletLoss takes a batch with (label, sentence) pairs and computes the loss for all possible, valid 11 | triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. The labels 12 | must be integers, with same label indicating sentences from the same class. You train dataset 13 | must contain at least 2 examples per label class. 14 | 15 | | Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py 16 | | Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737 17 | | Blog post: https://omoindrot.github.io/triplet-loss 18 | 19 | :param model: SentenceTransformer model 20 | :param distance_metric: Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used 21 | :param margin: Negative samples should be at least margin further apart from the anchor than the positive. 22 | 23 | Example:: 24 | 25 | from sentence_transformers import SentenceTransformer, SentencesDataset, losses 26 | from sentence_transformers.readers import InputExample 27 | 28 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 29 | train_examples = [InputExample(texts=['Sentence from class 0'], label=0), InputExample(texts=['Another sentence from class 0'], label=0), 30 | InputExample(texts=['Sentence from class 1'], label=1), InputExample(texts=['Sentence from class 2'], label=2)] 31 | train_dataset = SentencesDataset(train_examples, model) 32 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 33 | train_loss = losses.BatchAllTripletLoss(model=model) 34 | """ 35 | def __init__(self, model: SentenceTransformer, distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance, margin: float = 5): 36 | super(BatchAllTripletLoss, self).__init__() 37 | self.sentence_embedder = model 38 | self.triplet_margin = margin 39 | self.distance_metric = distance_metric 40 | 41 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 42 | rep = self.sentence_embedder(sentence_features[0])['sentence_embedding'] 43 | return self.batch_all_triplet_loss(labels, rep) 44 | 45 | 46 | 47 | def batch_all_triplet_loss(self, labels, embeddings): 48 | """Build the triplet loss over a batch of embeddings. 49 | We generate all the valid triplets and average the loss over the positive ones. 50 | Args: 51 | labels: labels of the batch, of size (batch_size,) 52 | embeddings: tensor of shape (batch_size, embed_dim) 53 | margin: margin for triplet loss 54 | squared: Boolean. 
If true, output is the pairwise squared euclidean distance matrix. 55 | If false, output is the pairwise euclidean distance matrix. 56 | Returns: 57 | Label_Sentence_Triplet: scalar tensor containing the triplet loss 58 | """ 59 | # Get the pairwise distance matrix 60 | pairwise_dist = self.distance_metric(embeddings) 61 | 62 | anchor_positive_dist = pairwise_dist.unsqueeze(2) 63 | anchor_negative_dist = pairwise_dist.unsqueeze(1) 64 | 65 | # Compute a 3D tensor of size (batch_size, batch_size, batch_size) 66 | # triplet_loss[i, j, k] will contain the triplet loss of anchor=i, positive=j, negative=k 67 | # Uses broadcasting where the 1st argument has shape (batch_size, batch_size, 1) 68 | # and the 2nd (batch_size, 1, batch_size) 69 | triplet_loss = anchor_positive_dist - anchor_negative_dist + self.triplet_margin 70 | 71 | # Put to zero the invalid triplets 72 | # (where label(a) != label(p) or label(n) == label(a) or a == p) 73 | mask = BatchHardTripletLoss.get_triplet_mask(labels) 74 | triplet_loss = mask.float() * triplet_loss 75 | 76 | # Remove negative losses (i.e. the easy triplets) 77 | triplet_loss[triplet_loss < 0] = 0 78 | 79 | # Count number of positive triplets (where triplet_loss > 0) 80 | valid_triplets = triplet_loss[triplet_loss > 1e-16] 81 | num_positive_triplets = valid_triplets.size(0) 82 | num_valid_triplets = mask.sum() 83 | 84 | fraction_positive_triplets = num_positive_triplets / (num_valid_triplets.float() + 1e-16) 85 | 86 | # Get final mean triplet loss over the positive valid triplets 87 | triplet_loss = triplet_loss.sum() / (num_positive_triplets + 1e-16) 88 | 89 | return triplet_loss 90 | 91 | -------------------------------------------------------------------------------- /sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from .BatchHardTripletLoss import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | class BatchHardSoftMarginTripletLoss(BatchHardTripletLoss): 8 | """ 9 | BatchHardSoftMarginTripletLoss takes a batch with (label, sentence) pairs and computes the loss for all possible, valid 10 | triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. The labels 11 | must be integers, with same label indicating sentences from the same class. You train dataset 12 | must contain at least 2 examples per label class. The margin is computed automatically. 13 | 14 | Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py 15 | Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737 16 | Blog post: https://omoindrot.github.io/triplet-loss 17 | 18 | :param model: SentenceTransformer model 19 | :param distance_metric: Function that returns a distance between two emeddings. 
The class SiameseDistanceMetric contains pre-defined metrices that can be used 20 | 21 | 22 | Example:: 23 | 24 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 25 | from sentence_transformers.readers import InputExample 26 | 27 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 28 | train_examples = [InputExample(texts=['Sentence from class 0'], label=0), InputExample(texts=['Another sentence from class 0'], label=0), 29 | InputExample(texts=['Sentence from class 1'], label=1), InputExample(texts=['Sentence from class 2'], label=2)] 30 | train_dataset = SentencesDataset(train_examples, model) 31 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 32 | train_loss = losses.BatchHardSoftMarginTripletLoss(model=model) 33 | """ 34 | def __init__(self, model: SentenceTransformer, distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance): 35 | super(BatchHardSoftMarginTripletLoss, self).__init__(model) 36 | self.sentence_embedder = model 37 | self.distance_metric = distance_metric 38 | 39 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 40 | rep = self.sentence_embedder(sentence_features[0])['sentence_embedding'] 41 | return self.batch_hard_triplet_soft_margin_loss(labels, rep) 42 | 43 | 44 | # Hard Triplet Loss with Soft Margin 45 | # Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737 46 | def batch_hard_triplet_soft_margin_loss(self, labels: Tensor, embeddings: Tensor) -> Tensor: 47 | """Build the triplet loss over a batch of embeddings. 48 | For each anchor, we get the hardest positive and hardest negative to form a triplet. 49 | Args: 50 | labels: labels of the batch, of size (batch_size,) 51 | embeddings: tensor of shape (batch_size, embed_dim) 52 | squared: Boolean. If true, output is the pairwise squared euclidean distance matrix. 53 | If false, output is the pairwise euclidean distance matrix. 
54 | Returns: 55 | Label_Sentence_Triplet: scalar tensor containing the triplet loss 56 | """ 57 | # Get the pairwise distance matrix 58 | pairwise_dist = self.distance_metric(embeddings) 59 | 60 | 61 | # For each anchor, get the hardest positive 62 | # First, we need to get a mask for every valid positive (they should have same label) 63 | mask_anchor_positive = BatchHardTripletLoss.get_anchor_positive_triplet_mask(labels).float() 64 | 65 | # We put to 0 any element where (a, p) is not valid (valid if a != p and label(a) == label(p)) 66 | anchor_positive_dist = mask_anchor_positive * pairwise_dist 67 | 68 | # shape (batch_size, 1) 69 | hardest_positive_dist, _ = anchor_positive_dist.max(1, keepdim=True) 70 | 71 | # For each anchor, get the hardest negative 72 | # First, we need to get a mask for every valid negative (they should have different labels) 73 | mask_anchor_negative = BatchHardTripletLoss.get_anchor_negative_triplet_mask(labels).float() 74 | 75 | # We add the maximum value in each row to the invalid negatives (label(a) == label(n)) 76 | max_anchor_negative_dist, _ = pairwise_dist.max(1, keepdim=True) 77 | anchor_negative_dist = pairwise_dist + max_anchor_negative_dist * (1.0 - mask_anchor_negative) 78 | 79 | # shape (batch_size,) 80 | hardest_negative_dist, _ = anchor_negative_dist.min(1, keepdim=True) 81 | 82 | # Combine biggest d(a, p) and smallest d(a, n) into final triplet loss with soft margin 83 | #tl = hardest_positive_dist - hardest_negative_dist + margin 84 | #tl[tl < 0] = 0 85 | tl = torch.log1p(torch.exp(hardest_positive_dist - hardest_negative_dist)) 86 | triplet_loss = tl.mean() 87 | 88 | return triplet_loss 89 | -------------------------------------------------------------------------------- /sentence_transformers/losses/ContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Iterable, Dict 3 | import torch.nn.functional as F 4 | from torch import nn, Tensor 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class SiameseDistanceMetric(Enum): 9 | """ 10 | The metric for the contrastive loss 11 | """ 12 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 13 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 14 | COSINE_DISTANCE = lambda x, y: 1-F.cosine_similarity(x, y) 15 | 16 | 17 | class ContrastiveLoss(nn.Module): 18 | """ 19 | Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the 20 | two embeddings is reduced. If the label == 0, then the distance between the embeddings is increased. 21 | 22 | Further information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf 23 | 24 | :param model: SentenceTransformer model 25 | :param distance_metric: Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used 26 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 27 | :param size_average: Average by the size of the mini-batch. 
28 | 29 | Example:: 30 | 31 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 32 | from sentence_transformers.readers import InputExample 33 | 34 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 35 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 36 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 37 | train_dataset = SentencesDataset(train_examples, model) 38 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 39 | train_loss = losses.ContrastiveLoss(model=model) 40 | 41 | """ 42 | 43 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5, size_average:bool = True): 44 | super(ContrastiveLoss, self).__init__() 45 | self.distance_metric = distance_metric 46 | self.margin = margin 47 | self.model = model 48 | self.size_average = size_average 49 | 50 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 51 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 52 | assert len(reps) == 2 53 | rep_anchor, rep_other = reps 54 | distances = self.distance_metric(rep_anchor, rep_other) 55 | losses = 0.5 * (labels.float() * distances.pow(2) + (1 - labels).float() * F.relu(self.margin - distances).pow(2)) 56 | return losses.mean() if self.size_average else losses.sum() 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | 7 | class CosineSimilarityLoss(nn.Module): 8 | """ 9 | CosineSimilarityLoss expects, that the InputExamples consists of two texts and a float label. 10 | 11 | It computes the vectors u = model(input_text[0]) and v = model(input_text[1]) and measures the cosine-similarity between the two. 12 | By default, it minimizes the following loss: ||input_label - cos_score_transformation(cosine_sim(u,v))||_2. 13 | 14 | :param model: SentenceTranformer model 15 | :param loss_fct: Which pytorch loss function should be used to compare the cosine_similartiy(u,v) with the input_label? By default, MSE: ||input_label - cosine_sim(u,v)||_2 16 | :param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarity. By default, the identify function is used (i.e. no change). 
17 | 18 | Example:: 19 | 20 | from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses 21 | 22 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 23 | train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), 24 | InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)] 25 | train_dataset = SentencesDataset(train_examples, model) 26 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 27 | train_loss = losses.CosineSimilarityLoss(model=model) 28 | 29 | 30 | """ 31 | def __init__(self, model: SentenceTransformer, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()): 32 | super(CosineSimilarityLoss, self).__init__() 33 | self.model = model 34 | self.loss_fct = loss_fct 35 | self.cos_score_transformation = cos_score_transformation 36 | 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1])) 41 | return self.loss_fct(output, labels.view(-1)) 42 | 43 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MSELoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | 5 | 6 | class MSELoss(nn.Module): 7 | """ 8 | Computes the MSE loss between the computed sentence embedding and a target sentence embedding. This loss 9 | is used when extending sentence embeddings to new languages as described in our publication 10 | Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation: https://arxiv.org/abs/2004.09813 11 | 12 | For an example, see the documentation on extending language models to new languages. 13 | """ 14 | def __init__(self, model): 15 | super(MSELoss, self).__init__() 16 | self.model = model 17 | self.loss_fct = nn.MSELoss() 18 | 19 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 20 | rep = self.model(sentence_features[0])['sentence_embedding'] 21 | return self.loss_fct(rep, labels) 22 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MegaBatchMarginLoss.py: -------------------------------------------------------------------------------- 1 | from .. import util 2 | import torch 3 | from torch import nn, Tensor 4 | from typing import Iterable, Dict 5 | import torch.nn.functional as F 6 | 7 | class MegaBatchMarginLoss(nn.Module): 8 | """ 9 | Loss function inspired from ParaNMT paper: 10 | https://www.aclweb.org/anthology/P18-1042/ 11 | 12 | Given a large batch (like 500 or more examples) of (anchor_i, positive_i) pairs, 13 | find for each pair in the batch the hardest negative, i.e. find j != i such that cos_sim(anchor_i, positive_j) 14 | is maximal. Then create from this a triplet (anchor_i, positive_i, positive_j) where positive_j 15 | serves as the negative for this triplet. 
16 | 17 | Train than as with the triplet loss 18 | """ 19 | 20 | def __init__(self, model, positive_margin: float = 0.8, negative_margin: float = 0.3, use_mini_batched_version: bool = True, mini_batch_size: bool = 50): 21 | """ 22 | :param model: SentenceTransformerModel 23 | :param positive_margin: Positive margin, cos(anchor, positive) should be > positive_margin 24 | :param negative_margin: Negative margin, cos(anchor, negative) should be < negative_margin 25 | :param use_mini_batched_version: As large batch sizes require a lot of memory, we can use a mini-batched version. We break down the large batch with 500 examples to smaller batches with fewer examples. 26 | :param mini_batch_size: Size for the mini-batches. Should be a devisor for the batch size in your data loader. 27 | """ 28 | super(MegaBatchMarginLoss, self).__init__() 29 | self.model = model 30 | self.positive_margin = positive_margin 31 | self.negative_margin = negative_margin 32 | self.mini_batch_size = mini_batch_size 33 | self.forward = self.forward_mini_batched if use_mini_batched_version else self.forward_non_mini_batched 34 | 35 | 36 | def forward_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 37 | anchor, positive = sentence_features 38 | feature_names = list(anchor.keys()) 39 | 40 | with torch.no_grad(): 41 | self.model.eval() 42 | all_positive_emb = self.model(positive)['sentence_embedding'].detach() 43 | self.model.train() 44 | 45 | diagonal_matrix = torch.eye(len(all_positive_emb), len(all_positive_emb), device=all_positive_emb.device) 46 | 47 | #Iterate over the triplets (anchor, positive, hardest_negative) in smaller mini_batch sizes 48 | for start_idx in range(0, len(all_positive_emb), self.mini_batch_size): 49 | end_idx = start_idx + self.mini_batch_size 50 | anchor_emb = self.model({key: anchor[key][start_idx:end_idx] for key in feature_names})['sentence_embedding'] 51 | 52 | # Find hard negatives. For each anchor, find the hardest negative 53 | # Store them in the triplets (anchor, positive, hardest_negative) 54 | hard_negative_features = {key: [] for key in feature_names} 55 | with torch.no_grad(): 56 | cos_scores = util.pytorch_cos_sim(anchor_emb, all_positive_emb) 57 | negative_scores = cos_scores - 2 * diagonal_matrix[start_idx:end_idx] # Remove positive scores along the diagonal, set them to -1 so that they are not selected by the max() operation 58 | negatives_max, negatives_ids = torch.max(negative_scores, dim=1) 59 | 60 | for hard_negative_id in negatives_ids: 61 | for key in feature_names: 62 | hard_negative_features[key].append(positive[key][hard_negative_id]) 63 | 64 | for key in feature_names: 65 | hard_negative_features[key] = torch.stack(hard_negative_features[key]) 66 | 67 | 68 | #Compute differentiable negative and positive embeddings 69 | positive_emb = self.model({key: positive[key][start_idx:end_idx] for key in feature_names})['sentence_embedding'] 70 | negative_emb = self.model(hard_negative_features)['sentence_embedding'] 71 | 72 | assert anchor_emb.shape == positive_emb.shape 73 | assert anchor_emb.shape == negative_emb.shape 74 | 75 | #Compute loss 76 | pos_cosine = F.cosine_similarity(anchor_emb, positive_emb) 77 | neg_cosine = F.cosine_similarity(anchor_emb, negative_emb) 78 | losses = F.relu(self.positive_margin - pos_cosine) + F.relu(neg_cosine - self.negative_margin) 79 | losses = losses.mean() 80 | 81 | #Backpropagate unless it is the last mini batch. 
The last mini-batch will be backpropagated by the outer training loop 82 | if end_idx < len(cos_scores): 83 | losses.backward() 84 | 85 | return losses 86 | 87 | 88 | ##### Non mini-batched version ### 89 | def forward_non_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 90 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 91 | embeddings_a, embeddings_b = reps 92 | 93 | cos_scores = util.pytorch_cos_sim(embeddings_a, embeddings_b) 94 | positive_scores = torch.diagonal(cos_scores) 95 | negative_scores = cos_scores - (2*torch.eye(*cos_scores.shape, device=cos_scores.device)) # Remove positive scores along the diagonal 96 | negatives_max, _ = torch.max(negative_scores, dim=1) 97 | losses = F.relu(self.positive_margin - positive_scores) + F.relu(negatives_max - self.negative_margin) 98 | return losses.mean() 99 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MultipleNegativesRankingLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | from .. import util 6 | 7 | class MultipleNegativesRankingLoss(nn.Module): 8 | """ 9 | This loss expects as input a batch consisting of sentence pairs (a_1, p_1), (a_2, p_2)..., (a_n, p_n) 10 | where we assume that (a_i, p_i) are a positive pair and (a_i, p_j) for i!=j a negative pair. 11 | 12 | For each a_i, it uses all other p_j as negative samples, i.e., for a_i, we have 1 positive example (p_i) and 13 | n-1 negative examples (p_j). It then minimizes the negative log-likelihood for softmax-normalized scores. 14 | 15 | This loss function works well to train embeddings for retrieval setups where you have positive pairs (e.g. (query, relevant_doc)) 16 | as it will sample in each batch n-1 negative docs randomly. 17 | 18 | The performance usually increases with increasing batch sizes. 19 | 20 | For more information, see: https://arxiv.org/pdf/1705.00652.pdf 21 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 22 | 23 | You can also provide one or multiple hard negatives per anchor-positive pair by structuring the data like this: 24 | (a_1, p_1, n_1), (a_2, p_2, n_2) 25 | 26 | Here, n_1 is a hard negative for (a_1, p_1). The loss will use for the pair (a_i, p_i) all p_j (j!=i) and all n_j as negatives. 27 | 28 | Example:: 29 | 30 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 31 | from sentence_transformers.readers import InputExample 32 | 33 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 34 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1']), 35 | InputExample(texts=['Anchor 2', 'Positive 2'])] 36 | train_dataset = SentencesDataset(train_examples, model) 37 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 38 | train_loss = losses.MultipleNegativesRankingLoss(model=model) 39 | """ 40 | def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_fct = util.cos_sim): 41 | """ 42 | :param model: SentenceTransformer model 43 | :param scale: Output of similarity function is multiplied by scale value 44 | :param similarity_fct: similarity function between sentence embeddings. By default, cos_sim.
Can also be set to dot product (and then set scale to 1) 45 | """ 46 | super(MultipleNegativesRankingLoss, self).__init__() 47 | self.model = model 48 | self.scale = scale 49 | self.similarity_fct = similarity_fct 50 | self.cross_entropy_loss = nn.CrossEntropyLoss() 51 | 52 | 53 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 54 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 55 | embeddings_a = reps[0] 56 | embeddings_b = torch.cat(reps[1:]) 57 | 58 | 59 | 60 | scores = self.similarity_fct(embeddings_a, embeddings_b) * self.scale 61 | labels = torch.tensor(range(len(scores)), dtype=torch.long, device=scores.device) # Example a[i] should match with b[i] 62 | return self.cross_entropy_loss(scores, labels) 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /sentence_transformers/losses/OnlineContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import torch.nn.functional as F 3 | from torch import nn, Tensor 4 | from .ContrastiveLoss import SiameseDistanceMetric 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class OnlineContrastiveLoss(nn.Module): 9 | """ 10 | Online Contrastive loss. Similar to ContrastiveLoss, but it selects hard positive pairs (positives that are far apart) 11 | and hard negative pairs (negatives that are close) and computes the loss only for these pairs. Often yields 12 | better performance than ContrastiveLoss. 13 | 14 | :param model: SentenceTransformer model 15 | :param distance_metric: Function that returns a distance between two embeddings. The class SiameseDistanceMetric contains pre-defined metrics that can be used 16 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 17 | :param size_average: Average by the size of the mini-batch.
18 | 19 | Example:: 20 | 21 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 22 | from sentence_transformers.readers import InputExample 23 | 24 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 25 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 26 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 27 | train_dataset = SentencesDataset(train_examples, model) 28 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 29 | train_loss = losses.OnlineContrastiveLoss(model=model) 30 | """ 31 | 32 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5): 33 | super(OnlineContrastiveLoss, self).__init__() 34 | self.model = model 35 | self.margin = margin 36 | self.distance_metric = distance_metric 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor, size_average=False): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | 41 | distance_matrix = self.distance_metric(embeddings[0], embeddings[1]) 42 | negs = distance_matrix[labels == 0] 43 | poss = distance_matrix[labels == 1] 44 | 45 | # select hard positive and hard negative pairs 46 | negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())] 47 | positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())] 48 | 49 | positive_loss = positive_pairs.pow(2).sum() 50 | negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum() 51 | loss = positive_loss + negative_loss 52 | return loss -------------------------------------------------------------------------------- /sentence_transformers/losses/SoftmaxLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | import logging 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class SoftmaxLoss(nn.Module): 11 | """ 12 | This loss was used in our SBERT publication (https://arxiv.org/abs/1908.10084) to train the SentenceTransformer 13 | model on NLI data. It adds a softmax classifier on top of the output of two transformer networks. 14 | 15 | :param model: SentenceTransformer model 16 | :param sentence_embedding_dimension: Dimension of your sentence embeddings 17 | :param num_labels: Number of different labels 18 | :param concatenation_sent_rep: Concatenate vectors u,v for the softmax classifier? 19 | :param concatenation_sent_difference: Add abs(u-v) for the softmax classifier? 20 | :param concatenation_sent_multiplication: Add u*v for the softmax classifier? 
21 | 22 | Example:: 23 | 24 | from sentence_transformers import SentenceTransformer, SentencesDataset, losses 25 | from sentence_transformers.readers import InputExample 26 | 27 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 28 | train_examples = [InputExample(texts=['First pair, sent A', 'First pair, sent B'], label=0), 29 | InputExample(texts=['Second Pair, sent A', 'Second Pair, sent B'], label=3)] 30 | train_dataset = SentencesDataset(train_examples, model) 31 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 32 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 33 | """ 34 | def __init__(self, 35 | model: SentenceTransformer, 36 | sentence_embedding_dimension: int, 37 | num_labels: int, 38 | concatenation_sent_rep: bool = True, 39 | concatenation_sent_difference: bool = True, 40 | concatenation_sent_multiplication: bool = False): 41 | super(SoftmaxLoss, self).__init__() 42 | self.model = model 43 | self.num_labels = num_labels 44 | self.concatenation_sent_rep = concatenation_sent_rep 45 | self.concatenation_sent_difference = concatenation_sent_difference 46 | self.concatenation_sent_multiplication = concatenation_sent_multiplication 47 | 48 | num_vectors_concatenated = 0 49 | if concatenation_sent_rep: 50 | num_vectors_concatenated += 2 51 | if concatenation_sent_difference: 52 | num_vectors_concatenated += 1 53 | if concatenation_sent_multiplication: 54 | num_vectors_concatenated += 1 55 | logger.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated)) 56 | self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels) 57 | 58 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 59 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 60 | rep_a, rep_b = reps 61 | 62 | vectors_concat = [] 63 | if self.concatenation_sent_rep: 64 | vectors_concat.append(rep_a) 65 | vectors_concat.append(rep_b) 66 | 67 | if self.concatenation_sent_difference: 68 | vectors_concat.append(torch.abs(rep_a - rep_b)) 69 | 70 | if self.concatenation_sent_multiplication: 71 | vectors_concat.append(rep_a * rep_b) 72 | 73 | features = torch.cat(vectors_concat, 1) 74 | 75 | output = self.classifier(features) 76 | loss_fct = nn.CrossEntropyLoss() 77 | 78 | if labels is not None: 79 | loss = loss_fct(output, labels.view(-1)) 80 | return loss 81 | else: 82 | return reps, output 83 | -------------------------------------------------------------------------------- /sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | """ 18 | This class implements triplet loss.
Given a triplet of (anchor, positive, negative), 19 | the loss minimizes the distance between anchor and positive while it maximizes the distance 20 | between anchor and negative. It computes the following loss function: 21 | 22 | loss = max(||anchor - positive|| - ||anchor - negative|| + margin, 0). 23 | 24 | Margin is an important hyperparameter and needs to be tuned accordingly. 25 | 26 | For further details, see: https://en.wikipedia.org/wiki/Triplet_loss 27 | 28 | :param model: SentenceTransformerModel 29 | :param distance_metric: Function to compute distance between two embeddings. The class TripletDistanceMetric contains common distance metrics that can be used. 30 | :param triplet_margin: The negative should be at least this much further away from the anchor than the positive. 31 | 32 | Example:: 33 | 34 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 35 | from sentence_transformers.readers import InputExample 36 | 37 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 38 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1', 'Negative 1']), 39 | InputExample(texts=['Anchor 2', 'Positive 2', 'Negative 2'])] 40 | train_dataset = SentencesDataset(train_examples, model) 41 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.TripletLoss(model=model) 43 | """ 44 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin: float = 5): 45 | super(TripletLoss, self).__init__() 46 | self.model = model 47 | self.distance_metric = distance_metric 48 | self.triplet_margin = triplet_margin 49 | 50 | 51 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 52 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 53 | 54 | rep_anchor, rep_pos, rep_neg = reps 55 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 56 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 57 | 58 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 59 | return losses.mean() -------------------------------------------------------------------------------- /sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .MultipleNegativesRankingLoss import * 4 | from .TripletLoss import * 5 | from .MSELoss import * 6 | from .ContrastiveLoss import * 7 | from .ContrastiveTensionLoss import * 8 | from .OnlineContrastiveLoss import * 9 | from .MegaBatchMarginLoss import * 10 | from .DenoisingAutoEncoderLoss import * 11 | 12 | # Triplet losses 13 | from .BatchHardTripletLoss import * 14 | from .BatchHardSoftMarginTripletLoss import * 15 | from .BatchSemiHardTripletLoss import * 16 | from .BatchAllTripletLoss import * 17 | from .BYOLoss import * -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BYOLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BYOLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BYOLoss.cpython-38.pyc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BYOLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-38.pyc -------------------------------------------------------------------------------- 
/sentence_transformers/losses/__pycache__/MSELoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MSELoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MSELoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MSELoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/TripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/TripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/TripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/TripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class ALBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class BERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class BoW(nn.Module): 15 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 16 | 17 | A weighting can be added to allow the generation of tf-idf vectors. 
The output vector has the size of the vocab. 18 | """ 19 | 20 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 21 | super(BoW, self).__init__() 22 | vocab = list(set(vocab)) #Ensure vocab is unique 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | self.cumulative_term_frequency = cumulative_term_frequency 28 | 29 | #Maps wordIdx -> word weight 30 | self.weights = [] 31 | num_unknown_words = 0 32 | for word in vocab: 33 | weight = unknown_word_weight 34 | if word in word_weights: 35 | weight = word_weights[word] 36 | elif word.lower() in word_weights: 37 | weight = word_weights[word.lower()] 38 | else: 39 | num_unknown_words += 1 40 | self.weights.append(weight) 41 | 42 | logger.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 43 | 44 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 45 | self.sentence_embedding_dimension = len(vocab) 46 | 47 | 48 | def forward(self, features: Dict[str, Tensor]): 49 | #Nothing to do, everything is done in get_sentence_features 50 | return features 51 | 52 | def tokenize(self, texts: List[str]) -> List[int]: 53 | tokenized = [self.tokenizer.tokenize(text) for text in texts] 54 | return self.get_sentence_features(tokenized) 55 | 56 | def get_sentence_embedding_dimension(self): 57 | return self.sentence_embedding_dimension 58 | 59 | def get_sentence_features(self, tokenized_texts: List[List[int]], pad_seq_length: int = 0): 60 | vectors = [] 61 | 62 | for tokens in tokenized_texts: 63 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 64 | for token in tokens: 65 | if self.cumulative_term_frequency: 66 | vector[token] += self.weights[token] 67 | else: 68 | vector[token] = self.weights[token] 69 | vectors.append(vector) 70 | 71 | return {'sentence_embedding': torch.tensor(vectors, dtype=torch.float)} 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return BoW(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5], stride_sizes: List[int] = None): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 
20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | if stride_sizes is None: 29 | stride_sizes = [1] * len(kernel_sizes) 30 | 31 | for kernel_size, stride in zip(kernel_sizes, stride_sizes): 32 | padding_size = int((kernel_size - 1) / 2) 33 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, 34 | kernel_size=kernel_size, 35 | stride=stride, 36 | padding=padding_size) 37 | self.convs.append(conv) 38 | 39 | def forward(self, features): 40 | token_embeddings = features['token_embeddings'] 41 | 42 | token_embeddings = token_embeddings.transpose(1, -1) 43 | vectors = [conv(token_embeddings) for conv in self.convs] 44 | out = torch.cat(vectors, 1).transpose(1, -1) 45 | 46 | features.update({'token_embeddings': out}) 47 | return features 48 | 49 | def get_word_embedding_dimension(self) -> int: 50 | return self.embeddings_dimension 51 | 52 | def tokenize(self, text: str) -> List[int]: 53 | raise NotImplementedError() 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 57 | json.dump(self.get_config_dict(), fOut, indent=2) 58 | 59 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 60 | 61 | def get_config_dict(self): 62 | return {key: self.__dict__[key] for key in self.config_keys} 63 | 64 | @staticmethod 65 | def load(input_path: str): 66 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 67 | config = json.load(fIn) 68 | 69 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')) 70 | model = CNN(**config) 71 | model.load_state_dict(weights) 72 | return model 73 | 74 | -------------------------------------------------------------------------------- /sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | 4 | class CamemBERT(Transformer): 5 | """ 6 | DEPRECATED: Please use models.Transformer instead. 7 | """ 8 | pass 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networks (DAN).
15 | 16 | :param in_features: Size of the input dimension 17 | :param out_features: Output size 18 | :param bias: Add a bias vector 19 | :param activation_function: Pytorch activation function applied on output 20 | :param init_weight: Initial value for the matrix of the linear layer 21 | :param init_bias: Initial value for the bias of the linear layer 22 | """ 23 | def __init__(self, in_features: int, out_features: int, bias: bool = True, activation_function=nn.Tanh(), init_weight: Tensor = None, init_bias: Tensor = None): 24 | super(Dense, self).__init__() 25 | self.in_features = in_features 26 | self.out_features = out_features 27 | self.bias = bias 28 | self.activation_function = activation_function 29 | self.linear = nn.Linear(in_features, out_features, bias=bias) 30 | 31 | if init_weight is not None: 32 | self.linear.weight = nn.Parameter(init_weight) 33 | 34 | if init_bias is not None: 35 | self.linear.bias = nn.Parameter(init_bias) 36 | 37 | def forward(self, features: Dict[str, Tensor]): 38 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 39 | return features 40 | 41 | def get_sentence_embedding_dimension(self) -> int: 42 | return self.out_features 43 | 44 | def save(self, output_path): 45 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 46 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | @staticmethod 51 | def load(input_path): 52 | with open(os.path.join(input_path, 'config.json')) as fIn: 53 | config = json.load(fIn) 54 | 55 | config['activation_function'] = import_from_string(config['activation_function'])() 56 | model = Dense(**config) 57 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 58 | return model 59 | -------------------------------------------------------------------------------- /sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class DistilBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /sentence_transformers/models/LayerNorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class LayerNorm(nn.Module): 10 | def __init__(self, dimension: int): 11 | super(LayerNorm, self).__init__() 12 | self.dimension = dimension 13 | self.norm = nn.LayerNorm(dimension) 14 | 15 | 16 | def forward(self, features: Dict[str, Tensor]): 17 | features['sentence_embedding'] = self.norm(features['sentence_embedding']) 18 | return features 19 | 20 | 21 | def get_sentence_embedding_dimension(self): 22 | return self.dimension 23 | 24 | def save(self, output_path): 25 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 26 | json.dump({'dimension': self.dimension}, fOut, indent=2) 27 | 28 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 29 | 30 | @staticmethod 31 | def load(input_path): 32 | with open(os.path.join(input_path, 'config.json')) as fIn: 33 | config = json.load(fIn) 34 | 35 | model = LayerNorm(**config) 36 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 37 | return model 
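Usage sketch: the LayerNorm module above normalizes the pooled 'sentence_embedding' feature, so it is typically appended as the last module of a SentenceTransformer pipeline. A minimal sketch, assuming LayerNorm is exported from sentence_transformers.models; the checkpoint name is a placeholder::

    from sentence_transformers import SentenceTransformer, models

    # Token encoder + pooling, as in the other model examples in this repository
    word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=128)  # placeholder checkpoint
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

    # Normalize the pooled sentence embedding with the LayerNorm module defined above
    layer_norm = models.LayerNorm(pooling_model.get_sentence_embedding_dimension())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model, layer_norm])
    embeddings = model.encode(['An example sentence', 'Another example sentence'])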
-------------------------------------------------------------------------------- /sentence_transformers/models/Normalize.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import Dict 4 | import torch.nn.functional as F 5 | 6 | class Normalize(nn.Module): 7 | """ 8 | This layer normalizes embeddings to unit length 9 | """ 10 | def __init__(self): 11 | super(Normalize, self).__init__() 12 | 13 | def forward(self, features: Dict[str, Tensor]): 14 | features.update({'sentence_embedding': F.normalize(features['sentence_embedding'], p=2, dim=1)}) 15 | return features 16 | 17 | def save(self, output_path): 18 | pass 19 | 20 | @staticmethod 21 | def load(input_path): 22 | return Normalize() 23 | -------------------------------------------------------------------------------- /sentence_transformers/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates a fixed-sized sentence embedding from a variable-sized sentence. This layer also allows using the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 14 | 15 | :param word_embedding_dimension: Dimensions for the word embeddings 16 | :param pooling_mode: Can be a string: mean/max/cls. If set, overwrites the other pooling_mode_* settings 17 | :param pooling_mode_cls_token: Use the first token (CLS token) as text representations 18 | :param pooling_mode_max_tokens: Use max in each dimension over all tokens. 19 | :param pooling_mode_mean_tokens: Perform mean-pooling 20 | :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt(input_length).
21 | """ 22 | def __init__(self, 23 | word_embedding_dimension: int, 24 | pooling_mode: str = None, 25 | pooling_mode_cls_token: bool = False, 26 | pooling_mode_max_tokens: bool = False, 27 | pooling_mode_mean_tokens: bool = True, 28 | pooling_mode_mean_sqrt_len_tokens: bool = False, 29 | ): 30 | super(Pooling, self).__init__() 31 | 32 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 33 | 34 | if pooling_mode is not None: #Set pooling mode by string 35 | pooling_mode = pooling_mode.lower() 36 | assert pooling_mode in ['mean', 'max', 'cls'] 37 | pooling_mode_cls_token = (pooling_mode == 'cls') 38 | pooling_mode_max_tokens = (pooling_mode == 'max') 39 | pooling_mode_mean_tokens = (pooling_mode == 'mean') 40 | 41 | self.word_embedding_dimension = word_embedding_dimension 42 | self.pooling_mode_cls_token = pooling_mode_cls_token 43 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 44 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 45 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 46 | 47 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 48 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 49 | 50 | def forward(self, features: Dict[str, Tensor]): 51 | token_embeddings = features['token_embeddings'] 52 | cls_token = features['cls_token_embeddings'] 53 | attention_mask = features['attention_mask'] 54 | 55 | ## Pooling strategy 56 | output_vectors = [] 57 | if self.pooling_mode_cls_token: 58 | output_vectors.append(cls_token) 59 | if self.pooling_mode_max_tokens: 60 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 61 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 62 | max_over_time = torch.max(token_embeddings, 1)[0] 63 | output_vectors.append(max_over_time) 64 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 65 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 66 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 67 | 68 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 69 | if 'token_weights_sum' in features: 70 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 71 | else: 72 | sum_mask = input_mask_expanded.sum(1) 73 | 74 | sum_mask = torch.clamp(sum_mask, min=1e-9) 75 | 76 | if self.pooling_mode_mean_tokens: 77 | output_vectors.append(sum_embeddings / sum_mask) 78 | if self.pooling_mode_mean_sqrt_len_tokens: 79 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 80 | 81 | output_vector = torch.cat(output_vectors, 1) 82 | features.update({'sentence_embedding': output_vector}) 83 | return features 84 | 85 | def get_sentence_embedding_dimension(self): 86 | return self.pooling_output_dimension 87 | 88 | def get_config_dict(self): 89 | return {key: self.__dict__[key] for key in self.config_keys} 90 | 91 | def save(self, output_path): 92 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 93 | json.dump(self.get_config_dict(), fOut, indent=2) 94 | 95 | @staticmethod 96 | def load(input_path): 97 | with open(os.path.join(input_path, 'config.json')) as fIn: 98 | config = json.load(fIn) 99 | 100 | return Pooling(**config) 
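Usage sketch: each enabled pooling_mode_* flag contributes one vector of size word_embedding_dimension, and the enabled vectors are concatenated, so the sentence embedding dimension is the number of enabled modes times the word embedding dimension. A minimal sketch of concatenated mean- and max-pooling; the checkpoint name is a placeholder::

    from sentence_transformers import SentenceTransformer, models

    word_embedding_model = models.Transformer('distilbert-base-uncased')  # placeholder checkpoint
    dim = word_embedding_model.get_word_embedding_dimension()

    # Enable mean- and max-pooling together: the pooled vectors are concatenated,
    # so the sentence embedding dimension becomes 2 * dim
    pooling_model = models.Pooling(dim,
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_max_tokens=True)
    assert pooling_model.get_sentence_embedding_dimension() == 2 * dim

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])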
101 | -------------------------------------------------------------------------------- /sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class RoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /sentence_transformers/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class T5(nn.Module): 13 | """DEPRECATED: Please use models.Transformer instead. 14 | 15 | T5 model to generate token embeddings. 16 | 17 | Each token is mapped to an output vector from BERT. 18 | """ 19 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, task_identifier: str = 'stsb sentence1: ', model_args: Dict = {}, tokenizer_args: Dict = {}): 20 | super(T5, self).__init__() 21 | self.config_keys = ['max_seq_length', 'do_lower_case', 'task_identifier'] 22 | self.do_lower_case = do_lower_case 23 | 24 | if max_seq_length > 512: 25 | logger.warning("T5 only allows a max_seq_length of 512. Value will be set to 512") 26 | max_seq_length = 512 27 | self.max_seq_length = max_seq_length 28 | 29 | if self.do_lower_case is not None: 30 | tokenizer_args['do_lower_case'] = do_lower_case 31 | 32 | self.t5model = T5Model.from_pretrained(model_name_or_path, **model_args) 33 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 34 | self.task_identifier = task_identifier 35 | 36 | def forward(self, features): 37 | """Returns token_embeddings, cls_token""" 38 | output_states = self.t5model.encoder(input_ids=features['input_ids'], attention_mask=features['attention_mask']) 39 | output_tokens = output_states[0] 40 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 41 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens}) 42 | 43 | if len(output_states) > 1: 44 | features.update({'all_layer_embeddings': output_states[1]}) 45 | 46 | return features 47 | 48 | def get_word_embedding_dimension(self) -> int: 49 | return self.t5model.config.hidden_size 50 | 51 | def tokenize(self, text: str) -> List[int]: 52 | """ 53 | Tokenizes a text and maps tokens to token-ids 54 | """ 55 | return self.tokenizer.encode(self.task_identifier+text) 56 | 57 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 58 | """ 59 | Convert tokenized sentence in its embedding ids, segment ids and mask 60 | 61 | :param tokens: 62 | a tokenized sentence 63 | :param pad_seq_length: 64 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 65 | :return: embedding ids, segment ids and mask for the sentence 66 | """ 67 | 68 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 69 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, padding='max_length', return_tensors='pt', truncation=True, prepend_batch_axis=True) 70 | 71 | def get_config_dict(self): 72 | return {key: self.__dict__[key] for key in self.config_keys} 73 | 74 | def save(self, output_path: str): 75 | self.t5model.save_pretrained(output_path) 76 | self.tokenizer.save_pretrained(output_path) 77 | 78 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 79 | json.dump(self.get_config_dict(), fOut, indent=2) 80 | 81 | @staticmethod 82 | def load(input_path: str): 83 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 84 | config = json.load(fIn) 85 | return T5(model_name_or_path=input_path, **config) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /sentence_transformers/models/Transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoModel, AutoTokenizer, AutoConfig 3 | import json 4 | from typing import List, Dict, Optional, Union, Tuple 5 | import os 6 | 7 | 8 | class Transformer(nn.Module): 9 | """Huggingface AutoModel to generate token embeddings. 10 | Loads the correct class, e.g. BERT / RoBERTa etc. 11 | 12 | :param model_name_or_path: Huggingface models name (https://huggingface.co/models) 13 | :param max_seq_length: Truncate any inputs longer than max_seq_length 14 | :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model 15 | :param cache_dir: Cache dir for Huggingface Transformers to store/load models 16 | :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model 17 | :param do_lower_case: If true, lowercases the input (independet if the model is cased or not) 18 | """ 19 | def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = 64, 20 | model_args: Dict = {}, cache_dir: Optional[str] = None, 21 | tokenizer_args: Dict = {}, do_lower_case: bool = False): 22 | super(Transformer, self).__init__() 23 | self.config_keys = ['max_seq_length', 'do_lower_case'] 24 | self.max_seq_length = max_seq_length 25 | self.do_lower_case = do_lower_case 26 | 27 | config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) 28 | self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir) 29 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir, **tokenizer_args) 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']} 35 | if 'token_type_ids' in features: 36 | trans_features['token_type_ids'] = features['token_type_ids'] 37 | 38 | output_states = self.auto_model(**trans_features, return_dict=False) 39 | output_tokens = output_states[0] 40 | 41 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 42 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 43 | 44 | if self.auto_model.config.output_hidden_states: 45 | all_layer_idx = 2 46 | if 
len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states 47 | all_layer_idx = 1 48 | 49 | hidden_states = output_states[all_layer_idx] 50 | features.update({'all_layer_embeddings': hidden_states}) 51 | 52 | return features 53 | 54 | def get_word_embedding_dimension(self) -> int: 55 | return self.auto_model.config.hidden_size 56 | 57 | def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]): 58 | """ 59 | Tokenizes a text and maps tokens to token-ids 60 | """ 61 | output = {} 62 | if isinstance(texts[0], str): 63 | to_tokenize = [texts] 64 | elif isinstance(texts[0], dict): 65 | to_tokenize = [] 66 | output['text_keys'] = [] 67 | for lookup in texts: 68 | text_key, text = next(iter(lookup.items())) 69 | to_tokenize.append(text) 70 | output['text_keys'].append(text_key) 71 | to_tokenize = [to_tokenize] 72 | else: 73 | batch1, batch2 = [], [] 74 | for text_tuple in texts: 75 | batch1.append(text_tuple[0]) 76 | batch2.append(text_tuple[1]) 77 | to_tokenize = [batch1, batch2] 78 | 79 | #strip 80 | to_tokenize = [[s.strip() for s in col] for col in to_tokenize] 81 | 82 | #Lowercase 83 | if self.do_lower_case: 84 | to_tokenize = [[s.lower() for s in col] for col in to_tokenize] 85 | 86 | 87 | output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length)) 88 | return output 89 | 90 | 91 | def get_config_dict(self): 92 | return {key: self.__dict__[key] for key in self.config_keys} 93 | 94 | def save(self, output_path: str): 95 | self.auto_model.save_pretrained(output_path) 96 | self.tokenizer.save_pretrained(output_path) 97 | 98 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 99 | json.dump(self.get_config_dict(), fOut, indent=2) 100 | 101 | @staticmethod 102 | def load(input_path: str): 103 | #Old classes used other config names than 'sentence_bert_config.json' 104 | for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', 'sentence_distilbert_config.json', 'sentence_camembert_config.json', 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', 'sentence_xlnet_config.json']: 105 | sbert_config_path = os.path.join(input_path, config_name) 106 | if os.path.exists(sbert_config_path): 107 | break 108 | 109 | with open(sbert_config_path) as fIn: 110 | config = json.load(fIn) 111 | return Transformer(model_name_or_path=input_path, **config) 112 | 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /sentence_transformers/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class WeightedLayerPooling(nn.Module): 10 | """ 11 | Token embeddings are weighted mean of their different hidden layer representations 12 | """ 13 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 14 | super(WeightedLayerPooling, self).__init__() 15 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.layer_start = layer_start 18 | self.num_hidden_layers = num_hidden_layers 19 | self.layer_weights = layer_weights if layer_weights is not None else 
nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 20 | 21 | def forward(self, features: Dict[str, Tensor]): 22 | ft_all_layers = features['all_layer_embeddings'] 23 | 24 | all_layer_embedding = torch.stack(ft_all_layers) 25 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Keep the outputs from layer_start (by default the 4th layer) onwards 26 | 27 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 28 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 29 | 30 | features.update({'token_embeddings': weighted_average}) 31 | return features 32 | 33 | def get_word_embedding_dimension(self): 34 | return self.word_embedding_dimension 35 | 36 | def get_config_dict(self): 37 | return {key: self.__dict__[key] for key in self.config_keys} 38 | 39 | def save(self, output_path): 40 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 41 | json.dump(self.get_config_dict(), fOut, indent=2) 42 | 43 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 44 | 45 | 46 | @staticmethod 47 | def load(input_path): 48 | with open(os.path.join(input_path, 'config.json')) as fIn: 49 | config = json.load(fIn) 50 | 51 | model = WeightedLayerPooling(**config) 52 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 53 | return model 54 | -------------------------------------------------------------------------------- /sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class WordWeights(nn.Module): 13 | """This model can weight word embeddings, for example, with idf-values.""" 14 | 15 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 16 | """ 17 | 18 | :param vocab: 19 | Vocabulary of the tokenizer 20 | :param word_weights: 21 | Mapping of tokens to a float weight value. Word embeddings are multiplied by this float value. The tokens in word_weights need not match the vocab exactly (it can contain more or fewer entries) 22 | :param unknown_word_weight: 23 | Weight for words in the vocab that do not appear in the word_weights lookup, for example rare words for which no weight exists. 24 | """ 25 | super(WordWeights, self).__init__() 26 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 27 | self.vocab = vocab 28 | self.word_weights = word_weights 29 | self.unknown_word_weight = unknown_word_weight 30 | 31 | weights = [] 32 | num_unknown_words = 0 33 | for word in vocab: 34 | weight = unknown_word_weight 35 | if word in word_weights: 36 | weight = word_weights[word] 37 | elif word.lower() in word_weights: 38 | weight = word_weights[word.lower()] 39 | else: 40 | num_unknown_words += 1 41 | weights.append(weight) 42 | 43 | logger.info("{} of {} words without a weighting value.
Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 44 | 45 | self.emb_layer = nn.Embedding(len(vocab), 1) 46 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 47 | 48 | 49 | def forward(self, features: Dict[str, Tensor]): 50 | attention_mask = features['attention_mask'] 51 | token_embeddings = features['token_embeddings'] 52 | 53 | #Compute a weight value for each token 54 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 55 | token_weights = token_weights_raw * attention_mask.float() 56 | token_weights_sum = torch.sum(token_weights, 1) 57 | 58 | #Multiply embedding by token weight value 59 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 60 | token_embeddings = token_embeddings * token_weights_expanded 61 | 62 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 63 | return features 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return WordWeights(**config) 78 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLMRoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLNet(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 
6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .Transformer import Transformer 2 | from .Asym import Asym 3 | from .BoW import BoW 4 | from .CNN import CNN 5 | from .Dense import Dense 6 | from .LayerNorm import LayerNorm 7 | from .LSTM import LSTM 8 | from .Normalize import Normalize 9 | from .Pooling import Pooling 10 | from .WKPooling import WKPooling 11 | from .WeightedLayerPooling import WeightedLayerPooling 12 | from .WordEmbeddings import WordEmbeddings 13 | from .WordWeights import WordWeights 14 | -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Asym.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Asym.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Asym.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Asym.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BERT.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BERT.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BERT.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BERT.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BoW.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BoW.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BoW.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BoW.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/CNN.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/CNN.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/CNN.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/CNN.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Dense.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Dense.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Dense.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Dense.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/DistilBERT.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/DistilBERT.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LSTM.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LSTM.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LSTM.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LSTM.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LayerNorm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LayerNorm.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LayerNorm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LayerNorm.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Normalize.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Normalize.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Normalize.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Normalize.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Pooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Pooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Pooling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Pooling.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Transformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Transformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Transformer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WKPooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WKPooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WKPooling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WKPooling.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordEmbeddings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordEmbeddings.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordEmbeddings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordEmbeddings.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordWeights.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordWeights.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordWeights.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordWeights.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class PhraseTokenizer(WordTokenizer): 14 | """Tokenizes the text with respect to existent phrases in the vocab. 15 | 16 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 17 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... 
=> [New_York, is, the]) 18 | """ 19 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 20 | self.stop_words = set(stop_words) 21 | self.do_lower_case = do_lower_case 22 | self.ngram_separator = ngram_separator 23 | self.max_ngram_length = max_ngram_length 24 | self.set_vocab(vocab) 25 | 26 | def get_vocab(self): 27 | return self.vocab 28 | 29 | def set_vocab(self, vocab: Iterable[str]): 30 | self.vocab = vocab 31 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 32 | 33 | # Check for ngrams in vocab 34 | self.ngram_lookup = set() 35 | self.ngram_lengths = set() 36 | for word in vocab: 37 | 38 | if self.ngram_separator is not None and self.ngram_separator in word: 39 | # Some words might be malformed in e.g. the Google News word2vec file, containing two or more _ in a row 40 | ngram_count = word.count(self.ngram_separator) + 1 41 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 42 | self.ngram_lookup.add(word) 43 | self.ngram_lengths.add(ngram_count) 44 | 45 | if len(vocab) > 0: 46 | logger.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 47 | logger.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | tokens = nltk.word_tokenize(text, preserve_line=True) 51 | 52 | #phrase detection 53 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 54 | idx = 0 55 | while idx <= len(tokens) - ngram_len: 56 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 57 | if ngram in self.ngram_lookup: 58 | tokens[idx:idx + ngram_len] = [ngram] 59 | elif ngram.lower() in self.ngram_lookup: 60 | tokens[idx:idx + ngram_len] = [ngram.lower()] 61 | idx += 1 62 | 63 | #Map tokens to idx, filter stop words 64 | tokens_filtered = [] 65 | for token in tokens: 66 | if token in self.stop_words: 67 | continue 68 | elif token in self.word2idx: 69 | tokens_filtered.append(self.word2idx[token]) 70 | continue 71 | 72 | token = token.lower() 73 | if token in self.stop_words: 74 | continue 75 | elif token in self.word2idx: 76 | tokens_filtered.append(self.word2idx[token]) 77 | continue 78 | 79 | token = token.strip(string.punctuation) 80 | if token in self.stop_words: 81 | continue 82 | elif len(token) > 0 and token in self.word2idx: 83 | tokens_filtered.append(self.word2idx[token]) 84 | continue 85 | 86 | return tokens_filtered 87 | 88 | def save(self, output_path: str): 89 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 90 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 91 | 92 | @staticmethod 93 | def load(input_path: str): 94 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 95 | config = json.load(fIn) 96 | 97 | return PhraseTokenizer(**config) 98 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer,
ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WordTokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | 4 | ENGLISH_STOP_WORDS = ['!', '"', "''", "``", '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldn', 'couldnt', 'cry', 'd', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 
'hadn', 'has', 'hasn', 'hasnt', 'have', 'haven', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'ltd', 'm', 'ma', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mightn', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'mustn', 'my', 'myself', 'name', 'namely', 'needn', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'shan', 'she', 'should', 'shouldn', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 've', 'very', 'via', 'was', 'wasn', 'we', 'well', 'were', 'weren', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'won', 'would', 'wouldn', 'y', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves'] 5 | 6 | 7 | class WordTokenizer(ABC): 8 | @abstractmethod 9 | def set_vocab(self, vocab: Iterable[str]): 10 | pass 11 | 12 | @abstractmethod 13 | def get_vocab(self, vocab: Iterable[str]): 14 | pass 15 | 16 | @abstractmethod 17 | def tokenize(self, text: str) -> List[int]: 18 | pass 19 | 20 | @abstractmethod 21 | def save(self, output_path: str): 22 | pass 23 | 24 | @staticmethod 25 | @abstractmethod 26 | def load(input_path: str): 27 | pass -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | 13 | :param guid 14 | id for the example 15 | :param texts 16 | the texts for the example. Note, str.strip() is called on the texts 17 | :param label 18 | the label for the example 19 | """ 20 | self.guid = guid 21 | self.texts = texts 22 | self.label = label 23 | 24 | def __str__(self): 25 | return " label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) -------------------------------------------------------------------------------- /sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 
9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator='\t'): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | self.separator = separator 16 | 17 | def get_examples(self, filename, max_examples=0): 18 | examples = [] 19 | 20 | id = 0 21 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 22 | splits = line.strip().split(self.separator) 23 | label = splits[self.label_col_idx] 24 | sentence = splits[self.sentence_col_idx] 25 | 26 | if label not in self.label_map: 27 | self.label_map[label] = len(self.label_map) 28 | 29 | label_id = self.label_map[label] 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 33 | 34 | if 0 < max_examples <= id: 35 | break 36 | 37 | return examples 38 | -------------------------------------------------------------------------------- /sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /sentence_transformers/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in a pair dataset that is split across two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """Reads parallel lines from the paired files and returns them as InputExamples.""" 17 | 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | The default values expect a tab-separated file where the first and second columns contain the sentence pair and the third column the score. The default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specifies which data split to use (train.csv, dev.csv, test.csv). 27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in columns 5 and 6, the score is in column 4.
49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter=delimiter, 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(texts=[s1, s2, s3])) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/InputExample.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/InputExample.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/InputExample.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/InputExample.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/NLIDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/NLIDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/NLIDataReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/NLIDataReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/STSDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/STSDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/STSDataReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/STSDataReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/TripletReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/TripletReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/TripletReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/TripletReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/__init__.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | 7 | 8 | setup( 9 | name="sentence-transformers", 10 | version="1.2.0", 11 | author="Nils Reimers", 12 | author_email="info@nils-reimers.de", 13 | description="Sentence Embeddings using BERT / RoBERTa / XLM-R", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | license="Apache License 2.0", 17 | url="https://github.com/UKPLab/sentence-transformers", 18 | download_url="https://github.com/UKPLab/sentence-transformers/archive/v1.2.0.zip", 19 | packages=find_packages(), 20 | install_requires=[ 21 | 'transformers>=3.1.0,<5.0.0', 22 | 'tqdm', 23 | 'torch>=1.6.0', 24 | 'torchvision', 25 | 'numpy', 26 | 'scikit-learn', 27 | 'scipy', 28 | 'nltk', 29 | 'sentencepiece' 30 | ], 31 | classifiers=[ 32 | "Development Status :: 4 - Beta", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Programming Language :: Python :: 3.6", 36 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 37 | ], 38 | keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning" 39 | ) 40 | -------------------------------------------------------------------------------- /training/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/training/.DS_Store -------------------------------------------------------------------------------- /training/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/training/data/.DS_Store -------------------------------------------------------------------------------- /training/data/back_translated_nli.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/training/data/back_translated_nli.txt.zip -------------------------------------------------------------------------------- /training/supervised_tuning.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | import math 3 | from sentence_transformers import models, losses 4 | from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample 5 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 6 | import logging 7 | from datetime import datetime 8 | import os 9 | import gzip 10 | import csv 11 | 12 | #### Just some code to print debug information to stdout 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | 
datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | #### /print debug information to stdout 18 | 19 | # Training parameters 20 | model_name = 'nli-roberta-base-v2' 21 | train_batch_size = 128 22 | num_epochs = 1 23 | max_seq_length = 64 24 | moving_average_decay = 0.9999 25 | nli_dataset_path = 'data/AllNLI' 26 | 27 | # Save path to store our model 28 | model_save_path = 'output/BSL_tuning-{}-{}-{}'.format(model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 29 | 30 | # Check if dataset exsist. If not, download and extract it 31 | sts_dataset_path = 'data/stsbenchmark.tsv.gz' 32 | 33 | if not os.path.exists(sts_dataset_path): 34 | util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path) 35 | 36 | nli_dataset_path = 'data/AllNLI.tsv.gz' 37 | if not os.path.exists(nli_dataset_path): 38 | util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path) 39 | 40 | model = SentenceTransformer(model_name) 41 | 42 | # Read the AllNLI.tsv.gz file and create the training dataset 43 | logging.info("Read AllNLI train dataset") 44 | 45 | label2int = {"contradiction": 0, "entailment": 1, "neutral": 2} 46 | train_samples = [] 47 | count = 0 48 | with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn: 49 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 50 | for row in reader: 51 | if row['split'] == 'train': 52 | # label_id = label2int[row['label']] 53 | if row['label'] == "entailment": 54 | count += 1 55 | train_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']])) 56 | 57 | 58 | 59 | # Read STSbenchmark dataset and use it as development set 60 | logging.info("Read STSbenchmark dev dataset") 61 | dev_samples = [] 62 | test_samples = [] 63 | with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: 64 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 65 | for row in reader: 66 | score = float(row['score']) / 5.0 # Normalize score to range 0 ... 
1 67 | if row['split'] == 'dev': 68 | dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 69 | elif row['split'] == 'test': 70 | test_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 71 | 72 | dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev') 73 | test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test') 74 | 75 | train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True) 76 | train_loss = losses.BYOLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), moving_average_decay=moving_average_decay) 77 | 78 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up 79 | evaluation_steps = int(len(train_dataloader) * 0.1) #Evaluate every 10% of the data 80 | logging.info("Training sentences: {}".format(len(train_samples))) 81 | logging.info("Warmup-steps: {}".format(warmup_steps)) 82 | logging.info("Performance before training") 83 | dev_evaluator(model) 84 | 85 | # Train the model 86 | model.fit(train_objectives=[(train_dataloader, train_loss)], 87 | evaluator=dev_evaluator, 88 | epochs=num_epochs, 89 | evaluation_steps=evaluation_steps, 90 | warmup_steps=warmup_steps, 91 | output_path=model_save_path, 92 | optimizer_params={'lr': 5e-5}, 93 | use_amp=True #Set to True, if your GPU supports FP16 cores 94 | ) 95 | 96 | ############################################################################## 97 | # 98 | # Load the stored model and evaluate its performance on STS benchmark dataset 99 | # 100 | ############################################################################## 101 | 102 | 103 | model = SentenceTransformer(model_save_path) 104 | test_evaluator(model, output_path=model_save_path) 105 | -------------------------------------------------------------------------------- /training/unsupervised_tuning.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | import math 3 | from sentence_transformers import models, losses 4 | from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample 5 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 6 | import logging 7 | from datetime import datetime 8 | import os 9 | import gzip 10 | import csv 11 | 12 | #### Just some code to print debug information to stdout 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | 18 | # Training parameters 19 | model_name = 'bert-base-uncased' 20 | train_batch_size = 64 21 | num_epochs = 1 22 | max_seq_length = 64 23 | #predictor_layer_num = 3 24 | moving_average_decay = 0.999 25 | un_nli_dataset_path = 'data/back_translated_nli.txt' 26 | 27 | # Save path to store our model 28 | model_save_path = 'output/BSL_tuning-{}-{}-{}'.format(model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 29 | 30 | # Check if dataset exsist. 
If not, download and extract it 31 | sts_dataset_path = 'data/stsbenchmark.tsv.gz' 32 | 33 | if not os.path.exists(sts_dataset_path): 34 | util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path) 35 | 36 | 37 | word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length) 38 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 39 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 40 | 41 | 42 | #train_samples is a list of InputExample objects where we pass the same sentence twice to texts, i.e. texts=[sent, sent] 43 | train_samples = [] 44 | with open(un_nli_dataset_path, 'r', encoding='utf8') as fIn: 45 | for line in fIn: 46 | line = line.strip() 47 | seg = line.strip().split('\t') 48 | train_samples.append(InputExample(texts=[seg[0], seg[1]])) 49 | 50 | 51 | # Read STSbenchmark dataset and use it as development set 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_samples = [] 54 | test_samples = [] 55 | with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: 56 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 57 | for row in reader: 58 | score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1 59 | if row['split'] == 'dev': 60 | dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 61 | elif row['split'] == 'test': 62 | test_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 63 | 64 | dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev') 65 | test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test') 66 | 67 | train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True) 68 | train_loss = losses.BYOLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), moving_average_decay=moving_average_decay) 69 | 70 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up 71 | evaluation_steps = int(len(train_dataloader) * 0.1) #Evaluate every 10% of the data 72 | logging.info("Training sentences: {}".format(len(train_samples))) 73 | logging.info("Warmup-steps: {}".format(warmup_steps)) 74 | logging.info("Performance before training") 75 | dev_evaluator(model) 76 | 77 | # Train the model 78 | model.fit(train_objectives=[(train_dataloader, train_loss)], 79 | evaluator=dev_evaluator, 80 | epochs=num_epochs, 81 | evaluation_steps=evaluation_steps, 82 | warmup_steps=warmup_steps, 83 | output_path=model_save_path, 84 | optimizer_params={'lr': 1e-4}, 85 | use_amp=True #Set to True, if your GPU supports FP16 cores 86 | ) 87 | 88 | ############################################################################## 89 | # 90 | # Load the stored model and evaluate its performance on STS benchmark dataset 91 | # 92 | ############################################################################## 93 | 94 | 95 | model = SentenceTransformer(model_save_path) 96 | test_evaluator(model, output_path=model_save_path) 97 | --------------------------------------------------------------------------------
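The modules dumped above (Transformer, WeightedLayerPooling, Pooling) are meant to be chained inside a SentenceTransformer. The following is a minimal usage sketch added by the editor, not part of the repository: the model name 'bert-base-uncased' and the example sentences are placeholders, and it assumes the sentence_transformers package contained in this repository is installed. model_args={'output_hidden_states': True} is needed so that Transformer.forward() exposes 'all_layer_embeddings' for WeightedLayerPooling.

from sentence_transformers import SentenceTransformer, models, util

# Token embeddings from a Huggingface model; output_hidden_states=True exposes all hidden layers
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=64,
                                          model_args={'output_hidden_states': True})

# Weighted average over the hidden layers, starting at layer_start (default: the 4th layer)
weighted_pooling = models.WeightedLayerPooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    num_hidden_layers=word_embedding_model.auto_model.config.num_hidden_layers,
    layer_start=4)

# Mean pooling over the (re-weighted) token embeddings gives one vector per sentence
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, weighted_pooling, pooling_model])

embeddings = model.encode(['A sample sentence.', 'Another sample sentence.'], convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(embeddings, embeddings)  # pairwise cosine similarities
print(cos_scores[0][1])

Dropping the WeightedLayerPooling module gives the plain Transformer + mean-pooling stack that unsupervised_tuning.py builds.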
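After either training script finishes, the stored checkpoint can be reused directly for similarity scoring. A minimal sketch under the assumption that a run has completed: the path below stands in for whatever model_save_path your run produced, and the two sentences are placeholders.

from sentence_transformers import SentenceTransformer, util

# Placeholder path: use the model_save_path printed by supervised_tuning.py / unsupervised_tuning.py
model = SentenceTransformer('output/BSL_tuning-bert-base-uncased-64-<timestamp>')

emb1 = model.encode(['A man is playing a guitar.'], convert_to_tensor=True)
emb2 = model.encode(['Someone is playing an instrument.'], convert_to_tensor=True)

# Cosine similarity between the two sentence embeddings (higher means more similar)
print(util.pytorch_cos_sim(emb1, emb2))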