├── .DS_Store ├── LICENSE ├── README.md ├── evaluation ├── .DS_Store ├── evaluation_inference_speed.py ├── evaluation_multilingual.py ├── evaluation_stsbenchmark.py ├── evaluation_stsbenchmark_sbert-wk.py ├── evaluation_translation_matching.py └── stsbenchmark.zip ├── index.rst ├── requirements.txt ├── sentence_transformers ├── LoggingHandler.py ├── SentenceTransformer.py ├── __init__.py ├── __pycache__ │ ├── LoggingHandler.cpython-36.pyc │ ├── LoggingHandler.cpython-38.pyc │ ├── SentenceTransformer.cpython-36.pyc │ ├── SentenceTransformer.cpython-38.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-38.pyc │ ├── util.cpython-36.pyc │ └── util.cpython-38.pyc ├── cross_encoder │ ├── CrossEncoder.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── CrossEncoder.cpython-36.pyc │ │ ├── CrossEncoder.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc │ └── evaluation │ │ ├── CEBinaryAccuracyEvaluator.py │ │ ├── CEBinaryClassificationEvaluator.py │ │ ├── CECorrelationEvaluator.py │ │ ├── CERerankingEvaluator.py │ │ ├── CESoftmaxAccuracyEvaluator.py │ │ └── __init__.py ├── datasets │ ├── DenoisingAutoEncoderDataset.py │ ├── NoDuplicatesDataLoader.py │ ├── ParallelSentencesDataset.py │ ├── SentenceLabelDataset.py │ ├── SentencesDataset.py │ ├── __init__.py │ └── __pycache__ │ │ ├── DenoisingAutoEncoderDataset.cpython-36.pyc │ │ ├── DenoisingAutoEncoderDataset.cpython-38.pyc │ │ ├── NoDuplicatesDataLoader.cpython-36.pyc │ │ ├── NoDuplicatesDataLoader.cpython-38.pyc │ │ ├── ParallelSentencesDataset.cpython-36.pyc │ │ ├── ParallelSentencesDataset.cpython-38.pyc │ │ ├── SentenceLabelDataset.cpython-36.pyc │ │ ├── SentenceLabelDataset.cpython-38.pyc │ │ ├── SentencesDataset.cpython-36.pyc │ │ ├── SentencesDataset.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── evaluation │ ├── BinaryClassificationEvaluator.py │ ├── EmbeddingSimilarityEvaluator.py │ ├── InformationRetrievalEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── MSEEvaluator.py │ ├── MSEEvaluatorFromDataFrame.py │ ├── ParaphraseMiningEvaluator.py │ ├── RerankingEvaluator.py │ ├── SentenceEvaluator.py │ ├── SequentialEvaluator.py │ ├── SimilarityFunction.py │ ├── TranslationEvaluator.py │ ├── TripletEvaluator.py │ ├── __init__.py │ └── __pycache__ │ │ ├── BinaryClassificationEvaluator.cpython-36.pyc │ │ ├── BinaryClassificationEvaluator.cpython-38.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-36.pyc │ │ ├── EmbeddingSimilarityEvaluator.cpython-38.pyc │ │ ├── InformationRetrievalEvaluator.cpython-36.pyc │ │ ├── InformationRetrievalEvaluator.cpython-38.pyc │ │ ├── LabelAccuracyEvaluator.cpython-36.pyc │ │ ├── LabelAccuracyEvaluator.cpython-38.pyc │ │ ├── MSEEvaluator.cpython-36.pyc │ │ ├── MSEEvaluator.cpython-38.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-36.pyc │ │ ├── MSEEvaluatorFromDataFrame.cpython-38.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-36.pyc │ │ ├── ParaphraseMiningEvaluator.cpython-38.pyc │ │ ├── RerankingEvaluator.cpython-36.pyc │ │ ├── RerankingEvaluator.cpython-38.pyc │ │ ├── SentenceEvaluator.cpython-36.pyc │ │ ├── SentenceEvaluator.cpython-38.pyc │ │ ├── SequentialEvaluator.cpython-36.pyc │ │ ├── SequentialEvaluator.cpython-38.pyc │ │ ├── SimilarityFunction.cpython-36.pyc │ │ ├── SimilarityFunction.cpython-38.pyc │ │ ├── TranslationEvaluator.cpython-36.pyc │ │ ├── TranslationEvaluator.cpython-38.pyc │ │ ├── TripletEvaluator.cpython-36.pyc │ │ ├── TripletEvaluator.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── losses │ 
├── BYOLoss.py │ ├── BatchAllTripletLoss.py │ ├── BatchHardSoftMarginTripletLoss.py │ ├── BatchHardTripletLoss.py │ ├── BatchSemiHardTripletLoss.py │ ├── ContrastiveLoss.py │ ├── ContrastiveTensionLoss.py │ ├── CosineSimilarityLoss.py │ ├── DenoisingAutoEncoderLoss.py │ ├── MSELoss.py │ ├── MegaBatchMarginLoss.py │ ├── MultipleNegativesRankingLoss.py │ ├── OnlineContrastiveLoss.py │ ├── SoftmaxLoss.py │ ├── TripletLoss.py │ ├── __init__.py │ └── __pycache__ │ │ ├── BYOLoss.cpython-36.pyc │ │ ├── BYOLoss.cpython-38.pyc │ │ ├── BatchAllTripletLoss.cpython-36.pyc │ │ ├── BatchAllTripletLoss.cpython-38.pyc │ │ ├── BatchHardSoftMarginTripletLoss.cpython-36.pyc │ │ ├── BatchHardSoftMarginTripletLoss.cpython-38.pyc │ │ ├── BatchHardTripletLoss.cpython-36.pyc │ │ ├── BatchHardTripletLoss.cpython-38.pyc │ │ ├── BatchSemiHardTripletLoss.cpython-36.pyc │ │ ├── BatchSemiHardTripletLoss.cpython-38.pyc │ │ ├── ContrastiveLoss.cpython-36.pyc │ │ ├── ContrastiveLoss.cpython-38.pyc │ │ ├── ContrastiveTensionLoss.cpython-36.pyc │ │ ├── ContrastiveTensionLoss.cpython-38.pyc │ │ ├── CosineSimilarityLoss.cpython-36.pyc │ │ ├── CosineSimilarityLoss.cpython-38.pyc │ │ ├── DenoisingAutoEncoderLoss.cpython-36.pyc │ │ ├── DenoisingAutoEncoderLoss.cpython-38.pyc │ │ ├── MSELoss.cpython-36.pyc │ │ ├── MSELoss.cpython-38.pyc │ │ ├── MegaBatchMarginLoss.cpython-36.pyc │ │ ├── MegaBatchMarginLoss.cpython-38.pyc │ │ ├── MultipleNegativesRankingLoss.cpython-36.pyc │ │ ├── MultipleNegativesRankingLoss.cpython-38.pyc │ │ ├── OnlineContrastiveLoss.cpython-36.pyc │ │ ├── OnlineContrastiveLoss.cpython-38.pyc │ │ ├── SoftmaxLoss.cpython-36.pyc │ │ ├── SoftmaxLoss.cpython-38.pyc │ │ ├── TripletLoss.cpython-36.pyc │ │ ├── TripletLoss.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── models │ ├── ALBERT.py │ ├── Asym.py │ ├── BERT.py │ ├── BoW.py │ ├── CLIPModel.py │ ├── CNN.py │ ├── CamemBERT.py │ ├── Dense.py │ ├── DistilBERT.py │ ├── LSTM.py │ ├── LayerNorm.py │ ├── Normalize.py │ ├── Pooling.py │ ├── RoBERTa.py │ ├── T5.py │ ├── Transformer.py │ ├── WKPooling.py │ ├── WeightedLayerPooling.py │ ├── WordEmbeddings.py │ ├── WordWeights.py │ ├── XLMRoBERTa.py │ ├── XLNet.py │ ├── __init__.py │ ├── __pycache__ │ │ ├── Asym.cpython-36.pyc │ │ ├── Asym.cpython-38.pyc │ │ ├── BERT.cpython-36.pyc │ │ ├── BERT.cpython-38.pyc │ │ ├── BoW.cpython-36.pyc │ │ ├── BoW.cpython-38.pyc │ │ ├── CNN.cpython-36.pyc │ │ ├── CNN.cpython-38.pyc │ │ ├── Dense.cpython-36.pyc │ │ ├── Dense.cpython-38.pyc │ │ ├── DistilBERT.cpython-36.pyc │ │ ├── LSTM.cpython-36.pyc │ │ ├── LSTM.cpython-38.pyc │ │ ├── LayerNorm.cpython-36.pyc │ │ ├── LayerNorm.cpython-38.pyc │ │ ├── Normalize.cpython-36.pyc │ │ ├── Normalize.cpython-38.pyc │ │ ├── Pooling.cpython-36.pyc │ │ ├── Pooling.cpython-38.pyc │ │ ├── Transformer.cpython-36.pyc │ │ ├── Transformer.cpython-38.pyc │ │ ├── WKPooling.cpython-36.pyc │ │ ├── WKPooling.cpython-38.pyc │ │ ├── WeightedLayerPooling.cpython-36.pyc │ │ ├── WeightedLayerPooling.cpython-38.pyc │ │ ├── WordEmbeddings.cpython-36.pyc │ │ ├── WordEmbeddings.cpython-38.pyc │ │ ├── WordWeights.cpython-36.pyc │ │ ├── WordWeights.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc │ └── tokenizer │ │ ├── PhraseTokenizer.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ ├── __init__.py │ │ └── __pycache__ │ │ ├── WhitespaceTokenizer.cpython-36.pyc │ │ ├── WhitespaceTokenizer.cpython-38.pyc │ │ ├── WordTokenizer.cpython-36.pyc │ │ ├── WordTokenizer.cpython-38.pyc │ │ ├── 
__init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc ├── readers │ ├── InputExample.py │ ├── LabelSentenceReader.py │ ├── NLIDataReader.py │ ├── PairedFilesReader.py │ ├── STSDataReader.py │ ├── TripletReader.py │ ├── __init__.py │ └── __pycache__ │ │ ├── InputExample.cpython-36.pyc │ │ ├── InputExample.cpython-38.pyc │ │ ├── LabelSentenceReader.cpython-36.pyc │ │ ├── LabelSentenceReader.cpython-38.pyc │ │ ├── NLIDataReader.cpython-36.pyc │ │ ├── NLIDataReader.cpython-38.pyc │ │ ├── STSDataReader.cpython-36.pyc │ │ ├── STSDataReader.cpython-38.pyc │ │ ├── TripletReader.cpython-36.pyc │ │ ├── TripletReader.cpython-38.pyc │ │ ├── __init__.cpython-36.pyc │ │ └── __init__.cpython-38.pyc └── util.py ├── setup.cfg ├── setup.py └── training ├── .DS_Store ├── data ├── .DS_Store └── back_translated_nli.txt.zip ├── multilingual_tuning.py ├── supervised_tuning.py └── unsupervised_tuning.py /.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/.DS_Store -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Install first 2 | 3 | ```` 4 | pip3 install -e . 5 | ```` 6 | 7 | ## Training 8 | 9 | ```` 10 | python3 training/unsupervised_tuning.py 11 | python3 training/supervised_tuning.py 12 | python3 training/multilingual_tuning.py 13 | ```` 14 | 15 | The multilingual NLI corpus can be downloaded from here (https://drive.google.com/file/d/19O2NArJz_RlVNNGRbBnnWxNMW-7HaFZ8/view?usp=sharing) 16 | 17 | ## pretrained Model 18 | Our pretrained model can be downloaded from here (https://drive.google.com/drive/folders/1fURXl4fGTGJ55PQF_Gr4Wr8ds2Qwa7U5?usp=sharing) 19 | 20 | 21 | ## Acknowledgements 22 | 23 | Codes are adapted from the repos of the EMNLP19 paper [Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks](https://github.com/UKPLab/sentence-transformers) 24 | -------------------------------------------------------------------------------- /evaluation/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/evaluation/.DS_Store -------------------------------------------------------------------------------- /evaluation/evaluation_inference_speed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples measures the inference speed of a certain model 3 | 4 | Usage: 5 | python evaluation_inference_speed.py 6 | OR 7 | python evaluation_inference_speed.py model_name 8 | """ 9 | from sentence_transformers import SentenceTransformer, util 10 | import sys 11 | import os 12 | import time 13 | import torch 14 | import gzip 15 | import csv 16 | 17 | #Limit torch to 4 threads 18 | torch.set_num_threads(4) 19 | 20 | 21 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens' 22 | 23 | # Load a named sentence model (based on BERT). This will download the model from our server. 
24 | # Alternatively, you can also pass a filepath to SentenceTransformer() 25 | model = SentenceTransformer(model_name) 26 | 27 | 28 | nli_dataset_path = 'datasets/AllNLI.tsv.gz' 29 | sentences = set() 30 | max_sentences = 100000 31 | 32 | 33 | #Download datasets if needed 34 | if not os.path.exists(nli_dataset_path): 35 | util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path) 36 | 37 | with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn: 38 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 39 | for row in reader: 40 | sentences.add(row['sentence1']) 41 | if len(sentences) >= max_sentences: 42 | break 43 | 44 | sentences = list(sentences) 45 | print("Model Name:", model_name) 46 | print("Number of sentences:", len(sentences)) 47 | 48 | for i in range(3): 49 | print("Run", i) 50 | start_time = time.time() 51 | emb = model.encode(sentences, num_workers=2, batch_size=32) 52 | end_time = time.time() 53 | diff_time = end_time - start_time 54 | print("Done after {:.2f} seconds".format(diff_time)) 55 | print("Speed: {:.2f} sentences / second".format(len(sentences) / diff_time)) 56 | print("=====") -------------------------------------------------------------------------------- /evaluation/evaluation_multilingual.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, evaluation 3 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator 4 | from sentence_transformers.readers import STSBenchmarkDataReader 5 | import logging 6 | import sys 7 | import os 8 | import torch 9 | import numpy as np 10 | import zipfile 11 | import io 12 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 13 | 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | model_name = 'stsb-xlm-r-multilingual' 23 | # Load a named sentence model (based on BERT). This will download the model from our server. 
24 | # Alternatively, you can also pass a filepath to SentenceTransformer() 25 | model = SentenceTransformer(model_name) 26 | 27 | 28 | # Read the dataset 29 | source_languages = ['en'] 30 | target_languages = ['en', 'de', 'es', 'fr', 'ar', 'tr'] 31 | sts_corpus = "../training/data/STS2017-extended.zip" 32 | 33 | logging.info("Read STS test dataset") 34 | ##### Read cross-lingual Semantic Textual Similarity (STS) data #### 35 | all_languages = list(set(list(source_languages)+list(target_languages))) 36 | sts_data = {} 37 | evaluators = [] 38 | #Open the ZIP File of STS2017-extended.zip and check for which language combinations we have STS data 39 | with zipfile.ZipFile(sts_corpus) as zip: 40 | filelist = zip.namelist() 41 | for i in range(len(all_languages)): 42 | for j in range(i, len(all_languages)): 43 | lang1 = all_languages[i] 44 | lang2 = all_languages[j] 45 | filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2) 46 | if filepath not in filelist: 47 | lang1, lang2 = lang2, lang1 48 | filepath = 'STS2017-extended/STS.{}-{}.txt'.format(lang1, lang2) 49 | 50 | if filepath in filelist: 51 | filename = os.path.basename(filepath) 52 | sts_data[filename] = {'sentences1': [], 'sentences2': [], 'scores': []} 53 | 54 | fIn = zip.open(filepath) 55 | for line in io.TextIOWrapper(fIn, 'utf8'): 56 | sent1, sent2, score = line.strip().split("\t") 57 | score = float(score) 58 | sts_data[filename]['sentences1'].append(sent1) 59 | sts_data[filename]['sentences2'].append(sent2) 60 | sts_data[filename]['scores'].append(score) 61 | 62 | # model = SentenceTransformer(model_save_path) 63 | for filename, data in sts_data.items(): 64 | test_evaluator = EmbeddingSimilarityEvaluator(data['sentences1'], data['sentences2'], data['scores'], batch_size=16, name=filename, show_progress_bar=False) 65 | test_evaluator(model) 66 | 67 | 68 | 69 | -------------------------------------------------------------------------------- /evaluation/evaluation_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples loads a pre-trained model and evaluates it on the STSbenchmark dataset 3 | 4 | Usage: 5 | python evaluation_stsbenchmark.py 6 | OR 7 | python evaluation_stsbenchmark.py model_name 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, evaluation 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator, SequentialEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import sys 15 | import os 16 | import torch 17 | import numpy as np 18 | 19 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 20 | 21 | #Limit torch to 4 threads 22 | torch.set_num_threads(4) 23 | 24 | #### Just some code to print debug information to stdout 25 | logging.basicConfig(format='%(asctime)s - %(message)s', 26 | datefmt='%Y-%m-%d %H:%M:%S', 27 | level=logging.INFO, 28 | handlers=[LoggingHandler()]) 29 | #### /print debug information to stdout 30 | model_name = '../training/output/BSL_tuning-bert-base-nli-mean-tokens-64-2021-08-31_21-19-14' 31 | # Load a named sentence model (based on BERT). This will download the model from our server. 
32 | # Alternatively, you can also pass a filepath to SentenceTransformer() 33 | model = SentenceTransformer(model_name) 34 | 35 | sts_corpus = "stsbenchmark/" 36 | target_eval_files = set(['sts','sts12', 'sts13', 'sts14', 'sts15', 'sts16', 'sick-r']) 37 | 38 | evaluators = [] #evaluators is a list of the different evaluator classes we call periodically 39 | sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, sts_corpus)) 40 | for target in target_eval_files: 41 | output_filename_eval = os.path.join(script_folder_path,sts_corpus + target + "-test.csv") 42 | evaluators.append(EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples(output_filename_eval), name=target)) 43 | 44 | evaluator = SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)) 45 | model.evaluate(evaluator) 46 | -------------------------------------------------------------------------------- /evaluation/evaluation_stsbenchmark_sbert-wk.py: -------------------------------------------------------------------------------- 1 | """ 2 | Performs the pooling described in the paper: 3 | SBERT-WK: A Sentence Embedding Method by Dissecting BERT-based Word Models, 2020, https://arxiv.org/abs/2002.06652 4 | 5 | Note: WKPooling improves the performance only for certain models. Further, WKPooling requires QR-decomposition, 6 | for which there is so far no efficient implementation in PyTorch for GPUs (see https://github.com/pytorch/pytorch/issues/22573). 7 | Hence, WKPooling runs on the CPU, which makes it rather inefficient. 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, LoggingHandler, models 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import torch 15 | 16 | #Limit torch to 4 threads, as this example runs on the CPU 17 | torch.set_num_threads(4) 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | 27 | #1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true 28 | word_embedding_model = models.Transformer('bert-base-uncased', model_args={'output_hidden_states': True}) 29 | 30 | #2) Add WKPooling 31 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 32 | 33 | #3) Create a sentence transformer model to glue both models together 34 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 35 | 36 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 37 | evaluator = EmbeddingSimilarityEvaluator.from_input_examples(sts_reader.get_examples("sts-test.csv")) 38 | 39 | model.evaluate(evaluator) 40 | -------------------------------------------------------------------------------- /evaluation/evaluation_translation_matching.py: -------------------------------------------------------------------------------- 1 | """ 2 | Given a tab-separated file (.tsv) with parallel sentences, where the second column is the translation of the sentence in the first column, for example, in the format: 3 | src1 trg1 4 | src2 trg2 5 | ... 6 | 7 | where trg_i is the translation of src_i. 
8 | 9 | Given src_i, the TranslationEvaluator checks which trg_j has the highest similarity using cosine similarity. If i == j, we assume 10 | a match, i.e., the correct translation has been found for src_i out of all possible target sentences. 11 | 12 | It then computes an accuracy over all possible source sentences src_i. Equivalently, it computes also the accuracy for the other direction. 13 | 14 | A high accuracy score indicates that the model is able to find the correct translation out of a large pool with sentences. 15 | 16 | Usage: 17 | python [model_name_or_path] [parallel-file1] [parallel-file2] ... 18 | 19 | For example: 20 | python distiluse-base-multilingual-cased TED2020-en-de.tsv.gz 21 | 22 | See the training_multilingual/get_parallel_data_...py scripts for getting parallel sentence data from different sources 23 | """ 24 | 25 | from sentence_transformers import SentenceTransformer, evaluation, LoggingHandler 26 | import sys 27 | import gzip 28 | import os 29 | import logging 30 | 31 | 32 | logging.basicConfig(format='%(asctime)s - %(message)s', 33 | datefmt='%Y-%m-%d %H:%M:%S', 34 | level=logging.INFO, 35 | handlers=[LoggingHandler()]) 36 | 37 | logger = logging.getLogger(__name__) 38 | 39 | model_name = sys.argv[1] 40 | filepaths = sys.argv[2:] 41 | inference_batch_size = 32 42 | 43 | model = SentenceTransformer(model_name) 44 | 45 | 46 | for filepath in filepaths: 47 | src_sentences = [] 48 | trg_sentences = [] 49 | with gzip.open(filepath, 'rt', encoding='utf8') if filepath.endswith('.gz') else open(filepath, 'r', encoding='utf8') as fIn: 50 | for line in fIn: 51 | splits = line.strip().split('\t') 52 | if len(splits) >= 2: 53 | src_sentences.append(splits[0]) 54 | trg_sentences.append(splits[1]) 55 | 56 | logger.info(os.path.basename(filepath)+": "+str(len(src_sentences))+" sentence pairs") 57 | dev_trans_acc = evaluation.TranslationEvaluator(src_sentences, trg_sentences, name=os.path.basename(filepath), batch_size=inference_batch_size) 58 | dev_trans_acc(model) 59 | 60 | 61 | -------------------------------------------------------------------------------- /evaluation/stsbenchmark.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/evaluation/stsbenchmark.zip -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=3.1.0,<5.0.0 2 | tqdm 3 | torch>=1.6.0 4 | numpy 5 | scikit-learn 6 | scipy 7 | nltk 8 | sentencepiece 9 | -------------------------------------------------------------------------------- /sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) 17 | 18 | 19 | def install_logger( 20 | given_logger, level = logging.WARNING, fmt="%(levelname)s:%(name)s:%(message)s" 21 | ): 22 | """ Configures the given logger; format, logging level, style, etc """ 23 | import coloredlogs 24 | 25 | def add_notice_log_level(): 26 | """ Creates a new 'notice' logging level """ 27 | # inspired 
by: 28 | # https://stackoverflow.com/questions/2183233/how-to-add-a-custom-loglevel-to-pythons-logging-facility 29 | NOTICE_LEVEL_NUM = 25 30 | logging.addLevelName(NOTICE_LEVEL_NUM, "NOTICE") 31 | 32 | def notice(self, message, *args, **kws): 33 | if self.isEnabledFor(NOTICE_LEVEL_NUM): 34 | self._log(NOTICE_LEVEL_NUM, message, args, **kws) 35 | 36 | logging.Logger.notice = notice 37 | 38 | # Add an extra logging level above INFO and below WARNING 39 | add_notice_log_level() 40 | 41 | # More style info at: 42 | # https://coloredlogs.readthedocs.io/en/latest/api.html 43 | field_styles = coloredlogs.DEFAULT_FIELD_STYLES.copy() 44 | field_styles["asctime"] = {} 45 | level_styles = coloredlogs.DEFAULT_LEVEL_STYLES.copy() 46 | level_styles["debug"] = {"color": "white", "faint": True} 47 | level_styles["notice"] = {"color": "cyan", "bold": True} 48 | 49 | coloredlogs.install( 50 | logger=given_logger, 51 | level=level, 52 | use_chroot=False, 53 | fmt=fmt, 54 | level_styles=level_styles, 55 | field_styles=field_styles, 56 | ) 57 | -------------------------------------------------------------------------------- /sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "1.2.0" 2 | __DOWNLOAD_SERVER__ = 'http://sbert.net/models/' 3 | from .datasets import SentencesDataset, ParallelSentencesDataset 4 | from .LoggingHandler import LoggingHandler 5 | from .SentenceTransformer import SentenceTransformer 6 | from .readers import InputExample 7 | from .cross_encoder.CrossEncoder import CrossEncoder 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/LoggingHandler.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/LoggingHandler.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/LoggingHandler.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/LoggingHandler.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/SentenceTransformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/SentenceTransformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/SentenceTransformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/SentenceTransformer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/__init__.cpython-36.pyc 
-------------------------------------------------------------------------------- /sentence_transformers/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/util.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/util.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/__pycache__/util.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/__pycache__/util.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .CrossEncoder import CrossEncoder -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/CrossEncoder.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/cross_encoder/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CEBinaryAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | from typing import List 5 | from ... import InputExample 6 | import numpy as np 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CEBinaryAccuracyEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. 
14 | 15 | It is designed for CrossEncoders with a single output. It measures the 16 | accuracy of the predicted class vs. the gold labels. It uses a fixed threshold to determine the label (0 vs 1). 17 | 18 | See CEBinaryClassificationEvaluator for an evaluator that automatically determines the optimal threshold. 19 | """ 20 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str='', threshold: float = 0.5, write_csv: bool = True): 21 | self.sentence_pairs = sentence_pairs 22 | self.labels = labels 23 | self.name = name 24 | self.threshold = threshold 25 | 26 | self.csv_file = "CEBinaryAccuracyEvaluator" + ("_" + name if name else '') + "_results.csv" 27 | self.csv_headers = ["epoch", "steps", "Accuracy"] 28 | self.write_csv = write_csv 29 | 30 | @classmethod 31 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 32 | sentence_pairs = [] 33 | labels = [] 34 | 35 | for example in examples: 36 | sentence_pairs.append(example.texts) 37 | labels.append(example.label) 38 | return cls(sentence_pairs, labels, **kwargs) 39 | 40 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | logger.info("CEBinaryAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 50 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 51 | pred_labels = pred_scores > self.threshold 52 | 53 | assert len(pred_labels) == len(self.labels) 54 | 55 | acc = np.sum(pred_labels == self.labels) / len(self.labels) 56 | 57 | logger.info("Accuracy: {:.2f}".format(acc*100)) 58 | 59 | if output_path is not None and self.write_csv: 60 | csv_path = os.path.join(output_path, self.csv_file) 61 | output_file_exists = os.path.isfile(csv_path) 62 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 63 | writer = csv.writer(f) 64 | if not output_file_exists: 65 | writer.writerow(self.csv_headers) 66 | 67 | writer.writerow([epoch, steps, acc]) 68 | 69 | return acc 70 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CEBinaryClassificationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from sklearn.metrics import average_precision_score 3 | from typing import List 4 | import numpy as np 5 | import os 6 | import csv 7 | 8 | from ... import InputExample 9 | from ...evaluation import BinaryClassificationEvaluator 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class CEBinaryClassificationEvaluator: 15 | """ 16 | This evaluator can be used with the CrossEncoder class. 
Given sentence pairs and binary labels (0 and 1), 17 | it compute the average precision and the best possible f1 score 18 | """ 19 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str='', write_csv: bool = True): 20 | assert len(sentence_pairs) == len(labels) 21 | for label in labels: 22 | assert (label == 0 or label == 1) 23 | 24 | self.sentence_pairs = sentence_pairs 25 | self.labels = np.asarray(labels) 26 | self.name = name 27 | 28 | self.csv_file = "CEBinaryClassificationEvaluator" + ("_" + name if name else '') + "_results.csv" 29 | self.csv_headers = ["epoch", "steps", "Accuracy", "Accuracy_Threshold", "F1", "F1_Threshold", "Precision", "Recall", "Average_Precision"] 30 | self.write_csv = write_csv 31 | 32 | @classmethod 33 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 34 | sentence_pairs = [] 35 | labels = [] 36 | 37 | for example in examples: 38 | sentence_pairs.append(example.texts) 39 | labels.append(example.label) 40 | return cls(sentence_pairs, labels, **kwargs) 41 | 42 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logger.info("CEBinaryClassificationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 52 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 53 | 54 | acc, acc_threshold = BinaryClassificationEvaluator.find_best_acc_and_threshold(pred_scores, self.labels, True) 55 | f1, precision, recall, f1_threshold = BinaryClassificationEvaluator.find_best_f1_and_threshold(pred_scores, self.labels, True) 56 | ap = average_precision_score(self.labels, pred_scores) 57 | 58 | logger.info("Accuracy: {:.2f}\t(Threshold: {:.4f})".format(acc * 100, acc_threshold)) 59 | logger.info("F1: {:.2f}\t(Threshold: {:.4f})".format(f1 * 100, f1_threshold)) 60 | logger.info("Precision: {:.2f}".format(precision * 100)) 61 | logger.info("Recall: {:.2f}".format(recall * 100)) 62 | logger.info("Average Precision: {:.2f}\n".format(ap * 100)) 63 | 64 | if output_path is not None and self.write_csv: 65 | csv_path = os.path.join(output_path, self.csv_file) 66 | output_file_exists = os.path.isfile(csv_path) 67 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 68 | writer = csv.writer(f) 69 | if not output_file_exists: 70 | writer.writerow(self.csv_headers) 71 | 72 | writer.writerow([epoch, steps, acc, acc_threshold, f1, f1_threshold, precision, recall, ap]) 73 | 74 | 75 | return ap 76 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CECorrelationEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from scipy.stats import pearsonr, spearmanr 3 | from typing import List 4 | import os 5 | import csv 6 | from ... import InputExample 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CECorrelationEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. Given sentence pairs and continuous scores, 14 | it compute the pearson & spearman correlation between the predicted score for the sentence pair 15 | and the gold score. 
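Example (a minimal, hypothetical sketch; the base model name and the sentence pairs below are illustrative assumptions, not data from this repository)::

    from sentence_transformers import InputExample
    from sentence_transformers.cross_encoder import CrossEncoder
    from sentence_transformers.cross_encoder.evaluation import CECorrelationEvaluator

    dev_examples = [
        InputExample(texts=["A man is eating food.", "A man eats something."], label=0.9),
        InputExample(texts=["A man is eating food.", "A plane is landing."], label=0.1),
    ]
    evaluator = CECorrelationEvaluator.from_input_examples(dev_examples, name="sts-dev")
    model = CrossEncoder("bert-base-uncased", num_labels=1)  # assumed base checkpoint
    spearman = evaluator(model)  # returns the Spearman correlation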
16 | """ 17 | def __init__(self, sentence_pairs: List[List[str]], scores: List[float], name: str='', write_csv: bool = True): 18 | self.sentence_pairs = sentence_pairs 19 | self.scores = scores 20 | self.name = name 21 | 22 | self.csv_file = "CECorrelationEvaluator" + ("_" + name if name else '') + "_results.csv" 23 | self.csv_headers = ["epoch", "steps", "Pearson_Correlation", "Spearman_Correlation"] 24 | self.write_csv = write_csv 25 | 26 | @classmethod 27 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 28 | sentence_pairs = [] 29 | scores = [] 30 | 31 | for example in examples: 32 | sentence_pairs.append(example.texts) 33 | scores.append(example.label) 34 | return cls(sentence_pairs, scores, **kwargs) 35 | 36 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 37 | if epoch != -1: 38 | if steps == -1: 39 | out_txt = " after epoch {}:".format(epoch) 40 | else: 41 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 42 | else: 43 | out_txt = ":" 44 | 45 | logger.info("CECorrelationEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 46 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 47 | 48 | 49 | eval_pearson, _ = pearsonr(self.scores, pred_scores) 50 | eval_spearman, _ = spearmanr(self.scores, pred_scores) 51 | 52 | logger.info("Correlation:\tPearson: {:.4f}\tSpearman: {:.4f}".format(eval_pearson, eval_spearman)) 53 | 54 | if output_path is not None and self.write_csv: 55 | csv_path = os.path.join(output_path, self.csv_file) 56 | output_file_exists = os.path.isfile(csv_path) 57 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 58 | writer = csv.writer(f) 59 | if not output_file_exists: 60 | writer.writerow(self.csv_headers) 61 | 62 | writer.writerow([epoch, steps, eval_pearson, eval_spearman]) 63 | 64 | return eval_spearman 65 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CERerankingEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import numpy as np 3 | import os 4 | import csv 5 | 6 | logger = logging.getLogger(__name__) 7 | 8 | class CERerankingEvaluator: 9 | """ 10 | This class evaluates a CrossEncoder model for the task of re-ranking. 11 | 12 | Given a query and a list of documents, it computes the score [query, doc_i] for all possible 13 | documents and sorts them in decreasing order. Then, MRR@10 is compute to measure the quality of the ranking. 14 | 15 | :param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query, 16 | positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents. 
17 | """ 18 | def __init__(self, samples, mrr_at_k: int = 10, name: str = '', write_csv: bool = True): 19 | self.samples = samples 20 | self.name = name 21 | self.mrr_at_k = mrr_at_k 22 | 23 | if isinstance(self.samples, dict): 24 | self.samples = list(self.samples.values()) 25 | 26 | self.csv_file = "CERerankingEvaluator" + ("_" + name if name else '') + "_results.csv" 27 | self.csv_headers = ["epoch", "steps", "MRR@{}".format(mrr_at_k)] 28 | self.write_csv = write_csv 29 | 30 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 31 | if epoch != -1: 32 | if steps == -1: 33 | out_txt = " after epoch {}:".format(epoch) 34 | else: 35 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 36 | else: 37 | out_txt = ":" 38 | 39 | logger.info("CERerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 40 | 41 | all_mrr_scores = [] 42 | num_queries = 0 43 | num_positives = [] 44 | num_negatives = [] 45 | for instance in self.samples: 46 | query = instance['query'] 47 | positive = list(instance['positive']) 48 | negative = list(instance['negative']) 49 | docs = positive + negative 50 | is_relevant = [True]*len(positive) + [False]*len(negative) 51 | 52 | if len(positive) == 0 or len(negative) == 0: 53 | continue 54 | 55 | num_queries += 1 56 | num_positives.append(len(positive)) 57 | num_negatives.append(len(negative)) 58 | 59 | model_input = [[query, doc] for doc in docs] 60 | pred_scores = model.predict(model_input, convert_to_numpy=True, show_progress_bar=False) 61 | pred_scores_argsort = np.argsort(-pred_scores) #Sort in decreasing order 62 | 63 | mrr_score = 0 64 | for rank, index in enumerate(pred_scores_argsort[0:self.mrr_at_k]): 65 | if is_relevant[index]: 66 | mrr_score = 1 / (rank+1) 67 | break 68 | 69 | all_mrr_scores.append(mrr_score) 70 | 71 | mean_mrr = np.mean(all_mrr_scores) 72 | logger.info("Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(num_queries, np.min(num_positives), np.mean(num_positives), np.max(num_positives), np.min(num_negatives), np.mean(num_negatives), np.max(num_negatives))) 73 | logger.info("MRR@{}: {:.2f}".format(self.mrr_at_k, mean_mrr*100)) 74 | 75 | if output_path is not None and self.write_csv: 76 | csv_path = os.path.join(output_path, self.csv_file) 77 | output_file_exists = os.path.isfile(csv_path) 78 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 79 | writer = csv.writer(f) 80 | if not output_file_exists: 81 | writer.writerow(self.csv_headers) 82 | 83 | writer.writerow([epoch, steps, mean_mrr]) 84 | 85 | return mean_mrr -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/CESoftmaxAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import csv 4 | from typing import List 5 | from ... import InputExample 6 | import numpy as np 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CESoftmaxAccuracyEvaluator: 12 | """ 13 | This evaluator can be used with the CrossEncoder class. 14 | 15 | It is designed for CrossEncoders with 2 or more outputs. It measure the 16 | accuracy of the predict class vs. the gold labels. 
17 | """ 18 | def __init__(self, sentence_pairs: List[List[str]], labels: List[int], name: str='', write_csv: bool = True): 19 | self.sentence_pairs = sentence_pairs 20 | self.labels = labels 21 | self.name = name 22 | 23 | self.csv_file = "CESoftmaxAccuracyEvaluator" + ("_" + name if name else '') + "_results.csv" 24 | self.csv_headers = ["epoch", "steps", "Accuracy"] 25 | self.write_csv = write_csv 26 | 27 | @classmethod 28 | def from_input_examples(cls, examples: List[InputExample], **kwargs): 29 | sentence_pairs = [] 30 | labels = [] 31 | 32 | for example in examples: 33 | sentence_pairs.append(example.texts) 34 | labels.append(example.label) 35 | return cls(sentence_pairs, labels, **kwargs) 36 | 37 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 38 | if epoch != -1: 39 | if steps == -1: 40 | out_txt = " after epoch {}:".format(epoch) 41 | else: 42 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 43 | else: 44 | out_txt = ":" 45 | 46 | logger.info("CESoftmaxAccuracyEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 47 | pred_scores = model.predict(self.sentence_pairs, convert_to_numpy=True, show_progress_bar=False) 48 | pred_labels = np.argmax(pred_scores, axis=1) 49 | 50 | assert len(pred_labels) == len(self.labels) 51 | 52 | acc = np.sum(pred_labels == self.labels) / len(self.labels) 53 | 54 | logger.info("Accuracy: {:.2f}".format(acc*100)) 55 | 56 | if output_path is not None and self.write_csv: 57 | csv_path = os.path.join(output_path, self.csv_file) 58 | output_file_exists = os.path.isfile(csv_path) 59 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 60 | writer = csv.writer(f) 61 | if not output_file_exists: 62 | writer.writerow(self.csv_headers) 63 | 64 | writer.writerow([epoch, steps, acc]) 65 | 66 | return acc 67 | -------------------------------------------------------------------------------- /sentence_transformers/cross_encoder/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .CEBinaryAccuracyEvaluator import CEBinaryAccuracyEvaluator 2 | from .CEBinaryClassificationEvaluator import CEBinaryClassificationEvaluator 3 | from .CECorrelationEvaluator import CECorrelationEvaluator 4 | from .CESoftmaxAccuracyEvaluator import CESoftmaxAccuracyEvaluator 5 | from .CERerankingEvaluator import CERerankingEvaluator 6 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/DenoisingAutoEncoderDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | from ..readers.InputExample import InputExample 4 | import numpy as np 5 | import nltk 6 | from nltk.tokenize.treebank import TreebankWordDetokenizer 7 | 8 | class DenoisingAutoEncoderDataset(Dataset): 9 | """ 10 | The DenoisingAutoEncoderDataset returns InputExamples in the format: texts=[noise_fn(sentence), sentence] 11 | It is used in combination with the DenoisingAutoEncoderLoss: Here, a decoder tries to re-construct the 12 | sentence without noise. 13 | 14 | :param sentences: A list of sentences 15 | :param noise_fn: A noise function: Given a string, it returns a string with noise, e.g. 
deleted words 16 | """ 17 | def __init__(self, sentences: List[str], noise_fn=lambda s: DenoisingAutoEncoderDataset.delete(s)): 18 | self.sentences = sentences 19 | self.noise_fn = noise_fn 20 | 21 | 22 | def __getitem__(self, item): 23 | sent = self.sentences[item] 24 | return InputExample(texts=[self.noise_fn(sent), sent]) 25 | 26 | 27 | def __len__(self): 28 | return len(self.sentences) 29 | 30 | # Deletion noise. 31 | @staticmethod 32 | def delete(text, del_ratio=0.6): 33 | words = nltk.word_tokenize(text) 34 | n = len(words) 35 | if n == 0: 36 | return text 37 | 38 | keep_or_not = np.random.rand(n) > del_ratio 39 | if sum(keep_or_not) == 0: 40 | keep_or_not[np.random.choice(n)] = True # guarantee that at least one word remains 41 | words_processed = TreebankWordDetokenizer().detokenize(np.array(words)[keep_or_not]) 42 | return words_processed -------------------------------------------------------------------------------- /sentence_transformers/datasets/NoDuplicatesDataLoader.py: -------------------------------------------------------------------------------- 1 | import random 2 | import math 3 | 4 | class NoDuplicatesDataLoader: 5 | 6 | def __init__(self, train_examples, batch_size): 7 | """ 8 | A special data loader to be used with MultipleNegativesRankingLoss. 9 | The data loader ensures that there are no duplicate sentences within the same batch 10 | """ 11 | self.batch_size = batch_size 12 | self.data_pointer = 0 13 | self.collate_fn = None 14 | self.train_examples = train_examples 15 | random.shuffle(self.train_examples) 16 | 17 | def __iter__(self): 18 | for _ in range(self.__len__()): 19 | batch = [] 20 | texts_in_batch = set() 21 | 22 | while len(batch) < self.batch_size: 23 | example = self.train_examples[self.data_pointer] 24 | 25 | valid_example = True 26 | for text in example.texts: 27 | if text.strip().lower() in texts_in_batch: 28 | valid_example = False 29 | break 30 | 31 | if valid_example: 32 | batch.append(example) 33 | for text in example.texts: 34 | texts_in_batch.add(text.strip().lower()) 35 | 36 | self.data_pointer += 1 37 | if self.data_pointer >= len(self.train_examples): 38 | self.data_pointer = 0 39 | random.shuffle(self.train_examples) 40 | 41 | yield self.collate_fn(batch) if self.collate_fn is not None else batch 42 | 43 | def __len__(self): 44 | return math.floor(len(self.train_examples) / self.batch_size) -------------------------------------------------------------------------------- /sentence_transformers/datasets/SentenceLabelDataset.py: -------------------------------------------------------------------------------- 1 | """ 2 | 3 | """ 4 | from torch.utils.data import IterableDataset 5 | import numpy as np 6 | from typing import List 7 | from ..readers import InputExample 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class SentenceLabelDataset(IterableDataset): 13 | """ 14 | This dataset can be used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS which requires 15 | multiple examples with the same label in a batch. 16 | 17 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 18 | 19 | Labels with fewer than n unique samples are ignored. 20 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 21 | 22 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 23 | by the samples drawn per label. 
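Example (a minimal, hypothetical sketch; the sentences, labels and loss are illustrative, and `model` is assumed to be an existing SentenceTransformer)::

    from torch.utils.data import DataLoader
    from sentence_transformers import InputExample, losses

    examples = [InputExample(texts=['sentence {}'.format(i)], label=i % 4) for i in range(40)]
    dataset = SentenceLabelDataset(examples, samples_per_label=2)
    # batch_size should be a multiple of samples_per_label; model.fit() sets the collate_fn
    loader = DataLoader(dataset, batch_size=8, drop_last=True)
    loss = losses.BatchHardTripletLoss(model=model)
    model.fit(train_objectives=[(loader, loss)], epochs=1)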
24 | """ 25 | def __init__(self, examples: List[InputExample], samples_per_label: int = 2, with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 28 | 29 | :param examples: 30 | a list with InputExamples 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label. Batch size should be a multiple of samples_per_label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__() 39 | 40 | self.samples_per_label = samples_per_label 41 | 42 | #Group examples by label 43 | label2ex = {} 44 | for example in examples: 45 | if example.label not in label2ex: 46 | label2ex[example.label] = [] 47 | label2ex[example.label].append(example) 48 | 49 | #Include only labels with at least 2 examples 50 | self.grouped_inputs = [] 51 | self.groups_right_border = [] 52 | num_labels = 0 53 | 54 | for label, label_examples in label2ex.items(): 55 | if len(label_examples) >= self.samples_per_label: 56 | self.grouped_inputs.extend(label_examples) 57 | self.groups_right_border.append(len(self.grouped_inputs)) # At which position does this label group / bucket end? 58 | num_labels += 1 59 | 60 | self.label_range = np.arange(num_labels) 61 | self.with_replacement = with_replacement 62 | np.random.shuffle(self.label_range) 63 | 64 | logger.info("SentenceLabelDataset: {} examples, from which {} examples could be used (those labels appeared at least {} times). {} different labels found.".format(len(examples), len(self.grouped_inputs), self.samples_per_label, num_labels )) 65 | 66 | def __iter__(self): 67 | label_idx = 0 68 | count = 0 69 | already_seen = {} 70 | while count < len(self.grouped_inputs): 71 | label = self.label_range[label_idx] 72 | if label not in already_seen: 73 | already_seen[label] = set() 74 | 75 | left_border = 0 if label == 0 else self.groups_right_border[label-1] 76 | right_border = self.groups_right_border[label] 77 | 78 | if self.with_replacement: 79 | selection = np.arange(left_border, right_border) 80 | else: 81 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 82 | 83 | if len(selection) >= self.samples_per_label: 84 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 85 | count += 1 86 | already_seen[label].add(element_idx) 87 | yield self.grouped_inputs[element_idx] 88 | 89 | label_idx += 1 90 | if label_idx >= len(self.label_range): 91 | label_idx = 0 92 | already_seen = {} 93 | np.random.shuffle(self.label_range) 94 | 95 | def __len__(self): 96 | return len(self.grouped_inputs) -------------------------------------------------------------------------------- /sentence_transformers/datasets/SentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | import torch 4 | from .. import SentenceTransformer 5 | from ..readers.InputExample import InputExample 6 | 7 | class SentencesDataset(Dataset): 8 | """ 9 | DEPRECATED: This class is no longer used. Instead of wrapping your List of InputExamples in a SentencesDataset 10 | and then passing it to the DataLoader, you can pass the list of InputExamples directly to the dataset loader. 
11 | """ 12 | def __init__(self, 13 | examples: List[InputExample], 14 | model: SentenceTransformer 15 | ): 16 | self.examples = examples 17 | 18 | 19 | def __getitem__(self, item): 20 | return self.examples[item] 21 | 22 | 23 | def __len__(self): 24 | return len(self.examples) 25 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .DenoisingAutoEncoderDataset import DenoisingAutoEncoderDataset 2 | from .NoDuplicatesDataLoader import NoDuplicatesDataLoader 3 | from .ParallelSentencesDataset import ParallelSentencesDataset 4 | from .SentencesDataset import SentencesDataset 5 | from .SentenceLabelDataset import SentenceLabelDataset 6 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/DenoisingAutoEncoderDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/NoDuplicatesDataLoader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/ParallelSentencesDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-36.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentenceLabelDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/SentencesDataset.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/datasets/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/datasets/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from ..util import batch_to_device 6 | import os 7 | import csv 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class LabelAccuracyEvaluator(SentenceEvaluator): 13 | """ 14 | Evaluate a model based on its accuracy on a labeled dataset 15 | 16 | This requires a model with LossFunction.SOFTMAX 17 | 18 | The results are written in a CSV. If a CSV already exists, then values are appended. 
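Example (a minimal, hypothetical sketch; `model`, `train_loss` and `dev_examples` are assumed to already exist: a SentenceTransformer, a losses.SoftmaxLoss instance and a list of labeled InputExamples, respectively)::

    from torch.utils.data import DataLoader

    dev_dataloader = DataLoader(dev_examples, shuffle=False, batch_size=16)
    evaluator = LabelAccuracyEvaluator(dev_dataloader, name='nli-dev', softmax_model=train_loss)
    accuracy = evaluator(model)  # the collate_fn is set inside the call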
19 | """ 20 | 21 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None, write_csv: bool = True): 22 | """ 23 | Constructs an evaluator for the given dataset 24 | 25 | :param dataloader: 26 | the data for the evaluation 27 | """ 28 | self.dataloader = dataloader 29 | self.name = name 30 | self.softmax_model = softmax_model 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.write_csv = write_csv 36 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 37 | self.csv_headers = ["epoch", "steps", "accuracy"] 38 | 39 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 40 | model.eval() 41 | total = 0 42 | correct = 0 43 | 44 | if epoch != -1: 45 | if steps == -1: 46 | out_txt = " after epoch {}:".format(epoch) 47 | else: 48 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 49 | else: 50 | out_txt = ":" 51 | 52 | logger.info("Evaluation on the "+self.name+" dataset"+out_txt) 53 | self.dataloader.collate_fn = model.smart_batching_collate 54 | for step, batch in enumerate(self.dataloader): 55 | features, label_ids = batch 56 | for idx in range(len(features)): 57 | features[idx] = batch_to_device(features[idx], model.device) 58 | label_ids = label_ids.to(model.device) 59 | with torch.no_grad(): 60 | _, prediction = self.softmax_model(features, labels=None) 61 | 62 | total += prediction.size(0) 63 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 64 | accuracy = correct/total 65 | 66 | logger.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 67 | 68 | if output_path is not None and self.write_csv: 69 | csv_path = os.path.join(output_path, self.csv_file) 70 | if not os.path.isfile(csv_path): 71 | with open(csv_path, mode="w", encoding="utf-8") as f: 72 | writer = csv.writer(f) 73 | writer.writerow(self.csv_headers) 74 | writer.writerow([epoch, steps, accuracy]) 75 | else: 76 | with open(csv_path, mode="a", encoding="utf-8") as f: 77 | writer = csv.writer(f) 78 | writer.writerow([epoch, steps, accuracy]) 79 | 80 | return accuracy 81 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | import numpy as np 3 | import logging 4 | import os 5 | import csv 6 | from typing import List 7 | 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class MSEEvaluator(SentenceEvaluator): 12 | """ 13 | Computes the mean squared error (x100) between the computed sentence embedding 14 | and some target sentence embedding. 15 | 16 | The MSE is computed between ||teacher.encode(source_sentences) - student.encode(target_sentences)||. 17 | 18 | For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English 19 | and target_sentences are in a different language like German, Chinese, Spanish... 20 | 21 | :param source_sentences: Source sentences are embedded with the teacher model 22 | :param target_sentences: Target sentences are ambedding with the student model. 
23 | :param show_progress_bar: Show progress bar when computing embeddings 24 | :param batch_size: Batch size to compute sentence embeddings 25 | :param name: Name of the evaluator 26 | :param write_csv: Write results to CSV file 27 | """ 28 | def __init__(self, source_sentences: List[str], target_sentences: List[str], teacher_model = None, show_progress_bar: bool = False, batch_size: int = 32, name: str = '', write_csv: bool = True): 29 | self.source_embeddings = teacher_model.encode(source_sentences, show_progress_bar=show_progress_bar, batch_size=batch_size, convert_to_numpy=True) 30 | 31 | self.target_sentences = target_sentences 32 | self.show_progress_bar = show_progress_bar 33 | self.batch_size = batch_size 34 | self.name = name 35 | 36 | self.csv_file = "mse_evaluation_" + name + "_results.csv" 37 | self.csv_headers = ["epoch", "steps", "MSE"] 38 | self.write_csv = write_csv 39 | 40 | def __call__(self, model, output_path, epoch = -1, steps = -1): 41 | if epoch != -1: 42 | if steps == -1: 43 | out_txt = " after epoch {}:".format(epoch) 44 | else: 45 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 46 | else: 47 | out_txt = ":" 48 | 49 | target_embeddings = model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=True) 50 | 51 | mse = ((self.source_embeddings - target_embeddings)**2).mean() 52 | mse *= 100 53 | 54 | logger.info("MSE evaluation (lower = better) on "+self.name+" dataset"+out_txt) 55 | logger.info("MSE (*100):\t{:4f}".format(mse)) 56 | 57 | if output_path is not None and self.write_csv: 58 | csv_path = os.path.join(output_path, self.csv_file) 59 | output_file_exists = os.path.isfile(csv_path) 60 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 61 | writer = csv.writer(f) 62 | if not output_file_exists: 63 | writer.writerow(self.csv_headers) 64 | 65 | writer.writerow([epoch, steps, mse]) 66 | 67 | return -mse #Return negative score as SentenceTransformers maximizes the performance 68 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/MSEEvaluatorFromDataFrame.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | from sentence_transformers.util import batch_to_device 3 | from sentence_transformers import SentenceTransformer 4 | from typing import List, Tuple, Dict 5 | import torch 6 | import numpy as np 7 | import logging 8 | import os 9 | import csv 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class MSEEvaluatorFromDataFrame(SentenceEvaluator): 16 | """ 17 | Computes the mean squared error (x100) between the computed sentence embedding 18 | and some target sentence embedding. 19 | :param dataframe: 20 | It must have the following format. Rows contains different, parallel sentences. Columns are the respective language codes 21 | [{'en': 'My sentence', 'es': 'Sentence in Spanisch', 'fr': 'Sentence in French'...}, 22 | {'en': 'My second sentence', ....] 23 | :param combinations: 24 | Must be of the format [('en', 'es'), ('en', 'fr'), ...] 25 | First entry in a tuple is the source language. The sentence in the respective language will be fetched from the dataframe and passed to the teacher model. 26 | Second entry in a tuple the the target language. 
Sentence will be fetched from the dataframe and passed to the student model 27 | """ 28 | def __init__(self, dataframe: List[Dict[str, str]], teacher_model: SentenceTransformer, combinations: List[Tuple[str, str]], batch_size: int = 8, name='', write_csv: bool = True): 29 | 30 | self.combinations = combinations 31 | self.name = name 32 | self.batch_size = batch_size 33 | 34 | 35 | if name: 36 | name = "_"+name 37 | 38 | self.csv_file = "mse_evaluation" + name + "_results.csv" 39 | self.csv_headers = ["epoch", "steps"] 40 | self.write_csv = write_csv 41 | self.data = {} 42 | 43 | logger.info("Compute teacher embeddings") 44 | all_source_sentences = set() 45 | for src_lang, trg_lang in self.combinations: 46 | src_sentences = [] 47 | trg_sentences = [] 48 | 49 | for row in dataframe: 50 | if row[src_lang].strip() != "" and row[trg_lang].strip() != "": 51 | all_source_sentences.add(row[src_lang]) 52 | src_sentences.append(row[src_lang]) 53 | trg_sentences.append(row[trg_lang]) 54 | 55 | self.data[(src_lang, trg_lang)] = (src_sentences, trg_sentences) 56 | self.csv_headers.append("{}-{}".format(src_lang, trg_lang)) 57 | 58 | all_source_sentences = list(all_source_sentences) 59 | all_src_embeddings = teacher_model.encode(all_source_sentences, batch_size=self.batch_size) 60 | self.teacher_embeddings = {sent: emb for sent, emb in zip(all_source_sentences, all_src_embeddings)} 61 | 62 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1): 63 | model.eval() 64 | 65 | mse_scores = [] 66 | for src_lang, trg_lang in self.combinations: 67 | src_sentences, trg_sentences = self.data[(src_lang, trg_lang)] 68 | 69 | src_embeddings = np.asarray([self.teacher_embeddings[sent] for sent in src_sentences]) 70 | trg_embeddings = np.asarray(model.encode(trg_sentences, batch_size=self.batch_size)) 71 | 72 | mse = ((src_embeddings - trg_embeddings) ** 2).mean() 73 | mse *= 100 74 | mse_scores.append(mse) 75 | 76 | logger.info("MSE evaluation on {} dataset - {}-{}:".format(self.name, src_lang, trg_lang)) 77 | logger.info("MSE (*100):\t{:4f}".format(mse)) 78 | 79 | if output_path is not None and self.write_csv: 80 | csv_path = os.path.join(output_path, self.csv_file) 81 | output_file_exists = os.path.isfile(csv_path) 82 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 83 | writer = csv.writer(f) 84 | if not output_file_exists: 85 | writer.writerow(self.csv_headers) 86 | 87 | writer.writerow([epoch, steps]+mse_scores) 88 | 89 | return -np.mean(mse_scores) #Return negative score as SentenceTransformers maximizes the performance 90 | 91 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/RerankingEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | import numpy as np 4 | import os 5 | import csv 6 | from ..util import cos_sim, dot_score 7 | import torch 8 | from sklearn.metrics import average_precision_score 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class RerankingEvaluator(SentenceEvaluator): 13 | """ 14 | This class evaluates a SentenceTransformer model for the task of re-ranking. 15 | 16 | Given a query and a list of documents, it computes the score [query, doc_i] for all possible 17 | documents and sorts them in decreasing order. Then, MRR@10 and MAP is compute to measure the quality of the ranking. 
18 | 19 | :param samples: Must be a list and each element is of the form: {'query': '', 'positive': [], 'negative': []}. Query is the search query, 20 | positive is a list of positive (relevant) documents, negative is a list of negative (irrelevant) documents. 21 | """ 22 | def __init__(self, samples, mrr_at_k: int = 10, name: str = '', write_csv: bool = True, similarity_fct=cos_sim): 23 | self.samples = samples 24 | self.name = name 25 | self.mrr_at_k = mrr_at_k 26 | self.similarity_fct = cos_sim 27 | 28 | if isinstance(self.samples, dict): 29 | self.samples = list(self.samples.values()) 30 | 31 | 32 | self.csv_file = "RerankingEvaluator" + ("_" + name if name else '') + "_results.csv" 33 | self.csv_headers = ["epoch", "steps", "MAP", "MRR@{}".format(mrr_at_k)] 34 | self.write_csv = write_csv 35 | 36 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 37 | if epoch != -1: 38 | if steps == -1: 39 | out_txt = " after epoch {}:".format(epoch) 40 | else: 41 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 42 | else: 43 | out_txt = ":" 44 | 45 | logger.info("RerankingEvaluator: Evaluating the model on " + self.name + " dataset" + out_txt) 46 | 47 | all_mrr_scores = [] 48 | all_ap_scores = [] 49 | 50 | num_queries = 0 51 | num_positives = [] 52 | num_negatives = [] 53 | for instance in self.samples: 54 | query = instance['query'] 55 | positive = list(instance['positive']) 56 | negative = list(instance['negative']) 57 | docs = positive + negative 58 | is_relevant = [True]*len(positive) + [False]*len(negative) 59 | 60 | if len(positive) == 0 or len(negative) == 0: 61 | continue 62 | 63 | num_queries += 1 64 | num_positives.append(len(positive)) 65 | num_negatives.append(len(negative)) 66 | 67 | query_emb = model.encode(query, convert_to_tensor=True, show_progress_bar=False) 68 | docs_emb = model.encode(docs, convert_to_tensor=True, show_progress_bar=False) 69 | 70 | pred_scores = self.similarity_fct(query_emb, docs_emb) 71 | if len(pred_scores.shape) > 1: 72 | pred_scores = pred_scores[0] 73 | 74 | pred_scores_argsort = torch.argsort(-pred_scores) #Sort in decreasing order 75 | 76 | #Compute MRR score 77 | mrr_score = 0 78 | for rank, index in enumerate(pred_scores_argsort[0:self.mrr_at_k]): 79 | if is_relevant[index]: 80 | mrr_score = 1 / (rank+1) 81 | break 82 | all_mrr_scores.append(mrr_score) 83 | 84 | # Compute AP 85 | all_ap_scores.append(average_precision_score(is_relevant, pred_scores.cpu().tolist())) 86 | 87 | mean_ap = np.mean(all_ap_scores) 88 | mean_mrr = np.mean(all_mrr_scores) 89 | 90 | logger.info("Queries: {} \t Positives: Min {:.1f}, Mean {:.1f}, Max {:.1f} \t Negatives: Min {:.1f}, Mean {:.1f}, Max {:.1f}".format(num_queries, np.min(num_positives), np.mean(num_positives), np.max(num_positives), np.min(num_negatives), np.mean(num_negatives), np.max(num_negatives))) 91 | logger.info("MAP: {:.2f}".format(mean_ap * 100)) 92 | logger.info("MRR@{}: {:.2f}".format(self.mrr_at_k, mean_mrr*100)) 93 | 94 | if output_path is not None and self.write_csv: 95 | csv_path = os.path.join(output_path, self.csv_file) 96 | output_file_exists = os.path.isfile(csv_path) 97 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 98 | writer = csv.writer(f) 99 | if not output_file_exists: 100 | writer.writerow(self.csv_headers) 101 | 102 | writer.writerow([epoch, steps, mean_ap, mean_mrr]) 103 | 104 | return mean_ap -------------------------------------------------------------------------------- 
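A minimal usage sketch for the RerankingEvaluator above; the toy samples and the model checkpoint name are illustrative placeholders, not files or data shipped in this repository:

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import RerankingEvaluator

# Each sample holds a query plus lists of relevant and irrelevant documents,
# matching the format documented in the class docstring.
samples = [
    {
        "query": "capital of France",
        "positive": ["Paris is the capital of France."],
        "negative": ["Berlin is the capital of Germany.", "The Rhine is a river in Europe."],
    },
    {
        "query": "largest planet in the solar system",
        "positive": ["Jupiter is the largest planet in the solar system."],
        "negative": ["Mars is often called the red planet."],
    },
]

model = SentenceTransformer("distilbert-base-nli-mean-tokens")
evaluator = RerankingEvaluator(samples, mrr_at_k=10, name="toy-rerank")

# Returns mean AP; MAP and MRR@10 are also logged and appended to the CSV in output_path.
mean_ap = evaluator(model, output_path=".")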
/sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/TranslationEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import logging 3 | from ..util import pytorch_cos_sim 4 | import os 5 | import csv 6 | import numpy as np 7 | import scipy.spatial 8 | from typing import List 9 | import torch 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class TranslationEvaluator(SentenceEvaluator): 15 | """ 16 | Given two sets of sentences in different languages, e.g. (en_1, en_2, en_3...) and (fr_1, fr_2, fr_3, ...), 17 | and assuming that fr_i is the translation of en_i. 18 | Checks if vec(en_i) has the highest similarity to vec(fr_i). 
Computes the accurarcy in both directions 19 | """ 20 | def __init__(self, source_sentences: List[str], target_sentences: List[str], show_progress_bar: bool = False, batch_size: int = 16, name: str = '', print_wrong_matches: bool = False, write_csv: bool = True): 21 | """ 22 | Constructs an evaluator based for the dataset 23 | 24 | The labels need to indicate the similarity between the sentences. 25 | 26 | :param source_sentences: 27 | List of sentences in source language 28 | :param target_sentences: 29 | List of sentences in target language 30 | :param print_wrong_matches: 31 | Prints incorrect matches 32 | :param write_csv: 33 | Write results to CSV file 34 | """ 35 | self.source_sentences = source_sentences 36 | self.target_sentences = target_sentences 37 | self.name = name 38 | self.batch_size = batch_size 39 | self.show_progress_bar = show_progress_bar 40 | self.print_wrong_matches = print_wrong_matches 41 | 42 | assert len(self.source_sentences) == len(self.target_sentences) 43 | 44 | if name: 45 | name = "_"+name 46 | 47 | self.csv_file = "translation_evaluation"+name+"_results.csv" 48 | self.csv_headers = ["epoch", "steps", "src2trg", "trg2src"] 49 | self.write_csv = write_csv 50 | 51 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 52 | if epoch != -1: 53 | if steps == -1: 54 | out_txt = " after epoch {}:".format(epoch) 55 | else: 56 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 57 | else: 58 | out_txt = ":" 59 | 60 | logger.info("Evaluating translation matching Accuracy on "+self.name+" dataset"+out_txt) 61 | 62 | embeddings1 = torch.stack(model.encode(self.source_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 63 | embeddings2 = torch.stack(model.encode(self.target_sentences, show_progress_bar=self.show_progress_bar, batch_size=self.batch_size, convert_to_numpy=False)) 64 | 65 | 66 | cos_sims = pytorch_cos_sim(embeddings1, embeddings2).detach().cpu().numpy() 67 | 68 | correct_src2trg = 0 69 | correct_trg2src = 0 70 | 71 | for i in range(len(cos_sims)): 72 | max_idx = np.argmax(cos_sims[i]) 73 | 74 | if i == max_idx: 75 | correct_src2trg += 1 76 | elif self.print_wrong_matches: 77 | print("i:", i, "j:", max_idx, "INCORRECT" if i != max_idx else "CORRECT") 78 | print("Src:", self.source_sentences[i]) 79 | print("Trg:", self.target_sentences[max_idx]) 80 | print("Argmax score:", cos_sims[i][max_idx], "vs. 
correct score:", cos_sims[i][i]) 81 | 82 | results = zip(range(len(cos_sims[i])), cos_sims[i]) 83 | results = sorted(results, key=lambda x: x[1], reverse=True) 84 | for idx, score in results[0:5]: 85 | print("\t", idx, "(Score: %.4f)" % (score), self.target_sentences[idx]) 86 | 87 | 88 | 89 | cos_sims = cos_sims.T 90 | for i in range(len(cos_sims)): 91 | max_idx = np.argmax(cos_sims[i]) 92 | if i == max_idx: 93 | correct_trg2src += 1 94 | 95 | acc_src2trg = correct_src2trg / len(cos_sims) 96 | acc_trg2src = correct_trg2src / len(cos_sims) 97 | 98 | logger.info("Accuracy src2trg: {:.2f}".format(acc_src2trg*100)) 99 | logger.info("Accuracy trg2src: {:.2f}".format(acc_trg2src*100)) 100 | 101 | if output_path is not None and self.write_csv: 102 | csv_path = os.path.join(output_path, self.csv_file) 103 | output_file_exists = os.path.isfile(csv_path) 104 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 105 | writer = csv.writer(f) 106 | if not output_file_exists: 107 | writer.writerow(self.csv_headers) 108 | 109 | writer.writerow([epoch, steps, acc_src2trg, acc_trg2src]) 110 | 111 | return (acc_src2trg+acc_trg2src)/2 112 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | from .BinaryClassificationEvaluator import BinaryClassificationEvaluator 4 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 5 | from .InformationRetrievalEvaluator import InformationRetrievalEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .MSEEvaluator import MSEEvaluator 8 | from .MSEEvaluatorFromDataFrame import MSEEvaluatorFromDataFrame 9 | from .ParaphraseMiningEvaluator import ParaphraseMiningEvaluator 10 | from .SequentialEvaluator import SequentialEvaluator 11 | from .TranslationEvaluator import TranslationEvaluator 12 | from .TripletEvaluator import TripletEvaluator 13 | from .RerankingEvaluator import RerankingEvaluator -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/BinaryClassificationEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-36.pyc 
-------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/EmbeddingSimilarityEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/InformationRetrievalEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/LabelAccuracyEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/MSEEvaluatorFromDataFrame.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/ParaphraseMiningEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/RerankingEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SentenceEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SequentialEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/SimilarityFunction.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TranslationEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/TripletEvaluator.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/evaluation/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- 
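Before moving on to the loss functions, a short sketch of how the evaluators above compose; the sentence pairs and checkpoint name are illustrative placeholders, and SequentialEvaluator simply runs each sub-evaluator in turn (returning the last score by default):

from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import TranslationEvaluator, SequentialEvaluator

en_sentences = ["A cat sits on the mat.", "The weather is nice today."]
de_sentences = ["Eine Katze sitzt auf der Matte.", "Das Wetter ist heute schoen."]

# Accuracy of matching en_i <-> de_i in both directions (see TranslationEvaluator above).
translation_eval = TranslationEvaluator(en_sentences, de_sentences, name="en-de")

# SequentialEvaluator forwards to every sub-evaluator and reduces the scores with
# main_score_function; the default keeps the score of the last evaluator.
evaluator = SequentialEvaluator([translation_eval])

model = SentenceTransformer("distilbert-base-nli-mean-tokens")
score = evaluator(model, output_path=".")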
/sentence_transformers/losses/BYOLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from ..SentenceTransformer import SentenceTransformer 6 | import torch 7 | import torch.nn as nn 8 | import numpy as np 9 | import logging 10 | import math 11 | from functools import wraps 12 | import copy 13 | import random 14 | 15 | 16 | class EMA(): 17 | def __init__(self, beta): 18 | super().__init__() 19 | self.beta = beta 20 | 21 | def update_average(self, old, new): 22 | if old is None: 23 | return new 24 | return old * self.beta + (1 - self.beta) * new 25 | 26 | def update_moving_average(ema_updater, ma_model, current_model): 27 | for current_params, ma_params in zip(current_model.parameters(), ma_model.parameters()): 28 | old_weight, up_weight = ma_params.data, current_params.data 29 | ma_params.data = ema_updater.update_average(old_weight, up_weight) 30 | 31 | # MLP for predictor 32 | class MLP(nn.Module): 33 | def __init__(self, dim, projection_size, hidden_size): 34 | super().__init__() 35 | self.net = nn.Sequential( 36 | nn.Linear(dim, hidden_size), 37 | nn.BatchNorm1d(hidden_size), 38 | nn.ReLU(), 39 | nn.Linear(hidden_size, hidden_size), 40 | nn.ReLU(), 41 | nn.Linear(hidden_size, projection_size) 42 | ) 43 | 44 | def forward(self, x): 45 | return self.net(x) 46 | 47 | 48 | # loss fn 49 | def loss_fn(x, y): 50 | x = F.normalize(x, dim=-1, p=2) 51 | y = F.normalize(y, dim=-1, p=2) 52 | return 2 - 2 * (x * y).sum(dim=-1) 53 | 54 | 55 | 56 | class BYOLoss(nn.Module): 57 | def __init__(self, 58 | model: SentenceTransformer, 59 | sentence_embedding_dimension: int, 60 | moving_average_decay: float): 61 | super(BYOLoss, self).__init__() 62 | self.online_encoder = model 63 | self.online_predictor_1 = MLP(sentence_embedding_dimension, sentence_embedding_dimension, 10 * sentence_embedding_dimension) 64 | self.online_predictor_2 = MLP(sentence_embedding_dimension, sentence_embedding_dimension, 10 * sentence_embedding_dimension) 65 | self.online_predictor_3 = MLP(sentence_embedding_dimension, sentence_embedding_dimension, 10 * sentence_embedding_dimension) 66 | self.target_encoder = copy.deepcopy(self.online_encoder) 67 | self.target_ema_updater = EMA(moving_average_decay) 68 | 69 | def update_moving_average(self): 70 | assert self.target_encoder is not None, 'target encoder has not been created yet' 71 | update_moving_average(self.target_ema_updater, self.target_encoder, self.online_encoder) 72 | 73 | 74 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 75 | 76 | target_sentence_features = copy.deepcopy(sentence_features) 77 | rep_one, rep_two = [self.online_encoder(sentence_feature) for sentence_feature in sentence_features] 78 | online_pred_one, online_pred_two = rep_one['sentence_embedding'], rep_two['sentence_embedding'] 79 | online_pred_one, online_pred_two = self.online_predictor_1(online_pred_one), self.online_predictor_1(online_pred_two) 80 | online_pred_one, online_pred_two = self.online_predictor_2(online_pred_one), self.online_predictor_2(online_pred_two) 81 | online_pred_one, online_pred_two = self.online_predictor_3(online_pred_one), self.online_predictor_3(online_pred_two) 82 | 83 | with torch.no_grad(): 84 | 85 | target_one, target_two = [self.target_encoder(sentence_feature) for sentence_feature in target_sentence_features] 86 | target_proj_one, target_proj_two = 
target_one['sentence_embedding'], target_two['sentence_embedding'] 87 | 88 | loss_one = loss_fn(online_pred_one, target_proj_two.detach()) 89 | loss_two = loss_fn(online_pred_two, target_proj_one.detach()) 90 | 91 | loss = loss_one + loss_two 92 | 93 | return loss.mean() 94 | 95 | -------------------------------------------------------------------------------- /sentence_transformers/losses/BatchAllTripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from .BatchHardTripletLoss import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class BatchAllTripletLoss(nn.Module): 9 | """ 10 | BatchAllTripletLoss takes a batch with (label, sentence) pairs and computes the loss for all possible, valid 11 | triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. The labels 12 | must be integers, with same label indicating sentences from the same class. You train dataset 13 | must contain at least 2 examples per label class. 14 | 15 | | Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py 16 | | Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737 17 | | Blog post: https://omoindrot.github.io/triplet-loss 18 | 19 | :param model: SentenceTransformer model 20 | :param distance_metric: Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used 21 | :param margin: Negative samples should be at least margin further apart from the anchor than the positive. 22 | 23 | Example:: 24 | 25 | from sentence_transformers import SentenceTransformer, SentencesDataset, losses 26 | from sentence_transformers.readers import InputExample 27 | 28 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 29 | train_examples = [InputExample(texts=['Sentence from class 0'], label=0), InputExample(texts=['Another sentence from class 0'], label=0), 30 | InputExample(texts=['Sentence from class 1'], label=1), InputExample(texts=['Sentence from class 2'], label=2)] 31 | train_dataset = SentencesDataset(train_examples, model) 32 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 33 | train_loss = losses.BatchAllTripletLoss(model=model) 34 | """ 35 | def __init__(self, model: SentenceTransformer, distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance, margin: float = 5): 36 | super(BatchAllTripletLoss, self).__init__() 37 | self.sentence_embedder = model 38 | self.triplet_margin = margin 39 | self.distance_metric = distance_metric 40 | 41 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 42 | rep = self.sentence_embedder(sentence_features[0])['sentence_embedding'] 43 | return self.batch_all_triplet_loss(labels, rep) 44 | 45 | 46 | 47 | def batch_all_triplet_loss(self, labels, embeddings): 48 | """Build the triplet loss over a batch of embeddings. 49 | We generate all the valid triplets and average the loss over the positive ones. 50 | Args: 51 | labels: labels of the batch, of size (batch_size,) 52 | embeddings: tensor of shape (batch_size, embed_dim) 53 | margin: margin for triplet loss 54 | squared: Boolean. 
If true, output is the pairwise squared euclidean distance matrix. 55 | If false, output is the pairwise euclidean distance matrix. 56 | Returns: 57 | Label_Sentence_Triplet: scalar tensor containing the triplet loss 58 | """ 59 | # Get the pairwise distance matrix 60 | pairwise_dist = self.distance_metric(embeddings) 61 | 62 | anchor_positive_dist = pairwise_dist.unsqueeze(2) 63 | anchor_negative_dist = pairwise_dist.unsqueeze(1) 64 | 65 | # Compute a 3D tensor of size (batch_size, batch_size, batch_size) 66 | # triplet_loss[i, j, k] will contain the triplet loss of anchor=i, positive=j, negative=k 67 | # Uses broadcasting where the 1st argument has shape (batch_size, batch_size, 1) 68 | # and the 2nd (batch_size, 1, batch_size) 69 | triplet_loss = anchor_positive_dist - anchor_negative_dist + self.triplet_margin 70 | 71 | # Put to zero the invalid triplets 72 | # (where label(a) != label(p) or label(n) == label(a) or a == p) 73 | mask = BatchHardTripletLoss.get_triplet_mask(labels) 74 | triplet_loss = mask.float() * triplet_loss 75 | 76 | # Remove negative losses (i.e. the easy triplets) 77 | triplet_loss[triplet_loss < 0] = 0 78 | 79 | # Count number of positive triplets (where triplet_loss > 0) 80 | valid_triplets = triplet_loss[triplet_loss > 1e-16] 81 | num_positive_triplets = valid_triplets.size(0) 82 | num_valid_triplets = mask.sum() 83 | 84 | fraction_positive_triplets = num_positive_triplets / (num_valid_triplets.float() + 1e-16) 85 | 86 | # Get final mean triplet loss over the positive valid triplets 87 | triplet_loss = triplet_loss.sum() / (num_positive_triplets + 1e-16) 88 | 89 | return triplet_loss 90 | 91 | -------------------------------------------------------------------------------- /sentence_transformers/losses/BatchHardSoftMarginTripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from .BatchHardTripletLoss import BatchHardTripletLoss, BatchHardTripletLossDistanceFunction 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | class BatchHardSoftMarginTripletLoss(BatchHardTripletLoss): 8 | """ 9 | BatchHardSoftMarginTripletLoss takes a batch with (label, sentence) pairs and computes the loss for all possible, valid 10 | triplets, i.e., anchor and positive must have the same label, anchor and negative a different label. The labels 11 | must be integers, with same label indicating sentences from the same class. You train dataset 12 | must contain at least 2 examples per label class. The margin is computed automatically. 13 | 14 | Source: https://github.com/NegatioN/OnlineMiningTripletLoss/blob/master/online_triplet_loss/losses.py 15 | Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737 16 | Blog post: https://omoindrot.github.io/triplet-loss 17 | 18 | :param model: SentenceTransformer model 19 | :param distance_metric: Function that returns a distance between two emeddings. 
The class SiameseDistanceMetric contains pre-defined metrices that can be used 20 | 21 | 22 | Example:: 23 | 24 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 25 | from sentence_transformers.readers import InputExample 26 | 27 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 28 | train_examples = [InputExample(texts=['Sentence from class 0'], label=0), InputExample(texts=['Another sentence from class 0'], label=0), 29 | InputExample(texts=['Sentence from class 1'], label=1), InputExample(texts=['Sentence from class 2'], label=2)] 30 | train_dataset = SentencesDataset(train_examples, model) 31 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 32 | train_loss = losses.BatchHardSoftMarginTripletLoss(model=model) 33 | """ 34 | def __init__(self, model: SentenceTransformer, distance_metric=BatchHardTripletLossDistanceFunction.eucledian_distance): 35 | super(BatchHardSoftMarginTripletLoss, self).__init__(model) 36 | self.sentence_embedder = model 37 | self.distance_metric = distance_metric 38 | 39 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 40 | rep = self.sentence_embedder(sentence_features[0])['sentence_embedding'] 41 | return self.batch_hard_triplet_soft_margin_loss(labels, rep) 42 | 43 | 44 | # Hard Triplet Loss with Soft Margin 45 | # Paper: In Defense of the Triplet Loss for Person Re-Identification, https://arxiv.org/abs/1703.07737 46 | def batch_hard_triplet_soft_margin_loss(self, labels: Tensor, embeddings: Tensor) -> Tensor: 47 | """Build the triplet loss over a batch of embeddings. 48 | For each anchor, we get the hardest positive and hardest negative to form a triplet. 49 | Args: 50 | labels: labels of the batch, of size (batch_size,) 51 | embeddings: tensor of shape (batch_size, embed_dim) 52 | squared: Boolean. If true, output is the pairwise squared euclidean distance matrix. 53 | If false, output is the pairwise euclidean distance matrix. 
54 | Returns: 55 | Label_Sentence_Triplet: scalar tensor containing the triplet loss 56 | """ 57 | # Get the pairwise distance matrix 58 | pairwise_dist = self.distance_metric(embeddings) 59 | 60 | 61 | # For each anchor, get the hardest positive 62 | # First, we need to get a mask for every valid positive (they should have same label) 63 | mask_anchor_positive = BatchHardTripletLoss.get_anchor_positive_triplet_mask(labels).float() 64 | 65 | # We put to 0 any element where (a, p) is not valid (valid if a != p and label(a) == label(p)) 66 | anchor_positive_dist = mask_anchor_positive * pairwise_dist 67 | 68 | # shape (batch_size, 1) 69 | hardest_positive_dist, _ = anchor_positive_dist.max(1, keepdim=True) 70 | 71 | # For each anchor, get the hardest negative 72 | # First, we need to get a mask for every valid negative (they should have different labels) 73 | mask_anchor_negative = BatchHardTripletLoss.get_anchor_negative_triplet_mask(labels).float() 74 | 75 | # We add the maximum value in each row to the invalid negatives (label(a) == label(n)) 76 | max_anchor_negative_dist, _ = pairwise_dist.max(1, keepdim=True) 77 | anchor_negative_dist = pairwise_dist + max_anchor_negative_dist * (1.0 - mask_anchor_negative) 78 | 79 | # shape (batch_size,) 80 | hardest_negative_dist, _ = anchor_negative_dist.min(1, keepdim=True) 81 | 82 | # Combine biggest d(a, p) and smallest d(a, n) into final triplet loss with soft margin 83 | #tl = hardest_positive_dist - hardest_negative_dist + margin 84 | #tl[tl < 0] = 0 85 | tl = torch.log1p(torch.exp(hardest_positive_dist - hardest_negative_dist)) 86 | triplet_loss = tl.mean() 87 | 88 | return triplet_loss 89 | -------------------------------------------------------------------------------- /sentence_transformers/losses/ContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | from typing import Iterable, Dict 3 | import torch.nn.functional as F 4 | from torch import nn, Tensor 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class SiameseDistanceMetric(Enum): 9 | """ 10 | The metric for the contrastive loss 11 | """ 12 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 13 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 14 | COSINE_DISTANCE = lambda x, y: 1-F.cosine_similarity(x, y) 15 | 16 | 17 | class ContrastiveLoss(nn.Module): 18 | """ 19 | Contrastive loss. Expects as input two texts and a label of either 0 or 1. If the label == 1, then the distance between the 20 | two embeddings is reduced. If the label == 0, then the distance between the embeddings is increased. 21 | 22 | Further information: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf 23 | 24 | :param model: SentenceTransformer model 25 | :param distance_metric: Function that returns a distance between two emeddings. The class SiameseDistanceMetric contains pre-defined metrices that can be used 26 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 27 | :param size_average: Average by the size of the mini-batch. 
28 | 29 | Example:: 30 | 31 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 32 | from sentence_transformers.readers import InputExample 33 | 34 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 35 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 36 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 37 | train_dataset = SentencesDataset(train_examples, model) 38 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 39 | train_loss = losses.ContrastiveLoss(model=model) 40 | 41 | """ 42 | 43 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5, size_average:bool = True): 44 | super(ContrastiveLoss, self).__init__() 45 | self.distance_metric = distance_metric 46 | self.margin = margin 47 | self.model = model 48 | self.size_average = size_average 49 | 50 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 51 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 52 | assert len(reps) == 2 53 | rep_anchor, rep_other = reps 54 | distances = self.distance_metric(rep_anchor, rep_other) 55 | losses = 0.5 * (labels.float() * distances.pow(2) + (1 - labels).float() * F.relu(self.margin - distances).pow(2)) 56 | return losses.mean() if self.size_average else losses.sum() 57 | 58 | 59 | 60 | -------------------------------------------------------------------------------- /sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | 7 | class CosineSimilarityLoss(nn.Module): 8 | """ 9 | CosineSimilarityLoss expects, that the InputExamples consists of two texts and a float label. 10 | 11 | It computes the vectors u = model(input_text[0]) and v = model(input_text[1]) and measures the cosine-similarity between the two. 12 | By default, it minimizes the following loss: ||input_label - cos_score_transformation(cosine_sim(u,v))||_2. 13 | 14 | :param model: SentenceTranformer model 15 | :param loss_fct: Which pytorch loss function should be used to compare the cosine_similartiy(u,v) with the input_label? By default, MSE: ||input_label - cosine_sim(u,v)||_2 16 | :param cos_score_transformation: The cos_score_transformation function is applied on top of cosine_similarity. By default, the identify function is used (i.e. no change). 
17 | 18 | Example:: 19 | 20 | from sentence_transformers import SentenceTransformer, SentencesDataset, InputExample, losses 21 | 22 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 23 | train_examples = [InputExample(texts=['My first sentence', 'My second sentence'], label=0.8), 24 | InputExample(texts=['Another pair', 'Unrelated sentence'], label=0.3)] 25 | train_dataset = SentencesDataset(train_examples, model) 26 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 27 | train_loss = losses.CosineSimilarityLoss(model=model) 28 | 29 | 30 | """ 31 | def __init__(self, model: SentenceTransformer, loss_fct = nn.MSELoss(), cos_score_transformation=nn.Identity()): 32 | super(CosineSimilarityLoss, self).__init__() 33 | self.model = model 34 | self.loss_fct = loss_fct 35 | self.cos_score_transformation = cos_score_transformation 36 | 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | output = self.cos_score_transformation(torch.cosine_similarity(embeddings[0], embeddings[1])) 41 | return self.loss_fct(output, labels.view(-1)) 42 | 43 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MSELoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | 5 | 6 | class MSELoss(nn.Module): 7 | """ 8 | Computes the MSE loss between the computed sentence embedding and a target sentence embedding. This loss 9 | is used when extending sentence embeddings to new languages as described in our publication 10 | Making Monolingual Sentence Embeddings Multilingual using Knowledge Distillation: https://arxiv.org/abs/2004.09813 11 | 12 | For an example, see the documentation on extending language models to new languages. 13 | """ 14 | def __init__(self, model): 15 | super(MSELoss, self).__init__() 16 | self.model = model 17 | self.loss_fct = nn.MSELoss() 18 | 19 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 20 | rep = self.model(sentence_features[0])['sentence_embedding'] 21 | return self.loss_fct(rep, labels) 22 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MegaBatchMarginLoss.py: -------------------------------------------------------------------------------- 1 | from .. import util 2 | import torch 3 | from torch import nn, Tensor 4 | from typing import Iterable, Dict 5 | import torch.nn.functional as F 6 | 7 | class MegaBatchMarginLoss(nn.Module): 8 | """ 9 | Loss function inspired from ParaNMT paper: 10 | https://www.aclweb.org/anthology/P18-1042/ 11 | 12 | Given a large batch (like 500 or more examples) of (anchor_i, positive_i) pairs, 13 | find for each pair in the batch the hardest negative, i.e. find j != i such that cos_sim(anchor_i, positive_j) 14 | is maximal. Then create from this a triplet (anchor_i, positive_i, positive_j) where positive_j 15 | serves as the negative for this triplet. 
16 | 17 | Train than as with the triplet loss 18 | """ 19 | 20 | def __init__(self, model, positive_margin: float = 0.8, negative_margin: float = 0.3, use_mini_batched_version: bool = True, mini_batch_size: bool = 50): 21 | """ 22 | :param model: SentenceTransformerModel 23 | :param positive_margin: Positive margin, cos(anchor, positive) should be > positive_margin 24 | :param negative_margin: Negative margin, cos(anchor, negative) should be < negative_margin 25 | :param use_mini_batched_version: As large batch sizes require a lot of memory, we can use a mini-batched version. We break down the large batch with 500 examples to smaller batches with fewer examples. 26 | :param mini_batch_size: Size for the mini-batches. Should be a devisor for the batch size in your data loader. 27 | """ 28 | super(MegaBatchMarginLoss, self).__init__() 29 | self.model = model 30 | self.positive_margin = positive_margin 31 | self.negative_margin = negative_margin 32 | self.mini_batch_size = mini_batch_size 33 | self.forward = self.forward_mini_batched if use_mini_batched_version else self.forward_non_mini_batched 34 | 35 | 36 | def forward_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 37 | anchor, positive = sentence_features 38 | feature_names = list(anchor.keys()) 39 | 40 | with torch.no_grad(): 41 | self.model.eval() 42 | all_positive_emb = self.model(positive)['sentence_embedding'].detach() 43 | self.model.train() 44 | 45 | diagonal_matrix = torch.eye(len(all_positive_emb), len(all_positive_emb), device=all_positive_emb.device) 46 | 47 | #Iterate over the triplets (anchor, positive, hardest_negative) in smaller mini_batch sizes 48 | for start_idx in range(0, len(all_positive_emb), self.mini_batch_size): 49 | end_idx = start_idx + self.mini_batch_size 50 | anchor_emb = self.model({key: anchor[key][start_idx:end_idx] for key in feature_names})['sentence_embedding'] 51 | 52 | # Find hard negatives. For each anchor, find the hardest negative 53 | # Store them in the triplets (anchor, positive, hardest_negative) 54 | hard_negative_features = {key: [] for key in feature_names} 55 | with torch.no_grad(): 56 | cos_scores = util.pytorch_cos_sim(anchor_emb, all_positive_emb) 57 | negative_scores = cos_scores - 2 * diagonal_matrix[start_idx:end_idx] # Remove positive scores along the diagonal, set them to -1 so that they are not selected by the max() operation 58 | negatives_max, negatives_ids = torch.max(negative_scores, dim=1) 59 | 60 | for hard_negative_id in negatives_ids: 61 | for key in feature_names: 62 | hard_negative_features[key].append(positive[key][hard_negative_id]) 63 | 64 | for key in feature_names: 65 | hard_negative_features[key] = torch.stack(hard_negative_features[key]) 66 | 67 | 68 | #Compute differentiable negative and positive embeddings 69 | positive_emb = self.model({key: positive[key][start_idx:end_idx] for key in feature_names})['sentence_embedding'] 70 | negative_emb = self.model(hard_negative_features)['sentence_embedding'] 71 | 72 | assert anchor_emb.shape == positive_emb.shape 73 | assert anchor_emb.shape == negative_emb.shape 74 | 75 | #Compute loss 76 | pos_cosine = F.cosine_similarity(anchor_emb, positive_emb) 77 | neg_cosine = F.cosine_similarity(anchor_emb, negative_emb) 78 | losses = F.relu(self.positive_margin - pos_cosine) + F.relu(neg_cosine - self.negative_margin) 79 | losses = losses.mean() 80 | 81 | #Backpropagate unless it is the last mini batch. 
The last mini-batch will be backpropagated by the outer training loop 82 | if end_idx < len(cos_scores): 83 | losses.backward() 84 | 85 | return losses 86 | 87 | 88 | ##### Non mini-batched version ### 89 | def forward_non_mini_batched(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 90 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 91 | embeddings_a, embeddings_b = reps 92 | 93 | cos_scores = util.pytorch_cos_sim(embeddings_a, embeddings_b) 94 | positive_scores = torch.diagonal(cos_scores) 95 | negative_scores = cos_scores - (2*torch.eye(*cos_scores.shape, device=cos_scores.device)) # Remove positive scores along the diagonal 96 | negatives_max, _ = torch.max(negative_scores, dim=1) 97 | losses = F.relu(self.positive_margin - positive_scores) + F.relu(negatives_max - self.negative_margin) 98 | return losses.mean() 99 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MultipleNegativesRankingLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | from .. import util 6 | 7 | class MultipleNegativesRankingLoss(nn.Module): 8 | """ 9 | This loss expects as input a batch consisting of sentence pairs (a_1, p_1), (a_2, p_2)..., (a_n, p_n) 10 | where we assume that (a_i, p_i) are a positive pair and (a_i, p_j) for i!=j a negative pair. 11 | 12 | For each a_i, it uses all other p_j as negative samples, i.e., for a_i, we have 1 positive example (p_i) and 13 | n-1 negative examples (p_j). It then minimizes the negative log-likelihood for softmax-normalized scores. 14 | 15 | This loss function works well to train embeddings for retrieval setups where you have positive pairs (e.g. (query, relevant_doc)) 16 | as it will sample in each batch n-1 negative docs randomly. 17 | 18 | The performance usually increases with increasing batch sizes. 19 | 20 | For more information, see: https://arxiv.org/pdf/1705.00652.pdf 21 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 22 | 23 | You can also provide one or multiple hard negatives per anchor-positive pair by structuring the data like this: 24 | (a_1, p_1, n_1), (a_2, p_2, n_2) 25 | 26 | Here, n_1 is a hard negative for (a_1, p_1). The loss will use for the pair (a_i, p_i) all p_j (j!=i) and all n_j as negatives. 27 | 28 | Example:: 29 | 30 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 31 | from sentence_transformers.readers import InputExample 32 | 33 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 34 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1']), 35 | InputExample(texts=['Anchor 2', 'Positive 2'])] 36 | train_dataset = SentencesDataset(train_examples, model) 37 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 38 | train_loss = losses.MultipleNegativesRankingLoss(model=model) 39 | """ 40 | def __init__(self, model: SentenceTransformer, scale: float = 20.0, similarity_fct = util.cos_sim): 41 | """ 42 | :param model: SentenceTransformer model 43 | :param scale: Output of similarity function is multiplied by scale value 44 | :param similarity_fct: similarity function between sentence embeddings. By default, cos_sim.
Can also be set to dot product (and then set scale to 1) 45 | """ 46 | super(MultipleNegativesRankingLoss, self).__init__() 47 | self.model = model 48 | self.scale = scale 49 | self.similarity_fct = similarity_fct 50 | self.cross_entropy_loss = nn.CrossEntropyLoss() 51 | 52 | 53 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 54 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 55 | embeddings_a = reps[0] 56 | embeddings_b = torch.cat(reps[1:]) 57 | 58 | 59 | 60 | scores = self.similarity_fct(embeddings_a, embeddings_b) * self.scale 61 | labels = torch.tensor(range(len(scores)), dtype=torch.long, device=scores.device) # Example a[i] should match with b[i] 62 | return self.cross_entropy_loss(scores, labels) 63 | 64 | 65 | 66 | 67 | 68 | -------------------------------------------------------------------------------- /sentence_transformers/losses/OnlineContrastiveLoss.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable, Dict 2 | import torch.nn.functional as F 3 | from torch import nn, Tensor 4 | from .ContrastiveLoss import SiameseDistanceMetric 5 | from sentence_transformers.SentenceTransformer import SentenceTransformer 6 | 7 | 8 | class OnlineContrastiveLoss(nn.Module): 9 | """ 10 | Online Contrastive loss. Similar to ContrastiveLoss, but it selects hard positive pairs (positives that are far apart) 11 | and hard negative pairs (negatives that are close) and computes the loss only for these pairs. Often yields 12 | better performance than ContrastiveLoss. 13 | 14 | :param model: SentenceTransformer model 15 | :param distance_metric: Function that returns a distance between two embeddings. The class SiameseDistanceMetric contains pre-defined metrics that can be used 16 | :param margin: Negative samples (label == 0) should have a distance of at least the margin value. 17 | :param size_average: Average by the size of the mini-batch.
18 | 19 | Example:: 20 | 21 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 22 | from sentence_transformers.readers import InputExample 23 | 24 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 25 | train_examples = [InputExample(texts=['This is a positive pair', 'Where the distance will be minimized'], label=1), 26 | InputExample(texts=['This is a negative pair', 'Their distance will be increased'], label=0)] 27 | train_dataset = SentencesDataset(train_examples, model) 28 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 29 | train_loss = losses.OnlineContrastiveLoss(model=model) 30 | """ 31 | 32 | def __init__(self, model: SentenceTransformer, distance_metric=SiameseDistanceMetric.COSINE_DISTANCE, margin: float = 0.5): 33 | super(OnlineContrastiveLoss, self).__init__() 34 | self.model = model 35 | self.margin = margin 36 | self.distance_metric = distance_metric 37 | 38 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor, size_average=False): 39 | embeddings = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 40 | 41 | distance_matrix = self.distance_metric(embeddings[0], embeddings[1]) 42 | negs = distance_matrix[labels == 0] 43 | poss = distance_matrix[labels == 1] 44 | 45 | # select hard positive and hard negative pairs 46 | negative_pairs = negs[negs < (poss.max() if len(poss) > 1 else negs.mean())] 47 | positive_pairs = poss[poss > (negs.min() if len(negs) > 1 else poss.mean())] 48 | 49 | positive_loss = positive_pairs.pow(2).sum() 50 | negative_loss = F.relu(self.margin - negative_pairs).pow(2).sum() 51 | loss = positive_loss + negative_loss 52 | return loss -------------------------------------------------------------------------------- /sentence_transformers/losses/SoftmaxLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | import logging 6 | 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class SoftmaxLoss(nn.Module): 11 | """ 12 | This loss was used in our SBERT publication (https://arxiv.org/abs/1908.10084) to train the SentenceTransformer 13 | model on NLI data. It adds a softmax classifier on top of the output of two transformer networks. 14 | 15 | :param model: SentenceTransformer model 16 | :param sentence_embedding_dimension: Dimension of your sentence embeddings 17 | :param num_labels: Number of different labels 18 | :param concatenation_sent_rep: Concatenate vectors u,v for the softmax classifier? 19 | :param concatenation_sent_difference: Add abs(u-v) for the softmax classifier? 20 | :param concatenation_sent_multiplication: Add u*v for the softmax classifier? 
21 | 22 | Example:: 23 | 24 | from sentence_transformers import SentenceTransformer, SentencesDataset, losses 25 | from sentence_transformers.readers import InputExample 26 | 27 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 28 | train_examples = [InputExample(texts=['First pair, sent A', 'First pair, sent B'], label=0), 29 | InputExample(texts=['Second Pair, sent A', 'Second Pair, sent B'], label=3)] 30 | train_dataset = SentencesDataset(train_examples, model) 31 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 32 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 33 | """ 34 | def __init__(self, 35 | model: SentenceTransformer, 36 | sentence_embedding_dimension: int, 37 | num_labels: int, 38 | concatenation_sent_rep: bool = True, 39 | concatenation_sent_difference: bool = True, 40 | concatenation_sent_multiplication: bool = False): 41 | super(SoftmaxLoss, self).__init__() 42 | self.model = model 43 | self.num_labels = num_labels 44 | self.concatenation_sent_rep = concatenation_sent_rep 45 | self.concatenation_sent_difference = concatenation_sent_difference 46 | self.concatenation_sent_multiplication = concatenation_sent_multiplication 47 | 48 | num_vectors_concatenated = 0 49 | if concatenation_sent_rep: 50 | num_vectors_concatenated += 2 51 | if concatenation_sent_difference: 52 | num_vectors_concatenated += 1 53 | if concatenation_sent_multiplication: 54 | num_vectors_concatenated += 1 55 | logger.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated)) 56 | self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels) 57 | 58 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 59 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 60 | rep_a, rep_b = reps 61 | 62 | vectors_concat = [] 63 | if self.concatenation_sent_rep: 64 | vectors_concat.append(rep_a) 65 | vectors_concat.append(rep_b) 66 | 67 | if self.concatenation_sent_difference: 68 | vectors_concat.append(torch.abs(rep_a - rep_b)) 69 | 70 | if self.concatenation_sent_multiplication: 71 | vectors_concat.append(rep_a * rep_b) 72 | 73 | features = torch.cat(vectors_concat, 1) 74 | 75 | output = self.classifier(features) 76 | loss_fct = nn.CrossEntropyLoss() 77 | 78 | if labels is not None: 79 | loss = loss_fct(output, labels.view(-1)) 80 | return loss 81 | else: 82 | return reps, output 83 | -------------------------------------------------------------------------------- /sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | """ 18 | This class implements triplet loss.
Given a triplet of (anchor, positive, negative), 19 | the loss minimizes the distance between anchor and positive while it maximizes the distance 20 | between anchor and negative. It computes the following loss function: 21 | 22 | loss = max(||anchor - positive|| - ||anchor - negative|| + margin, 0). 23 | 24 | Margin is an important hyperparameter and needs to be tuned accordingly. 25 | 26 | For further details, see: https://en.wikipedia.org/wiki/Triplet_loss 27 | 28 | :param model: SentenceTransformerModel 29 | :param distance_metric: Function to compute distance between two embeddings. The class TripletDistanceMetric contains common distance metrics that can be used. 30 | :param triplet_margin: The negative should be at least this much further away from the anchor than the positive. 31 | 32 | Example:: 33 | 34 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 35 | from sentence_transformers.readers import InputExample 36 | 37 | model = SentenceTransformer('distilbert-base-nli-mean-tokens') 38 | train_examples = [InputExample(texts=['Anchor 1', 'Positive 1', 'Negative 1']), 39 | InputExample(texts=['Anchor 2', 'Positive 2', 'Negative 2'])] 40 | train_dataset = SentencesDataset(train_examples, model) 41 | train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.TripletLoss(model=model) 43 | """ 44 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin: float = 5): 45 | super(TripletLoss, self).__init__() 46 | self.model = model 47 | self.distance_metric = distance_metric 48 | self.triplet_margin = triplet_margin 49 | 50 | 51 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 52 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 53 | 54 | rep_anchor, rep_pos, rep_neg = reps 55 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 56 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 57 | 58 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 59 | return losses.mean() -------------------------------------------------------------------------------- /sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .MultipleNegativesRankingLoss import * 4 | from .TripletLoss import * 5 | from .MSELoss import * 6 | from .ContrastiveLoss import * 7 | from .ContrastiveTensionLoss import * 8 | from .OnlineContrastiveLoss import * 9 | from .MegaBatchMarginLoss import * 10 | from .DenoisingAutoEncoderLoss import * 11 | 12 | # Triplet losses 13 | from .BatchHardTripletLoss import * 14 | from .BatchHardSoftMarginTripletLoss import * 15 | from .BatchSemiHardTripletLoss import * 16 | from .BatchAllTripletLoss import * 17 | from .BYOLoss import * -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BYOLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BYOLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BYOLoss.cpython-38.pyc:
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BYOLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchAllTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardSoftMarginTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchHardTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/BatchSemiHardTripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/ContrastiveTensionLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/CosineSimilarityLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/DenoisingAutoEncoderLoss.cpython-38.pyc -------------------------------------------------------------------------------- 
/sentence_transformers/losses/__pycache__/MSELoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MSELoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MSELoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MSELoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MegaBatchMarginLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/MultipleNegativesRankingLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/OnlineContrastiveLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/SoftmaxLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/TripletLoss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/TripletLoss.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/TripletLoss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/TripletLoss.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/losses/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/losses/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class ALBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class BERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class BoW(nn.Module): 15 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 16 | 17 | A weighting can be added to allow the generation of tf-idf vectors. 
The output vector has the size of the vocab. 18 | """ 19 | 20 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 21 | super(BoW, self).__init__() 22 | vocab = list(set(vocab)) #Ensure vocab is unique 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | self.cumulative_term_frequency = cumulative_term_frequency 28 | 29 | #Maps wordIdx -> word weight 30 | self.weights = [] 31 | num_unknown_words = 0 32 | for word in vocab: 33 | weight = unknown_word_weight 34 | if word in word_weights: 35 | weight = word_weights[word] 36 | elif word.lower() in word_weights: 37 | weight = word_weights[word.lower()] 38 | else: 39 | num_unknown_words += 1 40 | self.weights.append(weight) 41 | 42 | logger.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 43 | 44 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 45 | self.sentence_embedding_dimension = len(vocab) 46 | 47 | 48 | def forward(self, features: Dict[str, Tensor]): 49 | #Nothing to do, everything is done in get_sentence_features 50 | return features 51 | 52 | def tokenize(self, texts: List[str]) -> List[int]: 53 | tokenized = [self.tokenizer.tokenize(text) for text in texts] 54 | return self.get_sentence_features(tokenized) 55 | 56 | def get_sentence_embedding_dimension(self): 57 | return self.sentence_embedding_dimension 58 | 59 | def get_sentence_features(self, tokenized_texts: List[List[int]], pad_seq_length: int = 0): 60 | vectors = [] 61 | 62 | for tokens in tokenized_texts: 63 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 64 | for token in tokens: 65 | if self.cumulative_term_frequency: 66 | vector[token] += self.weights[token] 67 | else: 68 | vector[token] = self.weights[token] 69 | vectors.append(vector) 70 | 71 | return {'sentence_embedding': torch.tensor(vectors, dtype=torch.float)} 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return BoW(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5], stride_sizes: List[int] = None): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 
20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | if stride_sizes is None: 29 | stride_sizes = [1] * len(kernel_sizes) 30 | 31 | for kernel_size, stride in zip(kernel_sizes, stride_sizes): 32 | padding_size = int((kernel_size - 1) / 2) 33 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, 34 | kernel_size=kernel_size, 35 | stride=stride, 36 | padding=padding_size) 37 | self.convs.append(conv) 38 | 39 | def forward(self, features): 40 | token_embeddings = features['token_embeddings'] 41 | 42 | token_embeddings = token_embeddings.transpose(1, -1) 43 | vectors = [conv(token_embeddings) for conv in self.convs] 44 | out = torch.cat(vectors, 1).transpose(1, -1) 45 | 46 | features.update({'token_embeddings': out}) 47 | return features 48 | 49 | def get_word_embedding_dimension(self) -> int: 50 | return self.embeddings_dimension 51 | 52 | def tokenize(self, text: str) -> List[int]: 53 | raise NotImplementedError() 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 57 | json.dump(self.get_config_dict(), fOut, indent=2) 58 | 59 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 60 | 61 | def get_config_dict(self): 62 | return {key: self.__dict__[key] for key in self.config_keys} 63 | 64 | @staticmethod 65 | def load(input_path: str): 66 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 67 | config = json.load(fIn) 68 | 69 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu')) 70 | model = CNN(**config) 71 | model.load_state_dict(weights) 72 | return model 73 | 74 | -------------------------------------------------------------------------------- /sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | 4 | class CamemBERT(Transformer): 5 | """ 6 | DEPRECATED: Please use models.Transformer instead. 7 | """ 8 | pass 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networks (DAN).
15 | 16 | :param in_features: Size of the input dimension 17 | :param out_features: Output size 18 | :param bias: Add a bias vector 19 | :param activation_function: Pytorch activation function applied on output 20 | :param init_weight: Initial value for the matrix of the linear layer 21 | :param init_bias: Initial value for the bias of the linear layer 22 | """ 23 | def __init__(self, in_features: int, out_features: int, bias: bool = True, activation_function=nn.Tanh(), init_weight: Tensor = None, init_bias: Tensor = None): 24 | super(Dense, self).__init__() 25 | self.in_features = in_features 26 | self.out_features = out_features 27 | self.bias = bias 28 | self.activation_function = activation_function 29 | self.linear = nn.Linear(in_features, out_features, bias=bias) 30 | 31 | if init_weight is not None: 32 | self.linear.weight = nn.Parameter(init_weight) 33 | 34 | if init_bias is not None: 35 | self.linear.bias = nn.Parameter(init_bias) 36 | 37 | def forward(self, features: Dict[str, Tensor]): 38 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 39 | return features 40 | 41 | def get_sentence_embedding_dimension(self) -> int: 42 | return self.out_features 43 | 44 | def save(self, output_path): 45 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 46 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | @staticmethod 51 | def load(input_path): 52 | with open(os.path.join(input_path, 'config.json')) as fIn: 53 | config = json.load(fIn) 54 | 55 | config['activation_function'] = import_from_string(config['activation_function'])() 56 | model = Dense(**config) 57 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 58 | return model 59 | -------------------------------------------------------------------------------- /sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class DistilBERT(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /sentence_transformers/models/LayerNorm.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class LayerNorm(nn.Module): 10 | def __init__(self, dimension: int): 11 | super(LayerNorm, self).__init__() 12 | self.dimension = dimension 13 | self.norm = nn.LayerNorm(dimension) 14 | 15 | 16 | def forward(self, features: Dict[str, Tensor]): 17 | features['sentence_embedding'] = self.norm(features['sentence_embedding']) 18 | return features 19 | 20 | 21 | def get_sentence_embedding_dimension(self): 22 | return self.dimension 23 | 24 | def save(self, output_path): 25 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 26 | json.dump({'dimension': self.dimension}, fOut, indent=2) 27 | 28 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 29 | 30 | @staticmethod 31 | def load(input_path): 32 | with open(os.path.join(input_path, 'config.json')) as fIn: 33 | config = json.load(fIn) 34 | 35 | model = LayerNorm(**config) 36 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 37 | return model 
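Usage sketch: the LayerNorm module above normalizes the pooled 'sentence_embedding' feature, so it is typically appended as the last module of a SentenceTransformer pipeline. A minimal sketch, assuming LayerNorm is exported from sentence_transformers.models; the checkpoint name is a placeholder::

    from sentence_transformers import SentenceTransformer, models

    # Token encoder + pooling, as in the other model examples in this repository
    word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=128)  # placeholder checkpoint
    pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

    # Normalize the pooled sentence embedding with the LayerNorm module defined above
    layer_norm = models.LayerNorm(pooling_model.get_sentence_embedding_dimension())

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model, layer_norm])
    embeddings = model.encode(['An example sentence', 'Another example sentence'])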
-------------------------------------------------------------------------------- /sentence_transformers/models/Normalize.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from typing import Dict 4 | import torch.nn.functional as F 5 | 6 | class Normalize(nn.Module): 7 | """ 8 | This layer normalizes embeddings to unit length 9 | """ 10 | def __init__(self): 11 | super(Normalize, self).__init__() 12 | 13 | def forward(self, features: Dict[str, Tensor]): 14 | features.update({'sentence_embedding': F.normalize(features['sentence_embedding'], p=2, dim=1)}) 15 | return features 16 | 17 | def save(self, output_path): 18 | pass 19 | 20 | @staticmethod 21 | def load(input_path): 22 | return Normalize() 23 | -------------------------------------------------------------------------------- /sentence_transformers/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates a fixed-sized sentence embedding from a variable-sized sentence. This layer also allows using the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 14 | 15 | :param word_embedding_dimension: Dimensions for the word embeddings 16 | :param pooling_mode: Can be a string: mean/max/cls. If set, overwrites the other pooling_mode_* settings 17 | :param pooling_mode_cls_token: Use the first token (CLS token) as text representations 18 | :param pooling_mode_max_tokens: Use max in each dimension over all tokens. 19 | :param pooling_mode_mean_tokens: Perform mean-pooling 20 | :param pooling_mode_mean_sqrt_len_tokens: Perform mean-pooling, but divide by sqrt(input_length).
21 | """ 22 | def __init__(self, 23 | word_embedding_dimension: int, 24 | pooling_mode: str = None, 25 | pooling_mode_cls_token: bool = False, 26 | pooling_mode_max_tokens: bool = False, 27 | pooling_mode_mean_tokens: bool = True, 28 | pooling_mode_mean_sqrt_len_tokens: bool = False, 29 | ): 30 | super(Pooling, self).__init__() 31 | 32 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 33 | 34 | if pooling_mode is not None: #Set pooling mode by string 35 | pooling_mode = pooling_mode.lower() 36 | assert pooling_mode in ['mean', 'max', 'cls'] 37 | pooling_mode_cls_token = (pooling_mode == 'cls') 38 | pooling_mode_max_tokens = (pooling_mode == 'max') 39 | pooling_mode_mean_tokens = (pooling_mode == 'mean') 40 | 41 | self.word_embedding_dimension = word_embedding_dimension 42 | self.pooling_mode_cls_token = pooling_mode_cls_token 43 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 44 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 45 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 46 | 47 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 48 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 49 | 50 | def forward(self, features: Dict[str, Tensor]): 51 | token_embeddings = features['token_embeddings'] 52 | cls_token = features['cls_token_embeddings'] 53 | attention_mask = features['attention_mask'] 54 | 55 | ## Pooling strategy 56 | output_vectors = [] 57 | if self.pooling_mode_cls_token: 58 | output_vectors.append(cls_token) 59 | if self.pooling_mode_max_tokens: 60 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 61 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 62 | max_over_time = torch.max(token_embeddings, 1)[0] 63 | output_vectors.append(max_over_time) 64 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 65 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 66 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 67 | 68 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 69 | if 'token_weights_sum' in features: 70 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 71 | else: 72 | sum_mask = input_mask_expanded.sum(1) 73 | 74 | sum_mask = torch.clamp(sum_mask, min=1e-9) 75 | 76 | if self.pooling_mode_mean_tokens: 77 | output_vectors.append(sum_embeddings / sum_mask) 78 | if self.pooling_mode_mean_sqrt_len_tokens: 79 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 80 | 81 | output_vector = torch.cat(output_vectors, 1) 82 | features.update({'sentence_embedding': output_vector}) 83 | return features 84 | 85 | def get_sentence_embedding_dimension(self): 86 | return self.pooling_output_dimension 87 | 88 | def get_config_dict(self): 89 | return {key: self.__dict__[key] for key in self.config_keys} 90 | 91 | def save(self, output_path): 92 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 93 | json.dump(self.get_config_dict(), fOut, indent=2) 94 | 95 | @staticmethod 96 | def load(input_path): 97 | with open(os.path.join(input_path, 'config.json')) as fIn: 98 | config = json.load(fIn) 99 | 100 | return Pooling(**config) 
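Usage sketch: each enabled pooling_mode_* flag contributes one vector of size word_embedding_dimension, and the enabled vectors are concatenated, so the sentence embedding dimension is the number of enabled modes times the word embedding dimension. A minimal sketch of concatenated mean- and max-pooling; the checkpoint name is a placeholder::

    from sentence_transformers import SentenceTransformer, models

    word_embedding_model = models.Transformer('distilbert-base-uncased')  # placeholder checkpoint
    dim = word_embedding_model.get_word_embedding_dimension()

    # Enable mean- and max-pooling together: the pooled vectors are concatenated,
    # so the sentence embedding dimension becomes 2 * dim
    pooling_model = models.Pooling(dim,
                                   pooling_mode_mean_tokens=True,
                                   pooling_mode_max_tokens=True)
    assert pooling_model.get_sentence_embedding_dimension() == 2 * dim

    model = SentenceTransformer(modules=[word_embedding_model, pooling_model])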
101 | -------------------------------------------------------------------------------- /sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class RoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /sentence_transformers/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class T5(nn.Module): 13 | """DEPRECATED: Please use models.Transformer instead. 14 | 15 | T5 model to generate token embeddings. 16 | 17 | Each token is mapped to an output vector from BERT. 18 | """ 19 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, task_identifier: str = 'stsb sentence1: ', model_args: Dict = {}, tokenizer_args: Dict = {}): 20 | super(T5, self).__init__() 21 | self.config_keys = ['max_seq_length', 'do_lower_case', 'task_identifier'] 22 | self.do_lower_case = do_lower_case 23 | 24 | if max_seq_length > 512: 25 | logger.warning("T5 only allows a max_seq_length of 512. Value will be set to 512") 26 | max_seq_length = 512 27 | self.max_seq_length = max_seq_length 28 | 29 | if self.do_lower_case is not None: 30 | tokenizer_args['do_lower_case'] = do_lower_case 31 | 32 | self.t5model = T5Model.from_pretrained(model_name_or_path, **model_args) 33 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 34 | self.task_identifier = task_identifier 35 | 36 | def forward(self, features): 37 | """Returns token_embeddings, cls_token""" 38 | output_states = self.t5model.encoder(input_ids=features['input_ids'], attention_mask=features['attention_mask']) 39 | output_tokens = output_states[0] 40 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 41 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens}) 42 | 43 | if len(output_states) > 1: 44 | features.update({'all_layer_embeddings': output_states[1]}) 45 | 46 | return features 47 | 48 | def get_word_embedding_dimension(self) -> int: 49 | return self.t5model.config.hidden_size 50 | 51 | def tokenize(self, text: str) -> List[int]: 52 | """ 53 | Tokenizes a text and maps tokens to token-ids 54 | """ 55 | return self.tokenizer.encode(self.task_identifier+text) 56 | 57 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 58 | """ 59 | Convert tokenized sentence in its embedding ids, segment ids and mask 60 | 61 | :param tokens: 62 | a tokenized sentence 63 | :param pad_seq_length: 64 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 65 | :return: embedding ids, segment ids and mask for the sentence 66 | """ 67 | 68 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 69 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, padding='max_length', return_tensors='pt', truncation=True, prepend_batch_axis=True) 70 | 71 | def get_config_dict(self): 72 | return {key: self.__dict__[key] for key in self.config_keys} 73 | 74 | def save(self, output_path: str): 75 | self.t5model.save_pretrained(output_path) 76 | self.tokenizer.save_pretrained(output_path) 77 | 78 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 79 | json.dump(self.get_config_dict(), fOut, indent=2) 80 | 81 | @staticmethod 82 | def load(input_path: str): 83 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 84 | config = json.load(fIn) 85 | return T5(model_name_or_path=input_path, **config) 86 | 87 | 88 | 89 | 90 | 91 | 92 | -------------------------------------------------------------------------------- /sentence_transformers/models/Transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoModel, AutoTokenizer, AutoConfig 3 | import json 4 | from typing import List, Dict, Optional, Union, Tuple 5 | import os 6 | 7 | 8 | class Transformer(nn.Module): 9 | """Huggingface AutoModel to generate token embeddings. 10 | Loads the correct class, e.g. BERT / RoBERTa etc. 11 | 12 | :param model_name_or_path: Huggingface models name (https://huggingface.co/models) 13 | :param max_seq_length: Truncate any inputs longer than max_seq_length 14 | :param model_args: Arguments (key, value pairs) passed to the Huggingface Transformers model 15 | :param cache_dir: Cache dir for Huggingface Transformers to store/load models 16 | :param tokenizer_args: Arguments (key, value pairs) passed to the Huggingface Tokenizer model 17 | :param do_lower_case: If true, lowercases the input (independet if the model is cased or not) 18 | """ 19 | def __init__(self, model_name_or_path: str, max_seq_length: Optional[int] = 64, 20 | model_args: Dict = {}, cache_dir: Optional[str] = None, 21 | tokenizer_args: Dict = {}, do_lower_case: bool = False): 22 | super(Transformer, self).__init__() 23 | self.config_keys = ['max_seq_length', 'do_lower_case'] 24 | self.max_seq_length = max_seq_length 25 | self.do_lower_case = do_lower_case 26 | 27 | config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) 28 | self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir) 29 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir, **tokenizer_args) 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | trans_features = {'input_ids': features['input_ids'], 'attention_mask': features['attention_mask']} 35 | if 'token_type_ids' in features: 36 | trans_features['token_type_ids'] = features['token_type_ids'] 37 | 38 | output_states = self.auto_model(**trans_features, return_dict=False) 39 | output_tokens = output_states[0] 40 | 41 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 42 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 43 | 44 | if self.auto_model.config.output_hidden_states: 45 | all_layer_idx = 2 46 | if 
len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states 47 | all_layer_idx = 1 48 | 49 | hidden_states = output_states[all_layer_idx] 50 | features.update({'all_layer_embeddings': hidden_states}) 51 | 52 | return features 53 | 54 | def get_word_embedding_dimension(self) -> int: 55 | return self.auto_model.config.hidden_size 56 | 57 | def tokenize(self, texts: Union[List[str], List[Dict], List[Tuple[str, str]]]): 58 | """ 59 | Tokenizes a text and maps tokens to token-ids 60 | """ 61 | output = {} 62 | if isinstance(texts[0], str): 63 | to_tokenize = [texts] 64 | elif isinstance(texts[0], dict): 65 | to_tokenize = [] 66 | output['text_keys'] = [] 67 | for lookup in texts: 68 | text_key, text = next(iter(lookup.items())) 69 | to_tokenize.append(text) 70 | output['text_keys'].append(text_key) 71 | to_tokenize = [to_tokenize] 72 | else: 73 | batch1, batch2 = [], [] 74 | for text_tuple in texts: 75 | batch1.append(text_tuple[0]) 76 | batch2.append(text_tuple[1]) 77 | to_tokenize = [batch1, batch2] 78 | 79 | #strip 80 | to_tokenize = [[s.strip() for s in col] for col in to_tokenize] 81 | 82 | #Lowercase 83 | if self.do_lower_case: 84 | to_tokenize = [[s.lower() for s in col] for col in to_tokenize] 85 | 86 | 87 | output.update(self.tokenizer(*to_tokenize, padding=True, truncation='longest_first', return_tensors="pt", max_length=self.max_seq_length)) 88 | return output 89 | 90 | 91 | def get_config_dict(self): 92 | return {key: self.__dict__[key] for key in self.config_keys} 93 | 94 | def save(self, output_path: str): 95 | self.auto_model.save_pretrained(output_path) 96 | self.tokenizer.save_pretrained(output_path) 97 | 98 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 99 | json.dump(self.get_config_dict(), fOut, indent=2) 100 | 101 | @staticmethod 102 | def load(input_path: str): 103 | #Old classes used other config names than 'sentence_bert_config.json' 104 | for config_name in ['sentence_bert_config.json', 'sentence_roberta_config.json', 'sentence_distilbert_config.json', 'sentence_camembert_config.json', 'sentence_albert_config.json', 'sentence_xlm-roberta_config.json', 'sentence_xlnet_config.json']: 105 | sbert_config_path = os.path.join(input_path, config_name) 106 | if os.path.exists(sbert_config_path): 107 | break 108 | 109 | with open(sbert_config_path) as fIn: 110 | config = json.load(fIn) 111 | return Transformer(model_name_or_path=input_path, **config) 112 | 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /sentence_transformers/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class WeightedLayerPooling(nn.Module): 10 | """ 11 | Token embeddings are weighted mean of their different hidden layer representations 12 | """ 13 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 14 | super(WeightedLayerPooling, self).__init__() 15 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.layer_start = layer_start 18 | self.num_hidden_layers = num_hidden_layers 19 | self.layer_weights = layer_weights if layer_weights is not None else 
nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 20 | 21 | def forward(self, features: Dict[str, Tensor]): 22 | ft_all_layers = features['all_layer_embeddings'] 23 | 24 | all_layer_embedding = torch.stack(ft_all_layers) 25 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Keep the outputs from layer_start (by default the 4th layer) onwards 26 | 27 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 28 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 29 | 30 | features.update({'token_embeddings': weighted_average}) 31 | return features 32 | 33 | def get_word_embedding_dimension(self): 34 | return self.word_embedding_dimension 35 | 36 | def get_config_dict(self): 37 | return {key: self.__dict__[key] for key in self.config_keys} 38 | 39 | def save(self, output_path): 40 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 41 | json.dump(self.get_config_dict(), fOut, indent=2) 42 | 43 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 44 | 45 | 46 | @staticmethod 47 | def load(input_path): 48 | with open(os.path.join(input_path, 'config.json')) as fIn: 49 | config = json.load(fIn) 50 | 51 | model = WeightedLayerPooling(**config) 52 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 53 | return model 54 | -------------------------------------------------------------------------------- /sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class WordWeights(nn.Module): 13 | """This model can weight word embeddings, for example, with idf-values.""" 14 | 15 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 16 | """ 17 | 18 | :param vocab: 19 | Vocabulary of the tokenizer 20 | :param word_weights: 21 | Mapping of tokens to a float weight value. Word embeddings are multiplied by this float value. The tokens in word_weights need not match the vocab exactly (it can contain more or fewer entries) 22 | :param unknown_word_weight: 23 | Weight for words in the vocab that do not appear in the word_weights lookup, for example rare words for which no weight exists. 24 | """ 25 | super(WordWeights, self).__init__() 26 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 27 | self.vocab = vocab 28 | self.word_weights = word_weights 29 | self.unknown_word_weight = unknown_word_weight 30 | 31 | weights = [] 32 | num_unknown_words = 0 33 | for word in vocab: 34 | weight = unknown_word_weight 35 | if word in word_weights: 36 | weight = word_weights[word] 37 | elif word.lower() in word_weights: 38 | weight = word_weights[word.lower()] 39 | else: 40 | num_unknown_words += 1 41 | weights.append(weight) 42 | 43 | logger.info("{} of {} words without a weighting value.
Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 44 | 45 | self.emb_layer = nn.Embedding(len(vocab), 1) 46 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 47 | 48 | 49 | def forward(self, features: Dict[str, Tensor]): 50 | attention_mask = features['attention_mask'] 51 | token_embeddings = features['token_embeddings'] 52 | 53 | #Compute a weight value for each token 54 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 55 | token_weights = token_weights_raw * attention_mask.float() 56 | token_weights_sum = torch.sum(token_weights, 1) 57 | 58 | #Multiply embedding by token weight value 59 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 60 | token_embeddings = token_embeddings * token_weights_expanded 61 | 62 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 63 | return features 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return WordWeights(**config) 78 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLMRoBERTa(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from . import Transformer 2 | 3 | class XLNet(Transformer): 4 | """ 5 | DEPRECATED: Please use models.Transformer instead. 
6 | """ 7 | pass 8 | 9 | 10 | 11 | 12 | 13 | 14 | -------------------------------------------------------------------------------- /sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .Transformer import Transformer 2 | from .Asym import Asym 3 | from .BoW import BoW 4 | from .CNN import CNN 5 | from .Dense import Dense 6 | from .LayerNorm import LayerNorm 7 | from .LSTM import LSTM 8 | from .Normalize import Normalize 9 | from .Pooling import Pooling 10 | from .WKPooling import WKPooling 11 | from .WeightedLayerPooling import WeightedLayerPooling 12 | from .WordEmbeddings import WordEmbeddings 13 | from .WordWeights import WordWeights 14 | -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Asym.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Asym.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Asym.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Asym.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BERT.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BERT.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BERT.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BERT.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BoW.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BoW.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/BoW.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/BoW.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/CNN.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/CNN.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/CNN.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/CNN.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Dense.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Dense.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Dense.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Dense.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/DistilBERT.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/DistilBERT.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LSTM.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LSTM.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LSTM.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LSTM.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LayerNorm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LayerNorm.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/LayerNorm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/LayerNorm.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Normalize.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Normalize.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Normalize.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Normalize.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Pooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Pooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Pooling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Pooling.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Transformer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Transformer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/Transformer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/Transformer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WKPooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WKPooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WKPooling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WKPooling.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WeightedLayerPooling.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordEmbeddings.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordEmbeddings.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordEmbeddings.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordEmbeddings.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordWeights.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordWeights.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/WordWeights.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/WordWeights.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class PhraseTokenizer(WordTokenizer): 14 | """Tokenizes the text with respect to existent phrases in the vocab. 15 | 16 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 17 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... 
=> [New_York, is, the]) 18 | """ 19 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 20 | self.stop_words = set(stop_words) 21 | self.do_lower_case = do_lower_case 22 | self.ngram_separator = ngram_separator 23 | self.max_ngram_length = max_ngram_length 24 | self.set_vocab(vocab) 25 | 26 | def get_vocab(self): 27 | return self.vocab 28 | 29 | def set_vocab(self, vocab: Iterable[str]): 30 | self.vocab = vocab 31 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 32 | 33 | # Check for ngrams in vocab 34 | self.ngram_lookup = set() 35 | self.ngram_lengths = set() 36 | for word in vocab: 37 | 38 | if self.ngram_separator is not None and self.ngram_separator in word: 39 | # Some words might be malformed in e.g. the Google News word2vec file, containing two or more _ in a row 40 | ngram_count = word.count(self.ngram_separator) + 1 41 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 42 | self.ngram_lookup.add(word) 43 | self.ngram_lengths.add(ngram_count) 44 | 45 | if len(vocab) > 0: 46 | logger.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 47 | logger.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | tokens = nltk.word_tokenize(text, preserve_line=True) 51 | 52 | #phrase detection 53 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 54 | idx = 0 55 | while idx <= len(tokens) - ngram_len: 56 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 57 | if ngram in self.ngram_lookup: 58 | tokens[idx:idx + ngram_len] = [ngram] 59 | elif ngram.lower() in self.ngram_lookup: 60 | tokens[idx:idx + ngram_len] = [ngram.lower()] 61 | idx += 1 62 | 63 | #Map tokens to idx, filter stop words 64 | tokens_filtered = [] 65 | for token in tokens: 66 | if token in self.stop_words: 67 | continue 68 | elif token in self.word2idx: 69 | tokens_filtered.append(self.word2idx[token]) 70 | continue 71 | 72 | token = token.lower() 73 | if token in self.stop_words: 74 | continue 75 | elif token in self.word2idx: 76 | tokens_filtered.append(self.word2idx[token]) 77 | continue 78 | 79 | token = token.strip(string.punctuation) 80 | if token in self.stop_words: 81 | continue 82 | elif len(token) > 0 and token in self.word2idx: 83 | tokens_filtered.append(self.word2idx[token]) 84 | continue 85 | 86 | return tokens_filtered 87 | 88 | def save(self, output_path: str): 89 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 90 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 91 | 92 | @staticmethod 93 | def load(input_path: str): 94 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 95 | config = json.load(fIn) 96 | 97 | return PhraseTokenizer(**config) 98 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer,
ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WordTokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | 4 | ENGLISH_STOP_WORDS = ['!', '"', "''", "``", '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldn', 'couldnt', 'cry', 'd', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 
'hadn', 'has', 'hasn', 'hasnt', 'have', 'haven', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'ltd', 'm', 'ma', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mightn', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'mustn', 'my', 'myself', 'name', 'namely', 'needn', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'shan', 'she', 'should', 'shouldn', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 've', 'very', 'via', 'was', 'wasn', 'we', 'well', 'were', 'weren', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'won', 'would', 'wouldn', 'y', 'yet', 'you', 'your', 'yours', 'yourself', 'yourselves'] 5 | 6 | 7 | class WordTokenizer(ABC): 8 | @abstractmethod 9 | def set_vocab(self, vocab: Iterable[str]): 10 | pass 11 | 12 | @abstractmethod 13 | def get_vocab(self, vocab: Iterable[str]): 14 | pass 15 | 16 | @abstractmethod 17 | def tokenize(self, text: str) -> List[int]: 18 | pass 19 | 20 | @abstractmethod 21 | def save(self, output_path: str): 22 | pass 23 | 24 | @staticmethod 25 | @abstractmethod 26 | def load(input_path: str): 27 | pass -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WhitespaceTokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/WordTokenizer.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/models/tokenizer/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str = '', texts: List[str] = None, label: Union[int, float] = 0): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | 13 | :param guid 14 | id for the example 15 | :param texts 16 | the texts for the example. Note, str.strip() is called on the texts 17 | :param label 18 | the label for the example 19 | """ 20 | self.guid = guid 21 | self.texts = texts 22 | self.label = label 23 | 24 | def __str__(self): 25 | return " label: {}, texts: {}".format(str(self.label), "; ".join(self.texts)) -------------------------------------------------------------------------------- /sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 
9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1, separator='\t'): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | self.separator = separator 16 | 17 | def get_examples(self, filename, max_examples=0): 18 | examples = [] 19 | 20 | id = 0 21 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 22 | splits = line.strip().split(self.separator) 23 | label = splits[self.label_col_idx] 24 | sentence = splits[self.sentence_col_idx] 25 | 26 | if label not in self.label_map: 27 | self.label_map[label] = len(self.label_map) 28 | 29 | label_id = self.label_map[label] 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 33 | 34 | if 0 < max_examples <= id: 35 | break 36 | 37 | return examples 38 | -------------------------------------------------------------------------------- /sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /sentence_transformers/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in a pair dataset that is split across two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """Reads parallel lines from the paired files and returns them as InputExamples.""" 17 | 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | The default values expect a tab-separated file where the first and second columns contain the sentence pair and the third column the score. The default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specifies which data split to use (train.csv, dev.csv, test.csv). 27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in columns 5 and 6, the score is in column 4.
49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter=delimiter, 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(texts=[s1, s2, s3])) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/InputExample.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/InputExample.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/InputExample.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/InputExample.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-36.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/LabelSentenceReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/NLIDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/NLIDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/NLIDataReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/NLIDataReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/STSDataReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/STSDataReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/STSDataReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/STSDataReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/TripletReader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/TripletReader.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/TripletReader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/TripletReader.cpython-38.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /sentence_transformers/readers/__pycache__/__init__.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/sentence_transformers/readers/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | 7 | 8 | setup( 9 | name="sentence-transformers", 10 | version="1.2.0", 11 | author="Nils Reimers", 12 | author_email="info@nils-reimers.de", 13 | description="Sentence Embeddings using BERT / RoBERTa / XLM-R", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | license="Apache License 2.0", 17 | url="https://github.com/UKPLab/sentence-transformers", 18 | download_url="https://github.com/UKPLab/sentence-transformers/archive/v1.2.0.zip", 19 | packages=find_packages(), 20 | install_requires=[ 21 | 'transformers>=3.1.0,<5.0.0', 22 | 'tqdm', 23 | 'torch>=1.6.0', 24 | 'torchvision', 25 | 'numpy', 26 | 'scikit-learn', 27 | 'scipy', 28 | 'nltk', 29 | 'sentencepiece' 30 | ], 31 | classifiers=[ 32 | "Development Status :: 4 - Beta", 33 | "Intended Audience :: Science/Research", 34 | "License :: OSI Approved :: Apache Software License", 35 | "Programming Language :: Python :: 3.6", 36 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 37 | ], 38 | keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning" 39 | ) 40 | -------------------------------------------------------------------------------- /training/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/training/.DS_Store -------------------------------------------------------------------------------- /training/data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/training/data/.DS_Store -------------------------------------------------------------------------------- /training/data/back_translated_nli.txt.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yanzhangnlp/BSL/cfacb3b2975cd5575f4b15aac028955f260425e4/training/data/back_translated_nli.txt.zip -------------------------------------------------------------------------------- /training/supervised_tuning.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | import math 3 | from sentence_transformers import models, losses 4 | from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample 5 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 6 | import logging 7 | from datetime import datetime 8 | import os 9 | import gzip 10 | import csv 11 | 12 | #### Just some code to print debug information to stdout 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | 
datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | #### /print debug information to stdout 18 | 19 | # Training parameters 20 | model_name = 'nli-roberta-base-v2' 21 | train_batch_size = 128 22 | num_epochs = 1 23 | max_seq_length = 64 24 | moving_average_decay = 0.9999 25 | nli_dataset_path = 'data/AllNLI' 26 | 27 | # Save path to store our model 28 | model_save_path = 'output/BSL_tuning-{}-{}-{}'.format(model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 29 | 30 | # Check if dataset exsist. If not, download and extract it 31 | sts_dataset_path = 'data/stsbenchmark.tsv.gz' 32 | 33 | if not os.path.exists(sts_dataset_path): 34 | util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path) 35 | 36 | nli_dataset_path = 'data/AllNLI.tsv.gz' 37 | if not os.path.exists(nli_dataset_path): 38 | util.http_get('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path) 39 | 40 | model = SentenceTransformer(model_name) 41 | 42 | # Read the AllNLI.tsv.gz file and create the training dataset 43 | logging.info("Read AllNLI train dataset") 44 | 45 | label2int = {"contradiction": 0, "entailment": 1, "neutral": 2} 46 | train_samples = [] 47 | count = 0 48 | with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn: 49 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 50 | for row in reader: 51 | if row['split'] == 'train': 52 | # label_id = label2int[row['label']] 53 | if row['label'] == "entailment": 54 | count += 1 55 | train_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']])) 56 | 57 | 58 | 59 | # Read STSbenchmark dataset and use it as development set 60 | logging.info("Read STSbenchmark dev dataset") 61 | dev_samples = [] 62 | test_samples = [] 63 | with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: 64 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 65 | for row in reader: 66 | score = float(row['score']) / 5.0 # Normalize score to range 0 ... 
1 67 | if row['split'] == 'dev': 68 | dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 69 | elif row['split'] == 'test': 70 | test_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 71 | 72 | dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev') 73 | test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test') 74 | 75 | train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True) 76 | train_loss = losses.BYOLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), moving_average_decay=moving_average_decay) 77 | 78 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up 79 | evaluation_steps = int(len(train_dataloader) * 0.1) #Evaluate every 10% of the data 80 | logging.info("Training sentences: {}".format(len(train_samples))) 81 | logging.info("Warmup-steps: {}".format(warmup_steps)) 82 | logging.info("Performance before training") 83 | dev_evaluator(model) 84 | 85 | # Train the model 86 | model.fit(train_objectives=[(train_dataloader, train_loss)], 87 | evaluator=dev_evaluator, 88 | epochs=num_epochs, 89 | evaluation_steps=evaluation_steps, 90 | warmup_steps=warmup_steps, 91 | output_path=model_save_path, 92 | optimizer_params={'lr': 5e-5}, 93 | use_amp=True #Set to True, if your GPU supports FP16 cores 94 | ) 95 | 96 | ############################################################################## 97 | # 98 | # Load the stored model and evaluate its performance on STS benchmark dataset 99 | # 100 | ############################################################################## 101 | 102 | 103 | model = SentenceTransformer(model_save_path) 104 | test_evaluator(model, output_path=model_save_path) 105 | -------------------------------------------------------------------------------- /training/unsupervised_tuning.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import DataLoader 2 | import math 3 | from sentence_transformers import models, losses 4 | from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample 5 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 6 | import logging 7 | from datetime import datetime 8 | import os 9 | import gzip 10 | import csv 11 | 12 | #### Just some code to print debug information to stdout 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | 18 | # Training parameters 19 | model_name = 'bert-base-uncased' 20 | train_batch_size = 64 21 | num_epochs = 1 22 | max_seq_length = 64 23 | #predictor_layer_num = 3 24 | moving_average_decay = 0.999 25 | un_nli_dataset_path = 'data/back_translated_nli.txt' 26 | 27 | # Save path to store our model 28 | model_save_path = 'output/BSL_tuning-{}-{}-{}'.format(model_name, train_batch_size, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")) 29 | 30 | # Check if dataset exsist. 
If not, download and extract it 31 | sts_dataset_path = 'data/stsbenchmark.tsv.gz' 32 | 33 | if not os.path.exists(sts_dataset_path): 34 | util.http_get('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path) 35 | 36 | 37 | word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length) 38 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension()) 39 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 40 | 41 | 42 | #train_samples is a list of InputExample objects where we pass the same sentence twice to texts, i.e. texts=[sent, sent] 43 | train_samples = [] 44 | with open(un_nli_dataset_path, 'r', encoding='utf8') as fIn: 45 | for line in fIn: 46 | line = line.strip() 47 | seg = line.strip().split('\t') 48 | train_samples.append(InputExample(texts=[seg[0], seg[1]])) 49 | 50 | 51 | # Read STSbenchmark dataset and use it as development set 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_samples = [] 54 | test_samples = [] 55 | with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn: 56 | reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE) 57 | for row in reader: 58 | score = float(row['score']) / 5.0 # Normalize score to range 0 ... 1 59 | if row['split'] == 'dev': 60 | dev_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 61 | elif row['split'] == 'test': 62 | test_samples.append(InputExample(texts=[row['sentence1'], row['sentence2']], label=score)) 63 | 64 | dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, batch_size=train_batch_size, name='sts-dev') 65 | test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, batch_size=train_batch_size, name='sts-test') 66 | 67 | train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=train_batch_size, drop_last=True) 68 | train_loss = losses.BYOLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), moving_average_decay=moving_average_decay) 69 | 70 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) # 10% of train data for warm-up 71 | evaluation_steps = int(len(train_dataloader) * 0.1) #Evaluate every 10% of the data 72 | logging.info("Training sentences: {}".format(len(train_samples))) 73 | logging.info("Warmup-steps: {}".format(warmup_steps)) 74 | logging.info("Performance before training") 75 | dev_evaluator(model) 76 | 77 | # Train the model 78 | model.fit(train_objectives=[(train_dataloader, train_loss)], 79 | evaluator=dev_evaluator, 80 | epochs=num_epochs, 81 | evaluation_steps=evaluation_steps, 82 | warmup_steps=warmup_steps, 83 | output_path=model_save_path, 84 | optimizer_params={'lr': 1e-4}, 85 | use_amp=True #Set to True, if your GPU supports FP16 cores 86 | ) 87 | 88 | ############################################################################## 89 | # 90 | # Load the stored model and evaluate its performance on STS benchmark dataset 91 | # 92 | ############################################################################## 93 | 94 | 95 | model = SentenceTransformer(model_save_path) 96 | test_evaluator(model, output_path=model_save_path) 97 | --------------------------------------------------------------------------------
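The modules dumped above (Transformer, WeightedLayerPooling, Pooling) are meant to be chained inside a SentenceTransformer. The following is a minimal usage sketch added by the editor, not part of the repository: the model name 'bert-base-uncased' and the example sentences are placeholders, and it assumes the sentence_transformers package contained in this repository is installed. model_args={'output_hidden_states': True} is needed so that Transformer.forward() exposes 'all_layer_embeddings' for WeightedLayerPooling.

from sentence_transformers import SentenceTransformer, models, util

# Token embeddings from a Huggingface model; output_hidden_states=True exposes all hidden layers
word_embedding_model = models.Transformer('bert-base-uncased', max_seq_length=64,
                                          model_args={'output_hidden_states': True})

# Weighted average over the hidden layers, starting at layer_start (default: the 4th layer)
weighted_pooling = models.WeightedLayerPooling(
    word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(),
    num_hidden_layers=word_embedding_model.auto_model.config.num_hidden_layers,
    layer_start=4)

# Mean pooling over the (re-weighted) token embeddings gives one vector per sentence
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())

model = SentenceTransformer(modules=[word_embedding_model, weighted_pooling, pooling_model])

embeddings = model.encode(['A sample sentence.', 'Another sample sentence.'], convert_to_tensor=True)
cos_scores = util.pytorch_cos_sim(embeddings, embeddings)  # pairwise cosine similarities
print(cos_scores[0][1])

Dropping the WeightedLayerPooling module gives the plain Transformer + mean-pooling stack that unsupervised_tuning.py builds.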
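After either training script finishes, the stored checkpoint can be reused directly for similarity scoring. A minimal sketch under the assumption that a run has completed: the path below stands in for whatever model_save_path your run produced, and the two sentences are placeholders.

from sentence_transformers import SentenceTransformer, util

# Placeholder path: use the model_save_path printed by supervised_tuning.py / unsupervised_tuning.py
model = SentenceTransformer('output/BSL_tuning-bert-base-uncased-64-<timestamp>')

emb1 = model.encode(['A man is playing a guitar.'], convert_to_tensor=True)
emb2 = model.encode(['Someone is playing an instrument.'], convert_to_tensor=True)

# Cosine similarity between the two sentence embeddings (higher means more similar)
print(util.pytorch_cos_sim(emb1, emb2))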