├── setup.cfg ├── requirements.txt ├── .gitignore ├── sentence_transformers ├── evaluation │ ├── SimilarityFunction.py │ ├── __init__.py │ ├── SequentialEvaluator.py │ ├── SentenceEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── TripletEvaluator.py │ ├── BinaryEmbeddingSimilarityEvaluator.py │ └── EmbeddingSimilarityEvaluator.py ├── losses │ ├── __init__.py │ ├── CosineSimilarityLoss.py │ ├── TripletLoss.py │ ├── MultipleNegativesRankingLoss.py │ ├── SoftmaxLoss.py │ └── test_batch_hard_triplet_loss.py ├── models │ ├── tokenizer │ │ ├── __init__.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ └── PhraseTokenizer.py │ ├── __init__.py │ ├── Dense.py │ ├── LSTM.py │ ├── CNN.py │ ├── BoW.py │ ├── WordWeights.py │ ├── Pooling.py │ ├── DistilBERT.py │ ├── RoBERTa.py │ ├── XLMRoBERTa.py │ ├── T5.py │ ├── BERT.py │ ├── CamemBERT.py │ ├── ALBERT.py │ ├── XLNet.py │ └── WordEmbeddings.py ├── readers │ ├── __init__.py │ ├── InputExample.py │ ├── LabelSentenceReader.py │ ├── TripletReader.py │ ├── STSDataReader.py │ └── NLIDataReader.py ├── __init__.py ├── LoggingHandler.py ├── util.py ├── data_samplers.py └── datasets.py ├── NOTICE.txt ├── examples ├── datasets │ ├── README.md │ └── get_data.py ├── basic_embedding.py ├── evaluation_stsbenchmark.py ├── application_clustering.py ├── application_semantic_search.py ├── training_stsbenchmark_continue_training.py ├── training_stsbenchmark_bert.py ├── training_stsbenchmark_xlnet.py ├── training_stsbenchmark_roberta.py ├── training_stsbenchmark_albert.py ├── training_stsbenchmark_distilbert.py ├── training_stsbenchmark_cnn.py ├── training_wikipedia_sections.py ├── training_nli_T5.py ├── training_stsbenchmark_bilstm.py ├── application_clustering_wikipedia_sections.py ├── training_nli_bert.py ├── training_nli_roberta.py ├── training_nli_albert.py ├── training_nli_xlm-roberta.py ├── training_nli_distilbert.py ├── training_stsbenchmark_avg_word_embeddings.py ├── training_stsbenchmark_bow.py └── training_stsbenchmark_tf-idf_word_embeddings.py ├── setup.py └── docs └── pretrained-models ├── sts-models.md ├── wikipedia-sections-models.md └── nli-models.md /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==2.3.0 2 | tqdm 3 | torch>=1.0.1 4 | numpy 5 | scikit-learn 6 | scipy 7 | nltk -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | examples/datasets/*/ 4 | examples/output 5 | sentence_transformers.egg-info 6 | dist/ 7 | examples_nr/ -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .BatchHardTripletLoss import * 4 | from 
.MultipleNegativesRankingLoss import * 5 | from .TripletLoss import * -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer -------------------------------------------------------------------------------- /sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Copyright 2019 3 | Ubiquitous Knowledge Processing (UKP) Lab 4 | Technische Universität Darmstadt 5 | ------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.5" 2 | __DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/' 3 | from .datasets import SentencesDataset, SentenceLabelDataset 4 | from .data_samplers import LabelSampler 5 | from .LoggingHandler import LoggingHandler 6 | from .SentenceTransformer import SentenceTransformer 7 | 8 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | 4 | from .BinaryEmbeddingSimilarityEvaluator import BinaryEmbeddingSimilarityEvaluator 5 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .SequentialEvaluator import SequentialEvaluator 8 | from .TripletEvaluator import TripletEvaluator 9 | -------------------------------------------------------------------------------- /sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) -------------------------------------------------------------------------------- /sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ALBERT import ALBERT 2 | from .BERT import BERT 3 | from .BoW import BoW 4 | from .CamemBERT import CamemBERT 5 | from .CNN import CNN 6 | from .Dense import Dense 7 | from .DistilBERT import 
DistilBERT 8 | from .LSTM import LSTM 9 | from .Pooling import Pooling 10 | from .RoBERTa import RoBERTa 11 | from .T5 import T5 12 | from .WordEmbeddings import WordEmbeddings 13 | from .WordWeights import WordWeights 14 | from .XLMRoBERTa import XLMRoBERTa 15 | from .XLNet import XLNet 16 | -------------------------------------------------------------------------------- /examples/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | This folder contains some example datasets that can be used to for training and evaluation of sentence embeddings methods. 3 | 4 | To download these datasets, run: 5 | ``` 6 | python get_data.py 7 | ``` 8 | 9 | It will download the datasets and unzip them into this directory. 10 | 11 | 12 | # AllNLI Dataset 13 | The AllNLI dataset is the concatenation of the SNLI dataset (https://nlp.stanford.edu/projects/snli/) and the MultiNLI dataset (https://www.nyu.edu/projects/bowman/multinli/). 14 | 15 | # STS Benchmark 16 | The STS Benchmark (http://ixa2.si.ehu.eus/stswiki) contains sentence pairs with human gold score for their similarity. 17 | -------------------------------------------------------------------------------- /sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str, texts: List[str], label: Union[int, float]): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | str.strip() is called on both texts. 13 | 14 | :param guid 15 | id for the example 16 | :param texts 17 | the texts for the example 18 | :param label 19 | the label for the example 20 | """ 21 | self.guid = guid 22 | self.texts = [text.strip() for text in texts] 23 | self.label = label 24 | -------------------------------------------------------------------------------- /examples/datasets/get_data.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import zipfile 3 | import os 4 | folder_path = os.path.dirname(os.path.realpath(__file__)) 5 | print('Beginning download of datasets') 6 | 7 | datasets = ['AllNLI.zip', 'stsbenchmark.zip', 'wikipedia-sections-triplets.zip'] 8 | server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/" 9 | 10 | for dataset in datasets: 11 | print("Download", dataset) 12 | url = server+dataset 13 | dataset_path = os.path.join(folder_path, dataset) 14 | urllib.request.urlretrieve(url, dataset_path) 15 | 16 | print("Extract", dataset) 17 | with zipfile.ZipFile(dataset_path, "r") as zip_ref: 18 | zip_ref.extractall(folder_path) 19 | os.remove(dataset_path) 20 | 21 | 22 | print("All datasets downloaded and extracted") 23 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 
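Illustrative usage, where the two sub-evaluators are placeholders for any SentenceEvaluator instances:

    evaluator = SequentialEvaluator([sts_dev_evaluator, triplet_dev_evaluator])
    model.evaluate(evaluator)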
8 | 9 | The score from the last sub-evaluator will be used as the main score for the best model decision. 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator]): 12 | self.evaluators = evaluators 13 | 14 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 15 | for evaluator in self.evaluators: 16 | main_score = evaluator(model, output_path, epoch, steps) 17 | 18 | return main_score 19 | -------------------------------------------------------------------------------- /sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | class CosineSimilarityLoss(nn.Module): 7 | def __init__(self, model: SentenceTransformer): 8 | super(CosineSimilarityLoss, self).__init__() 9 | self.model = model 10 | 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | rep_a, rep_b = reps 15 | 16 | output = torch.cosine_similarity(rep_a, rep_b) 17 | loss_fct = nn.MSELoss() 18 | 19 | if labels is not None: 20 | loss = loss_fct(output, labels.view(-1)) 21 | return loss 22 | else: 23 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /examples/basic_embedding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This basic example loads a pre-trained model from the web and uses it to 3 | generate sentence embeddings for a given list of sentences. 
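Each embedding is returned as a numpy array. As an illustrative follow-up (scipy is already a dependency of this package), two of the produced embeddings can be compared via cosine similarity:

    from scipy.spatial.distance import cosine
    similarity = 1 - cosine(sentence_embeddings[0], sentence_embeddings[1])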
4 | """ 5 | 6 | from sentence_transformers import SentenceTransformer, LoggingHandler 7 | import numpy as np 8 | import logging 9 | 10 | #### Just some code to print debug information to stdout 11 | np.set_printoptions(threshold=100) 12 | 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | #### /print debug information to stdout 18 | 19 | 20 | 21 | # Load Sentence model (based on BERT) from URL 22 | model = SentenceTransformer('bert-base-nli-mean-tokens') 23 | 24 | # Embed a list of sentences 25 | sentences = ['This framework generates embeddings for each input sentence', 26 | 'Sentences are passed as a list of string.', 27 | 'The quick brown fox jumps over the lazy dog.'] 28 | sentence_embeddings = model.encode(sentences) 29 | 30 | # The result is a list of sentence embeddings as numpy arrays 31 | for sentence, embedding in zip(sentences, sentence_embeddings): 32 | print("Sentence:", sentence) 33 | print("Embedding:", embedding) 34 | print("") 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | setup( 7 | name="sentence-transformers", 8 | version="0.2.5", 9 | author="Nils Reimers, Gregor Geigle", 10 | author_email="Rnils@web.de", 11 | description="Sentence Embeddings using BERT / RoBERTa / XLNet", 12 | long_description=readme, 13 | long_description_content_type="text/markdown", 14 | license="Apache License 2.0", 15 | url="https://github.com/UKPLab/sentence-transformers", 16 | download_url="https://github.com/UKPLab/sentence-transformers/archive/v0.2.5.zip", 17 | packages=find_packages(), 18 | install_requires=[ 19 | "transformers==2.3.0", 20 | "tqdm", 21 | "torch>=1.0.1", 22 | "numpy", 23 | "scikit-learn", 24 | "scipy", 25 | "nltk" 26 | ], 27 | classifiers=[ 28 | "Development Status :: 4 - Beta", 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Programming Language :: Python :: 3.6", 32 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 33 | ], 34 | keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning" 35 | ) 36 | -------------------------------------------------------------------------------- /sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 
9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | 16 | def get_examples(self, filename, max_examples=0): 17 | examples = [] 18 | 19 | id = 0 20 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 21 | splits = line.strip().split('\t') 22 | label = splits[self.label_col_idx] 23 | sentence = splits[self.sentence_col_idx] 24 | 25 | if label not in self.label_map: 26 | self.label_map[label] = len(self.label_map) 27 | 28 | label_id = self.label_map[label] 29 | guid = "%s-%d" % (filename, id) 30 | id += 1 31 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 32 | 33 | if 0 < max_examples <= id: 34 | break 35 | 36 | return examples -------------------------------------------------------------------------------- /examples/evaluation_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples loads a pre-trained model and evaluates it on the STSbenchmark dataset 3 | """ 4 | from torch.utils.data import DataLoader 5 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 6 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 7 | from sentence_transformers.readers import STSDataReader 8 | import numpy as np 9 | import logging 10 | 11 | 12 | #### Just some code to print debug information to stdout 13 | np.set_printoptions(threshold=100) 14 | 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | 22 | 23 | # Load a named sentence model (based on BERT). This will download the model from our server. 
24 | # Alternatively, you can also pass a filepath to SentenceTransformer() 25 | model = SentenceTransformer('bert-base-nli-mean-tokens') 26 | 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | 29 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 30 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 31 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 32 | 33 | model.evaluate(evaluator) 34 | -------------------------------------------------------------------------------- /sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin=1): 18 | super(TripletLoss, self).__init__() 19 | self.model = model 20 | self.distance_metric = distance_metric 21 | self.triplet_margin = triplet_margin 22 | 23 | 24 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 25 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 26 | 27 | rep_anchor, rep_pos, rep_neg = reps 28 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 29 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 30 | 31 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 32 | return losses.mean() -------------------------------------------------------------------------------- /sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2, s3], label=1)) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /examples/application_clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: clustering 3 | 4 | Sentences are mapped to sentence embeddings and then k-mean clustering is applied. 5 | """ 6 | from sentence_transformers import SentenceTransformer 7 | from sklearn.cluster import KMeans 8 | 9 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 10 | 11 | # Corpus with example sentences 12 | corpus = ['A man is eating food.', 13 | 'A man is eating a piece of bread.', 14 | 'A man is eating pasta.', 15 | 'The girl is carrying a baby.', 16 | 'The baby is carried by the woman', 17 | 'A man is riding a horse.', 18 | 'A man is riding a white horse on an enclosed ground.', 19 | 'A monkey is playing drums.', 20 | 'Someone in a gorilla costume is playing a set of drums.', 21 | 'A cheetah is running behind its prey.', 22 | 'A cheetah chases prey on across a field.' 23 | ] 24 | corpus_embeddings = embedder.encode(corpus) 25 | 26 | # Perform kmean clustering 27 | num_clusters = 5 28 | clustering_model = KMeans(n_clusters=num_clusters) 29 | clustering_model.fit(corpus_embeddings) 30 | cluster_assignment = clustering_model.labels_ 31 | 32 | clustered_sentences = [[] for i in range(num_clusters)] 33 | for sentence_id, cluster_id in enumerate(cluster_assignment): 34 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 35 | 36 | for i, cluster in enumerate(clustered_sentences): 37 | print("Cluster ", i+1) 38 | print(cluster) 39 | print("") 40 | -------------------------------------------------------------------------------- /sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. 
Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | """ 10 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 11 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 12 | self.dataset_folder = dataset_folder 13 | self.score_col_idx = score_col_idx 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.delimiter = delimiter 17 | self.quoting = quoting 18 | self.normalize_scores = normalize_scores 19 | self.min_score = min_score 20 | self.max_score = max_score 21 | 22 | def get_examples(self, filename, max_examples=0): 23 | """ 24 | filename specified which data split to use (train.csv, dev.csv, test.csv). 25 | """ 26 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), 27 | delimiter=self.delimiter, quoting=self.quoting) 28 | examples = [] 29 | for id, row in enumerate(data): 30 | score = float(row[self.score_col_idx]) 31 | if self.normalize_scores: # Normalize to a 0...1 value 32 | score = (score - self.min_score) / (self.max_score - self.min_score) 33 | 34 | s1 = row[self.s1_col_idx] 35 | s2 = row[self.s2_col_idx] 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 37 | 38 | if max_examples > 0 and len(examples) >= max_examples: 39 | break 40 | 41 | return examples 42 | -------------------------------------------------------------------------------- /sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /examples/application_semantic_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: semantic search 3 | 4 | We have a corpus with various sentences. 
Then, for a given query sentence, 5 | we want to find the most similar sentence in this corpus. 6 | 7 | This script outputs for various queries the top 5 most similar sentences in the corpus. 8 | """ 9 | from sentence_transformers import SentenceTransformer 10 | import scipy.spatial 11 | 12 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 13 | 14 | # Corpus with example sentences 15 | corpus = ['A man is eating food.', 16 | 'A man is eating a piece of bread.', 17 | 'The girl is carrying a baby.', 18 | 'A man is riding a horse.', 19 | 'A woman is playing violin.', 20 | 'Two men pushed carts through the woods.', 21 | 'A man is riding a white horse on an enclosed ground.', 22 | 'A monkey is playing drums.', 23 | 'A cheetah is running behind its prey.' 24 | ] 25 | corpus_embeddings = embedder.encode(corpus) 26 | 27 | # Query sentences: 28 | queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.'] 29 | query_embeddings = embedder.encode(queries) 30 | 31 | # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity 32 | closest_n = 5 33 | for query, query_embedding in zip(queries, query_embeddings): 34 | distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0] 35 | 36 | results = zip(range(len(distances)), distances) 37 | results = sorted(results, key=lambda x: x[1]) 38 | 39 | print("\n\n======================\n\n") 40 | print("Query:", query) 41 | print("\nTop 5 most similar sentences in corpus:") 42 | 43 | for idx, distance in results[0:closest_n]: 44 | print(corpus[idx].strip(), "(Score: %.4f)" % (1-distance)) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MultipleNegativesRankingLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from ..SentenceTransformer import SentenceTransformer 6 | 7 | class MultipleNegativesRankingLoss(nn.Module): 8 | def __init__(self, model: SentenceTransformer): 9 | super(MultipleNegativesRankingLoss, self).__init__() 10 | self.model = model 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | 15 | reps_a, reps_b = reps 16 | return self.multiple_negatives_ranking_loss(reps_a, reps_b) 17 | 18 | # Multiple Negatives Ranking Loss 19 | # Paper: https://arxiv.org/pdf/1705.00652.pdf 20 | # Efficient Natural Language Response Suggestion for Smart Reply 21 | # Section 4.4 22 | def multiple_negatives_ranking_loss(self, embeddings_a: Tensor, embeddings_b: Tensor): 23 | """ 24 | Compute the loss over a batch with two embeddings per example. 25 | 26 | Each pair is a positive example. The negative examples are all other embeddings in embeddings_b with each embedding 27 | in embedding_a. 
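Concretely, for the dot-product score matrix S = embeddings_a @ embeddings_b.T of shape (batch_size, batch_size), the code below computes

    loss = -mean(diag(S)) + mean(logsumexp(S, dim=1))

which equals the mean softmax cross-entropy of each row of S against its own index, i.e. each embedding in embeddings_a must score its paired embedding in embeddings_b higher than all other entries in the batch.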
28 | 29 | See the paper for more information: https://arxiv.org/pdf/1705.00652.pdf 30 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 31 | 32 | :param embeddings_a: 33 | Tensor of shape (batch_size, embedding_dim) 34 | :param embeddings_b: 35 | Tensor of shape (batch_size, embedding_dim) 36 | :return: 37 | The scalar loss 38 | """ 39 | scores = torch.matmul(embeddings_a, embeddings_b.t()) 40 | diagonal_mean = torch.mean(torch.diag(scores)) 41 | mean_log_row_sum_exp = torch.mean(torch.logsumexp(scores, dim=1)) 42 | return -diagonal_mean + mean_log_row_sum_exp 43 | -------------------------------------------------------------------------------- /sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 15 | """ 16 | def __init__(self, in_features, out_features, bias=True, activation_function=nn.Tanh()): 17 | super(Dense, self).__init__() 18 | self.in_features = in_features 19 | self.out_features = out_features 20 | self.bias = bias 21 | self.activation_function = activation_function 22 | self.linear = nn.Linear(in_features, out_features, bias=bias) 23 | 24 | def forward(self, features: Dict[str, Tensor]): 25 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 26 | return features 27 | 28 | def get_sentence_embedding_dimension(self) -> int: 29 | return self.out_features 30 | 31 | def save(self, output_path): 32 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 33 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 34 | 35 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 36 | 37 | @staticmethod 38 | def load(input_path): 39 | with open(os.path.join(input_path, 'config.json')) as fIn: 40 | config = json.load(fIn) 41 | 42 | config['activation_function'] = import_from_string(config['activation_function'])() 43 | model = Dense(**config) 44 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'))) 45 | return model 46 | -------------------------------------------------------------------------------- /docs/pretrained-models/sts-models.md: -------------------------------------------------------------------------------- 1 | # STS Models 2 | The models were first trained on [NLI data](nli-models.md), then we fine-tuned them on the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). This generate sentence embeddings that are especially suitable to measure the semantic similarity between sentence pairs. 3 | 4 | # Datasets 5 | We use the training file from the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). 
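A minimal sketch of how that training file is read with this repository's `STSDataReader` (gold scores in the 0-5 range are rescaled to 0-1 so they can be used directly with `CosineSimilarityLoss`); the dataset path assumes the data was fetched with `examples/datasets/get_data.py`:

```
from sentence_transformers.readers import STSDataReader

sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True)
train_examples = sts_reader.get_examples('sts-train.csv')
```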
6 | 7 | For a training example, see: 8 | - [examples/training_stsbenchmark.py](../../examples/training_stsbenchmark_bert.py) - Train directly on STS data 9 | - [examples/training_stsbenchmark_continue_training.py ](../../examples/training_stsbenchmark_continue_training.py) - First train one NLI, than train on STS data. 10 | 11 | # Pre-trained models 12 | We provide the following pre-trained models: 13 | 14 | ### BERT models 15 | - **bert-base-nli-stsb-mean-tokens**: BERT-base trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 85.14 16 | - **bert-large-nli-stsb-mean-tokens**: BERT-large trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 85.29 17 | 18 | ### RoBERTa models 19 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 20 | - **roberta-base-nli-stsb-mean-tokens**: RoBERTa-base trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 85.40 21 | - **roberta-large-nli-stsb-mean-tokens**: RoBERTa-large trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 86.31 22 | 23 | ### DistilBERT 24 | DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 25 | - **distilbert-base-nli-stsb-mean-tokens**: Performance: STSbenchmark: 84.38 26 | 27 | # Performance Comparison 28 | Here are the performances on the STS benchmark for other sentence embeddings methods. They were also computed by using cosine-similarity and Spearman rank correlation. Note, these models were not-fined on the STS benchmark. 29 | 30 | - Avg. GloVe embeddings: 58.02 31 | - BERT-as-a-service avg. embeddings: 46.35 32 | - BERT-as-a-service CLS-vector: 16.50 33 | - InferSent - GloVe: 68.03 34 | - Universal Sentence Encoder: 74.92 35 | -------------------------------------------------------------------------------- /docs/pretrained-models/wikipedia-sections-models.md: -------------------------------------------------------------------------------- 1 | # Wikipedia Sections Models 2 | The `wikipedia-sections-models` implement the idea from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 3 | 4 | It was trained with a triplet-loss: The anchor and the positive example were sentences from the same section from an wikipedia article, for example, from the History section of the London article. The negative example came from a different section from the same article, for example, from the Education section of the London article. 5 | 6 | # Dataset 7 | We use dataset from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 8 | 9 | See [examples/training_wikipedia_sections.py](../../examples/training_wikipedia_sections.py) for how to train on this dataset. 10 | 11 | 12 | # Pre-trained models 13 | We provide the following pre-trained models: 14 | 15 | - **bert-base-wikipedia-sections-mean-tokens**: 80.42% accuracy on test set. 
16 | 17 | You can use them in the following way: 18 | ``` 19 | from sentence_transformers import SentenceTransformer 20 | embedder = SentenceTransformer('pretrained-model-name') 21 | ``` 22 | 23 | # Performance Comparison 24 | Performance (accuracy) reported by Dor et al.: 25 | - mean-vectors: 0.65 26 | - skip-thoughts-CS: 0.615 27 | - skip-thoughts-SICK: 0.547 28 | - triplet-sen: 0.74 29 | 30 | 31 | # Applications 32 | The models achieve a rather low performance on the STS benchmark dataset. The reason for this is the training objective: An anchor, a positive and a negative example are presented. The network must only learn to differentiate what the positive and what the negative example is by ensuring that the negative example is further away from the anchor than the positive example. 33 | 34 | However, it does not matter how far the negative example is away, it can be little or really far away. This makes this model rather bad for deciding if a pair is somewhat similar. It learns only to recognize similar pairs (high scores) and dissimilar pairs (low scores). 35 | 36 | However, this model works well for **fine-grained clustering**. 37 | 38 | For an example, see: 39 | [examples/application_clustering_wikipedia_sections.py](../../examples/application_clustering_wikipedia_sections.py) 40 | 41 | 42 | -------------------------------------------------------------------------------- /sentence_transformers/losses/SoftmaxLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | import logging 6 | 7 | class SoftmaxLoss(nn.Module): 8 | def __init__(self, 9 | model: SentenceTransformer, 10 | sentence_embedding_dimension: int, 11 | num_labels: int, 12 | concatenation_sent_rep: bool = True, 13 | concatenation_sent_difference: bool = True, 14 | concatenation_sent_multiplication: bool = False): 15 | super(SoftmaxLoss, self).__init__() 16 | self.model = model 17 | self.num_labels = num_labels 18 | self.concatenation_sent_rep = concatenation_sent_rep 19 | self.concatenation_sent_difference = concatenation_sent_difference 20 | self.concatenation_sent_multiplication = concatenation_sent_multiplication 21 | 22 | num_vectors_concatenated = 0 23 | if concatenation_sent_rep: 24 | num_vectors_concatenated += 2 25 | if concatenation_sent_difference: 26 | num_vectors_concatenated += 1 27 | if concatenation_sent_multiplication: 28 | num_vectors_concatenated += 1 29 | logging.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated)) 30 | self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels) 31 | 32 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 33 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 34 | rep_a, rep_b = reps 35 | 36 | vectors_concat = [] 37 | if self.concatenation_sent_rep: 38 | vectors_concat.append(rep_a) 39 | vectors_concat.append(rep_b) 40 | 41 | if self.concatenation_sent_difference: 42 | vectors_concat.append(torch.abs(rep_a - rep_b)) 43 | 44 | if self.concatenation_sent_multiplication: 45 | vectors_concat.append(rep_a * rep_b) 46 | 47 | features = torch.cat(vectors_concat, 1) 48 | 49 | output = self.classifier(features) 50 | loss_fct = nn.CrossEntropyLoss() 51 | 52 | if labels is not None: 53 | loss = loss_fct(output, labels.view(-1)) 54 | 
return loss 55 | else: 56 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class LSTM(nn.Module): 15 | """Bidirectional LSTM running over word embeddings. 
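The forward and backward hidden states are concatenated, so the produced token embeddings have dimension 2 * hidden_dim (see get_word_embedding_dimension()). An illustrative sketch of stacking the layer between a word-embedding module and a pooling module; everything except the LSTM constructor itself is an assumption here, see examples/training_stsbenchmark_bilstm.py for a complete script:

    lstm = LSTM(word_embedding_dimension=300, hidden_dim=1024)
    pooling = Pooling(lstm.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling])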
16 | """ 17 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout'] 20 | self.word_embedding_dimension = word_embedding_dimension 21 | self.hidden_dim = hidden_dim 22 | self.num_layers = num_layers 23 | self.dropout = dropout 24 | 25 | self.embeddings_dimension = 2*hidden_dim 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=True, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | for kernel_size in kernel_sizes: 29 | padding_size = int((kernel_size - 1) / 2) 30 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 31 | padding=padding_size) 32 | self.convs.append(conv) 33 | 34 | def forward(self, features): 35 | token_embeddings = features['token_embeddings'] 36 | 37 | token_embeddings = token_embeddings.transpose(1, -1) 38 
| vectors = [conv(token_embeddings) for conv in self.convs] 39 | out = torch.cat(vectors, 1).transpose(1, -1) 40 | 41 | features.update({'token_embeddings': out}) 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.embeddings_dimension 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | raise NotImplementedError() 49 | 50 | def save(self, output_path: str): 51 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 52 | json.dump(self.get_config_dict(), fOut, indent=2) 53 | 54 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 55 | 56 | def get_config_dict(self): 57 | return {key: self.__dict__[key] for key in self.config_keys} 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 65 | model = CNN(**config) 66 | model.load_state_dict(weights) 67 | return model 68 | 69 | -------------------------------------------------------------------------------- /sentence_transformers/util.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from torch import Tensor, device 3 | from typing import Tuple, List 4 | from tqdm import tqdm 5 | import sys 6 | import importlib 7 | 8 | 9 | def batch_to_device(batch, target_device: device): 10 | """ 11 | send a batch to a device 12 | 13 | :param batch: 14 | :param target_device: 15 | :return: the batch sent to the device 16 | """ 17 | features = batch['features'] 18 | for paired_sentence_idx in range(len(features)): 19 | for feature_name in features[paired_sentence_idx]: 20 | features[paired_sentence_idx][feature_name] = features[paired_sentence_idx][feature_name].to(target_device) 21 | 22 | labels = batch['labels'].to(target_device) 23 | return features, labels 24 | 25 | 26 | 27 | def http_get(url, path): 28 | file_binary = open(path, "wb") 29 | req = requests.get(url, stream=True) 30 | if req.status_code != 200: 31 | print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr) 32 | req.raise_for_status() 33 | 34 | content_length = req.headers.get('Content-Length') 35 | total = int(content_length) if content_length is not None else None 36 | progress = tqdm(unit="B", total=total, unit_scale=True) 37 | for chunk in req.iter_content(chunk_size=1024): 38 | if chunk: # filter out keep-alive new chunks 39 | progress.update(len(chunk)) 40 | file_binary.write(chunk) 41 | progress.close() 42 | 43 | 44 | def fullname(o): 45 | # o.__module__ + "." + o.__class__.__qualname__ is an example in 46 | # this context of H.L. Mencken's "neat, plausible, and wrong." 47 | # Python makes no guarantees as to whether the __module__ special 48 | # attribute is defined, so we take a more circumspect approach. 49 | # Alas, the module name is explicitly excluded from __qualname__ 50 | # in Python 3. 51 | 52 | module = o.__class__.__module__ 53 | if module is None or module == str.__class__.__module__: 54 | return o.__class__.__name__ # Avoid reporting __builtin__ 55 | else: 56 | return module + '.' + o.__class__.__name__ 57 | 58 | def import_from_string(dotted_path): 59 | """ 60 | Import a dotted module path and return the attribute/class designated by the 61 | last name in the path. Raise ImportError if the import failed. 
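Illustrative example; Dense.load() uses this to restore an activation function from the dotted name written out by fullname():

    >>> import_from_string('torch.nn.modules.activation.Tanh')
    <class 'torch.nn.modules.activation.Tanh'>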
62 | """ 63 | try: 64 | module_path, class_name = dotted_path.rsplit('.', 1) 65 | except ValueError: 66 | msg = "%s doesn't look like a module path" % dotted_path 67 | raise ImportError(msg) 68 | 69 | module = importlib.import_module(module_path) 70 | 71 | try: 72 | return getattr(module, class_name) 73 | except AttributeError: 74 | msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name) 75 | raise ImportError(msg) -------------------------------------------------------------------------------- /sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | 10 | class LabelAccuracyEvaluator(SentenceEvaluator): 11 | """ 12 | Evaluate a model based on its accuracy on a labeled dataset 13 | 14 | This requires a model with LossFunction.SOFTMAX 15 | 16 | The results are written in a CSV. If a CSV already exists, then values are appended. 17 | """ 18 | 19 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None): 20 | """ 21 | Constructs an evaluator for the given dataset 22 | 23 | :param dataloader: 24 | the data for the evaluation 25 | """ 26 | self.dataloader = dataloader 27 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | self.name = name 29 | self.softmax_model = softmax_model 30 | self.softmax_model.to(self.device) 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 36 | self.csv_headers = ["epoch", "steps", "accuracy"] 37 | 38 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 39 | model.eval() 40 | total = 0 41 | correct = 0 42 | 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logging.info("Evaluation on the "+self.name+" dataset"+out_txt) 52 | self.dataloader.collate_fn = model.smart_batching_collate 53 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 54 | features, label_ids = batch_to_device(batch, self.device) 55 | with torch.no_grad(): 56 | _, prediction = self.softmax_model(features, labels=None) 57 | 58 | total += prediction.size(0) 59 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 60 | accuracy = correct/total 61 | 62 | logging.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 63 | 64 | if output_path is not None: 65 | csv_path = os.path.join(output_path, self.csv_file) 66 | if not os.path.isfile(csv_path): 67 | with open(csv_path, mode="w", encoding="utf-8") as f: 68 | writer = csv.writer(f) 69 | writer.writerow(self.csv_headers) 70 | writer.writerow([epoch, steps, accuracy]) 71 | else: 72 | with open(csv_path, mode="a", encoding="utf-8") as f: 73 | writer = csv.writer(f) 74 | writer.writerow([epoch, steps, accuracy]) 75 | 76 | return accuracy -------------------------------------------------------------------------------- /examples/training_stsbenchmark_continue_training.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example loads the pre-trained bert-base-nli-mean-tokens models from the server. 
3 | It then fine-tunes this model for some epochs on the STS benchmark dataset. 4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | model_name = 'bert-base-nli-mean-tokens' 23 | train_batch_size = 16 24 | num_epochs = 4 25 | model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 26 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 27 | 28 | # Load a pre-trained sentence transformer model 29 | model = SentenceTransformer(model_name) 30 | 31 | # Convert the dataset to a DataLoader ready for training 32 | logging.info("Read STSbenchmark train dataset") 33 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 34 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 35 | train_loss = losses.CosineSimilarityLoss(model=model) 36 | 37 | 38 | logging.info("Read STSbenchmark dev dataset") 39 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 40 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 41 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 42 | 43 | 44 | # Configure the training. We skip evaluation in this example 45 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 46 | logging.info("Warmup-steps: {}".format(warmup_steps)) 47 | 48 | 49 | # Train the model 50 | model.fit(train_objectives=[(train_dataloader, train_loss)], 51 | evaluator=evaluator, 52 | epochs=num_epochs, 53 | evaluation_steps=1000, 54 | warmup_steps=warmup_steps, 55 | output_path=model_save_path) 56 | 57 | 58 | ############################################################################## 59 | # 60 | # Load the stored model and evaluate its performance on STS benchmark dataset 61 | # 62 | ############################################################################## 63 | 64 | model = SentenceTransformer(model_save_path) 65 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 66 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 67 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 68 | model.evaluate(evaluator) 69 | -------------------------------------------------------------------------------- /sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | class BoW(nn.Module): 12 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 13 | 14 | A weighting can be added to allow the generation of tf-idf vectors. 
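Illustrative construction with a tiny vocabulary and made-up idf weights:

    bow = BoW(vocab=['nice', 'weather', 'today'],
              word_weights={'nice': 1.3, 'weather': 2.8, 'today': 0.7})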
The output vector has the size of the vocab. 15 | """ 16 | 17 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 18 | super(BoW, self).__init__() 19 | vocab = list(set(vocab)) #Ensure vocab is unique 20 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 21 | self.vocab = vocab 22 | self.word_weights = word_weights 23 | self.unknown_word_weight = unknown_word_weight 24 | self.cumulative_term_frequency = cumulative_term_frequency 25 | 26 | #Maps wordIdx -> word weight 27 | self.weights = [] 28 | num_unknown_words = 0 29 | for word in vocab: 30 | weight = unknown_word_weight 31 | if word in word_weights: 32 | weight = word_weights[word] 33 | elif word.lower() in word_weights: 34 | weight = word_weights[word.lower()] 35 | else: 36 | num_unknown_words += 1 37 | self.weights.append(weight) 38 | 39 | logging.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 40 | 41 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 42 | self.sentence_embedding_dimension = len(vocab) 43 | 44 | 45 | def forward(self, features: Dict[str, Tensor]): 46 | #Nothing to do, everything is done in get_sentence_features 47 | return features 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | return self.tokenizer.tokenize(text) 51 | 52 | def get_sentence_embedding_dimension(self): 53 | return self.sentence_embedding_dimension 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | #return {'input_ids': tokens} 57 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 58 | for token in tokens: 59 | if self.cumulative_term_frequency: 60 | vector[token] += self.weights[token] 61 | else: 62 | vector[token] = self.weights[token] 63 | 64 | return {'sentence_embedding': vector} 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path): 70 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 71 | json.dump(self.get_config_dict(), fOut, indent=2) 72 | 73 | @staticmethod 74 | def load(input_path): 75 | with open(os.path.join(input_path, 'config.json')) as fIn: 76 | config = json.load(fIn) 77 | 78 | return BoW(**config) -------------------------------------------------------------------------------- /docs/pretrained-models/nli-models.md: -------------------------------------------------------------------------------- 1 | # NLI Models 2 | Conneau et al., 2017, show in the InferSent paper ([Supervised Learning of Universal Sentence Representations from Natural Language Inference Data](https://arxiv.org/abs/1705.02364)) that training on Natural Language Inference (NLI) data can produce universal sentence embeddings. 3 | 4 | The datasets contain sentence pairs labeled as *entail*, *contradict*, or *neutral*. For each of the two sentences, we compute a sentence embedding. These two embeddings are concatenated and passed to a softmax classifier to derive the final label. 5 | 6 | As shown, this produces sentence embeddings that can be used for various use cases such as clustering or semantic search. 7 | 8 | # Datasets 9 | We train the models on the [SNLI](https://nlp.stanford.edu/projects/snli/) and the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) datasets. We call the combination of the two datasets AllNLI.
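A minimal usage sketch for the pre-trained models listed below (assuming the package is installed and the model can be fetched from the download server; the example sentences are arbitrary):

```python
from sentence_transformers import SentenceTransformer

# One of the pre-trained NLI models listed in this document
model = SentenceTransformer('bert-base-nli-mean-tokens')

# encode() returns one embedding per input sentence
embeddings = model.encode(['A man is eating food.', 'A man is riding a horse.'])
print(len(embeddings), len(embeddings[0]))  # 2 embeddings, 768 dimensions each for a BERT-base model
```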
10 | 11 | For a training example, see [examples/training_nli_bert.py](../../examples/training_nli_bert.py). 12 | 13 | # Pre-trained models 14 | We provide the following pre-trained models. The performance was evaluated on the test set of the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) using Spearman rank correlation. 15 | 16 | 17 | ### BERT models 18 | - **bert-base-nli-mean-tokens**: BERT-base model with mean-tokens pooling. Performance: STSbenchmark: 77.12 19 | - **bert-base-nli-max-tokens**: BERT-base with max-tokens pooling. Performance: STSbenchmark: 77.18 20 | - **bert-base-nli-cls-token**: BERT-base with CLS-token pooling. Performance: STSbenchmark: 76.30 21 | - **bert-large-nli-mean-tokens**: BERT-large with mean-tokens pooling. Performance: STSbenchmark: 79.19 22 | - **bert-large-nli-max-tokens**: BERT-large with max-tokens pooling. Performance: STSbenchmark: 78.32 23 | - **bert-large-nli-cls-token**: BERT-large with CLS-token pooling. Performance: STSbenchmark: 78.29 24 | 25 | ### RoBERTa models 26 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 27 | - **roberta-base-nli-mean-tokens**: RoBERTa-base with mean-tokens pooling. Performance: STSbenchmark: 77.42 28 | - **roberta-large-nli-mean-tokens**: RoBERTa-large with mean-tokens pooling. Performance: STSbenchmark: 78.58 29 | 30 | ### DistilBERT models 31 | DistilBERT is a small, fast, cheap, and light Transformer model based on the BERT architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 32 | - **distilbert-base-nli-mean-tokens**: DistilBERT-base with mean-tokens pooling. Performance: STSbenchmark: 76.97 33 | 34 | # Performance Comparison 35 | For comparison, here is the performance of other sentence embedding methods on the STS benchmark, also computed using cosine similarity and Spearman rank correlation: 36 | - Avg. GloVe embeddings: 58.02 37 | - BERT-as-a-service avg. embeddings: 46.35 38 | - BERT-as-a-service CLS-vector: 16.50 39 | - InferSent - GloVe: 68.03 40 | - Universal Sentence Encoder: 74.92 41 | 42 | # Applications 43 | These models work well for assessing the coarse-grained similarity between sentences. For application examples, see [examples/application_semantic_search.py](../../examples/application_semantic_search.py) and [examples/application_clustering.py](../../examples/application_clustering.py) -------------------------------------------------------------------------------- /sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | class WordWeights(nn.Module): 10 | """This model can weight word embeddings, for example, with idf-values.""" 11 | 12 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 13 | """ 14 | 15 | :param vocab: 16 | Vocabulary of the tokenizer 17 | :param word_weights: 18 | Mapping of tokens to a float weight value. Word embeddings are multiplied by this float value. The keys of word_weights do not have to match the vocab exactly; it can contain more or fewer entries. 19 | :param unknown_word_weight: 20 | Weight for words in the vocab that do not appear in the word_weights lookup. These can be, for example, rare words for which no weight exists.
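        Example (a minimal sketch; the toy vocabulary and idf values below are made-up placeholders, and it assumes WordWeights is exposed via the sentence_transformers.models namespace):

            from sentence_transformers import models

            vocab = ['hello', 'world', 'rare_token']
            idf = {'hello': 1.2, 'world': 0.8}  # 'rare_token' is missing and falls back to unknown_word_weight
            word_weights = models.WordWeights(vocab=vocab, word_weights=idf, unknown_word_weight=1.0)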
21 | """ 22 | super(WordWeights, self).__init__() 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | 28 | weights = [] 29 | num_unknown_words = 0 30 | for word in vocab: 31 | weight = unknown_word_weight 32 | if word in word_weights: 33 | weight = word_weights[word] 34 | elif word.lower() in word_weights: 35 | weight = word_weights[word.lower()] 36 | else: 37 | num_unknown_words += 1 38 | weights.append(weight) 39 | 40 | logging.info("{} of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 41 | 42 | self.emb_layer = nn.Embedding(len(vocab), 1) 43 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 44 | 45 | 46 | def forward(self, features: Dict[str, Tensor]): 47 | input_mask = features['input_mask'] 48 | token_embeddings = features['token_embeddings'] 49 | 50 | #Compute a weight value for each token 51 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 52 | token_weights = token_weights_raw * input_mask.float() 53 | token_weights_sum = torch.sum(token_weights, 1) 54 | 55 | #Multiply embedding by token weight value 56 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 57 | token_embeddings = token_embeddings * token_weights_expanded 58 | 59 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 60 | return features 61 | 62 | def get_config_dict(self): 63 | return {key: self.__dict__[key] for key in self.config_keys} 64 | 65 | def save(self, output_path): 66 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | @staticmethod 70 | def load(input_path): 71 | with open(os.path.join(input_path, 'config.json')) as fIn: 72 | config = json.load(fIn) 73 | 74 | return WordWeights(**config) -------------------------------------------------------------------------------- /sentence_transformers/data_samplers.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains sampler functions, that can be used to sample mini-batches with specific properties. 3 | """ 4 | from torch.utils.data import Sampler 5 | import numpy as np 6 | from .datasets import SentenceLabelDataset 7 | 8 | 9 | class LabelSampler(Sampler): 10 | """ 11 | This sampler is used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS 12 | or MULTIPLE_NEGATIVES_RANKING_LOSS which require multiple or only one sample from one label per batch. 13 | 14 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 15 | 16 | Labels with fewer than n unique samples are ignored. 17 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 18 | 19 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 20 | by the samples drawn per label. 21 | 22 | 23 | """ 24 | def __init__(self, data_source: SentenceLabelDataset, samples_per_label: int = 5, 25 | with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 
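        Example (a minimal sketch; assumes `train_data` is an already constructed SentenceLabelDataset):

            from torch.utils.data import DataLoader
            from sentence_transformers import LabelSampler

            sampler = LabelSampler(train_data, samples_per_label=2)
            train_dataloader = DataLoader(train_data, sampler=sampler, batch_size=16)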
28 | 29 | :param data_source: 30 | the dataset from which samples are drawn 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__(data_source) 39 | self.data_source = data_source 40 | self.samples_per_label = samples_per_label 41 | self.label_range = np.arange(data_source.num_labels) 42 | self.borders = data_source.labels_right_border 43 | self.with_replacement = with_replacement 44 | np.random.shuffle(self.label_range) 45 | 46 | def __iter__(self): 47 | label_idx = 0 48 | count = 0 49 | already_seen = {} 50 | while count < len(self.data_source): 51 | label = self.label_range[label_idx] 52 | if label not in already_seen: 53 | already_seen[label] = [] 54 | 55 | left_border = 0 if label == 0 else self.borders[label-1] 56 | right_border = self.borders[label] 57 | 58 | if self.with_replacement: 59 | selection = np.arange(left_border, right_border) 60 | else: 61 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 62 | 63 | if len(selection) >= self.samples_per_label: 64 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 65 | count += 1 66 | already_seen[label].append(element_idx) 67 | yield element_idx 68 | 69 | label_idx += 1 70 | if label_idx >= len(self.label_range): 71 | label_idx = 0 72 | np.random.shuffle(self.label_range) 73 | 74 | def __len__(self): 75 | return len(self.data_source) -------------------------------------------------------------------------------- /examples/training_stsbenchmark_bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
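Sketch of the cosine comparison mentioned above (assuming `model` is the SentenceTransformer trained by this script; scipy is listed in requirements.txt):

    from scipy.spatial.distance import cosine
    emb1, emb2 = model.encode(['A man is eating food.', 'A man is eating a piece of bread.'])
    cosine_similarity = 1 - cosine(emb1, emb2)  # 1 - cosine distance = cosine similarity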
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_bert-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use BERT for mapping tokens to embeddings 28 | word_embedding_model = models.BERT('bert-base-uncased') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_xlnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains XLNet for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_xlnet-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use XLNet for mapping tokens to embeddings 28 | word_embedding_model = models.XLNet('xlnet-base-cased') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_roberta.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains RoBERTa for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_roberta-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use RoBERTa-base for mapping tokens to embeddings 28 | word_embedding_model = models.RoBERTa('roberta-base') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_albert.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains ALBERT for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | train_batch_size = 16 24 | num_epochs = 4 25 | model_save_path = 'output/training_stsbenchmark_albert-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 26 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 27 | 28 | # Use RoBERTa-base for mapping tokens to embeddings 29 | word_embedding_model = models.ALBERT('albert-base-v2') 30 | 31 | # Apply mean pooling to get one fixed sized sentence vector 32 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 33 | pooling_mode_mean_tokens=True, 34 | pooling_mode_cls_token=False, 35 | pooling_mode_max_tokens=False) 36 | 37 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 38 | 39 | # Convert the dataset to a DataLoader ready for training 40 | logging.info("Read STSbenchmark train dataset") 41 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 42 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 43 | train_loss = losses.CosineSimilarityLoss(model=model) 44 | 45 | 46 | logging.info("Read STSbenchmark dev dataset") 47 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 48 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 49 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 50 | 51 | 52 | # Configure the training. We skip evaluation in this example 53 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 54 | logging.info("Warmup-steps: {}".format(warmup_steps)) 55 | 56 | 57 | # Train the model 58 | model.fit(train_objectives=[(train_dataloader, train_loss)], 59 | evaluator=evaluator, 60 | epochs=num_epochs, 61 | evaluation_steps=1000, 62 | warmup_steps=warmup_steps, 63 | output_path=model_save_path) 64 | 65 | 66 | ############################################################################## 67 | # 68 | # Load the stored model and evaluate its performance on STS benchmark dataset 69 | # 70 | ############################################################################## 71 | 72 | model = SentenceTransformer(model_save_path) 73 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 74 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 75 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 76 | model.evaluate(evaluator) 77 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_distilbert.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains DistilBERT for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_distilbert-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use DistilBERT-base for mapping tokens to embeddings 28 | word_embedding_model = models.DistilBERT('distilbert-base-uncased') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_cnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a CNN after the word embedding lookup. The output of the CNN is than pooled, 3 | for example with mean-pooling. 
4 | 5 | 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to vectors using BERT 32 | word_embedding_model = models.BERT('bert-base-uncased') 33 | 34 | cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), out_channels=256, kernel_sizes=[1,3,5]) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(cnn.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script trains sentence transformers with a triplet loss function. 
3 | 4 | As corpus, we use the wikipedia sections dataset that was describd by Dor et al., 2018, Learning Thematic Similarity Metric Using Triplet Networks. 5 | 6 | See docs/pretrained-models/wikipedia-sections-modesl.md for further details. 7 | 8 | You can get the dataset by running examples/datasets/get_data.py 9 | """ 10 | 11 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 12 | from torch.utils.data import DataLoader 13 | from sentence_transformers.readers import TripletReader 14 | from sentence_transformers.evaluation import TripletEvaluator 15 | from datetime import datetime 16 | 17 | import csv 18 | import logging 19 | 20 | 21 | 22 | logging.basicConfig(format='%(asctime)s - %(message)s', 23 | datefmt='%Y-%m-%d %H:%M:%S', 24 | level=logging.INFO, 25 | handlers=[LoggingHandler()]) 26 | 27 | 28 | 29 | ### Create a torch.DataLoader that passes training batch instances to our model 30 | train_batch_size = 16 31 | triplet_reader = TripletReader('datasets/wikipedia-sections-triplets', s1_col_idx=1, s2_col_idx=2, s3_col_idx=3, delimiter=',', quoting=csv.QUOTE_MINIMAL, has_header=True) 32 | output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 33 | num_epochs = 1 34 | 35 | 36 | ### Configure sentence transformers for training and train on the provided dataset 37 | # Use BERT for mapping tokens to embeddings 38 | word_embedding_model = models.BERT('bert-base-uncased') 39 | 40 | # Apply mean pooling to get one fixed sized sentence vector 41 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 42 | pooling_mode_mean_tokens=True, 43 | pooling_mode_cls_token=False, 44 | pooling_mode_max_tokens=False) 45 | 46 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 47 | 48 | 49 | logging.info("Read Triplet train dataset") 50 | train_data = SentencesDataset(examples=triplet_reader.get_examples('train.csv'), model=model) 51 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 52 | train_loss = losses.TripletLoss(model=model) 53 | 54 | logging.info("Read Wikipedia Triplet dev dataset") 55 | dev_data = SentencesDataset(examples=triplet_reader.get_examples('validation.csv', 1000), model=model) 56 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 57 | evaluator = TripletEvaluator(dev_dataloader) 58 | 59 | 60 | warmup_steps = int(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data 61 | 62 | 63 | # Train the model 64 | model.fit(train_objectives=[(train_dataloader, train_loss)], 65 | evaluator=evaluator, 66 | epochs=num_epochs, 67 | evaluation_steps=1000, 68 | warmup_steps=warmup_steps, 69 | output_path=output_path) 70 | 71 | ############################################################################## 72 | # 73 | # Load the stored model and evaluate its performance on STS benchmark dataset 74 | # 75 | ############################################################################## 76 | 77 | model = SentenceTransformer(output_path) 78 | test_data = SentencesDataset(examples=triplet_reader.get_examples('test.csv'), model=model) 79 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 80 | evaluator = TripletEvaluator(test_dataloader) 81 | 82 | model.evaluate(evaluator) 83 | 84 | -------------------------------------------------------------------------------- /examples/training_nli_T5.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | The system trains T5 on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | 23 | # Read the dataset 24 | model_name = 't5-small' 25 | batch_size = 16 26 | nli_reader = NLIDataReader('datasets/AllNLI') 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | train_num_labels = nli_reader.get_num_labels() 29 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Use BERT for mapping tokens to embeddings 34 | word_embedding_model = models.T5(model_name) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 43 | 44 | 45 | # Convert the dataset to a DataLoader ready for training 46 | logging.info("Read AllNLI train dataset") 47 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 48 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 49 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 50 | 51 | 52 | 53 | logging.info("Read STSbenchmark dev dataset") 54 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 55 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 56 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 57 | 58 | # Configure the training 59 | num_epochs = 1 60 | 61 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | evaluation_steps=1000, 71 | warmup_steps=warmup_steps, 72 | output_path=model_save_path 73 | ) 74 | 75 | 76 | 77 | ############################################################################## 78 | # 79 | # Load the stored model and evaluate its performance on STS benchmark dataset 80 | # 81 | ############################################################################## 82 | 83 | model = SentenceTransformer(model_save_path) 84 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 85 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 86 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 87 | 88 | 
model.evaluate(evaluator) 89 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_bilstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to traditional word embeddings like GloVe 32 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 33 | 34 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=1024) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(lstm.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=False, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=True) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), 
model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/application_clustering_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples clusters different sentences that come from the same wikipedia article. 3 | 4 | It uses the 'wikipedia-sections' model, a model that was trained to differentiate if two sentences from the 5 | same article come from the same section or from different sections in that article. 6 | """ 7 | from sentence_transformers import SentenceTransformer 8 | from sklearn.cluster import AgglomerativeClustering 9 | 10 | 11 | 12 | embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens') 13 | 14 | #Sentences and sections are from Wikipeda. 15 | #Source: https://en.wikipedia.org/wiki/Bushnell,_Illinois 16 | corpus = [ 17 | ("Bushnell is located at 40°33′6″N 90°30′29″W (40.551667, -90.507921).", "Geography"), 18 | ("According to the 2010 census, Bushnell has a total area of 2.138 square miles (5.54 km2), of which 2.13 square miles (5.52 km2) (or 99.63%) is land and 0.008 square miles (0.02 km2) (or 0.37%) is water.", "Geography"), 19 | 20 | ("The town was founded in 1854 when the Northern Cross Railroad built a line through the area.", "History"), 21 | ("Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. ", "History"), 22 | ("Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.", "History"), 23 | 24 | ("As of the census[6] of 2000, there were 3,221 people, 1,323 households, and 889 families residing in the city. ", "Demographics"), 25 | ("The population density was 1,573.9 people per square mile (606.7/km²).", "Demographics"), 26 | ("There were 1,446 housing units at an average density of 706.6 per square mile (272.3/km²).", "Demographics"), 27 | 28 | ("From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.", "Music"), 29 | ("Each year around the 4th of July, 25,000 people from all over the world would descend on the small farm town to watch over 300 bands, authors and artists perform at the Cornerstone Farm Campgrounds.", "Music"), 30 | ("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"), 31 | ("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"), 32 | ("Cornerstone held its final festival in 2012 and no longer operates.", "Music"), 33 | 34 | ("Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.", "Horse show"), 35 | ("The show was well known for imported European horses.", "Horse show"), 36 | ("The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. 
Teams have come from many different states and Canada to compete.", "Horse show"), 37 | ] 38 | 39 | sentences = [row[0] for row in corpus] 40 | 41 | corpus_embeddings = embedder.encode(sentences) 42 | num_clusters = len(set([row[1] for row in corpus])) 43 | 44 | #Sklearn clustering 45 | km = AgglomerativeClustering(n_clusters=num_clusters) 46 | km.fit(corpus_embeddings) 47 | 48 | cluster_assignment = km.labels_ 49 | 50 | 51 | clustered_sentences = [[] for i in range(num_clusters)] 52 | for sentence_id, cluster_id in enumerate(cluster_assignment): 53 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 54 | 55 | for i, cluster in enumerate(clustered_sentences): 56 | print("Cluster ", i+1) 57 | for row in cluster: 58 | print("(Gold label: {}) - {}".format(row[1], row[0])) 59 | print("") 60 | 61 | -------------------------------------------------------------------------------- /examples/training_nli_bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system trains BERT on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'bert-base-uncased' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('datasets/AllNLI') 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use BERT for mapping tokens to embeddings 33 | word_embedding_model = models.BERT(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 48 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 1 59 | 60 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 
0.1) #10% of train data for warm-up 61 | logging.info("Warmup-steps: {}".format(warmup_steps)) 62 | 63 | 64 | 65 | # Train the model 66 | model.fit(train_objectives=[(train_dataloader, train_loss)], 67 | evaluator=evaluator, 68 | epochs=num_epochs, 69 | evaluation_steps=1000, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) 88 | -------------------------------------------------------------------------------- /examples/training_nli_roberta.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system RoBERTa trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'roberta-large' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('datasets/AllNLI') 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use RoBERTa for mapping tokens to embeddings 33 | word_embedding_model = models.RoBERTa(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 48 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = 
EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 1 59 | 60 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 61 | logging.info("Warmup-steps: {}".format(warmup_steps)) 62 | 63 | 64 | # Train the model 65 | model.fit(train_objectives=[(train_dataloader, train_loss)], 66 | evaluator=evaluator, 67 | epochs=num_epochs, 68 | evaluation_steps=1000, 69 | warmup_steps=warmup_steps, 70 | output_path=model_save_path 71 | ) 72 | 73 | 74 | 75 | ############################################################################## 76 | # 77 | # Load the stored model and evaluate its performance on STS benchmark dataset 78 | # 79 | ############################################################################## 80 | 81 | model = SentenceTransformer(model_save_path) 82 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 83 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 84 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 85 | 86 | model.evaluate(evaluator) 87 | -------------------------------------------------------------------------------- /examples/training_nli_albert.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system ALBERT trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | 16 | #### Just some code to print debug information to stdout 17 | logging.basicConfig(format='%(asctime)s - %(message)s', 18 | datefmt='%Y-%m-%d %H:%M:%S', 19 | level=logging.INFO, 20 | handlers=[LoggingHandler()]) 21 | #### /print debug information to stdout 22 | 23 | # Read the dataset 24 | model_name = 'albert-base-v2' 25 | batch_size = 16 26 | nli_reader = NLIDataReader('datasets/AllNLI') 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | train_num_labels = nli_reader.get_num_labels() 29 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Use ALBERT for mapping tokens to embeddings 34 | word_embedding_model = models.ALBERT(model_name) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 43 | 44 | 45 | # Convert the dataset to a DataLoader ready for training 46 | logging.info("Read AllNLI train dataset") 47 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 48 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 49 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 50 | 51 | 52 | 53 | logging.info("Read STSbenchmark dev dataset") 54 | dev_data = 
SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 55 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 56 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 57 | 58 | # Configure the training 59 | num_epochs = 1 60 | 61 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | # Train the model 66 | model.fit(train_objectives=[(train_dataloader, train_loss)], 67 | evaluator=evaluator, 68 | epochs=num_epochs, 69 | evaluation_steps=1000, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) 88 | -------------------------------------------------------------------------------- /examples/training_nli_xlm-roberta.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system XLM-RoBERTa trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | 23 | # Read the dataset 24 | model_name = 'xlm-roberta-base' 25 | batch_size = 16 26 | nli_reader = NLIDataReader('datasets/AllNLI') 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | train_num_labels = nli_reader.get_num_labels() 29 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Use XLM-RoBERTa for mapping tokens to embeddings 34 | word_embedding_model = models.XLMRoBERTa(model_name) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 43 | 44 | 45 | # Convert the dataset to a DataLoader ready for training 46 | logging.info("Read AllNLI train dataset") 47 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 48 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 49 | train_loss = losses.SoftmaxLoss(model=model, 

sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 50 | 51 | 52 | 53 | logging.info("Read STSbenchmark dev dataset") 54 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 55 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 56 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 57 | 58 | # Configure the training 59 | num_epochs = 1 60 | 61 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | evaluation_steps=1000, 71 | warmup_steps=warmup_steps, 72 | output_path=model_save_path 73 | ) 74 | 75 | 76 | 77 | ############################################################################## 78 | # 79 | # Load the stored model and evaluate its performance on STS benchmark dataset 80 | # 81 | ############################################################################## 82 | 83 | model = SentenceTransformer(model_save_path) 84 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 85 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 86 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 87 | 88 | model.evaluate(evaluator) 89 | -------------------------------------------------------------------------------- /examples/training_nli_distilbert.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system DistilBERT trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. 
At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'distilbert-base-uncased' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('datasets/AllNLI') 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use DistilBERT for mapping tokens to embeddings 33 | word_embedding_model = models.DistilBERT(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 48 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 1 59 | 60 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 61 | logging.info("Warmup-steps: {}".format(warmup_steps)) 62 | 63 | 64 | # Train the model 65 | model.fit(train_objectives=[(train_dataloader, train_loss)], 66 | evaluator=evaluator, 67 | epochs=num_epochs, 68 | evaluation_steps=1000, 69 | warmup_steps=warmup_steps, 70 | output_path=model_save_path 71 | ) 72 | 73 | 74 | 75 | ############################################################################## 76 | # 77 | # Load the stored model and evaluate its performance on STS benchmark dataset 78 | # 79 | ############################################################################## 80 | 81 | model = SentenceTransformer(model_save_path) 82 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 83 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 84 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 85 | 86 | model.evaluate(evaluator) 87 | -------------------------------------------------------------------------------- 
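All of the NLI training scripts above rely on losses.SoftmaxLoss. As a rough, self-contained sketch of that objective (illustrative only, not the library's exact implementation; the class name, shapes and random inputs below are made up), the loss concatenates the two sentence embeddings u and v with their element-wise difference |u - v| and trains a linear classifier over the NLI labels:

import torch
from torch import nn

class SoftmaxLossSketch(nn.Module):
    """Toy version of the softmax objective: classify a sentence pair from (u, v, |u - v|)."""
    def __init__(self, sentence_embedding_dimension: int, num_labels: int):
        super().__init__()
        # 3 * dim because u, v and |u - v| are concatenated before classification
        self.classifier = nn.Linear(3 * sentence_embedding_dimension, num_labels)
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, u: torch.Tensor, v: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        features = torch.cat([u, v, torch.abs(u - v)], dim=1)
        return self.loss_fct(self.classifier(features), labels)

# Quick check with random 768-dim embeddings and 3 labels (entailment/neutral/contradiction)
sketch = SoftmaxLossSketch(sentence_embedding_dimension=768, num_labels=3)
loss_value = sketch(torch.rand(4, 768), torch.rand(4, 768), torch.randint(0, 3, (4,)))

The actual losses.SoftmaxLoss additionally runs the SentenceTransformer model to produce u and v from the tokenized batch; only the classification head is sketched here.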
/sentence_transformers/models/tokenizer/WordTokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | 4 | ENGLISH_STOP_WORDS = ['!', '"', "''", "``", '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldn', 'couldnt', 'cry', 'd', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'hadn', 'has', 'hasn', 'hasnt', 'have', 'haven', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'ltd', 'm', 'ma', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mightn', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'mustn', 'my', 'myself', 'name', 'namely', 'needn', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'shan', 'she', 'should', 'shouldn', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 've', 'very', 'via', 'was', 'wasn', 'we', 'well', 'were', 'weren', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'won', 'would', 'wouldn', 'y', 'yet', 'you', 'your', 'yours', 'yourself', 
'yourselves'] 5 | 6 | 7 | class WordTokenizer(ABC): 8 | @abstractmethod 9 | def set_vocab(self, vocab: Iterable[str]): 10 | pass 11 | 12 | @abstractmethod 13 | def get_vocab(self, vocab: Iterable[str]): 14 | pass 15 | 16 | @abstractmethod 17 | def tokenize(self, text: str) -> List[int]: 18 | pass 19 | 20 | @abstractmethod 21 | def save(self, output_path: str): 22 | pass 23 | 24 | @staticmethod 25 | @abstractmethod 26 | def load(input_path: str): 27 | pass -------------------------------------------------------------------------------- /examples/training_stsbenchmark_avg_word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses average word embeddings (for example from GloVe). It adds two fully-connected feed-forward layers (dense layers) to create a Deep Averaging Network (DAN). 3 | 4 | If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server. 5 | 6 | See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/ 7 | for available word embeddings files 8 | """ 9 | import torch 10 | from torch.utils.data import DataLoader 11 | import math 12 | from sentence_transformers import models, losses 13 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import * 16 | import logging 17 | from datetime import datetime 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | # Read the dataset 27 | batch_size = 32 28 | sts_reader = STSDataReader('datasets/stsbenchmark') 29 | model_save_path = 'output/training_stsbenchmark_avg_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Map tokens to traditional word embeddings like GloVe 34 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | # Add two trainable feed-forward networks (DAN) 43 | sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension() 44 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 45 | dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 46 | 47 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dan1, dan2]) 48 | 49 | 50 | # Convert the dataset to a DataLoader ready for training 51 | logging.info("Read STSbenchmark train dataset") 52 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 53 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 54 | train_loss = losses.CosineSimilarityLoss(model=model) 55 | 56 | logging.info("Read STSbenchmark dev dataset") 57 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 58 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 59 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 60 | 61 | # Configure the training 62 | 
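# Illustrative arithmetic for the warm-up formula below (the example counts are made up):
# with 5,000 training pairs, num_epochs = 10 and batch_size = 32,
# ceil(5000 * 10 / 32 * 0.1) = ceil(156.25) = 157 warm-up steps, i.e. the learning rate is
# increased from zero over roughly the first 10% of the training steps before the regular
# schedule takes over.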
num_epochs = 10 63 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 64 | logging.info("Warmup-steps: {}".format(warmup_steps)) 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /sentence_transformers/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 14 | """ 15 | def __init__(self, 16 | word_embedding_dimension: int, 17 | pooling_mode_cls_token: bool = False, 18 | pooling_mode_max_tokens: bool = False, 19 | pooling_mode_mean_tokens: bool = True, 20 | pooling_mode_mean_sqrt_len_tokens: bool = False, 21 | ): 22 | super(Pooling, self).__init__() 23 | 24 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 25 | 26 | self.word_embedding_dimension = word_embedding_dimension 27 | self.pooling_mode_cls_token = pooling_mode_cls_token 28 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 29 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 30 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 31 | 32 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 33 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 34 | 35 | def forward(self, features: Dict[str, Tensor]): 36 | token_embeddings = features['token_embeddings'] 37 | cls_token = features['cls_token_embeddings'] 38 | input_mask = features['input_mask'] 39 | 40 | ## Pooling strategy 41 | output_vectors = [] 42 | if self.pooling_mode_cls_token: 43 | output_vectors.append(cls_token) 44 | if self.pooling_mode_max_tokens: 45 | input_mask_expanded = input_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 46 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 47 | max_over_time = torch.max(token_embeddings, 1)[0] 48 | output_vectors.append(max_over_time) 49 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 50 | input_mask_expanded = 
input_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 51 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 52 | 53 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 54 | if 'token_weights_sum' in features: 55 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 56 | else: 57 | sum_mask = input_mask_expanded.sum(1) 58 | 59 | sum_mask = torch.clamp(sum_mask, min=1e-9) 60 | 61 | if self.pooling_mode_mean_tokens: 62 | output_vectors.append(sum_embeddings / sum_mask) 63 | if self.pooling_mode_mean_sqrt_len_tokens: 64 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 65 | 66 | output_vector = torch.cat(output_vectors, 1) 67 | features.update({'sentence_embedding': output_vector}) 68 | return features 69 | 70 | def get_sentence_embedding_dimension(self): 71 | return self.pooling_output_dimension 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return Pooling(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import DistilBertModel, DistilBertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class DistilBERT(nn.Module): 11 | """DistilBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from DistilBERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(DistilBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | 26 | 27 | self.bert = DistilBertModel.from_pretrained(model_name_or_path) 28 | self.tokenizer = DistilBertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 29 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 30 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | # DistilBERT does not use token_type_ids 35 | output_tokens = self.bert(input_ids=features['input_ids'], attention_mask=features['input_mask'])[0] 36 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 37 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 38 | return features 39 | 40 | def get_word_embedding_dimension(self) -> int: 41 | return self.bert.config.hidden_size 42 | 43 | def tokenize(self, text: str) -> List[int]: 44 | """ 45 | Tokenizes a text and maps tokens to token-ids 46 | """ 47 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 48 | 49 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 50 | """ 51 | Convert tokenized sentence in its embedding ids, segment ids and mask 52 | 53 | :param tokens: 54 | a tokenized sentence 55 | :param pad_seq_length: 56 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 57 | :return: embedding ids, segment ids and mask for the sentence 58 | """ 59 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 60 | 61 | tokens = tokens[:pad_seq_length] 62 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] 63 | sentence_length = len(input_ids) 64 | 65 | pad_seq_length += 2 ##Add Space for CLS + SEP token 66 | 67 | 68 | input_mask = [1] * len(input_ids) 69 | 70 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 71 | padding = [0] * (pad_seq_length - len(input_ids)) 72 | input_ids += padding 73 | 74 | input_mask += padding 75 | 76 | assert len(input_ids) == pad_seq_length 77 | assert len(input_mask) == pad_seq_length 78 | 79 | 80 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 81 | 82 | def get_config_dict(self): 83 | return {key: self.__dict__[key] for key in self.config_keys} 84 | 85 | def save(self, output_path: str): 86 | self.bert.save_pretrained(output_path) 87 | self.tokenizer.save_pretrained(output_path) 88 | 89 | with open(os.path.join(output_path, 'sentence_distilbert_config.json'), 'w') as fOut: 90 | json.dump(self.get_config_dict(), fOut, indent=2) 91 | 92 | @staticmethod 93 | def load(input_path: str): 94 | with open(os.path.join(input_path, 'sentence_distilbert_config.json')) as fIn: 95 | config = json.load(fIn) 96 | return DistilBERT(model_name_or_path=input_path, **config) 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import RobertaModel, RobertaTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class RoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(RoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 511: 21 | logging.warning("RoBERTa only allows a max_seq_length of 511 (514 with special tokens). 
Value will be set to 511") 22 | max_seq_length = 511 23 | self.max_seq_length = max_seq_length 24 | 25 | 26 | self.roberta = RobertaModel.from_pretrained(model_name_or_path) 27 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 28 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 29 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | #RoBERTa does not use token_type_ids 34 | output_tokens = self.roberta(input_ids=features['input_ids'], token_type_ids=None, attention_mask=features['input_mask'])[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 37 | return features 38 | 39 | def get_word_embedding_dimension(self) -> int: 40 | return self.roberta.config.hidden_size 41 | 42 | def tokenize(self, text: str) -> List[int]: 43 | """ 44 | Tokenizes a text and maps tokens to token-ids 45 | """ 46 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 47 | 48 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 49 | """ 50 | Convert tokenized sentence in its embedding ids, segment ids and mask 51 | 52 | :param tokens: 53 | a tokenized sentence 54 | :param pad_seq_length: 55 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 56 | :return: embedding ids, segment ids and mask for the sentence 57 | """ 58 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 59 | 60 | tokens = tokens[:pad_seq_length] 61 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] + [self.sep_token_id] 62 | sentence_length = len(input_ids) 63 | 64 | pad_seq_length += 3 ##Add Space for CLS + SEP + SEP token 65 | 66 | input_mask = [1] * len(input_ids) 67 | 68 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 69 | padding = [0] * (pad_seq_length - len(input_ids)) 70 | input_ids += padding 71 | 72 | input_mask += padding 73 | 74 | assert len(input_ids) == pad_seq_length 75 | assert len(input_mask) == pad_seq_length 76 | 77 | 78 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 79 | 80 | def get_config_dict(self): 81 | return {key: self.__dict__[key] for key in self.config_keys} 82 | 83 | def save(self, output_path: str): 84 | self.roberta.save_pretrained(output_path) 85 | self.tokenizer.save_pretrained(output_path) 86 | 87 | with open(os.path.join(output_path, 'sentence_roberta_config.json'), 'w') as fOut: 88 | json.dump(self.get_config_dict(), fOut, indent=2) 89 | 90 | @staticmethod 91 | def load(input_path: str): 92 | with open(os.path.join(input_path, 'sentence_roberta_config.json')) as fIn: 93 | config = json.load(fIn) 94 | return RoBERTa(model_name_or_path=input_path, **config) 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLMRobertaModel, XLMRobertaTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class XLMRoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(XLMRoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path) 20 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 21 | 22 | if max_seq_length > self.tokenizer.max_len_single_sentence: 23 | logging.warning("XLM-RoBERTa only allows a max_seq_length of "+self.tokenizer.max_len_single_sentence) 24 | max_seq_length = self.tokenizer.max_len_single_sentence 25 | self.max_seq_length = max_seq_length 26 | 27 | 28 | self.cls_token_id = self.tokenizer.cls_token_id 29 | self.eos_token_id = self.tokenizer.eos_token_id 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | #RoBERTa does not use token_type_ids 34 | output_tokens = self.xlm_roberta(input_ids=features['input_ids'], token_type_ids=None, attention_mask=features['input_mask'])[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 37 | return features 38 | 39 | def get_word_embedding_dimension(self) -> int: 40 | return self.xlm_roberta.config.hidden_size 41 | 42 | def tokenize(self, text: str) -> List[int]: 43 | """ 44 | Tokenizes a text and maps tokens to token-ids 45 | """ 46 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 47 | 48 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 49 | """ 50 | Convert tokenized sentence in its embedding ids, segment ids and mask 51 | 52 | :param tokens: 53 | a tokenized sentence 54 | :param pad_seq_length: 
55 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 56 | :return: embedding ids, segment ids and mask for the sentence 57 | """ 58 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 59 | 60 | tokens = tokens[:pad_seq_length] 61 | input_ids = [self.cls_token_id] + tokens + [self.eos_token_id] 62 | sentence_length = len(input_ids) 63 | 64 | pad_seq_length += 3 ##Add Space for CLS + SEP + SEP token 65 | 66 | input_mask = [1] * len(input_ids) 67 | 68 | # Zero-pad up to the sequence length. BERT: Pad to the right 69 | padding = [0] * (pad_seq_length - len(input_ids)) 70 | input_ids += padding 71 | 72 | input_mask += padding 73 | 74 | assert len(input_ids) == pad_seq_length 75 | assert len(input_mask) == pad_seq_length 76 | 77 | 78 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 79 | 80 | def get_config_dict(self): 81 | return {key: self.__dict__[key] for key in self.config_keys} 82 | 83 | def save(self, output_path: str): 84 | self.xlm_roberta.save_pretrained(output_path) 85 | self.tokenizer.save_pretrained(output_path) 86 | 87 | with open(os.path.join(output_path, 'sentence_xlm-roberta_config.json'), 'w') as fOut: 88 | json.dump(self.get_config_dict(), fOut, indent=2) 89 | 90 | @staticmethod 91 | def load(input_path: str): 92 | with open(os.path.join(input_path, 'sentence_xlm-roberta_config.json')) as fIn: 93 | config = json.load(fIn) 94 | return XLMRoBERTa(model_name_or_path=input_path, **config) 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /sentence_transformers/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class T5(nn.Module): 10 | """T5 model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 15 | super(T5, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 512: 20 | logging.warning("T5 only allows a max_seq_length of 512. 
Value will be set to 512") 21 | max_seq_length = 512 22 | self.max_seq_length = max_seq_length 23 | 24 | self.enc_model = T5Model.from_pretrained(model_name_or_path) 25 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 26 | #self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 27 | #self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 28 | 29 | def forward(self, features): 30 | """Returns token_embeddings, cls_token""" 31 | output_tokens = self.enc_model(input_ids=features['input_ids'], attention_mask=features['input_mask'])[0] 32 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 33 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 34 | return features 35 | 36 | def get_word_embedding_dimension(self) -> int: 37 | return self.enc_model.config.hidden_size 38 | 39 | def tokenize(self, text: str) -> List[int]: 40 | """ 41 | Tokenizes a text and maps tokens to token-ids 42 | """ 43 | return self.tokenizer.encode(text) 44 | 45 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 46 | """ 47 | Convert tokenized sentence in its embedding ids, segment ids and mask 48 | 49 | :param tokens: 50 | a tokenized sentence 51 | :param pad_seq_length: 52 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 53 | :return: embedding ids, segment ids and mask for the sentence 54 | """ 55 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 56 | 57 | tokens = tokens[:pad_seq_length] 58 | input_ids = tokens #[self.cls_token_id] + tokens + [self.sep_token_id] 59 | sentence_length = len(input_ids) 60 | 61 | #pad_seq_length += 2 ##Add Space for CLS + SEP token 62 | 63 | token_type_ids = [0] * len(input_ids) 64 | input_mask = [1] * len(input_ids) 65 | 66 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 67 | padding = [0] * (pad_seq_length - len(input_ids)) 68 | input_ids += padding 69 | token_type_ids += padding 70 | input_mask += padding 71 | 72 | assert len(input_ids) == pad_seq_length 73 | assert len(input_mask) == pad_seq_length 74 | assert len(token_type_ids) == pad_seq_length 75 | 76 | 77 | return { 78 | 'input_ids': np.asarray(input_ids, dtype=np.int64), 79 | 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 80 | 'input_mask': np.asarray(input_mask, dtype=np.int64), 81 | 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64) 82 | } 83 | 84 | def get_config_dict(self): 85 | return {key: self.__dict__[key] for key in self.config_keys} 86 | 87 | def save(self, output_path: str): 88 | self.enc_model.save_pretrained(output_path) 89 | self.tokenizer.save_pretrained(output_path) 90 | 91 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 92 | json.dump(self.get_config_dict(), fOut, indent=2) 93 | 94 | @staticmethod 95 | def load(input_path: str): 96 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 97 | config = json.load(fIn) 98 | return T5(model_name_or_path=input_path, **config) 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import BertModel, BertTokenizer 3 | import json 4 | from typing import List 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class BERT(nn.Module): 10 | """BERT model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 15 | super(BERT, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 510: 20 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 21 | max_seq_length = 510 22 | self.max_seq_length = max_seq_length 23 | 24 | self.bert = BertModel.from_pretrained(model_name_or_path) 25 | self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 26 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 27 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 28 | 29 | def forward(self, features): 30 | """Returns token_embeddings, cls_token""" 31 | output_tokens = self.bert(input_ids=features['input_ids'], token_type_ids=features['token_type_ids'], attention_mask=features['input_mask'])[0] 32 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 33 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 34 | return features 35 | 36 | def get_word_embedding_dimension(self) -> int: 37 | return self.bert.config.hidden_size 38 | 39 | def tokenize(self, text: str) -> List[int]: 40 | """ 41 | Tokenizes a text and maps tokens to token-ids 42 | """ 43 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 44 | 45 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 46 | """ 47 | Convert tokenized sentence in its embedding ids, segment ids and mask 48 | 49 | :param tokens: 50 | a tokenized sentence 51 | :param pad_seq_length: 52 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 53 | :return: embedding ids, segment ids and mask for the sentence 54 | """ 55 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 56 | 57 | tokens = tokens[:pad_seq_length] 58 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] 59 | sentence_length = len(input_ids) 60 | 61 | pad_seq_length += 2 ##Add Space for CLS + SEP token 62 | 63 | token_type_ids = [0] * len(input_ids) 64 | input_mask = [1] * len(input_ids) 65 | 66 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 67 | padding = [0] * (pad_seq_length - len(input_ids)) 68 | input_ids += padding 69 | token_type_ids += padding 70 | input_mask += padding 71 | 72 | assert len(input_ids) == pad_seq_length 73 | assert len(input_mask) == pad_seq_length 74 | assert len(token_type_ids) == pad_seq_length 75 | 76 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 77 | 78 | def get_config_dict(self): 79 | return {key: self.__dict__[key] for key in self.config_keys} 80 | 81 | def save(self, output_path: str): 82 | self.bert.save_pretrained(output_path) 83 | self.tokenizer.save_pretrained(output_path) 84 | 85 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 86 | json.dump(self.get_config_dict(), fOut, indent=2) 87 | 88 | @staticmethod 89 | def load(input_path: str): 90 | with open(os.path.join(input_path, 'sentence_bert_config.json')) as fIn: 91 | config = json.load(fIn) 92 | return BERT(model_name_or_path=input_path, **config) 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import CamembertModel, CamembertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | 11 | class CamemBERT(nn.Module): 12 | """CamemBERT model to generate token embeddings. 13 | 14 | Each token is mapped to an output vector from CamemBERT. 15 | """ 16 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 17 | super(CamemBERT, self).__init__() 18 | self.config_keys = ['max_seq_length', 'do_lower_case'] 19 | self.do_lower_case = do_lower_case 20 | 21 | if max_seq_length > 511: 22 | logging.warning("CamemBERT only allows a max_seq_length of 511 (514 with special tokens). 
Value will be set to 511") 23 | max_seq_length = 511 24 | self.max_seq_length = max_seq_length 25 | 26 | self.camembert = CamembertModel.from_pretrained(model_name_or_path) 27 | self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 28 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 29 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | #CamemBERT does not use token_type_ids 34 | output_tokens = self.camembert(input_ids=features['input_ids'], token_type_ids=None, attention_mask=features['input_mask'])[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 37 | return features 38 | 39 | def get_word_embedding_dimension(self) -> int: 40 | return self.camembert.config.hidden_size 41 | 42 | def tokenize(self, text: str) -> List[int]: 43 | """ 44 | Tokenizes a text and maps tokens to token-ids 45 | """ 46 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 47 | 48 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 49 | """ 50 | Convert tokenized sentence in its embedding ids, segment ids and mask 51 | 52 | :param tokens: 53 | a tokenized sentence 54 | :param pad_seq_length: 55 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 56 | :return: embedding ids, segment ids and mask for the sentence 57 | """ 58 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 59 | 60 | tokens = tokens[:pad_seq_length] 61 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] + [self.sep_token_id] 62 | sentence_length = len(input_ids) 63 | 64 | pad_seq_length += 3 ##Add Space for CLS + SEP + SEP token 65 | 66 | input_mask = [1] * len(input_ids) 67 | 68 | # Zero-pad up to the sequence length. BERT: Pad to the right 69 | padding = [0] * (pad_seq_length - len(input_ids)) 70 | input_ids += padding 71 | 72 | input_mask += padding 73 | 74 | assert len(input_ids) == pad_seq_length 75 | assert len(input_mask) == pad_seq_length 76 | 77 | 78 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 79 | 80 | def get_config_dict(self): 81 | return {key: self.__dict__[key] for key in self.config_keys} 82 | 83 | def save(self, output_path: str): 84 | self.camembert.save_pretrained(output_path) 85 | self.tokenizer.save_pretrained(output_path) 86 | 87 | with open(os.path.join(output_path, 'sentence_camembert_config.json'), 'w') as fOut: 88 | json.dump(self.get_config_dict(), fOut, indent=2) 89 | 90 | @staticmethod 91 | def load(input_path: str): 92 | with open(os.path.join(input_path, 'sentence_camembert_config.json')) as fIn: 93 | config = json.load(fIn) 94 | return CamemBERT(model_name_or_path=input_path, **config) 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_bow.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses a simple bag-of-words (BoW) approach. A sentence is mapped 3 | to a sparse vector with e.g. 25,000 dimensions. Optionally, you can also use tf-idf. 
4 | 5 | To make the model trainable, we add multiple dense layers to create a Deep Averaging Network (DAN). 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | from sentence_transformers.models.tokenizer.WordTokenizer import ENGLISH_STOP_WORDS 15 | import logging 16 | from datetime import datetime 17 | 18 | #### Just some code to print debug information to stdout 19 | logging.basicConfig(format='%(asctime)s - %(message)s', 20 | datefmt='%Y-%m-%d %H:%M:%S', 21 | level=logging.INFO, 22 | handlers=[LoggingHandler()]) 23 | #### /print debug information to stdout 24 | 25 | # Read the dataset 26 | batch_size = 32 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | model_save_path = 'output/training_tf-idf_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Create the vocab for the BoW model 33 | stop_words = ENGLISH_STOP_WORDS 34 | max_vocab_size = 25000 #This is also the size of the BoW sentence vector. 35 | 36 | 37 | #Read the most common max_vocab_size words. Skip stop-words 38 | vocab = set() 39 | weights = {} 40 | lines = open('wikipedia_doc_frequencies.txt').readlines() 41 | num_docs = int(lines[0]) 42 | for line in lines[1:]: 43 | word, freq = line.lower().strip().split("\t") 44 | if word in stop_words: 45 | continue 46 | 47 | vocab.add(word) 48 | weights[word] = math.log(num_docs/int(freq)) 49 | 50 | if len(vocab) >= max_vocab_size: 51 | break 52 | 53 | #Create the BoW model. Because we set word_weights to the IDF values and cumulative_term_frequency=True, we 54 | #get tf-idf vectors. Set word_weights to an empty dict and cumulative_term_frequency=False to get a 1-hot sentence encoding 55 | bow = models.BoW(vocab=vocab, word_weights=weights, cumulative_term_frequency=True) 56 | 57 | # Add two trainable feed-forward networks (DAN) with max_vocab_size -> 768 -> 512 dimensions. 
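# For orientation (dimensions taken from the settings above): a sentence becomes a 25,000-dim
# tf-idf weighted bag-of-words vector, dan1 maps 25,000 -> 768 and dan2 maps 768 -> 512, so the
# final sentence embedding returned by SentenceTransformer has 512 dimensions. The IDF weights
# computed above follow log(num_docs / doc_freq); e.g. a word found in 1,000 of 1,000,000
# documents gets weight log(1000) ≈ 6.9 (natural log).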
58 | sent_embeddings_dimension = max_vocab_size 59 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=768) 60 | dan2 = models.Dense(in_features=768, out_features=512) 61 | 62 | model = SentenceTransformer(modules=[bow, dan1, dan2]) 63 | 64 | 65 | # Convert the dataset to a DataLoader ready for training 66 | logging.info("Read STSbenchmark train dataset") 67 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 68 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 69 | train_loss = losses.CosineSimilarityLoss(model=model) 70 | 71 | logging.info("Read STSbenchmark dev dataset") 72 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 73 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 75 | 76 | # Configure the training 77 | num_epochs = 10 78 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 79 | logging.info("Warmup-steps: {}".format(warmup_steps)) 80 | 81 | # Train the model 82 | model.fit(train_objectives=[(train_dataloader, train_loss)], 83 | evaluator=evaluator, 84 | epochs=num_epochs, 85 | warmup_steps=warmup_steps, 86 | output_path=model_save_path 87 | ) 88 | 89 | 90 | 91 | ############################################################################## 92 | # 93 | # Load the stored model and evaluate its performance on STS benchmark dataset 94 | # 95 | ############################################################################## 96 | 97 | model = SentenceTransformer(model_save_path) 98 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 99 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 100 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 101 | 102 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | class PhraseTokenizer(WordTokenizer): 11 | """Tokenizes the text with respect to existent phrases in the vocab. 12 | 13 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 14 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... 
=> [New_York, is, the]) 15 | """ 16 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 17 | self.stop_words = set(stop_words) 18 | self.do_lower_case = do_lower_case 19 | self.ngram_separator = ngram_separator 20 | self.max_ngram_length = max_ngram_length 21 | self.set_vocab(vocab) 22 | 23 | def get_vocab(self): 24 | return self.vocab 25 | 26 | def set_vocab(self, vocab: Iterable[str]): 27 | self.vocab = vocab 28 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 29 | 30 | # Check for ngram in vocab 31 | self.ngram_lookup = set() 32 | self.ngram_lengths = set() 33 | for word in vocab: 34 | 35 | if self.ngram_separator is not None and self.ngram_separator in word: 36 | # Sum words might me malformed in e.g. google news word2vec, containing two or more _ after each other 37 | ngram_count = word.count(self.ngram_separator) + 1 38 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 39 | self.ngram_lookup.add(word) 40 | self.ngram_lengths.add(ngram_count) 41 | 42 | if len(vocab) > 0: 43 | logging.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 44 | logging.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | tokens = nltk.word_tokenize(text, preserve_line=True) 48 | 49 | #phrase detection 50 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 51 | idx = 0 52 | while idx <= len(tokens) - ngram_len: 53 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 54 | if ngram in self.ngram_lookup: 55 | tokens[idx:idx + ngram_len] = [ngram] 56 | elif ngram.lower() in self.ngram_lookup: 57 | tokens[idx:idx + ngram_len] = [ngram.lower()] 58 | idx += 1 59 | 60 | #Map tokens to idx, filter stop words 61 | tokens_filtered = [] 62 | for token in tokens: 63 | if token in self.stop_words: 64 | continue 65 | elif token in self.word2idx: 66 | tokens_filtered.append(self.word2idx[token]) 67 | continue 68 | 69 | token = token.lower() 70 | if token in self.stop_words: 71 | continue 72 | elif token in self.word2idx: 73 | tokens_filtered.append(self.word2idx[token]) 74 | continue 75 | 76 | token = token.strip(string.punctuation) 77 | if token in self.stop_words: 78 | continue 79 | elif len(token) > 0 and token in self.word2idx: 80 | tokens_filtered.append(self.word2idx[token]) 81 | continue 82 | 83 | return tokens_filtered 84 | 85 | def save(self, output_path: str): 86 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 87 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 88 | 89 | @staticmethod 90 | def load(input_path: str): 91 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 92 | config = json.load(fIn) 93 | 94 | return PhraseTokenizer(**config) 95 | -------------------------------------------------------------------------------- /sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import AlbertModel, AlbertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import 
numpy as np 8 | import logging 9 | 10 | class ALBERT(nn.Module): 11 | """ALBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from BERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(ALBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | self.bert = AlbertModel.from_pretrained(model_name_or_path) 26 | self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 27 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 28 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 29 | 30 | def forward(self, features): 31 | """Returns token_embeddings, cls_token""" 32 | output_tokens = self.bert(input_ids=features['input_ids'], token_type_ids=features['token_type_ids'], attention_mask=features['input_mask'])[0] 33 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 34 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 35 | return features 36 | 37 | def get_word_embedding_dimension(self) -> int: 38 | return self.bert.config.hidden_size 39 | 40 | def tokenize(self, text: str) -> List[int]: 41 | """ 42 | Tokenizes a text and maps tokens to token-ids 43 | """ 44 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 45 | 46 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 47 | """ 48 | Convert tokenized sentence in its embedding ids, segment ids and mask 49 | 50 | :param tokens: 51 | a tokenized sentence 52 | :param pad_seq_length: 53 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 54 | :return: embedding ids, segment ids and mask for the sentence 55 | """ 56 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 57 | 58 | tokens = tokens[:pad_seq_length] 59 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] 60 | sentence_length = len(input_ids) 61 | 62 | pad_seq_length += 2 ##Add Space for CLS + SEP token 63 | 64 | token_type_ids = [0] * len(input_ids) 65 | input_mask = [1] * len(input_ids) 66 | 67 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 68 | padding = [0] * (pad_seq_length - len(input_ids)) 69 | input_ids += padding 70 | token_type_ids += padding 71 | input_mask += padding 72 | 73 | assert len(input_ids) == pad_seq_length 74 | assert len(input_mask) == pad_seq_length 75 | assert len(token_type_ids) == pad_seq_length 76 | 77 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 78 | 79 | def get_config_dict(self): 80 | return {key: self.__dict__[key] for key in self.config_keys} 81 | 82 | def save(self, output_path: str): 83 | self.bert.save_pretrained(output_path) 84 | self.tokenizer.save_pretrained(output_path) 85 | 86 | with open(os.path.join(output_path, 'sentence_albert_config.json'), 'w') as fOut: 87 | json.dump(self.get_config_dict(), fOut, indent=2) 88 | 89 | @staticmethod 90 | def load(input_path: str): 91 | with open(os.path.join(input_path, 'sentence_albert_config.json')) as fIn: 92 | config = json.load(fIn) 93 | return ALBERT(model_name_or_path=input_path, **config) 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLNetModel, XLNetTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | 9 | class XLNet(nn.Module): 10 | """XLNet model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from XLNet. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = False): 15 | super(XLNet, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.max_seq_length = max_seq_length 18 | self.do_lower_case = do_lower_case 19 | 20 | self.xlnet = XLNetModel.from_pretrained(model_name_or_path) 21 | self.tokenizer = XLNetTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 22 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 23 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 24 | 25 | def forward(self, features): 26 | """Returns token_embeddings, cls_token""" 27 | output_tokens = self.xlnet(input_ids=features['input_ids'], token_type_ids=features['token_type_ids'], attention_mask=features['input_mask'])[0] 28 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 29 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 30 | return features 31 | 32 | def get_word_embedding_dimension(self) -> int: 33 | return self.xlnet.config.d_model 34 | 35 | def tokenize(self, text: str) -> List[int]: 36 | """ 37 | Tokenizes a text and maps tokens to token-ids 38 | """ 39 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 40 | 41 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int) -> Dict[str, Tensor]: 42 | """ 43 | Convert tokenized sentence in its embedding ids, segment ids and mask 44 | 45 | :param tokens: 46 | a tokenized sentence 47 | :param pad_seq_length: 48 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 49 | :return: embedding ids, segment ids and mask for the sentence 50 | """ 51 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 52 | 53 | sep_token = self.sep_token_id 54 | cls_token = self.cls_token_id 55 | sequence_a_segment_id = 0 56 | cls_token_segment_id = 2 57 | pad_token_segment_id = 4 58 | pad_token = 0 59 | 60 | tokens = tokens[:pad_seq_length] + [sep_token] 61 | token_type_ids = [sequence_a_segment_id] * len(tokens) 62 | 63 | # XLNet CLS token at the end 64 | tokens = tokens + [cls_token] 65 | token_type_ids = token_type_ids + [cls_token_segment_id] 66 | pad_seq_length += 2 ##+2 for CLS and SEP token 67 | 68 | input_ids = tokens 69 | input_mask = [1] * len(input_ids) 70 | sentence_length = len(input_ids) 71 | 72 | # Zero-pad up to the sequence length. XLNet: Pad to the left 73 | padding_length = pad_seq_length - len(input_ids) 74 | input_ids = ([pad_token] * padding_length) + input_ids 75 | input_mask = ([0] * padding_length) + input_mask 76 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids 77 | 78 | assert len(input_ids) == pad_seq_length 79 | assert len(input_mask) == pad_seq_length 80 | assert len(token_type_ids) == pad_seq_length 81 | 82 | 83 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 84 | 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 85 | 'input_mask': np.asarray(input_mask, dtype=np.int64), 86 | 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 87 | 88 | def get_config_dict(self): 89 | return {key: self.__dict__[key] for key in self.config_keys} 90 | 91 | def save(self, output_path: str): 92 | self.xlnet.save_pretrained(output_path) 93 | self.tokenizer.save_pretrained(output_path) 94 | 95 | with open(os.path.join(output_path, 'sentence_xlnet_config.json'), 'w') as fOut: 96 | json.dump(self.get_config_dict(), fOut, indent=2) 97 | 98 | @staticmethod 99 | def load(input_path: str): 100 | with open(os.path.join(input_path, 'sentence_xlnet_config.json')) as fIn: 101 | config = json.load(fIn) 102 | return XLNet(model_name_or_path=input_path, **config) 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_tf-idf_word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example weights word embeddings (like GloVe) with IDF weights. The IDF weights can for example be computed on Wikipedia. 3 | 4 | If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server. 
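As a rough illustration of the weighting computed below (the numbers are made up): with num_docs = 1,000,000 and a word that occurs in 10,000 documents, the IDF weight is log(1000000 / 10000) = log(100) ≈ 4.6, while a word that occurs in nearly every document gets a weight close to 0.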
5 | 6 | See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/ for available word embeddings files 7 | 8 | You can get term-document frequencies from here: 9 | https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/wikipedia_doc_frequencies.txt 10 | """ 11 | import torch 12 | from torch.utils.data import DataLoader 13 | import math 14 | from sentence_transformers import models, losses 15 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 16 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 17 | from sentence_transformers.readers import * 18 | import logging 19 | from datetime import datetime 20 | 21 | #### Just some code to print debug information to stdout 22 | logging.basicConfig(format='%(asctime)s - %(message)s', 23 | datefmt='%Y-%m-%d %H:%M:%S', 24 | level=logging.INFO, 25 | handlers=[LoggingHandler()]) 26 | #### /print debug information to stdout 27 | 28 | # Read the dataset 29 | batch_size = 32 30 | sts_reader = STSDataReader('datasets/stsbenchmark') 31 | model_save_path = 'output/training_tf-idf_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 32 | 33 | 34 | 35 | # Map tokens to traditional word embeddings like GloVe 36 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 37 | 38 | # Weight word embeddings using Inverse-Document-Frequency (IDF) values. 39 | # For each word in the vocab ob the tokenizer, we must specify a weight value. 40 | # The word embedding is then multiplied by this value 41 | vocab = word_embedding_model.tokenizer.get_vocab() 42 | word_weights = {} 43 | lines = open('wikipedia_doc_frequencies.txt').readlines() 44 | num_docs = int(lines[0]) 45 | for line in lines[1:]: 46 | word, freq = line.strip().split("\t") 47 | word_weights[word] = math.log(num_docs/int(freq)) 48 | 49 | # Words in the vocab that are not in the doc_frequencies file get a frequency of 1 50 | unknown_word_weight = math.log(num_docs/1) 51 | 52 | # Initialize the WordWeights model. 
This model must be between the WordEmbeddings and the Pooling model 53 | word_weights = models.WordWeights(vocab=vocab, word_weights=word_weights, unknown_word_weight=unknown_word_weight) 54 | 55 | 56 | # Apply mean pooling to get one fixed sized sentence vector 57 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 58 | pooling_mode_mean_tokens=True, 59 | pooling_mode_cls_token=False, 60 | pooling_mode_max_tokens=False) 61 | 62 | # Add two trainable feed-forward networks (DAN) 63 | sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension() 64 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 65 | dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 66 | 67 | model = SentenceTransformer(modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2]) 68 | 69 | 70 | # Convert the dataset to a DataLoader ready for training 71 | logging.info("Read STSbenchmark train dataset") 72 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 73 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 74 | train_loss = losses.CosineSimilarityLoss(model=model) 75 | 76 | logging.info("Read STSbenchmark dev dataset") 77 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 78 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 79 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 80 | 81 | # Configure the training 82 | num_epochs = 10 83 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 84 | logging.info("Warmup-steps: {}".format(warmup_steps)) 85 | 86 | # Train the model 87 | model.fit(train_objectives=[(train_dataloader, train_loss)], 88 | evaluator=evaluator, 89 | epochs=num_epochs, 90 | warmup_steps=warmup_steps, 91 | output_path=model_save_path 92 | ) 93 | 94 | 95 | 96 | ############################################################################## 97 | # 98 | # Load the stored model and evaluate its performance on STS benchmark dataset 99 | # 100 | ############################################################################## 101 | 102 | model = SentenceTransformer(model_save_path) 103 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 104 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 105 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 106 | 107 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /sentence_transformers/evaluation/TripletEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 10 | 11 | 12 | 13 | class TripletEvaluator(SentenceEvaluator): 14 | """ 15 | Evaluate a model based on a triplet: (sentence, positive_example, negative_example). Checks if distance(sentence,positive_example) < distance(sentence, negative_example). 
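A rough usage sketch (illustrative only; model is assumed to be a SentenceTransformer and triplet_examples a list of InputExamples with three texts each, e.g. produced by the TripletReader):

    triplet_data = SentencesDataset(examples=triplet_examples, model=model)
    triplet_dataloader = DataLoader(triplet_data, shuffle=False, batch_size=16)
    evaluator = TripletEvaluator(triplet_dataloader)
    model.evaluate(evaluator)

The evaluator assigns model.smart_batching_collate to the DataLoader's collate_fn itself, so no extra wiring is needed.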
16 | """ 17 | def __init__(self, dataloader: DataLoader, main_distance_function: SimilarityFunction = None, name: str =''): 18 | """ 19 | Constructs an evaluator based for the dataset 20 | 21 | 22 | :param dataloader: 23 | the data for the evaluation 24 | :param main_similarity: 25 | the similarity metric that will be used for the returned score 26 | """ 27 | self.dataloader = dataloader 28 | self.main_distance_function = main_distance_function 29 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | self.name = name 31 | if name: 32 | name = "_"+name 33 | 34 | self.csv_file: str = "triplet_evaluation"+name+"_results.csv" 35 | self.csv_headers = ["epoch", "steps", "accuracy_cosinus", "accuracy_manhatten", "accuracy_euclidean"] 36 | 37 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 38 | model.eval() 39 | 40 | if epoch != -1: 41 | if steps == -1: 42 | out_txt = " after epoch {}:".format(epoch) 43 | else: 44 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 45 | else: 46 | out_txt = ":" 47 | 48 | logging.info("Evaluation the model on "+self.name+" dataset"+out_txt) 49 | 50 | num_triplets = 0 51 | num_correct_cos_triplets, num_correct_manhatten_triplets, num_correct_euclidean_triplets = 0, 0, 0 52 | 53 | self.dataloader.collate_fn = model.smart_batching_collate 54 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 55 | features, label_ids = batch_to_device(batch, self.device) 56 | with torch.no_grad(): 57 | emb1, emb2, emb3 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features] 58 | 59 | #Cosine distance 60 | pos_cos_distance = paired_cosine_distances(emb1, emb2) 61 | neg_cos_distances = paired_cosine_distances(emb1, emb3) 62 | 63 | # Manhatten 64 | pos_manhatten_distance = paired_manhattan_distances(emb1, emb2) 65 | neg_manhatten_distances = paired_manhattan_distances(emb1, emb3) 66 | 67 | # Euclidean 68 | pos_euclidean_distance = paired_euclidean_distances(emb1, emb2) 69 | neg_euclidean_distances = paired_euclidean_distances(emb1, emb3) 70 | 71 | for idx in range(len(pos_cos_distance)): 72 | num_triplets += 1 73 | 74 | if pos_cos_distance[idx] < neg_cos_distances[idx]: 75 | num_correct_cos_triplets += 1 76 | 77 | if pos_manhatten_distance[idx] < neg_manhatten_distances[idx]: 78 | num_correct_manhatten_triplets += 1 79 | 80 | if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]: 81 | num_correct_euclidean_triplets += 1 82 | 83 | 84 | 85 | accuracy_cos = num_correct_cos_triplets / num_triplets 86 | accuracy_manhatten = num_correct_manhatten_triplets / num_triplets 87 | accuracy_euclidean = num_correct_euclidean_triplets / num_triplets 88 | 89 | logging.info("Accuracy Cosine Distance:\t{:.4f}".format(accuracy_cos)) 90 | logging.info("Accuracy Manhatten Distance:\t{:.4f}".format(accuracy_manhatten)) 91 | logging.info("Accuracy Euclidean Distance:\t{:.4f}\n".format(accuracy_euclidean)) 92 | 93 | if output_path is not None: 94 | csv_path = os.path.join(output_path, self.csv_file) 95 | if not os.path.isfile(csv_path): 96 | with open(csv_path, mode="w", encoding="utf-8") as f: 97 | writer = csv.writer(f) 98 | writer.writerow(self.csv_headers) 99 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhatten, accuracy_euclidean]) 100 | 101 | else: 102 | with open(csv_path, mode="a", encoding="utf-8") as f: 103 | writer = csv.writer(f) 104 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhatten, accuracy_euclidean]) 105 | 106 | 
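# If a specific distance function was configured, report its accuracy; otherwise
# (main_distance_function is None) fall back to the best accuracy across the
# cosine, Manhattan and Euclidean variants computed above.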
if self.main_distance_function == SimilarityFunction.COSINE: 107 | return accuracy_cos 108 | if self.main_distance_function == SimilarityFunction.MANHATTAN: 109 | return accuracy_manhatten 110 | if self.main_distance_function == SimilarityFunction.EUCLIDEAN: 111 | return accuracy_euclidean 112 | 113 | return max(accuracy_cos, accuracy_manhatten, accuracy_euclidean) -------------------------------------------------------------------------------- /sentence_transformers/evaluation/BinaryEmbeddingSimilarityEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 10 | import numpy as np 11 | 12 | 13 | class BinaryEmbeddingSimilarityEvaluator(SentenceEvaluator): 14 | """ 15 | Evaluate a model based on the similarity of the embeddings by calculating the accuracy of identifying similar and 16 | dissimilar sentences. 17 | This is done by computing each similarity metric, using its median over the dataset as the decision threshold, and 18 | checking whether pairs with label 1 score above that threshold and pairs with label 0 at or below it. 19 | This assumes that the dataset is split 50-50. 20 | The metrics are the cosine similarity as well as euclidean and Manhattan distance. 21 | The returned score is the accuracy with a specified metric. 22 | 23 | The results are written in a CSV. If a CSV already exists, then values are appended. 24 | """ 25 | def __init__(self, dataloader: DataLoader, 26 | main_similarity: SimilarityFunction = SimilarityFunction.COSINE, name:str =''): 27 | """ 28 | Constructs an evaluator for the dataset 29 | 30 | The labels need to be 0 for dissimilar pairs and 1 for similar pairs. 31 | The dataset needs to be split 50-50 with the labels. 
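Because the decision threshold is simply the median of the similarity scores, the 50-50 split is what makes the reported accuracy meaningful; on an unbalanced dataset the median no longer separates the two classes as intended.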
32 | 33 | :param dataloader: 34 | the data for the evaluation 35 | :param main_similarity: 36 | the similarity metric that will be used for the returned score 37 | """ 38 | self.dataloader = dataloader 39 | self.main_similarity = main_similarity 40 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 41 | self.name = name 42 | if name: 43 | name = "_"+name 44 | 45 | self.csv_file: str = "binary_similarity_evaluation"+name+"_results.csv" 46 | self.csv_headers = ["epoch", "steps", "cosine_acc", "euclidean_acc", "manhattan_acc"] 47 | 48 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 49 | model.eval() 50 | embeddings1 = [] 51 | embeddings2 = [] 52 | labels = [] 53 | 54 | if epoch != -1: 55 | if steps == -1: 56 | out_txt = f" after epoch {epoch}:" 57 | else: 58 | out_txt = f" in epoch {epoch} after {steps} steps:" 59 | else: 60 | out_txt = ":" 61 | 62 | logging.info("Evaluation the model on "+self.name+" dataset"+out_txt) 63 | self.dataloader.collate_fn = model.smart_batching_collate 64 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 65 | features, label_ids = batch_to_device(batch, self.device) 66 | with torch.no_grad(): 67 | emb1, emb2 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features] 68 | 69 | labels.extend(label_ids.to("cpu").numpy()) 70 | embeddings1.extend(emb1) 71 | embeddings2.extend(emb2) 72 | cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) 73 | manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2) 74 | euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2) 75 | 76 | #Ensure labels are just 0 or 1 77 | for label in labels: 78 | assert (label == 0 or label == 1) 79 | 80 | cosine_middle = np.median(cosine_scores) 81 | cosine_acc = 0 82 | for label, score in zip(labels, cosine_scores): 83 | if (label == 1 and score > cosine_middle) or (label == 0 and score <= cosine_middle): 84 | cosine_acc += 1 85 | cosine_acc /= len(labels) 86 | 87 | manhattan_middle = np.median(manhattan_distances) 88 | manhattan_acc = 0 89 | for label, score in zip(labels, manhattan_distances): 90 | if (label == 1 and score > manhattan_middle) or (label == 0 and score <= manhattan_middle): 91 | manhattan_acc += 1 92 | manhattan_acc /= len(labels) 93 | 94 | euclidean_middle = np.median(euclidean_distances) 95 | euclidean_acc = 0 96 | for label, score in zip(labels, euclidean_distances): 97 | if (label == 1 and score > euclidean_middle) or (label == 0 and score <= euclidean_middle): 98 | euclidean_acc += 1 99 | euclidean_acc /= len(labels) 100 | 101 | logging.info("Cosine-Classification:\t{:4f}".format( 102 | cosine_acc)) 103 | logging.info("Manhattan-Classification:\t{:4f}".format( 104 | manhattan_acc)) 105 | logging.info("Euclidean-Classification:\t{:4f}\n".format( 106 | euclidean_acc)) 107 | 108 | if output_path is not None: 109 | csv_path = os.path.join(output_path, self.csv_file) 110 | if not os.path.isfile(csv_path): 111 | with open(csv_path, mode="w", encoding="utf-8") as f: 112 | writer = csv.writer(f) 113 | writer.writerow(self.csv_headers) 114 | writer.writerow([epoch, steps, cosine_acc, euclidean_acc, manhattan_acc]) 115 | else: 116 | with open(csv_path, mode="a", encoding="utf-8") as f: 117 | writer = csv.writer(f) 118 | writer.writerow([epoch, steps, cosine_acc, euclidean_acc, manhattan_acc]) 119 | 120 | if self.main_similarity == SimilarityFunction.COSINE: 121 | return cosine_acc 122 | elif 
self.main_similarity == SimilarityFunction.EUCLIDEAN: 123 | return euclidean_acc 124 | elif self.main_similarity == SimilarityFunction.MANHATTAN: 125 | return manhattan_acc 126 | else: 127 | raise ValueError("Unknown main_similarity value") -------------------------------------------------------------------------------- /sentence_transformers/models/WordEmbeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class WordEmbeddings(nn.Module): 15 | def __init__(self, tokenizer: WordTokenizer, embedding_weights, update_embeddings: bool = False, max_seq_length: int = 1000000): 16 | nn.Module.__init__(self) 17 | if isinstance(embedding_weights, list): 18 | embedding_weights = np.asarray(embedding_weights) 19 | 20 | if isinstance(embedding_weights, np.ndarray): 21 | embedding_weights = torch.from_numpy(embedding_weights) 22 | 23 | num_embeddings, embeddings_dimension = embedding_weights.size() 24 | self.embeddings_dimension = embeddings_dimension 25 | self.emb_layer = nn.Embedding(num_embeddings, embeddings_dimension) 26 | self.emb_layer.load_state_dict({'weight': embedding_weights}) 27 | self.emb_layer.weight.requires_grad = update_embeddings 28 | self.tokenizer = tokenizer 29 | self.update_embeddings = update_embeddings 30 | self.max_seq_length = max_seq_length 31 | 32 | def forward(self, features): 33 | token_embeddings = self.emb_layer(features['input_ids']) 34 | cls_tokens = None 35 | features.update({'token_embeddings': token_embeddings, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 36 | return features 37 | 38 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 39 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 40 | 41 | tokens = tokens[0:pad_seq_length] #Truncate tokens if needed 42 | input_ids = tokens 43 | 44 | sentence_length = len(input_ids) 45 | input_mask = [1] * len(input_ids) 46 | padding = [0] * (pad_seq_length - len(input_ids)) 47 | input_ids += padding 48 | input_mask += padding 49 | 50 | assert len(input_ids) == pad_seq_length 51 | assert len(input_mask) == pad_seq_length 52 | 53 | return {'input_ids': input_ids, 'input_mask': input_mask, 'sentence_lengths': sentence_length} 54 | 55 | 56 | 57 | 58 | 59 | def get_word_embedding_dimension(self) -> int: 60 | return self.embeddings_dimension 61 | 62 | def tokenize(self, text: str) -> List[int]: 63 | return self.tokenizer.tokenize(text) 64 | 65 | def save(self, output_path: str): 66 | with open(os.path.join(output_path, 'wordembedding_config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 70 | self.tokenizer.save(output_path) 71 | 72 | def get_config_dict(self): 73 | return {'tokenizer_class': fullname(self.tokenizer), 'update_embeddings': self.update_embeddings, 'max_seq_length': self.max_seq_length} 74 | 75 | @staticmethod 76 | def load(input_path: str): 77 | with 
open(os.path.join(input_path, 'wordembedding_config.json'), 'r') as fIn: 78 | config = json.load(fIn) 79 | 80 | tokenizer_class = import_from_string(config['tokenizer_class']) 81 | tokenizer = tokenizer_class.load(input_path) 82 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 83 | embedding_weights = weights['emb_layer.weight'] 84 | model = WordEmbeddings(tokenizer=tokenizer, embedding_weights=embedding_weights, update_embeddings=config['update_embeddings']) 85 | return model 86 | 87 | @staticmethod 88 | def from_text_file(embeddings_file_path: str, update_embeddings: bool = False, item_separator: str = " ", tokenizer=WhitespaceTokenizer(), max_vocab_size: int = None): 89 | logging.info("Read in embeddings file {}".format(embeddings_file_path)) 90 | 91 | if not os.path.exists(embeddings_file_path): 92 | logging.info("{} does not exist, try to download from server".format(embeddings_file_path)) 93 | 94 | if '/' in embeddings_file_path or '\\' in embeddings_file_path: 95 | raise ValueError("Embeddings file not found: ".format(embeddings_file_path)) 96 | 97 | url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/"+embeddings_file_path 98 | http_get(url, embeddings_file_path) 99 | 100 | embeddings_dimension = None 101 | vocab = [] 102 | embeddings = [] 103 | 104 | with gzip.open(embeddings_file_path, "rt", encoding="utf8") if embeddings_file_path.endswith('.gz') else open(embeddings_file_path, encoding="utf8") as fIn: 105 | iterator = tqdm(fIn, desc="Load Word Embeddings", unit="Embeddings") 106 | for line in iterator: 107 | split = line.rstrip().split(item_separator) 108 | word = split[0] 109 | 110 | if embeddings_dimension == None: 111 | embeddings_dimension = len(split) - 1 112 | vocab.append("PADDING_TOKEN") 113 | embeddings.append(np.zeros(embeddings_dimension)) 114 | 115 | if (len(split) - 1) != embeddings_dimension: # Assure that all lines in the embeddings file are of the same length 116 | logging.error("ERROR: A line in the embeddings file had more or less dimensions than expected. Skip token.") 117 | continue 118 | 119 | vector = np.array([float(num) for num in split[1:]]) 120 | embeddings.append(vector) 121 | vocab.append(word) 122 | 123 | if max_vocab_size is not None and max_vocab_size > 0 and len(vocab) > max_vocab_size: 124 | break 125 | 126 | embeddings = np.asarray(embeddings) 127 | 128 | tokenizer.set_vocab(vocab) 129 | return WordEmbeddings(tokenizer=tokenizer, embedding_weights=embeddings, update_embeddings=update_embeddings) 130 | 131 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | from torch.utils.data import DataLoader 3 | 4 | import torch 5 | import logging 6 | from tqdm import tqdm 7 | from ..util import batch_to_device 8 | import os 9 | import csv 10 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 11 | from scipy.stats import pearsonr, spearmanr 12 | import numpy as np 13 | 14 | class EmbeddingSimilarityEvaluator(SentenceEvaluator): 15 | """ 16 | Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation 17 | in comparison to the gold standard labels. 
18 | The metrics are the cosine similarity as well as euclidean and Manhattan distance 19 | The returned score is the Spearman correlation with a specified metric. 20 | 21 | The results are written in a CSV. If a CSV already exists, then values are appended. 22 | """ 23 | 24 | 25 | def __init__(self, dataloader: DataLoader, main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = None): 26 | """ 27 | Constructs an evaluator based for the dataset 28 | 29 | The labels need to indicate the similarity between the sentences. 30 | 31 | :param dataloader: 32 | the data for the evaluation 33 | :param main_similarity: 34 | the similarity metric that will be used for the returned score 35 | """ 36 | self.dataloader = dataloader 37 | self.main_similarity = main_similarity 38 | self.name = name 39 | if name: 40 | name = "_"+name 41 | 42 | if show_progress_bar is None: 43 | show_progress_bar = (logging.getLogger().getEffectiveLevel() == logging.INFO or logging.getLogger().getEffectiveLevel() == logging.DEBUG) 44 | self.show_progress_bar = show_progress_bar 45 | 46 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 47 | self.csv_file: str = "similarity_evaluation"+name+"_results.csv" 48 | self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"] 49 | 50 | def __call__(self, model: 'SequentialSentenceEmbedder', output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 51 | model.eval() 52 | embeddings1 = [] 53 | embeddings2 = [] 54 | labels = [] 55 | 56 | if epoch != -1: 57 | if steps == -1: 58 | out_txt = f" after epoch {epoch}:" 59 | else: 60 | out_txt = f" in epoch {epoch} after {steps} steps:" 61 | else: 62 | out_txt = ":" 63 | 64 | logging.info("Evaluation the model on "+self.name+" dataset"+out_txt) 65 | 66 | self.dataloader.collate_fn = model.smart_batching_collate 67 | 68 | iterator = self.dataloader 69 | if self.show_progress_bar: 70 | iterator = tqdm(iterator, desc="Convert Evaluating") 71 | 72 | for step, batch in enumerate(iterator): 73 | features, label_ids = batch_to_device(batch, self.device) 74 | with torch.no_grad(): 75 | emb1, emb2 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features] 76 | 77 | labels.extend(label_ids.to("cpu").numpy()) 78 | embeddings1.extend(emb1) 79 | embeddings2.extend(emb2) 80 | 81 | try: 82 | cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) 83 | except Exception as e: 84 | print(embeddings1) 85 | print(embeddings2) 86 | raise(e) 87 | 88 | manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2) 89 | euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2) 90 | dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)] 91 | 92 | 93 | eval_pearson_cosine, _ = pearsonr(labels, cosine_scores) 94 | eval_spearman_cosine, _ = spearmanr(labels, cosine_scores) 95 | 96 | eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances) 97 | eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances) 98 | 99 | eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances) 100 | eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances) 101 | 102 | eval_pearson_dot, _ = pearsonr(labels, dot_products) 103 | eval_spearman_dot, _ = spearmanr(labels, dot_products) 104 | 105 | logging.info("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: 
{:.4f}".format( 106 | eval_pearson_cosine, eval_spearman_cosine)) 107 | logging.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 108 | eval_pearson_manhattan, eval_spearman_manhattan)) 109 | logging.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 110 | eval_pearson_euclidean, eval_spearman_euclidean)) 111 | logging.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 112 | eval_pearson_dot, eval_spearman_dot)) 113 | 114 | if output_path is not None: 115 | csv_path = os.path.join(output_path, self.csv_file) 116 | output_file_exists = os.path.isfile(csv_path) 117 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 118 | writer = csv.writer(f) 119 | if not output_file_exists: 120 | writer.writerow(self.csv_headers) 121 | 122 | writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean, 123 | eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot]) 124 | 125 | 126 | if self.main_similarity == SimilarityFunction.COSINE: 127 | return eval_spearman_cosine 128 | elif self.main_similarity == SimilarityFunction.EUCLIDEAN: 129 | return eval_spearman_euclidean 130 | elif self.main_similarity == SimilarityFunction.MANHATTAN: 131 | return eval_spearman_manhattan 132 | elif self.main_similarity == SimilarityFunction.DOT_PRODUCT: 133 | return eval_spearman_dot 134 | elif self.main_similarity is None: 135 | return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot) 136 | else: 137 | raise ValueError("Unknown main_similarity value") 138 | -------------------------------------------------------------------------------- /sentence_transformers/losses/test_batch_hard_triplet_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from sentence_transformers.losses import BatchHardTripletLoss 4 | 5 | # Test-suite from https://github.com/omoindrot/tensorflow-triplet-loss/blob/master/model/tests/test_triplet_loss.py 6 | # Skipped the `test_gradients_pairwise_distances()` test since it's trivial to see if your model loss turns NaN 7 | # and porting it proved more difficult than expected. 8 | 9 | def pairwise_distance_np(feature, squared=False): 10 | """Computes the pairwise distance matrix in numpy. 11 | Args: 12 | feature: 2-D numpy array of size [number of data, feature dimension] 13 | squared: Boolean. If true, output is the pairwise squared euclidean 14 | distance matrix; else, output is the pairwise euclidean distance matrix. 15 | Returns: 16 | pairwise_distances: 2-D numpy array of size 17 | [number of data, number of data]. 18 | """ 19 | triu = np.triu_indices(feature.shape[0], 1) 20 | upper_tri_pdists = np.linalg.norm(feature[triu[1]] - feature[triu[0]], axis=1) 21 | if squared: 22 | upper_tri_pdists **= 2. 23 | num_data = feature.shape[0] 24 | pairwise_distances = np.zeros((num_data, num_data)) 25 | pairwise_distances[np.triu_indices(num_data, 1)] = upper_tri_pdists 26 | # Make symmetrical. 
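# (The code above filled only the upper triangle; adding the transpose mirrors the
# distances into the lower triangle, and subtracting the diagonal avoids counting it twice.)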
27 | pairwise_distances = pairwise_distances + pairwise_distances.T - np.diag( 28 | pairwise_distances.diagonal()) 29 | return pairwise_distances 30 | 31 | def test_pairwise_distances(): 32 | """Test the pairwise distances function.""" 33 | num_data = 64 34 | feat_dim = 6 35 | 36 | embeddings = np.random.randn(num_data, feat_dim).astype(np.float32) 37 | embeddings[1] = embeddings[0] # to get distance 0 38 | 39 | for squared in [True, False]: 40 | res_np = pairwise_distance_np(embeddings, squared=squared) 41 | res_pt = BatchHardTripletLoss._pairwise_distances(torch.from_numpy(embeddings), squared=squared) 42 | assert np.allclose(res_np, res_pt) 43 | 44 | def test_pairwise_distances_are_positive(): 45 | """Test that the pairwise distances are always positive. 46 | Use a tricky case where numerical errors are common. 47 | """ 48 | num_data = 64 49 | feat_dim = 6 50 | 51 | # Create embeddings very close to each other in [1.0 - 2e-7, 1.0 + 2e-7] 52 | # This will encourage errors in the computation 53 | embeddings = 1.0 + 2e-7 * np.random.randn(num_data, feat_dim).astype(np.float32) 54 | embeddings[1] = embeddings[0] # to get distance 0 55 | 56 | for squared in [True, False]: 57 | res_tf = BatchHardTripletLoss._pairwise_distances(torch.from_numpy(embeddings), squared=squared) 58 | assert res_tf[res_tf < 0].sum() == 0 59 | 60 | 61 | def test_triplet_mask(): 62 | """Test function _get_triplet_mask.""" 63 | num_data = 64 64 | num_classes = 10 65 | 66 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 67 | 68 | mask_np = np.zeros((num_data, num_data, num_data)) 69 | for i in range(num_data): 70 | for j in range(num_data): 71 | for k in range(num_data): 72 | distinct = (i != j and i != k and j != k) 73 | valid = (labels[i] == labels[j]) and (labels[i] != labels[k]) 74 | mask_np[i, j, k] = (distinct and valid) 75 | 76 | mask_tf_val = BatchHardTripletLoss._get_triplet_mask(torch.from_numpy(labels)) 77 | assert np.allclose(mask_np, mask_tf_val) 78 | 79 | def test_anchor_positive_triplet_mask(): 80 | """Test function _get_anchor_positive_triplet_mask.""" 81 | num_data = 64 82 | num_classes = 10 83 | 84 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 85 | 86 | mask_np = np.zeros((num_data, num_data)) 87 | for i in range(num_data): 88 | for j in range(num_data): 89 | distinct = (i != j) 90 | valid = labels[i] == labels[j] 91 | mask_np[i, j] = (distinct and valid) 92 | 93 | mask_tf_val = BatchHardTripletLoss._get_anchor_positive_triplet_mask(torch.from_numpy(labels)) 94 | 95 | assert np.allclose(mask_np, mask_tf_val) 96 | 97 | def test_anchor_negative_triplet_mask(): 98 | """Test function _get_anchor_negative_triplet_mask.""" 99 | num_data = 64 100 | num_classes = 10 101 | 102 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 103 | 104 | mask_np = np.zeros((num_data, num_data)) 105 | for i in range(num_data): 106 | for k in range(num_data): 107 | distinct = (i != k) 108 | valid = (labels[i] != labels[k]) 109 | mask_np[i, k] = (distinct and valid) 110 | 111 | mask_tf_val = BatchHardTripletLoss._get_anchor_negative_triplet_mask(torch.from_numpy(labels)) 112 | 113 | assert np.allclose(mask_np, mask_tf_val) 114 | 115 | def test_simple_batch_all_triplet_loss(): 116 | """Test the triplet loss with batch all triplet mining in a simple case. 117 | There is just one class in this super simple edge case, and we want to make sure that 118 | the loss is 0. 
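With a single class there are no valid (anchor, positive, negative) triplets at all, because a negative would need a different label, so both the loss and the fraction of positive triplets are expected to be 0.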
119 | """ 120 | num_data = 10 121 | feat_dim = 6 122 | margin = 0.2 123 | num_classes = 1 124 | 125 | embeddings = np.random.rand(num_data, feat_dim).astype(np.float32) 126 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 127 | labels, embeddings = torch.from_numpy(labels), torch.from_numpy(embeddings) 128 | 129 | for squared in [True, False]: 130 | loss_np = 0.0 131 | 132 | # Compute the loss in TF. 133 | loss_tf_val, fraction_val = BatchHardTripletLoss.batch_all_triplet_loss(labels, embeddings, margin, squared=squared) 134 | 135 | assert np.allclose(loss_np, loss_tf_val) 136 | assert np.allclose(fraction_val, 0.0) 137 | 138 | 139 | def test_batch_all_triplet_loss(): 140 | """Test the triplet loss with batch all triplet mining""" 141 | num_data = 10 142 | feat_dim = 6 143 | margin = 0.2 144 | num_classes = 5 145 | 146 | embeddings = np.random.rand(num_data, feat_dim).astype(np.float32) 147 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 148 | 149 | for squared in [True, False]: 150 | pdist_matrix = pairwise_distance_np(embeddings, squared=squared) 151 | 152 | loss_np = 0.0 153 | num_positives = 0.0 154 | num_valid = 0.0 155 | for i in range(num_data): 156 | for j in range(num_data): 157 | for k in range(num_data): 158 | distinct = (i != j and i != k and j != k) 159 | valid = (labels[i] == labels[j]) and (labels[i] != labels[k]) 160 | if distinct and valid: 161 | num_valid += 1.0 162 | 163 | pos_distance = pdist_matrix[i][j] 164 | neg_distance = pdist_matrix[i][k] 165 | 166 | loss = np.maximum(0.0, pos_distance - neg_distance + margin) 167 | loss_np += loss 168 | 169 | num_positives += (loss > 0) 170 | 171 | loss_np /= num_positives 172 | 173 | # Compute the loss in TF. 174 | loss_tf_val, fraction_val = BatchHardTripletLoss.batch_all_triplet_loss(torch.from_numpy(labels), torch.from_numpy(embeddings), margin, squared=squared) 175 | assert np.allclose(loss_np, loss_tf_val) 176 | assert np.allclose(num_positives / num_valid, fraction_val) 177 | 178 | def test_batch_hard_triplet_loss(): 179 | """Test the triplet loss with batch hard triplet mining""" 180 | num_data = 50 181 | feat_dim = 6 182 | margin = 0.2 183 | num_classes = 5 184 | min_class = 100 185 | 186 | embeddings = np.random.rand(num_data, feat_dim).astype(np.float32) 187 | labels = np.random.randint(min_class, min_class+num_classes, size=(num_data)).astype(np.float32) 188 | 189 | for squared in [True, False]: 190 | pdist_matrix = pairwise_distance_np(embeddings, squared=squared) 191 | 192 | loss_np = 0.0 193 | for i in range(num_data): 194 | # Select the hardest positive 195 | max_pos_dist = np.max(pdist_matrix[i][labels == labels[i]]) 196 | 197 | # Select the hardest negative 198 | min_neg_dist = np.min(pdist_matrix[i][labels != labels[i]]) 199 | 200 | 201 | loss = np.maximum(0.0, max_pos_dist - min_neg_dist + margin) 202 | loss_np += loss 203 | 204 | loss_np /= num_data 205 | 206 | # Compute the loss in TF. 
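# (Despite the "TF" naming, this calls the PyTorch port of the original TensorFlow
# implementation; see the note at the top of this file.)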
207 | loss_tf_val = BatchHardTripletLoss.batch_hard_triplet_loss(torch.from_numpy(labels), torch.from_numpy(embeddings), margin, squared=squared) 208 | assert np.allclose(loss_np, loss_tf_val) 209 | 210 | if __name__ == '__main__': 211 | test_pairwise_distances() 212 | test_pairwise_distances_are_positive() 213 | test_triplet_mask() 214 | test_anchor_positive_triplet_mask() 215 | test_anchor_negative_triplet_mask() 216 | test_batch_hard_triplet_loss() 217 | print("--TESTS done ---") -------------------------------------------------------------------------------- /sentence_transformers/datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | This files contains various pytorch dataset classes, that provide 3 | data to the Transformer model 4 | """ 5 | from torch.utils.data import Dataset 6 | from typing import List 7 | from torch import Tensor 8 | import bisect 9 | import torch 10 | import logging 11 | import numpy as np 12 | from tqdm import tqdm 13 | from . import SentenceTransformer 14 | from .readers.InputExample import InputExample 15 | 16 | 17 | class SentencesDataset(Dataset): 18 | """ 19 | Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all 20 | sequences to the max length. 21 | The SentenceBertEncoder.smart_batching_collate is required for this to work. 22 | SmartBatchingDataset does *not* work without it. 23 | """ 24 | def __init__(self, examples: List[InputExample], model: SentenceTransformer, show_progress_bar: bool = None): 25 | """ 26 | Create a new SentencesDataset with the tokenized texts and the labels as Tensor 27 | """ 28 | if show_progress_bar is None: 29 | show_progress_bar = (logging.getLogger().getEffectiveLevel() == logging.INFO or logging.getLogger().getEffectiveLevel() == logging.DEBUG) 30 | self.show_progress_bar = show_progress_bar 31 | 32 | self.convert_input_examples(examples, model) 33 | 34 | def convert_input_examples(self, examples: List[InputExample], model: SentenceTransformer): 35 | """ 36 | Converts input examples to a SmartBatchingDataset usable to train the model with 37 | SentenceTransformer.smart_batching_collate as the collate_fn for the DataLoader 38 | 39 | smart_batching_collate as collate_fn is required because it transforms the tokenized texts to the tensors. 
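(The evaluator classes shown above perform this assignment themselves, e.g. self.dataloader.collate_fn = model.smart_batching_collate.)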
40 | 41 | :param examples: 42 | the input examples for the training 43 | :param model 44 | the Sentence BERT model for the conversion 45 | :return: a SmartBatchingDataset usable to train the model with SentenceTransformer.smart_batching_collate as the collate_fn 46 | for the DataLoader 47 | """ 48 | num_texts = len(examples[0].texts) 49 | inputs = [[] for _ in range(num_texts)] 50 | labels = [] 51 | too_long = [0] * num_texts 52 | label_type = None 53 | iterator = examples 54 | max_seq_length = model.get_max_seq_length() 55 | 56 | if self.show_progress_bar: 57 | iterator = tqdm(iterator, desc="Convert dataset") 58 | 59 | for ex_index, example in enumerate(iterator): 60 | if label_type is None: 61 | if isinstance(example.label, int): 62 | label_type = torch.long 63 | elif isinstance(example.label, float): 64 | label_type = torch.float 65 | tokenized_texts = [model.tokenize(text) for text in example.texts] 66 | 67 | for i, token in enumerate(tokenized_texts): 68 | if max_seq_length != None and max_seq_length > 0 and len(token) >= max_seq_length: 69 | too_long[i] += 1 70 | 71 | labels.append(example.label) 72 | for i in range(num_texts): 73 | inputs[i].append(tokenized_texts[i]) 74 | 75 | tensor_labels = torch.tensor(labels, dtype=label_type) 76 | 77 | logging.info("Num sentences: %d" % (len(examples))) 78 | for i in range(num_texts): 79 | logging.info("Sentences {} longer than max_seqence_length: {}".format(i, too_long[i])) 80 | 81 | self.tokens = inputs 82 | self.labels = tensor_labels 83 | 84 | def __getitem__(self, item): 85 | return [self.tokens[i][item] for i in range(len(self.tokens))], self.labels[item] 86 | 87 | def __len__(self): 88 | return len(self.tokens[0]) 89 | 90 | 91 | class SentenceLabelDataset(Dataset): 92 | """ 93 | Dataset for training with triplet loss. 94 | This dataset takes a list of sentences grouped by their label and uses this grouping to dynamically select a 95 | positive example from the same group and a negative example from the other sentences for a selected anchor sentence. 96 | 97 | This dataset should be used in combination with dataset_reader.LabelSentenceReader 98 | 99 | One iteration over this dataset selects every sentence as anchor once. 100 | 101 | This also uses smart batching like SentenceDataset. 102 | """ 103 | tokens: List[List[str]] 104 | labels: Tensor 105 | num_labels: int 106 | labels_right_border: List[int] 107 | 108 | def __init__(self, examples: List[InputExample], model: SentenceTransformer, provide_positive: bool = True, 109 | provide_negative: bool = True): 110 | """ 111 | Converts input examples to a SentenceLabelDataset usable to train the model with 112 | SentenceTransformer.smart_batching_collate as the collate_fn for the DataLoader 113 | 114 | Assumes only one sentence per InputExample and labels as integers from 0 to max_num_labels 115 | and should be used in combination with dataset_reader.LabelSentenceReader. 116 | 117 | Labels with only one example are ignored. 118 | 119 | smart_batching_collate as collate_fn is required because it transforms the tokenized texts to the tensors. 120 | 121 | :param examples: 122 | the input examples for the training 123 | :param model 124 | the Sentence BERT model for the conversion 125 | :param provide_positive: 126 | set this to False, if you don't need a positive example (e.g. for BATCH_HARD_TRIPLET_LOSS). 127 | :param provide_negative: 128 | set this to False, if you don't need a negative example (e.g. for BATCH_HARD_TRIPLET_LOSS 129 | or MULTIPLE_NEGATIVES_RANKING_LOSS). 
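A rough usage sketch (illustrative only; the reader call, the file name and the choice of TripletLoss are assumptions, any compatible loss works):

    label_examples = LabelSentenceReader('datasets/my_labeled_data').get_examples('train.tsv')
    train_data = SentenceLabelDataset(label_examples, model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32)
    model.fit(train_objectives=[(train_dataloader, losses.TripletLoss(model=model))], epochs=1)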
130 | """ 131 | self.convert_input_examples(examples, model) 132 | self.idxs = np.arange(len(self.tokens)) 133 | self.positive = provide_positive 134 | self.negative = provide_negative 135 | 136 | def convert_input_examples(self, examples: List[InputExample], model: SentenceTransformer): 137 | """ 138 | Converts input examples to a SentenceLabelDataset. 139 | 140 | Assumes only one sentence per InputExample and labels as integers from 0 to max_num_labels 141 | and should be used in combination with dataset_reader.LabelSentenceReader. 142 | 143 | Labels with only one example are ignored. 144 | 145 | :param examples: 146 | the input examples for the training 147 | :param model 148 | the Sentence Transformer model for the conversion 149 | """ 150 | self.labels_right_border = [] 151 | self.num_labels = 0 152 | inputs = [] 153 | labels = [] 154 | 155 | label_sent_mapping = {} 156 | too_long = 0 157 | label_type = None 158 | for ex_index, example in enumerate(tqdm(examples, desc="Convert dataset")): 159 | if label_type is None: 160 | if isinstance(example.label, int): 161 | label_type = torch.long 162 | elif isinstance(example.label, float): 163 | label_type = torch.float 164 | tokenized_text = model.tokenize(example.texts[0]) 165 | 166 | if hasattr(model, 'max_seq_length') and model.max_seq_length is not None and model.max_seq_length > 0 and len(tokenized_text) >= model.max_seq_length: 167 | too_long += 1 168 | if example.label in label_sent_mapping: 169 | label_sent_mapping[example.label].append(ex_index) 170 | else: 171 | label_sent_mapping[example.label] = [ex_index] 172 | labels.append(example.label) 173 | inputs.append(tokenized_text) 174 | 175 | grouped_inputs = [] 176 | for i in range(len(label_sent_mapping)): 177 | if len(label_sent_mapping[i]) >= 2: 178 | grouped_inputs.extend([inputs[j] for j in label_sent_mapping[i]]) 179 | self.labels_right_border.append(len(grouped_inputs)) 180 | self.num_labels += 1 181 | 182 | tensor_labels = torch.tensor(labels, dtype=label_type) 183 | 184 | logging.info("Num sentences: %d" % (len(grouped_inputs))) 185 | logging.info("Sentences longer than max_seqence_length: {}".format(too_long)) 186 | logging.info("Number of labels with >1 examples: {}".format(self.num_labels)) 187 | self.tokens = grouped_inputs 188 | self.labels = tensor_labels 189 | 190 | def __getitem__(self, item): 191 | if not self.positive and not self.negative: 192 | return [self.tokens[item]], self.labels[item] 193 | 194 | label = bisect.bisect_right(self.labels_right_border, item) 195 | left_border = 0 if label == 0 else self.labels_right_border[label-1] 196 | right_border = self.labels_right_border[label] 197 | positive_item = np.random.choice(np.concatenate([self.idxs[left_border:item], self.idxs[item+1:right_border]])) 198 | negative_item = np.random.choice(np.concatenate([self.idxs[0:left_border], self.idxs[right_border:]])) 199 | 200 | if self.positive: 201 | positive = [self.tokens[positive_item]] 202 | else: 203 | positive = [] 204 | if self.negative: 205 | negative = [self.tokens[negative_item]] 206 | else: 207 | negative = [] 208 | 209 | return [self.tokens[item]]+positive+negative, self.labels[item] 210 | 211 | def __len__(self): 212 | return len(self.tokens) 213 | --------------------------------------------------------------------------------