├── setup.cfg ├── requirements.txt ├── .gitignore ├── sentence_transformers ├── evaluation │ ├── SimilarityFunction.py │ ├── __init__.py │ ├── SequentialEvaluator.py │ ├── SentenceEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── TripletEvaluator.py │ ├── BinaryEmbeddingSimilarityEvaluator.py │ └── EmbeddingSimilarityEvaluator.py ├── losses │ ├── __init__.py │ ├── CosineSimilarityLoss.py │ ├── TripletLoss.py │ ├── MultipleNegativesRankingLoss.py │ ├── SoftmaxLoss.py │ └── test_batch_hard_triplet_loss.py ├── models │ ├── tokenizer │ │ ├── __init__.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ └── PhraseTokenizer.py │ ├── __init__.py │ ├── Dense.py │ ├── LSTM.py │ ├── CNN.py │ ├── BoW.py │ ├── WordWeights.py │ ├── Pooling.py │ ├── DistilBERT.py │ ├── RoBERTa.py │ ├── XLMRoBERTa.py │ ├── T5.py │ ├── BERT.py │ ├── CamemBERT.py │ ├── ALBERT.py │ ├── XLNet.py │ └── WordEmbeddings.py ├── readers │ ├── __init__.py │ ├── InputExample.py │ ├── LabelSentenceReader.py │ ├── TripletReader.py │ ├── STSDataReader.py │ └── NLIDataReader.py ├── __init__.py ├── LoggingHandler.py ├── util.py ├── data_samplers.py └── datasets.py ├── NOTICE.txt ├── examples ├── datasets │ ├── README.md │ └── get_data.py ├── basic_embedding.py ├── evaluation_stsbenchmark.py ├── application_clustering.py ├── application_semantic_search.py ├── training_stsbenchmark_continue_training.py ├── training_stsbenchmark_bert.py ├── training_stsbenchmark_xlnet.py ├── training_stsbenchmark_roberta.py ├── training_stsbenchmark_albert.py ├── training_stsbenchmark_distilbert.py ├── training_stsbenchmark_cnn.py ├── training_wikipedia_sections.py ├── training_nli_T5.py ├── training_stsbenchmark_bilstm.py ├── application_clustering_wikipedia_sections.py ├── training_nli_bert.py ├── training_nli_roberta.py ├── training_nli_albert.py ├── training_nli_xlm-roberta.py ├── training_nli_distilbert.py ├── training_stsbenchmark_avg_word_embeddings.py ├── training_stsbenchmark_bow.py └── training_stsbenchmark_tf-idf_word_embeddings.py ├── setup.py └── docs └── pretrained-models ├── sts-models.md ├── wikipedia-sections-models.md └── nli-models.md /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==2.3.0 2 | tqdm 3 | torch>=1.0.1 4 | numpy 5 | scikit-learn 6 | scipy 7 | nltk -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | examples/datasets/*/ 4 | examples/output 5 | sentence_transformers.egg-info 6 | dist/ 7 | examples_nr/ -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .BatchHardTripletLoss import * 4 | from 
.MultipleNegativesRankingLoss import * 5 | from .TripletLoss import * -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer -------------------------------------------------------------------------------- /sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Copyright 2019 3 | Ubiquitous Knowledge Processing (UKP) Lab 4 | Technische Universität Darmstadt 5 | ------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.5" 2 | __DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/' 3 | from .datasets import SentencesDataset, SentenceLabelDataset 4 | from .data_samplers import LabelSampler 5 | from .LoggingHandler import LoggingHandler 6 | from .SentenceTransformer import SentenceTransformer 7 | 8 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | 4 | from .BinaryEmbeddingSimilarityEvaluator import BinaryEmbeddingSimilarityEvaluator 5 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .SequentialEvaluator import SequentialEvaluator 8 | from .TripletEvaluator import TripletEvaluator 9 | -------------------------------------------------------------------------------- /sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) -------------------------------------------------------------------------------- /sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ALBERT import ALBERT 2 | from .BERT import BERT 3 | from .BoW import BoW 4 | from .CamemBERT import CamemBERT 5 | from .CNN import CNN 6 | from .Dense import Dense 7 | from .DistilBERT import 
DistilBERT 8 | from .LSTM import LSTM 9 | from .Pooling import Pooling 10 | from .RoBERTa import RoBERTa 11 | from .T5 import T5 12 | from .WordEmbeddings import WordEmbeddings 13 | from .WordWeights import WordWeights 14 | from .XLMRoBERTa import XLMRoBERTa 15 | from .XLNet import XLNet 16 | -------------------------------------------------------------------------------- /examples/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | This folder contains some example datasets that can be used to for training and evaluation of sentence embeddings methods. 3 | 4 | To download these datasets, run: 5 | ``` 6 | python get_data.py 7 | ``` 8 | 9 | It will download the datasets and unzip them into this directory. 10 | 11 | 12 | # AllNLI Dataset 13 | The AllNLI dataset is the concatenation of the SNLI dataset (https://nlp.stanford.edu/projects/snli/) and the MultiNLI dataset (https://www.nyu.edu/projects/bowman/multinli/). 14 | 15 | # STS Benchmark 16 | The STS Benchmark (http://ixa2.si.ehu.eus/stswiki) contains sentence pairs with human gold score for their similarity. 17 | -------------------------------------------------------------------------------- /sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str, texts: List[str], label: Union[int, float]): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | str.strip() is called on both texts. 13 | 14 | :param guid 15 | id for the example 16 | :param texts 17 | the texts for the example 18 | :param label 19 | the label for the example 20 | """ 21 | self.guid = guid 22 | self.texts = [text.strip() for text in texts] 23 | self.label = label 24 | -------------------------------------------------------------------------------- /examples/datasets/get_data.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import zipfile 3 | import os 4 | folder_path = os.path.dirname(os.path.realpath(__file__)) 5 | print('Beginning download of datasets') 6 | 7 | datasets = ['AllNLI.zip', 'stsbenchmark.zip', 'wikipedia-sections-triplets.zip'] 8 | server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/" 9 | 10 | for dataset in datasets: 11 | print("Download", dataset) 12 | url = server+dataset 13 | dataset_path = os.path.join(folder_path, dataset) 14 | urllib.request.urlretrieve(url, dataset_path) 15 | 16 | print("Extract", dataset) 17 | with zipfile.ZipFile(dataset_path, "r") as zip_ref: 18 | zip_ref.extractall(folder_path) 19 | os.remove(dataset_path) 20 | 21 | 22 | print("All datasets downloaded and extracted") 23 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 
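Illustrative usage, where the two sub-evaluators are placeholders for any SentenceEvaluator instances:

    evaluator = SequentialEvaluator([sts_dev_evaluator, triplet_dev_evaluator])
    model.evaluate(evaluator)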
8 | 9 | The score from the last sub-evaluator will be used as the main score for the best model decision. 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator]): 12 | self.evaluators = evaluators 13 | 14 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 15 | for evaluator in self.evaluators: 16 | main_score = evaluator(model, output_path, epoch, steps) 17 | 18 | return main_score 19 | -------------------------------------------------------------------------------- /sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | class CosineSimilarityLoss(nn.Module): 7 | def __init__(self, model: SentenceTransformer): 8 | super(CosineSimilarityLoss, self).__init__() 9 | self.model = model 10 | 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | rep_a, rep_b = reps 15 | 16 | output = torch.cosine_similarity(rep_a, rep_b) 17 | loss_fct = nn.MSELoss() 18 | 19 | if labels is not None: 20 | loss = loss_fct(output, labels.view(-1)) 21 | return loss 22 | else: 23 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /examples/basic_embedding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This basic example loads a pre-trained model from the web and uses it to 3 | generate sentence embeddings for a given list of sentences. 
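Each embedding is returned as a numpy array. As an illustrative follow-up (scipy is already a dependency of this package), two of the produced embeddings can be compared via cosine similarity:

    from scipy.spatial.distance import cosine
    similarity = 1 - cosine(sentence_embeddings[0], sentence_embeddings[1])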
4 | """ 5 | 6 | from sentence_transformers import SentenceTransformer, LoggingHandler 7 | import numpy as np 8 | import logging 9 | 10 | #### Just some code to print debug information to stdout 11 | np.set_printoptions(threshold=100) 12 | 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | #### /print debug information to stdout 18 | 19 | 20 | 21 | # Load Sentence model (based on BERT) from URL 22 | model = SentenceTransformer('bert-base-nli-mean-tokens') 23 | 24 | # Embed a list of sentences 25 | sentences = ['This framework generates embeddings for each input sentence', 26 | 'Sentences are passed as a list of string.', 27 | 'The quick brown fox jumps over the lazy dog.'] 28 | sentence_embeddings = model.encode(sentences) 29 | 30 | # The result is a list of sentence embeddings as numpy arrays 31 | for sentence, embedding in zip(sentences, sentence_embeddings): 32 | print("Sentence:", sentence) 33 | print("Embedding:", embedding) 34 | print("") 35 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | setup( 7 | name="sentence-transformers", 8 | version="0.2.5", 9 | author="Nils Reimers, Gregor Geigle", 10 | author_email="Rnils@web.de", 11 | description="Sentence Embeddings using BERT / RoBERTa / XLNet", 12 | long_description=readme, 13 | long_description_content_type="text/markdown", 14 | license="Apache License 2.0", 15 | url="https://github.com/UKPLab/sentence-transformers", 16 | download_url="https://github.com/UKPLab/sentence-transformers/archive/v0.2.5.zip", 17 | packages=find_packages(), 18 | install_requires=[ 19 | "transformers==2.3.0", 20 | "tqdm", 21 | "torch>=1.0.1", 22 | "numpy", 23 | "scikit-learn", 24 | "scipy", 25 | "nltk" 26 | ], 27 | classifiers=[ 28 | "Development Status :: 4 - Beta", 29 | "Intended Audience :: Science/Research", 30 | "License :: OSI Approved :: Apache Software License", 31 | "Programming Language :: Python :: 3.6", 32 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 33 | ], 34 | keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning" 35 | ) 36 | -------------------------------------------------------------------------------- /sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 
9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | 16 | def get_examples(self, filename, max_examples=0): 17 | examples = [] 18 | 19 | id = 0 20 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 21 | splits = line.strip().split('\t') 22 | label = splits[self.label_col_idx] 23 | sentence = splits[self.sentence_col_idx] 24 | 25 | if label not in self.label_map: 26 | self.label_map[label] = len(self.label_map) 27 | 28 | label_id = self.label_map[label] 29 | guid = "%s-%d" % (filename, id) 30 | id += 1 31 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 32 | 33 | if 0 < max_examples <= id: 34 | break 35 | 36 | return examples -------------------------------------------------------------------------------- /examples/evaluation_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples loads a pre-trained model and evaluates it on the STSbenchmark dataset 3 | """ 4 | from torch.utils.data import DataLoader 5 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 6 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 7 | from sentence_transformers.readers import STSDataReader 8 | import numpy as np 9 | import logging 10 | 11 | 12 | #### Just some code to print debug information to stdout 13 | np.set_printoptions(threshold=100) 14 | 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | 22 | 23 | # Load a named sentence model (based on BERT). This will download the model from our server. 
24 | # Alternatively, you can also pass a filepath to SentenceTransformer() 25 | model = SentenceTransformer('bert-base-nli-mean-tokens') 26 | 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | 29 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 30 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 31 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 32 | 33 | model.evaluate(evaluator) 34 | -------------------------------------------------------------------------------- /sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin=1): 18 | super(TripletLoss, self).__init__() 19 | self.model = model 20 | self.distance_metric = distance_metric 21 | self.triplet_margin = triplet_margin 22 | 23 | 24 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 25 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 26 | 27 | rep_anchor, rep_pos, rep_neg = reps 28 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 29 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 30 | 31 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 32 | return losses.mean() -------------------------------------------------------------------------------- /sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . 
import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2, s3], label=1)) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /examples/application_clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: clustering 3 | 4 | Sentences are mapped to sentence embeddings and then k-mean clustering is applied. 5 | """ 6 | from sentence_transformers import SentenceTransformer 7 | from sklearn.cluster import KMeans 8 | 9 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 10 | 11 | # Corpus with example sentences 12 | corpus = ['A man is eating food.', 13 | 'A man is eating a piece of bread.', 14 | 'A man is eating pasta.', 15 | 'The girl is carrying a baby.', 16 | 'The baby is carried by the woman', 17 | 'A man is riding a horse.', 18 | 'A man is riding a white horse on an enclosed ground.', 19 | 'A monkey is playing drums.', 20 | 'Someone in a gorilla costume is playing a set of drums.', 21 | 'A cheetah is running behind its prey.', 22 | 'A cheetah chases prey on across a field.' 23 | ] 24 | corpus_embeddings = embedder.encode(corpus) 25 | 26 | # Perform kmean clustering 27 | num_clusters = 5 28 | clustering_model = KMeans(n_clusters=num_clusters) 29 | clustering_model.fit(corpus_embeddings) 30 | cluster_assignment = clustering_model.labels_ 31 | 32 | clustered_sentences = [[] for i in range(num_clusters)] 33 | for sentence_id, cluster_id in enumerate(cluster_assignment): 34 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 35 | 36 | for i, cluster in enumerate(clustered_sentences): 37 | print("Cluster ", i+1) 38 | print(cluster) 39 | print("") 40 | -------------------------------------------------------------------------------- /sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. 
Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | """ 10 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 11 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 12 | self.dataset_folder = dataset_folder 13 | self.score_col_idx = score_col_idx 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.delimiter = delimiter 17 | self.quoting = quoting 18 | self.normalize_scores = normalize_scores 19 | self.min_score = min_score 20 | self.max_score = max_score 21 | 22 | def get_examples(self, filename, max_examples=0): 23 | """ 24 | filename specified which data split to use (train.csv, dev.csv, test.csv). 25 | """ 26 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), 27 | delimiter=self.delimiter, quoting=self.quoting) 28 | examples = [] 29 | for id, row in enumerate(data): 30 | score = float(row[self.score_col_idx]) 31 | if self.normalize_scores: # Normalize to a 0...1 value 32 | score = (score - self.min_score) / (self.max_score - self.min_score) 33 | 34 | s1 = row[self.s1_col_idx] 35 | s2 = row[self.s2_col_idx] 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 37 | 38 | if max_examples > 0 and len(examples) >= max_examples: 39 | break 40 | 41 | return examples 42 | -------------------------------------------------------------------------------- /sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' + filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /examples/application_semantic_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: semantic search 3 | 4 | We have a corpus with various sentences. 
Then, for a given query sentence, 5 | we want to find the most similar sentence in this corpus. 6 | 7 | This script outputs for various queries the top 5 most similar sentences in the corpus. 8 | """ 9 | from sentence_transformers import SentenceTransformer 10 | import scipy.spatial 11 | 12 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 13 | 14 | # Corpus with example sentences 15 | corpus = ['A man is eating food.', 16 | 'A man is eating a piece of bread.', 17 | 'The girl is carrying a baby.', 18 | 'A man is riding a horse.', 19 | 'A woman is playing violin.', 20 | 'Two men pushed carts through the woods.', 21 | 'A man is riding a white horse on an enclosed ground.', 22 | 'A monkey is playing drums.', 23 | 'A cheetah is running behind its prey.' 24 | ] 25 | corpus_embeddings = embedder.encode(corpus) 26 | 27 | # Query sentences: 28 | queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.'] 29 | query_embeddings = embedder.encode(queries) 30 | 31 | # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity 32 | closest_n = 5 33 | for query, query_embedding in zip(queries, query_embeddings): 34 | distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0] 35 | 36 | results = zip(range(len(distances)), distances) 37 | results = sorted(results, key=lambda x: x[1]) 38 | 39 | print("\n\n======================\n\n") 40 | print("Query:", query) 41 | print("\nTop 5 most similar sentences in corpus:") 42 | 43 | for idx, distance in results[0:closest_n]: 44 | print(corpus[idx].strip(), "(Score: %.4f)" % (1-distance)) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /sentence_transformers/losses/MultipleNegativesRankingLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from ..SentenceTransformer import SentenceTransformer 6 | 7 | class MultipleNegativesRankingLoss(nn.Module): 8 | def __init__(self, model: SentenceTransformer): 9 | super(MultipleNegativesRankingLoss, self).__init__() 10 | self.model = model 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | 15 | reps_a, reps_b = reps 16 | return self.multiple_negatives_ranking_loss(reps_a, reps_b) 17 | 18 | # Multiple Negatives Ranking Loss 19 | # Paper: https://arxiv.org/pdf/1705.00652.pdf 20 | # Efficient Natural Language Response Suggestion for Smart Reply 21 | # Section 4.4 22 | def multiple_negatives_ranking_loss(self, embeddings_a: Tensor, embeddings_b: Tensor): 23 | """ 24 | Compute the loss over a batch with two embeddings per example. 25 | 26 | Each pair is a positive example. The negative examples are all other embeddings in embeddings_b with each embedding 27 | in embedding_a. 
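Concretely, for the dot-product score matrix S = embeddings_a @ embeddings_b.T of shape (batch_size, batch_size), the code below computes

    loss = -mean(diag(S)) + mean(logsumexp(S, dim=1))

which equals the mean softmax cross-entropy of each row of S against its own index, i.e. each embedding in embeddings_a must score its paired embedding in embeddings_b higher than all other entries in the batch.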
28 | 29 | See the paper for more information: https://arxiv.org/pdf/1705.00652.pdf 30 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 31 | 32 | :param embeddings_a: 33 | Tensor of shape (batch_size, embedding_dim) 34 | :param embeddings_b: 35 | Tensor of shape (batch_size, embedding_dim) 36 | :return: 37 | The scalar loss 38 | """ 39 | scores = torch.matmul(embeddings_a, embeddings_b.t()) 40 | diagonal_mean = torch.mean(torch.diag(scores)) 41 | mean_log_row_sum_exp = torch.mean(torch.logsumexp(scores, dim=1)) 42 | return -diagonal_mean + mean_log_row_sum_exp 43 | -------------------------------------------------------------------------------- /sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 15 | """ 16 | def __init__(self, in_features, out_features, bias=True, activation_function=nn.Tanh()): 17 | super(Dense, self).__init__() 18 | self.in_features = in_features 19 | self.out_features = out_features 20 | self.bias = bias 21 | self.activation_function = activation_function 22 | self.linear = nn.Linear(in_features, out_features, bias=bias) 23 | 24 | def forward(self, features: Dict[str, Tensor]): 25 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 26 | return features 27 | 28 | def get_sentence_embedding_dimension(self) -> int: 29 | return self.out_features 30 | 31 | def save(self, output_path): 32 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 33 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 34 | 35 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 36 | 37 | @staticmethod 38 | def load(input_path): 39 | with open(os.path.join(input_path, 'config.json')) as fIn: 40 | config = json.load(fIn) 41 | 42 | config['activation_function'] = import_from_string(config['activation_function'])() 43 | model = Dense(**config) 44 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'))) 45 | return model 46 | -------------------------------------------------------------------------------- /docs/pretrained-models/sts-models.md: -------------------------------------------------------------------------------- 1 | # STS Models 2 | The models were first trained on [NLI data](nli-models.md), then we fine-tuned them on the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). This generate sentence embeddings that are especially suitable to measure the semantic similarity between sentence pairs. 3 | 4 | # Datasets 5 | We use the training file from the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). 
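A minimal sketch of how that training file is read with this repository's `STSDataReader` (gold scores in the 0-5 range are rescaled to 0-1 so they can be used directly with `CosineSimilarityLoss`); the dataset path assumes the data was fetched with `examples/datasets/get_data.py`:

```
from sentence_transformers.readers import STSDataReader

sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True)
train_examples = sts_reader.get_examples('sts-train.csv')
```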
6 | 7 | For a training example, see: 8 | - [examples/training_stsbenchmark.py](../../examples/training_stsbenchmark_bert.py) - Train directly on STS data 9 | - [examples/training_stsbenchmark_continue_training.py ](../../examples/training_stsbenchmark_continue_training.py) - First train one NLI, than train on STS data. 10 | 11 | # Pre-trained models 12 | We provide the following pre-trained models: 13 | 14 | ### BERT models 15 | - **bert-base-nli-stsb-mean-tokens**: BERT-base trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 85.14 16 | - **bert-large-nli-stsb-mean-tokens**: BERT-large trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 85.29 17 | 18 | ### RoBERTa models 19 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 20 | - **roberta-base-nli-stsb-mean-tokens**: RoBERTa-base trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 85.40 21 | - **roberta-large-nli-stsb-mean-tokens**: RoBERTa-large trained on AllNLI, then on STS benchmark training set. Performance: STSbenchmark: 86.31 22 | 23 | ### DistilBERT 24 | DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 25 | - **distilbert-base-nli-stsb-mean-tokens**: Performance: STSbenchmark: 84.38 26 | 27 | # Performance Comparison 28 | Here are the performances on the STS benchmark for other sentence embeddings methods. They were also computed by using cosine-similarity and Spearman rank correlation. Note, these models were not-fined on the STS benchmark. 29 | 30 | - Avg. GloVe embeddings: 58.02 31 | - BERT-as-a-service avg. embeddings: 46.35 32 | - BERT-as-a-service CLS-vector: 16.50 33 | - InferSent - GloVe: 68.03 34 | - Universal Sentence Encoder: 74.92 35 | -------------------------------------------------------------------------------- /docs/pretrained-models/wikipedia-sections-models.md: -------------------------------------------------------------------------------- 1 | # Wikipedia Sections Models 2 | The `wikipedia-sections-models` implement the idea from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 3 | 4 | It was trained with a triplet-loss: The anchor and the positive example were sentences from the same section from an wikipedia article, for example, from the History section of the London article. The negative example came from a different section from the same article, for example, from the Education section of the London article. 5 | 6 | # Dataset 7 | We use dataset from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 8 | 9 | See [examples/training_wikipedia_sections.py](../../examples/training_wikipedia_sections.py) for how to train on this dataset. 10 | 11 | 12 | # Pre-trained models 13 | We provide the following pre-trained models: 14 | 15 | - **bert-base-wikipedia-sections-mean-tokens**: 80.42% accuracy on test set. 
16 | 17 | You can use them in the following way: 18 | ``` 19 | from sentence_transformers import SentenceTransformer 20 | embedder = SentenceTransformer('pretrained-model-name') 21 | ``` 22 | 23 | # Performance Comparison 24 | Performance (accuracy) reported by Dor et al.: 25 | - mean-vectors: 0.65 26 | - skip-thoughts-CS: 0.615 27 | - skip-thoughts-SICK: 0.547 28 | - triplet-sen: 0.74 29 | 30 | 31 | # Applications 32 | The models achieve a rather low performance on the STS benchmark dataset. The reason for this is the training objective: An anchor, a positive and a negative example are presented. The network must only learn to differentiate what the positive and what the negative example is by ensuring that the negative example is further away from the anchor than the positive example. 33 | 34 | However, it does not matter how far the negative example is away, it can be little or really far away. This makes this model rather bad for deciding if a pair is somewhat similar. It learns only to recognize similar pairs (high scores) and dissimilar pairs (low scores). 35 | 36 | However, this model works well for **fine-grained clustering**. 37 | 38 | For an example, see: 39 | [examples/application_clustering_wikipedia_sections.py](../../examples/application_clustering_wikipedia_sections.py) 40 | 41 | 42 | -------------------------------------------------------------------------------- /sentence_transformers/losses/SoftmaxLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | import logging 6 | 7 | class SoftmaxLoss(nn.Module): 8 | def __init__(self, 9 | model: SentenceTransformer, 10 | sentence_embedding_dimension: int, 11 | num_labels: int, 12 | concatenation_sent_rep: bool = True, 13 | concatenation_sent_difference: bool = True, 14 | concatenation_sent_multiplication: bool = False): 15 | super(SoftmaxLoss, self).__init__() 16 | self.model = model 17 | self.num_labels = num_labels 18 | self.concatenation_sent_rep = concatenation_sent_rep 19 | self.concatenation_sent_difference = concatenation_sent_difference 20 | self.concatenation_sent_multiplication = concatenation_sent_multiplication 21 | 22 | num_vectors_concatenated = 0 23 | if concatenation_sent_rep: 24 | num_vectors_concatenated += 2 25 | if concatenation_sent_difference: 26 | num_vectors_concatenated += 1 27 | if concatenation_sent_multiplication: 28 | num_vectors_concatenated += 1 29 | logging.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated)) 30 | self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels) 31 | 32 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 33 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 34 | rep_a, rep_b = reps 35 | 36 | vectors_concat = [] 37 | if self.concatenation_sent_rep: 38 | vectors_concat.append(rep_a) 39 | vectors_concat.append(rep_b) 40 | 41 | if self.concatenation_sent_difference: 42 | vectors_concat.append(torch.abs(rep_a - rep_b)) 43 | 44 | if self.concatenation_sent_multiplication: 45 | vectors_concat.append(rep_a * rep_b) 46 | 47 | features = torch.cat(vectors_concat, 1) 48 | 49 | output = self.classifier(features) 50 | loss_fct = nn.CrossEntropyLoss() 51 | 52 | if labels is not None: 53 | loss = loss_fct(output, labels.view(-1)) 54 | 
return loss 55 | else: 56 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | def tokenize(self, text: str) -> List[int]: 26 | if self.do_lower_case: 27 | text = text.lower() 28 | 29 | tokens = text.split() 30 | 31 | tokens_filtered = [] 32 | for token in tokens: 33 | if token in self.stop_words: 34 | continue 35 | elif token in self.word2idx: 36 | tokens_filtered.append(self.word2idx[token]) 37 | continue 38 | 39 | token = token.strip(string.punctuation) 40 | if token in self.stop_words: 41 | continue 42 | elif len(token) > 0 and token in self.word2idx: 43 | tokens_filtered.append(self.word2idx[token]) 44 | continue 45 | 46 | token = token.lower() 47 | if token in self.stop_words: 48 | continue 49 | elif token in self.word2idx: 50 | tokens_filtered.append(self.word2idx[token]) 51 | continue 52 | 53 | return tokens_filtered 54 | 55 | def save(self, output_path: str): 56 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 57 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | return WhitespaceTokenizer(**config) 65 | -------------------------------------------------------------------------------- /sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class LSTM(nn.Module): 15 | """Bidirectional LSTM running over word embeddings. 
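The forward and backward hidden states are concatenated, so the produced token embeddings have dimension 2 * hidden_dim (see get_word_embedding_dimension()). An illustrative sketch of stacking the layer between a word-embedding module and a pooling module; everything except the LSTM constructor itself is an assumption here, see examples/training_stsbenchmark_bilstm.py for a complete script:

    lstm = LSTM(word_embedding_dimension=300, hidden_dim=1024)
    pooling = Pooling(lstm.get_word_embedding_dimension())
    model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling])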
16 | """ 17 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout'] 20 | self.word_embedding_dimension = word_embedding_dimension 21 | self.hidden_dim = hidden_dim 22 | self.num_layers = num_layers 23 | self.dropout = dropout 24 | 25 | self.embeddings_dimension = 2*hidden_dim 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=True, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | in_channels = in_word_embedding_dimension 28 | for kernel_size in kernel_sizes: 29 | padding_size = int((kernel_size - 1) / 2) 30 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 31 | padding=padding_size) 32 | self.convs.append(conv) 33 | 34 | def forward(self, features): 35 | token_embeddings = features['token_embeddings'] 36 | 37 | token_embeddings = token_embeddings.transpose(1, -1) 38 
| vectors = [conv(token_embeddings) for conv in self.convs] 39 | out = torch.cat(vectors, 1).transpose(1, -1) 40 | 41 | features.update({'token_embeddings': out}) 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.embeddings_dimension 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | raise NotImplementedError() 49 | 50 | def save(self, output_path: str): 51 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 52 | json.dump(self.get_config_dict(), fOut, indent=2) 53 | 54 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 55 | 56 | def get_config_dict(self): 57 | return {key: self.__dict__[key] for key in self.config_keys} 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 65 | model = CNN(**config) 66 | model.load_state_dict(weights) 67 | return model 68 | 69 | -------------------------------------------------------------------------------- /sentence_transformers/util.py: -------------------------------------------------------------------------------- 1 | import requests 2 | from torch import Tensor, device 3 | from typing import Tuple, List 4 | from tqdm import tqdm 5 | import sys 6 | import importlib 7 | 8 | 9 | def batch_to_device(batch, target_device: device): 10 | """ 11 | send a batch to a device 12 | 13 | :param batch: 14 | :param target_device: 15 | :return: the batch sent to the device 16 | """ 17 | features = batch['features'] 18 | for paired_sentence_idx in range(len(features)): 19 | for feature_name in features[paired_sentence_idx]: 20 | features[paired_sentence_idx][feature_name] = features[paired_sentence_idx][feature_name].to(target_device) 21 | 22 | labels = batch['labels'].to(target_device) 23 | return features, labels 24 | 25 | 26 | 27 | def http_get(url, path): 28 | file_binary = open(path, "wb") 29 | req = requests.get(url, stream=True) 30 | if req.status_code != 200: 31 | print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr) 32 | req.raise_for_status() 33 | 34 | content_length = req.headers.get('Content-Length') 35 | total = int(content_length) if content_length is not None else None 36 | progress = tqdm(unit="B", total=total, unit_scale=True) 37 | for chunk in req.iter_content(chunk_size=1024): 38 | if chunk: # filter out keep-alive new chunks 39 | progress.update(len(chunk)) 40 | file_binary.write(chunk) 41 | progress.close() 42 | 43 | 44 | def fullname(o): 45 | # o.__module__ + "." + o.__class__.__qualname__ is an example in 46 | # this context of H.L. Mencken's "neat, plausible, and wrong." 47 | # Python makes no guarantees as to whether the __module__ special 48 | # attribute is defined, so we take a more circumspect approach. 49 | # Alas, the module name is explicitly excluded from __qualname__ 50 | # in Python 3. 51 | 52 | module = o.__class__.__module__ 53 | if module is None or module == str.__class__.__module__: 54 | return o.__class__.__name__ # Avoid reporting __builtin__ 55 | else: 56 | return module + '.' + o.__class__.__name__ 57 | 58 | def import_from_string(dotted_path): 59 | """ 60 | Import a dotted module path and return the attribute/class designated by the 61 | last name in the path. Raise ImportError if the import failed. 
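Illustrative example; Dense.load() uses this to restore an activation function from the dotted name written out by fullname():

    >>> import_from_string('torch.nn.modules.activation.Tanh')
    <class 'torch.nn.modules.activation.Tanh'>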
62 | """ 63 | try: 64 | module_path, class_name = dotted_path.rsplit('.', 1) 65 | except ValueError: 66 | msg = "%s doesn't look like a module path" % dotted_path 67 | raise ImportError(msg) 68 | 69 | module = importlib.import_module(module_path) 70 | 71 | try: 72 | return getattr(module, class_name) 73 | except AttributeError: 74 | msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name) 75 | raise ImportError(msg) -------------------------------------------------------------------------------- /sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | 10 | class LabelAccuracyEvaluator(SentenceEvaluator): 11 | """ 12 | Evaluate a model based on its accuracy on a labeled dataset 13 | 14 | This requires a model with LossFunction.SOFTMAX 15 | 16 | The results are written in a CSV. If a CSV already exists, then values are appended. 17 | """ 18 | 19 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None): 20 | """ 21 | Constructs an evaluator for the given dataset 22 | 23 | :param dataloader: 24 | the data for the evaluation 25 | """ 26 | self.dataloader = dataloader 27 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | self.name = name 29 | self.softmax_model = softmax_model 30 | self.softmax_model.to(self.device) 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 36 | self.csv_headers = ["epoch", "steps", "accuracy"] 37 | 38 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 39 | model.eval() 40 | total = 0 41 | correct = 0 42 | 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logging.info("Evaluation on the "+self.name+" dataset"+out_txt) 52 | self.dataloader.collate_fn = model.smart_batching_collate 53 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 54 | features, label_ids = batch_to_device(batch, self.device) 55 | with torch.no_grad(): 56 | _, prediction = self.softmax_model(features, labels=None) 57 | 58 | total += prediction.size(0) 59 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 60 | accuracy = correct/total 61 | 62 | logging.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 63 | 64 | if output_path is not None: 65 | csv_path = os.path.join(output_path, self.csv_file) 66 | if not os.path.isfile(csv_path): 67 | with open(csv_path, mode="w", encoding="utf-8") as f: 68 | writer = csv.writer(f) 69 | writer.writerow(self.csv_headers) 70 | writer.writerow([epoch, steps, accuracy]) 71 | else: 72 | with open(csv_path, mode="a", encoding="utf-8") as f: 73 | writer = csv.writer(f) 74 | writer.writerow([epoch, steps, accuracy]) 75 | 76 | return accuracy -------------------------------------------------------------------------------- /examples/training_stsbenchmark_continue_training.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example loads the pre-trained bert-base-nli-mean-tokens models from the server. 
3 | It then fine-tunes this model for some epochs on the STS benchmark dataset. 4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | model_name = 'bert-base-nli-mean-tokens' 23 | train_batch_size = 16 24 | num_epochs = 4 25 | model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 26 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 27 | 28 | # Load a pre-trained sentence transformer model 29 | model = SentenceTransformer(model_name) 30 | 31 | # Convert the dataset to a DataLoader ready for training 32 | logging.info("Read STSbenchmark train dataset") 33 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 34 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 35 | train_loss = losses.CosineSimilarityLoss(model=model) 36 | 37 | 38 | logging.info("Read STSbenchmark dev dataset") 39 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 40 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 41 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 42 | 43 | 44 | # Configure the training. We skip evaluation in this example 45 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 46 | logging.info("Warmup-steps: {}".format(warmup_steps)) 47 | 48 | 49 | # Train the model 50 | model.fit(train_objectives=[(train_dataloader, train_loss)], 51 | evaluator=evaluator, 52 | epochs=num_epochs, 53 | evaluation_steps=1000, 54 | warmup_steps=warmup_steps, 55 | output_path=model_save_path) 56 | 57 | 58 | ############################################################################## 59 | # 60 | # Load the stored model and evaluate its performance on STS benchmark dataset 61 | # 62 | ############################################################################## 63 | 64 | model = SentenceTransformer(model_save_path) 65 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 66 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 67 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 68 | model.evaluate(evaluator) 69 | -------------------------------------------------------------------------------- /sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | class BoW(nn.Module): 12 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 13 | 14 | A weighting can be added to allow the generation of tf-idf vectors. 
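Illustrative construction with a tiny vocabulary and made-up idf weights:

    bow = BoW(vocab=['nice', 'weather', 'today'],
              word_weights={'nice': 1.3, 'weather': 2.8, 'today': 0.7})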
The output vector has the size of the vocab. 15 | """ 16 | 17 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 18 | super(BoW, self).__init__() 19 | vocab = list(set(vocab)) #Ensure vocab is unique 20 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 21 | self.vocab = vocab 22 | self.word_weights = word_weights 23 | self.unknown_word_weight = unknown_word_weight 24 | self.cumulative_term_frequency = cumulative_term_frequency 25 | 26 | #Maps wordIdx -> word weight 27 | self.weights = [] 28 | num_unknown_words = 0 29 | for word in vocab: 30 | weight = unknown_word_weight 31 | if word in word_weights: 32 | weight = word_weights[word] 33 | elif word.lower() in word_weights: 34 | weight = word_weights[word.lower()] 35 | else: 36 | num_unknown_words += 1 37 | self.weights.append(weight) 38 | 39 | logging.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 40 | 41 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 42 | self.sentence_embedding_dimension = len(vocab) 43 | 44 | 45 | def forward(self, features: Dict[str, Tensor]): 46 | #Nothing to do, everything is done in get_sentence_features 47 | return features 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | return self.tokenizer.tokenize(text) 51 | 52 | def get_sentence_embedding_dimension(self): 53 | return self.sentence_embedding_dimension 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | #return {'input_ids': tokens} 57 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 58 | for token in tokens: 59 | if self.cumulative_term_frequency: 60 | vector[token] += self.weights[token] 61 | else: 62 | vector[token] = self.weights[token] 63 | 64 | return {'sentence_embedding': vector} 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path): 70 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 71 | json.dump(self.get_config_dict(), fOut, indent=2) 72 | 73 | @staticmethod 74 | def load(input_path): 75 | with open(os.path.join(input_path, 'config.json')) as fIn: 76 | config = json.load(fIn) 77 | 78 | return BoW(**config) -------------------------------------------------------------------------------- /docs/pretrained-models/nli-models.md: -------------------------------------------------------------------------------- 1 | # NLI Models 2 | Conneau et al., 2017, show in the InferSent paper ([Supervised Learning of Universal Sentence Representations from Natural Language Inference Data](https://arxiv.org/abs/1705.02364)) that training on Natural Language Inference (NLI) data can produce universal sentence embeddings. 3 | 4 | The datasets contain sentence pairs labeled as *entail*, *contradict*, or *neutral*. For each of the two sentences, we compute a sentence embedding. These two embeddings are concatenated and passed to a softmax classifier to derive the final label. 5 | 6 | As shown, this produces sentence embeddings that can be used for various use cases such as clustering or semantic search. 7 | 8 | # Datasets 9 | We train the models on the [SNLI](https://nlp.stanford.edu/projects/snli/) and the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) datasets. We call the combination of the two datasets AllNLI.
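A minimal usage sketch for the pre-trained models listed below (assuming the package is installed and the model can be fetched from the download server; the example sentences are arbitrary):

```python
from sentence_transformers import SentenceTransformer

# One of the pre-trained NLI models listed in this document
model = SentenceTransformer('bert-base-nli-mean-tokens')

# encode() returns one embedding per input sentence
embeddings = model.encode(['A man is eating food.', 'A man is riding a horse.'])
print(len(embeddings), len(embeddings[0]))  # 2 embeddings, 768 dimensions each for a BERT-base model
```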
10 | 11 | For a training example, see [examples/training_nli_bert.py](../../examples/training_nli_bert.py). 12 | 13 | # Pre-trained models 14 | We provide the following pre-trained models. The performance was evaluated on the test set of the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) using Spearman rank correlation. 15 | 16 | 17 | ### BERT models 18 | - **bert-base-nli-mean-tokens**: BERT-base model with mean-tokens pooling. Performance: STSbenchmark: 77.12 19 | - **bert-base-nli-max-tokens**: BERT-base with max-tokens pooling. Performance: STSbenchmark: 77.18 20 | - **bert-base-nli-cls-token**: BERT-base with CLS-token pooling. Performance: STSbenchmark: 76.30 21 | - **bert-large-nli-mean-tokens**: BERT-large with mean-tokens pooling. Performance: STSbenchmark: 79.19 22 | - **bert-large-nli-max-tokens**: BERT-large with max-tokens pooling. Performance: STSbenchmark: 78.32 23 | - **bert-large-nli-cls-token**: BERT-large with CLS-token pooling. Performance: STSbenchmark: 78.29 24 | 25 | ### RoBERTa models 26 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 27 | - **roberta-base-nli-mean-tokens**: RoBERTa-base with mean-tokens pooling. Performance: STSbenchmark: 77.42 28 | - **roberta-large-nli-mean-tokens**: RoBERTa-large with mean-tokens pooling. Performance: STSbenchmark: 78.58 29 | 30 | ### DistilBERT models 31 | DistilBERT is a small, fast, cheap, and light Transformer model based on the BERT architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 32 | - **distilbert-base-nli-mean-tokens**: DistilBERT-base with mean-tokens pooling. Performance: STSbenchmark: 76.97 33 | 34 | # Performance Comparison 35 | For comparison, here is the performance of other sentence embedding methods on the STS benchmark, also computed using cosine similarity and Spearman rank correlation: 36 | - Avg. GloVe embeddings: 58.02 37 | - BERT-as-a-service avg. embeddings: 46.35 38 | - BERT-as-a-service CLS-vector: 16.50 39 | - InferSent - GloVe: 68.03 40 | - Universal Sentence Encoder: 74.92 41 | 42 | # Applications 43 | These models work well for assessing the coarse-grained similarity between sentences. For application examples, see [examples/application_semantic_search.py](../../examples/application_semantic_search.py) and [examples/application_clustering.py](../../examples/application_clustering.py) -------------------------------------------------------------------------------- /sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | class WordWeights(nn.Module): 10 | """This model can weight word embeddings, for example, with idf-values.""" 11 | 12 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 13 | """ 14 | 15 | :param vocab: 16 | Vocabulary of the tokenizer 17 | :param word_weights: 18 | Mapping of tokens to a float weight value. Word embeddings are multiplied by this float value. The keys of word_weights do not have to match the vocab exactly; it can contain more or fewer entries. 19 | :param unknown_word_weight: 20 | Weight for words in the vocab that do not appear in the word_weights lookup. These can be, for example, rare words for which no weight exists.
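        Example (a minimal sketch; the toy vocabulary and idf values below are made-up placeholders, and it assumes WordWeights is exposed via the sentence_transformers.models namespace):

            from sentence_transformers import models

            vocab = ['hello', 'world', 'rare_token']
            idf = {'hello': 1.2, 'world': 0.8}  # 'rare_token' is missing and falls back to unknown_word_weight
            word_weights = models.WordWeights(vocab=vocab, word_weights=idf, unknown_word_weight=1.0)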
21 | """ 22 | super(WordWeights, self).__init__() 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | 28 | weights = [] 29 | num_unknown_words = 0 30 | for word in vocab: 31 | weight = unknown_word_weight 32 | if word in word_weights: 33 | weight = word_weights[word] 34 | elif word.lower() in word_weights: 35 | weight = word_weights[word.lower()] 36 | else: 37 | num_unknown_words += 1 38 | weights.append(weight) 39 | 40 | logging.info("{} of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 41 | 42 | self.emb_layer = nn.Embedding(len(vocab), 1) 43 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 44 | 45 | 46 | def forward(self, features: Dict[str, Tensor]): 47 | input_mask = features['input_mask'] 48 | token_embeddings = features['token_embeddings'] 49 | 50 | #Compute a weight value for each token 51 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 52 | token_weights = token_weights_raw * input_mask.float() 53 | token_weights_sum = torch.sum(token_weights, 1) 54 | 55 | #Multiply embedding by token weight value 56 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 57 | token_embeddings = token_embeddings * token_weights_expanded 58 | 59 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 60 | return features 61 | 62 | def get_config_dict(self): 63 | return {key: self.__dict__[key] for key in self.config_keys} 64 | 65 | def save(self, output_path): 66 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | @staticmethod 70 | def load(input_path): 71 | with open(os.path.join(input_path, 'config.json')) as fIn: 72 | config = json.load(fIn) 73 | 74 | return WordWeights(**config) -------------------------------------------------------------------------------- /sentence_transformers/data_samplers.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains sampler functions, that can be used to sample mini-batches with specific properties. 3 | """ 4 | from torch.utils.data import Sampler 5 | import numpy as np 6 | from .datasets import SentenceLabelDataset 7 | 8 | 9 | class LabelSampler(Sampler): 10 | """ 11 | This sampler is used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS 12 | or MULTIPLE_NEGATIVES_RANKING_LOSS which require multiple or only one sample from one label per batch. 13 | 14 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 15 | 16 | Labels with fewer than n unique samples are ignored. 17 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 18 | 19 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 20 | by the samples drawn per label. 21 | 22 | 23 | """ 24 | def __init__(self, data_source: SentenceLabelDataset, samples_per_label: int = 5, 25 | with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 
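        Example (a minimal sketch; assumes `train_data` is an already constructed SentenceLabelDataset):

            from torch.utils.data import DataLoader
            from sentence_transformers import LabelSampler

            sampler = LabelSampler(train_data, samples_per_label=2)
            train_dataloader = DataLoader(train_data, sampler=sampler, batch_size=16)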
28 | 29 | :param data_source: 30 | the dataset from which samples are drawn 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__(data_source) 39 | self.data_source = data_source 40 | self.samples_per_label = samples_per_label 41 | self.label_range = np.arange(data_source.num_labels) 42 | self.borders = data_source.labels_right_border 43 | self.with_replacement = with_replacement 44 | np.random.shuffle(self.label_range) 45 | 46 | def __iter__(self): 47 | label_idx = 0 48 | count = 0 49 | already_seen = {} 50 | while count < len(self.data_source): 51 | label = self.label_range[label_idx] 52 | if label not in already_seen: 53 | already_seen[label] = [] 54 | 55 | left_border = 0 if label == 0 else self.borders[label-1] 56 | right_border = self.borders[label] 57 | 58 | if self.with_replacement: 59 | selection = np.arange(left_border, right_border) 60 | else: 61 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 62 | 63 | if len(selection) >= self.samples_per_label: 64 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 65 | count += 1 66 | already_seen[label].append(element_idx) 67 | yield element_idx 68 | 69 | label_idx += 1 70 | if label_idx >= len(self.label_range): 71 | label_idx = 0 72 | np.random.shuffle(self.label_range) 73 | 74 | def __len__(self): 75 | return len(self.data_source) -------------------------------------------------------------------------------- /examples/training_stsbenchmark_bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
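Sketch of the cosine comparison mentioned above (assuming `model` is the SentenceTransformer trained by this script; scipy is listed in requirements.txt):

    from scipy.spatial.distance import cosine
    emb1, emb2 = model.encode(['A man is eating food.', 'A man is eating a piece of bread.'])
    cosine_similarity = 1 - cosine(emb1, emb2)  # 1 - cosine distance = cosine similarity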
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_bert-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use BERT for mapping tokens to embeddings 28 | word_embedding_model = models.BERT('bert-base-uncased') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_xlnet.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains XLNet for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_xlnet-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use XLNet for mapping tokens to embeddings 28 | word_embedding_model = models.XLNet('xlnet-base-cased') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_roberta.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains RoBERTa for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_roberta-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use RoBERTa-base for mapping tokens to embeddings 28 | word_embedding_model = models.RoBERTa('roberta-base') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_albert.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains ALBERT for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | train_batch_size = 16 24 | num_epochs = 4 25 | model_save_path = 'output/training_stsbenchmark_albert-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 26 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 27 | 28 | # Use RoBERTa-base for mapping tokens to embeddings 29 | word_embedding_model = models.ALBERT('albert-base-v2') 30 | 31 | # Apply mean pooling to get one fixed sized sentence vector 32 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 33 | pooling_mode_mean_tokens=True, 34 | pooling_mode_cls_token=False, 35 | pooling_mode_max_tokens=False) 36 | 37 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 38 | 39 | # Convert the dataset to a DataLoader ready for training 40 | logging.info("Read STSbenchmark train dataset") 41 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 42 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 43 | train_loss = losses.CosineSimilarityLoss(model=model) 44 | 45 | 46 | logging.info("Read STSbenchmark dev dataset") 47 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 48 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 49 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 50 | 51 | 52 | # Configure the training. We skip evaluation in this example 53 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 54 | logging.info("Warmup-steps: {}".format(warmup_steps)) 55 | 56 | 57 | # Train the model 58 | model.fit(train_objectives=[(train_dataloader, train_loss)], 59 | evaluator=evaluator, 60 | epochs=num_epochs, 61 | evaluation_steps=1000, 62 | warmup_steps=warmup_steps, 63 | output_path=model_save_path) 64 | 65 | 66 | ############################################################################## 67 | # 68 | # Load the stored model and evaluate its performance on STS benchmark dataset 69 | # 70 | ############################################################################## 71 | 72 | model = SentenceTransformer(model_save_path) 73 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 74 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 75 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 76 | model.evaluate(evaluator) 77 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_distilbert.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains DistilBERT for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | """ 5 | from torch.utils.data import DataLoader 6 | import math 7 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 8 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 9 | from sentence_transformers.readers import STSDataReader 10 | import logging 11 | from datetime import datetime 12 | 13 | 14 | #### Just some code to print debug information to stdout 15 | logging.basicConfig(format='%(asctime)s - %(message)s', 16 | datefmt='%Y-%m-%d %H:%M:%S', 17 | level=logging.INFO, 18 | handlers=[LoggingHandler()]) 19 | #### /print debug information to stdout 20 | 21 | # Read the dataset 22 | train_batch_size = 16 23 | num_epochs = 4 24 | model_save_path = 'output/training_stsbenchmark_distilbert-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 25 | sts_reader = STSDataReader('datasets/stsbenchmark', normalize_scores=True) 26 | 27 | # Use DistilBERT-base for mapping tokens to embeddings 28 | word_embedding_model = models.DistilBERT('distilbert-base-uncased') 29 | 30 | # Apply mean pooling to get one fixed sized sentence vector 31 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 32 | pooling_mode_mean_tokens=True, 33 | pooling_mode_cls_token=False, 34 | pooling_mode_max_tokens=False) 35 | 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | 38 | # Convert the dataset to a DataLoader ready for training 39 | logging.info("Read STSbenchmark train dataset") 40 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 41 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 42 | train_loss = losses.CosineSimilarityLoss(model=model) 43 | 44 | 45 | logging.info("Read STSbenchmark dev dataset") 46 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 47 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 48 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 49 | 50 | 51 | # Configure the training. We skip evaluation in this example 52 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 53 | logging.info("Warmup-steps: {}".format(warmup_steps)) 54 | 55 | 56 | # Train the model 57 | model.fit(train_objectives=[(train_dataloader, train_loss)], 58 | evaluator=evaluator, 59 | epochs=num_epochs, 60 | evaluation_steps=1000, 61 | warmup_steps=warmup_steps, 62 | output_path=model_save_path) 63 | 64 | 65 | ############################################################################## 66 | # 67 | # Load the stored model and evaluate its performance on STS benchmark dataset 68 | # 69 | ############################################################################## 70 | 71 | model = SentenceTransformer(model_save_path) 72 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 73 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 75 | model.evaluate(evaluator) 76 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_cnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a CNN after the word embedding lookup. The output of the CNN is than pooled, 3 | for example with mean-pooling. 
4 | 5 | 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to vectors using BERT 32 | word_embedding_model = models.BERT('bert-base-uncased') 33 | 34 | cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), out_channels=256, kernel_sizes=[1,3,5]) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(cnn.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script trains sentence transformers with a triplet loss function. 
3 | 4 | As corpus, we use the wikipedia sections dataset that was describd by Dor et al., 2018, Learning Thematic Similarity Metric Using Triplet Networks. 5 | 6 | See docs/pretrained-models/wikipedia-sections-modesl.md for further details. 7 | 8 | You can get the dataset by running examples/datasets/get_data.py 9 | """ 10 | 11 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 12 | from torch.utils.data import DataLoader 13 | from sentence_transformers.readers import TripletReader 14 | from sentence_transformers.evaluation import TripletEvaluator 15 | from datetime import datetime 16 | 17 | import csv 18 | import logging 19 | 20 | 21 | 22 | logging.basicConfig(format='%(asctime)s - %(message)s', 23 | datefmt='%Y-%m-%d %H:%M:%S', 24 | level=logging.INFO, 25 | handlers=[LoggingHandler()]) 26 | 27 | 28 | 29 | ### Create a torch.DataLoader that passes training batch instances to our model 30 | train_batch_size = 16 31 | triplet_reader = TripletReader('datasets/wikipedia-sections-triplets', s1_col_idx=1, s2_col_idx=2, s3_col_idx=3, delimiter=',', quoting=csv.QUOTE_MINIMAL, has_header=True) 32 | output_path = "output/bert-base-wikipedia-sections-mean-tokens-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 33 | num_epochs = 1 34 | 35 | 36 | ### Configure sentence transformers for training and train on the provided dataset 37 | # Use BERT for mapping tokens to embeddings 38 | word_embedding_model = models.BERT('bert-base-uncased') 39 | 40 | # Apply mean pooling to get one fixed sized sentence vector 41 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 42 | pooling_mode_mean_tokens=True, 43 | pooling_mode_cls_token=False, 44 | pooling_mode_max_tokens=False) 45 | 46 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 47 | 48 | 49 | logging.info("Read Triplet train dataset") 50 | train_data = SentencesDataset(examples=triplet_reader.get_examples('train.csv'), model=model) 51 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 52 | train_loss = losses.TripletLoss(model=model) 53 | 54 | logging.info("Read Wikipedia Triplet dev dataset") 55 | dev_data = SentencesDataset(examples=triplet_reader.get_examples('validation.csv', 1000), model=model) 56 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 57 | evaluator = TripletEvaluator(dev_dataloader) 58 | 59 | 60 | warmup_steps = int(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data 61 | 62 | 63 | # Train the model 64 | model.fit(train_objectives=[(train_dataloader, train_loss)], 65 | evaluator=evaluator, 66 | epochs=num_epochs, 67 | evaluation_steps=1000, 68 | warmup_steps=warmup_steps, 69 | output_path=output_path) 70 | 71 | ############################################################################## 72 | # 73 | # Load the stored model and evaluate its performance on STS benchmark dataset 74 | # 75 | ############################################################################## 76 | 77 | model = SentenceTransformer(output_path) 78 | test_data = SentencesDataset(examples=triplet_reader.get_examples('test.csv'), model=model) 79 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 80 | evaluator = TripletEvaluator(test_dataloader) 81 | 82 | model.evaluate(evaluator) 83 | 84 | -------------------------------------------------------------------------------- /examples/training_nli_T5.py: 
-------------------------------------------------------------------------------- 1 | """ 2 | The system trains T5 on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | 23 | # Read the dataset 24 | model_name = 't5-small' 25 | batch_size = 16 26 | nli_reader = NLIDataReader('datasets/AllNLI') 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | train_num_labels = nli_reader.get_num_labels() 29 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Use BERT for mapping tokens to embeddings 34 | word_embedding_model = models.T5(model_name) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 43 | 44 | 45 | # Convert the dataset to a DataLoader ready for training 46 | logging.info("Read AllNLI train dataset") 47 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 48 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 49 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 50 | 51 | 52 | 53 | logging.info("Read STSbenchmark dev dataset") 54 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 55 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 56 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 57 | 58 | # Configure the training 59 | num_epochs = 1 60 | 61 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | evaluation_steps=1000, 71 | warmup_steps=warmup_steps, 72 | output_path=model_save_path 73 | ) 74 | 75 | 76 | 77 | ############################################################################## 78 | # 79 | # Load the stored model and evaluate its performance on STS benchmark dataset 80 | # 81 | ############################################################################## 82 | 83 | model = SentenceTransformer(model_save_path) 84 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 85 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 86 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 87 | 88 | 
model.evaluate(evaluator) 89 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_bilstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to traditional word embeddings like GloVe 32 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 33 | 34 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=1024) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(lstm.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=False, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=True) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), 
model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/application_clustering_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples clusters different sentences that come from the same wikipedia article. 3 | 4 | It uses the 'wikipedia-sections' model, a model that was trained to differentiate if two sentences from the 5 | same article come from the same section or from different sections in that article. 6 | """ 7 | from sentence_transformers import SentenceTransformer 8 | from sklearn.cluster import AgglomerativeClustering 9 | 10 | 11 | 12 | embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens') 13 | 14 | #Sentences and sections are from Wikipeda. 15 | #Source: https://en.wikipedia.org/wiki/Bushnell,_Illinois 16 | corpus = [ 17 | ("Bushnell is located at 40°33′6″N 90°30′29″W (40.551667, -90.507921).", "Geography"), 18 | ("According to the 2010 census, Bushnell has a total area of 2.138 square miles (5.54 km2), of which 2.13 square miles (5.52 km2) (or 99.63%) is land and 0.008 square miles (0.02 km2) (or 0.37%) is water.", "Geography"), 19 | 20 | ("The town was founded in 1854 when the Northern Cross Railroad built a line through the area.", "History"), 21 | ("Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. ", "History"), 22 | ("Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.", "History"), 23 | 24 | ("As of the census[6] of 2000, there were 3,221 people, 1,323 households, and 889 families residing in the city. ", "Demographics"), 25 | ("The population density was 1,573.9 people per square mile (606.7/km²).", "Demographics"), 26 | ("There were 1,446 housing units at an average density of 706.6 per square mile (272.3/km²).", "Demographics"), 27 | 28 | ("From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.", "Music"), 29 | ("Each year around the 4th of July, 25,000 people from all over the world would descend on the small farm town to watch over 300 bands, authors and artists perform at the Cornerstone Farm Campgrounds.", "Music"), 30 | ("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"), 31 | ("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"), 32 | ("Cornerstone held its final festival in 2012 and no longer operates.", "Music"), 33 | 34 | ("Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.", "Horse show"), 35 | ("The show was well known for imported European horses.", "Horse show"), 36 | ("The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. 
Teams have come from many different states and Canada to compete.", "Horse show"), 37 | ] 38 | 39 | sentences = [row[0] for row in corpus] 40 | 41 | corpus_embeddings = embedder.encode(sentences) 42 | num_clusters = len(set([row[1] for row in corpus])) 43 | 44 | #Sklearn clustering 45 | km = AgglomerativeClustering(n_clusters=num_clusters) 46 | km.fit(corpus_embeddings) 47 | 48 | cluster_assignment = km.labels_ 49 | 50 | 51 | clustered_sentences = [[] for i in range(num_clusters)] 52 | for sentence_id, cluster_id in enumerate(cluster_assignment): 53 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 54 | 55 | for i, cluster in enumerate(clustered_sentences): 56 | print("Cluster ", i+1) 57 | for row in cluster: 58 | print("(Gold label: {}) - {}".format(row[1], row[0])) 59 | print("") 60 | 61 | -------------------------------------------------------------------------------- /examples/training_nli_bert.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system trains BERT on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'bert-base-uncased' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('datasets/AllNLI') 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use BERT for mapping tokens to embeddings 33 | word_embedding_model = models.BERT(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 48 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 1 59 | 60 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 
0.1) #10% of train data for warm-up 61 | logging.info("Warmup-steps: {}".format(warmup_steps)) 62 | 63 | 64 | 65 | # Train the model 66 | model.fit(train_objectives=[(train_dataloader, train_loss)], 67 | evaluator=evaluator, 68 | epochs=num_epochs, 69 | evaluation_steps=1000, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) 88 | -------------------------------------------------------------------------------- /examples/training_nli_roberta.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system RoBERTa trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'roberta-large' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('datasets/AllNLI') 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use RoBERTa for mapping tokens to embeddings 33 | word_embedding_model = models.RoBERTa(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 48 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = 
EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 1 59 | 60 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 61 | logging.info("Warmup-steps: {}".format(warmup_steps)) 62 | 63 | 64 | # Train the model 65 | model.fit(train_objectives=[(train_dataloader, train_loss)], 66 | evaluator=evaluator, 67 | epochs=num_epochs, 68 | evaluation_steps=1000, 69 | warmup_steps=warmup_steps, 70 | output_path=model_save_path 71 | ) 72 | 73 | 74 | 75 | ############################################################################## 76 | # 77 | # Load the stored model and evaluate its performance on STS benchmark dataset 78 | # 79 | ############################################################################## 80 | 81 | model = SentenceTransformer(model_save_path) 82 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 83 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 84 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 85 | 86 | model.evaluate(evaluator) 87 | -------------------------------------------------------------------------------- /examples/training_nli_albert.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system ALBERT trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | 16 | #### Just some code to print debug information to stdout 17 | logging.basicConfig(format='%(asctime)s - %(message)s', 18 | datefmt='%Y-%m-%d %H:%M:%S', 19 | level=logging.INFO, 20 | handlers=[LoggingHandler()]) 21 | #### /print debug information to stdout 22 | 23 | # Read the dataset 24 | model_name = 'albert-base-v2' 25 | batch_size = 16 26 | nli_reader = NLIDataReader('datasets/AllNLI') 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | train_num_labels = nli_reader.get_num_labels() 29 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Use ALBERT for mapping tokens to embeddings 34 | word_embedding_model = models.ALBERT(model_name) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 43 | 44 | 45 | # Convert the dataset to a DataLoader ready for training 46 | logging.info("Read AllNLI train dataset") 47 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 48 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 49 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 50 | 51 | 52 | 53 | logging.info("Read STSbenchmark dev dataset") 54 | dev_data = 
SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 55 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 56 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 57 | 58 | # Configure the training 59 | num_epochs = 1 60 | 61 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | # Train the model 66 | model.fit(train_objectives=[(train_dataloader, train_loss)], 67 | evaluator=evaluator, 68 | epochs=num_epochs, 69 | evaluation_steps=1000, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) 88 | -------------------------------------------------------------------------------- /examples/training_nli_xlm-roberta.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system XLM-RoBERTa trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | 23 | # Read the dataset 24 | model_name = 'xlm-roberta-base' 25 | batch_size = 16 26 | nli_reader = NLIDataReader('datasets/AllNLI') 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | train_num_labels = nli_reader.get_num_labels() 29 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Use XLM-RoBERTa for mapping tokens to embeddings 34 | word_embedding_model = models.XLMRoBERTa(model_name) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 43 | 44 | 45 | # Convert the dataset to a DataLoader ready for training 46 | logging.info("Read AllNLI train dataset") 47 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 48 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 49 | train_loss = losses.SoftmaxLoss(model=model, 

sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 50 | 51 | 52 | 53 | logging.info("Read STSbenchmark dev dataset") 54 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 55 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 56 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 57 | 58 | # Configure the training 59 | num_epochs = 1 60 | 61 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs * 0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | evaluation_steps=1000, 71 | warmup_steps=warmup_steps, 72 | output_path=model_save_path 73 | ) 74 | 75 | 76 | 77 | ############################################################################## 78 | # 79 | # Load the stored model and evaluate its performance on STS benchmark dataset 80 | # 81 | ############################################################################## 82 | 83 | model = SentenceTransformer(model_save_path) 84 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 85 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 86 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 87 | 88 | model.evaluate(evaluator) 89 | -------------------------------------------------------------------------------- /examples/training_nli_distilbert.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system DistilBERT trains on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. 
At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'distilbert-base-uncased' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('datasets/AllNLI') 26 | sts_reader = STSDataReader('datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_nli_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use DistilBERT for mapping tokens to embeddings 33 | word_embedding_model = models.DistilBERT(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 48 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 1 59 | 60 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 61 | logging.info("Warmup-steps: {}".format(warmup_steps)) 62 | 63 | 64 | # Train the model 65 | model.fit(train_objectives=[(train_dataloader, train_loss)], 66 | evaluator=evaluator, 67 | epochs=num_epochs, 68 | evaluation_steps=1000, 69 | warmup_steps=warmup_steps, 70 | output_path=model_save_path 71 | ) 72 | 73 | 74 | 75 | ############################################################################## 76 | # 77 | # Load the stored model and evaluate its performance on STS benchmark dataset 78 | # 79 | ############################################################################## 80 | 81 | model = SentenceTransformer(model_save_path) 82 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 83 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 84 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 85 | 86 | model.evaluate(evaluator) 87 | -------------------------------------------------------------------------------- 
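All of the NLI training scripts above rely on losses.SoftmaxLoss. As a rough, self-contained sketch of that objective (illustrative only, not the library's exact implementation; the class name, shapes and random inputs below are made up), the loss concatenates the two sentence embeddings u and v with their element-wise difference |u - v| and trains a linear classifier over the NLI labels:

import torch
from torch import nn

class SoftmaxLossSketch(nn.Module):
    """Toy version of the softmax objective: classify a sentence pair from (u, v, |u - v|)."""
    def __init__(self, sentence_embedding_dimension: int, num_labels: int):
        super().__init__()
        # 3 * dim because u, v and |u - v| are concatenated before classification
        self.classifier = nn.Linear(3 * sentence_embedding_dimension, num_labels)
        self.loss_fct = nn.CrossEntropyLoss()

    def forward(self, u: torch.Tensor, v: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
        features = torch.cat([u, v, torch.abs(u - v)], dim=1)
        return self.loss_fct(self.classifier(features), labels)

# Quick check with random 768-dim embeddings and 3 labels (entailment/neutral/contradiction)
sketch = SoftmaxLossSketch(sentence_embedding_dimension=768, num_labels=3)
loss_value = sketch(torch.rand(4, 768), torch.rand(4, 768), torch.randint(0, 3, (4,)))

The actual losses.SoftmaxLoss additionally runs the SentenceTransformer model to produce u and v from the tokenized batch; only the classification head is sketched here.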
/sentence_transformers/models/tokenizer/WordTokenizer.py: -------------------------------------------------------------------------------- 1 | from abc import ABC, abstractmethod 2 | from typing import Union, Tuple, List, Iterable, Dict 3 | 4 | ENGLISH_STOP_WORDS = ['!', '"', "''", "``", '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~', 'a', 'about', 'above', 'across', 'after', 'afterwards', 'again', 'against', 'ain', 'all', 'almost', 'alone', 'along', 'already', 'also', 'although', 'always', 'am', 'among', 'amongst', 'amoungst', 'amount', 'an', 'and', 'another', 'any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere', 'are', 'aren', 'around', 'as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before', 'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both', 'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldn', 'couldnt', 'cry', 'd', 'de', 'describe', 'detail', 'did', 'didn', 'do', 'does', 'doesn', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg', 'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone', 'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for', 'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had', 'hadn', 'has', 'hasn', 'hasnt', 'have', 'haven', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed', 'interest', 'into', 'is', 'isn', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less', 'll', 'ltd', 'm', 'ma', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mightn', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly', 'move', 'much', 'must', 'mustn', 'my', 'myself', 'name', 'namely', 'needn', 'neither', 'never', 'nevertheless', 'next', 'nine', 'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'o', 'of', 'off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own', 'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming', 'seems', 'serious', 'several', 'shan', 'she', 'should', 'shouldn', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system', 't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there', 'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this', 'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward', 'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 've', 'very', 'via', 'was', 'wasn', 'we', 'well', 'were', 'weren', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby', 'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with', 'within', 'without', 'won', 'would', 'wouldn', 'y', 'yet', 'you', 'your', 'yours', 'yourself', 
'yourselves'] 5 | 6 | 7 | class WordTokenizer(ABC): 8 | @abstractmethod 9 | def set_vocab(self, vocab: Iterable[str]): 10 | pass 11 | 12 | @abstractmethod 13 | def get_vocab(self, vocab: Iterable[str]): 14 | pass 15 | 16 | @abstractmethod 17 | def tokenize(self, text: str) -> List[int]: 18 | pass 19 | 20 | @abstractmethod 21 | def save(self, output_path: str): 22 | pass 23 | 24 | @staticmethod 25 | @abstractmethod 26 | def load(input_path: str): 27 | pass -------------------------------------------------------------------------------- /examples/training_stsbenchmark_avg_word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses average word embeddings (for example from GloVe). It adds two fully-connected feed-forward layers (dense layers) to create a Deep Averaging Network (DAN). 3 | 4 | If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server. 5 | 6 | See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/ 7 | for available word embeddings files 8 | """ 9 | import torch 10 | from torch.utils.data import DataLoader 11 | import math 12 | from sentence_transformers import models, losses 13 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import * 16 | import logging 17 | from datetime import datetime 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | # Read the dataset 27 | batch_size = 32 28 | sts_reader = STSDataReader('datasets/stsbenchmark') 29 | model_save_path = 'output/training_stsbenchmark_avg_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Map tokens to traditional word embeddings like GloVe 34 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | # Add two trainable feed-forward networks (DAN) 43 | sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension() 44 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 45 | dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 46 | 47 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dan1, dan2]) 48 | 49 | 50 | # Convert the dataset to a DataLoader ready for training 51 | logging.info("Read STSbenchmark train dataset") 52 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 53 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 54 | train_loss = losses.CosineSimilarityLoss(model=model) 55 | 56 | logging.info("Read STSbenchmark dev dataset") 57 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 58 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 59 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 60 | 61 | # Configure the training 62 | 
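# Illustrative arithmetic for the warm-up formula below (the example counts are made up):
# with 5,000 training pairs, num_epochs = 10 and batch_size = 32,
# ceil(5000 * 10 / 32 * 0.1) = ceil(156.25) = 157 warm-up steps, i.e. the learning rate is
# increased from zero over roughly the first 10% of the training steps before the regular
# schedule takes over.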
num_epochs = 10 63 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 64 | logging.info("Warmup-steps: {}".format(warmup_steps)) 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /sentence_transformers/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 14 | """ 15 | def __init__(self, 16 | word_embedding_dimension: int, 17 | pooling_mode_cls_token: bool = False, 18 | pooling_mode_max_tokens: bool = False, 19 | pooling_mode_mean_tokens: bool = True, 20 | pooling_mode_mean_sqrt_len_tokens: bool = False, 21 | ): 22 | super(Pooling, self).__init__() 23 | 24 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 25 | 26 | self.word_embedding_dimension = word_embedding_dimension 27 | self.pooling_mode_cls_token = pooling_mode_cls_token 28 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 29 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 30 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 31 | 32 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 33 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 34 | 35 | def forward(self, features: Dict[str, Tensor]): 36 | token_embeddings = features['token_embeddings'] 37 | cls_token = features['cls_token_embeddings'] 38 | input_mask = features['input_mask'] 39 | 40 | ## Pooling strategy 41 | output_vectors = [] 42 | if self.pooling_mode_cls_token: 43 | output_vectors.append(cls_token) 44 | if self.pooling_mode_max_tokens: 45 | input_mask_expanded = input_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 46 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 47 | max_over_time = torch.max(token_embeddings, 1)[0] 48 | output_vectors.append(max_over_time) 49 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 50 | input_mask_expanded = 
input_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 51 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 52 | 53 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 54 | if 'token_weights_sum' in features: 55 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 56 | else: 57 | sum_mask = input_mask_expanded.sum(1) 58 | 59 | sum_mask = torch.clamp(sum_mask, min=1e-9) 60 | 61 | if self.pooling_mode_mean_tokens: 62 | output_vectors.append(sum_embeddings / sum_mask) 63 | if self.pooling_mode_mean_sqrt_len_tokens: 64 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 65 | 66 | output_vector = torch.cat(output_vectors, 1) 67 | features.update({'sentence_embedding': output_vector}) 68 | return features 69 | 70 | def get_sentence_embedding_dimension(self): 71 | return self.pooling_output_dimension 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return Pooling(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import DistilBertModel, DistilBertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class DistilBERT(nn.Module): 11 | """DistilBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from DistilBERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(DistilBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | 26 | 27 | self.bert = DistilBertModel.from_pretrained(model_name_or_path) 28 | self.tokenizer = DistilBertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 29 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 30 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | # DistilBERT does not use token_type_ids 35 | output_tokens = self.bert(input_ids=features['input_ids'], attention_mask=features['input_mask'])[0] 36 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 37 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 38 | return features 39 | 40 | def get_word_embedding_dimension(self) -> int: 41 | return self.bert.config.hidden_size 42 | 43 | def tokenize(self, text: str) -> List[int]: 44 | """ 45 | Tokenizes a text and maps tokens to token-ids 46 | """ 47 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 48 | 49 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 50 | """ 51 | Convert tokenized sentence in its embedding ids, segment ids and mask 52 | 53 | :param tokens: 54 | a tokenized sentence 55 | :param pad_seq_length: 56 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 57 | :return: embedding ids, segment ids and mask for the sentence 58 | """ 59 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 60 | 61 | tokens = tokens[:pad_seq_length] 62 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] 63 | sentence_length = len(input_ids) 64 | 65 | pad_seq_length += 2 ##Add Space for CLS + SEP token 66 | 67 | 68 | input_mask = [1] * len(input_ids) 69 | 70 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 71 | padding = [0] * (pad_seq_length - len(input_ids)) 72 | input_ids += padding 73 | 74 | input_mask += padding 75 | 76 | assert len(input_ids) == pad_seq_length 77 | assert len(input_mask) == pad_seq_length 78 | 79 | 80 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 81 | 82 | def get_config_dict(self): 83 | return {key: self.__dict__[key] for key in self.config_keys} 84 | 85 | def save(self, output_path: str): 86 | self.bert.save_pretrained(output_path) 87 | self.tokenizer.save_pretrained(output_path) 88 | 89 | with open(os.path.join(output_path, 'sentence_distilbert_config.json'), 'w') as fOut: 90 | json.dump(self.get_config_dict(), fOut, indent=2) 91 | 92 | @staticmethod 93 | def load(input_path: str): 94 | with open(os.path.join(input_path, 'sentence_distilbert_config.json')) as fIn: 95 | config = json.load(fIn) 96 | return DistilBERT(model_name_or_path=input_path, **config) 97 | 98 | 99 | 100 | 101 | 102 | 103 | -------------------------------------------------------------------------------- /sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import RobertaModel, RobertaTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class RoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(RoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 511: 21 | logging.warning("RoBERTa only allows a max_seq_length of 511 (514 with special tokens). 
Value will be set to 511") 22 | max_seq_length = 511 23 | self.max_seq_length = max_seq_length 24 | 25 | 26 | self.roberta = RobertaModel.from_pretrained(model_name_or_path) 27 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 28 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 29 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | #RoBERTa does not use token_type_ids 34 | output_tokens = self.roberta(input_ids=features['input_ids'], token_type_ids=None, attention_mask=features['input_mask'])[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 37 | return features 38 | 39 | def get_word_embedding_dimension(self) -> int: 40 | return self.roberta.config.hidden_size 41 | 42 | def tokenize(self, text: str) -> List[int]: 43 | """ 44 | Tokenizes a text and maps tokens to token-ids 45 | """ 46 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 47 | 48 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 49 | """ 50 | Convert tokenized sentence in its embedding ids, segment ids and mask 51 | 52 | :param tokens: 53 | a tokenized sentence 54 | :param pad_seq_length: 55 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 56 | :return: embedding ids, segment ids and mask for the sentence 57 | """ 58 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 59 | 60 | tokens = tokens[:pad_seq_length] 61 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] + [self.sep_token_id] 62 | sentence_length = len(input_ids) 63 | 64 | pad_seq_length += 3 ##Add Space for CLS + SEP + SEP token 65 | 66 | input_mask = [1] * len(input_ids) 67 | 68 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 69 | padding = [0] * (pad_seq_length - len(input_ids)) 70 | input_ids += padding 71 | 72 | input_mask += padding 73 | 74 | assert len(input_ids) == pad_seq_length 75 | assert len(input_mask) == pad_seq_length 76 | 77 | 78 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 79 | 80 | def get_config_dict(self): 81 | return {key: self.__dict__[key] for key in self.config_keys} 82 | 83 | def save(self, output_path: str): 84 | self.roberta.save_pretrained(output_path) 85 | self.tokenizer.save_pretrained(output_path) 86 | 87 | with open(os.path.join(output_path, 'sentence_roberta_config.json'), 'w') as fOut: 88 | json.dump(self.get_config_dict(), fOut, indent=2) 89 | 90 | @staticmethod 91 | def load(input_path: str): 92 | with open(os.path.join(input_path, 'sentence_roberta_config.json')) as fIn: 93 | config = json.load(fIn) 94 | return RoBERTa(model_name_or_path=input_path, **config) 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLMRobertaModel, XLMRobertaTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class XLMRoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(XLMRoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path) 20 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 21 | 22 | if max_seq_length > self.tokenizer.max_len_single_sentence: 23 | logging.warning("XLM-RoBERTa only allows a max_seq_length of "+self.tokenizer.max_len_single_sentence) 24 | max_seq_length = self.tokenizer.max_len_single_sentence 25 | self.max_seq_length = max_seq_length 26 | 27 | 28 | self.cls_token_id = self.tokenizer.cls_token_id 29 | self.eos_token_id = self.tokenizer.eos_token_id 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | #RoBERTa does not use token_type_ids 34 | output_tokens = self.xlm_roberta(input_ids=features['input_ids'], token_type_ids=None, attention_mask=features['input_mask'])[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 37 | return features 38 | 39 | def get_word_embedding_dimension(self) -> int: 40 | return self.xlm_roberta.config.hidden_size 41 | 42 | def tokenize(self, text: str) -> List[int]: 43 | """ 44 | Tokenizes a text and maps tokens to token-ids 45 | """ 46 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 47 | 48 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 49 | """ 50 | Convert tokenized sentence in its embedding ids, segment ids and mask 51 | 52 | :param tokens: 53 | a tokenized sentence 54 | :param pad_seq_length: 
55 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 56 | :return: embedding ids, segment ids and mask for the sentence 57 | """ 58 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 59 | 60 | tokens = tokens[:pad_seq_length] 61 | input_ids = [self.cls_token_id] + tokens + [self.eos_token_id] 62 | sentence_length = len(input_ids) 63 | 64 | pad_seq_length += 3 ##Add Space for CLS + SEP + SEP token 65 | 66 | input_mask = [1] * len(input_ids) 67 | 68 | # Zero-pad up to the sequence length. BERT: Pad to the right 69 | padding = [0] * (pad_seq_length - len(input_ids)) 70 | input_ids += padding 71 | 72 | input_mask += padding 73 | 74 | assert len(input_ids) == pad_seq_length 75 | assert len(input_mask) == pad_seq_length 76 | 77 | 78 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 79 | 80 | def get_config_dict(self): 81 | return {key: self.__dict__[key] for key in self.config_keys} 82 | 83 | def save(self, output_path: str): 84 | self.xlm_roberta.save_pretrained(output_path) 85 | self.tokenizer.save_pretrained(output_path) 86 | 87 | with open(os.path.join(output_path, 'sentence_xlm-roberta_config.json'), 'w') as fOut: 88 | json.dump(self.get_config_dict(), fOut, indent=2) 89 | 90 | @staticmethod 91 | def load(input_path: str): 92 | with open(os.path.join(input_path, 'sentence_xlm-roberta_config.json')) as fIn: 93 | config = json.load(fIn) 94 | return XLMRoBERTa(model_name_or_path=input_path, **config) 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /sentence_transformers/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class T5(nn.Module): 10 | """T5 model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 15 | super(T5, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 512: 20 | logging.warning("T5 only allows a max_seq_length of 512. 
Value will be set to 512") 21 | max_seq_length = 512 22 | self.max_seq_length = max_seq_length 23 | 24 | self.enc_model = T5Model.from_pretrained(model_name_or_path) 25 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 26 | #self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 27 | #self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 28 | 29 | def forward(self, features): 30 | """Returns token_embeddings, cls_token""" 31 | output_tokens = self.enc_model(input_ids=features['input_ids'], attention_mask=features['input_mask'])[0] 32 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 33 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 34 | return features 35 | 36 | def get_word_embedding_dimension(self) -> int: 37 | return self.enc_model.config.hidden_size 38 | 39 | def tokenize(self, text: str) -> List[int]: 40 | """ 41 | Tokenizes a text and maps tokens to token-ids 42 | """ 43 | return self.tokenizer.encode(text) 44 | 45 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 46 | """ 47 | Convert tokenized sentence in its embedding ids, segment ids and mask 48 | 49 | :param tokens: 50 | a tokenized sentence 51 | :param pad_seq_length: 52 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 53 | :return: embedding ids, segment ids and mask for the sentence 54 | """ 55 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 56 | 57 | tokens = tokens[:pad_seq_length] 58 | input_ids = tokens #[self.cls_token_id] + tokens + [self.sep_token_id] 59 | sentence_length = len(input_ids) 60 | 61 | #pad_seq_length += 2 ##Add Space for CLS + SEP token 62 | 63 | token_type_ids = [0] * len(input_ids) 64 | input_mask = [1] * len(input_ids) 65 | 66 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 67 | padding = [0] * (pad_seq_length - len(input_ids)) 68 | input_ids += padding 69 | token_type_ids += padding 70 | input_mask += padding 71 | 72 | assert len(input_ids) == pad_seq_length 73 | assert len(input_mask) == pad_seq_length 74 | assert len(token_type_ids) == pad_seq_length 75 | 76 | 77 | return { 78 | 'input_ids': np.asarray(input_ids, dtype=np.int64), 79 | 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 80 | 'input_mask': np.asarray(input_mask, dtype=np.int64), 81 | 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64) 82 | } 83 | 84 | def get_config_dict(self): 85 | return {key: self.__dict__[key] for key in self.config_keys} 86 | 87 | def save(self, output_path: str): 88 | self.enc_model.save_pretrained(output_path) 89 | self.tokenizer.save_pretrained(output_path) 90 | 91 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 92 | json.dump(self.get_config_dict(), fOut, indent=2) 93 | 94 | @staticmethod 95 | def load(input_path: str): 96 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 97 | config = json.load(fIn) 98 | return T5(model_name_or_path=input_path, **config) 99 | 100 | 101 | 102 | 103 | 104 | 105 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import BertModel, BertTokenizer 3 | import json 4 | from typing import List 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class BERT(nn.Module): 10 | """BERT model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 15 | super(BERT, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 510: 20 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 21 | max_seq_length = 510 22 | self.max_seq_length = max_seq_length 23 | 24 | self.bert = BertModel.from_pretrained(model_name_or_path) 25 | self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 26 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 27 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 28 | 29 | def forward(self, features): 30 | """Returns token_embeddings, cls_token""" 31 | output_tokens = self.bert(input_ids=features['input_ids'], token_type_ids=features['token_type_ids'], attention_mask=features['input_mask'])[0] 32 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 33 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 34 | return features 35 | 36 | def get_word_embedding_dimension(self) -> int: 37 | return self.bert.config.hidden_size 38 | 39 | def tokenize(self, text: str) -> List[int]: 40 | """ 41 | Tokenizes a text and maps tokens to token-ids 42 | """ 43 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 44 | 45 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 46 | """ 47 | Convert tokenized sentence in its embedding ids, segment ids and mask 48 | 49 | :param tokens: 50 | a tokenized sentence 51 | :param pad_seq_length: 52 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 53 | :return: embedding ids, segment ids and mask for the sentence 54 | """ 55 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 56 | 57 | tokens = tokens[:pad_seq_length] 58 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] 59 | sentence_length = len(input_ids) 60 | 61 | pad_seq_length += 2 ##Add Space for CLS + SEP token 62 | 63 | token_type_ids = [0] * len(input_ids) 64 | input_mask = [1] * len(input_ids) 65 | 66 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 67 | padding = [0] * (pad_seq_length - len(input_ids)) 68 | input_ids += padding 69 | token_type_ids += padding 70 | input_mask += padding 71 | 72 | assert len(input_ids) == pad_seq_length 73 | assert len(input_mask) == pad_seq_length 74 | assert len(token_type_ids) == pad_seq_length 75 | 76 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 77 | 78 | def get_config_dict(self): 79 | return {key: self.__dict__[key] for key in self.config_keys} 80 | 81 | def save(self, output_path: str): 82 | self.bert.save_pretrained(output_path) 83 | self.tokenizer.save_pretrained(output_path) 84 | 85 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 86 | json.dump(self.get_config_dict(), fOut, indent=2) 87 | 88 | @staticmethod 89 | def load(input_path: str): 90 | with open(os.path.join(input_path, 'sentence_bert_config.json')) as fIn: 91 | config = json.load(fIn) 92 | return BERT(model_name_or_path=input_path, **config) 93 | 94 | 95 | 96 | 97 | 98 | 99 | -------------------------------------------------------------------------------- /sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import CamembertModel, CamembertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | 11 | class CamemBERT(nn.Module): 12 | """CamemBERT model to generate token embeddings. 13 | 14 | Each token is mapped to an output vector from CamemBERT. 15 | """ 16 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 17 | super(CamemBERT, self).__init__() 18 | self.config_keys = ['max_seq_length', 'do_lower_case'] 19 | self.do_lower_case = do_lower_case 20 | 21 | if max_seq_length > 511: 22 | logging.warning("CamemBERT only allows a max_seq_length of 511 (514 with special tokens). 
Value will be set to 511") 23 | max_seq_length = 511 24 | self.max_seq_length = max_seq_length 25 | 26 | self.camembert = CamembertModel.from_pretrained(model_name_or_path) 27 | self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 28 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 29 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | #CamemBERT does not use token_type_ids 34 | output_tokens = self.camembert(input_ids=features['input_ids'], token_type_ids=None, attention_mask=features['input_mask'])[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 37 | return features 38 | 39 | def get_word_embedding_dimension(self) -> int: 40 | return self.camembert.config.hidden_size 41 | 42 | def tokenize(self, text: str) -> List[int]: 43 | """ 44 | Tokenizes a text and maps tokens to token-ids 45 | """ 46 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 47 | 48 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 49 | """ 50 | Convert tokenized sentence in its embedding ids, segment ids and mask 51 | 52 | :param tokens: 53 | a tokenized sentence 54 | :param pad_seq_length: 55 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 56 | :return: embedding ids, segment ids and mask for the sentence 57 | """ 58 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 59 | 60 | tokens = tokens[:pad_seq_length] 61 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] + [self.sep_token_id] 62 | sentence_length = len(input_ids) 63 | 64 | pad_seq_length += 3 ##Add Space for CLS + SEP + SEP token 65 | 66 | input_mask = [1] * len(input_ids) 67 | 68 | # Zero-pad up to the sequence length. BERT: Pad to the right 69 | padding = [0] * (pad_seq_length - len(input_ids)) 70 | input_ids += padding 71 | 72 | input_mask += padding 73 | 74 | assert len(input_ids) == pad_seq_length 75 | assert len(input_mask) == pad_seq_length 76 | 77 | 78 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 79 | 80 | def get_config_dict(self): 81 | return {key: self.__dict__[key] for key in self.config_keys} 82 | 83 | def save(self, output_path: str): 84 | self.camembert.save_pretrained(output_path) 85 | self.tokenizer.save_pretrained(output_path) 86 | 87 | with open(os.path.join(output_path, 'sentence_camembert_config.json'), 'w') as fOut: 88 | json.dump(self.get_config_dict(), fOut, indent=2) 89 | 90 | @staticmethod 91 | def load(input_path: str): 92 | with open(os.path.join(input_path, 'sentence_camembert_config.json')) as fIn: 93 | config = json.load(fIn) 94 | return CamemBERT(model_name_or_path=input_path, **config) 95 | 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_bow.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses a simple bag-of-words (BoW) approach. A sentence is mapped 3 | to a sparse vector with e.g. 25,000 dimensions. Optionally, you can also use tf-idf. 
4 | 5 | To make the model trainable, we add multiple dense layers to create a Deep Averaging Network (DAN). 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | from sentence_transformers.models.tokenizer.WordTokenizer import ENGLISH_STOP_WORDS 15 | import logging 16 | from datetime import datetime 17 | 18 | #### Just some code to print debug information to stdout 19 | logging.basicConfig(format='%(asctime)s - %(message)s', 20 | datefmt='%Y-%m-%d %H:%M:%S', 21 | level=logging.INFO, 22 | handlers=[LoggingHandler()]) 23 | #### /print debug information to stdout 24 | 25 | # Read the dataset 26 | batch_size = 32 27 | sts_reader = STSDataReader('datasets/stsbenchmark') 28 | model_save_path = 'output/training_tf-idf_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Create the vocab for the BoW model 33 | stop_words = ENGLISH_STOP_WORDS 34 | max_vocab_size = 25000 #This is also the size of the BoW sentence vector. 35 | 36 | 37 | #Read the most common max_vocab_size words. Skip stop-words 38 | vocab = set() 39 | weights = {} 40 | lines = open('wikipedia_doc_frequencies.txt').readlines() 41 | num_docs = int(lines[0]) 42 | for line in lines[1:]: 43 | word, freq = line.lower().strip().split("\t") 44 | if word in stop_words: 45 | continue 46 | 47 | vocab.add(word) 48 | weights[word] = math.log(num_docs/int(freq)) 49 | 50 | if len(vocab) >= max_vocab_size: 51 | break 52 | 53 | #Create the BoW model. Because we set word_weights to the IDF values and cumulative_term_frequency=True, we 54 | #get tf-idf vectors. Set word_weights to an empty dict and cumulative_term_frequency=False to get a 1-hot sentence encoding 55 | bow = models.BoW(vocab=vocab, word_weights=weights, cumulative_term_frequency=True) 56 | 57 | # Add two trainable feed-forward networks (DAN) with max_vocab_size -> 768 -> 512 dimensions. 
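# For orientation (dimensions taken from the settings above): a sentence becomes a 25,000-dim
# tf-idf weighted bag-of-words vector, dan1 maps 25,000 -> 768 and dan2 maps 768 -> 512, so the
# final sentence embedding returned by SentenceTransformer has 512 dimensions. The IDF weights
# computed above follow log(num_docs / doc_freq); e.g. a word found in 1,000 of 1,000,000
# documents gets weight log(1000) ≈ 6.9 (natural log).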
58 | sent_embeddings_dimension = max_vocab_size 59 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=768) 60 | dan2 = models.Dense(in_features=768, out_features=512) 61 | 62 | model = SentenceTransformer(modules=[bow, dan1, dan2]) 63 | 64 | 65 | # Convert the dataset to a DataLoader ready for training 66 | logging.info("Read STSbenchmark train dataset") 67 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 68 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 69 | train_loss = losses.CosineSimilarityLoss(model=model) 70 | 71 | logging.info("Read STSbenchmark dev dataset") 72 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 73 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 75 | 76 | # Configure the training 77 | num_epochs = 10 78 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 79 | logging.info("Warmup-steps: {}".format(warmup_steps)) 80 | 81 | # Train the model 82 | model.fit(train_objectives=[(train_dataloader, train_loss)], 83 | evaluator=evaluator, 84 | epochs=num_epochs, 85 | warmup_steps=warmup_steps, 86 | output_path=model_save_path 87 | ) 88 | 89 | 90 | 91 | ############################################################################## 92 | # 93 | # Load the stored model and evaluate its performance on STS benchmark dataset 94 | # 95 | ############################################################################## 96 | 97 | model = SentenceTransformer(model_save_path) 98 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 99 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 100 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 101 | 102 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | class PhraseTokenizer(WordTokenizer): 11 | """Tokenizes the text with respect to existent phrases in the vocab. 12 | 13 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 14 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... 
=> [New_York, is, the]) 15 | """ 16 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 17 | self.stop_words = set(stop_words) 18 | self.do_lower_case = do_lower_case 19 | self.ngram_separator = ngram_separator 20 | self.max_ngram_length = max_ngram_length 21 | self.set_vocab(vocab) 22 | 23 | def get_vocab(self): 24 | return self.vocab 25 | 26 | def set_vocab(self, vocab: Iterable[str]): 27 | self.vocab = vocab 28 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 29 | 30 | # Check for ngram in vocab 31 | self.ngram_lookup = set() 32 | self.ngram_lengths = set() 33 | for word in vocab: 34 | 35 | if self.ngram_separator is not None and self.ngram_separator in word: 36 | # Sum words might me malformed in e.g. google news word2vec, containing two or more _ after each other 37 | ngram_count = word.count(self.ngram_separator) + 1 38 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 39 | self.ngram_lookup.add(word) 40 | self.ngram_lengths.add(ngram_count) 41 | 42 | if len(vocab) > 0: 43 | logging.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 44 | logging.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | tokens = nltk.word_tokenize(text, preserve_line=True) 48 | 49 | #phrase detection 50 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 51 | idx = 0 52 | while idx <= len(tokens) - ngram_len: 53 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 54 | if ngram in self.ngram_lookup: 55 | tokens[idx:idx + ngram_len] = [ngram] 56 | elif ngram.lower() in self.ngram_lookup: 57 | tokens[idx:idx + ngram_len] = [ngram.lower()] 58 | idx += 1 59 | 60 | #Map tokens to idx, filter stop words 61 | tokens_filtered = [] 62 | for token in tokens: 63 | if token in self.stop_words: 64 | continue 65 | elif token in self.word2idx: 66 | tokens_filtered.append(self.word2idx[token]) 67 | continue 68 | 69 | token = token.lower() 70 | if token in self.stop_words: 71 | continue 72 | elif token in self.word2idx: 73 | tokens_filtered.append(self.word2idx[token]) 74 | continue 75 | 76 | token = token.strip(string.punctuation) 77 | if token in self.stop_words: 78 | continue 79 | elif len(token) > 0 and token in self.word2idx: 80 | tokens_filtered.append(self.word2idx[token]) 81 | continue 82 | 83 | return tokens_filtered 84 | 85 | def save(self, output_path: str): 86 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 87 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 88 | 89 | @staticmethod 90 | def load(input_path: str): 91 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 92 | config = json.load(fIn) 93 | 94 | return PhraseTokenizer(**config) 95 | -------------------------------------------------------------------------------- /sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import AlbertModel, AlbertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import 
numpy as np 8 | import logging 9 | 10 | class ALBERT(nn.Module): 11 | """ALBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from BERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = True): 16 | super(ALBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | self.bert = AlbertModel.from_pretrained(model_name_or_path) 26 | self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 27 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 28 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 29 | 30 | def forward(self, features): 31 | """Returns token_embeddings, cls_token""" 32 | output_tokens = self.bert(input_ids=features['input_ids'], token_type_ids=features['token_type_ids'], attention_mask=features['input_mask'])[0] 33 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 34 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 35 | return features 36 | 37 | def get_word_embedding_dimension(self) -> int: 38 | return self.bert.config.hidden_size 39 | 40 | def tokenize(self, text: str) -> List[int]: 41 | """ 42 | Tokenizes a text and maps tokens to token-ids 43 | """ 44 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 45 | 46 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 47 | """ 48 | Convert tokenized sentence in its embedding ids, segment ids and mask 49 | 50 | :param tokens: 51 | a tokenized sentence 52 | :param pad_seq_length: 53 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 54 | :return: embedding ids, segment ids and mask for the sentence 55 | """ 56 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 57 | 58 | tokens = tokens[:pad_seq_length] 59 | input_ids = [self.cls_token_id] + tokens + [self.sep_token_id] 60 | sentence_length = len(input_ids) 61 | 62 | pad_seq_length += 2 ##Add Space for CLS + SEP token 63 | 64 | token_type_ids = [0] * len(input_ids) 65 | input_mask = [1] * len(input_ids) 66 | 67 | # Zero-pad up to the sequence length. 
BERT: Pad to the right 68 | padding = [0] * (pad_seq_length - len(input_ids)) 69 | input_ids += padding 70 | token_type_ids += padding 71 | input_mask += padding 72 | 73 | assert len(input_ids) == pad_seq_length 74 | assert len(input_mask) == pad_seq_length 75 | assert len(token_type_ids) == pad_seq_length 76 | 77 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 'input_mask': np.asarray(input_mask, dtype=np.int64), 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 78 | 79 | def get_config_dict(self): 80 | return {key: self.__dict__[key] for key in self.config_keys} 81 | 82 | def save(self, output_path: str): 83 | self.bert.save_pretrained(output_path) 84 | self.tokenizer.save_pretrained(output_path) 85 | 86 | with open(os.path.join(output_path, 'sentence_albert_config.json'), 'w') as fOut: 87 | json.dump(self.get_config_dict(), fOut, indent=2) 88 | 89 | @staticmethod 90 | def load(input_path: str): 91 | with open(os.path.join(input_path, 'sentence_albert_config.json')) as fIn: 92 | config = json.load(fIn) 93 | return ALBERT(model_name_or_path=input_path, **config) 94 | 95 | 96 | 97 | 98 | 99 | 100 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLNetModel, XLNetTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict 6 | import os 7 | import numpy as np 8 | 9 | class XLNet(nn.Module): 10 | """XLNet model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from XLNet. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: bool = False): 15 | super(XLNet, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.max_seq_length = max_seq_length 18 | self.do_lower_case = do_lower_case 19 | 20 | self.xlnet = XLNetModel.from_pretrained(model_name_or_path) 21 | self.tokenizer = XLNetTokenizer.from_pretrained(model_name_or_path, do_lower_case=do_lower_case) 22 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 23 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 24 | 25 | def forward(self, features): 26 | """Returns token_embeddings, cls_token""" 27 | output_tokens = self.xlnet(input_ids=features['input_ids'], token_type_ids=features['token_type_ids'], attention_mask=features['input_mask'])[0] 28 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 29 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 30 | return features 31 | 32 | def get_word_embedding_dimension(self) -> int: 33 | return self.xlnet.config.d_model 34 | 35 | def tokenize(self, text: str) -> List[int]: 36 | """ 37 | Tokenizes a text and maps tokens to token-ids 38 | """ 39 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 40 | 41 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int) -> Dict[str, Tensor]: 42 | """ 43 | Convert tokenized sentence in its embedding ids, segment ids and mask 44 | 45 | :param tokens: 46 | a tokenized sentence 47 | :param pad_seq_length: 48 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 49 | :return: embedding ids, segment ids and mask for the sentence 50 | """ 51 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 52 | 53 | sep_token = self.sep_token_id 54 | cls_token = self.cls_token_id 55 | sequence_a_segment_id = 0 56 | cls_token_segment_id = 2 57 | pad_token_segment_id = 4 58 | pad_token = 0 59 | 60 | tokens = tokens[:pad_seq_length] + [sep_token] 61 | token_type_ids = [sequence_a_segment_id] * len(tokens) 62 | 63 | # XLNet CLS token at the end 64 | tokens = tokens + [cls_token] 65 | token_type_ids = token_type_ids + [cls_token_segment_id] 66 | pad_seq_length += 2 ##+2 for CLS and SEP token 67 | 68 | input_ids = tokens 69 | input_mask = [1] * len(input_ids) 70 | sentence_length = len(input_ids) 71 | 72 | # Zero-pad up to the sequence length. XLNet: Pad to the left 73 | padding_length = pad_seq_length - len(input_ids) 74 | input_ids = ([pad_token] * padding_length) + input_ids 75 | input_mask = ([0] * padding_length) + input_mask 76 | token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids 77 | 78 | assert len(input_ids) == pad_seq_length 79 | assert len(input_mask) == pad_seq_length 80 | assert len(token_type_ids) == pad_seq_length 81 | 82 | 83 | return {'input_ids': np.asarray(input_ids, dtype=np.int64), 84 | 'token_type_ids': np.asarray(token_type_ids, dtype=np.int64), 85 | 'input_mask': np.asarray(input_mask, dtype=np.int64), 86 | 'sentence_lengths': np.asarray(sentence_length, dtype=np.int64)} 87 | 88 | def get_config_dict(self): 89 | return {key: self.__dict__[key] for key in self.config_keys} 90 | 91 | def save(self, output_path: str): 92 | self.xlnet.save_pretrained(output_path) 93 | self.tokenizer.save_pretrained(output_path) 94 | 95 | with open(os.path.join(output_path, 'sentence_xlnet_config.json'), 'w') as fOut: 96 | json.dump(self.get_config_dict(), fOut, indent=2) 97 | 98 | @staticmethod 99 | def load(input_path: str): 100 | with open(os.path.join(input_path, 'sentence_xlnet_config.json')) as fIn: 101 | config = json.load(fIn) 102 | return XLNet(model_name_or_path=input_path, **config) 103 | 104 | 105 | 106 | 107 | 108 | 109 | -------------------------------------------------------------------------------- /examples/training_stsbenchmark_tf-idf_word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example weights word embeddings (like GloVe) with IDF weights. The IDF weights can for example be computed on Wikipedia. 3 | 4 | If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server. 
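As a rough illustration of the weighting computed below (the numbers are made up): with num_docs = 1,000,000 and a word that occurs in 10,000 documents, the IDF weight is log(1000000 / 10000) = log(100) ≈ 4.6, while a word that occurs in nearly every document gets a weight close to 0.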
5 | 6 | See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/ for available word embeddings files 7 | 8 | You can get term-document frequencies from here: 9 | https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/wikipedia_doc_frequencies.txt 10 | """ 11 | import torch 12 | from torch.utils.data import DataLoader 13 | import math 14 | from sentence_transformers import models, losses 15 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 16 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 17 | from sentence_transformers.readers import * 18 | import logging 19 | from datetime import datetime 20 | 21 | #### Just some code to print debug information to stdout 22 | logging.basicConfig(format='%(asctime)s - %(message)s', 23 | datefmt='%Y-%m-%d %H:%M:%S', 24 | level=logging.INFO, 25 | handlers=[LoggingHandler()]) 26 | #### /print debug information to stdout 27 | 28 | # Read the dataset 29 | batch_size = 32 30 | sts_reader = STSDataReader('datasets/stsbenchmark') 31 | model_save_path = 'output/training_tf-idf_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 32 | 33 | 34 | 35 | # Map tokens to traditional word embeddings like GloVe 36 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 37 | 38 | # Weight word embeddings using Inverse-Document-Frequency (IDF) values. 39 | # For each word in the vocab ob the tokenizer, we must specify a weight value. 40 | # The word embedding is then multiplied by this value 41 | vocab = word_embedding_model.tokenizer.get_vocab() 42 | word_weights = {} 43 | lines = open('wikipedia_doc_frequencies.txt').readlines() 44 | num_docs = int(lines[0]) 45 | for line in lines[1:]: 46 | word, freq = line.strip().split("\t") 47 | word_weights[word] = math.log(num_docs/int(freq)) 48 | 49 | # Words in the vocab that are not in the doc_frequencies file get a frequency of 1 50 | unknown_word_weight = math.log(num_docs/1) 51 | 52 | # Initialize the WordWeights model. 
This model must be between the WordEmbeddings and the Pooling model 53 | word_weights = models.WordWeights(vocab=vocab, word_weights=word_weights, unknown_word_weight=unknown_word_weight) 54 | 55 | 56 | # Apply mean pooling to get one fixed sized sentence vector 57 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 58 | pooling_mode_mean_tokens=True, 59 | pooling_mode_cls_token=False, 60 | pooling_mode_max_tokens=False) 61 | 62 | # Add two trainable feed-forward networks (DAN) 63 | sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension() 64 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 65 | dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 66 | 67 | model = SentenceTransformer(modules=[word_embedding_model, word_weights, pooling_model, dan1, dan2]) 68 | 69 | 70 | # Convert the dataset to a DataLoader ready for training 71 | logging.info("Read STSbenchmark train dataset") 72 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 73 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 74 | train_loss = losses.CosineSimilarityLoss(model=model) 75 | 76 | logging.info("Read STSbenchmark dev dataset") 77 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 78 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 79 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 80 | 81 | # Configure the training 82 | num_epochs = 10 83 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 84 | logging.info("Warmup-steps: {}".format(warmup_steps)) 85 | 86 | # Train the model 87 | model.fit(train_objectives=[(train_dataloader, train_loss)], 88 | evaluator=evaluator, 89 | epochs=num_epochs, 90 | warmup_steps=warmup_steps, 91 | output_path=model_save_path 92 | ) 93 | 94 | 95 | 96 | ############################################################################## 97 | # 98 | # Load the stored model and evaluate its performance on STS benchmark dataset 99 | # 100 | ############################################################################## 101 | 102 | model = SentenceTransformer(model_save_path) 103 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 104 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 105 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 106 | 107 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /sentence_transformers/evaluation/TripletEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 10 | 11 | 12 | 13 | class TripletEvaluator(SentenceEvaluator): 14 | """ 15 | Evaluate a model based on a triplet: (sentence, positive_example, negative_example). Checks if distance(sentence,positive_example) < distance(sentence, negative_example). 
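A rough usage sketch (illustrative only; model is assumed to be a SentenceTransformer and triplet_examples a list of InputExamples with three texts each, e.g. produced by the TripletReader):

    triplet_data = SentencesDataset(examples=triplet_examples, model=model)
    triplet_dataloader = DataLoader(triplet_data, shuffle=False, batch_size=16)
    evaluator = TripletEvaluator(triplet_dataloader)
    model.evaluate(evaluator)

The evaluator assigns model.smart_batching_collate to the DataLoader's collate_fn itself, so no extra wiring is needed.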
16 | """ 17 | def __init__(self, dataloader: DataLoader, main_distance_function: SimilarityFunction = None, name: str =''): 18 | """ 19 | Constructs an evaluator based for the dataset 20 | 21 | 22 | :param dataloader: 23 | the data for the evaluation 24 | :param main_similarity: 25 | the similarity metric that will be used for the returned score 26 | """ 27 | self.dataloader = dataloader 28 | self.main_distance_function = main_distance_function 29 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 30 | self.name = name 31 | if name: 32 | name = "_"+name 33 | 34 | self.csv_file: str = "triplet_evaluation"+name+"_results.csv" 35 | self.csv_headers = ["epoch", "steps", "accuracy_cosinus", "accuracy_manhatten", "accuracy_euclidean"] 36 | 37 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 38 | model.eval() 39 | 40 | if epoch != -1: 41 | if steps == -1: 42 | out_txt = " after epoch {}:".format(epoch) 43 | else: 44 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 45 | else: 46 | out_txt = ":" 47 | 48 | logging.info("Evaluation the model on "+self.name+" dataset"+out_txt) 49 | 50 | num_triplets = 0 51 | num_correct_cos_triplets, num_correct_manhatten_triplets, num_correct_euclidean_triplets = 0, 0, 0 52 | 53 | self.dataloader.collate_fn = model.smart_batching_collate 54 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 55 | features, label_ids = batch_to_device(batch, self.device) 56 | with torch.no_grad(): 57 | emb1, emb2, emb3 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features] 58 | 59 | #Cosine distance 60 | pos_cos_distance = paired_cosine_distances(emb1, emb2) 61 | neg_cos_distances = paired_cosine_distances(emb1, emb3) 62 | 63 | # Manhatten 64 | pos_manhatten_distance = paired_manhattan_distances(emb1, emb2) 65 | neg_manhatten_distances = paired_manhattan_distances(emb1, emb3) 66 | 67 | # Euclidean 68 | pos_euclidean_distance = paired_euclidean_distances(emb1, emb2) 69 | neg_euclidean_distances = paired_euclidean_distances(emb1, emb3) 70 | 71 | for idx in range(len(pos_cos_distance)): 72 | num_triplets += 1 73 | 74 | if pos_cos_distance[idx] < neg_cos_distances[idx]: 75 | num_correct_cos_triplets += 1 76 | 77 | if pos_manhatten_distance[idx] < neg_manhatten_distances[idx]: 78 | num_correct_manhatten_triplets += 1 79 | 80 | if pos_euclidean_distance[idx] < neg_euclidean_distances[idx]: 81 | num_correct_euclidean_triplets += 1 82 | 83 | 84 | 85 | accuracy_cos = num_correct_cos_triplets / num_triplets 86 | accuracy_manhatten = num_correct_manhatten_triplets / num_triplets 87 | accuracy_euclidean = num_correct_euclidean_triplets / num_triplets 88 | 89 | logging.info("Accuracy Cosine Distance:\t{:.4f}".format(accuracy_cos)) 90 | logging.info("Accuracy Manhatten Distance:\t{:.4f}".format(accuracy_manhatten)) 91 | logging.info("Accuracy Euclidean Distance:\t{:.4f}\n".format(accuracy_euclidean)) 92 | 93 | if output_path is not None: 94 | csv_path = os.path.join(output_path, self.csv_file) 95 | if not os.path.isfile(csv_path): 96 | with open(csv_path, mode="w", encoding="utf-8") as f: 97 | writer = csv.writer(f) 98 | writer.writerow(self.csv_headers) 99 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhatten, accuracy_euclidean]) 100 | 101 | else: 102 | with open(csv_path, mode="a", encoding="utf-8") as f: 103 | writer = csv.writer(f) 104 | writer.writerow([epoch, steps, accuracy_cos, accuracy_manhatten, accuracy_euclidean]) 105 | 106 | 
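# If a specific distance function was configured, report its accuracy; otherwise
# (main_distance_function is None) fall back to the best accuracy across the
# cosine, Manhattan and Euclidean variants computed above.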
if self.main_distance_function == SimilarityFunction.COSINE: 107 | return accuracy_cos 108 | if self.main_distance_function == SimilarityFunction.MANHATTAN: 109 | return accuracy_manhatten 110 | if self.main_distance_function == SimilarityFunction.EUCLIDEAN: 111 | return accuracy_euclidean 112 | 113 | return max(accuracy_cos, accuracy_manhatten, accuracy_euclidean) -------------------------------------------------------------------------------- /sentence_transformers/evaluation/BinaryEmbeddingSimilarityEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 10 | import numpy as np 11 | 12 | 13 | class BinaryEmbeddingSimilarityEvaluator(SentenceEvaluator): 14 | """ 15 | Evaluate a model based on the similarity of the embeddings by calculating the accuracy of identifying similar and 16 | dissimilar sentences. 17 | This is done by computing each similarity metric, using its median over the dataset as the decision threshold, and 18 | checking whether pairs with label 1 score above that threshold and pairs with label 0 at or below it. 19 | This assumes that the dataset is split 50-50. 20 | The metrics are the cosine similarity as well as euclidean and Manhattan distance. 21 | The returned score is the accuracy with a specified metric. 22 | 23 | The results are written in a CSV. If a CSV already exists, then values are appended. 24 | """ 25 | def __init__(self, dataloader: DataLoader, 26 | main_similarity: SimilarityFunction = SimilarityFunction.COSINE, name:str =''): 27 | """ 28 | Constructs an evaluator for the dataset 29 | 30 | The labels need to be 0 for dissimilar pairs and 1 for similar pairs. 31 | The dataset needs to be split 50-50 with the labels. 
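Because the decision threshold is simply the median of the similarity scores, the 50-50 split is what makes the reported accuracy meaningful; on an unbalanced dataset the median no longer separates the two classes as intended.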
32 | 33 | :param dataloader: 34 | the data for the evaluation 35 | :param main_similarity: 36 | the similarity metric that will be used for the returned score 37 | """ 38 | self.dataloader = dataloader 39 | self.main_similarity = main_similarity 40 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 41 | self.name = name 42 | if name: 43 | name = "_"+name 44 | 45 | self.csv_file: str = "binary_similarity_evaluation"+name+"_results.csv" 46 | self.csv_headers = ["epoch", "steps", "cosine_acc", "euclidean_acc", "manhattan_acc"] 47 | 48 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 49 | model.eval() 50 | embeddings1 = [] 51 | embeddings2 = [] 52 | labels = [] 53 | 54 | if epoch != -1: 55 | if steps == -1: 56 | out_txt = f" after epoch {epoch}:" 57 | else: 58 | out_txt = f" in epoch {epoch} after {steps} steps:" 59 | else: 60 | out_txt = ":" 61 | 62 | logging.info("Evaluation the model on "+self.name+" dataset"+out_txt) 63 | self.dataloader.collate_fn = model.smart_batching_collate 64 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 65 | features, label_ids = batch_to_device(batch, self.device) 66 | with torch.no_grad(): 67 | emb1, emb2 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features] 68 | 69 | labels.extend(label_ids.to("cpu").numpy()) 70 | embeddings1.extend(emb1) 71 | embeddings2.extend(emb2) 72 | cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) 73 | manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2) 74 | euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2) 75 | 76 | #Ensure labels are just 0 or 1 77 | for label in labels: 78 | assert (label == 0 or label == 1) 79 | 80 | cosine_middle = np.median(cosine_scores) 81 | cosine_acc = 0 82 | for label, score in zip(labels, cosine_scores): 83 | if (label == 1 and score > cosine_middle) or (label == 0 and score <= cosine_middle): 84 | cosine_acc += 1 85 | cosine_acc /= len(labels) 86 | 87 | manhattan_middle = np.median(manhattan_distances) 88 | manhattan_acc = 0 89 | for label, score in zip(labels, manhattan_distances): 90 | if (label == 1 and score > manhattan_middle) or (label == 0 and score <= manhattan_middle): 91 | manhattan_acc += 1 92 | manhattan_acc /= len(labels) 93 | 94 | euclidean_middle = np.median(euclidean_distances) 95 | euclidean_acc = 0 96 | for label, score in zip(labels, euclidean_distances): 97 | if (label == 1 and score > euclidean_middle) or (label == 0 and score <= euclidean_middle): 98 | euclidean_acc += 1 99 | euclidean_acc /= len(labels) 100 | 101 | logging.info("Cosine-Classification:\t{:4f}".format( 102 | cosine_acc)) 103 | logging.info("Manhattan-Classification:\t{:4f}".format( 104 | manhattan_acc)) 105 | logging.info("Euclidean-Classification:\t{:4f}\n".format( 106 | euclidean_acc)) 107 | 108 | if output_path is not None: 109 | csv_path = os.path.join(output_path, self.csv_file) 110 | if not os.path.isfile(csv_path): 111 | with open(csv_path, mode="w", encoding="utf-8") as f: 112 | writer = csv.writer(f) 113 | writer.writerow(self.csv_headers) 114 | writer.writerow([epoch, steps, cosine_acc, euclidean_acc, manhattan_acc]) 115 | else: 116 | with open(csv_path, mode="a", encoding="utf-8") as f: 117 | writer = csv.writer(f) 118 | writer.writerow([epoch, steps, cosine_acc, euclidean_acc, manhattan_acc]) 119 | 120 | if self.main_similarity == SimilarityFunction.COSINE: 121 | return cosine_acc 122 | elif 
self.main_similarity == SimilarityFunction.EUCLIDEAN: 123 | return euclidean_acc 124 | elif self.main_similarity == SimilarityFunction.MANHATTAN: 125 | return manhattan_acc 126 | else: 127 | raise ValueError("Unknown main_similarity value") -------------------------------------------------------------------------------- /sentence_transformers/models/WordEmbeddings.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class WordEmbeddings(nn.Module): 15 | def __init__(self, tokenizer: WordTokenizer, embedding_weights, update_embeddings: bool = False, max_seq_length: int = 1000000): 16 | nn.Module.__init__(self) 17 | if isinstance(embedding_weights, list): 18 | embedding_weights = np.asarray(embedding_weights) 19 | 20 | if isinstance(embedding_weights, np.ndarray): 21 | embedding_weights = torch.from_numpy(embedding_weights) 22 | 23 | num_embeddings, embeddings_dimension = embedding_weights.size() 24 | self.embeddings_dimension = embeddings_dimension 25 | self.emb_layer = nn.Embedding(num_embeddings, embeddings_dimension) 26 | self.emb_layer.load_state_dict({'weight': embedding_weights}) 27 | self.emb_layer.weight.requires_grad = update_embeddings 28 | self.tokenizer = tokenizer 29 | self.update_embeddings = update_embeddings 30 | self.max_seq_length = max_seq_length 31 | 32 | def forward(self, features): 33 | token_embeddings = self.emb_layer(features['input_ids']) 34 | cls_tokens = None 35 | features.update({'token_embeddings': token_embeddings, 'cls_token_embeddings': cls_tokens, 'input_mask': features['input_mask']}) 36 | return features 37 | 38 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 39 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 40 | 41 | tokens = tokens[0:pad_seq_length] #Truncate tokens if needed 42 | input_ids = tokens 43 | 44 | sentence_length = len(input_ids) 45 | input_mask = [1] * len(input_ids) 46 | padding = [0] * (pad_seq_length - len(input_ids)) 47 | input_ids += padding 48 | input_mask += padding 49 | 50 | assert len(input_ids) == pad_seq_length 51 | assert len(input_mask) == pad_seq_length 52 | 53 | return {'input_ids': input_ids, 'input_mask': input_mask, 'sentence_lengths': sentence_length} 54 | 55 | 56 | 57 | 58 | 59 | def get_word_embedding_dimension(self) -> int: 60 | return self.embeddings_dimension 61 | 62 | def tokenize(self, text: str) -> List[int]: 63 | return self.tokenizer.tokenize(text) 64 | 65 | def save(self, output_path: str): 66 | with open(os.path.join(output_path, 'wordembedding_config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 70 | self.tokenizer.save(output_path) 71 | 72 | def get_config_dict(self): 73 | return {'tokenizer_class': fullname(self.tokenizer), 'update_embeddings': self.update_embeddings, 'max_seq_length': self.max_seq_length} 74 | 75 | @staticmethod 76 | def load(input_path: str): 77 | with 
open(os.path.join(input_path, 'wordembedding_config.json'), 'r') as fIn: 78 | config = json.load(fIn) 79 | 80 | tokenizer_class = import_from_string(config['tokenizer_class']) 81 | tokenizer = tokenizer_class.load(input_path) 82 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 83 | embedding_weights = weights['emb_layer.weight'] 84 | model = WordEmbeddings(tokenizer=tokenizer, embedding_weights=embedding_weights, update_embeddings=config['update_embeddings']) 85 | return model 86 | 87 | @staticmethod 88 | def from_text_file(embeddings_file_path: str, update_embeddings: bool = False, item_separator: str = " ", tokenizer=WhitespaceTokenizer(), max_vocab_size: int = None): 89 | logging.info("Read in embeddings file {}".format(embeddings_file_path)) 90 | 91 | if not os.path.exists(embeddings_file_path): 92 | logging.info("{} does not exist, try to download from server".format(embeddings_file_path)) 93 | 94 | if '/' in embeddings_file_path or '\\' in embeddings_file_path: 95 | raise ValueError("Embeddings file not found: ".format(embeddings_file_path)) 96 | 97 | url = "https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/"+embeddings_file_path 98 | http_get(url, embeddings_file_path) 99 | 100 | embeddings_dimension = None 101 | vocab = [] 102 | embeddings = [] 103 | 104 | with gzip.open(embeddings_file_path, "rt", encoding="utf8") if embeddings_file_path.endswith('.gz') else open(embeddings_file_path, encoding="utf8") as fIn: 105 | iterator = tqdm(fIn, desc="Load Word Embeddings", unit="Embeddings") 106 | for line in iterator: 107 | split = line.rstrip().split(item_separator) 108 | word = split[0] 109 | 110 | if embeddings_dimension == None: 111 | embeddings_dimension = len(split) - 1 112 | vocab.append("PADDING_TOKEN") 113 | embeddings.append(np.zeros(embeddings_dimension)) 114 | 115 | if (len(split) - 1) != embeddings_dimension: # Assure that all lines in the embeddings file are of the same length 116 | logging.error("ERROR: A line in the embeddings file had more or less dimensions than expected. Skip token.") 117 | continue 118 | 119 | vector = np.array([float(num) for num in split[1:]]) 120 | embeddings.append(vector) 121 | vocab.append(word) 122 | 123 | if max_vocab_size is not None and max_vocab_size > 0 and len(vocab) > max_vocab_size: 124 | break 125 | 126 | embeddings = np.asarray(embeddings) 127 | 128 | tokenizer.set_vocab(vocab) 129 | return WordEmbeddings(tokenizer=tokenizer, embedding_weights=embeddings, update_embeddings=update_embeddings) 130 | 131 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/EmbeddingSimilarityEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator, SimilarityFunction 2 | from torch.utils.data import DataLoader 3 | 4 | import torch 5 | import logging 6 | from tqdm import tqdm 7 | from ..util import batch_to_device 8 | import os 9 | import csv 10 | from sklearn.metrics.pairwise import paired_cosine_distances, paired_euclidean_distances, paired_manhattan_distances 11 | from scipy.stats import pearsonr, spearmanr 12 | import numpy as np 13 | 14 | class EmbeddingSimilarityEvaluator(SentenceEvaluator): 15 | """ 16 | Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation 17 | in comparison to the gold standard labels. 
18 | The metrics are the cosine similarity as well as euclidean and Manhattan distance 19 | The returned score is the Spearman correlation with a specified metric. 20 | 21 | The results are written in a CSV. If a CSV already exists, then values are appended. 22 | """ 23 | 24 | 25 | def __init__(self, dataloader: DataLoader, main_similarity: SimilarityFunction = None, name: str = '', show_progress_bar: bool = None): 26 | """ 27 | Constructs an evaluator based for the dataset 28 | 29 | The labels need to indicate the similarity between the sentences. 30 | 31 | :param dataloader: 32 | the data for the evaluation 33 | :param main_similarity: 34 | the similarity metric that will be used for the returned score 35 | """ 36 | self.dataloader = dataloader 37 | self.main_similarity = main_similarity 38 | self.name = name 39 | if name: 40 | name = "_"+name 41 | 42 | if show_progress_bar is None: 43 | show_progress_bar = (logging.getLogger().getEffectiveLevel() == logging.INFO or logging.getLogger().getEffectiveLevel() == logging.DEBUG) 44 | self.show_progress_bar = show_progress_bar 45 | 46 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 47 | self.csv_file: str = "similarity_evaluation"+name+"_results.csv" 48 | self.csv_headers = ["epoch", "steps", "cosine_pearson", "cosine_spearman", "euclidean_pearson", "euclidean_spearman", "manhattan_pearson", "manhattan_spearman", "dot_pearson", "dot_spearman"] 49 | 50 | def __call__(self, model: 'SequentialSentenceEmbedder', output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 51 | model.eval() 52 | embeddings1 = [] 53 | embeddings2 = [] 54 | labels = [] 55 | 56 | if epoch != -1: 57 | if steps == -1: 58 | out_txt = f" after epoch {epoch}:" 59 | else: 60 | out_txt = f" in epoch {epoch} after {steps} steps:" 61 | else: 62 | out_txt = ":" 63 | 64 | logging.info("Evaluation the model on "+self.name+" dataset"+out_txt) 65 | 66 | self.dataloader.collate_fn = model.smart_batching_collate 67 | 68 | iterator = self.dataloader 69 | if self.show_progress_bar: 70 | iterator = tqdm(iterator, desc="Convert Evaluating") 71 | 72 | for step, batch in enumerate(iterator): 73 | features, label_ids = batch_to_device(batch, self.device) 74 | with torch.no_grad(): 75 | emb1, emb2 = [model(sent_features)['sentence_embedding'].to("cpu").numpy() for sent_features in features] 76 | 77 | labels.extend(label_ids.to("cpu").numpy()) 78 | embeddings1.extend(emb1) 79 | embeddings2.extend(emb2) 80 | 81 | try: 82 | cosine_scores = 1 - (paired_cosine_distances(embeddings1, embeddings2)) 83 | except Exception as e: 84 | print(embeddings1) 85 | print(embeddings2) 86 | raise(e) 87 | 88 | manhattan_distances = -paired_manhattan_distances(embeddings1, embeddings2) 89 | euclidean_distances = -paired_euclidean_distances(embeddings1, embeddings2) 90 | dot_products = [np.dot(emb1, emb2) for emb1, emb2 in zip(embeddings1, embeddings2)] 91 | 92 | 93 | eval_pearson_cosine, _ = pearsonr(labels, cosine_scores) 94 | eval_spearman_cosine, _ = spearmanr(labels, cosine_scores) 95 | 96 | eval_pearson_manhattan, _ = pearsonr(labels, manhattan_distances) 97 | eval_spearman_manhattan, _ = spearmanr(labels, manhattan_distances) 98 | 99 | eval_pearson_euclidean, _ = pearsonr(labels, euclidean_distances) 100 | eval_spearman_euclidean, _ = spearmanr(labels, euclidean_distances) 101 | 102 | eval_pearson_dot, _ = pearsonr(labels, dot_products) 103 | eval_spearman_dot, _ = spearmanr(labels, dot_products) 104 | 105 | logging.info("Cosine-Similarity :\tPearson: {:.4f}\tSpearman: 
{:.4f}".format( 106 | eval_pearson_cosine, eval_spearman_cosine)) 107 | logging.info("Manhattan-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 108 | eval_pearson_manhattan, eval_spearman_manhattan)) 109 | logging.info("Euclidean-Distance:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 110 | eval_pearson_euclidean, eval_spearman_euclidean)) 111 | logging.info("Dot-Product-Similarity:\tPearson: {:.4f}\tSpearman: {:.4f}".format( 112 | eval_pearson_dot, eval_spearman_dot)) 113 | 114 | if output_path is not None: 115 | csv_path = os.path.join(output_path, self.csv_file) 116 | output_file_exists = os.path.isfile(csv_path) 117 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 118 | writer = csv.writer(f) 119 | if not output_file_exists: 120 | writer.writerow(self.csv_headers) 121 | 122 | writer.writerow([epoch, steps, eval_pearson_cosine, eval_spearman_cosine, eval_pearson_euclidean, 123 | eval_spearman_euclidean, eval_pearson_manhattan, eval_spearman_manhattan, eval_pearson_dot, eval_spearman_dot]) 124 | 125 | 126 | if self.main_similarity == SimilarityFunction.COSINE: 127 | return eval_spearman_cosine 128 | elif self.main_similarity == SimilarityFunction.EUCLIDEAN: 129 | return eval_spearman_euclidean 130 | elif self.main_similarity == SimilarityFunction.MANHATTAN: 131 | return eval_spearman_manhattan 132 | elif self.main_similarity == SimilarityFunction.DOT_PRODUCT: 133 | return eval_spearman_dot 134 | elif self.main_similarity is None: 135 | return max(eval_spearman_cosine, eval_spearman_manhattan, eval_spearman_euclidean, eval_spearman_dot) 136 | else: 137 | raise ValueError("Unknown main_similarity value") 138 | -------------------------------------------------------------------------------- /sentence_transformers/losses/test_batch_hard_triplet_loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | from sentence_transformers.losses import BatchHardTripletLoss 4 | 5 | # Test-suite from https://github.com/omoindrot/tensorflow-triplet-loss/blob/master/model/tests/test_triplet_loss.py 6 | # Skipped the `test_gradients_pairwise_distances()` test since it's trivial to see if your model loss turns NaN 7 | # and porting it proved more difficult than expected. 8 | 9 | def pairwise_distance_np(feature, squared=False): 10 | """Computes the pairwise distance matrix in numpy. 11 | Args: 12 | feature: 2-D numpy array of size [number of data, feature dimension] 13 | squared: Boolean. If true, output is the pairwise squared euclidean 14 | distance matrix; else, output is the pairwise euclidean distance matrix. 15 | Returns: 16 | pairwise_distances: 2-D numpy array of size 17 | [number of data, number of data]. 18 | """ 19 | triu = np.triu_indices(feature.shape[0], 1) 20 | upper_tri_pdists = np.linalg.norm(feature[triu[1]] - feature[triu[0]], axis=1) 21 | if squared: 22 | upper_tri_pdists **= 2. 23 | num_data = feature.shape[0] 24 | pairwise_distances = np.zeros((num_data, num_data)) 25 | pairwise_distances[np.triu_indices(num_data, 1)] = upper_tri_pdists 26 | # Make symmetrical. 
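# (The code above filled only the upper triangle; adding the transpose mirrors the
# distances into the lower triangle, and subtracting the diagonal avoids counting it twice.)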
27 | pairwise_distances = pairwise_distances + pairwise_distances.T - np.diag( 28 | pairwise_distances.diagonal()) 29 | return pairwise_distances 30 | 31 | def test_pairwise_distances(): 32 | """Test the pairwise distances function.""" 33 | num_data = 64 34 | feat_dim = 6 35 | 36 | embeddings = np.random.randn(num_data, feat_dim).astype(np.float32) 37 | embeddings[1] = embeddings[0] # to get distance 0 38 | 39 | for squared in [True, False]: 40 | res_np = pairwise_distance_np(embeddings, squared=squared) 41 | res_pt = BatchHardTripletLoss._pairwise_distances(torch.from_numpy(embeddings), squared=squared) 42 | assert np.allclose(res_np, res_pt) 43 | 44 | def test_pairwise_distances_are_positive(): 45 | """Test that the pairwise distances are always positive. 46 | Use a tricky case where numerical errors are common. 47 | """ 48 | num_data = 64 49 | feat_dim = 6 50 | 51 | # Create embeddings very close to each other in [1.0 - 2e-7, 1.0 + 2e-7] 52 | # This will encourage errors in the computation 53 | embeddings = 1.0 + 2e-7 * np.random.randn(num_data, feat_dim).astype(np.float32) 54 | embeddings[1] = embeddings[0] # to get distance 0 55 | 56 | for squared in [True, False]: 57 | res_tf = BatchHardTripletLoss._pairwise_distances(torch.from_numpy(embeddings), squared=squared) 58 | assert res_tf[res_tf < 0].sum() == 0 59 | 60 | 61 | def test_triplet_mask(): 62 | """Test function _get_triplet_mask.""" 63 | num_data = 64 64 | num_classes = 10 65 | 66 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 67 | 68 | mask_np = np.zeros((num_data, num_data, num_data)) 69 | for i in range(num_data): 70 | for j in range(num_data): 71 | for k in range(num_data): 72 | distinct = (i != j and i != k and j != k) 73 | valid = (labels[i] == labels[j]) and (labels[i] != labels[k]) 74 | mask_np[i, j, k] = (distinct and valid) 75 | 76 | mask_tf_val = BatchHardTripletLoss._get_triplet_mask(torch.from_numpy(labels)) 77 | assert np.allclose(mask_np, mask_tf_val) 78 | 79 | def test_anchor_positive_triplet_mask(): 80 | """Test function _get_anchor_positive_triplet_mask.""" 81 | num_data = 64 82 | num_classes = 10 83 | 84 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 85 | 86 | mask_np = np.zeros((num_data, num_data)) 87 | for i in range(num_data): 88 | for j in range(num_data): 89 | distinct = (i != j) 90 | valid = labels[i] == labels[j] 91 | mask_np[i, j] = (distinct and valid) 92 | 93 | mask_tf_val = BatchHardTripletLoss._get_anchor_positive_triplet_mask(torch.from_numpy(labels)) 94 | 95 | assert np.allclose(mask_np, mask_tf_val) 96 | 97 | def test_anchor_negative_triplet_mask(): 98 | """Test function _get_anchor_negative_triplet_mask.""" 99 | num_data = 64 100 | num_classes = 10 101 | 102 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 103 | 104 | mask_np = np.zeros((num_data, num_data)) 105 | for i in range(num_data): 106 | for k in range(num_data): 107 | distinct = (i != k) 108 | valid = (labels[i] != labels[k]) 109 | mask_np[i, k] = (distinct and valid) 110 | 111 | mask_tf_val = BatchHardTripletLoss._get_anchor_negative_triplet_mask(torch.from_numpy(labels)) 112 | 113 | assert np.allclose(mask_np, mask_tf_val) 114 | 115 | def test_simple_batch_all_triplet_loss(): 116 | """Test the triplet loss with batch all triplet mining in a simple case. 117 | There is just one class in this super simple edge case, and we want to make sure that 118 | the loss is 0. 
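With a single class there are no valid (anchor, positive, negative) triplets at all, because a negative would need a different label, so both the loss and the fraction of positive triplets are expected to be 0.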
119 | """ 120 | num_data = 10 121 | feat_dim = 6 122 | margin = 0.2 123 | num_classes = 1 124 | 125 | embeddings = np.random.rand(num_data, feat_dim).astype(np.float32) 126 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 127 | labels, embeddings = torch.from_numpy(labels), torch.from_numpy(embeddings) 128 | 129 | for squared in [True, False]: 130 | loss_np = 0.0 131 | 132 | # Compute the loss in TF. 133 | loss_tf_val, fraction_val = BatchHardTripletLoss.batch_all_triplet_loss(labels, embeddings, margin, squared=squared) 134 | 135 | assert np.allclose(loss_np, loss_tf_val) 136 | assert np.allclose(fraction_val, 0.0) 137 | 138 | 139 | def test_batch_all_triplet_loss(): 140 | """Test the triplet loss with batch all triplet mining""" 141 | num_data = 10 142 | feat_dim = 6 143 | margin = 0.2 144 | num_classes = 5 145 | 146 | embeddings = np.random.rand(num_data, feat_dim).astype(np.float32) 147 | labels = np.random.randint(0, num_classes, size=(num_data)).astype(np.float32) 148 | 149 | for squared in [True, False]: 150 | pdist_matrix = pairwise_distance_np(embeddings, squared=squared) 151 | 152 | loss_np = 0.0 153 | num_positives = 0.0 154 | num_valid = 0.0 155 | for i in range(num_data): 156 | for j in range(num_data): 157 | for k in range(num_data): 158 | distinct = (i != j and i != k and j != k) 159 | valid = (labels[i] == labels[j]) and (labels[i] != labels[k]) 160 | if distinct and valid: 161 | num_valid += 1.0 162 | 163 | pos_distance = pdist_matrix[i][j] 164 | neg_distance = pdist_matrix[i][k] 165 | 166 | loss = np.maximum(0.0, pos_distance - neg_distance + margin) 167 | loss_np += loss 168 | 169 | num_positives += (loss > 0) 170 | 171 | loss_np /= num_positives 172 | 173 | # Compute the loss in TF. 174 | loss_tf_val, fraction_val = BatchHardTripletLoss.batch_all_triplet_loss(torch.from_numpy(labels), torch.from_numpy(embeddings), margin, squared=squared) 175 | assert np.allclose(loss_np, loss_tf_val) 176 | assert np.allclose(num_positives / num_valid, fraction_val) 177 | 178 | def test_batch_hard_triplet_loss(): 179 | """Test the triplet loss with batch hard triplet mining""" 180 | num_data = 50 181 | feat_dim = 6 182 | margin = 0.2 183 | num_classes = 5 184 | min_class = 100 185 | 186 | embeddings = np.random.rand(num_data, feat_dim).astype(np.float32) 187 | labels = np.random.randint(min_class, min_class+num_classes, size=(num_data)).astype(np.float32) 188 | 189 | for squared in [True, False]: 190 | pdist_matrix = pairwise_distance_np(embeddings, squared=squared) 191 | 192 | loss_np = 0.0 193 | for i in range(num_data): 194 | # Select the hardest positive 195 | max_pos_dist = np.max(pdist_matrix[i][labels == labels[i]]) 196 | 197 | # Select the hardest negative 198 | min_neg_dist = np.min(pdist_matrix[i][labels != labels[i]]) 199 | 200 | 201 | loss = np.maximum(0.0, max_pos_dist - min_neg_dist + margin) 202 | loss_np += loss 203 | 204 | loss_np /= num_data 205 | 206 | # Compute the loss in TF. 
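# (Despite the "TF" naming, this calls the PyTorch port of the original TensorFlow
# implementation; see the note at the top of this file.)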
207 | loss_tf_val = BatchHardTripletLoss.batch_hard_triplet_loss(torch.from_numpy(labels), torch.from_numpy(embeddings), margin, squared=squared) 208 | assert np.allclose(loss_np, loss_tf_val) 209 | 210 | if __name__ == '__main__': 211 | test_pairwise_distances() 212 | test_pairwise_distances_are_positive() 213 | test_triplet_mask() 214 | test_anchor_positive_triplet_mask() 215 | test_anchor_negative_triplet_mask() 216 | test_batch_hard_triplet_loss() 217 | print("--TESTS done ---") -------------------------------------------------------------------------------- /sentence_transformers/datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | This files contains various pytorch dataset classes, that provide 3 | data to the Transformer model 4 | """ 5 | from torch.utils.data import Dataset 6 | from typing import List 7 | from torch import Tensor 8 | import bisect 9 | import torch 10 | import logging 11 | import numpy as np 12 | from tqdm import tqdm 13 | from . import SentenceTransformer 14 | from .readers.InputExample import InputExample 15 | 16 | 17 | class SentencesDataset(Dataset): 18 | """ 19 | Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all 20 | sequences to the max length. 21 | The SentenceBertEncoder.smart_batching_collate is required for this to work. 22 | SmartBatchingDataset does *not* work without it. 23 | """ 24 | def __init__(self, examples: List[InputExample], model: SentenceTransformer, show_progress_bar: bool = None): 25 | """ 26 | Create a new SentencesDataset with the tokenized texts and the labels as Tensor 27 | """ 28 | if show_progress_bar is None: 29 | show_progress_bar = (logging.getLogger().getEffectiveLevel() == logging.INFO or logging.getLogger().getEffectiveLevel() == logging.DEBUG) 30 | self.show_progress_bar = show_progress_bar 31 | 32 | self.convert_input_examples(examples, model) 33 | 34 | def convert_input_examples(self, examples: List[InputExample], model: SentenceTransformer): 35 | """ 36 | Converts input examples to a SmartBatchingDataset usable to train the model with 37 | SentenceTransformer.smart_batching_collate as the collate_fn for the DataLoader 38 | 39 | smart_batching_collate as collate_fn is required because it transforms the tokenized texts to the tensors. 
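(The evaluator classes shown above perform this assignment themselves, e.g. self.dataloader.collate_fn = model.smart_batching_collate.)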
40 | 41 | :param examples: 42 | the input examples for the training 43 | :param model 44 | the Sentence BERT model for the conversion 45 | :return: a SmartBatchingDataset usable to train the model with SentenceTransformer.smart_batching_collate as the collate_fn 46 | for the DataLoader 47 | """ 48 | num_texts = len(examples[0].texts) 49 | inputs = [[] for _ in range(num_texts)] 50 | labels = [] 51 | too_long = [0] * num_texts 52 | label_type = None 53 | iterator = examples 54 | max_seq_length = model.get_max_seq_length() 55 | 56 | if self.show_progress_bar: 57 | iterator = tqdm(iterator, desc="Convert dataset") 58 | 59 | for ex_index, example in enumerate(iterator): 60 | if label_type is None: 61 | if isinstance(example.label, int): 62 | label_type = torch.long 63 | elif isinstance(example.label, float): 64 | label_type = torch.float 65 | tokenized_texts = [model.tokenize(text) for text in example.texts] 66 | 67 | for i, token in enumerate(tokenized_texts): 68 | if max_seq_length != None and max_seq_length > 0 and len(token) >= max_seq_length: 69 | too_long[i] += 1 70 | 71 | labels.append(example.label) 72 | for i in range(num_texts): 73 | inputs[i].append(tokenized_texts[i]) 74 | 75 | tensor_labels = torch.tensor(labels, dtype=label_type) 76 | 77 | logging.info("Num sentences: %d" % (len(examples))) 78 | for i in range(num_texts): 79 | logging.info("Sentences {} longer than max_seqence_length: {}".format(i, too_long[i])) 80 | 81 | self.tokens = inputs 82 | self.labels = tensor_labels 83 | 84 | def __getitem__(self, item): 85 | return [self.tokens[i][item] for i in range(len(self.tokens))], self.labels[item] 86 | 87 | def __len__(self): 88 | return len(self.tokens[0]) 89 | 90 | 91 | class SentenceLabelDataset(Dataset): 92 | """ 93 | Dataset for training with triplet loss. 94 | This dataset takes a list of sentences grouped by their label and uses this grouping to dynamically select a 95 | positive example from the same group and a negative example from the other sentences for a selected anchor sentence. 96 | 97 | This dataset should be used in combination with dataset_reader.LabelSentenceReader 98 | 99 | One iteration over this dataset selects every sentence as anchor once. 100 | 101 | This also uses smart batching like SentenceDataset. 102 | """ 103 | tokens: List[List[str]] 104 | labels: Tensor 105 | num_labels: int 106 | labels_right_border: List[int] 107 | 108 | def __init__(self, examples: List[InputExample], model: SentenceTransformer, provide_positive: bool = True, 109 | provide_negative: bool = True): 110 | """ 111 | Converts input examples to a SentenceLabelDataset usable to train the model with 112 | SentenceTransformer.smart_batching_collate as the collate_fn for the DataLoader 113 | 114 | Assumes only one sentence per InputExample and labels as integers from 0 to max_num_labels 115 | and should be used in combination with dataset_reader.LabelSentenceReader. 116 | 117 | Labels with only one example are ignored. 118 | 119 | smart_batching_collate as collate_fn is required because it transforms the tokenized texts to the tensors. 120 | 121 | :param examples: 122 | the input examples for the training 123 | :param model 124 | the Sentence BERT model for the conversion 125 | :param provide_positive: 126 | set this to False, if you don't need a positive example (e.g. for BATCH_HARD_TRIPLET_LOSS). 127 | :param provide_negative: 128 | set this to False, if you don't need a negative example (e.g. for BATCH_HARD_TRIPLET_LOSS 129 | or MULTIPLE_NEGATIVES_RANKING_LOSS). 
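A rough usage sketch (illustrative only; the reader call, the file name and the choice of TripletLoss are assumptions, any compatible loss works):

    label_examples = LabelSentenceReader('datasets/my_labeled_data').get_examples('train.tsv')
    train_data = SentenceLabelDataset(label_examples, model)
    train_dataloader = DataLoader(train_data, shuffle=True, batch_size=32)
    model.fit(train_objectives=[(train_dataloader, losses.TripletLoss(model=model))], epochs=1)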
130 | """ 131 | self.convert_input_examples(examples, model) 132 | self.idxs = np.arange(len(self.tokens)) 133 | self.positive = provide_positive 134 | self.negative = provide_negative 135 | 136 | def convert_input_examples(self, examples: List[InputExample], model: SentenceTransformer): 137 | """ 138 | Converts input examples to a SentenceLabelDataset. 139 | 140 | Assumes only one sentence per InputExample and labels as integers from 0 to max_num_labels 141 | and should be used in combination with dataset_reader.LabelSentenceReader. 142 | 143 | Labels with only one example are ignored. 144 | 145 | :param examples: 146 | the input examples for the training 147 | :param model 148 | the Sentence Transformer model for the conversion 149 | """ 150 | self.labels_right_border = [] 151 | self.num_labels = 0 152 | inputs = [] 153 | labels = [] 154 | 155 | label_sent_mapping = {} 156 | too_long = 0 157 | label_type = None 158 | for ex_index, example in enumerate(tqdm(examples, desc="Convert dataset")): 159 | if label_type is None: 160 | if isinstance(example.label, int): 161 | label_type = torch.long 162 | elif isinstance(example.label, float): 163 | label_type = torch.float 164 | tokenized_text = model.tokenize(example.texts[0]) 165 | 166 | if hasattr(model, 'max_seq_length') and model.max_seq_length is not None and model.max_seq_length > 0 and len(tokenized_text) >= model.max_seq_length: 167 | too_long += 1 168 | if example.label in label_sent_mapping: 169 | label_sent_mapping[example.label].append(ex_index) 170 | else: 171 | label_sent_mapping[example.label] = [ex_index] 172 | labels.append(example.label) 173 | inputs.append(tokenized_text) 174 | 175 | grouped_inputs = [] 176 | for i in range(len(label_sent_mapping)): 177 | if len(label_sent_mapping[i]) >= 2: 178 | grouped_inputs.extend([inputs[j] for j in label_sent_mapping[i]]) 179 | self.labels_right_border.append(len(grouped_inputs)) 180 | self.num_labels += 1 181 | 182 | tensor_labels = torch.tensor(labels, dtype=label_type) 183 | 184 | logging.info("Num sentences: %d" % (len(grouped_inputs))) 185 | logging.info("Sentences longer than max_seqence_length: {}".format(too_long)) 186 | logging.info("Number of labels with >1 examples: {}".format(self.num_labels)) 187 | self.tokens = grouped_inputs 188 | self.labels = tensor_labels 189 | 190 | def __getitem__(self, item): 191 | if not self.positive and not self.negative: 192 | return [self.tokens[item]], self.labels[item] 193 | 194 | label = bisect.bisect_right(self.labels_right_border, item) 195 | left_border = 0 if label == 0 else self.labels_right_border[label-1] 196 | right_border = self.labels_right_border[label] 197 | positive_item = np.random.choice(np.concatenate([self.idxs[left_border:item], self.idxs[item+1:right_border]])) 198 | negative_item = np.random.choice(np.concatenate([self.idxs[0:left_border], self.idxs[right_border:]])) 199 | 200 | if self.positive: 201 | positive = [self.tokens[positive_item]] 202 | else: 203 | positive = [] 204 | if self.negative: 205 | negative = [self.tokens[negative_item]] 206 | else: 207 | negative = [] 208 | 209 | return [self.tokens[item]]+positive+negative, self.labels[item] 210 | 211 | def __len__(self): 212 | return len(self.tokens) 213 | --------------------------------------------------------------------------------