├── .gitignore ├── DataNLI ├── labels.dev.gz ├── labels.test.gz ├── labels.train.gz ├── s1.dev.gz ├── s1.test.gz ├── s1.train.gz ├── s2.dev.gz ├── s2.test.gz └── s2.train.gz ├── LICENSE ├── NOTICE.txt ├── README.md ├── docs └── pretrained-models │ ├── multilingual-distillation.png │ ├── multilingual-models.md │ ├── nli-models.md │ ├── sts-models.md │ └── wikipedia-sections-models.md ├── examples ├── README.md ├── applications │ ├── basic_embedding.py │ ├── clustering.py │ ├── clustering_wikipedia_sections.py │ └── semantic_search.py ├── datasets │ ├── README.md │ ├── clean_sts.py │ ├── get_data.py │ ├── stsbenchmark │ │ ├── LICENSE.txt │ │ ├── correlation.pl │ │ ├── readme.txt │ │ ├── sts-dev_vi.csv │ │ ├── sts-test_vi.csv │ │ ├── sts-train-dev_vi.csv │ │ └── sts-train_vi.csv │ └── translate_sts.py ├── evaluate_STSb_datasets │ ├── sbert_embbeding │ │ ├── training.py │ │ ├── training_CNN.py │ │ ├── training_LSTM.py │ │ └── training_NLI.py │ └── word_embbeding │ │ ├── training_biltsm.py │ │ ├── training_cnn.py │ │ ├── training_w2v_no_word_segmentation.py │ │ └── training_w2v_word_segmentation.py ├── evaluation │ ├── evaluation_inference_speed.py │ ├── evaluation_stsbenchmark.py │ └── evaluation_stsbenchmark_sbert-wk.py ├── training_basic_models │ ├── training_stsbenchmark_avg_word_embeddings.py │ ├── training_stsbenchmark_bilstm.py │ ├── training_stsbenchmark_bow.py │ ├── training_stsbenchmark_cnn.py │ └── training_stsbenchmark_tf-idf_word_embeddings.py ├── training_multilingual │ └── training_sbert-en-de.py └── training_transformers │ ├── training_multi-task.py │ ├── training_nli.py │ ├── training_nli_phobert.py │ ├── training_stsbenchmark.py │ ├── training_stsbenchmark_continue_training.py │ └── training_wikipedia_sections.py ├── requirements.txt ├── sentence_transformers ├── LoggingHandler.py ├── SentenceTransformer.py ├── __init__.py ├── data_samplers.py ├── datasets │ ├── ParallelSentencesDataset.py │ ├── SentenceLabelDataset.py │ ├── SentencesDataset.py │ └── __init__.py ├── evaluation │ ├── BinaryEmbeddingSimilarityEvaluator.py │ ├── EmbeddingSimilarityEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── MSEEvaluator.py │ ├── SentenceEvaluator.py │ ├── SequentialEvaluator.py │ ├── SimilarityFunction.py │ ├── TranslationEvaluator.py │ ├── TripletEvaluator.py │ └── __init__.py ├── losses │ ├── BatchHardTripletLoss.py │ ├── CosineSimilarityLoss.py │ ├── MSELoss.py │ ├── MultipleNegativesRankingLoss.py │ ├── SoftmaxLoss.py │ ├── TripletLoss.py │ ├── __init__.py │ └── test_batch_hard_triplet_loss.py ├── models │ ├── ADVANCED_CNN.py │ ├── ALBERT.py │ ├── BERT.py │ ├── BERT_LSTM.py │ ├── BoW.py │ ├── CNN.py │ ├── CamemBERT.py │ ├── Dense.py │ ├── DistilBERT.py │ ├── LSTM.py │ ├── PhoBERT.py │ ├── Pooling.py │ ├── RoBERTa.py │ ├── T5.py │ ├── Transformer.py │ ├── WKPooling.py │ ├── WeightedLayerPooling.py │ ├── WordEmbeddings.py │ ├── WordWeights.py │ ├── XLMRoBERTa.py │ ├── XLNet.py │ ├── __init__.py │ ├── proposed_CNN.py │ └── tokenizer │ │ ├── PhoTokenizer.py │ │ ├── PhraseTokenizer.py │ │ ├── VietnameseTokenizer.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ └── __init__.py ├── readers │ ├── InputExample.py │ ├── LabelSentenceReader.py │ ├── NLIDataReader.py │ ├── PairedFilesReader.py │ ├── STSDataReader.py │ ├── TripletReader.py │ └── __init__.py └── util.py ├── setup.cfg ├── setup.py ├── tests ├── test_pretrained_stsb.py └── test_wkpooling.py └── training_NLI.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 
*.pyc 3 | examples/output 4 | sentence_transformers.egg-info 5 | dist/ 6 | nr_*/ 7 | .vscode/ 8 | __pycache__/ 9 | Pre-trained_models -------------------------------------------------------------------------------- /DataNLI/labels.dev.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/labels.dev.gz -------------------------------------------------------------------------------- /DataNLI/labels.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/labels.test.gz -------------------------------------------------------------------------------- /DataNLI/labels.train.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/labels.train.gz -------------------------------------------------------------------------------- /DataNLI/s1.dev.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s1.dev.gz -------------------------------------------------------------------------------- /DataNLI/s1.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s1.test.gz -------------------------------------------------------------------------------- /DataNLI/s1.train.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s1.train.gz -------------------------------------------------------------------------------- /DataNLI/s2.dev.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s2.dev.gz -------------------------------------------------------------------------------- /DataNLI/s2.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s2.test.gz -------------------------------------------------------------------------------- /DataNLI/s2.train.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s2.train.gz -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Copyright 2019 3 | Ubiquitous Knowledge Processing (UKP) Lab 4 | Technische Universität Darmstadt 5 | ------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /docs/pretrained-models/multilingual-distillation.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/docs/pretrained-models/multilingual-distillation.png -------------------------------------------------------------------------------- /docs/pretrained-models/nli-models.md: -------------------------------------------------------------------------------- 1 | # NLI Models 2 | Conneau et al. (2017) show in the InferSent paper ([Supervised Learning of Universal Sentence Representations from Natural Language Inference Data](https://arxiv.org/abs/1705.02364)) that training on Natural Language Inference (NLI) data can produce universal sentence embeddings. 3 | 4 | The datasets contain sentence pairs annotated with the labels *entail*, *contradict*, and *neutral*. For both sentences, we compute a sentence embedding. These two embeddings are concatenated and passed to a softmax classifier to derive the final label. 5 | 6 | As shown, this produces sentence embeddings that can be used for various use cases like clustering or semantic search. 7 | 8 | # Datasets 9 | We train the models on the [SNLI](https://nlp.stanford.edu/projects/snli/) and the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) datasets. We call the combination of the two datasets AllNLI. 10 | 11 | For a training example, see [examples/training_transformers/training_nli.py](../../examples/training_transformers/training_nli.py). 12 | 13 | # Pre-trained models 14 | We provide the following pre-trained models. The performance was evaluated on the test set of the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) using Spearman rank correlation. 15 | 16 | 17 | ### BERT models 18 | - **bert-base-nli-mean-tokens**: BERT-base model with mean-tokens pooling. Performance: STSbenchmark: 77.12 19 | - **bert-base-nli-max-tokens**: BERT-base with max-tokens pooling. Performance: STSbenchmark: 77.21 20 | - **bert-base-nli-cls-token**: BERT-base with CLS-token pooling. Performance: STSbenchmark: 76.30 21 | - **bert-large-nli-mean-tokens**: BERT-large with mean-tokens pooling. Performance: STSbenchmark: 79.19 22 | - **bert-large-nli-max-tokens**: BERT-large with max-tokens pooling. Performance: STSbenchmark: 78.41 23 | - **bert-large-nli-cls-token**: BERT-large with CLS-token pooling. Performance: STSbenchmark: 78.29 24 | 25 | ### RoBERTa models 26 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 27 | - **roberta-base-nli-mean-tokens**: RoBERTa-base with mean-tokens pooling. Performance: STSbenchmark: 77.49 28 | - **roberta-large-nli-mean-tokens**: RoBERTa-large with mean-tokens pooling. Performance: STSbenchmark: 78.69 29 | 30 | ### DistilBERT models 31 | DistilBERT is a small, fast, cheap, and light Transformer model based on the BERT architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 32 | - **distilbert-base-nli-mean-tokens**: DistilBERT-base with mean-tokens pooling. Performance: STSbenchmark: 76.97 33 | 34 | # Performance Comparison 35 | Here are the performances on the STS benchmark for other sentence embedding methods, also computed using cosine similarity and Spearman rank correlation: 36 | - Avg. GloVe embeddings: 58.02 37 | - BERT-as-a-service avg.
embeddings: 46.35 38 | - BERT-as-a-service CLS-vector: 16.50 39 | - InferSent - GloVe: 68.03 40 | - Universal Sentence Encoder: 74.92 41 | 42 | # Applications 43 | This model works well for assessing the coarse-grained similarity between sentences. For application examples, see [examples/applications/semantic_search.py](../../examples/applications/semantic_search.py) and [examples/applications/clustering.py](../../examples/applications/clustering.py) -------------------------------------------------------------------------------- /docs/pretrained-models/sts-models.md: -------------------------------------------------------------------------------- 1 | # STS Models 2 | The models were first trained on [NLI data](nli-models.md), then fine-tuned on the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). This generates sentence embeddings that are especially suited for measuring the semantic similarity between sentence pairs. 3 | 4 | # Datasets 5 | We use the training file from the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). 6 | 7 | For a training example, see: 8 | - [examples/training_transformers/training_stsbenchmark.py](../../examples/training_transformers/training_stsbenchmark.py) - Train directly on STS data 9 | - [examples/training_transformers/training_stsbenchmark_continue_training.py](../../examples/training_transformers/training_stsbenchmark_continue_training.py) - First train on NLI, then train on STS data. 10 | 11 | # Pre-trained models 12 | We provide the following pre-trained models: 13 | 14 | ### BERT models 15 | - **bert-base-nli-stsb-mean-tokens**: BERT-base trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 85.14 16 | - **bert-large-nli-stsb-mean-tokens**: BERT-large trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 85.29 17 | 18 | ### RoBERTa models 19 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 20 | - **roberta-base-nli-stsb-mean-tokens**: RoBERTa-base trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 85.40 21 | - **roberta-large-nli-stsb-mean-tokens**: RoBERTa-large trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 86.31 22 | 23 | ### DistilBERT 24 | DistilBERT is a small, fast, cheap, and light Transformer model based on the BERT architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 25 | - **distilbert-base-nli-stsb-mean-tokens**: Performance: STSbenchmark: 84.38 26 | 27 | # Performance Comparison 28 | Here are the performances on the STS benchmark for other sentence embedding methods, also computed using cosine similarity and Spearman rank correlation. Note: these models were not fine-tuned on the STS benchmark. 29 | 30 | - Avg. GloVe embeddings: 58.02 31 | - BERT-as-a-service avg. embeddings: 46.35 32 | - BERT-as-a-service CLS-vector: 16.50 33 | - InferSent - GloVe: 68.03 34 | - Universal Sentence Encoder: 74.92 35 |
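The models above can be loaded by name and compared with cosine similarity. The snippet below is an illustrative sketch (not part of the original docs); it mirrors the scoring used in examples/applications/semantic_search.py and assumes the SentenceTransformer API shown there:

```
from sentence_transformers import SentenceTransformer
import scipy.spatial

model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Encode a sentence pair; encode() returns one numpy vector per sentence
emb1, emb2 = model.encode(['A man is eating food.', 'A man is eating a piece of bread.'])

# Cosine similarity (values close to 1 mean very similar sentences)
similarity = 1 - scipy.spatial.distance.cosine(emb1, emb2)
print("Cosine similarity: {:.4f}".format(similarity))
```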
-------------------------------------------------------------------------------- /docs/pretrained-models/wikipedia-sections-models.md: -------------------------------------------------------------------------------- 1 | # Wikipedia Sections Models 2 | The `wikipedia-sections-models` implement the idea from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 3 | 4 | They were trained with a triplet loss: the anchor and the positive example were sentences from the same section of a Wikipedia article, for example, from the History section of the London article. The negative example came from a different section of the same article, for example, from the Education section of the London article. 5 | 6 | # Dataset 7 | We use the dataset from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 8 | 9 | See [examples/training_transformers/training_wikipedia_sections.py](../../examples/training_transformers/training_wikipedia_sections.py) for how to train on this dataset. 10 | 11 | 12 | # Pre-trained models 13 | We provide the following pre-trained models: 14 | 15 | - **bert-base-wikipedia-sections-mean-tokens**: 80.42% accuracy on the test set. 16 | 17 | You can use them in the following way: 18 | ``` 19 | from sentence_transformers import SentenceTransformer 20 | embedder = SentenceTransformer('pretrained-model-name') 21 | ``` 22 | 23 | # Performance Comparison 24 | Performance (accuracy) reported by Dor et al.: 25 | - mean-vectors: 0.65 26 | - skip-thoughts-CS: 0.615 27 | - skip-thoughts-SICK: 0.547 28 | - triplet-sen: 0.74 29 | 30 | 31 | # Applications 32 | The models achieve a rather low performance on the STS benchmark dataset. The reason for this is the training objective: an anchor, a positive, and a negative example are presented, and the network only has to learn to distinguish the positive from the negative example by ensuring that the negative example is further away from the anchor than the positive example. 33 | 34 | However, it does not matter how far away the negative example is; it can be slightly or very far away. This makes the model rather poor at deciding whether a pair is somewhat similar. It only learns to recognize similar pairs (high scores) and dissimilar pairs (low scores). 35 | 36 | However, this model works well for **fine-grained clustering**. 37 | 38 | For an example, see: 39 | [examples/applications/clustering_wikipedia_sections.py](../../examples/applications/clustering_wikipedia_sections.py) 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | This folder contains various examples of how to use SentenceTransformers. 3 | 4 | ## Datasets 5 | The [datasets](datasets/) folder stores datasets that are used in these examples. To download them, run the following in the datasets folder: 6 | ``` 7 | python get_data.py 8 | ``` 9 | 10 | 11 | ## Applications 12 | The [applications](applications/) folder contains examples of how to use SentenceTransformers for tasks like clustering or semantic search. 13 | 14 | ## Training Transformers 15 | The [training_transformers](training_transformers/) folder contains examples of how to fine-tune transformer models like BERT, RoBERTa, or XLM-RoBERTa to generate sentence embeddings. 16 | 17 | Further, it contains examples for **multi-task learning** and **multilingual learning**. 18 | 19 | ## Training Basic Models 20 | The [training_basic_models](training_basic_models/) folder shows how to train simple models such as average word embeddings or TF-IDF weighted word embeddings. It also contains more complex models based on Deep Averaging Networks (DAN), CNNs, and LSTMs. 21 | 22 | These examples are a good choice when sentence embeddings must be generated at high speed.
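For instance, an average-word-embedding model is just a small stack of modules. The following is a minimal sketch based on training_basic_models/training_stsbenchmark_avg_word_embeddings.py in this folder (the GloVe file name is the one used by that script):

```
from sentence_transformers import SentenceTransformer, models

# Static word embeddings (e.g. GloVe), mean-pooled into one fixed-size sentence vector
word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
embeddings = model.encode(['Sentence embeddings without a transformer are much faster to compute.'])
```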
23 | 24 | ##Evaluation 25 | The [evaluation](evaluation/) folder contains some examples how to evaluate SentenceTransformer models for common tasks. -------------------------------------------------------------------------------- /examples/applications/basic_embedding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This basic example loads a pre-trained model from the web and uses it to 3 | generate sentence embeddings for a given list of sentences. 4 | """ 5 | 6 | from sentence_transformers import SentenceTransformer, LoggingHandler 7 | import numpy as np 8 | import logging 9 | 10 | #### Just some code to print debug information to stdout 11 | np.set_printoptions(threshold=100) 12 | 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | #### /print debug information to stdout 18 | 19 | 20 | 21 | # Load Sentence model (based on BERT) from URL 22 | model = SentenceTransformer('bert-base-nli-mean-tokens') 23 | 24 | # Embed a list of sentences 25 | sentences = ['This framework generates embeddings for each input sentence', 26 | 'Sentences are passed as a list of string.', 27 | 'The quick brown fox jumps over the lazy dog.'] 28 | sentence_embeddings = model.encode(sentences) 29 | 30 | # The result is a list of sentence embeddings as numpy arrays 31 | for sentence, embedding in zip(sentences, sentence_embeddings): 32 | print("Sentence:", sentence) 33 | print("Embedding:", embedding) 34 | print("") 35 | -------------------------------------------------------------------------------- /examples/applications/clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: clustering 3 | 4 | Sentences are mapped to sentence embeddings and then k-mean clustering is applied. 5 | """ 6 | from sentence_transformers import SentenceTransformer 7 | from sklearn.cluster import KMeans 8 | 9 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 10 | 11 | # Corpus with example sentences 12 | corpus = ['A man is eating food.', 13 | 'A man is eating a piece of bread.', 14 | 'A man is eating pasta.', 15 | 'The girl is carrying a baby.', 16 | 'The baby is carried by the woman', 17 | 'A man is riding a horse.', 18 | 'A man is riding a white horse on an enclosed ground.', 19 | 'A monkey is playing drums.', 20 | 'Someone in a gorilla costume is playing a set of drums.', 21 | 'A cheetah is running behind its prey.', 22 | 'A cheetah chases prey on across a field.' 23 | ] 24 | corpus_embeddings = embedder.encode(corpus) 25 | 26 | # Perform kmean clustering 27 | num_clusters = 5 28 | clustering_model = KMeans(n_clusters=num_clusters) 29 | clustering_model.fit(corpus_embeddings) 30 | cluster_assignment = clustering_model.labels_ 31 | 32 | clustered_sentences = [[] for i in range(num_clusters)] 33 | for sentence_id, cluster_id in enumerate(cluster_assignment): 34 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 35 | 36 | for i, cluster in enumerate(clustered_sentences): 37 | print("Cluster ", i+1) 38 | print(cluster) 39 | print("") 40 | -------------------------------------------------------------------------------- /examples/applications/clustering_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples clusters different sentences that come from the same wikipedia article. 
3 | 4 | It uses the 'wikipedia-sections' model, a model that was trained to differentiate if two sentences from the 5 | same article come from the same section or from different sections in that article. 6 | """ 7 | from sentence_transformers import SentenceTransformer 8 | from sklearn.cluster import AgglomerativeClustering 9 | 10 | 11 | 12 | embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens') 13 | 14 | #Sentences and sections are from Wikipeda. 15 | #Source: https://en.wikipedia.org/wiki/Bushnell,_Illinois 16 | corpus = [ 17 | ("Bushnell is located at 40°33′6″N 90°30′29″W (40.551667, -90.507921).", "Geography"), 18 | ("According to the 2010 census, Bushnell has a total area of 2.138 square miles (5.54 km2), of which 2.13 square miles (5.52 km2) (or 99.63%) is land and 0.008 square miles (0.02 km2) (or 0.37%) is water.", "Geography"), 19 | 20 | ("The town was founded in 1854 when the Northern Cross Railroad built a line through the area.", "History"), 21 | ("Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. ", "History"), 22 | ("Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.", "History"), 23 | 24 | ("As of the census[6] of 2000, there were 3,221 people, 1,323 households, and 889 families residing in the city. ", "Demographics"), 25 | ("The population density was 1,573.9 people per square mile (606.7/km²).", "Demographics"), 26 | ("There were 1,446 housing units at an average density of 706.6 per square mile (272.3/km²).", "Demographics"), 27 | 28 | ("From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.", "Music"), 29 | ("Each year around the 4th of July, 25,000 people from all over the world would descend on the small farm town to watch over 300 bands, authors and artists perform at the Cornerstone Farm Campgrounds.", "Music"), 30 | ("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"), 31 | ("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"), 32 | ("Cornerstone held its final festival in 2012 and no longer operates.", "Music"), 33 | 34 | ("Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.", "Horse show"), 35 | ("The show was well known for imported European horses.", "Horse show"), 36 | ("The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. 
Teams have come from many different states and Canada to compete.", "Horse show"), 37 | ] 38 | 39 | sentences = [row[0] for row in corpus] 40 | 41 | corpus_embeddings = embedder.encode(sentences) 42 | num_clusters = len(set([row[1] for row in corpus])) 43 | 44 | #Sklearn clustering 45 | km = AgglomerativeClustering(n_clusters=num_clusters) 46 | km.fit(corpus_embeddings) 47 | 48 | cluster_assignment = km.labels_ 49 | 50 | 51 | clustered_sentences = [[] for i in range(num_clusters)] 52 | for sentence_id, cluster_id in enumerate(cluster_assignment): 53 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 54 | 55 | for i, cluster in enumerate(clustered_sentences): 56 | print("Cluster ", i+1) 57 | for row in cluster: 58 | print("(Gold label: {}) - {}".format(row[1], row[0])) 59 | print("") 60 | 61 | -------------------------------------------------------------------------------- /examples/applications/semantic_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: semantic search 3 | 4 | We have a corpus with various sentences. Then, for a given query sentence, 5 | we want to find the most similar sentence in this corpus. 6 | 7 | This script outputs for various queries the top 5 most similar sentences in the corpus. 8 | """ 9 | from sentence_transformers import SentenceTransformer 10 | import scipy.spatial 11 | 12 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 13 | 14 | # Corpus with example sentences 15 | corpus = ['A man is eating food.', 16 | 'A man is eating a piece of bread.', 17 | 'The girl is carrying a baby.', 18 | 'A man is riding a horse.', 19 | 'A woman is playing violin.', 20 | 'Two men pushed carts through the woods.', 21 | 'A man is riding a white horse on an enclosed ground.', 22 | 'A monkey is playing drums.', 23 | 'A cheetah is running behind its prey.' 24 | ] 25 | corpus_embeddings = embedder.encode(corpus) 26 | 27 | # Query sentences: 28 | queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.'] 29 | query_embeddings = embedder.encode(queries) 30 | 31 | # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity 32 | closest_n = 5 33 | for query, query_embedding in zip(queries, query_embeddings): 34 | distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0] 35 | 36 | results = zip(range(len(distances)), distances) 37 | results = sorted(results, key=lambda x: x[1]) 38 | 39 | print("\n\n======================\n\n") 40 | print("Query:", query) 41 | print("\nTop 5 most similar sentences in corpus:") 42 | 43 | for idx, distance in results[0:closest_n]: 44 | print(corpus[idx].strip(), "(Score: %.4f)" % (1-distance)) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | This folder contains some example datasets that can be used to for training and evaluation of sentence embeddings methods. 3 | 4 | To download these datasets, run: 5 | ``` 6 | python get_data.py 7 | ``` 8 | 9 | It will download the datasets and unzip them into this directory. 
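Once extracted, the STS Benchmark files can be read with the reader class used throughout the example scripts. This is an illustrative sketch (it assumes you run it from this folder and that the stsbenchmark folder has already been extracted by get_data.py):

```
from sentence_transformers.readers import STSBenchmarkDataReader

# Reads tab-separated sentence pairs and their similarity scores, e.g. from sts-test.csv
sts_reader = STSBenchmarkDataReader('stsbenchmark')
examples = sts_reader.get_examples('sts-test.csv')
print("Loaded {} sentence pairs".format(len(examples)))
```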
10 | 11 | 12 | # AllNLI Dataset 13 | The AllNLI dataset is the concatenation of the SNLI dataset (https://nlp.stanford.edu/projects/snli/) and the MultiNLI dataset (https://www.nyu.edu/projects/bowman/multinli/). 14 | 15 | # STS Benchmark 16 | The STS Benchmark (http://ixa2.si.ehu.eus/stswiki) contains sentence pairs with human gold score for their similarity. 17 | -------------------------------------------------------------------------------- /examples/datasets/clean_sts.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | # filter out all csv files 5 | working_dir = Path('/workspace/sentence-transformers/examples/datasets/stsbenchmark/').glob('*_vi.csv') 6 | 7 | for f in working_dir: 8 | # read all lines to memory and use map for efficiency 9 | print(f"Working on {f.name}.") 10 | with open(f) as fi: 11 | data = fi.readlines() 12 | 13 | data = [line.split('\t') for line in data] 14 | ident_sents = [idx for idx, line in enumerate(data) if line[4] == '5.000' and line[5] == line[6]] 15 | print("line(s)", ident_sents) -------------------------------------------------------------------------------- /examples/datasets/get_data.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import zipfile 3 | import os 4 | folder_path = os.path.dirname(os.path.realpath(__file__)) 5 | print('Beginning download of datasets') 6 | 7 | datasets = ['AllNLI.zip', 'stsbenchmark.zip', 'wikipedia-sections-triplets.zip', 'STS2017.en-de.txt.gz', 'TED2013-en-de.txt.gz', 'xnli-en-de.txt.gz'] 8 | server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/" 9 | 10 | for dataset in datasets: 11 | print("Download", dataset) 12 | url = server+dataset 13 | dataset_path = os.path.join(folder_path, dataset) 14 | urllib.request.urlretrieve(url, dataset_path) 15 | 16 | if dataset.endswith('.zip'): 17 | print("Extract", dataset) 18 | with zipfile.ZipFile(dataset_path, "r") as zip_ref: 19 | zip_ref.extractall(folder_path) 20 | os.remove(dataset_path) 21 | 22 | 23 | print("All datasets downloaded and extracted") 24 | -------------------------------------------------------------------------------- /examples/datasets/stsbenchmark/correlation.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | 4 | =head1 $0 5 | 6 | =head1 SYNOPSIS 7 | 8 | correlation.pl gs system 9 | 10 | Outputs the Pearson correlation. 11 | 12 | Example: 13 | 14 | $ ./correlation.pl gs sys 15 | 16 | Author: Eneko Agirre, Aitor Gonzalez-Agirre 17 | 18 | Dec. 31, 2012 19 | 20 | =cut 21 | 22 | use Getopt::Long qw(:config auto_help); 23 | use Pod::Usage; 24 | use warnings; 25 | use strict; 26 | use Math::Complex; 27 | 28 | pod2usage if $#ARGV != 1 ; 29 | 30 | if (-e $ARGV[1]) { 31 | my $continue = 0; 32 | my %filtered; 33 | my $do = 0; 34 | my %a ; 35 | my %b ; 36 | my %c ; 37 | 38 | open(I,$ARGV[0]) or die $! ; 39 | my $filter = 0; 40 | my $i = 0; 41 | while () { 42 | chomp ; 43 | next if /^\#/ ; 44 | if ($_ eq "") { 45 | $filter++; 46 | $filtered{$filter} = 1; 47 | } 48 | else { 49 | my @fields = (split(/\t/,$_)) ; 50 | my $score = $fields[4] ; 51 | warn "wrong range of score in gold standard: $score\n" if ($score > 5) or ($score < 0) ; 52 | $a{$i++} = $score ; 53 | $filter++; 54 | } 55 | } 56 | close(I) ; 57 | 58 | my $j = 0 ; 59 | 60 | open(I,$ARGV[1]) or die $! 
; 61 | my $line = 1; 62 | while () { 63 | if(!defined($filtered{$line})) { 64 | chomp ; 65 | next if /^\#/ ; 66 | my @fields = (split(/\s+/,$_)) ; 67 | my ($score) = @fields ; 68 | $b{$j} = $score ; 69 | $c{$j} = 100; 70 | $continue = 1; 71 | $j++; 72 | } 73 | $line++; 74 | } 75 | close(I) ; 76 | 77 | if ($continue == 1) { 78 | my $sumw=0; 79 | 80 | my $sumwy=0; 81 | for(my $y = 0; $y < $i; $y++) { 82 | $sumwy = $sumwy + (100 * $a{$y}); 83 | $sumw = $sumw + 100; 84 | } 85 | my $meanyw = $sumwy/$sumw; 86 | 87 | my $sumwx=0; 88 | for(my $x = 0; $x < $i; $x++) { 89 | $sumwx = $sumwx + ($c{$x} * $b{$x}); 90 | } 91 | my $meanxw = $sumwx/$sumw; 92 | 93 | my $sumwxy = 0; 94 | for(my $x = 0; $x < $i; $x++) { 95 | $sumwxy = $sumwxy + $c{$x}*($b{$x} - $meanxw)*($a{$x} - $meanyw); 96 | } 97 | my $covxyw = $sumwxy/$sumw; 98 | 99 | my $sumwxx = 0; 100 | for(my $x = 0; $x < $i; $x++) { 101 | $sumwxx = $sumwxx + $c{$x}*($b{$x} - $meanxw)*($b{$x} - $meanxw); 102 | } 103 | my $covxxw = $sumwxx/$sumw; 104 | 105 | my $sumwyy = 0; 106 | for(my $x = 0; $x < $i; $x++) { 107 | $sumwyy = $sumwyy + $c{$x}*($a{$x} - $meanyw)*($a{$x} - $meanyw); 108 | } 109 | my $covyyw = $sumwyy/$sumw; 110 | 111 | my $corrxyw = $covxyw/sqrt($covxxw*$covyyw); 112 | 113 | printf "Pearson: %.5f\n", $corrxyw ; 114 | } 115 | } 116 | else{ 117 | printf "Pearson: nan\n"; 118 | exit(1); 119 | } 120 | -------------------------------------------------------------------------------- /examples/datasets/translate_sts.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import os 4 | from ast import literal_eval 5 | from pathlib import Path 6 | from typing import List, Callable 7 | from tqdm.auto import tqdm 8 | # from functools import reduce 9 | 10 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/translate-stsbenchmark-ed18aff9282f.json' 11 | 12 | from google.cloud import translate_v2 as translate 13 | translate_client = translate.Client(target_language="vi") 14 | 15 | def gg_translate(text): 16 | return translate_client.translate(text, format_='text', source_language='en', model='nmt')['translatedText'] 17 | 18 | # class Translator(object): 19 | # def __init__(self): 20 | # super().__init__() 21 | 22 | # self.query = { 23 | # 'client': 'gtx', 24 | # 'sl': 'en', 25 | # 'tl': 'vi', 26 | # 'hl': 'vi', 27 | # 'dt': ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'ss', 't'], 28 | # 'ie': 'UTF-8', 29 | # 'oe': 'UTF-8', 30 | # 'otf': 1, 31 | # 'ssel': 0, 32 | # 'tsel': 0, 33 | # 'kc': 7 34 | # } 35 | 36 | # def translate(self, text): 37 | # result = [''] 38 | # try: 39 | # r = requests.post('https://translate.google.com/translate_a/single', params=self.query, data={'q': text}) 40 | # except requests.RequestException as e: 41 | # print(e) 42 | 43 | # # replace all keywords that doesn't exist in python 44 | 45 | # try: 46 | # result = literal_eval(re.sub(r'Array|null|true|false', '0', r.text)) 47 | # except SyntaxError as e: 48 | # print(e) 49 | # print(r.text) 50 | 51 | # # concat sentences 52 | # translated = ' '.join([sent[0] for sent in result[0]]) 53 | 54 | # return translated 55 | 56 | 57 | def translate_sentences_in_stsbenchmark_line(f: Path, line: str, translator: Callable[[str], str]) -> str: 58 | err_basket = [] 59 | new_line = line.split('\t') 60 | try: 61 | # in stsbenchmark, field 5 and 6 (0 indexed) are 2 sentences 62 | new_line[5] = translator(new_line[5].strip()) 63 | new_line[6] = translator(new_line[6].strip()) 64 | 65 | # sometimes, two sentences are 
identical after translating to Vietnamese so we change the similarity score 66 | if new_line[5] == new_line[6]: 67 | new_line[4] = '5.000' 68 | 69 | new_line = '\t'.join(new_line) 70 | except: 71 | err_basket.append(line) 72 | 73 | if len(err_basket): 74 | log_path = ''.join([f.stem, '.log']) 75 | with open(log_path, 'w') as fo: 76 | fo.write('\n'.join(err_basket)) 77 | 78 | return new_line if type(new_line) == str else "@@ERR@@" 79 | 80 | # filter out all csv files 81 | working_dir = Path('/workspace/sentence-transformers/examples/datasets/stsbenchmark/').glob('*.csv') 82 | 83 | # open each file and translate 2 sentences in each line to vietnamese 84 | for f in tqdm(working_dir): 85 | # read all lines to memory and use map for efficiency 86 | print(f"Working on {f.name}.") 87 | with open(f) as fi: 88 | data = fi.readlines() 89 | 90 | print(f"Translating {len(data)} lines.") 91 | data = [translate_sentences_in_stsbenchmark_line(f, line, gg_translate) for line in tqdm(data)] 92 | 93 | new_name = ''.join([f.stem, '_vi', f.suffix]) 94 | 95 | print(f"|--> Saving to {new_name}.") 96 | f.with_name(new_name).write_text('\n'.join(data)) 97 | 98 | print("gimme a breakpoint :))") -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/sbert_embbeding/training.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 4 | 5 | Usage: 6 | python training_nli.py 7 | 8 | OR 9 | python training_nli.py pretrained_transformer_model_name 10 | """ 11 | import torch 12 | from torch.utils.data import DataLoader 13 | import math 14 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 15 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 16 | from sentence_transformers.readers import STSBenchmarkDataReader 17 | import logging 18 | from datetime import datetime 19 | import sys, os 20 | import argparse 21 | import numpy as np 22 | np.random.seed(42) 23 | torch.manual_seed(42) 24 | parser = argparse.ArgumentParser(description='Process some integers.') 25 | parser.add_argument('--batch_size', type=int, default=24) 26 | parser.add_argument('--evaluation_steps', type=int, default= 1000) 27 | parser.add_argument('--ckpt_path', type=str, default = "./output") 28 | parser.add_argument('--num_epochs', type=int, default ="1") 29 | parser.add_argument('--data_path', type=str, default = "./DataNLI") 30 | parser.add_argument('--pre_trained_path', type=str, default = "./PhoBERT") 31 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 32 | parser.add_argument('--bpe_path', type=str, default = "./PhoBERT") 33 | args = parser.parse_args() 34 | 35 | #### Just some code to print debug information to stdout 36 | logging.basicConfig(format='%(asctime)s - %(message)s', 37 | datefmt='%Y-%m-%d %H:%M:%S', 38 | level=logging.INFO, 39 | handlers=[LoggingHandler()]) 40 | #### /print debug information to stdout 41 | if not os.path.exists(args.ckpt_path): 42 | os.mkdir(args.ckpt_path) 43 | 44 | 45 | # Read the dataset 46 | sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True) 47 | 48 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to 
embeddings 49 | word_embedding_model = models.PhoBERT(args.pre_trained_path, tokenizer_args={'vncorenlp_path':args.vncorenlp_path, 'bpe_path':args.bpe_path}) 50 | 51 | 52 | # Apply mean pooling to get one fixed sized sentence vector 53 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 54 | pooling_mode_mean_tokens=True, 55 | pooling_mode_cls_token=False, 56 | pooling_mode_max_tokens=False) 57 | 58 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 59 | 60 | # Convert the dataset to a DataLoader ready for training 61 | logging.info("Read STSbenchmark train dataset") 62 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model) 63 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 64 | train_loss = losses.CosineSimilarityLoss(model=model) 65 | 66 | 67 | logging.info("Read STSbenchmark dev dataset") 68 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 69 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 70 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 71 | 72 | 73 | # Configure the training. We skip evaluation in this example 74 | warmup_steps = math.ceil(len(train_data)*args.num_epochs/args.batch_size*0.1) #10% of train data for warm-up 75 | logging.info("Warmup-steps: {}".format(warmup_steps)) 76 | 77 | 78 | # Train the model 79 | model.fit(train_objectives=[(train_dataloader, train_loss)], 80 | evaluator=evaluator, 81 | epochs=args.num_epochs, 82 | evaluation_steps=1000, 83 | warmup_steps=warmup_steps, 84 | output_path=args.ckpt_path) 85 | 86 | 87 | ############################################################################## 88 | # 89 | # Load the stored model and evaluate its performance on STS benchmark dataset 90 | # 91 | ############################################################################## 92 | 93 | model = SentenceTransformer(args.ckpt_path) 94 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 95 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 96 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 97 | model.evaluate(evaluator) 98 | -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/sbert_embbeding/training_LSTM.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | 5 | Usage: 6 | python training_nli.py 7 | 8 | OR 9 | python training_nli.py pretrained_transformer_model_name 10 | """ 11 | from torch.utils.data import DataLoader 12 | import math 13 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import STSBenchmarkDataReader 16 | import logging 17 | from datetime import datetime 18 | import sys, os 19 | import argparse 20 | import numpy as np 21 | np.random.seed(42) 22 | torch.manual_seed(42) 23 | 24 | parser = argparse.ArgumentParser(description='Process some integers.') 25 | parser.add_argument('--batch_size', type=int, default=24) 26 | parser.add_argument('--evaluation_steps', type=int, default= 1000) 27 | parser.add_argument('--ckpt_path', type=str, default = "./output") 28 | parser.add_argument('--num_epochs', type=int, default ="1") 29 | parser.add_argument('--data_path', type=str, default = "./DataNLI") 30 | parser.add_argument('--pre_trained_path', type=str, default = "./PhoBERT") 31 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 32 | parser.add_argument('--bpe_path', type=str, default = "./PhoBERT") 33 | args = parser.parse_args() 34 | 35 | #### Just some code to print debug information to stdout 36 | logging.basicConfig(format='%(asctime)s - %(message)s', 37 | datefmt='%Y-%m-%d %H:%M:%S', 38 | level=logging.INFO, 39 | handlers=[LoggingHandler()]) 40 | #### /print debug information to stdout 41 | if not os.path.exists(args.ckpt_path): 42 | os.mkdir(args.ckpt_path) 43 | 44 | 45 | # Read the dataset 46 | sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True) 47 | 48 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 49 | word_embedding_model = models.PhoBERT(args.pre_trained_path, tokenizer_args={'vncorenlp_path':args.vncorenlp_path, 'bpe_path':args.bpe_path}) 50 | 51 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=384, num_layers=1)# Apply mean pooling to get one fixed sized sentence vector 52 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 53 | pooling_mode_mean_tokens=True, 54 | pooling_mode_cls_token=False, 55 | pooling_mode_max_tokens=False) 56 | 57 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 58 | 59 | # Convert the dataset to a DataLoader ready for training 60 | logging.info("Read STSbenchmark train dataset") 61 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model) 62 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 63 | train_loss = losses.CosineSimilarityLoss(model=model) 64 | 65 | 66 | logging.info("Read STSbenchmark dev dataset") 67 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 68 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 69 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 70 | 71 | 72 | # Configure the training. 
We skip evaluation in this example 73 | warmup_steps = math.ceil(len(train_data)*args.num_epochs/args.batch_size*0.1) #10% of train data for warm-up 74 | logging.info("Warmup-steps: {}".format(warmup_steps)) 75 | 76 | 77 | # Train the model 78 | model.fit(train_objectives=[(train_dataloader, train_loss)], 79 | evaluator=evaluator, 80 | epochs=args.num_epochs, 81 | evaluation_steps=1000, 82 | warmup_steps=warmup_steps, 83 | output_path=args.ckpt_path) 84 | 85 | 86 | ############################################################################## 87 | # 88 | # Load the stored model and evaluate its performance on STS benchmark dataset 89 | # 90 | ############################################################################## 91 | 92 | model = SentenceTransformer(args.ckpt_path) 93 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 94 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 95 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 96 | model.evaluate(evaluator, args.ckpt_path) 97 | -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/word_embbeding/training_biltsm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | from sentence_transformers.models.tokenizer.WordTokenizer import VIETNAM_STOP_WORDS_SEGMENTATION 17 | from sentence_transformers.models.tokenizer.VietnameseTokenizer import * 18 | import argparse 19 | import numpy as np 20 | np.random.seed(42) 21 | torch.manual_seed(42) 22 | parser = argparse.ArgumentParser(description='Process some integers.') 23 | parser.add_argument('--batch_size', type=int, default=24) 24 | parser.add_argument('--ckpt_path', type=str, default = "./output") 25 | parser.add_argument('--num_epochs', type=int, default ="1") 26 | parser.add_argument('--data_path', type=str, default = "./stsbenchmark'") 27 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 28 | parser.add_argument('--embeddings_file_path', type=str, default= "./glove.6B.300d.txt.gz") 29 | args = parser.parse_args() 30 | 31 | #### Just some code to print debug information to stdout 32 | logging.basicConfig(format='%(asctime)s - %(message)s', 33 | datefmt='%Y-%m-%d %H:%M:%S', 34 | level=logging.INFO, 35 | handlers=[LoggingHandler()]) 36 | #### /print debug information to stdout 37 | if not os.path.exists(args.ckpt_path): 38 | os.mkdir(args.ckpt_path) 39 | 40 | # Read the dataset 41 | sts_reader = STSBenchmarkDataReader(args.data_path) 42 | 43 | 44 | 45 | # Map tokens to traditional word embeddings like GloVe 46 | word_embedding_model = models.WordEmbeddings.from_text_file(embeddings_file_path=args.embeddings_file_path, tokenizer = VietnameseTokenizer(stop_words=VIETNAM_STOP_WORDS_SEGMENTATION, 
vncorenlp_path=args.vncorenlp_path)) 47 | 48 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=150) 49 | 50 | # Apply mean pooling to get one fixed sized sentence vector 51 | pooling_model = models.Pooling(lstm.get_word_embedding_dimension(), 52 | pooling_mode_mean_tokens=False, 53 | pooling_mode_cls_token=False, 54 | pooling_mode_max_tokens=True) 55 | 56 | 57 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 58 | 59 | 60 | # Convert the dataset to a DataLoader ready for training 61 | logging.info("Read STSbenchmark train dataset") 62 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model=model) 63 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 64 | train_loss = losses.CosineSimilarityLoss(model=model) 65 | 66 | logging.info("Read STSbenchmark dev dataset") 67 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 68 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 69 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 70 | 71 | # Configure the training 72 | warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.batch_size * 0.1) #10% of train data for warm-up 73 | logging.info("Warmup-steps: {}".format(warmup_steps)) 74 | 75 | # Train the model 76 | model.fit(train_objectives=[(train_dataloader, train_loss)], 77 | evaluator=evaluator, 78 | epochs=args.num_epochs, 79 | warmup_steps=warmup_steps, 80 | output_path=args.ckpt_path 81 | ) 82 | 83 | 84 | 85 | ############################################################################## 86 | # 87 | # Load the stored model and evaluate its performance on STS benchmark dataset 88 | # 89 | ############################################################################## 90 | 91 | model = SentenceTransformer(args.ckpt_path) 92 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 93 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 94 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 95 | 96 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/word_embbeding/training_cnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 
6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | from sentence_transformers.models.tokenizer.WordTokenizer import VIETNAM_STOP_WORDS_SEGMENTATION 17 | from sentence_transformers.models.tokenizer.VietnameseTokenizer import * 18 | import argparse 19 | import numpy as np 20 | np.random.seed(42) 21 | torch.manual_seed(42) 22 | 23 | parser = argparse.ArgumentParser(description='Process some integers.') 24 | parser.add_argument('--batch_size', type=int, default=24) 25 | parser.add_argument('--ckpt_path', type=str, default = "./output") 26 | parser.add_argument('--num_epochs', type=int, default ="1") 27 | parser.add_argument('--data_path', type=str, default = "./stsbenchmark'") 28 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 29 | parser.add_argument('--embeddings_file_path', type=str, default= "./glove.6B.300d.txt.gz") 30 | args = parser.parse_args() 31 | 32 | #### Just some code to print debug information to stdout 33 | logging.basicConfig(format='%(asctime)s - %(message)s', 34 | datefmt='%Y-%m-%d %H:%M:%S', 35 | level=logging.INFO, 36 | handlers=[LoggingHandler()]) 37 | #### /print debug information to stdout 38 | 39 | if not os.path.exists(args.ckpt_path): 40 | os.mkdir(args.ckpt_path) 41 | 42 | # Read the dataset 43 | sts_reader = STSBenchmarkDataReader(args.data_path) 44 | 45 | 46 | 47 | # Map tokens to traditional word embeddings like GloVe 48 | word_embedding_model = models.WordEmbeddings.from_text_file(embeddings_file_path=args.embeddings_file_path, tokenizer = VietnameseTokenizer(stop_words=VIETNAM_STOP_WORDS_SEGMENTATION, vncorenlp_path=args.vncorenlp_path)) 49 | 50 | cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), out_channels=300, kernel_sizes=[1,3,5]) 51 | 52 | # Apply mean pooling to get one fixed sized sentence vector 53 | pooling_model = models.Pooling(cnn.get_word_embedding_dimension(), 54 | pooling_mode_mean_tokens=False, 55 | pooling_mode_cls_token=False, 56 | pooling_mode_max_tokens=True) 57 | 58 | 59 | model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model]) 60 | 61 | 62 | # Convert the dataset to a DataLoader ready for training 63 | logging.info("Read STSbenchmark train dataset") 64 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model=model) 65 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 66 | train_loss = losses.CosineSimilarityLoss(model=model) 67 | 68 | logging.info("Read STSbenchmark dev dataset") 69 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 70 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 71 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 72 | 73 | # Configure the training 74 | warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.batch_size * 0.1) #10% of train data for warm-up 75 | logging.info("Warmup-steps: {}".format(warmup_steps)) 76 | 77 | # Train the model 78 | model.fit(train_objectives=[(train_dataloader, train_loss)], 79 | evaluator=evaluator, 80 | epochs=args.num_epochs, 81 | 
warmup_steps=warmup_steps, 82 | output_path=args.ckpt_path 83 | ) 84 | 85 | 86 | 87 | ############################################################################## 88 | # 89 | # Load the stored model and evaluate its performance on STS benchmark dataset 90 | # 91 | ############################################################################## 92 | 93 | model = SentenceTransformer(args.ckpt_path) 94 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 95 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 96 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 97 | 98 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/evaluation/evaluation_inference_speed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples measures the inference speed of a certain model 3 | 4 | Usage: 5 | python evaluation_inference_speed.py 6 | OR 7 | python evaluation_inference_speed.py model_name 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import sys 15 | import os 16 | import time 17 | import torch 18 | 19 | #Limit torch to 4 threads 20 | torch.set_num_threads(4) 21 | 22 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 23 | 24 | 25 | #### Just some code to print debug information to stdout 26 | logging.basicConfig(format='%(asctime)s - %(message)s', 27 | datefmt='%Y-%m-%d %H:%M:%S', 28 | level=logging.INFO, 29 | handlers=[LoggingHandler()]) 30 | #### /print debug information to stdout 31 | 32 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens' 33 | 34 | # Load a named sentence model (based on BERT). This will download the model from our server. 
35 | # Alternatively, you can also pass a filepath to SentenceTransformer() 36 | model = SentenceTransformer(model_name) 37 | 38 | sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark')) 39 | examples = sts_reader.get_examples("sts-train.csv") 40 | sentences = [text for ex in examples for text in ex.texts] 41 | print("Number of sentences:", len(sentences)) 42 | 43 | start_time = time.time() 44 | emb = model.encode(sentences, batch_size=32) 45 | end_time = time.time() 46 | diff_time = end_time - start_time 47 | print("Done after {:.2f} sec".format(diff_time)) 48 | print("Speed: {:.2f}".format(len(sentences) / diff_time)) -------------------------------------------------------------------------------- /examples/evaluation/evaluation_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples loads a pre-trained model and evaluates it on the STSbenchmark dataset 3 | 4 | Usage: 5 | python evaluation_stsbenchmark.py 6 | OR 7 | python evaluation_stsbenchmark.py model_name 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import sys 15 | import os 16 | import torch 17 | 18 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 19 | 20 | #Limit torch to 4 threads 21 | torch.set_num_threads(4) 22 | 23 | #### Just some code to print debug information to stdout 24 | logging.basicConfig(format='%(asctime)s - %(message)s', 25 | datefmt='%Y-%m-%d %H:%M:%S', 26 | level=logging.INFO, 27 | handlers=[LoggingHandler()]) 28 | #### /print debug information to stdout 29 | 30 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens' 31 | 32 | # Load a named sentence model (based on BERT). This will download the model from our server. 33 | # Alternatively, you can also pass a filepath to SentenceTransformer() 34 | model = SentenceTransformer(model_name) 35 | 36 | sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark')) 37 | 38 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 39 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 40 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 41 | 42 | model.evaluate(evaluator) 43 | -------------------------------------------------------------------------------- /examples/evaluation/evaluation_stsbenchmark_sbert-wk.py: -------------------------------------------------------------------------------- 1 | """ 2 | Performs the pooling described in the paper: 3 | SBERT-WK: A Sentence Embedding Method by Dissecting BERT-based Word Models, 2020, https://arxiv.org/abs/2002.06652 4 | 5 | Note: WKPooling improves the performance only for certain models. Further, WKPooling requires QR-decomposition, 6 | for which there is so far not efficient implementation in pytorch for GPUs (see https://github.com/pytorch/pytorch/issues/22573). 7 | Hence, WKPooling runs on the GPU, which makes it rather in-efficient. 
8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, models 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import torch 15 | 16 | #Limit torch to 4 threads, as this example runs on the CPU 17 | torch.set_num_threads(4) 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | 27 | #1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true 28 | word_embedding_model = models.Transformer('bert-base-uncased', model_args={'output_hidden_states': True}) 29 | 30 | #2) Add WKPooling 31 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 32 | 33 | #3) Create a sentence transformer model to glue both models together 34 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 35 | 36 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 37 | 38 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 39 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 40 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 41 | 42 | model.evaluate(evaluator) 43 | -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_avg_word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses average word embeddings (for example from GloVe). It adds two fully-connected feed-forward layers (dense layers) to create a Deep Averaging Network (DAN). 3 | 4 | If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server. 
5 | 6 | See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/ 7 | for available word embeddings files 8 | """ 9 | import torch 10 | from torch.utils.data import DataLoader 11 | import math 12 | from sentence_transformers import models, losses 13 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import * 16 | import logging 17 | from datetime import datetime 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | # Read the dataset 27 | batch_size = 32 28 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 29 | model_save_path = 'output/training_stsbenchmark_avg_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Map tokens to traditional word embeddings like GloVe 34 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | # Add two trainable feed-forward networks (DAN) 43 | sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension() 44 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 45 | dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 46 | 47 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dan1, dan2]) 48 | 49 | 50 | # Convert the dataset to a DataLoader ready for training 51 | logging.info("Read STSbenchmark train dataset") 52 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 53 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 54 | train_loss = losses.CosineSimilarityLoss(model=model) 55 | 56 | logging.info("Read STSbenchmark dev dataset") 57 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 58 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 59 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 60 | 61 | # Configure the training 62 | num_epochs = 10 63 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 64 | logging.info("Warmup-steps: {}".format(warmup_steps)) 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = 
EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_bilstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to traditional word embeddings like GloVe 32 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 33 | 34 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=1024) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(lstm.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=False, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=True) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | 
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_bow.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses a simple bag-of-words (BoW) approach. A sentence is mapped 3 | to a sparse vector with e.g. 25,000 dimensions. Optionally, you can also use tf-idf. 4 | 5 | To make the model trainable, we add multiple dense layers to create a Deep Averaging Network (DAN). 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | from sentence_transformers.models.tokenizer.WordTokenizer import ENGLISH_STOP_WORDS 15 | import logging 16 | from datetime import datetime 17 | 18 | #### Just some code to print debug information to stdout 19 | logging.basicConfig(format='%(asctime)s - %(message)s', 20 | datefmt='%Y-%m-%d %H:%M:%S', 21 | level=logging.INFO, 22 | handlers=[LoggingHandler()]) 23 | #### /print debug information to stdout 24 | 25 | # Read the dataset 26 | batch_size = 32 27 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 28 | model_save_path = 'output/training_tf-idf_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Create the vocab for the BoW model 33 | stop_words = ENGLISH_STOP_WORDS 34 | max_vocab_size = 25000 #This is also the size of the BoW sentence vector. 35 | 36 | 37 | #Read the most common max_vocab_size words. Skip stop-words 38 | vocab = set() 39 | weights = {} 40 | lines = open('wikipedia_doc_frequencies.txt', encoding='utf8').readlines() 41 | num_docs = int(lines[0]) 42 | for line in lines[1:]: 43 | word, freq = line.lower().strip().split("\t") 44 | if word in stop_words: 45 | continue 46 | 47 | vocab.add(word) 48 | weights[word] = math.log(num_docs/int(freq)) 49 | 50 | if len(vocab) >= max_vocab_size: 51 | break 52 | 53 | #Create the BoW model. Because we set word_weights to the IDF values and cumulative_term_frequency=True, we 54 | #get tf-idf vectors. Set word_weights to an empty dict and cumulative_term_frequency=False to get a 1-hot sentence encoding 55 | bow = models.BoW(vocab=vocab, word_weights=weights, cumulative_term_frequency=True) 56 | 57 | # Add two trainable feed-forward networks (DAN) with max_vocab_size -> 768 -> 512 dimensions. 
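# The sparse tf-idf vector (max_vocab_size dimensions) is projected down to 768 and then to 512 dimensions; the 512-dimensional output of the second Dense layer is the sentence embedding that the CosineSimilarityLoss below operates on.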
58 | sent_embeddings_dimension = max_vocab_size 59 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=768) 60 | dan2 = models.Dense(in_features=768, out_features=512) 61 | 62 | model = SentenceTransformer(modules=[bow, dan1, dan2]) 63 | 64 | 65 | # Convert the dataset to a DataLoader ready for training 66 | logging.info("Read STSbenchmark train dataset") 67 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 68 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 69 | train_loss = losses.CosineSimilarityLoss(model=model) 70 | 71 | logging.info("Read STSbenchmark dev dataset") 72 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 73 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 75 | 76 | # Configure the training 77 | num_epochs = 10 78 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 79 | logging.info("Warmup-steps: {}".format(warmup_steps)) 80 | 81 | # Train the model 82 | model.fit(train_objectives=[(train_dataloader, train_loss)], 83 | evaluator=evaluator, 84 | epochs=num_epochs, 85 | warmup_steps=warmup_steps, 86 | output_path=model_save_path 87 | ) 88 | 89 | 90 | 91 | ############################################################################## 92 | # 93 | # Load the stored model and evaluate its performance on STS benchmark dataset 94 | # 95 | ############################################################################## 96 | 97 | model = SentenceTransformer(model_save_path) 98 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 99 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 100 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 101 | 102 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_cnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a CNN after the word embedding lookup. The output of the CNN is then pooled, 3 | for example with mean-pooling.
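In this script, the CNN sits on top of BERT token embeddings (models.BERT below); its output is then mean-pooled into one fixed-size sentence vector.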
4 | 5 | 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to vectors using BERT 32 | word_embedding_model = models.BERT('bert-base-uncased') 33 | 34 | cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), out_channels=256, kernel_sizes=[1,3,5]) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(cnn.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_multilingual/training_sbert-en-de.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script contains an example how to extend a model to new languages. 
3 | 4 | We use an existent (English) teacher sentence embedding model and extend it to a new language, in this case, German. 5 | 6 | In order to run this example, you must download these files: 7 | https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/TED2013-en-de.txt.gz 8 | https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/STS2017.en-de.txt.gz 9 | https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/xnli-en-de.txt.gz 10 | 11 | And store them in the datasets-folder. 12 | 13 | You can then run this code like this: 14 | python training_multilingual.py datasets/TED2013-en-de.txt.gz 15 | """ 16 | 17 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, readers, evaluation, losses 18 | from torch.utils.data import DataLoader 19 | from sentence_transformers.datasets import ParallelSentencesDataset 20 | from datetime import datetime 21 | 22 | import csv 23 | import logging 24 | import sys 25 | import torch 26 | import os 27 | import numpy as np 28 | 29 | #We can pass multiple train files to this script 30 | train_files = sys.argv[1:] 31 | 32 | 33 | if len(train_files) == 0: 34 | print("Please specify at least 1 training file: python training_multilingual.py path/to/trainfile.txt") 35 | 36 | logging.basicConfig(format='%(asctime)s - %(message)s', 37 | datefmt='%Y-%m-%d %H:%M:%S', 38 | level=logging.INFO, 39 | handlers=[LoggingHandler()]) 40 | 41 | max_seq_length = 128 42 | train_batch_size = 64 43 | 44 | logging.info("Load teacher model") 45 | teacher_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens') 46 | 47 | logging.info("Create student model from scratch") 48 | word_embedding_model = models.Transformer("xlm-roberta-base") 49 | 50 | # Apply mean pooling to get one fixed sized sentence vector 51 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 52 | pooling_mode_mean_tokens=True, 53 | pooling_mode_cls_token=False, 54 | pooling_mode_max_tokens=False) 55 | 56 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 57 | 58 | output_path = "output/make-multilingual-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 59 | 60 | logging.info("Create dataset reader") 61 | 62 | 63 | ###### Read Dataset ###### 64 | train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model) 65 | for train_file in train_files: 66 | train_data.load_data(train_file) 67 | 68 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 69 | train_loss = losses.MSELoss(model=model) 70 | 71 | 72 | ###### Load dev sets ###### 73 | 74 | # Test on STS 2017.en-de dataset using Spearman rank correlation 75 | logging.info("Read STS2017.en-de dataset") 76 | evaluators = [] 77 | sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2) 78 | dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model) 79 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 80 | evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de') 81 | evaluators.append(evaluator_sts) 82 | 83 | 84 | # Use XLNI.en-de dataset with MSE evaluation 85 | logging.info("Read XNLI.en-de dataset") 86 | xnli_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model) 87 | xnli_reader.load_data('../datasets/xnli-en-de.txt.gz') 88 | 89 | 
xnli_dataloader = DataLoader(xnli_reader, shuffle=False, batch_size=train_batch_size) 90 | xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de') 91 | evaluators.append(xnli_mse) 92 | 93 | 94 | 95 | # Train the model 96 | model.fit(train_objectives=[(train_dataloader, train_loss)], 97 | evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1]), 98 | epochs=20, 99 | evaluation_steps=1000, 100 | warmup_steps=10000, 101 | scheduler='warmupconstant', 102 | output_path=output_path, 103 | save_best_model=True, 104 | optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False} 105 | ) 106 | 107 | 108 | -------------------------------------------------------------------------------- /examples/training_transformers/training_multi-task.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is an example how to train SentenceTransformers in a multi-task setup. 3 | 4 | The system trains BERT on the AllNLI and on the STSbenchmark dataset. 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'bert-base-uncased' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('../datasets/AllNLI') 26 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_multi-task_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use BERT for mapping tokens to embeddings 33 | word_embedding_model = models.Transformer(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data_nli = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader_nli = DataLoader(train_data_nli, shuffle=True, batch_size=batch_size) 48 | train_loss_nli = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | logging.info("Read STSbenchmark train dataset") 51 | train_data_sts = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 52 | train_dataloader_sts = DataLoader(train_data_sts, shuffle=True, batch_size=batch_size) 53 | train_loss_sts = losses.CosineSimilarityLoss(model=model) 54 | 55 | 56 | logging.info("Read STSbenchmark dev dataset") 57 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 58 | dev_dataloader = DataLoader(dev_data, shuffle=False, 
batch_size=batch_size) 59 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 60 | 61 | # Configure the training 62 | num_epochs = 4 63 | 64 | warmup_steps = math.ceil(len(train_dataloader_sts) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 65 | logging.info("Warmup-steps: {}".format(warmup_steps)) 66 | 67 | 68 | # Here we define the two train objectives: train_dataloader_nli with train_loss_nli (i.e., SoftmaxLoss for NLI data) 69 | # and train_dataloader_sts with train_loss_sts (i.e., CosineSimilarityLoss for STSbenchmark data) 70 | # You can pass as many (dataloader, loss) tuples as you like. They are iterated in a round-robin way. 71 | train_objectives = [(train_dataloader_nli, train_loss_nli), (train_dataloader_sts, train_loss_sts)] 72 | 73 | # Train the model 74 | model.fit(train_objectives=train_objectives, 75 | evaluator=evaluator, 76 | epochs=num_epochs, 77 | evaluation_steps=1000, 78 | warmup_steps=warmup_steps, 79 | output_path=model_save_path 80 | ) 81 | 82 | 83 | 84 | ############################################################################## 85 | # 86 | # Load the stored model and evaluate its performance on STS benchmark dataset 87 | # 88 | ############################################################################## 89 | 90 | model = SentenceTransformer(model_save_path) 91 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 92 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 93 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 94 | 95 | model.evaluate(evaluator) 96 | -------------------------------------------------------------------------------- /examples/training_transformers/training_nli.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. 
At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | 6 | Usage: 7 | python training_nli.py 8 | 9 | OR 10 | python training_nli.py pretrained_transformer_model_name 11 | """ 12 | from torch.utils.data import DataLoader 13 | import math 14 | from sentence_transformers import models, losses 15 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 16 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 17 | from sentence_transformers.readers import * 18 | import logging 19 | from datetime import datetime 20 | import sys 21 | 22 | #### Just some code to print debug information to stdout 23 | logging.basicConfig(format='%(asctime)s - %(message)s', 24 | datefmt='%Y-%m-%d %H:%M:%S', 25 | level=logging.INFO, 26 | handlers=[LoggingHandler()]) 27 | #### /print debug information to stdout 28 | 29 | #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base 30 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased' 31 | 32 | # Read the dataset 33 | batch_size = 16 34 | nli_reader = NLIDataReader('../datasets/AllNLI') 35 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 36 | train_num_labels = nli_reader.get_num_labels() 37 | model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 38 | 39 | 40 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 41 | word_embedding_model = models.Transformer(model_name) 42 | 43 | # Apply mean pooling to get one fixed sized sentence vector 44 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 45 | pooling_mode_mean_tokens=True, 46 | pooling_mode_cls_token=False, 47 | pooling_mode_max_tokens=False) 48 | 49 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 50 | 51 | 52 | # Convert the dataset to a DataLoader ready for training 53 | logging.info("Read AllNLI train dataset") 54 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 55 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 56 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 57 | 58 | 59 | 60 | logging.info("Read STSbenchmark dev dataset") 61 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 62 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 63 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 64 | 65 | # Configure the training 66 | num_epochs = 1 67 | 68 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 69 | logging.info("Warmup-steps: {}".format(warmup_steps)) 70 | 71 | 72 | 73 | # Train the model 74 | model.fit(train_objectives=[(train_dataloader, train_loss)], 75 | evaluator=evaluator, 76 | epochs=num_epochs, 77 | evaluation_steps=1000, 78 | warmup_steps=warmup_steps, 79 | output_path=model_save_path 80 | ) 81 | 82 | 83 | 84 | ############################################################################## 85 | # 86 | # Load the stored model and evaluate its performance on STS benchmark dataset 87 | # 88 | ############################################################################## 89 | 90 | model = SentenceTransformer(model_save_path) 91 | 
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 92 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 93 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 94 | 95 | model.evaluate(evaluator) 96 | -------------------------------------------------------------------------------- /examples/training_transformers/training_nli_phobert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/examples/training_transformers/training_nli_phobert.py -------------------------------------------------------------------------------- /examples/training_transformers/training_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 4 | 5 | Usage: 6 | python training_nli.py 7 | 8 | OR 9 | python training_nli.py pretrained_transformer_model_name 10 | """ 11 | from torch.utils.data import DataLoader 12 | import math 13 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import STSBenchmarkDataReader 16 | import logging 17 | from datetime import datetime 18 | import sys 19 | 20 | #### Just some code to print debug information to stdout 21 | logging.basicConfig(format='%(asctime)s - %(message)s', 22 | datefmt='%Y-%m-%d %H:%M:%S', 23 | level=logging.INFO, 24 | handlers=[LoggingHandler()]) 25 | #### /print debug information to stdout 26 | 27 | #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base 28 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased' 29 | 30 | # Read the dataset 31 | train_batch_size = 16 32 | num_epochs = 4 33 | model_save_path = 'output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 34 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True) 35 | 36 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 37 | word_embedding_model = models.Transformer(model_name) 38 | 39 | # Apply mean pooling to get one fixed sized sentence vector 40 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 41 | pooling_mode_mean_tokens=True, 42 | pooling_mode_cls_token=False, 43 | pooling_mode_max_tokens=False) 44 | 45 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 46 | 47 | # Convert the dataset to a DataLoader ready for training 48 | logging.info("Read STSbenchmark train dataset") 49 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 50 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 51 | train_loss = losses.CosineSimilarityLoss(model=model) 52 | 53 | 54 | logging.info("Read STSbenchmark dev dataset") 55 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 56 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 57 | 
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 58 | 59 | 60 | # Configure the training. We skip evaluation in this example 61 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | # Train the model 66 | model.fit(train_objectives=[(train_dataloader, train_loss)], 67 | evaluator=evaluator, 68 | epochs=num_epochs, 69 | evaluation_steps=1000, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path) 72 | 73 | 74 | ############################################################################## 75 | # 76 | # Load the stored model and evaluate its performance on STS benchmark dataset 77 | # 78 | ############################################################################## 79 | 80 | model = SentenceTransformer(model_save_path) 81 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 82 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 83 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 84 | model.evaluate(evaluator) 85 | -------------------------------------------------------------------------------- /examples/training_transformers/training_stsbenchmark_continue_training.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example loads the pre-trained SentenceTransformer model 'bert-base-nli-mean-tokens' from the server. 3 | It then fine-tunes this model for some epochs on the STS benchmark dataset. 4 | 5 | Note: In this example, you must specify a SentenceTransformer model. 6 | If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py 7 | """ 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | from datetime import datetime 15 | 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | model_name = 'bert-base-nli-mean-tokens' 26 | train_batch_size = 16 27 | num_epochs = 4 28 | model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True) 30 | 31 | # Load a pre-trained sentence transformer model 32 | model = SentenceTransformer(model_name) 33 | 34 | # Convert the dataset to a DataLoader ready for training 35 | logging.info("Read STSbenchmark train dataset") 36 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 37 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 38 | train_loss = losses.CosineSimilarityLoss(model=model) 39 | 40 | 41 | logging.info("Read STSbenchmark dev dataset") 42 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 43 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 44 | evaluator = 
EmbeddingSimilarityEvaluator(dev_dataloader) 45 | 46 | 47 | # Configure the training 48 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 49 | logging.info("Warmup-steps: {}".format(warmup_steps)) 50 | 51 | 52 | # Train the model 53 | model.fit(train_objectives=[(train_dataloader, train_loss)], 54 | evaluator=evaluator, 55 | epochs=num_epochs, 56 | evaluation_steps=1000, 57 | warmup_steps=warmup_steps, 58 | output_path=model_save_path) 59 | 60 | 61 | ############################################################################## 62 | # 63 | # Load the stored model and evaluate its performance on STS benchmark dataset 64 | # 65 | ############################################################################## 66 | 67 | model = SentenceTransformer(model_save_path) 68 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 69 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 70 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 71 | model.evaluate(evaluator) 72 | -------------------------------------------------------------------------------- /examples/training_transformers/training_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script trains sentence transformers with a triplet loss function. 3 | 4 | As corpus, we use the Wikipedia sections dataset that was described by Dor et al., 2018, Learning Thematic Similarity Metric Using Triplet Networks. 5 | 6 | See docs/pretrained-models/wikipedia-sections-models.md for further details. 7 | 8 | You can get the dataset by running examples/datasets/get_data.py 9 | """ 10 | 11 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 12 | from torch.utils.data import DataLoader 13 | from sentence_transformers.readers import TripletReader 14 | from sentence_transformers.evaluation import TripletEvaluator 15 | from datetime import datetime 16 | 17 | import csv 18 | import logging 19 | 20 | 21 | 22 | logging.basicConfig(format='%(asctime)s - %(message)s', 23 | datefmt='%Y-%m-%d %H:%M:%S', 24 | level=logging.INFO, 25 | handlers=[LoggingHandler()]) 26 | 27 | 28 | #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base 29 | model_name = 'bert-base-uncased' 30 | 31 | 32 | ### Create a torch.DataLoader that passes training batch instances to our model 33 | train_batch_size = 16 34 | triplet_reader = TripletReader('../datasets/wikipedia-sections-triplets', s1_col_idx=1, s2_col_idx=2, s3_col_idx=3, delimiter=',', quoting=csv.QUOTE_MINIMAL, has_header=True) 35 | output_path = "output/training-wikipedia-sections-"+model_name+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 36 | num_epochs = 1 37 | 38 | 39 | ### Configure sentence transformers for training and train on the provided dataset 40 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 41 | word_embedding_model = models.Transformer(model_name) 42 | 43 | # Apply mean pooling to get one fixed sized sentence vector 44 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 45 | pooling_mode_mean_tokens=True, 46 | pooling_mode_cls_token=False, 47 | pooling_mode_max_tokens=False) 48 | 49 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
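# The TripletReader above yields (anchor, positive, negative) sentence triplets; TripletLoss (below) trains the model so that the anchor embedding ends up closer to the positive than to the negative, and TripletEvaluator reports how often that ordering is correct on the dev/test triplets.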
50 | 51 | 52 | logging.info("Read Triplet train dataset") 53 | train_data = SentencesDataset(examples=triplet_reader.get_examples('train.csv', max_examples=100000), model=model) 54 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 55 | train_loss = losses.TripletLoss(model=model) 56 | 57 | logging.info("Read Wikipedia Triplet dev dataset") 58 | dev_data = SentencesDataset(examples=triplet_reader.get_examples('validation.csv', 1000), model=model) 59 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 60 | evaluator = TripletEvaluator(dev_dataloader) 61 | 62 | 63 | warmup_steps = int(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data 64 | 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | evaluation_steps=1000, 71 | warmup_steps=warmup_steps, 72 | output_path=output_path) 73 | 74 | ############################################################################## 75 | # 76 | # Load the stored model and evaluate its performance on STS benchmark dataset 77 | # 78 | ############################################################################## 79 | 80 | model = SentenceTransformer(output_path) 81 | test_data = SentencesDataset(examples=triplet_reader.get_examples('test.csv'), model=model) 82 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 83 | evaluator = TripletEvaluator(test_dataloader) 84 | 85 | model.evaluate(evaluator) 86 | 87 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=2.8.0 2 | tqdm 3 | torch>=1.0.1 4 | numpy 5 | scikit-learn 6 | scipy 7 | nltk -------------------------------------------------------------------------------- /sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) -------------------------------------------------------------------------------- /sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.6" 2 | __DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/' 3 | from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset 4 | from .data_samplers import LabelSampler 5 | from .LoggingHandler import LoggingHandler 6 | from .SentenceTransformer import SentenceTransformer 7 | 8 | -------------------------------------------------------------------------------- /sentence_transformers/data_samplers.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains sampler functions, that can be used to sample mini-batches with specific properties. 
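For example, the LabelSampler below draws a fixed number of samples from a single label at a time, which is what losses such as BatchHardTripletLoss expect.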
3 | """ 4 | from torch.utils.data import Sampler 5 | import numpy as np 6 | from .datasets import SentenceLabelDataset 7 | 8 | 9 | class LabelSampler(Sampler): 10 | """ 11 | This sampler is used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS 12 | or MULTIPLE_NEGATIVES_RANKING_LOSS which require multiple or only one sample from one label per batch. 13 | 14 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 15 | 16 | Labels with fewer than n unique samples are ignored. 17 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 18 | 19 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 20 | by the samples drawn per label. 21 | 22 | 23 | """ 24 | def __init__(self, data_source: SentenceLabelDataset, samples_per_label: int = 5, 25 | with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 28 | 29 | :param data_source: 30 | the dataset from which samples are drawn 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__(data_source) 39 | self.data_source = data_source 40 | self.samples_per_label = samples_per_label 41 | self.label_range = np.arange(data_source.num_labels) 42 | self.borders = data_source.labels_right_border 43 | self.with_replacement = with_replacement 44 | np.random.shuffle(self.label_range) 45 | 46 | def __iter__(self): 47 | label_idx = 0 48 | count = 0 49 | already_seen = {} 50 | while count < len(self.data_source): 51 | label = self.label_range[label_idx] 52 | if label not in already_seen: 53 | already_seen[label] = [] 54 | 55 | left_border = 0 if label == 0 else self.borders[label-1] 56 | right_border = self.borders[label] 57 | 58 | if self.with_replacement: 59 | selection = np.arange(left_border, right_border) 60 | else: 61 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 62 | 63 | if len(selection) >= self.samples_per_label: 64 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 65 | count += 1 66 | already_seen[label].append(element_idx) 67 | yield element_idx 68 | 69 | label_idx += 1 70 | if label_idx >= len(self.label_range): 71 | label_idx = 0 72 | np.random.shuffle(self.label_range) 73 | 74 | def __len__(self): 75 | return len(self.data_source) -------------------------------------------------------------------------------- /sentence_transformers/datasets/ParallelSentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torch 3 | import logging 4 | import gzip 5 | import os 6 | import random 7 | from .. import SentenceTransformer 8 | 9 | 10 | class ParallelSentencesDataset(Dataset): 11 | """ 12 | This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same 13 | sentence in different languages. 
For example, the file can look like this (EN\tDE\tES): 14 | hello world hallo welt hola mundo 15 | second sentence zweiter satz segunda oración 16 | 17 | The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example, 18 | embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be 19 | mapped to this English sentence embedding. 20 | 21 | When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence. 22 | 23 | teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and 24 | returns a list of sentence embeddings 25 | """ 26 | 27 | def __init__(self, student_model: SentenceTransformer, teacher_model): 28 | """ 29 | Parallel sentences dataset reader to train student model given a teacher model 30 | :param student_model: Student sentence embedding model that should be trained 31 | :param teacher_model: Teacher model, that provides the sentence embeddings for the first column in the dataset file 32 | """ 33 | self.student_model = student_model 34 | self.teacher_model = teacher_model 35 | self.datasets = [] 36 | self.dataset_indices = [] 37 | self.copy_dataset_indices = [] 38 | 39 | def load_data(self, filepath: str, weight: int = 100, max_sentences: int = None, max_sentence_length: int = 128): 40 | """ 41 | Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column 42 | 43 | :param filepath: Filepath to the file 44 | :param weight: If more that one dataset is loaded with load_data: With which frequency should data be sampled from this dataset? 45 | :param max_sentences: Max number of lines to be read from filepath 46 | :param max_sentence_length: Skip the example if one of the sentences is has more characters than max_sentence_length 47 | :return: 48 | """ 49 | sentences_map = {} 50 | with gzip.open(filepath, 'rt', encoding='utf8') if filepath.endswith('.gz') else open(filepath, encoding='utf8') as fIn: 51 | count = 0 52 | for line in fIn: 53 | sentences = line.strip().split("\t") 54 | sentence_lengths = [len(sent) for sent in sentences] 55 | if max(sentence_lengths) > max_sentence_length: 56 | continue 57 | 58 | eng_sentence = sentences[0] 59 | if eng_sentence not in sentences_map: 60 | sentences_map[eng_sentence] = set() 61 | 62 | for sent in sentences: 63 | sentences_map[eng_sentence].add(sent) 64 | 65 | count += 1 66 | if max_sentences is not None and count >= max_sentences: 67 | break 68 | 69 | eng_sentences = list(sentences_map.keys()) 70 | logging.info("Create sentence embeddings for " + os.path.basename(filepath)) 71 | labels = torch.tensor(self.teacher_model.encode(eng_sentences, batch_size=32, show_progress_bar=True), 72 | dtype=torch.float) 73 | 74 | data = [] 75 | for idx in range(len(eng_sentences)): 76 | eng_key = eng_sentences[idx] 77 | label = labels[idx] 78 | for sent in sentences_map[eng_key]: 79 | data.append([[self.student_model.tokenize(sent)], label]) 80 | 81 | dataset_id = len(self.datasets) 82 | self.datasets.append(data) 83 | self.dataset_indices.extend([dataset_id] * weight) 84 | 85 | def __len__(self): 86 | return max([len(dataset) for dataset in self.datasets]) 87 | 88 | def __getitem__(self, idx): 89 | if len(self.copy_dataset_indices) == 0: 90 | self.copy_dataset_indices = self.dataset_indices.copy() 91 | random.shuffle(self.copy_dataset_indices) 92 | 93 | dataset_idx 
= self.copy_dataset_indices.pop() 94 | return self.datasets[dataset_idx][idx % len(self.datasets[dataset_idx])] 95 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/SentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | import torch 4 | import logging 5 | from tqdm import tqdm 6 | from .. import SentenceTransformer 7 | from ..readers.InputExample import InputExample 8 | 9 | 10 | class SentencesDataset(Dataset): 11 | """ 12 | Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all 13 | sequences to the max length. 14 | The SentenceBertEncoder.smart_batching_collate is required for this to work. 15 | SmartBatchingDataset does *not* work without it. 16 | """ 17 | def __init__(self, examples: List[InputExample], model: SentenceTransformer, show_progress_bar: bool = None): 18 | """ 19 | Create a new SentencesDataset with the tokenized texts and the labels as Tensor 20 | """ 21 | if show_progress_bar is None: 22 | show_progress_bar = (logging.getLogger().getEffectiveLevel() == logging.INFO or logging.getLogger().getEffectiveLevel() == logging.DEBUG) 23 | self.show_progress_bar = show_progress_bar 24 | 25 | self.convert_input_examples(examples, model) 26 | 27 | def convert_input_examples(self, examples: List[InputExample], model: SentenceTransformer): 28 | """ 29 | Converts input examples to a SmartBatchingDataset usable to train the model with 30 | SentenceTransformer.smart_batching_collate as the collate_fn for the DataLoader 31 | 32 | smart_batching_collate as collate_fn is required because it transforms the tokenized texts to the tensors. 
33 | 34 | :param examples: 35 | the input examples for the training 36 | :param model 37 | the Sentence BERT model for the conversion 38 | :return: a SmartBatchingDataset usable to train the model with SentenceTransformer.smart_batching_collate as the collate_fn 39 | for the DataLoader 40 | """ 41 | num_texts = len(examples[0].texts) 42 | inputs = [[] for _ in range(num_texts)] 43 | labels = [] 44 | too_long = [0] * num_texts 45 | label_type = None 46 | iterator = examples 47 | max_seq_length = model.get_max_seq_length() 48 | 49 | if self.show_progress_bar: 50 | iterator = tqdm(iterator, desc="Convert dataset") 51 | 52 | for ex_index, example in enumerate(iterator): 53 | if label_type is None: 54 | if isinstance(example.label, int): 55 | label_type = torch.long 56 | elif isinstance(example.label, float): 57 | label_type = torch.float 58 | tokenized_texts = [model.tokenize(text) for text in example.texts] 59 | 60 | for i, token in enumerate(tokenized_texts): 61 | if max_seq_length != None and max_seq_length > 0 and len(token) >= max_seq_length: 62 | too_long[i] += 1 63 | 64 | labels.append(example.label) 65 | for i in range(num_texts): 66 | inputs[i].append(tokenized_texts[i]) 67 | 68 | tensor_labels = torch.tensor(labels, dtype=label_type) 69 | 70 | logging.info("Num sentences: %d" % (len(examples))) 71 | for i in range(num_texts): 72 | logging.info("Sentences {} longer than max_seqence_length: {}".format(i, too_long[i])) 73 | 74 | self.tokens = inputs 75 | self.labels = tensor_labels 76 | 77 | def __getitem__(self, item): 78 | return [self.tokens[i][item] for i in range(len(self.tokens))], self.labels[item] 79 | 80 | def __len__(self): 81 | return len(self.tokens[0]) -------------------------------------------------------------------------------- /sentence_transformers/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .ParallelSentencesDataset import ParallelSentencesDataset 2 | from .SentenceLabelDataset import SentenceLabelDataset 3 | from .SentencesDataset import SentencesDataset -------------------------------------------------------------------------------- /sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | 10 | class LabelAccuracyEvaluator(SentenceEvaluator): 11 | """ 12 | Evaluate a model based on its accuracy on a labeled dataset 13 | 14 | This requires a model with LossFunction.SOFTMAX 15 | 16 | The results are written in a CSV. If a CSV already exists, then values are appended. 
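The returned score is the classification accuracy, so higher values indicate a better model.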
17 | """ 18 | 19 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None): 20 | """ 21 | Constructs an evaluator for the given dataset 22 | 23 | :param dataloader: 24 | the data for the evaluation 25 | """ 26 | self.dataloader = dataloader 27 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | self.name = name 29 | self.softmax_model = softmax_model 30 | self.softmax_model.to(self.device) 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 36 | self.csv_headers = ["epoch", "steps", "accuracy"] 37 | 38 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 39 | model.eval() 40 | total = 0 41 | correct = 0 42 | 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logging.info("Evaluation on the "+self.name+" dataset"+out_txt) 52 | self.dataloader.collate_fn = model.smart_batching_collate 53 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 54 | features, label_ids = batch_to_device(batch, self.device) 55 | with torch.no_grad(): 56 | _, prediction = self.softmax_model(features, labels=None) 57 | 58 | total += prediction.size(0) 59 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 60 | accuracy = correct/total 61 | 62 | logging.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 63 | 64 | if output_path is not None: 65 | csv_path = os.path.join(output_path, self.csv_file) 66 | if not os.path.isfile(csv_path): 67 | with open(csv_path, mode="w", encoding="utf-8") as f: 68 | writer = csv.writer(f) 69 | writer.writerow(self.csv_headers) 70 | writer.writerow([epoch, steps, accuracy]) 71 | else: 72 | with open(csv_path, mode="a", encoding="utf-8") as f: 73 | writer = csv.writer(f) 74 | writer.writerow([epoch, steps, accuracy]) 75 | 76 | #return accuracy, self.softmax_model.state_dict() 77 | return accuracy -------------------------------------------------------------------------------- /sentence_transformers/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | from sentence_transformers.util import batch_to_device 3 | import torch 4 | import numpy as np 5 | import logging 6 | import os 7 | import csv 8 | 9 | 10 | class MSEEvaluator(SentenceEvaluator): 11 | """ 12 | Computes the mean squared error (x100) between the computed sentence embedding 13 | and some target sentence embedding 14 | """ 15 | def __init__(self, dataloader, name=''): 16 | self.dataloader = dataloader 17 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | self.name = name 19 | 20 | if name: 21 | name = "_"+name 22 | self.csv_file = "mse_evaluation" + name + "_results.csv" 23 | self.csv_headers = ["epoch", "steps", "MSE"] 24 | 25 | def __call__(self, model, output_path, epoch = -1, steps = -1): 26 | model.eval() 27 | self.dataloader.collate_fn = model.smart_batching_collate 28 | 29 | embeddings = [] 30 | labels = [] 31 | for step, batch in enumerate(self.dataloader): 32 | features, batch_labels = batch_to_device(batch, self.device) 33 | with torch.no_grad(): 34 | emb1 = model(features[0])['sentence_embedding'].to("cpu").numpy() 35 | 36 | labels.extend(batch_labels.to("cpu").numpy()) 37 | embeddings.extend(emb1) 38 
| 39 | embeddings = np.asarray(embeddings) 40 | labels = np.asarray(labels) 41 | 42 | mse = ((embeddings - labels)**2).mean() 43 | 44 | logging.info("MSE evaluation on "+self.name+" dataset") 45 | mse *= 100 46 | 47 | logging.info("embeddings shape:\t"+str(embeddings.shape)) 48 | logging.info("MSE (*100):\t{:4f}".format(mse)) 49 | 50 | if output_path is not None: 51 | csv_path = os.path.join(output_path, self.csv_file) 52 | output_file_exists = os.path.isfile(csv_path) 53 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 54 | writer = csv.writer(f) 55 | if not output_file_exists: 56 | writer.writerow(self.csv_headers) 57 | 58 | writer.writerow([epoch, steps, mse]) 59 | 60 | 61 | return -mse #Return negative score as SentenceTransformers maximizes the performance -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 
8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | 4 | from .BinaryEmbeddingSimilarityEvaluator import BinaryEmbeddingSimilarityEvaluator 5 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .SequentialEvaluator import SequentialEvaluator 8 | from .TripletEvaluator import TripletEvaluator 9 | from .MSEEvaluator import MSEEvaluator -------------------------------------------------------------------------------- /sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | class CosineSimilarityLoss(nn.Module): 7 | def __init__(self, model: SentenceTransformer): 8 | super(CosineSimilarityLoss, self).__init__() 9 | self.model = model 10 | 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | rep_a, rep_b = reps 15 | 16 | output = torch.cosine_similarity(rep_a, rep_b) 17 | loss_fct = nn.MSELoss() 18 | 19 | if labels is not None: 20 | loss = loss_fct(output, labels.view(-1)) 21 | return loss 22 | else: 23 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/losses/MSELoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | 5 | 6 | class MSELoss(nn.Module): 7 | """ 8 | Computes the MSE loss between the computed sentence embedding and a target sentence embedding 9 | """ 10 | def __init__(self, model): 11 | super(MSELoss, self).__init__() 12 | self.model = model 13 | 14 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 15 | rep = self.model(sentence_features[0])['sentence_embedding'] 16 | loss_fct = nn.MSELoss() 17 | loss = loss_fct(rep, labels) 18 | return loss 19 | -------------------------------------------------------------------------------- 
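For orientation, a minimal training sketch showing how a loss module such as CosineSimilarityLoss and one of the evaluators above are wired together (model name, data and hyper-parameters are purely illustrative):

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer('bert-base-nli-mean-tokens')   # any SentenceTransformer works here

# Toy STS-style pairs with gold similarity scores in [0, 1]
train_examples = [
    InputExample(guid='0', texts=['A man is eating food.', 'A man eats something.'], label=0.9),
    InputExample(guid='1', texts=['A man is eating food.', 'A plane is taking off.'], label=0.1),
]
train_data = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=2)
train_loss = losses.CosineSimilarityLoss(model=model)

# Toy setup: evaluate on the same pairs; in practice use a held-out dev set
evaluator = EmbeddingSimilarityEvaluator(DataLoader(train_data, shuffle=False, batch_size=2))

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=1,
          warmup_steps=10,
          output_path='output/sts-sketch')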
/sentence_transformers/losses/MultipleNegativesRankingLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from ..SentenceTransformer import SentenceTransformer 6 | 7 | class MultipleNegativesRankingLoss(nn.Module): 8 | def __init__(self, model: SentenceTransformer): 9 | super(MultipleNegativesRankingLoss, self).__init__() 10 | self.model = model 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | 15 | reps_a, reps_b = reps 16 | return self.multiple_negatives_ranking_loss(reps_a, reps_b) 17 | 18 | # Multiple Negatives Ranking Loss 19 | # Paper: https://arxiv.org/pdf/1705.00652.pdf 20 | # Efficient Natural Language Response Suggestion for Smart Reply 21 | # Section 4.4 22 | def multiple_negatives_ranking_loss(self, embeddings_a: Tensor, embeddings_b: Tensor): 23 | """ 24 | Compute the loss over a batch with two embeddings per example. 25 | 26 | Each pair is a positive example. The negative examples are all other embeddings in embeddings_b with each embedding 27 | in embedding_a. 28 | 29 | See the paper for more information: https://arxiv.org/pdf/1705.00652.pdf 30 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 31 | 32 | :param embeddings_a: 33 | Tensor of shape (batch_size, embedding_dim) 34 | :param embeddings_b: 35 | Tensor of shape (batch_size, embedding_dim) 36 | :return: 37 | The scalar loss 38 | """ 39 | scores = torch.matmul(embeddings_a, embeddings_b.t()) 40 | diagonal_mean = torch.mean(torch.diag(scores)) 41 | mean_log_row_sum_exp = torch.mean(torch.logsumexp(scores, dim=1)) 42 | return -diagonal_mean + mean_log_row_sum_exp 43 | -------------------------------------------------------------------------------- /sentence_transformers/losses/SoftmaxLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | import logging 6 | 7 | class SoftmaxLoss(nn.Module): 8 | def __init__(self, 9 | model: SentenceTransformer, 10 | sentence_embedding_dimension: int, 11 | num_labels: int, 12 | concatenation_sent_rep: bool = True, 13 | concatenation_sent_difference: bool = True, 14 | concatenation_sent_multiplication: bool = False): 15 | super(SoftmaxLoss, self).__init__() 16 | self.model = model 17 | self.num_labels = num_labels 18 | self.concatenation_sent_rep = concatenation_sent_rep 19 | self.concatenation_sent_difference = concatenation_sent_difference 20 | self.concatenation_sent_multiplication = concatenation_sent_multiplication 21 | 22 | num_vectors_concatenated = 0 23 | if concatenation_sent_rep: 24 | num_vectors_concatenated += 2 25 | if concatenation_sent_difference: 26 | num_vectors_concatenated += 1 27 | if concatenation_sent_multiplication: 28 | num_vectors_concatenated += 1 29 | logging.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated)) 30 | self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels) 31 | 32 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 33 | reps = [self.model(sentence_feature)['sentence_embedding'] for 
sentence_feature in sentence_features] 34 | rep_a, rep_b = reps 35 | 36 | vectors_concat = [] 37 | if self.concatenation_sent_rep: 38 | vectors_concat.append(rep_a) 39 | vectors_concat.append(rep_b) 40 | 41 | if self.concatenation_sent_difference: 42 | vectors_concat.append(torch.abs(rep_a - rep_b)) 43 | 44 | if self.concatenation_sent_multiplication: 45 | vectors_concat.append(rep_a * rep_b) 46 | 47 | features = torch.cat(vectors_concat, 1) 48 | 49 | output = self.classifier(features) 50 | loss_fct = nn.CrossEntropyLoss() 51 | 52 | if labels is not None: 53 | loss = loss_fct(output, labels.view(-1)) 54 | return loss 55 | else: 56 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin=1): 18 | super(TripletLoss, self).__init__() 19 | self.model = model 20 | self.distance_metric = distance_metric 21 | self.triplet_margin = triplet_margin 22 | 23 | 24 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 25 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 26 | 27 | rep_anchor, rep_pos, rep_neg = reps 28 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 29 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 30 | 31 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 32 | return losses.mean() -------------------------------------------------------------------------------- /sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .BatchHardTripletLoss import * 4 | from .MultipleNegativesRankingLoss import * 5 | from .TripletLoss import * 6 | from .MSELoss import * -------------------------------------------------------------------------------- /sentence_transformers/models/ADVANCED_CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | import torch.nn.functional as F 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import logging 6 | import gzip 7 | from tqdm import tqdm 8 | import numpy as np 9 | import os 10 | import json 11 | from ..util import import_from_string, fullname, http_get 12 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 13 | 14 | 15 | class CNN(nn.Module): 16 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 17 | 18 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 19 | nn.Module.__init__(self) 20 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 21 | 
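# Overview of this module: `convsModule` holds four branches, one for each of the last four
# entries in 'all_layer_embeddings' (consumed from the top layer downwards in forward()).
# For every kernel size a single Conv1d is created and appended to all four branches, so the
# convolution weights are shared between branches. Each branch applies its convolutions with a
# tanh activation, average-pools the feature dimension (kernel 2, stride 2) and concatenates
# the per-kernel outputs; the four branch outputs are then concatenated and written back to
# the feature dict as the new 'token_embeddings'.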
self.in_word_embedding_dimension = in_word_embedding_dimension 22 | self.out_channels = out_channels 23 | self.kernel_sizes = kernel_sizes 24 | 25 | self.embeddings_dimension = out_channels*len(kernel_sizes) 26 | self.convsModule = nn.ModuleList() 27 | self.pooling = nn.AvgPool1d(2, stride=2) 28 | in_channels = in_word_embedding_dimension 29 | 30 | for _ in range(4): 31 | self.convsModule.append(nn.ModuleList()) 32 | 33 | for kernel_size in kernel_sizes: 34 | padding_size = int((kernel_size - 1) / 2) 35 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 36 | padding=padding_size) 37 | for i in self.convsModule: 38 | i.append(conv) 39 | 40 | def forward(self, features): 41 | token_embeddings = features['all_layer_embeddings'] 42 | vectors =[] 43 | for idx, convs in enumerate(self.convsModule): 44 | temp = [] 45 | token_embedding = token_embeddings[len(token_embeddings)-idx-1].transpose(1, -1) 46 | for conv in convs: 47 | a = F.tanh(conv(token_embedding)) 48 | a = a.transpose(1, -1) 49 | a = self.pooling(a) 50 | a = a.transpose(1, -1) 51 | temp.append(a) 52 | vectors.append(torch.cat(temp, 1)) 53 | 54 | out = torch.cat(vectors, 1).transpose(1, -1) 55 | features.update({'token_embeddings': out}) 56 | return features 57 | 58 | def get_word_embedding_dimension(self) -> int: 59 | return self.embeddings_dimension 60 | 61 | def tokenize(self, text: str) -> List[int]: 62 | raise NotImplementedError() 63 | 64 | def save(self, output_path: str): 65 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 66 | json.dump(self.get_config_dict(), fOut, indent=2) 67 | 68 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 69 | 70 | def get_config_dict(self): 71 | return {key: self.__dict__[key] for key in self.config_keys} 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 76 | config = json.load(fIn) 77 | 78 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 79 | model = CNN(**config) 80 | model.load_state_dict(weights) 81 | return model -------------------------------------------------------------------------------- /sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import AlbertModel, AlbertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class ALBERT(nn.Module): 11 | """ALBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from BERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(ALBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.albert = AlbertModel.from_pretrained(model_name_or_path, **model_args) 29 | self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | output_states = self.albert(**features) 34 | output_tokens = output_states[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 37 | 38 | if self.albert.config.output_hidden_states: 39 | hidden_states = output_states[2] 40 | features.update({'all_layer_embeddings': hidden_states}) 41 | 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.albert.config.hidden_size 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | """ 49 | Tokenizes a text and maps tokens to token-ids 50 | """ 51 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 52 | 53 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 54 | """ 55 | Convert tokenized sentence in its embedding ids, segment ids and mask 56 | 57 | :param tokens: 58 | a tokenized sentence 59 | :param pad_seq_length: 60 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 61 | :return: embedding ids, segment ids and mask for the sentence 62 | """ 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | def save(self, output_path: str): 71 | self.albert.save_pretrained(output_path) 72 | self.tokenizer.save_pretrained(output_path) 73 | 74 | with open(os.path.join(output_path, 'sentence_albert_config.json'), 'w') as fOut: 75 | json.dump(self.get_config_dict(), fOut, indent=2) 76 | 77 | @staticmethod 78 | def load(input_path: str): 79 | with open(os.path.join(input_path, 'sentence_albert_config.json')) as fIn: 80 | config = json.load(fIn) 81 | return ALBERT(model_name_or_path=input_path, **config) 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import BertModel, BertTokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class BERT(nn.Module): 10 | """BERT model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 15 | super(BERT, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 510: 20 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 21 | max_seq_length = 510 22 | self.max_seq_length = max_seq_length 23 | 24 | if self.do_lower_case is not None: 25 | tokenizer_args['do_lower_case'] = do_lower_case 26 | 27 | self.bert = BertModel.from_pretrained(model_name_or_path, **model_args) 28 | self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 29 | 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | output_states = self.bert(**features) 34 | output_tokens = output_states[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 37 | 38 | if len(output_states) > 2: 39 | features.update({'all_layer_embeddings': output_states[2]}) 40 | 41 | return features 42 | 43 | def get_word_embedding_dimension(self) -> int: 44 | return self.bert.config.hidden_size 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | """ 48 | Tokenizes a text and maps tokens to token-ids 49 | """ 50 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 51 | 52 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 53 | """ 54 | Convert tokenized sentence in its embedding ids, segment ids and mask 55 | 56 | :param tokens: 57 | a tokenized sentence 58 | :param pad_seq_length: 59 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 60 | :return: embedding ids, segment ids and mask for the sentence 61 | """ 62 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token 63 | 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | def save(self, output_path: str): 71 | self.bert.save_pretrained(output_path) 72 | self.tokenizer.save_pretrained(output_path) 73 | 74 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 75 | json.dump(self.get_config_dict(), fOut, indent=2) 76 | 77 | @staticmethod 78 | def load(input_path: str): 79 | with open(os.path.join(input_path, 'sentence_bert_config.json')) as fIn: 80 | config = json.load(fIn) 81 | return BERT(model_name_or_path=input_path, **config) 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT_LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | import numpy as np 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | a = [] 31 | for i in token_embeddings: 32 | a.append(len(i)) 33 | features.update({'sentence_lengths': torch.tensor(a, dtype=torch.long)}) 34 | 35 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 36 | # print(sentence_lengths) 37 | #print(features['sentence_lengths']) 38 | 39 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 40 | packed = self.encoder(packed) 41 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 42 | #print(unpack.size()) 43 | features.update({'token_embeddings': unpack}) 44 | return features 45 | 46 | def get_word_embedding_dimension(self) -> int: 47 | return self.embeddings_dimension 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | raise NotImplementedError() 51 | 52 | def save(self, output_path: str): 53 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 54 | json.dump(self.get_config_dict(), fOut, indent=2) 55 | 56 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 57 | 58 | def get_config_dict(self): 59 | return {key: self.__dict__[key] for key in self.config_keys} 60 | 61 | @staticmethod 62 | def load(input_path: str): 63 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 64 | config = json.load(fIn) 65 | 66 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 67 | model = LSTM(**config) 68 | model.load_state_dict(weights) 69 | return model 70 | 71 | -------------------------------------------------------------------------------- /sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | class BoW(nn.Module): 12 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 13 | 14 | A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab. 
15 | """ 16 | 17 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 18 | super(BoW, self).__init__() 19 | vocab = list(set(vocab)) #Ensure vocab is unique 20 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 21 | self.vocab = vocab 22 | self.word_weights = word_weights 23 | self.unknown_word_weight = unknown_word_weight 24 | self.cumulative_term_frequency = cumulative_term_frequency 25 | 26 | #Maps wordIdx -> word weight 27 | self.weights = [] 28 | num_unknown_words = 0 29 | for word in vocab: 30 | weight = unknown_word_weight 31 | if word in word_weights: 32 | weight = word_weights[word] 33 | elif word.lower() in word_weights: 34 | weight = word_weights[word.lower()] 35 | else: 36 | num_unknown_words += 1 37 | self.weights.append(weight) 38 | 39 | logging.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 40 | 41 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 42 | self.sentence_embedding_dimension = len(vocab) 43 | 44 | 45 | def forward(self, features: Dict[str, Tensor]): 46 | #Nothing to do, everything is done in get_sentence_features 47 | return features 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | return self.tokenizer.tokenize(text) 51 | 52 | def get_sentence_embedding_dimension(self): 53 | return self.sentence_embedding_dimension 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 57 | for token in tokens: 58 | if self.cumulative_term_frequency: 59 | vector[token] += self.weights[token] 60 | else: 61 | vector[token] = self.weights[token] 62 | 63 | return {'sentence_embedding': torch.tensor([vector], dtype=torch.float)} 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return BoW(**config) -------------------------------------------------------------------------------- /sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | 
in_channels = in_word_embedding_dimension 28 | for kernel_size in kernel_sizes: 29 | padding_size = int((kernel_size - 1) / 2) 30 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 31 | padding=padding_size) 32 | self.convs.append(conv) 33 | 34 | def forward(self, features): 35 | token_embeddings = features['token_embeddings'] 36 | 37 | token_embeddings = token_embeddings.transpose(1, -1) 38 | vectors = [conv(token_embeddings) for conv in self.convs] 39 | out = torch.cat(vectors, 1).transpose(1, -1) 40 | 41 | features.update({'token_embeddings': out}) 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.embeddings_dimension 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | raise NotImplementedError() 49 | 50 | def save(self, output_path: str): 51 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 52 | json.dump(self.get_config_dict(), fOut, indent=2) 53 | 54 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 55 | 56 | def get_config_dict(self): 57 | return {key: self.__dict__[key] for key in self.config_keys} 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 65 | model = CNN(**config) 66 | model.load_state_dict(weights) 67 | return model 68 | 69 | -------------------------------------------------------------------------------- /sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import CamembertModel, CamembertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | 11 | class CamemBERT(nn.Module): 12 | """CamemBERT model to generate token embeddings. 13 | 14 | Each token is mapped to an output vector from CamemBERT. 15 | """ 16 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 17 | super(CamemBERT, self).__init__() 18 | self.config_keys = ['max_seq_length', 'do_lower_case'] 19 | self.do_lower_case = do_lower_case 20 | 21 | if max_seq_length > 511: 22 | logging.warning("CamemBERT only allows a max_seq_length of 511 (514 with special tokens). 
Value will be set to 511") 23 | max_seq_length = 511 24 | self.max_seq_length = max_seq_length 25 | 26 | if self.do_lower_case is not None: 27 | tokenizer_args['do_lower_case'] = do_lower_case 28 | 29 | self.camembert = CamembertModel.from_pretrained(model_name_or_path, **model_args) 30 | self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | #CamemBERT does not use token_type_ids 35 | output_states = self.camembert(**features) 36 | output_tokens = output_states[0] 37 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 38 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 39 | 40 | if self.camembert.config.output_hidden_states: 41 | hidden_states = output_states[2] 42 | features.update({'all_layer_embeddings': hidden_states}) 43 | 44 | return features 45 | 46 | def get_word_embedding_dimension(self) -> int: 47 | return self.camembert.config.hidden_size 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | """ 51 | Tokenizes a text and maps tokens to token-ids 52 | """ 53 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | """ 57 | Convert tokenized sentence in its embedding ids, segment ids and mask 58 | 59 | :param tokens: 60 | a tokenized sentence 61 | :param pad_seq_length: 62 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 63 | :return: embedding ids, segment ids and mask for the sentence 64 | """ 65 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 66 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 67 | 68 | def get_config_dict(self): 69 | return {key: self.__dict__[key] for key in self.config_keys} 70 | 71 | def save(self, output_path: str): 72 | self.camembert.save_pretrained(output_path) 73 | self.tokenizer.save_pretrained(output_path) 74 | 75 | with open(os.path.join(output_path, 'sentence_camembert_config.json'), 'w') as fOut: 76 | json.dump(self.get_config_dict(), fOut, indent=2) 77 | 78 | @staticmethod 79 | def load(input_path: str): 80 | with open(os.path.join(input_path, 'sentence_camembert_config.json')) as fIn: 81 | config = json.load(fIn) 82 | return CamemBERT(model_name_or_path=input_path, **config) 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 
15 | """ 16 | def __init__(self, in_features, out_features, bias=True, activation_function=nn.Tanh()): 17 | super(Dense, self).__init__() 18 | self.in_features = in_features 19 | self.out_features = out_features 20 | self.bias = bias 21 | self.activation_function = activation_function 22 | self.linear = nn.Linear(in_features, out_features, bias=bias) 23 | 24 | def forward(self, features: Dict[str, Tensor]): 25 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 26 | return features 27 | 28 | def get_sentence_embedding_dimension(self) -> int: 29 | return self.out_features 30 | 31 | def save(self, output_path): 32 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 33 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 34 | 35 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 36 | 37 | @staticmethod 38 | def load(input_path): 39 | with open(os.path.join(input_path, 'config.json')) as fIn: 40 | config = json.load(fIn) 41 | 42 | config['activation_function'] = import_from_string(config['activation_function'])() 43 | model = Dense(**config) 44 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 45 | return model 46 | -------------------------------------------------------------------------------- /sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import DistilBertModel, DistilBertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class DistilBERT(nn.Module): 11 | """DistilBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from DistilBERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(DistilBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.bert = DistilBertModel.from_pretrained(model_name_or_path, **model_args) 29 | self.tokenizer = DistilBertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | # DistilBERT does not use token_type_ids 34 | output_states = self.bert(**features) 35 | output_tokens = output_states[0] 36 | 37 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 38 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 39 | 40 | if len(output_states) > 1: 41 | features.update({'all_layer_embeddings': output_states[1]}) 42 | 43 | return features 44 | 45 | def get_word_embedding_dimension(self) -> int: 46 | return self.bert.config.hidden_size 47 | 48 | def tokenize(self, text: str) -> List[int]: 49 | """ 50 | Tokenizes a text and maps tokens to token-ids 51 | """ 52 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 53 | 54 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 55 | """ 56 | Convert tokenized sentence in its embedding ids, segment ids and mask 57 | 58 | :param tokens: 59 | a tokenized sentence 60 | :param pad_seq_length: 61 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 62 | :return: embedding ids, segment ids and mask for the sentence 63 | """ 64 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens 65 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | def save(self, output_path: str): 71 | self.bert.save_pretrained(output_path) 72 | self.tokenizer.save_pretrained(output_path) 73 | 74 | with open(os.path.join(output_path, 'sentence_distilbert_config.json'), 'w') as fOut: 75 | json.dump(self.get_config_dict(), fOut, indent=2) 76 | 77 | @staticmethod 78 | def load(input_path: str): 79 | with open(os.path.join(input_path, 'sentence_distilbert_config.json')) as fIn: 80 | config = json.load(fIn) 81 | return DistilBERT(model_name_or_path=input_path, **config) 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /sentence_transformers/models/PhoBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import RobertaModel 4 | from .tokenizer.PhoTokenizer import PhoTokenizer 5 | import json 6 | from typing import Union, Tuple, List, Dict, Optional 7 | import os 8 | import logging 9 | 10 | class PhoBERT(nn.Module): 11 | """PhoBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from PhoBERT. 14 | """ 15 | def __init__(self, model_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = False, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(PhoBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 256: 21 | logging.warning("PhoBERT only allows a max_seq_length of 256 (258 with special tokens). 
Value will be set to 256") 22 | max_seq_length = 256 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.phobert = RobertaModel.from_pretrained(model_path, **model_args) 29 | self.tokenizer = PhoTokenizer.load(model_path, **tokenizer_args) 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | output_states = self.phobert(**features) 35 | output_tokens = output_states[0] 36 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 37 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 38 | 39 | if len(output_states) > 2: 40 | features.update({'all_layer_embeddings': output_states[2]}) 41 | 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.phobert.config.hidden_size 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | """ 49 | Tokenizes a text and maps tokens to token-ids 50 | """ 51 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 52 | 53 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 54 | """ 55 | Convert tokenized sentence in its embedding ids, segment ids and mask 56 | 57 | :param tokens: 58 | a tokenized sentence 59 | :param pad_seq_length: 60 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 61 | :return: embedding ids, segment ids and mask for the sentence 62 | """ 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path: str): 70 | self.phobert.save_pretrained(output_path) 71 | self.tokenizer.save(output_path) 72 | 73 | with open(os.path.join(output_path, 'sentence_phobert_config.json'), 'w') as fOut: 74 | json.dump(self.get_config_dict(), fOut, indent=2) 75 | 76 | @staticmethod 77 | def load(input_path: str): 78 | with open(os.path.join(input_path, 'sentence_phobert_config.json')) as fIn: 79 | config = json.load(fIn) 80 | return PhoBERT(model_path=input_path, **config) -------------------------------------------------------------------------------- /sentence_transformers/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 
14 | """ 15 | def __init__(self, 16 | word_embedding_dimension: int, 17 | pooling_mode_cls_token: bool = False, 18 | pooling_mode_max_tokens: bool = False, 19 | pooling_mode_mean_tokens: bool = True, 20 | pooling_mode_mean_sqrt_len_tokens: bool = False, 21 | ): 22 | super(Pooling, self).__init__() 23 | 24 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 25 | 26 | self.word_embedding_dimension = word_embedding_dimension 27 | self.pooling_mode_cls_token = pooling_mode_cls_token 28 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 29 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 30 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 31 | 32 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 33 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 34 | 35 | def forward(self, features: Dict[str, Tensor]): 36 | token_embeddings = features['token_embeddings'] 37 | cls_token = features['cls_token_embeddings'] 38 | attention_mask = features['attention_mask'] 39 | 40 | ## Pooling strategy 41 | output_vectors = [] 42 | if self.pooling_mode_cls_token: 43 | output_vectors.append(cls_token) 44 | if self.pooling_mode_max_tokens: 45 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 46 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 47 | max_over_time = torch.max(token_embeddings, 1)[0] 48 | output_vectors.append(max_over_time) 49 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 50 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 51 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 52 | 53 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 54 | if 'token_weights_sum' in features: 55 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 56 | else: 57 | sum_mask = input_mask_expanded.sum(1) 58 | 59 | sum_mask = torch.clamp(sum_mask, min=1e-9) 60 | 61 | if self.pooling_mode_mean_tokens: 62 | output_vectors.append(sum_embeddings / sum_mask) 63 | if self.pooling_mode_mean_sqrt_len_tokens: 64 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 65 | 66 | output_vector = torch.cat(output_vectors, 1) 67 | features.update({'sentence_embedding': output_vector}) 68 | return features 69 | 70 | def get_sentence_embedding_dimension(self): 71 | return self.pooling_output_dimension 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return Pooling(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import RobertaModel, RobertaTokenizer 4 | import json 5 | from typing 
import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class RoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(RoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 512: 21 | logging.warning("RoBERTa only allows a max_seq_length of 512 (514 with special tokens). Value will be set to 512") 22 | max_seq_length = 512 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.roberta = RobertaModel.from_pretrained(model_name_or_path, **model_args) 29 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | output_states = self.roberta(**features) 35 | output_tokens = output_states[0] 36 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 37 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 38 | 39 | if len(output_states) > 2: 40 | features.update({'all_layer_embeddings': output_states[2]}) 41 | 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.roberta.config.hidden_size 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | """ 49 | Tokenizes a text and maps tokens to token-ids 50 | """ 51 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 52 | 53 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 54 | """ 55 | Convert tokenized sentence in its embedding ids, segment ids and mask 56 | 57 | :param tokens: 58 | a tokenized sentence 59 | :param pad_seq_length: 60 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 61 | :return: embedding ids, segment ids and mask for the sentence 62 | """ 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path: str): 70 | self.roberta.save_pretrained(output_path) 71 | self.tokenizer.save_pretrained(output_path) 72 | 73 | with open(os.path.join(output_path, 'sentence_roberta_config.json'), 'w') as fOut: 74 | json.dump(self.get_config_dict(), fOut, indent=2) 75 | 76 | @staticmethod 77 | def load(input_path: str): 78 | with open(os.path.join(input_path, 'sentence_roberta_config.json')) as fIn: 79 | config = json.load(fIn) 80 | return RoBERTa(model_name_or_path=input_path, **config) 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /sentence_transformers/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class T5(nn.Module): 10 | """T5 model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, task_identifier: str = 'stsb sentence1: ', model_args: Dict = {}, tokenizer_args: Dict = {}): 15 | super(T5, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case', 'task_identifier'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 512: 20 | logging.warning("T5 only allows a max_seq_length of 512. Value will be set to 512") 21 | max_seq_length = 512 22 | self.max_seq_length = max_seq_length 23 | 24 | if self.do_lower_case is not None: 25 | tokenizer_args['do_lower_case'] = do_lower_case 26 | 27 | self.t5model = T5Model.from_pretrained(model_name_or_path, **model_args) 28 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 29 | self.task_identifier = task_identifier 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | output_states = self.t5model.encoder(input_ids=features['input_ids'], attention_mask=features['attention_mask']) 34 | output_tokens = output_states[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens}) 37 | 38 | if len(output_states) > 1: 39 | features.update({'all_layer_embeddings': output_states[1]}) 40 | 41 | return features 42 | 43 | def get_word_embedding_dimension(self) -> int: 44 | return self.t5model.config.hidden_size 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | """ 48 | Tokenizes a text and maps tokens to token-ids 49 | """ 50 | return self.tokenizer.encode(self.task_identifier+text) 51 | 52 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 53 | """ 54 | Convert tokenized sentence in its embedding ids, segment ids and mask 55 | 56 | :param tokens: 57 | a tokenized sentence 58 | :param pad_seq_length: 59 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 60 | :return: embedding ids, segment ids and mask for the sentence 61 | """ 62 | 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path: str): 70 | self.t5model.save_pretrained(output_path) 71 | self.tokenizer.save_pretrained(output_path) 72 | 73 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 74 | json.dump(self.get_config_dict(), fOut, indent=2) 75 | 76 | @staticmethod 77 | def load(input_path: str): 78 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 79 | config = json.load(fIn) 80 | return T5(model_name_or_path=input_path, **config) 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /sentence_transformers/models/Transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoModel, AutoTokenizer, AutoConfig 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class Transformer(nn.Module): 10 | """Huggingface AutoModel to generate token embeddings. 11 | Loads the correct class, e.g. BERT / RoBERTa etc. 12 | """ 13 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, model_args: Dict = {}, cache_dir: Optional[str] = None ): 14 | super(Transformer, self).__init__() 15 | self.config_keys = ['max_seq_length'] 16 | self.max_seq_length = max_seq_length 17 | 18 | config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) 19 | self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir) 20 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir) 21 | 22 | 23 | def forward(self, features): 24 | """Returns token_embeddings, cls_token""" 25 | output_states = self.auto_model(**features) 26 | output_tokens = output_states[0] 27 | 28 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 29 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 30 | 31 | if self.auto_model.config.output_hidden_states: 32 | all_layer_idx = 2 33 | if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states 34 | all_layer_idx = 1 35 | 36 | hidden_states = output_states[all_layer_idx] 37 | features.update({'all_layer_embeddings': hidden_states}) 38 | 39 | return features 40 | 41 | def get_word_embedding_dimension(self) -> int: 42 | return self.auto_model.config.hidden_size 43 | 44 | def tokenize(self, text: str) -> List[int]: 45 | """ 46 | Tokenizes a text and maps tokens to token-ids 47 | """ 48 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 49 | 50 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 51 | """ 52 | Convert tokenized sentence in its embedding ids, segment ids and mask 53 | 54 | :param tokens: 55 | a tokenized sentence 56 | :param pad_seq_length: 57 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 58 | :return: embedding ids, segment ids and mask for the sentence 59 | """ 60 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 61 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 62 | 63 | def get_config_dict(self): 64 | return {key: self.__dict__[key] for key in self.config_keys} 65 | 66 | def save(self, output_path: str): 67 | self.auto_model.save_pretrained(output_path) 68 | self.tokenizer.save_pretrained(output_path) 69 | 70 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 71 | json.dump(self.get_config_dict(), fOut, indent=2) 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'sentence_bert_config.json')) as fIn: 76 | config = json.load(fIn) 77 | return Transformer(model_name_or_path=input_path, **config) 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /sentence_transformers/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import numpy as np 8 | import torch.nn.functional as F 9 | 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | from sklearn.preprocessing import normalize 12 | 13 | class WeightedLayerPooling(nn.Module): 14 | """ 15 | Token embeddings are weighted mean of their different hidden layer representations 16 | """ 17 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 18 | super(WeightedLayerPooling, self).__init__() 19 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 20 | self.word_embedding_dimension = word_embedding_dimension 21 | self.layer_start = layer_start 22 | self.num_hidden_layers = num_hidden_layers 23 | self.layer_weights = layer_weights if layer_weights is not None else nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 24 | 25 | def forward(self, features: Dict[str, Tensor]): 26 | ft_all_layers = features['all_layer_embeddings'] 27 | 28 | all_layer_embedding = torch.stack(ft_all_layers) 29 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Start from 4th layers output 30 | 31 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 32 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 33 | 34 | features.update({'token_embeddings': weighted_average}) 35 | return features 36 | 37 | def get_word_embedding_dimension(self): 38 | return self.word_embedding_dimension 39 | 40 | def get_config_dict(self): 41 | return {key: self.__dict__[key] for key in self.config_keys} 42 | 43 | def save(self, output_path): 44 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 45 | json.dump(self.get_config_dict(), fOut, indent=2) 46 | 47 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 48 | 49 | 50 | @staticmethod 51 | def load(input_path): 52 | with open(os.path.join(input_path, 'config.json')) as fIn: 53 | config = json.load(fIn) 54 | 55 | model = WeightedLayerPooling(**config) 56 | 
model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 57 | return model 58 | -------------------------------------------------------------------------------- /sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | class WordWeights(nn.Module): 10 | """This model can weight word embeddings, for example, with idf-values.""" 11 | 12 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 13 | """ 14 | 15 | :param vocab: 16 | Vocabulary of the tokenizer 17 | :param word_weights: 18 | Mapping of tokens to a float weight value. Words embeddings are multiplied by this float value. Tokens in word_weights must not be equal to the vocab (can contain more or less values) 19 | :param unknown_word_weight: 20 | Weight for words in vocab, that do not appear in the word_weights lookup. These can be for example rare words in the vocab, where no weight exists. 21 | """ 22 | super(WordWeights, self).__init__() 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | 28 | weights = [] 29 | num_unknown_words = 0 30 | for word in vocab: 31 | weight = unknown_word_weight 32 | if word in word_weights: 33 | weight = word_weights[word] 34 | elif word.lower() in word_weights: 35 | weight = word_weights[word.lower()] 36 | else: 37 | num_unknown_words += 1 38 | weights.append(weight) 39 | 40 | logging.info("{} of {} words without a weighting value. 
Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 41 | 42 | self.emb_layer = nn.Embedding(len(vocab), 1) 43 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 44 | 45 | 46 | def forward(self, features: Dict[str, Tensor]): 47 | attention_mask = features['attention_mask'] 48 | token_embeddings = features['token_embeddings'] 49 | 50 | #Compute a weight value for each token 51 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 52 | token_weights = token_weights_raw * attention_mask.float() 53 | token_weights_sum = torch.sum(token_weights, 1) 54 | 55 | #Multiply embedding by token weight value 56 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 57 | token_embeddings = token_embeddings * token_weights_expanded 58 | 59 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 60 | return features 61 | 62 | def get_config_dict(self): 63 | return {key: self.__dict__[key] for key in self.config_keys} 64 | 65 | def save(self, output_path): 66 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | @staticmethod 70 | def load(input_path): 71 | with open(os.path.join(input_path, 'config.json')) as fIn: 72 | config = json.load(fIn) 73 | 74 | return WordWeights(**config) -------------------------------------------------------------------------------- /sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLMRobertaModel, XLMRobertaTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class XLMRoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 
14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(XLMRoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if self.do_lower_case is not None: 21 | tokenizer_args['do_lower_case'] = do_lower_case 22 | 23 | self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path, **model_args) 24 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 25 | 26 | if max_seq_length > self.tokenizer.max_len_single_sentence: 27 | logging.warning("XLM-RoBERTa only allows a max_seq_length of "+self.tokenizer.max_len_single_sentence) 28 | max_seq_length = self.tokenizer.max_len_single_sentence 29 | self.max_seq_length = max_seq_length 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | #RoBERTa does not use token_type_ids 35 | output_states = self.xlm_roberta(**features) 36 | output_tokens = output_states[0] 37 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 38 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 39 | 40 | if self.xlm_roberta.config.output_hidden_states: 41 | hidden_states = output_states[2] 42 | features.update({'all_layer_embeddings': hidden_states}) 43 | 44 | return features 45 | 46 | def get_word_embedding_dimension(self) -> int: 47 | return self.xlm_roberta.config.hidden_size 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | """ 51 | Tokenizes a text and maps tokens to token-ids 52 | """ 53 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | """ 57 | Convert tokenized sentence in its embedding ids, segment ids and mask 58 | 59 | :param tokens: 60 | a tokenized sentence 61 | :param pad_seq_length: 62 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 63 | :return: embedding ids, segment ids and mask for the sentence 64 | """ 65 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens 66 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 67 | 68 | def get_config_dict(self): 69 | return {key: self.__dict__[key] for key in self.config_keys} 70 | 71 | def save(self, output_path: str): 72 | self.xlm_roberta.save_pretrained(output_path) 73 | self.tokenizer.save_pretrained(output_path) 74 | 75 | with open(os.path.join(output_path, 'sentence_xlm-roberta_config.json'), 'w') as fOut: 76 | json.dump(self.get_config_dict(), fOut, indent=2) 77 | 78 | @staticmethod 79 | def load(input_path: str): 80 | with open(os.path.join(input_path, 'sentence_xlm-roberta_config.json')) as fIn: 81 | config = json.load(fIn) 82 | return XLMRoBERTa(model_name_or_path=input_path, **config) 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLNetModel, XLNetTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | 9 | class XLNet(nn.Module): 10 | """XLNet model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from XLNet. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 15 | super(XLNet, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.max_seq_length = max_seq_length 18 | self.do_lower_case = do_lower_case 19 | 20 | if self.do_lower_case is not None: 21 | tokenizer_args['do_lower_case'] = do_lower_case 22 | 23 | self.xlnet = XLNetModel.from_pretrained(model_name_or_path, **model_args) 24 | self.tokenizer = XLNetTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 25 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 26 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 27 | 28 | def forward(self, features): 29 | """Returns token_embeddings, cls_token""" 30 | output_states = self.xlnet(**features) 31 | output_tokens = output_states[0] 32 | cls_tokens = output_tokens[:, -1, :] # CLS token is the last token 33 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 34 | 35 | if self.xlnet.config.output_hidden_states: 36 | hidden_states = output_states[2] 37 | features.update({'all_layer_embeddings': hidden_states}) 38 | 39 | return features 40 | 41 | def get_word_embedding_dimension(self) -> int: 42 | return self.xlnet.config.d_model 43 | 44 | def tokenize(self, text: str) -> List[int]: 45 | """ 46 | Tokenizes a text and maps tokens to token-ids 47 | """ 48 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 49 | 50 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int) -> Dict[str, Tensor]: 51 | """ 52 | Convert tokenized sentence in its embedding ids, segment ids and mask 53 | 54 | :param tokens: 55 | a tokenized sentence 56 | :param pad_seq_length: 57 | the 
maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 58 | :return: embedding ids, segment ids and mask for the sentence 59 | """ 60 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 61 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 62 | 63 | def get_config_dict(self): 64 | return {key: self.__dict__[key] for key in self.config_keys} 65 | 66 | def save(self, output_path: str): 67 | self.xlnet.save_pretrained(output_path) 68 | self.tokenizer.save_pretrained(output_path) 69 | 70 | with open(os.path.join(output_path, 'sentence_xlnet_config.json'), 'w') as fOut: 71 | json.dump(self.get_config_dict(), fOut, indent=2) 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'sentence_xlnet_config.json')) as fIn: 76 | config = json.load(fIn) 77 | return XLNet(model_name_or_path=input_path, **config) 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ALBERT import ALBERT 2 | from .Transformer import Transformer 3 | from .BERT import BERT 4 | from .BoW import BoW 5 | from .CNN import CNN 6 | from .CamemBERT import CamemBERT 7 | from .Dense import Dense 8 | from .DistilBERT import DistilBERT 9 | from .LSTM import LSTM 10 | from .Pooling import Pooling 11 | from .RoBERTa import RoBERTa 12 | from .T5 import T5 13 | from .WKPooling import WKPooling 14 | from .WeightedLayerPooling import WeightedLayerPooling 15 | from .WordEmbeddings import WordEmbeddings 16 | from .WordWeights import WordWeights 17 | from .XLMRoBERTa import XLMRoBERTa 18 | from .XLNet import XLNet 19 | from .PhoBERT import PhoBERT -------------------------------------------------------------------------------- /sentence_transformers/models/proposed_CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | import torch.nn.functional as F 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import logging 6 | import gzip 7 | from tqdm import tqdm 8 | import numpy as np 9 | import os 10 | import json 11 | from ..util import import_from_string, fullname, http_get 12 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 13 | 14 | 15 | class CNN(nn.Module): 16 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 17 | 18 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 19 | nn.Module.__init__(self) 20 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 21 | self.in_word_embedding_dimension = in_word_embedding_dimension 22 | self.out_channels = out_channels 23 | self.kernel_sizes = kernel_sizes 24 | 25 | self.embeddings_dimension = out_channels*len(kernel_sizes) 26 | self.convs = nn.ModuleList() 27 | self.pooling = nn.AvgPool1d(2, stride=2) 28 | in_channels = in_word_embedding_dimension 29 | for kernel_size in kernel_sizes: 30 | padding_size = int((kernel_size - 1) / 2) 31 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 32 | padding=padding_size) 33 | 34 | self.convs.append(conv) 35 | 36 | def forward(self, features): 37 | token_embeddings = features['token_embeddings'] 38 | 39 | 
token_embeddings = token_embeddings.transpose(1, -1) 40 | vectors = [] 41 | for conv in self.convs: 42 | a = F.tanh(conv(token_embeddings)) 43 | a = a.transpose(1, -1) 44 | a = self.pooling(a) 45 | a = a.transpose(1, -1) 46 | vectors.append(a) 47 | #vectors = [self.pooling(conv(token_embeddings)) for conv in self.convs] 48 | # for i in vectors: 49 | # print(np.shape(i)) 50 | out = torch.cat(vectors, 1).transpose(1, -1) 51 | 52 | features.update({'token_embeddings': out}) 53 | return features 54 | 55 | def get_word_embedding_dimension(self) -> int: 56 | return self.embeddings_dimension 57 | 58 | def tokenize(self, text: str) -> List[int]: 59 | raise NotImplementedError() 60 | 61 | def save(self, output_path: str): 62 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 63 | json.dump(self.get_config_dict(), fOut, indent=2) 64 | 65 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | @staticmethod 71 | def load(input_path: str): 72 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 73 | config = json.load(fIn) 74 | 75 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 76 | model = CNN(**config) 77 | model.load_state_dict(weights) 78 | return model 79 | 80 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | class PhraseTokenizer(WordTokenizer): 11 | """Tokenizes the text with respect to existent phrases in the vocab. 12 | 13 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 14 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the]) 15 | """ 16 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 17 | self.stop_words = set(stop_words) 18 | self.do_lower_case = do_lower_case 19 | self.ngram_separator = ngram_separator 20 | self.max_ngram_length = max_ngram_length 21 | self.set_vocab(vocab) 22 | 23 | def get_vocab(self): 24 | return self.vocab 25 | 26 | def set_vocab(self, vocab: Iterable[str]): 27 | self.vocab = vocab 28 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 29 | 30 | # Check for ngram in vocab 31 | self.ngram_lookup = set() 32 | self.ngram_lengths = set() 33 | for word in vocab: 34 | 35 | if self.ngram_separator is not None and self.ngram_separator in word: 36 | # Sum words might me malformed in e.g. 
google news word2vec, containing two or more _ after each other 37 | ngram_count = word.count(self.ngram_separator) + 1 38 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 39 | self.ngram_lookup.add(word) 40 | self.ngram_lengths.add(ngram_count) 41 | 42 | if len(vocab) > 0: 43 | logging.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 44 | logging.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | tokens = nltk.word_tokenize(text, preserve_line=True) 48 | 49 | #phrase detection 50 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 51 | idx = 0 52 | while idx <= len(tokens) - ngram_len: 53 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 54 | if ngram in self.ngram_lookup: 55 | tokens[idx:idx + ngram_len] = [ngram] 56 | elif ngram.lower() in self.ngram_lookup: 57 | tokens[idx:idx + ngram_len] = [ngram.lower()] 58 | idx += 1 59 | 60 | #Map tokens to idx, filter stop words 61 | tokens_filtered = [] 62 | for token in tokens: 63 | if token in self.stop_words: 64 | continue 65 | elif token in self.word2idx: 66 | tokens_filtered.append(self.word2idx[token]) 67 | continue 68 | 69 | token = token.lower() 70 | if token in self.stop_words: 71 | continue 72 | elif token in self.word2idx: 73 | tokens_filtered.append(self.word2idx[token]) 74 | continue 75 | 76 | token = token.strip(string.punctuation) 77 | if token in self.stop_words: 78 | continue 79 | elif len(token) > 0 and token in self.word2idx: 80 | tokens_filtered.append(self.word2idx[token]) 81 | continue 82 | 83 | return tokens_filtered 84 | 85 | def save(self, output_path: str): 86 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 87 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 88 | 89 | @staticmethod 90 | def load(input_path: str): 91 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 92 | config = json.load(fIn) 93 | 94 | return PhraseTokenizer(**config) 95 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/VietnameseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | from vncorenlp import VnCoreNLP 8 | import operator 9 | from functools import reduce 10 | class VietnameseTokenizer(WordTokenizer): 11 | """ 12 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 13 | Punctuation are stripped from tokens. 
14 | """ 15 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, vncorenlp_path = None): 16 | self.stop_words = set(stop_words) 17 | self.do_lower_case = do_lower_case 18 | self.set_vocab(vocab) 19 | self.vncorenlp_path = vncorenlp_path 20 | self.rdrsegmenter = VnCoreNLP(vncorenlp_path, annotators="wseg", max_heap_size='-Xmx1g') 21 | 22 | def get_vocab(self): 23 | return self.vocab 24 | 25 | def set_vocab(self, vocab: Iterable[str]): 26 | self.vocab = vocab 27 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 28 | 29 | def segment(self, text: str) -> str: 30 | ''' Segment words in text and then flat the list ''' 31 | segmented_word = self.rdrsegmenter.tokenize(text) 32 | return ' '.join(reduce(operator.concat, segmented_word)) 33 | 34 | def tokenize(self, text: str) -> List[int]: 35 | #segment words in text 36 | text = self.segment(text) 37 | 38 | if self.do_lower_case: 39 | text = text.lower() 40 | 41 | tokens = text.split() 42 | 43 | tokens_filtered = [] 44 | for token in tokens: 45 | if token in self.stop_words: 46 | continue 47 | elif token in self.word2idx: 48 | tokens_filtered.append(self.word2idx[token]) 49 | continue 50 | 51 | token = token.strip(string.punctuation) 52 | if token in self.stop_words: 53 | continue 54 | elif len(token) > 0 and token in self.word2idx: 55 | tokens_filtered.append(self.word2idx[token]) 56 | continue 57 | 58 | token = token.lower() 59 | if token in self.stop_words: 60 | continue 61 | elif token in self.word2idx: 62 | tokens_filtered.append(self.word2idx[token]) 63 | continue 64 | tokens_filtered.append(0) 65 | return tokens_filtered 66 | 67 | def save(self, output_path: str): 68 | with open(os.path.join(output_path, 'VietnameseTokenizer_config.json'), 'w') as fOut: 69 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'vncorenlp_path': self.vncorenlp_path}, fOut) 70 | 71 | @staticmethod 72 | def load(input_path: str): 73 | with open(os.path.join(input_path, 'VietnameseTokenizer_config.json'), 'r') as fIn: 74 | config = json.load(fIn) 75 | 76 | return VietnameseTokenizer(**config) 77 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 
12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | # def tokenize(self, text: str) -> List[int]: 26 | # if self.do_lower_case: 27 | # text = text.lower() 28 | 29 | # tokens = text.split() 30 | 31 | # tokens_filtered = [] 32 | # for token in tokens: 33 | # if token in self.stop_words: 34 | # continue 35 | # elif token in self.word2idx: 36 | # tokens_filtered.append(self.word2idx[token]) 37 | # continue 38 | 39 | # token = token.strip(string.punctuation) 40 | # if token in self.stop_words: 41 | # continue 42 | # elif len(token) > 0 and token in self.word2idx: 43 | # tokens_filtered.append(self.word2idx[token]) 44 | # continue 45 | 46 | # token = token.lower() 47 | # if token in self.stop_words: 48 | # continue 49 | # elif token in self.word2idx: 50 | # tokens_filtered.append(self.word2idx[token]) 51 | # continue 52 | 53 | # return tokens_filtered 54 | def tokenize(self, text: str) -> List[int]: 55 | if self.do_lower_case: 56 | text = text.lower() 57 | for stopword in self.stop_words: 58 | if stopword in text: 59 | text = text.replace(stopword,"") 60 | text = text.strip() 61 | tokens = text.split() 62 | 63 | tokens_filtered = [] 64 | for token in tokens: 65 | if token in self.word2idx: 66 | tokens_filtered.append(self.word2idx[token]) 67 | continue 68 | #string.punctuation == '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' 69 | token = token.strip(string.punctuation) 70 | if token in self.stop_words: 71 | continue 72 | elif len(token) > 0 and token in self.word2idx: 73 | tokens_filtered.append(self.word2idx[token]) 74 | continue 75 | 76 | token = token.lower() 77 | if token in self.stop_words: 78 | continue 79 | elif token in self.word2idx: 80 | tokens_filtered.append(self.word2idx[token]) 81 | continue 82 | tokens_filtered.append(0) 83 | return tokens_filtered 84 | def save(self, output_path: str): 85 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 86 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 87 | 88 | @staticmethod 89 | def load(input_path: str): 90 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 91 | config = json.load(fIn) 92 | 93 | return WhitespaceTokenizer(**config) 94 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer 4 | from .PhoTokenizer import PhoTokenizer -------------------------------------------------------------------------------- /sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str, texts: List[str], 
label: Union[int, float]): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | str.strip() is called on both texts. 13 | 14 | :param guid 15 | id for the example 16 | :param texts 17 | the texts for the example 18 | :param label 19 | the label for the example 20 | """ 21 | self.guid = guid 22 | self.texts = [text.strip() for text in texts] 23 | self.label = label 24 | -------------------------------------------------------------------------------- /sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | 16 | def get_examples(self, filename, max_examples=0): 17 | examples = [] 18 | 19 | id = 0 20 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 21 | splits = line.strip().split('\t') 22 | label = splits[self.label_col_idx] 23 | sentence = splits[self.sentence_col_idx] 24 | 25 | if label not in self.label_map: 26 | self.label_map[label] = len(self.label_map) 27 | 28 | label_id = self.label_map[label] 29 | guid = "%s-%d" % (filename, id) 30 | id += 1 31 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 32 | 33 | if 0 < max_examples <= id: 34 | break 35 | 36 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' 
+ filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /sentence_transformers/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | import gzip 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in the a Pair Dataset, split in two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """ 17 | """ 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break; 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specified which data split to use (train.csv, dev.csv, test.csv). 
27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4. 49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter="\t", 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2, s3], label=1)) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /sentence_transformers/util.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | from torch import Tensor, device 3 | from typing import Tuple, List 4 | from tqdm import tqdm 5 | import sys 6 | import importlib 7 | 8 | 9 | def batch_to_device(batch, target_device: device): 10 | """ 11 | send a batch to a device 12 | 13 | :param batch: 14 | :param target_device: 15 | :return: the batch sent to the device 16 | """ 17 | features = batch['features'] 18 | for paired_sentence_idx in range(len(features)): 19 | for feature_name in features[paired_sentence_idx]: 20 | features[paired_sentence_idx][feature_name] = features[paired_sentence_idx][feature_name].to(target_device) 21 | 22 | labels = batch['labels'].to(target_device) 23 | return features, labels 24 | 25 | 26 | 27 | def http_get(url, path): 28 | with open(path, "wb") as file_binary: 29 | req = requests.get(url, stream=True) 30 | if req.status_code != 200: 31 | print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr) 32 | req.raise_for_status() 33 | 34 | content_length = req.headers.get('Content-Length') 35 | total = int(content_length) if content_length is not None else None 36 | progress = tqdm(unit="B", total=total, unit_scale=True) 37 | for chunk in req.iter_content(chunk_size=1024): 38 | if chunk: # filter out keep-alive new chunks 39 | progress.update(len(chunk)) 40 | file_binary.write(chunk) 41 | progress.close() 42 | 43 | 44 | def fullname(o): 45 | # o.__module__ + "." + o.__class__.__qualname__ is an example in 46 | # this context of H.L. Mencken's "neat, plausible, and wrong." 47 | # Python makes no guarantees as to whether the __module__ special 48 | # attribute is defined, so we take a more circumspect approach. 49 | # Alas, the module name is explicitly excluded from __qualname__ 50 | # in Python 3. 51 | 52 | module = o.__class__.__module__ 53 | if module is None or module == str.__class__.__module__: 54 | return o.__class__.__name__ # Avoid reporting __builtin__ 55 | else: 56 | return module + '.' + o.__class__.__name__ 57 | 58 | def import_from_string(dotted_path): 59 | """ 60 | Import a dotted module path and return the attribute/class designated by the 61 | last name in the path. Raise ImportError if the import failed. 
62 | """ 63 | try: 64 | module_path, class_name = dotted_path.rsplit('.', 1) 65 | except ValueError: 66 | msg = "%s doesn't look like a module path" % dotted_path 67 | raise ImportError(msg) 68 | 69 | module = importlib.import_module(module_path) 70 | 71 | try: 72 | return getattr(module, class_name) 73 | except AttributeError: 74 | msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name) 75 | raise ImportError(msg) -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | 7 | 8 | setup( 9 | name="sentence-transformers", 10 | version="0.2.6.1", 11 | author="Nils Reimers, Gregor Geigle", 12 | author_email="Rnils@web.de", 13 | description="Sentence Embeddings using BERT / RoBERTa / XLNet", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | license="Apache License 2.0", 17 | url="https://github.com/UKPLab/sentence-transformers", 18 | download_url="https://github.com/UKPLab/sentence-transformers/archive/v0.2.6.zip", 19 | packages=find_packages(), 20 | install_requires=[ 21 | 'transformers>=2.8.0', 22 | 'tqdm', 23 | 'torch>=1.0.1', 24 | 'numpy', 25 | 'scikit-learn', 26 | 'scipy', 27 | 'nltk' 28 | ], 29 | classifiers=[ 30 | "Development Status :: 4 - Beta", 31 | "Intended Audience :: Science/Research", 32 | "License :: OSI Approved :: Apache Software License", 33 | "Programming Language :: Python :: 3.6", 34 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 35 | ], 36 | keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning" 37 | ) 38 | -------------------------------------------------------------------------------- /tests/test_pretrained_stsb.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests that the pretrained models produce the correct scores on the STSbenchmark dataset 3 | """ 4 | from torch.utils.data import DataLoader 5 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 6 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 7 | from sentence_transformers.readers import STSDataReader 8 | import unittest 9 | 10 | 11 | class PretrainedSTSbTest(unittest.TestCase): 12 | def pretrained_model_score(self, model_name, expected_score): 13 | model = SentenceTransformer(model_name) 14 | sts_reader = STSDataReader('../examples/datasets/stsbenchmark') 15 | 16 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 17 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 18 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 19 | 20 | score = model.evaluate(evaluator)*100 21 | print(model_name, "{:.2f} vs. 
exp: {:.2f}".format(score, expected_score)) 22 | assert abs(score-expected_score) < 0.1 23 | 24 | def test_bert_base(self): 25 | self.pretrained_model_score('bert-base-nli-mean-tokens', 77.12) 26 | self.pretrained_model_score('bert-base-nli-max-tokens', 77.21) 27 | self.pretrained_model_score('bert-base-nli-cls-token', 76.30) 28 | self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 85.14) 29 | 30 | 31 | def test_bert_large(self): 32 | self.pretrained_model_score('bert-large-nli-mean-tokens', 79.19) 33 | self.pretrained_model_score('bert-large-nli-max-tokens', 78.41) 34 | self.pretrained_model_score('bert-large-nli-cls-token', 78.29) 35 | self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 85.29) 36 | 37 | def test_roberta(self): 38 | self.pretrained_model_score('roberta-base-nli-mean-tokens', 77.49) 39 | self.pretrained_model_score('roberta-large-nli-mean-tokens', 78.69) 40 | self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 85.44) 41 | self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 86.39) 42 | 43 | def test_distilbert(self): 44 | self.pretrained_model_score('distilbert-base-nli-mean-tokens', 76.97) 45 | self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 84.38) 46 | 47 | def test_multiling(self): 48 | self.pretrained_model_score('distiluse-base-multilingual-cased', 80.62) 49 | 50 | if "__main__" == __name__: 51 | unittest.main() -------------------------------------------------------------------------------- /tests/test_wkpooling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the WKPooling model 3 | """ 4 | import unittest 5 | from sentence_transformers import models, SentenceTransformer 6 | import scipy 7 | 8 | class WKPoolingTest(unittest.TestCase): 9 | sentence_pairs = [ 10 | ('Can you please. Send me the attachment.', 'I dont know. 
Where is it?'), 11 | ('My name is Paul', 'My name is Lisa'), 12 | ('The cat sits on the mat while the dog is barking', 'London is the capital of England'), 13 | ('BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS)', 'However, it requires that both sentences are fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations (~65 hours) with BERT.'), 14 | ('In deep learning, each level learns to transform its input data into a slightly more abstract and composite representation.', 'London is considered to be one of the world\'s most important global cities.') 15 | ] 16 | 17 | def test_bert_wkpooling(self): 18 | word_embedding_model = models.BERT('bert-base-uncased', model_args={'output_hidden_states': True}) 19 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 20 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 21 | scores = [0.6906377742193329, 22 | 0.9910573945907297, 23 | 0.8395676755959804, 24 | 0.7569234597143, 25 | 0.8324509121875274] 26 | 27 | for sentences, score in zip(WKPoolingTest.sentence_pairs, scores): 28 | embedding = model.encode(sentences, convert_to_numpy=True) 29 | 30 | similarity = 1-scipy.spatial.distance.cosine(embedding[0], embedding[1]) 31 | assert abs(similarity-score) < 0.01 32 | 33 | def test_roberta_wkpooling(self): 34 | word_embedding_model = models.Auto('roberta-base', model_args={'output_hidden_states': True}) 35 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | scores = [0.9594874382019043, 38 | 0.9928674697875977, 39 | 0.9241214990615845, 40 | 0.9309519529342651, 41 | 0.9506515264511108] 42 | 43 | for sentences, score in zip(WKPoolingTest.sentence_pairs, scores): 44 | embedding = model.encode(sentences, convert_to_numpy=True) 45 | 46 | similarity = 1-scipy.spatial.distance.cosine(embedding[0], embedding[1]) 47 | assert abs(similarity-score) < 0.01 48 | 49 | 50 | if "__main__" == __name__: 51 | unittest.main() --------------------------------------------------------------------------------
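
For orientation, the modules listed in this repository (token-embedding models, pooling layers, readers, losses and evaluators) are meant to be composed into a single SentenceTransformer and fine-tuned end-to-end. Below is a minimal, untested sketch of such a pipeline. It assumes the v0.2.x-style API that the files above define (models.Transformer, models.Pooling, SentencesDataset, NLIDataReader, STSBenchmarkDataReader, losses.SoftmaxLoss, EmbeddingSimilarityEvaluator, SentenceTransformer.fit); the checkpoint id, data folders, file names and hyperparameters are illustrative placeholders, not values prescribed by this repository.

# Minimal training sketch (assumptions: checkpoint id, data paths and hyperparameters are placeholders).
from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, SentencesDataset, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import NLIDataReader, STSBenchmarkDataReader

# Token-embedding module: the Huggingface AutoModel wrapper from models/Transformer.py.
# The checkpoint name is only an example; any Huggingface model id could be used here.
word_embedding_model = models.Transformer('bert-base-multilingual-cased', max_seq_length=128)

# Mean pooling turns the per-token embeddings into one fixed-size sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# NLIDataReader expects s1.<split>.gz, s2.<split>.gz and labels.<split>.gz in the given folder.
nli_reader = NLIDataReader('DataNLI')
train_data = SentencesDataset(examples=nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

# Softmax classification over the NLI labels (contradiction / entailment / neutral).
train_loss = losses.SoftmaxLoss(model=model,
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                num_labels=nli_reader.get_num_labels())

# Dev-time evaluation: correlation of embedding cosine similarity with STS gold scores.
# The STS folder and file name are placeholders; the reader's column defaults may need adjusting.
sts_reader = STSBenchmarkDataReader('examples/datasets/stsbenchmark')
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=16)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=1,
          evaluation_steps=1000,
          warmup_steps=1000,
          output_path='output/training_nli_sketch')

# After training, sentences can be embedded directly and compared, e.g. with cosine similarity.
embeddings = model.encode(['The first sentence.', 'The second sentence.'])

The choice of mean pooling mirrors the SBERT setup used by the other modules above; swapping in WKPooling or WeightedLayerPooling only requires loading the transformer with output_hidden_states enabled, as the test files in this repository do.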