├── .gitignore ├── DataNLI ├── labels.dev.gz ├── labels.test.gz ├── labels.train.gz ├── s1.dev.gz ├── s1.test.gz ├── s1.train.gz ├── s2.dev.gz ├── s2.test.gz └── s2.train.gz ├── LICENSE ├── NOTICE.txt ├── README.md ├── docs └── pretrained-models │ ├── multilingual-distillation.png │ ├── multilingual-models.md │ ├── nli-models.md │ ├── sts-models.md │ └── wikipedia-sections-models.md ├── examples ├── README.md ├── applications │ ├── basic_embedding.py │ ├── clustering.py │ ├── clustering_wikipedia_sections.py │ └── semantic_search.py ├── datasets │ ├── README.md │ ├── clean_sts.py │ ├── get_data.py │ ├── stsbenchmark │ │ ├── LICENSE.txt │ │ ├── correlation.pl │ │ ├── readme.txt │ │ ├── sts-dev_vi.csv │ │ ├── sts-test_vi.csv │ │ ├── sts-train-dev_vi.csv │ │ └── sts-train_vi.csv │ └── translate_sts.py ├── evaluate_STSb_datasets │ ├── sbert_embbeding │ │ ├── training.py │ │ ├── training_CNN.py │ │ ├── training_LSTM.py │ │ └── training_NLI.py │ └── word_embbeding │ │ ├── training_biltsm.py │ │ ├── training_cnn.py │ │ ├── training_w2v_no_word_segmentation.py │ │ └── training_w2v_word_segmentation.py ├── evaluation │ ├── evaluation_inference_speed.py │ ├── evaluation_stsbenchmark.py │ └── evaluation_stsbenchmark_sbert-wk.py ├── training_basic_models │ ├── training_stsbenchmark_avg_word_embeddings.py │ ├── training_stsbenchmark_bilstm.py │ ├── training_stsbenchmark_bow.py │ ├── training_stsbenchmark_cnn.py │ └── training_stsbenchmark_tf-idf_word_embeddings.py ├── training_multilingual │ └── training_sbert-en-de.py └── training_transformers │ ├── training_multi-task.py │ ├── training_nli.py │ ├── training_nli_phobert.py │ ├── training_stsbenchmark.py │ ├── training_stsbenchmark_continue_training.py │ └── training_wikipedia_sections.py ├── requirements.txt ├── sentence_transformers ├── LoggingHandler.py ├── SentenceTransformer.py ├── __init__.py ├── data_samplers.py ├── datasets │ ├── ParallelSentencesDataset.py │ ├── SentenceLabelDataset.py │ ├── SentencesDataset.py │ └── __init__.py ├── evaluation │ ├── BinaryEmbeddingSimilarityEvaluator.py │ ├── EmbeddingSimilarityEvaluator.py │ ├── LabelAccuracyEvaluator.py │ ├── MSEEvaluator.py │ ├── SentenceEvaluator.py │ ├── SequentialEvaluator.py │ ├── SimilarityFunction.py │ ├── TranslationEvaluator.py │ ├── TripletEvaluator.py │ └── __init__.py ├── losses │ ├── BatchHardTripletLoss.py │ ├── CosineSimilarityLoss.py │ ├── MSELoss.py │ ├── MultipleNegativesRankingLoss.py │ ├── SoftmaxLoss.py │ ├── TripletLoss.py │ ├── __init__.py │ └── test_batch_hard_triplet_loss.py ├── models │ ├── ADVANCED_CNN.py │ ├── ALBERT.py │ ├── BERT.py │ ├── BERT_LSTM.py │ ├── BoW.py │ ├── CNN.py │ ├── CamemBERT.py │ ├── Dense.py │ ├── DistilBERT.py │ ├── LSTM.py │ ├── PhoBERT.py │ ├── Pooling.py │ ├── RoBERTa.py │ ├── T5.py │ ├── Transformer.py │ ├── WKPooling.py │ ├── WeightedLayerPooling.py │ ├── WordEmbeddings.py │ ├── WordWeights.py │ ├── XLMRoBERTa.py │ ├── XLNet.py │ ├── __init__.py │ ├── proposed_CNN.py │ └── tokenizer │ │ ├── PhoTokenizer.py │ │ ├── PhraseTokenizer.py │ │ ├── VietnameseTokenizer.py │ │ ├── WhitespaceTokenizer.py │ │ ├── WordTokenizer.py │ │ └── __init__.py ├── readers │ ├── InputExample.py │ ├── LabelSentenceReader.py │ ├── NLIDataReader.py │ ├── PairedFilesReader.py │ ├── STSDataReader.py │ ├── TripletReader.py │ └── __init__.py └── util.py ├── setup.cfg ├── setup.py ├── tests ├── test_pretrained_stsb.py └── test_wkpooling.py └── training_NLI.py /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | 
*.pyc 3 | examples/output 4 | sentence_transformers.egg-info 5 | dist/ 6 | nr_*/ 7 | .vscode/ 8 | __pycache__/ 9 | Pre-trained_models -------------------------------------------------------------------------------- /DataNLI/labels.dev.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/labels.dev.gz -------------------------------------------------------------------------------- /DataNLI/labels.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/labels.test.gz -------------------------------------------------------------------------------- /DataNLI/labels.train.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/labels.train.gz -------------------------------------------------------------------------------- /DataNLI/s1.dev.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s1.dev.gz -------------------------------------------------------------------------------- /DataNLI/s1.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s1.test.gz -------------------------------------------------------------------------------- /DataNLI/s1.train.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s1.train.gz -------------------------------------------------------------------------------- /DataNLI/s2.dev.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s2.dev.gz -------------------------------------------------------------------------------- /DataNLI/s2.test.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s2.test.gz -------------------------------------------------------------------------------- /DataNLI/s2.train.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/DataNLI/s2.train.gz -------------------------------------------------------------------------------- /NOTICE.txt: -------------------------------------------------------------------------------- 1 | ------------------------------------------------------------------------------- 2 | Copyright 2019 3 | Ubiquitous Knowledge Processing (UKP) Lab 4 | Technische Universität Darmstadt 5 | ------------------------------------------------------------------------------- -------------------------------------------------------------------------------- /docs/pretrained-models/multilingual-distillation.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/docs/pretrained-models/multilingual-distillation.png -------------------------------------------------------------------------------- /docs/pretrained-models/nli-models.md: -------------------------------------------------------------------------------- 1 | # NLI Models 2 | Conneau et al. (2017) show in the InferSent paper ([Supervised Learning of Universal Sentence Representations from Natural Language Inference Data](https://arxiv.org/abs/1705.02364)) that training on Natural Language Inference (NLI) data can produce universal sentence embeddings. 3 | 4 | The datasets contain sentence pairs annotated with the labels *entail*, *contradict*, and *neutral*. For both sentences, we compute a sentence embedding. These two embeddings are concatenated and passed to a softmax classifier to derive the final label. 5 | 6 | As shown, this produces sentence embeddings that can be used for various use cases like clustering or semantic search. 7 | 8 | # Datasets 9 | We train the models on the [SNLI](https://nlp.stanford.edu/projects/snli/) and the [MultiNLI](https://www.nyu.edu/projects/bowman/multinli/) datasets. We call the combination of the two datasets AllNLI. 10 | 11 | For a training example, see [examples/training_transformers/training_nli.py](../../examples/training_transformers/training_nli.py). 12 | 13 | # Pre-trained models 14 | We provide the following pre-trained models. The performance was evaluated on the test set of the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark) using Spearman rank correlation. 15 | 16 | 17 | ### BERT models 18 | - **bert-base-nli-mean-tokens**: BERT-base model with mean-tokens pooling. Performance: STSbenchmark: 77.12 19 | - **bert-base-nli-max-tokens**: BERT-base with max-tokens pooling. Performance: STSbenchmark: 77.21 20 | - **bert-base-nli-cls-token**: BERT-base with CLS-token pooling. Performance: STSbenchmark: 76.30 21 | - **bert-large-nli-mean-tokens**: BERT-large with mean-tokens pooling. Performance: STSbenchmark: 79.19 22 | - **bert-large-nli-max-tokens**: BERT-large with max-tokens pooling. Performance: STSbenchmark: 78.41 23 | - **bert-large-nli-cls-token**: BERT-large with CLS-token pooling. Performance: STSbenchmark: 78.29 24 | 25 | ### RoBERTa models 26 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 27 | - **roberta-base-nli-mean-tokens**: RoBERTa-base with mean-tokens pooling. Performance: STSbenchmark: 77.49 28 | - **roberta-large-nli-mean-tokens**: RoBERTa-large with mean-tokens pooling. Performance: STSbenchmark: 78.69 29 | 30 | ### DistilBERT models 31 | DistilBERT is a small, fast, cheap, and light Transformer model based on the BERT architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 32 | - **distilbert-base-nli-mean-tokens**: DistilBERT-base with mean-tokens pooling. Performance: STSbenchmark: 76.97 33 | 34 | # Performance Comparison 35 | Here are the performances on the STS benchmark for other sentence embedding methods, also computed using cosine similarity and Spearman rank correlation: 36 | - Avg. GloVe embeddings: 58.02 37 | - BERT-as-a-service avg.
embeddings: 46.35 38 | - BERT-as-a-service CLS-vector: 16.50 39 | - InferSent - GloVe: 68.03 40 | - Universal Sentence Encoder: 74.92 41 | 42 | # Applications 43 | This model works well for assessing the coarse-grained similarity between sentences. For application examples, see [examples/applications/semantic_search.py](../../examples/applications/semantic_search.py) and [examples/applications/clustering.py](../../examples/applications/clustering.py) -------------------------------------------------------------------------------- /docs/pretrained-models/sts-models.md: -------------------------------------------------------------------------------- 1 | # STS Models 2 | The models were first trained on [NLI data](nli-models.md), then fine-tuned on the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). This generates sentence embeddings that are especially suited for measuring the semantic similarity between sentence pairs. 3 | 4 | # Datasets 5 | We use the training file from the [STS benchmark dataset](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark). 6 | 7 | For a training example, see: 8 | - [examples/training_transformers/training_stsbenchmark.py](../../examples/training_transformers/training_stsbenchmark.py) - Train directly on STS data 9 | - [examples/training_transformers/training_stsbenchmark_continue_training.py](../../examples/training_transformers/training_stsbenchmark_continue_training.py) - First train on NLI, then train on STS data. 10 | 11 | # Pre-trained models 12 | We provide the following pre-trained models: 13 | 14 | ### BERT models 15 | - **bert-base-nli-stsb-mean-tokens**: BERT-base trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 85.14 16 | - **bert-large-nli-stsb-mean-tokens**: BERT-large trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 85.29 17 | 18 | ### RoBERTa models 19 | RoBERTa is an extension of BERT. [More Information](https://arxiv.org/abs/1907.11692). 20 | - **roberta-base-nli-stsb-mean-tokens**: RoBERTa-base trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 85.40 21 | - **roberta-large-nli-stsb-mean-tokens**: RoBERTa-large trained on AllNLI, then on the STS benchmark training set. Performance: STSbenchmark: 86.31 22 | 23 | ### DistilBERT 24 | DistilBERT is a small, fast, cheap, and light Transformer model based on the BERT architecture. [More Information](https://github.com/huggingface/transformers/tree/master/examples/distillation) 25 | - **distilbert-base-nli-stsb-mean-tokens**: Performance: STSbenchmark: 84.38 26 | 27 | # Performance Comparison 28 | Here are the performances on the STS benchmark for other sentence embedding methods, also computed using cosine similarity and Spearman rank correlation. Note: these models were not fine-tuned on the STS benchmark. 29 | 30 | - Avg. GloVe embeddings: 58.02 31 | - BERT-as-a-service avg. embeddings: 46.35 32 | - BERT-as-a-service CLS-vector: 16.50 33 | - InferSent - GloVe: 68.03 34 | - Universal Sentence Encoder: 74.92 35 |
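The models above can be loaded by name and compared with cosine similarity. The snippet below is an illustrative sketch (not part of the original docs); it mirrors the scoring used in examples/applications/semantic_search.py and assumes the SentenceTransformer API shown there:

```
from sentence_transformers import SentenceTransformer
import scipy.spatial

model = SentenceTransformer('bert-base-nli-stsb-mean-tokens')

# Encode a sentence pair; encode() returns one numpy vector per sentence
emb1, emb2 = model.encode(['A man is eating food.', 'A man is eating a piece of bread.'])

# Cosine similarity (values close to 1 mean very similar sentences)
similarity = 1 - scipy.spatial.distance.cosine(emb1, emb2)
print("Cosine similarity: {:.4f}".format(similarity))
```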
-------------------------------------------------------------------------------- /docs/pretrained-models/wikipedia-sections-models.md: -------------------------------------------------------------------------------- 1 | # Wikipedia Sections Models 2 | The `wikipedia-sections-models` implement the idea from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 3 | 4 | They were trained with a triplet loss: the anchor and the positive example were sentences from the same section of a Wikipedia article, for example, from the History section of the London article. The negative example came from a different section of the same article, for example, from the Education section of the London article. 5 | 6 | # Dataset 7 | We use the dataset from Dor et al., 2018, [Learning Thematic Similarity Metric Using Triplet Networks](https://aclweb.org/anthology/P18-2009). 8 | 9 | See [examples/training_transformers/training_wikipedia_sections.py](../../examples/training_transformers/training_wikipedia_sections.py) for how to train on this dataset. 10 | 11 | 12 | # Pre-trained models 13 | We provide the following pre-trained models: 14 | 15 | - **bert-base-wikipedia-sections-mean-tokens**: 80.42% accuracy on the test set. 16 | 17 | You can use them in the following way: 18 | ``` 19 | from sentence_transformers import SentenceTransformer 20 | embedder = SentenceTransformer('pretrained-model-name') 21 | ``` 22 | 23 | # Performance Comparison 24 | Performance (accuracy) reported by Dor et al.: 25 | - mean-vectors: 0.65 26 | - skip-thoughts-CS: 0.615 27 | - skip-thoughts-SICK: 0.547 28 | - triplet-sen: 0.74 29 | 30 | 31 | # Applications 32 | The models achieve a rather low performance on the STS benchmark dataset. The reason for this is the training objective: an anchor, a positive, and a negative example are presented, and the network only has to learn to distinguish the positive from the negative example by ensuring that the negative example is further away from the anchor than the positive example. 33 | 34 | However, it does not matter how far away the negative example is; it can be slightly or very far away. This makes the model rather poor at deciding whether a pair is somewhat similar. It only learns to recognize similar pairs (high scores) and dissimilar pairs (low scores). 35 | 36 | However, this model works well for **fine-grained clustering**. 37 | 38 | For an example, see: 39 | [examples/applications/clustering_wikipedia_sections.py](../../examples/applications/clustering_wikipedia_sections.py) 40 | 41 | 42 | -------------------------------------------------------------------------------- /examples/README.md: -------------------------------------------------------------------------------- 1 | # Examples 2 | This folder contains various examples of how to use SentenceTransformers. 3 | 4 | ## Datasets 5 | The [datasets](datasets/) folder stores datasets that are used in these examples. To download them, run the following in the datasets folder: 6 | ``` 7 | python get_data.py 8 | ``` 9 | 10 | 11 | ## Applications 12 | The [applications](applications/) folder contains examples of how to use SentenceTransformers for tasks like clustering or semantic search. 13 | 14 | ## Training Transformers 15 | The [training_transformers](training_transformers/) folder contains examples of how to fine-tune transformer models like BERT, RoBERTa, or XLM-RoBERTa to generate sentence embeddings. 16 | 17 | Further, it contains examples for **multi-task learning** and **multilingual learning**. 18 | 19 | ## Training Basic Models 20 | The [training_basic_models](training_basic_models/) folder shows how to train simple models such as average word embeddings or TF-IDF weighted word embeddings. It also contains more complex models based on Deep Averaging Networks (DAN), CNNs, and LSTMs. 21 | 22 | These examples are a good choice when sentence embeddings must be generated at high speed.
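For instance, an average-word-embedding model is just a small stack of modules. The following is a minimal sketch based on training_basic_models/training_stsbenchmark_avg_word_embeddings.py in this folder (the GloVe file name is the one used by that script):

```
from sentence_transformers import SentenceTransformer, models

# Static word embeddings (e.g. GloVe), mean-pooled into one fixed-size sentence vector
word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz')
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
embeddings = model.encode(['Sentence embeddings without a transformer are much faster to compute.'])
```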
23 | 24 | ##Evaluation 25 | The [evaluation](evaluation/) folder contains some examples how to evaluate SentenceTransformer models for common tasks. -------------------------------------------------------------------------------- /examples/applications/basic_embedding.py: -------------------------------------------------------------------------------- 1 | """ 2 | This basic example loads a pre-trained model from the web and uses it to 3 | generate sentence embeddings for a given list of sentences. 4 | """ 5 | 6 | from sentence_transformers import SentenceTransformer, LoggingHandler 7 | import numpy as np 8 | import logging 9 | 10 | #### Just some code to print debug information to stdout 11 | np.set_printoptions(threshold=100) 12 | 13 | logging.basicConfig(format='%(asctime)s - %(message)s', 14 | datefmt='%Y-%m-%d %H:%M:%S', 15 | level=logging.INFO, 16 | handlers=[LoggingHandler()]) 17 | #### /print debug information to stdout 18 | 19 | 20 | 21 | # Load Sentence model (based on BERT) from URL 22 | model = SentenceTransformer('bert-base-nli-mean-tokens') 23 | 24 | # Embed a list of sentences 25 | sentences = ['This framework generates embeddings for each input sentence', 26 | 'Sentences are passed as a list of string.', 27 | 'The quick brown fox jumps over the lazy dog.'] 28 | sentence_embeddings = model.encode(sentences) 29 | 30 | # The result is a list of sentence embeddings as numpy arrays 31 | for sentence, embedding in zip(sentences, sentence_embeddings): 32 | print("Sentence:", sentence) 33 | print("Embedding:", embedding) 34 | print("") 35 | -------------------------------------------------------------------------------- /examples/applications/clustering.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: clustering 3 | 4 | Sentences are mapped to sentence embeddings and then k-mean clustering is applied. 5 | """ 6 | from sentence_transformers import SentenceTransformer 7 | from sklearn.cluster import KMeans 8 | 9 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 10 | 11 | # Corpus with example sentences 12 | corpus = ['A man is eating food.', 13 | 'A man is eating a piece of bread.', 14 | 'A man is eating pasta.', 15 | 'The girl is carrying a baby.', 16 | 'The baby is carried by the woman', 17 | 'A man is riding a horse.', 18 | 'A man is riding a white horse on an enclosed ground.', 19 | 'A monkey is playing drums.', 20 | 'Someone in a gorilla costume is playing a set of drums.', 21 | 'A cheetah is running behind its prey.', 22 | 'A cheetah chases prey on across a field.' 23 | ] 24 | corpus_embeddings = embedder.encode(corpus) 25 | 26 | # Perform kmean clustering 27 | num_clusters = 5 28 | clustering_model = KMeans(n_clusters=num_clusters) 29 | clustering_model.fit(corpus_embeddings) 30 | cluster_assignment = clustering_model.labels_ 31 | 32 | clustered_sentences = [[] for i in range(num_clusters)] 33 | for sentence_id, cluster_id in enumerate(cluster_assignment): 34 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 35 | 36 | for i, cluster in enumerate(clustered_sentences): 37 | print("Cluster ", i+1) 38 | print(cluster) 39 | print("") 40 | -------------------------------------------------------------------------------- /examples/applications/clustering_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples clusters different sentences that come from the same wikipedia article. 
3 | 4 | It uses the 'wikipedia-sections' model, a model that was trained to differentiate if two sentences from the 5 | same article come from the same section or from different sections in that article. 6 | """ 7 | from sentence_transformers import SentenceTransformer 8 | from sklearn.cluster import AgglomerativeClustering 9 | 10 | 11 | 12 | embedder = SentenceTransformer('bert-base-wikipedia-sections-mean-tokens') 13 | 14 | #Sentences and sections are from Wikipeda. 15 | #Source: https://en.wikipedia.org/wiki/Bushnell,_Illinois 16 | corpus = [ 17 | ("Bushnell is located at 40°33′6″N 90°30′29″W (40.551667, -90.507921).", "Geography"), 18 | ("According to the 2010 census, Bushnell has a total area of 2.138 square miles (5.54 km2), of which 2.13 square miles (5.52 km2) (or 99.63%) is land and 0.008 square miles (0.02 km2) (or 0.37%) is water.", "Geography"), 19 | 20 | ("The town was founded in 1854 when the Northern Cross Railroad built a line through the area.", "History"), 21 | ("Nehemiah Bushnell was the President of the Railroad, and townspeople honored him by naming their community after him. ", "History"), 22 | ("Bushnell was also served by the Toledo, Peoria and Western Railway, now the Keokuk Junction Railway.", "History"), 23 | 24 | ("As of the census[6] of 2000, there were 3,221 people, 1,323 households, and 889 families residing in the city. ", "Demographics"), 25 | ("The population density was 1,573.9 people per square mile (606.7/km²).", "Demographics"), 26 | ("There were 1,446 housing units at an average density of 706.6 per square mile (272.3/km²).", "Demographics"), 27 | 28 | ("From 1991 to 2012, Bushnell was home to one of the largest Christian Music and Arts festivals in the world, known as the Cornerstone Festival.", "Music"), 29 | ("Each year around the 4th of July, 25,000 people from all over the world would descend on the small farm town to watch over 300 bands, authors and artists perform at the Cornerstone Farm Campgrounds.", "Music"), 30 | ("The festival was generally well received by locals, and businesses in the area would typically put up signs welcoming festival-goers to their town.", "Music"), 31 | ("As a result of the location of the music festival, numerous live albums and videos have been recorded or filmed in Bushnell, including the annual Cornerstone Festival DVD. ", "Music"), 32 | ("Cornerstone held its final festival in 2012 and no longer operates.", "Music"), 33 | 34 | ("Beginning in 1908, the Truman Pioneer Stud Farm in Bushnell was home to one of the largest horse shows in the Midwest.", "Horse show"), 35 | ("The show was well known for imported European horses.", "Horse show"), 36 | ("The Bushnell Horse Show features some of the best Belgian and Percheron hitches in the country. 
Teams have come from many different states and Canada to compete.", "Horse show"), 37 | ] 38 | 39 | sentences = [row[0] for row in corpus] 40 | 41 | corpus_embeddings = embedder.encode(sentences) 42 | num_clusters = len(set([row[1] for row in corpus])) 43 | 44 | #Sklearn clustering 45 | km = AgglomerativeClustering(n_clusters=num_clusters) 46 | km.fit(corpus_embeddings) 47 | 48 | cluster_assignment = km.labels_ 49 | 50 | 51 | clustered_sentences = [[] for i in range(num_clusters)] 52 | for sentence_id, cluster_id in enumerate(cluster_assignment): 53 | clustered_sentences[cluster_id].append(corpus[sentence_id]) 54 | 55 | for i, cluster in enumerate(clustered_sentences): 56 | print("Cluster ", i+1) 57 | for row in cluster: 58 | print("(Gold label: {}) - {}".format(row[1], row[0])) 59 | print("") 60 | 61 | -------------------------------------------------------------------------------- /examples/applications/semantic_search.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is a simple application for sentence embeddings: semantic search 3 | 4 | We have a corpus with various sentences. Then, for a given query sentence, 5 | we want to find the most similar sentence in this corpus. 6 | 7 | This script outputs for various queries the top 5 most similar sentences in the corpus. 8 | """ 9 | from sentence_transformers import SentenceTransformer 10 | import scipy.spatial 11 | 12 | embedder = SentenceTransformer('bert-base-nli-mean-tokens') 13 | 14 | # Corpus with example sentences 15 | corpus = ['A man is eating food.', 16 | 'A man is eating a piece of bread.', 17 | 'The girl is carrying a baby.', 18 | 'A man is riding a horse.', 19 | 'A woman is playing violin.', 20 | 'Two men pushed carts through the woods.', 21 | 'A man is riding a white horse on an enclosed ground.', 22 | 'A monkey is playing drums.', 23 | 'A cheetah is running behind its prey.' 24 | ] 25 | corpus_embeddings = embedder.encode(corpus) 26 | 27 | # Query sentences: 28 | queries = ['A man is eating pasta.', 'Someone in a gorilla costume is playing a set of drums.', 'A cheetah chases prey on across a field.'] 29 | query_embeddings = embedder.encode(queries) 30 | 31 | # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity 32 | closest_n = 5 33 | for query, query_embedding in zip(queries, query_embeddings): 34 | distances = scipy.spatial.distance.cdist([query_embedding], corpus_embeddings, "cosine")[0] 35 | 36 | results = zip(range(len(distances)), distances) 37 | results = sorted(results, key=lambda x: x[1]) 38 | 39 | print("\n\n======================\n\n") 40 | print("Query:", query) 41 | print("\nTop 5 most similar sentences in corpus:") 42 | 43 | for idx, distance in results[0:closest_n]: 44 | print(corpus[idx].strip(), "(Score: %.4f)" % (1-distance)) 45 | 46 | 47 | 48 | -------------------------------------------------------------------------------- /examples/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Datasets 2 | This folder contains some example datasets that can be used to for training and evaluation of sentence embeddings methods. 3 | 4 | To download these datasets, run: 5 | ``` 6 | python get_data.py 7 | ``` 8 | 9 | It will download the datasets and unzip them into this directory. 
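Once extracted, the STS Benchmark files can be read with the reader class used throughout the example scripts. This is an illustrative sketch (it assumes you run it from this folder and that the stsbenchmark folder has already been extracted by get_data.py):

```
from sentence_transformers.readers import STSBenchmarkDataReader

# Reads tab-separated sentence pairs and their similarity scores, e.g. from sts-test.csv
sts_reader = STSBenchmarkDataReader('stsbenchmark')
examples = sts_reader.get_examples('sts-test.csv')
print("Loaded {} sentence pairs".format(len(examples)))
```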
10 | 11 | 12 | # AllNLI Dataset 13 | The AllNLI dataset is the concatenation of the SNLI dataset (https://nlp.stanford.edu/projects/snli/) and the MultiNLI dataset (https://www.nyu.edu/projects/bowman/multinli/). 14 | 15 | # STS Benchmark 16 | The STS Benchmark (http://ixa2.si.ehu.eus/stswiki) contains sentence pairs with human gold score for their similarity. 17 | -------------------------------------------------------------------------------- /examples/datasets/clean_sts.py: -------------------------------------------------------------------------------- 1 | from pathlib import Path 2 | 3 | 4 | # filter out all csv files 5 | working_dir = Path('/workspace/sentence-transformers/examples/datasets/stsbenchmark/').glob('*_vi.csv') 6 | 7 | for f in working_dir: 8 | # read all lines to memory and use map for efficiency 9 | print(f"Working on {f.name}.") 10 | with open(f) as fi: 11 | data = fi.readlines() 12 | 13 | data = [line.split('\t') for line in data] 14 | ident_sents = [idx for idx, line in enumerate(data) if line[4] == '5.000' and line[5] == line[6]] 15 | print("line(s)", ident_sents) -------------------------------------------------------------------------------- /examples/datasets/get_data.py: -------------------------------------------------------------------------------- 1 | import urllib.request 2 | import zipfile 3 | import os 4 | folder_path = os.path.dirname(os.path.realpath(__file__)) 5 | print('Beginning download of datasets') 6 | 7 | datasets = ['AllNLI.zip', 'stsbenchmark.zip', 'wikipedia-sections-triplets.zip', 'STS2017.en-de.txt.gz', 'TED2013-en-de.txt.gz', 'xnli-en-de.txt.gz'] 8 | server = "https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/" 9 | 10 | for dataset in datasets: 11 | print("Download", dataset) 12 | url = server+dataset 13 | dataset_path = os.path.join(folder_path, dataset) 14 | urllib.request.urlretrieve(url, dataset_path) 15 | 16 | if dataset.endswith('.zip'): 17 | print("Extract", dataset) 18 | with zipfile.ZipFile(dataset_path, "r") as zip_ref: 19 | zip_ref.extractall(folder_path) 20 | os.remove(dataset_path) 21 | 22 | 23 | print("All datasets downloaded and extracted") 24 | -------------------------------------------------------------------------------- /examples/datasets/stsbenchmark/correlation.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | 3 | 4 | =head1 $0 5 | 6 | =head1 SYNOPSIS 7 | 8 | correlation.pl gs system 9 | 10 | Outputs the Pearson correlation. 11 | 12 | Example: 13 | 14 | $ ./correlation.pl gs sys 15 | 16 | Author: Eneko Agirre, Aitor Gonzalez-Agirre 17 | 18 | Dec. 31, 2012 19 | 20 | =cut 21 | 22 | use Getopt::Long qw(:config auto_help); 23 | use Pod::Usage; 24 | use warnings; 25 | use strict; 26 | use Math::Complex; 27 | 28 | pod2usage if $#ARGV != 1 ; 29 | 30 | if (-e $ARGV[1]) { 31 | my $continue = 0; 32 | my %filtered; 33 | my $do = 0; 34 | my %a ; 35 | my %b ; 36 | my %c ; 37 | 38 | open(I,$ARGV[0]) or die $! ; 39 | my $filter = 0; 40 | my $i = 0; 41 | while () { 42 | chomp ; 43 | next if /^\#/ ; 44 | if ($_ eq "") { 45 | $filter++; 46 | $filtered{$filter} = 1; 47 | } 48 | else { 49 | my @fields = (split(/\t/,$_)) ; 50 | my $score = $fields[4] ; 51 | warn "wrong range of score in gold standard: $score\n" if ($score > 5) or ($score < 0) ; 52 | $a{$i++} = $score ; 53 | $filter++; 54 | } 55 | } 56 | close(I) ; 57 | 58 | my $j = 0 ; 59 | 60 | open(I,$ARGV[1]) or die $! 
; 61 | my $line = 1; 62 | while () { 63 | if(!defined($filtered{$line})) { 64 | chomp ; 65 | next if /^\#/ ; 66 | my @fields = (split(/\s+/,$_)) ; 67 | my ($score) = @fields ; 68 | $b{$j} = $score ; 69 | $c{$j} = 100; 70 | $continue = 1; 71 | $j++; 72 | } 73 | $line++; 74 | } 75 | close(I) ; 76 | 77 | if ($continue == 1) { 78 | my $sumw=0; 79 | 80 | my $sumwy=0; 81 | for(my $y = 0; $y < $i; $y++) { 82 | $sumwy = $sumwy + (100 * $a{$y}); 83 | $sumw = $sumw + 100; 84 | } 85 | my $meanyw = $sumwy/$sumw; 86 | 87 | my $sumwx=0; 88 | for(my $x = 0; $x < $i; $x++) { 89 | $sumwx = $sumwx + ($c{$x} * $b{$x}); 90 | } 91 | my $meanxw = $sumwx/$sumw; 92 | 93 | my $sumwxy = 0; 94 | for(my $x = 0; $x < $i; $x++) { 95 | $sumwxy = $sumwxy + $c{$x}*($b{$x} - $meanxw)*($a{$x} - $meanyw); 96 | } 97 | my $covxyw = $sumwxy/$sumw; 98 | 99 | my $sumwxx = 0; 100 | for(my $x = 0; $x < $i; $x++) { 101 | $sumwxx = $sumwxx + $c{$x}*($b{$x} - $meanxw)*($b{$x} - $meanxw); 102 | } 103 | my $covxxw = $sumwxx/$sumw; 104 | 105 | my $sumwyy = 0; 106 | for(my $x = 0; $x < $i; $x++) { 107 | $sumwyy = $sumwyy + $c{$x}*($a{$x} - $meanyw)*($a{$x} - $meanyw); 108 | } 109 | my $covyyw = $sumwyy/$sumw; 110 | 111 | my $corrxyw = $covxyw/sqrt($covxxw*$covyyw); 112 | 113 | printf "Pearson: %.5f\n", $corrxyw ; 114 | } 115 | } 116 | else{ 117 | printf "Pearson: nan\n"; 118 | exit(1); 119 | } 120 | -------------------------------------------------------------------------------- /examples/datasets/translate_sts.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import re 3 | import os 4 | from ast import literal_eval 5 | from pathlib import Path 6 | from typing import List, Callable 7 | from tqdm.auto import tqdm 8 | # from functools import reduce 9 | 10 | os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/home/translate-stsbenchmark-ed18aff9282f.json' 11 | 12 | from google.cloud import translate_v2 as translate 13 | translate_client = translate.Client(target_language="vi") 14 | 15 | def gg_translate(text): 16 | return translate_client.translate(text, format_='text', source_language='en', model='nmt')['translatedText'] 17 | 18 | # class Translator(object): 19 | # def __init__(self): 20 | # super().__init__() 21 | 22 | # self.query = { 23 | # 'client': 'gtx', 24 | # 'sl': 'en', 25 | # 'tl': 'vi', 26 | # 'hl': 'vi', 27 | # 'dt': ['at', 'bd', 'ex', 'ld', 'md', 'qca', 'rw', 'rm', 'ss', 't'], 28 | # 'ie': 'UTF-8', 29 | # 'oe': 'UTF-8', 30 | # 'otf': 1, 31 | # 'ssel': 0, 32 | # 'tsel': 0, 33 | # 'kc': 7 34 | # } 35 | 36 | # def translate(self, text): 37 | # result = [''] 38 | # try: 39 | # r = requests.post('https://translate.google.com/translate_a/single', params=self.query, data={'q': text}) 40 | # except requests.RequestException as e: 41 | # print(e) 42 | 43 | # # replace all keywords that doesn't exist in python 44 | 45 | # try: 46 | # result = literal_eval(re.sub(r'Array|null|true|false', '0', r.text)) 47 | # except SyntaxError as e: 48 | # print(e) 49 | # print(r.text) 50 | 51 | # # concat sentences 52 | # translated = ' '.join([sent[0] for sent in result[0]]) 53 | 54 | # return translated 55 | 56 | 57 | def translate_sentences_in_stsbenchmark_line(f: Path, line: str, translator: Callable[[str], str]) -> str: 58 | err_basket = [] 59 | new_line = line.split('\t') 60 | try: 61 | # in stsbenchmark, field 5 and 6 (0 indexed) are 2 sentences 62 | new_line[5] = translator(new_line[5].strip()) 63 | new_line[6] = translator(new_line[6].strip()) 64 | 65 | # sometimes, two sentences are 
identical after translating to Vietnamese so we change the similarity score 66 | if new_line[5] == new_line[6]: 67 | new_line[4] = '5.000' 68 | 69 | new_line = '\t'.join(new_line) 70 | except: 71 | err_basket.append(line) 72 | 73 | if len(err_basket): 74 | log_path = ''.join([f.stem, '.log']) 75 | with open(log_path, 'w') as fo: 76 | fo.write('\n'.join(err_basket)) 77 | 78 | return new_line if type(new_line) == str else "@@ERR@@" 79 | 80 | # filter out all csv files 81 | working_dir = Path('/workspace/sentence-transformers/examples/datasets/stsbenchmark/').glob('*.csv') 82 | 83 | # open each file and translate 2 sentences in each line to vietnamese 84 | for f in tqdm(working_dir): 85 | # read all lines to memory and use map for efficiency 86 | print(f"Working on {f.name}.") 87 | with open(f) as fi: 88 | data = fi.readlines() 89 | 90 | print(f"Translating {len(data)} lines.") 91 | data = [translate_sentences_in_stsbenchmark_line(f, line, gg_translate) for line in tqdm(data)] 92 | 93 | new_name = ''.join([f.stem, '_vi', f.suffix]) 94 | 95 | print(f"|--> Saving to {new_name}.") 96 | f.with_name(new_name).write_text('\n'.join(data)) 97 | 98 | print("gimme a breakpoint :))") -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/sbert_embbeding/training.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 4 | 5 | Usage: 6 | python training_nli.py 7 | 8 | OR 9 | python training_nli.py pretrained_transformer_model_name 10 | """ 11 | import torch 12 | from torch.utils.data import DataLoader 13 | import math 14 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 15 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 16 | from sentence_transformers.readers import STSBenchmarkDataReader 17 | import logging 18 | from datetime import datetime 19 | import sys, os 20 | import argparse 21 | import numpy as np 22 | np.random.seed(42) 23 | torch.manual_seed(42) 24 | parser = argparse.ArgumentParser(description='Process some integers.') 25 | parser.add_argument('--batch_size', type=int, default=24) 26 | parser.add_argument('--evaluation_steps', type=int, default= 1000) 27 | parser.add_argument('--ckpt_path', type=str, default = "./output") 28 | parser.add_argument('--num_epochs', type=int, default ="1") 29 | parser.add_argument('--data_path', type=str, default = "./DataNLI") 30 | parser.add_argument('--pre_trained_path', type=str, default = "./PhoBERT") 31 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 32 | parser.add_argument('--bpe_path', type=str, default = "./PhoBERT") 33 | args = parser.parse_args() 34 | 35 | #### Just some code to print debug information to stdout 36 | logging.basicConfig(format='%(asctime)s - %(message)s', 37 | datefmt='%Y-%m-%d %H:%M:%S', 38 | level=logging.INFO, 39 | handlers=[LoggingHandler()]) 40 | #### /print debug information to stdout 41 | if not os.path.exists(args.ckpt_path): 42 | os.mkdir(args.ckpt_path) 43 | 44 | 45 | # Read the dataset 46 | sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True) 47 | 48 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to 
embeddings 49 | word_embedding_model = models.PhoBERT(args.pre_trained_path, tokenizer_args={'vncorenlp_path':args.vncorenlp_path, 'bpe_path':args.bpe_path}) 50 | 51 | 52 | # Apply mean pooling to get one fixed sized sentence vector 53 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 54 | pooling_mode_mean_tokens=True, 55 | pooling_mode_cls_token=False, 56 | pooling_mode_max_tokens=False) 57 | 58 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 59 | 60 | # Convert the dataset to a DataLoader ready for training 61 | logging.info("Read STSbenchmark train dataset") 62 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model) 63 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 64 | train_loss = losses.CosineSimilarityLoss(model=model) 65 | 66 | 67 | logging.info("Read STSbenchmark dev dataset") 68 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 69 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 70 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 71 | 72 | 73 | # Configure the training. We skip evaluation in this example 74 | warmup_steps = math.ceil(len(train_data)*args.num_epochs/args.batch_size*0.1) #10% of train data for warm-up 75 | logging.info("Warmup-steps: {}".format(warmup_steps)) 76 | 77 | 78 | # Train the model 79 | model.fit(train_objectives=[(train_dataloader, train_loss)], 80 | evaluator=evaluator, 81 | epochs=args.num_epochs, 82 | evaluation_steps=1000, 83 | warmup_steps=warmup_steps, 84 | output_path=args.ckpt_path) 85 | 86 | 87 | ############################################################################## 88 | # 89 | # Load the stored model and evaluate its performance on STS benchmark dataset 90 | # 91 | ############################################################################## 92 | 93 | model = SentenceTransformer(args.ckpt_path) 94 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 95 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 96 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 97 | model.evaluate(evaluator) 98 | -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/sbert_embbeding/training_LSTM.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 
4 | 5 | Usage: 6 | python training_nli.py 7 | 8 | OR 9 | python training_nli.py pretrained_transformer_model_name 10 | """ 11 | from torch.utils.data import DataLoader 12 | import math 13 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import STSBenchmarkDataReader 16 | import logging 17 | from datetime import datetime 18 | import sys, os 19 | import argparse 20 | import numpy as np 21 | np.random.seed(42) 22 | torch.manual_seed(42) 23 | 24 | parser = argparse.ArgumentParser(description='Process some integers.') 25 | parser.add_argument('--batch_size', type=int, default=24) 26 | parser.add_argument('--evaluation_steps', type=int, default= 1000) 27 | parser.add_argument('--ckpt_path', type=str, default = "./output") 28 | parser.add_argument('--num_epochs', type=int, default ="1") 29 | parser.add_argument('--data_path', type=str, default = "./DataNLI") 30 | parser.add_argument('--pre_trained_path', type=str, default = "./PhoBERT") 31 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 32 | parser.add_argument('--bpe_path', type=str, default = "./PhoBERT") 33 | args = parser.parse_args() 34 | 35 | #### Just some code to print debug information to stdout 36 | logging.basicConfig(format='%(asctime)s - %(message)s', 37 | datefmt='%Y-%m-%d %H:%M:%S', 38 | level=logging.INFO, 39 | handlers=[LoggingHandler()]) 40 | #### /print debug information to stdout 41 | if not os.path.exists(args.ckpt_path): 42 | os.mkdir(args.ckpt_path) 43 | 44 | 45 | # Read the dataset 46 | sts_reader = STSBenchmarkDataReader(args.data_path, normalize_scores=True) 47 | 48 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 49 | word_embedding_model = models.PhoBERT(args.pre_trained_path, tokenizer_args={'vncorenlp_path':args.vncorenlp_path, 'bpe_path':args.bpe_path}) 50 | 51 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=384, num_layers=1)# Apply mean pooling to get one fixed sized sentence vector 52 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 53 | pooling_mode_mean_tokens=True, 54 | pooling_mode_cls_token=False, 55 | pooling_mode_max_tokens=False) 56 | 57 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 58 | 59 | # Convert the dataset to a DataLoader ready for training 60 | logging.info("Read STSbenchmark train dataset") 61 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model) 62 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 63 | train_loss = losses.CosineSimilarityLoss(model=model) 64 | 65 | 66 | logging.info("Read STSbenchmark dev dataset") 67 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 68 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 69 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 70 | 71 | 72 | # Configure the training. 
We skip evaluation in this example 73 | warmup_steps = math.ceil(len(train_data)*args.num_epochs/args.batch_size*0.1) #10% of train data for warm-up 74 | logging.info("Warmup-steps: {}".format(warmup_steps)) 75 | 76 | 77 | # Train the model 78 | model.fit(train_objectives=[(train_dataloader, train_loss)], 79 | evaluator=evaluator, 80 | epochs=args.num_epochs, 81 | evaluation_steps=1000, 82 | warmup_steps=warmup_steps, 83 | output_path=args.ckpt_path) 84 | 85 | 86 | ############################################################################## 87 | # 88 | # Load the stored model and evaluate its performance on STS benchmark dataset 89 | # 90 | ############################################################################## 91 | 92 | model = SentenceTransformer(args.ckpt_path) 93 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 94 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 95 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 96 | model.evaluate(evaluator, args.ckpt_path) 97 | -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/word_embbeding/training_biltsm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | from sentence_transformers.models.tokenizer.WordTokenizer import VIETNAM_STOP_WORDS_SEGMENTATION 17 | from sentence_transformers.models.tokenizer.VietnameseTokenizer import * 18 | import argparse 19 | import numpy as np 20 | np.random.seed(42) 21 | torch.manual_seed(42) 22 | parser = argparse.ArgumentParser(description='Process some integers.') 23 | parser.add_argument('--batch_size', type=int, default=24) 24 | parser.add_argument('--ckpt_path', type=str, default = "./output") 25 | parser.add_argument('--num_epochs', type=int, default ="1") 26 | parser.add_argument('--data_path', type=str, default = "./stsbenchmark'") 27 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 28 | parser.add_argument('--embeddings_file_path', type=str, default= "./glove.6B.300d.txt.gz") 29 | args = parser.parse_args() 30 | 31 | #### Just some code to print debug information to stdout 32 | logging.basicConfig(format='%(asctime)s - %(message)s', 33 | datefmt='%Y-%m-%d %H:%M:%S', 34 | level=logging.INFO, 35 | handlers=[LoggingHandler()]) 36 | #### /print debug information to stdout 37 | if not os.path.exists(args.ckpt_path): 38 | os.mkdir(args.ckpt_path) 39 | 40 | # Read the dataset 41 | sts_reader = STSBenchmarkDataReader(args.data_path) 42 | 43 | 44 | 45 | # Map tokens to traditional word embeddings like GloVe 46 | word_embedding_model = models.WordEmbeddings.from_text_file(embeddings_file_path=args.embeddings_file_path, tokenizer = VietnameseTokenizer(stop_words=VIETNAM_STOP_WORDS_SEGMENTATION, 
vncorenlp_path=args.vncorenlp_path)) 47 | 48 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=150) 49 | 50 | # Apply mean pooling to get one fixed sized sentence vector 51 | pooling_model = models.Pooling(lstm.get_word_embedding_dimension(), 52 | pooling_mode_mean_tokens=False, 53 | pooling_mode_cls_token=False, 54 | pooling_mode_max_tokens=True) 55 | 56 | 57 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 58 | 59 | 60 | # Convert the dataset to a DataLoader ready for training 61 | logging.info("Read STSbenchmark train dataset") 62 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model=model) 63 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 64 | train_loss = losses.CosineSimilarityLoss(model=model) 65 | 66 | logging.info("Read STSbenchmark dev dataset") 67 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 68 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 69 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 70 | 71 | # Configure the training 72 | warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.batch_size * 0.1) #10% of train data for warm-up 73 | logging.info("Warmup-steps: {}".format(warmup_steps)) 74 | 75 | # Train the model 76 | model.fit(train_objectives=[(train_dataloader, train_loss)], 77 | evaluator=evaluator, 78 | epochs=args.num_epochs, 79 | warmup_steps=warmup_steps, 80 | output_path=args.ckpt_path 81 | ) 82 | 83 | 84 | 85 | ############################################################################## 86 | # 87 | # Load the stored model and evaluate its performance on STS benchmark dataset 88 | # 89 | ############################################################################## 90 | 91 | model = SentenceTransformer(args.ckpt_path) 92 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 93 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 94 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 95 | 96 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/evaluate_STSb_datasets/word_embbeding/training_cnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 
6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | from sentence_transformers.models.tokenizer.WordTokenizer import VIETNAM_STOP_WORDS_SEGMENTATION 17 | from sentence_transformers.models.tokenizer.VietnameseTokenizer import * 18 | import argparse 19 | import numpy as np 20 | np.random.seed(42) 21 | torch.manual_seed(42) 22 | 23 | parser = argparse.ArgumentParser(description='Process some integers.') 24 | parser.add_argument('--batch_size', type=int, default=24) 25 | parser.add_argument('--ckpt_path', type=str, default = "./output") 26 | parser.add_argument('--num_epochs', type=int, default ="1") 27 | parser.add_argument('--data_path', type=str, default = "./stsbenchmark'") 28 | parser.add_argument('--vncorenlp_path', type=str, default = "./VnCoreNLP/VnCoreNLP-1.1.1.jar") 29 | parser.add_argument('--embeddings_file_path', type=str, default= "./glove.6B.300d.txt.gz") 30 | args = parser.parse_args() 31 | 32 | #### Just some code to print debug information to stdout 33 | logging.basicConfig(format='%(asctime)s - %(message)s', 34 | datefmt='%Y-%m-%d %H:%M:%S', 35 | level=logging.INFO, 36 | handlers=[LoggingHandler()]) 37 | #### /print debug information to stdout 38 | 39 | if not os.path.exists(args.ckpt_path): 40 | os.mkdir(args.ckpt_path) 41 | 42 | # Read the dataset 43 | sts_reader = STSBenchmarkDataReader(args.data_path) 44 | 45 | 46 | 47 | # Map tokens to traditional word embeddings like GloVe 48 | word_embedding_model = models.WordEmbeddings.from_text_file(embeddings_file_path=args.embeddings_file_path, tokenizer = VietnameseTokenizer(stop_words=VIETNAM_STOP_WORDS_SEGMENTATION, vncorenlp_path=args.vncorenlp_path)) 49 | 50 | cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), out_channels=300, kernel_sizes=[1,3,5]) 51 | 52 | # Apply mean pooling to get one fixed sized sentence vector 53 | pooling_model = models.Pooling(cnn.get_word_embedding_dimension(), 54 | pooling_mode_mean_tokens=False, 55 | pooling_mode_cls_token=False, 56 | pooling_mode_max_tokens=True) 57 | 58 | 59 | model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model]) 60 | 61 | 62 | # Convert the dataset to a DataLoader ready for training 63 | logging.info("Read STSbenchmark train dataset") 64 | train_data = SentencesDataset(sts_reader.get_examples('sts-train_vi.csv'), model=model) 65 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=args.batch_size) 66 | train_loss = losses.CosineSimilarityLoss(model=model) 67 | 68 | logging.info("Read STSbenchmark dev dataset") 69 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model) 70 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=args.batch_size) 71 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 72 | 73 | # Configure the training 74 | warmup_steps = math.ceil(len(train_data) * args.num_epochs / args.batch_size * 0.1) #10% of train data for warm-up 75 | logging.info("Warmup-steps: {}".format(warmup_steps)) 76 | 77 | # Train the model 78 | model.fit(train_objectives=[(train_dataloader, train_loss)], 79 | evaluator=evaluator, 80 | epochs=args.num_epochs, 81 | 
warmup_steps=warmup_steps, 82 | output_path=args.ckpt_path 83 | ) 84 | 85 | 86 | 87 | ############################################################################## 88 | # 89 | # Load the stored model and evaluate its performance on STS benchmark dataset 90 | # 91 | ############################################################################## 92 | 93 | model = SentenceTransformer(args.ckpt_path) 94 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test_vi.csv"), model=model) 95 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=args.batch_size) 96 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 97 | 98 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/evaluation/evaluation_inference_speed.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples measures the inference speed of a certain model 3 | 4 | Usage: 5 | python evaluation_inference_speed.py 6 | OR 7 | python evaluation_inference_speed.py model_name 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import sys 15 | import os 16 | import time 17 | import torch 18 | 19 | #Limit torch to 4 threads 20 | torch.set_num_threads(4) 21 | 22 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 23 | 24 | 25 | #### Just some code to print debug information to stdout 26 | logging.basicConfig(format='%(asctime)s - %(message)s', 27 | datefmt='%Y-%m-%d %H:%M:%S', 28 | level=logging.INFO, 29 | handlers=[LoggingHandler()]) 30 | #### /print debug information to stdout 31 | 32 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens' 33 | 34 | # Load a named sentence model (based on BERT). This will download the model from our server. 
35 | # Alternatively, you can also pass a filepath to SentenceTransformer() 36 | model = SentenceTransformer(model_name) 37 | 38 | sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark')) 39 | examples = sts_reader.get_examples("sts-train.csv") 40 | sentences = [text for ex in examples for text in ex.texts] 41 | print("Number of sentences:", len(sentences)) 42 | 43 | start_time = time.time() 44 | emb = model.encode(sentences, batch_size=32) 45 | end_time = time.time() 46 | diff_time = end_time - start_time 47 | print("Done after {:.2f} sec".format(diff_time)) 48 | print("Speed: {:.2f}".format(len(sentences) / diff_time)) -------------------------------------------------------------------------------- /examples/evaluation/evaluation_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples loads a pre-trained model and evaluates it on the STSbenchmark dataset 3 | 4 | Usage: 5 | python evaluation_stsbenchmark.py 6 | OR 7 | python evaluation_stsbenchmark.py model_name 8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import sys 15 | import os 16 | import torch 17 | 18 | script_folder_path = os.path.dirname(os.path.realpath(__file__)) 19 | 20 | #Limit torch to 4 threads 21 | torch.set_num_threads(4) 22 | 23 | #### Just some code to print debug information to stdout 24 | logging.basicConfig(format='%(asctime)s - %(message)s', 25 | datefmt='%Y-%m-%d %H:%M:%S', 26 | level=logging.INFO, 27 | handlers=[LoggingHandler()]) 28 | #### /print debug information to stdout 29 | 30 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-nli-mean-tokens' 31 | 32 | # Load a named sentence model (based on BERT). This will download the model from our server. 33 | # Alternatively, you can also pass a filepath to SentenceTransformer() 34 | model = SentenceTransformer(model_name) 35 | 36 | sts_reader = STSBenchmarkDataReader(os.path.join(script_folder_path, '../datasets/stsbenchmark')) 37 | 38 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 39 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 40 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 41 | 42 | model.evaluate(evaluator) 43 | -------------------------------------------------------------------------------- /examples/evaluation/evaluation_stsbenchmark_sbert-wk.py: -------------------------------------------------------------------------------- 1 | """ 2 | Performs the pooling described in the paper: 3 | SBERT-WK: A Sentence Embedding Method by Dissecting BERT-based Word Models, 2020, https://arxiv.org/abs/2002.06652 4 | 5 | Note: WKPooling improves the performance only for certain models. Further, WKPooling requires QR-decomposition, 6 | for which there is so far not efficient implementation in pytorch for GPUs (see https://github.com/pytorch/pytorch/issues/22573). 7 | Hence, WKPooling runs on the GPU, which makes it rather in-efficient. 
8 | """ 9 | from torch.utils.data import DataLoader 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, models 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | import torch 15 | 16 | #Limit torch to 4 threads, as this example runs on the CPU 17 | torch.set_num_threads(4) 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | 27 | #1) Point the transformer model to the BERT / RoBERTa etc. model you would like to use. Ensure that output_hidden_states is true 28 | word_embedding_model = models.Transformer('bert-base-uncased', model_args={'output_hidden_states': True}) 29 | 30 | #2) Add WKPooling 31 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 32 | 33 | #3) Create a sentence transformer model to glue both models together 34 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 35 | 36 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 37 | 38 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 39 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 40 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 41 | 42 | model.evaluate(evaluator) 43 | -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_avg_word_embeddings.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses average word embeddings (for example from GloVe). It adds two fully-connected feed-forward layers (dense layers) to create a Deep Averaging Network (DAN). 3 | 4 | If 'glove.6B.300d.txt.gz' does not exist, it tries to download it from our server. 
5 | 6 | See https://public.ukp.informatik.tu-darmstadt.de/reimers/embeddings/ 7 | for available word embeddings files 8 | """ 9 | import torch 10 | from torch.utils.data import DataLoader 11 | import math 12 | from sentence_transformers import models, losses 13 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import * 16 | import logging 17 | from datetime import datetime 18 | 19 | #### Just some code to print debug information to stdout 20 | logging.basicConfig(format='%(asctime)s - %(message)s', 21 | datefmt='%Y-%m-%d %H:%M:%S', 22 | level=logging.INFO, 23 | handlers=[LoggingHandler()]) 24 | #### /print debug information to stdout 25 | 26 | # Read the dataset 27 | batch_size = 32 28 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 29 | model_save_path = 'output/training_stsbenchmark_avg_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 30 | 31 | 32 | 33 | # Map tokens to traditional word embeddings like GloVe 34 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | # Add two trainable feed-forward networks (DAN) 43 | sent_embeddings_dimension = pooling_model.get_sentence_embedding_dimension() 44 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 45 | dan2 = models.Dense(in_features=sent_embeddings_dimension, out_features=sent_embeddings_dimension) 46 | 47 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model, dan1, dan2]) 48 | 49 | 50 | # Convert the dataset to a DataLoader ready for training 51 | logging.info("Read STSbenchmark train dataset") 52 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 53 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 54 | train_loss = losses.CosineSimilarityLoss(model=model) 55 | 56 | logging.info("Read STSbenchmark dev dataset") 57 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 58 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 59 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 60 | 61 | # Configure the training 62 | num_epochs = 10 63 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 64 | logging.info("Warmup-steps: {}".format(warmup_steps)) 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path 72 | ) 73 | 74 | 75 | 76 | ############################################################################## 77 | # 78 | # Load the stored model and evaluate its performance on STS benchmark dataset 79 | # 80 | ############################################################################## 81 | 82 | model = SentenceTransformer(model_save_path) 83 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 84 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 85 | evaluator = 
EmbeddingSimilarityEvaluator(test_dataloader) 86 | 87 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_bilstm.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a BiLSTM after the word embedding lookup. The output of the BiLSTM is than pooled, 3 | for example with max-pooling (which gives a system like InferSent) or with mean-pooling. 4 | 5 | Note, you can also pass BERT embeddings to the BiLSTM. 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to traditional word embeddings like GloVe 32 | word_embedding_model = models.WordEmbeddings.from_text_file('glove.6B.300d.txt.gz') 33 | 34 | lstm = models.LSTM(word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), hidden_dim=1024) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(lstm.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=False, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=True) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, lstm, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | 
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_bow.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example uses a simple bag-of-words (BoW) approach. A sentence is mapped 3 | to a sparse vector with e.g. 25,000 dimensions. Optionally, you can also use tf-idf. 4 | 5 | To make the model trainable, we add multiple dense layers to create a Deep Averaging Network (DAN). 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | from sentence_transformers.models.tokenizer.WordTokenizer import ENGLISH_STOP_WORDS 15 | import logging 16 | from datetime import datetime 17 | 18 | #### Just some code to print debug information to stdout 19 | logging.basicConfig(format='%(asctime)s - %(message)s', 20 | datefmt='%Y-%m-%d %H:%M:%S', 21 | level=logging.INFO, 22 | handlers=[LoggingHandler()]) 23 | #### /print debug information to stdout 24 | 25 | # Read the dataset 26 | batch_size = 32 27 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 28 | model_save_path = 'output/training_tf-idf_word_embeddings-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Create the vocab for the BoW model 33 | stop_words = ENGLISH_STOP_WORDS 34 | max_vocab_size = 25000 #This is also the size of the BoW sentence vector. 35 | 36 | 37 | #Read the most common max_vocab_size words. Skip stop-words 38 | vocab = set() 39 | weights = {} 40 | lines = open('wikipedia_doc_frequencies.txt', encoding='utf8').readlines() 41 | num_docs = int(lines[0]) 42 | for line in lines[1:]: 43 | word, freq = line.lower().strip().split("\t") 44 | if word in stop_words: 45 | continue 46 | 47 | vocab.add(word) 48 | weights[word] = math.log(num_docs/int(freq)) 49 | 50 | if len(vocab) >= max_vocab_size: 51 | break 52 | 53 | #Create the BoW model. Because we set word_weights to the IDF values and cumulative_term_frequency=True, we 54 | #get tf-idf vectors. Set word_weights to an empty dict and cumulative_term_frequency=False to get a 1-hot sentence encoding 55 | bow = models.BoW(vocab=vocab, word_weights=weights, cumulative_term_frequency=True) 56 | 57 | # Add two trainable feed-forward networks (DAN) with max_vocab_size -> 768 -> 512 dimensions. 
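# The sparse tf-idf vector (max_vocab_size dimensions) is projected down to 768 and then to 512 dimensions; the 512-dimensional output of the second Dense layer is the sentence embedding that the CosineSimilarityLoss below operates on.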
58 | sent_embeddings_dimension = max_vocab_size 59 | dan1 = models.Dense(in_features=sent_embeddings_dimension, out_features=768) 60 | dan2 = models.Dense(in_features=768, out_features=512) 61 | 62 | model = SentenceTransformer(modules=[bow, dan1, dan2]) 63 | 64 | 65 | # Convert the dataset to a DataLoader ready for training 66 | logging.info("Read STSbenchmark train dataset") 67 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 68 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 69 | train_loss = losses.CosineSimilarityLoss(model=model) 70 | 71 | logging.info("Read STSbenchmark dev dataset") 72 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 73 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 74 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 75 | 76 | # Configure the training 77 | num_epochs = 10 78 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 79 | logging.info("Warmup-steps: {}".format(warmup_steps)) 80 | 81 | # Train the model 82 | model.fit(train_objectives=[(train_dataloader, train_loss)], 83 | evaluator=evaluator, 84 | epochs=num_epochs, 85 | warmup_steps=warmup_steps, 86 | output_path=model_save_path 87 | ) 88 | 89 | 90 | 91 | ############################################################################## 92 | # 93 | # Load the stored model and evaluate its performance on STS benchmark dataset 94 | # 95 | ############################################################################## 96 | 97 | model = SentenceTransformer(model_save_path) 98 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 99 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 100 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 101 | 102 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_basic_models/training_stsbenchmark_cnn.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example runs a CNN after the word embedding lookup. The output of the CNN is then pooled, 3 | for example with mean-pooling.
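In this script, the CNN sits on top of BERT token embeddings (models.BERT below); its output is then mean-pooled into one fixed-size sentence vector.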
4 | 5 | 6 | """ 7 | import torch 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import models, losses 11 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 12 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 13 | from sentence_transformers.readers import * 14 | import logging 15 | from datetime import datetime 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | batch_size = 32 26 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 27 | model_save_path = 'output/training_stsbenchmark_bilstm-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 28 | 29 | 30 | 31 | # Map tokens to vectors using BERT 32 | word_embedding_model = models.BERT('bert-base-uncased') 33 | 34 | cnn = models.CNN(in_word_embedding_dimension=word_embedding_model.get_word_embedding_dimension(), out_channels=256, kernel_sizes=[1,3,5]) 35 | 36 | # Apply mean pooling to get one fixed sized sentence vector 37 | pooling_model = models.Pooling(cnn.get_word_embedding_dimension(), 38 | pooling_mode_mean_tokens=True, 39 | pooling_mode_cls_token=False, 40 | pooling_mode_max_tokens=False) 41 | 42 | 43 | model = SentenceTransformer(modules=[word_embedding_model, cnn, pooling_model]) 44 | 45 | 46 | # Convert the dataset to a DataLoader ready for training 47 | logging.info("Read STSbenchmark train dataset") 48 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 49 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 50 | train_loss = losses.CosineSimilarityLoss(model=model) 51 | 52 | logging.info("Read STSbenchmark dev dataset") 53 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 54 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 55 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 56 | 57 | # Configure the training 58 | num_epochs = 10 59 | warmup_steps = math.ceil(len(train_data) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 60 | logging.info("Warmup-steps: {}".format(warmup_steps)) 61 | 62 | # Train the model 63 | model.fit(train_objectives=[(train_dataloader, train_loss)], 64 | evaluator=evaluator, 65 | epochs=num_epochs, 66 | warmup_steps=warmup_steps, 67 | output_path=model_save_path 68 | ) 69 | 70 | 71 | 72 | ############################################################################## 73 | # 74 | # Load the stored model and evaluate its performance on STS benchmark dataset 75 | # 76 | ############################################################################## 77 | 78 | model = SentenceTransformer(model_save_path) 79 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 80 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 81 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 82 | 83 | model.evaluate(evaluator) -------------------------------------------------------------------------------- /examples/training_multilingual/training_sbert-en-de.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script contains an example how to extend a model to new languages. 
3 | 4 | We use an existent (English) teacher sentence embedding model and extend it to a new language, in this case, German. 5 | 6 | In order to run this example, you must download these files: 7 | https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/TED2013-en-de.txt.gz 8 | https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/STS2017.en-de.txt.gz 9 | https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/datasets/xnli-en-de.txt.gz 10 | 11 | And store them in the datasets-folder. 12 | 13 | You can then run this code like this: 14 | python training_multilingual.py datasets/TED2013-en-de.txt.gz 15 | """ 16 | 17 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models, readers, evaluation, losses 18 | from torch.utils.data import DataLoader 19 | from sentence_transformers.datasets import ParallelSentencesDataset 20 | from datetime import datetime 21 | 22 | import csv 23 | import logging 24 | import sys 25 | import torch 26 | import os 27 | import numpy as np 28 | 29 | #We can pass multiple train files to this script 30 | train_files = sys.argv[1:] 31 | 32 | 33 | if len(train_files) == 0: 34 | print("Please specify at least 1 training file: python training_multilingual.py path/to/trainfile.txt") 35 | 36 | logging.basicConfig(format='%(asctime)s - %(message)s', 37 | datefmt='%Y-%m-%d %H:%M:%S', 38 | level=logging.INFO, 39 | handlers=[LoggingHandler()]) 40 | 41 | max_seq_length = 128 42 | train_batch_size = 64 43 | 44 | logging.info("Load teacher model") 45 | teacher_model = SentenceTransformer('bert-base-nli-stsb-mean-tokens') 46 | 47 | logging.info("Create student model from scratch") 48 | word_embedding_model = models.Transformer("xlm-roberta-base") 49 | 50 | # Apply mean pooling to get one fixed sized sentence vector 51 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 52 | pooling_mode_mean_tokens=True, 53 | pooling_mode_cls_token=False, 54 | pooling_mode_max_tokens=False) 55 | 56 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 57 | 58 | output_path = "output/make-multilingual-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 59 | 60 | logging.info("Create dataset reader") 61 | 62 | 63 | ###### Read Dataset ###### 64 | train_data = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model) 65 | for train_file in train_files: 66 | train_data.load_data(train_file) 67 | 68 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 69 | train_loss = losses.MSELoss(model=model) 70 | 71 | 72 | ###### Load dev sets ###### 73 | 74 | # Test on STS 2017.en-de dataset using Spearman rank correlation 75 | logging.info("Read STS2017.en-de dataset") 76 | evaluators = [] 77 | sts_reader = readers.STSDataReader('../datasets/', s1_col_idx=0, s2_col_idx=1, score_col_idx=2) 78 | dev_data = SentencesDataset(examples=sts_reader.get_examples('STS2017.en-de.txt.gz'), model=model) 79 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 80 | evaluator_sts = evaluation.EmbeddingSimilarityEvaluator(dev_dataloader, name='STS2017.en-de') 81 | evaluators.append(evaluator_sts) 82 | 83 | 84 | # Use XLNI.en-de dataset with MSE evaluation 85 | logging.info("Read XNLI.en-de dataset") 86 | xnli_reader = ParallelSentencesDataset(student_model=model, teacher_model=teacher_model) 87 | xnli_reader.load_data('../datasets/xnli-en-de.txt.gz') 88 | 89 | 
xnli_dataloader = DataLoader(xnli_reader, shuffle=False, batch_size=train_batch_size) 90 | xnli_mse = evaluation.MSEEvaluator(xnli_dataloader, name='xnli-en-de') 91 | evaluators.append(xnli_mse) 92 | 93 | 94 | 95 | # Train the model 96 | model.fit(train_objectives=[(train_dataloader, train_loss)], 97 | evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: scores[-1]), 98 | epochs=20, 99 | evaluation_steps=1000, 100 | warmup_steps=10000, 101 | scheduler='warmupconstant', 102 | output_path=output_path, 103 | save_best_model=True, 104 | optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False} 105 | ) 106 | 107 | 108 | -------------------------------------------------------------------------------- /examples/training_transformers/training_multi-task.py: -------------------------------------------------------------------------------- 1 | """ 2 | This is an example how to train SentenceTransformers in a multi-task setup. 3 | 4 | The system trains BERT on the AllNLI and on the STSbenchmark dataset. 5 | """ 6 | from torch.utils.data import DataLoader 7 | import math 8 | from sentence_transformers import models, losses 9 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 10 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 11 | from sentence_transformers.readers import * 12 | import logging 13 | from datetime import datetime 14 | 15 | #### Just some code to print debug information to stdout 16 | logging.basicConfig(format='%(asctime)s - %(message)s', 17 | datefmt='%Y-%m-%d %H:%M:%S', 18 | level=logging.INFO, 19 | handlers=[LoggingHandler()]) 20 | #### /print debug information to stdout 21 | 22 | # Read the dataset 23 | model_name = 'bert-base-uncased' 24 | batch_size = 16 25 | nli_reader = NLIDataReader('../datasets/AllNLI') 26 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 27 | train_num_labels = nli_reader.get_num_labels() 28 | model_save_path = 'output/training_multi-task_'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | 30 | 31 | 32 | # Use BERT for mapping tokens to embeddings 33 | word_embedding_model = models.Transformer(model_name) 34 | 35 | # Apply mean pooling to get one fixed sized sentence vector 36 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 37 | pooling_mode_mean_tokens=True, 38 | pooling_mode_cls_token=False, 39 | pooling_mode_max_tokens=False) 40 | 41 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 42 | 43 | 44 | # Convert the dataset to a DataLoader ready for training 45 | logging.info("Read AllNLI train dataset") 46 | train_data_nli = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 47 | train_dataloader_nli = DataLoader(train_data_nli, shuffle=True, batch_size=batch_size) 48 | train_loss_nli = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 49 | 50 | logging.info("Read STSbenchmark train dataset") 51 | train_data_sts = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model=model) 52 | train_dataloader_sts = DataLoader(train_data_sts, shuffle=True, batch_size=batch_size) 53 | train_loss_sts = losses.CosineSimilarityLoss(model=model) 54 | 55 | 56 | logging.info("Read STSbenchmark dev dataset") 57 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 58 | dev_dataloader = DataLoader(dev_data, shuffle=False, 
batch_size=batch_size) 59 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 60 | 61 | # Configure the training 62 | num_epochs = 4 63 | 64 | warmup_steps = math.ceil(len(train_dataloader_sts) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 65 | logging.info("Warmup-steps: {}".format(warmup_steps)) 66 | 67 | 68 | # Here we define the two train objectives: train_dataloader_nli with train_loss_nli (i.e., SoftmaxLoss for NLI data) 69 | # and train_dataloader_sts with train_loss_sts (i.e., CosineSimilarityLoss for STSbenchmark data) 70 | # You can pass as many (dataloader, loss) tuples as you like. They are iterated in a round-robin way. 71 | train_objectives = [(train_dataloader_nli, train_loss_nli), (train_dataloader_sts, train_loss_sts)] 72 | 73 | # Train the model 74 | model.fit(train_objectives=train_objectives, 75 | evaluator=evaluator, 76 | epochs=num_epochs, 77 | evaluation_steps=1000, 78 | warmup_steps=warmup_steps, 79 | output_path=model_save_path 80 | ) 81 | 82 | 83 | 84 | ############################################################################## 85 | # 86 | # Load the stored model and evaluate its performance on STS benchmark dataset 87 | # 88 | ############################################################################## 89 | 90 | model = SentenceTransformer(model_save_path) 91 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 92 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 93 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 94 | 95 | model.evaluate(evaluator) 96 | -------------------------------------------------------------------------------- /examples/training_transformers/training_nli.py: -------------------------------------------------------------------------------- 1 | """ 2 | The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset 3 | with softmax loss function. 
At every 1000 training steps, the model is evaluated on the 4 | STS benchmark dataset 5 | 6 | Usage: 7 | python training_nli.py 8 | 9 | OR 10 | python training_nli.py pretrained_transformer_model_name 11 | """ 12 | from torch.utils.data import DataLoader 13 | import math 14 | from sentence_transformers import models, losses 15 | from sentence_transformers import SentencesDataset, LoggingHandler, SentenceTransformer 16 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 17 | from sentence_transformers.readers import * 18 | import logging 19 | from datetime import datetime 20 | import sys 21 | 22 | #### Just some code to print debug information to stdout 23 | logging.basicConfig(format='%(asctime)s - %(message)s', 24 | datefmt='%Y-%m-%d %H:%M:%S', 25 | level=logging.INFO, 26 | handlers=[LoggingHandler()]) 27 | #### /print debug information to stdout 28 | 29 | #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base 30 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased' 31 | 32 | # Read the dataset 33 | batch_size = 16 34 | nli_reader = NLIDataReader('../datasets/AllNLI') 35 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark') 36 | train_num_labels = nli_reader.get_num_labels() 37 | model_save_path = 'output/training_nli_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 38 | 39 | 40 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 41 | word_embedding_model = models.Transformer(model_name) 42 | 43 | # Apply mean pooling to get one fixed sized sentence vector 44 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 45 | pooling_mode_mean_tokens=True, 46 | pooling_mode_cls_token=False, 47 | pooling_mode_max_tokens=False) 48 | 49 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 50 | 51 | 52 | # Convert the dataset to a DataLoader ready for training 53 | logging.info("Read AllNLI train dataset") 54 | train_data = SentencesDataset(nli_reader.get_examples('train.gz'), model=model) 55 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size) 56 | train_loss = losses.SoftmaxLoss(model=model, sentence_embedding_dimension=model.get_sentence_embedding_dimension(), num_labels=train_num_labels) 57 | 58 | 59 | 60 | logging.info("Read STSbenchmark dev dataset") 61 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 62 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=batch_size) 63 | evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 64 | 65 | # Configure the training 66 | num_epochs = 1 67 | 68 | warmup_steps = math.ceil(len(train_dataloader) * num_epochs / batch_size * 0.1) #10% of train data for warm-up 69 | logging.info("Warmup-steps: {}".format(warmup_steps)) 70 | 71 | 72 | 73 | # Train the model 74 | model.fit(train_objectives=[(train_dataloader, train_loss)], 75 | evaluator=evaluator, 76 | epochs=num_epochs, 77 | evaluation_steps=1000, 78 | warmup_steps=warmup_steps, 79 | output_path=model_save_path 80 | ) 81 | 82 | 83 | 84 | ############################################################################## 85 | # 86 | # Load the stored model and evaluate its performance on STS benchmark dataset 87 | # 88 | ############################################################################## 89 | 90 | model = SentenceTransformer(model_save_path) 91 | 
test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 92 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=batch_size) 93 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 94 | 95 | model.evaluate(evaluator) 96 | -------------------------------------------------------------------------------- /examples/training_transformers/training_nli_phobert.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/DatCanCode/sentence-transformers/13d597ec823ba076b9d8df4e0dec01231d14d5d1/examples/training_transformers/training_nli_phobert.py -------------------------------------------------------------------------------- /examples/training_transformers/training_stsbenchmark.py: -------------------------------------------------------------------------------- 1 | """ 2 | This examples trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) for the STSbenchmark from scratch. It generates sentence embeddings 3 | that can be compared using cosine-similarity to measure the similarity. 4 | 5 | Usage: 6 | python training_nli.py 7 | 8 | OR 9 | python training_nli.py pretrained_transformer_model_name 10 | """ 11 | from torch.utils.data import DataLoader 12 | import math 13 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 14 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 15 | from sentence_transformers.readers import STSBenchmarkDataReader 16 | import logging 17 | from datetime import datetime 18 | import sys 19 | 20 | #### Just some code to print debug information to stdout 21 | logging.basicConfig(format='%(asctime)s - %(message)s', 22 | datefmt='%Y-%m-%d %H:%M:%S', 23 | level=logging.INFO, 24 | handlers=[LoggingHandler()]) 25 | #### /print debug information to stdout 26 | 27 | #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base 28 | model_name = sys.argv[1] if len(sys.argv) > 1 else 'bert-base-uncased' 29 | 30 | # Read the dataset 31 | train_batch_size = 16 32 | num_epochs = 4 33 | model_save_path = 'output/training_stsbenchmark_'+model_name.replace("/", "-")+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 34 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True) 35 | 36 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 37 | word_embedding_model = models.Transformer(model_name) 38 | 39 | # Apply mean pooling to get one fixed sized sentence vector 40 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 41 | pooling_mode_mean_tokens=True, 42 | pooling_mode_cls_token=False, 43 | pooling_mode_max_tokens=False) 44 | 45 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 46 | 47 | # Convert the dataset to a DataLoader ready for training 48 | logging.info("Read STSbenchmark train dataset") 49 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 50 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 51 | train_loss = losses.CosineSimilarityLoss(model=model) 52 | 53 | 54 | logging.info("Read STSbenchmark dev dataset") 55 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 56 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 57 | 
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader) 58 | 59 | 60 | # Configure the training. We skip evaluation in this example 61 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 62 | logging.info("Warmup-steps: {}".format(warmup_steps)) 63 | 64 | 65 | # Train the model 66 | model.fit(train_objectives=[(train_dataloader, train_loss)], 67 | evaluator=evaluator, 68 | epochs=num_epochs, 69 | evaluation_steps=1000, 70 | warmup_steps=warmup_steps, 71 | output_path=model_save_path) 72 | 73 | 74 | ############################################################################## 75 | # 76 | # Load the stored model and evaluate its performance on STS benchmark dataset 77 | # 78 | ############################################################################## 79 | 80 | model = SentenceTransformer(model_save_path) 81 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 82 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 83 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 84 | model.evaluate(evaluator) 85 | -------------------------------------------------------------------------------- /examples/training_transformers/training_stsbenchmark_continue_training.py: -------------------------------------------------------------------------------- 1 | """ 2 | This example loads the pre-trained SentenceTransformer model 'bert-base-nli-mean-tokens' from the server. 3 | It then fine-tunes this model for some epochs on the STS benchmark dataset. 4 | 5 | Note: In this example, you must specify a SentenceTransformer model. 6 | If you want to fine-tune a huggingface/transformers model like bert-base-uncased, see training_nli.py and training_stsbenchmark.py 7 | """ 8 | from torch.utils.data import DataLoader 9 | import math 10 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses 11 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 12 | from sentence_transformers.readers import STSBenchmarkDataReader 13 | import logging 14 | from datetime import datetime 15 | 16 | 17 | #### Just some code to print debug information to stdout 18 | logging.basicConfig(format='%(asctime)s - %(message)s', 19 | datefmt='%Y-%m-%d %H:%M:%S', 20 | level=logging.INFO, 21 | handlers=[LoggingHandler()]) 22 | #### /print debug information to stdout 23 | 24 | # Read the dataset 25 | model_name = 'bert-base-nli-mean-tokens' 26 | train_batch_size = 16 27 | num_epochs = 4 28 | model_save_path = 'output/training_stsbenchmark_continue_training-'+model_name+'-'+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 29 | sts_reader = STSBenchmarkDataReader('../datasets/stsbenchmark', normalize_scores=True) 30 | 31 | # Load a pre-trained sentence transformer model 32 | model = SentenceTransformer(model_name) 33 | 34 | # Convert the dataset to a DataLoader ready for training 35 | logging.info("Read STSbenchmark train dataset") 36 | train_data = SentencesDataset(sts_reader.get_examples('sts-train.csv'), model) 37 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 38 | train_loss = losses.CosineSimilarityLoss(model=model) 39 | 40 | 41 | logging.info("Read STSbenchmark dev dataset") 42 | dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev.csv'), model=model) 43 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 44 | evaluator = 
EmbeddingSimilarityEvaluator(dev_dataloader) 45 | 46 | 47 | # Configure the training 48 | warmup_steps = math.ceil(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data for warm-up 49 | logging.info("Warmup-steps: {}".format(warmup_steps)) 50 | 51 | 52 | # Train the model 53 | model.fit(train_objectives=[(train_dataloader, train_loss)], 54 | evaluator=evaluator, 55 | epochs=num_epochs, 56 | evaluation_steps=1000, 57 | warmup_steps=warmup_steps, 58 | output_path=model_save_path) 59 | 60 | 61 | ############################################################################## 62 | # 63 | # Load the stored model and evaluate its performance on STS benchmark dataset 64 | # 65 | ############################################################################## 66 | 67 | model = SentenceTransformer(model_save_path) 68 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 69 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 70 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 71 | model.evaluate(evaluator) 72 | -------------------------------------------------------------------------------- /examples/training_transformers/training_wikipedia_sections.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script trains sentence transformers with a triplet loss function. 3 | 4 | As corpus, we use the Wikipedia sections dataset that was described by Dor et al., 2018, Learning Thematic Similarity Metric Using Triplet Networks. 5 | 6 | See docs/pretrained-models/wikipedia-sections-models.md for further details. 7 | 8 | You can get the dataset by running examples/datasets/get_data.py 9 | """ 10 | 11 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler, losses, models 12 | from torch.utils.data import DataLoader 13 | from sentence_transformers.readers import TripletReader 14 | from sentence_transformers.evaluation import TripletEvaluator 15 | from datetime import datetime 16 | 17 | import csv 18 | import logging 19 | 20 | 21 | 22 | logging.basicConfig(format='%(asctime)s - %(message)s', 23 | datefmt='%Y-%m-%d %H:%M:%S', 24 | level=logging.INFO, 25 | handlers=[LoggingHandler()]) 26 | 27 | 28 | #You can specify any huggingface/transformers pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base 29 | model_name = 'bert-base-uncased' 30 | 31 | 32 | ### Create a torch.DataLoader that passes training batch instances to our model 33 | train_batch_size = 16 34 | triplet_reader = TripletReader('../datasets/wikipedia-sections-triplets', s1_col_idx=1, s2_col_idx=2, s3_col_idx=3, delimiter=',', quoting=csv.QUOTE_MINIMAL, has_header=True) 35 | output_path = "output/training-wikipedia-sections-"+model_name+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S") 36 | num_epochs = 1 37 | 38 | 39 | ### Configure sentence transformers for training and train on the provided dataset 40 | # Use Huggingface/transformers model (like BERT, RoBERTa, XLNet, XLM-R) for mapping tokens to embeddings 41 | word_embedding_model = models.Transformer(model_name) 42 | 43 | # Apply mean pooling to get one fixed sized sentence vector 44 | pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(), 45 | pooling_mode_mean_tokens=True, 46 | pooling_mode_cls_token=False, 47 | pooling_mode_max_tokens=False) 48 | 49 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model])
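# The TripletReader above yields (anchor, positive, negative) sentence triplets; TripletLoss (below) trains the model so that the anchor embedding ends up closer to the positive than to the negative, and TripletEvaluator reports how often that ordering is correct on the dev/test triplets.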
50 | 51 | 52 | logging.info("Read Triplet train dataset") 53 | train_data = SentencesDataset(examples=triplet_reader.get_examples('train.csv', max_examples=100000), model=model) 54 | train_dataloader = DataLoader(train_data, shuffle=True, batch_size=train_batch_size) 55 | train_loss = losses.TripletLoss(model=model) 56 | 57 | logging.info("Read Wikipedia Triplet dev dataset") 58 | dev_data = SentencesDataset(examples=triplet_reader.get_examples('validation.csv', 1000), model=model) 59 | dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=train_batch_size) 60 | evaluator = TripletEvaluator(dev_dataloader) 61 | 62 | 63 | warmup_steps = int(len(train_data)*num_epochs/train_batch_size*0.1) #10% of train data 64 | 65 | 66 | # Train the model 67 | model.fit(train_objectives=[(train_dataloader, train_loss)], 68 | evaluator=evaluator, 69 | epochs=num_epochs, 70 | evaluation_steps=1000, 71 | warmup_steps=warmup_steps, 72 | output_path=output_path) 73 | 74 | ############################################################################## 75 | # 76 | # Load the stored model and evaluate its performance on STS benchmark dataset 77 | # 78 | ############################################################################## 79 | 80 | model = SentenceTransformer(output_path) 81 | test_data = SentencesDataset(examples=triplet_reader.get_examples('test.csv'), model=model) 82 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=train_batch_size) 83 | evaluator = TripletEvaluator(test_dataloader) 84 | 85 | model.evaluate(evaluator) 86 | 87 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers>=2.8.0 2 | tqdm 3 | torch>=1.0.1 4 | numpy 5 | scikit-learn 6 | scipy 7 | nltk -------------------------------------------------------------------------------- /sentence_transformers/LoggingHandler.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import tqdm 3 | 4 | class LoggingHandler(logging.Handler): 5 | def __init__(self, level=logging.NOTSET): 6 | super().__init__(level) 7 | 8 | def emit(self, record): 9 | try: 10 | msg = self.format(record) 11 | tqdm.tqdm.write(msg) 12 | self.flush() 13 | except (KeyboardInterrupt, SystemExit): 14 | raise 15 | except: 16 | self.handleError(record) -------------------------------------------------------------------------------- /sentence_transformers/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.2.6" 2 | __DOWNLOAD_SERVER__ = 'https://public.ukp.informatik.tu-darmstadt.de/reimers/sentence-transformers/v0.2/' 3 | from .datasets import SentencesDataset, SentenceLabelDataset, ParallelSentencesDataset 4 | from .data_samplers import LabelSampler 5 | from .LoggingHandler import LoggingHandler 6 | from .SentenceTransformer import SentenceTransformer 7 | 8 | -------------------------------------------------------------------------------- /sentence_transformers/data_samplers.py: -------------------------------------------------------------------------------- 1 | """ 2 | This file contains sampler functions, that can be used to sample mini-batches with specific properties. 
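For example, the LabelSampler below draws a fixed number of samples from a single label at a time, which is what losses such as BatchHardTripletLoss expect.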
3 | """ 4 | from torch.utils.data import Sampler 5 | import numpy as np 6 | from .datasets import SentenceLabelDataset 7 | 8 | 9 | class LabelSampler(Sampler): 10 | """ 11 | This sampler is used for some specific Triplet Losses like BATCH_HARD_TRIPLET_LOSS 12 | or MULTIPLE_NEGATIVES_RANKING_LOSS which require multiple or only one sample from one label per batch. 13 | 14 | It draws n consecutive, random and unique samples from one label at a time. This is repeated for each label. 15 | 16 | Labels with fewer than n unique samples are ignored. 17 | This also applied to drawing without replacement, once less than n samples remain for a label, it is skipped. 18 | 19 | This *DOES NOT* check if there are more labels than the batch is large or if the batch size is divisible 20 | by the samples drawn per label. 21 | 22 | 23 | """ 24 | def __init__(self, data_source: SentenceLabelDataset, samples_per_label: int = 5, 25 | with_replacement: bool = False): 26 | """ 27 | Creates a LabelSampler for a SentenceLabelDataset. 28 | 29 | :param data_source: 30 | the dataset from which samples are drawn 31 | :param samples_per_label: 32 | the number of consecutive, random and unique samples drawn per label 33 | :param with_replacement: 34 | if this is True, then each sample is drawn at most once (depending on the total number of samples per label). 35 | if this is False, then one sample can be drawn in multiple draws, but still not multiple times in the same 36 | drawing. 37 | """ 38 | super().__init__(data_source) 39 | self.data_source = data_source 40 | self.samples_per_label = samples_per_label 41 | self.label_range = np.arange(data_source.num_labels) 42 | self.borders = data_source.labels_right_border 43 | self.with_replacement = with_replacement 44 | np.random.shuffle(self.label_range) 45 | 46 | def __iter__(self): 47 | label_idx = 0 48 | count = 0 49 | already_seen = {} 50 | while count < len(self.data_source): 51 | label = self.label_range[label_idx] 52 | if label not in already_seen: 53 | already_seen[label] = [] 54 | 55 | left_border = 0 if label == 0 else self.borders[label-1] 56 | right_border = self.borders[label] 57 | 58 | if self.with_replacement: 59 | selection = np.arange(left_border, right_border) 60 | else: 61 | selection = [i for i in np.arange(left_border, right_border) if i not in already_seen[label]] 62 | 63 | if len(selection) >= self.samples_per_label: 64 | for element_idx in np.random.choice(selection, self.samples_per_label, replace=False): 65 | count += 1 66 | already_seen[label].append(element_idx) 67 | yield element_idx 68 | 69 | label_idx += 1 70 | if label_idx >= len(self.label_range): 71 | label_idx = 0 72 | np.random.shuffle(self.label_range) 73 | 74 | def __len__(self): 75 | return len(self.data_source) -------------------------------------------------------------------------------- /sentence_transformers/datasets/ParallelSentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | import torch 3 | import logging 4 | import gzip 5 | import os 6 | import random 7 | from .. import SentenceTransformer 8 | 9 | 10 | class ParallelSentencesDataset(Dataset): 11 | """ 12 | This dataset reader can be used to read-in parallel sentences, i.e., it reads in a file with tab-seperated sentences with the same 13 | sentence in different languages. 
For example, the file can look like this (EN\tDE\tES): 14 | hello world hallo welt hola mundo 15 | second sentence zweiter satz segunda oración 16 | 17 | The sentence in the first column will be mapped to a sentence embedding using the given the embedder. For example, 18 | embedder is a mono-lingual sentence embedding method for English. The sentences in the other languages will also be 19 | mapped to this English sentence embedding. 20 | 21 | When getting a sample from the dataset, we get one sentence with the according sentence embedding for this sentence. 22 | 23 | teacher_model can be any class that implement an encode function. The encode function gets a list of sentences and 24 | returns a list of sentence embeddings 25 | """ 26 | 27 | def __init__(self, student_model: SentenceTransformer, teacher_model): 28 | """ 29 | Parallel sentences dataset reader to train student model given a teacher model 30 | :param student_model: Student sentence embedding model that should be trained 31 | :param teacher_model: Teacher model, that provides the sentence embeddings for the first column in the dataset file 32 | """ 33 | self.student_model = student_model 34 | self.teacher_model = teacher_model 35 | self.datasets = [] 36 | self.dataset_indices = [] 37 | self.copy_dataset_indices = [] 38 | 39 | def load_data(self, filepath: str, weight: int = 100, max_sentences: int = None, max_sentence_length: int = 128): 40 | """ 41 | Reads in a tab-seperated .txt/.csv/.tsv or .gz file. The different columns contain the different translations of the sentence in the first column 42 | 43 | :param filepath: Filepath to the file 44 | :param weight: If more that one dataset is loaded with load_data: With which frequency should data be sampled from this dataset? 45 | :param max_sentences: Max number of lines to be read from filepath 46 | :param max_sentence_length: Skip the example if one of the sentences is has more characters than max_sentence_length 47 | :return: 48 | """ 49 | sentences_map = {} 50 | with gzip.open(filepath, 'rt', encoding='utf8') if filepath.endswith('.gz') else open(filepath, encoding='utf8') as fIn: 51 | count = 0 52 | for line in fIn: 53 | sentences = line.strip().split("\t") 54 | sentence_lengths = [len(sent) for sent in sentences] 55 | if max(sentence_lengths) > max_sentence_length: 56 | continue 57 | 58 | eng_sentence = sentences[0] 59 | if eng_sentence not in sentences_map: 60 | sentences_map[eng_sentence] = set() 61 | 62 | for sent in sentences: 63 | sentences_map[eng_sentence].add(sent) 64 | 65 | count += 1 66 | if max_sentences is not None and count >= max_sentences: 67 | break 68 | 69 | eng_sentences = list(sentences_map.keys()) 70 | logging.info("Create sentence embeddings for " + os.path.basename(filepath)) 71 | labels = torch.tensor(self.teacher_model.encode(eng_sentences, batch_size=32, show_progress_bar=True), 72 | dtype=torch.float) 73 | 74 | data = [] 75 | for idx in range(len(eng_sentences)): 76 | eng_key = eng_sentences[idx] 77 | label = labels[idx] 78 | for sent in sentences_map[eng_key]: 79 | data.append([[self.student_model.tokenize(sent)], label]) 80 | 81 | dataset_id = len(self.datasets) 82 | self.datasets.append(data) 83 | self.dataset_indices.extend([dataset_id] * weight) 84 | 85 | def __len__(self): 86 | return max([len(dataset) for dataset in self.datasets]) 87 | 88 | def __getitem__(self, idx): 89 | if len(self.copy_dataset_indices) == 0: 90 | self.copy_dataset_indices = self.dataset_indices.copy() 91 | random.shuffle(self.copy_dataset_indices) 92 | 93 | dataset_idx 
= self.copy_dataset_indices.pop() 94 | return self.datasets[dataset_idx][idx % len(self.datasets[dataset_idx])] 95 | -------------------------------------------------------------------------------- /sentence_transformers/datasets/SentencesDataset.py: -------------------------------------------------------------------------------- 1 | from torch.utils.data import Dataset 2 | from typing import List 3 | import torch 4 | import logging 5 | from tqdm import tqdm 6 | from .. import SentenceTransformer 7 | from ..readers.InputExample import InputExample 8 | 9 | 10 | class SentencesDataset(Dataset): 11 | """ 12 | Dataset for smart batching, that is each batch is only padded to its longest sequence instead of padding all 13 | sequences to the max length. 14 | The SentenceBertEncoder.smart_batching_collate is required for this to work. 15 | SmartBatchingDataset does *not* work without it. 16 | """ 17 | def __init__(self, examples: List[InputExample], model: SentenceTransformer, show_progress_bar: bool = None): 18 | """ 19 | Create a new SentencesDataset with the tokenized texts and the labels as Tensor 20 | """ 21 | if show_progress_bar is None: 22 | show_progress_bar = (logging.getLogger().getEffectiveLevel() == logging.INFO or logging.getLogger().getEffectiveLevel() == logging.DEBUG) 23 | self.show_progress_bar = show_progress_bar 24 | 25 | self.convert_input_examples(examples, model) 26 | 27 | def convert_input_examples(self, examples: List[InputExample], model: SentenceTransformer): 28 | """ 29 | Converts input examples to a SmartBatchingDataset usable to train the model with 30 | SentenceTransformer.smart_batching_collate as the collate_fn for the DataLoader 31 | 32 | smart_batching_collate as collate_fn is required because it transforms the tokenized texts to the tensors. 
33 | 34 | :param examples: 35 | the input examples for the training 36 | :param model 37 | the Sentence BERT model for the conversion 38 | :return: a SmartBatchingDataset usable to train the model with SentenceTransformer.smart_batching_collate as the collate_fn 39 | for the DataLoader 40 | """ 41 | num_texts = len(examples[0].texts) 42 | inputs = [[] for _ in range(num_texts)] 43 | labels = [] 44 | too_long = [0] * num_texts 45 | label_type = None 46 | iterator = examples 47 | max_seq_length = model.get_max_seq_length() 48 | 49 | if self.show_progress_bar: 50 | iterator = tqdm(iterator, desc="Convert dataset") 51 | 52 | for ex_index, example in enumerate(iterator): 53 | if label_type is None: 54 | if isinstance(example.label, int): 55 | label_type = torch.long 56 | elif isinstance(example.label, float): 57 | label_type = torch.float 58 | tokenized_texts = [model.tokenize(text) for text in example.texts] 59 | 60 | for i, token in enumerate(tokenized_texts): 61 | if max_seq_length != None and max_seq_length > 0 and len(token) >= max_seq_length: 62 | too_long[i] += 1 63 | 64 | labels.append(example.label) 65 | for i in range(num_texts): 66 | inputs[i].append(tokenized_texts[i]) 67 | 68 | tensor_labels = torch.tensor(labels, dtype=label_type) 69 | 70 | logging.info("Num sentences: %d" % (len(examples))) 71 | for i in range(num_texts): 72 | logging.info("Sentences {} longer than max_seqence_length: {}".format(i, too_long[i])) 73 | 74 | self.tokens = inputs 75 | self.labels = tensor_labels 76 | 77 | def __getitem__(self, item): 78 | return [self.tokens[i][item] for i in range(len(self.tokens))], self.labels[item] 79 | 80 | def __len__(self): 81 | return len(self.tokens[0]) -------------------------------------------------------------------------------- /sentence_transformers/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .ParallelSentencesDataset import ParallelSentencesDataset 2 | from .SentenceLabelDataset import SentenceLabelDataset 3 | from .SentencesDataset import SentencesDataset -------------------------------------------------------------------------------- /sentence_transformers/evaluation/LabelAccuracyEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | import torch 3 | from torch.utils.data import DataLoader 4 | import logging 5 | from tqdm import tqdm 6 | from ..util import batch_to_device 7 | import os 8 | import csv 9 | 10 | class LabelAccuracyEvaluator(SentenceEvaluator): 11 | """ 12 | Evaluate a model based on its accuracy on a labeled dataset 13 | 14 | This requires a model with LossFunction.SOFTMAX 15 | 16 | The results are written in a CSV. If a CSV already exists, then values are appended. 
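The returned score is the classification accuracy, so higher values indicate a better model.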
17 | """ 18 | 19 | def __init__(self, dataloader: DataLoader, name: str = "", softmax_model = None): 20 | """ 21 | Constructs an evaluator for the given dataset 22 | 23 | :param dataloader: 24 | the data for the evaluation 25 | """ 26 | self.dataloader = dataloader 27 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 28 | self.name = name 29 | self.softmax_model = softmax_model 30 | self.softmax_model.to(self.device) 31 | 32 | if name: 33 | name = "_"+name 34 | 35 | self.csv_file = "accuracy_evaluation"+name+"_results.csv" 36 | self.csv_headers = ["epoch", "steps", "accuracy"] 37 | 38 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 39 | model.eval() 40 | total = 0 41 | correct = 0 42 | 43 | if epoch != -1: 44 | if steps == -1: 45 | out_txt = " after epoch {}:".format(epoch) 46 | else: 47 | out_txt = " in epoch {} after {} steps:".format(epoch, steps) 48 | else: 49 | out_txt = ":" 50 | 51 | logging.info("Evaluation on the "+self.name+" dataset"+out_txt) 52 | self.dataloader.collate_fn = model.smart_batching_collate 53 | for step, batch in enumerate(tqdm(self.dataloader, desc="Evaluating")): 54 | features, label_ids = batch_to_device(batch, self.device) 55 | with torch.no_grad(): 56 | _, prediction = self.softmax_model(features, labels=None) 57 | 58 | total += prediction.size(0) 59 | correct += torch.argmax(prediction, dim=1).eq(label_ids).sum().item() 60 | accuracy = correct/total 61 | 62 | logging.info("Accuracy: {:.4f} ({}/{})\n".format(accuracy, correct, total)) 63 | 64 | if output_path is not None: 65 | csv_path = os.path.join(output_path, self.csv_file) 66 | if not os.path.isfile(csv_path): 67 | with open(csv_path, mode="w", encoding="utf-8") as f: 68 | writer = csv.writer(f) 69 | writer.writerow(self.csv_headers) 70 | writer.writerow([epoch, steps, accuracy]) 71 | else: 72 | with open(csv_path, mode="a", encoding="utf-8") as f: 73 | writer = csv.writer(f) 74 | writer.writerow([epoch, steps, accuracy]) 75 | 76 | #return accuracy, self.softmax_model.state_dict() 77 | return accuracy -------------------------------------------------------------------------------- /sentence_transformers/evaluation/MSEEvaluator.py: -------------------------------------------------------------------------------- 1 | from sentence_transformers.evaluation import SentenceEvaluator 2 | from sentence_transformers.util import batch_to_device 3 | import torch 4 | import numpy as np 5 | import logging 6 | import os 7 | import csv 8 | 9 | 10 | class MSEEvaluator(SentenceEvaluator): 11 | """ 12 | Computes the mean squared error (x100) between the computed sentence embedding 13 | and some target sentence embedding 14 | """ 15 | def __init__(self, dataloader, name=''): 16 | self.dataloader = dataloader 17 | self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 18 | self.name = name 19 | 20 | if name: 21 | name = "_"+name 22 | self.csv_file = "mse_evaluation" + name + "_results.csv" 23 | self.csv_headers = ["epoch", "steps", "MSE"] 24 | 25 | def __call__(self, model, output_path, epoch = -1, steps = -1): 26 | model.eval() 27 | self.dataloader.collate_fn = model.smart_batching_collate 28 | 29 | embeddings = [] 30 | labels = [] 31 | for step, batch in enumerate(self.dataloader): 32 | features, batch_labels = batch_to_device(batch, self.device) 33 | with torch.no_grad(): 34 | emb1 = model(features[0])['sentence_embedding'].to("cpu").numpy() 35 | 36 | labels.extend(batch_labels.to("cpu").numpy()) 37 | embeddings.extend(emb1) 38 
| 39 | embeddings = np.asarray(embeddings) 40 | labels = np.asarray(labels) 41 | 42 | mse = ((embeddings - labels)**2).mean() 43 | 44 | logging.info("MSE evaluation on "+self.name+" dataset") 45 | mse *= 100 46 | 47 | logging.info("embeddings shape:\t"+str(embeddings.shape)) 48 | logging.info("MSE (*100):\t{:4f}".format(mse)) 49 | 50 | if output_path is not None: 51 | csv_path = os.path.join(output_path, self.csv_file) 52 | output_file_exists = os.path.isfile(csv_path) 53 | with open(csv_path, mode="a" if output_file_exists else 'w', encoding="utf-8") as f: 54 | writer = csv.writer(f) 55 | if not output_file_exists: 56 | writer.writerow(self.csv_headers) 57 | 58 | writer.writerow([epoch, steps, mse]) 59 | 60 | 61 | return -mse #Return negative score as SentenceTransformers maximizes the performance -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SentenceEvaluator.py: -------------------------------------------------------------------------------- 1 | class SentenceEvaluator: 2 | """ 3 | Base class for all evaluators 4 | 5 | Extend this class and implement __call__ for custom evaluators. 6 | """ 7 | 8 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 9 | """ 10 | This is called during training to evaluate the model. 11 | It returns a score for the evaluation with a higher score indicating a better result. 12 | 13 | :param model: 14 | the model to evaluate 15 | :param output_path: 16 | path where predictions and metrics are written to 17 | :param epoch 18 | the epoch where the evaluation takes place. 19 | This is used for the file prefixes. 20 | If this is -1, then we assume evaluation on test data. 21 | :param steps 22 | the steps in the current epoch at time of the evaluation. 23 | This is used for the file prefixes. 24 | If this is -1, then we assume evaluation at the end of the epoch. 25 | :return: a score for the evaluation with a higher score indicating a better result 26 | """ 27 | pass 28 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SequentialEvaluator.py: -------------------------------------------------------------------------------- 1 | from . import SentenceEvaluator 2 | from typing import Iterable 3 | 4 | class SequentialEvaluator(SentenceEvaluator): 5 | """ 6 | This evaluator allows that multiple sub-evaluators are passed. When the model is evaluated, 7 | the data is passed sequentially to all sub-evaluators. 
8 | 9 | All scores are passed to 'main_score_function', which derives one final score value 10 | """ 11 | def __init__(self, evaluators: Iterable[SentenceEvaluator], main_score_function = lambda scores: scores[-1]): 12 | self.evaluators = evaluators 13 | self.main_score_function = main_score_function 14 | 15 | def __call__(self, model, output_path: str = None, epoch: int = -1, steps: int = -1) -> float: 16 | scores = [] 17 | for evaluator in self.evaluators: 18 | scores.append(evaluator(model, output_path, epoch, steps)) 19 | 20 | return self.main_score_function(scores) 21 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/SimilarityFunction.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | 3 | class SimilarityFunction(Enum): 4 | COSINE = 0 5 | EUCLIDEAN = 1 6 | MANHATTAN = 2 7 | DOT_PRODUCT = 3 8 | 9 | -------------------------------------------------------------------------------- /sentence_transformers/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .SentenceEvaluator import SentenceEvaluator 2 | from .SimilarityFunction import SimilarityFunction 3 | 4 | from .BinaryEmbeddingSimilarityEvaluator import BinaryEmbeddingSimilarityEvaluator 5 | from .EmbeddingSimilarityEvaluator import EmbeddingSimilarityEvaluator 6 | from .LabelAccuracyEvaluator import LabelAccuracyEvaluator 7 | from .SequentialEvaluator import SequentialEvaluator 8 | from .TripletEvaluator import TripletEvaluator 9 | from .MSEEvaluator import MSEEvaluator -------------------------------------------------------------------------------- /sentence_transformers/losses/CosineSimilarityLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | 6 | class CosineSimilarityLoss(nn.Module): 7 | def __init__(self, model: SentenceTransformer): 8 | super(CosineSimilarityLoss, self).__init__() 9 | self.model = model 10 | 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | rep_a, rep_b = reps 15 | 16 | output = torch.cosine_similarity(rep_a, rep_b) 17 | loss_fct = nn.MSELoss() 18 | 19 | if labels is not None: 20 | loss = loss_fct(output, labels.view(-1)) 21 | return loss 22 | else: 23 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/losses/MSELoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | 5 | 6 | class MSELoss(nn.Module): 7 | """ 8 | Computes the MSE loss between the computed sentence embedding and a target sentence embedding 9 | """ 10 | def __init__(self, model): 11 | super(MSELoss, self).__init__() 12 | self.model = model 13 | 14 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 15 | rep = self.model(sentence_features[0])['sentence_embedding'] 16 | loss_fct = nn.MSELoss() 17 | loss = loss_fct(rep, labels) 18 | return loss 19 | -------------------------------------------------------------------------------- 
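For orientation, a minimal training sketch showing how a loss module such as CosineSimilarityLoss and one of the evaluators above are wired together (model name, data and hyper-parameters are purely illustrative):

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, SentencesDataset, losses
from sentence_transformers.readers import InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

model = SentenceTransformer('bert-base-nli-mean-tokens')   # any SentenceTransformer works here

# Toy STS-style pairs with gold similarity scores in [0, 1]
train_examples = [
    InputExample(guid='0', texts=['A man is eating food.', 'A man eats something.'], label=0.9),
    InputExample(guid='1', texts=['A man is eating food.', 'A plane is taking off.'], label=0.1),
]
train_data = SentencesDataset(train_examples, model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=2)
train_loss = losses.CosineSimilarityLoss(model=model)

# Toy setup: evaluate on the same pairs; in practice use a held-out dev set
evaluator = EmbeddingSimilarityEvaluator(DataLoader(train_data, shuffle=False, batch_size=2))

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=1,
          warmup_steps=10,
          output_path='output/sts-sketch')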
/sentence_transformers/losses/MultipleNegativesRankingLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from ..SentenceTransformer import SentenceTransformer 6 | 7 | class MultipleNegativesRankingLoss(nn.Module): 8 | def __init__(self, model: SentenceTransformer): 9 | super(MultipleNegativesRankingLoss, self).__init__() 10 | self.model = model 11 | 12 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 13 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 14 | 15 | reps_a, reps_b = reps 16 | return self.multiple_negatives_ranking_loss(reps_a, reps_b) 17 | 18 | # Multiple Negatives Ranking Loss 19 | # Paper: https://arxiv.org/pdf/1705.00652.pdf 20 | # Efficient Natural Language Response Suggestion for Smart Reply 21 | # Section 4.4 22 | def multiple_negatives_ranking_loss(self, embeddings_a: Tensor, embeddings_b: Tensor): 23 | """ 24 | Compute the loss over a batch with two embeddings per example. 25 | 26 | Each pair is a positive example. The negative examples are all other embeddings in embeddings_b with each embedding 27 | in embedding_a. 28 | 29 | See the paper for more information: https://arxiv.org/pdf/1705.00652.pdf 30 | (Efficient Natural Language Response Suggestion for Smart Reply, Section 4.4) 31 | 32 | :param embeddings_a: 33 | Tensor of shape (batch_size, embedding_dim) 34 | :param embeddings_b: 35 | Tensor of shape (batch_size, embedding_dim) 36 | :return: 37 | The scalar loss 38 | """ 39 | scores = torch.matmul(embeddings_a, embeddings_b.t()) 40 | diagonal_mean = torch.mean(torch.diag(scores)) 41 | mean_log_row_sum_exp = torch.mean(torch.logsumexp(scores, dim=1)) 42 | return -diagonal_mean + mean_log_row_sum_exp 43 | -------------------------------------------------------------------------------- /sentence_transformers/losses/SoftmaxLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | from ..SentenceTransformer import SentenceTransformer 5 | import logging 6 | 7 | class SoftmaxLoss(nn.Module): 8 | def __init__(self, 9 | model: SentenceTransformer, 10 | sentence_embedding_dimension: int, 11 | num_labels: int, 12 | concatenation_sent_rep: bool = True, 13 | concatenation_sent_difference: bool = True, 14 | concatenation_sent_multiplication: bool = False): 15 | super(SoftmaxLoss, self).__init__() 16 | self.model = model 17 | self.num_labels = num_labels 18 | self.concatenation_sent_rep = concatenation_sent_rep 19 | self.concatenation_sent_difference = concatenation_sent_difference 20 | self.concatenation_sent_multiplication = concatenation_sent_multiplication 21 | 22 | num_vectors_concatenated = 0 23 | if concatenation_sent_rep: 24 | num_vectors_concatenated += 2 25 | if concatenation_sent_difference: 26 | num_vectors_concatenated += 1 27 | if concatenation_sent_multiplication: 28 | num_vectors_concatenated += 1 29 | logging.info("Softmax loss: #Vectors concatenated: {}".format(num_vectors_concatenated)) 30 | self.classifier = nn.Linear(num_vectors_concatenated * sentence_embedding_dimension, num_labels) 31 | 32 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 33 | reps = [self.model(sentence_feature)['sentence_embedding'] for 
sentence_feature in sentence_features] 34 | rep_a, rep_b = reps 35 | 36 | vectors_concat = [] 37 | if self.concatenation_sent_rep: 38 | vectors_concat.append(rep_a) 39 | vectors_concat.append(rep_b) 40 | 41 | if self.concatenation_sent_difference: 42 | vectors_concat.append(torch.abs(rep_a - rep_b)) 43 | 44 | if self.concatenation_sent_multiplication: 45 | vectors_concat.append(rep_a * rep_b) 46 | 47 | features = torch.cat(vectors_concat, 1) 48 | 49 | output = self.classifier(features) 50 | loss_fct = nn.CrossEntropyLoss() 51 | 52 | if labels is not None: 53 | loss = loss_fct(output, labels.view(-1)) 54 | return loss 55 | else: 56 | return reps, output -------------------------------------------------------------------------------- /sentence_transformers/losses/TripletLoss.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import torch.nn.functional as F 5 | from enum import Enum 6 | from ..SentenceTransformer import SentenceTransformer 7 | 8 | class TripletDistanceMetric(Enum): 9 | """ 10 | The metric for the triplet loss 11 | """ 12 | COSINE = lambda x, y: 1 - F.cosine_similarity(x, y) 13 | EUCLIDEAN = lambda x, y: F.pairwise_distance(x, y, p=2) 14 | MANHATTAN = lambda x, y: F.pairwise_distance(x, y, p=1) 15 | 16 | class TripletLoss(nn.Module): 17 | def __init__(self, model: SentenceTransformer, distance_metric=TripletDistanceMetric.EUCLIDEAN, triplet_margin=1): 18 | super(TripletLoss, self).__init__() 19 | self.model = model 20 | self.distance_metric = distance_metric 21 | self.triplet_margin = triplet_margin 22 | 23 | 24 | def forward(self, sentence_features: Iterable[Dict[str, Tensor]], labels: Tensor): 25 | reps = [self.model(sentence_feature)['sentence_embedding'] for sentence_feature in sentence_features] 26 | 27 | rep_anchor, rep_pos, rep_neg = reps 28 | distance_pos = self.distance_metric(rep_anchor, rep_pos) 29 | distance_neg = self.distance_metric(rep_anchor, rep_neg) 30 | 31 | losses = F.relu(distance_pos - distance_neg + self.triplet_margin) 32 | return losses.mean() -------------------------------------------------------------------------------- /sentence_transformers/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .CosineSimilarityLoss import * 2 | from .SoftmaxLoss import * 3 | from .BatchHardTripletLoss import * 4 | from .MultipleNegativesRankingLoss import * 5 | from .TripletLoss import * 6 | from .MSELoss import * -------------------------------------------------------------------------------- /sentence_transformers/models/ADVANCED_CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | import torch.nn.functional as F 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import logging 6 | import gzip 7 | from tqdm import tqdm 8 | import numpy as np 9 | import os 10 | import json 11 | from ..util import import_from_string, fullname, http_get 12 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 13 | 14 | 15 | class CNN(nn.Module): 16 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 17 | 18 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 19 | nn.Module.__init__(self) 20 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 21 | 
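# Overview of this module: `convsModule` holds four branches, one for each of the last four
# entries in 'all_layer_embeddings' (consumed from the top layer downwards in forward()).
# For every kernel size a single Conv1d is created and appended to all four branches, so the
# convolution weights are shared between branches. Each branch applies its convolutions with a
# tanh activation, average-pools the feature dimension (kernel 2, stride 2) and concatenates
# the per-kernel outputs; the four branch outputs are then concatenated and written back to
# the feature dict as the new 'token_embeddings'.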
self.in_word_embedding_dimension = in_word_embedding_dimension 22 | self.out_channels = out_channels 23 | self.kernel_sizes = kernel_sizes 24 | 25 | self.embeddings_dimension = out_channels*len(kernel_sizes) 26 | self.convsModule = nn.ModuleList() 27 | self.pooling = nn.AvgPool1d(2, stride=2) 28 | in_channels = in_word_embedding_dimension 29 | 30 | for _ in range(4): 31 | self.convsModule.append(nn.ModuleList()) 32 | 33 | for kernel_size in kernel_sizes: 34 | padding_size = int((kernel_size - 1) / 2) 35 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 36 | padding=padding_size) 37 | for i in self.convsModule: 38 | i.append(conv) 39 | 40 | def forward(self, features): 41 | token_embeddings = features['all_layer_embeddings'] 42 | vectors =[] 43 | for idx, convs in enumerate(self.convsModule): 44 | temp = [] 45 | token_embedding = token_embeddings[len(token_embeddings)-idx-1].transpose(1, -1) 46 | for conv in convs: 47 | a = F.tanh(conv(token_embedding)) 48 | a = a.transpose(1, -1) 49 | a = self.pooling(a) 50 | a = a.transpose(1, -1) 51 | temp.append(a) 52 | vectors.append(torch.cat(temp, 1)) 53 | 54 | out = torch.cat(vectors, 1).transpose(1, -1) 55 | features.update({'token_embeddings': out}) 56 | return features 57 | 58 | def get_word_embedding_dimension(self) -> int: 59 | return self.embeddings_dimension 60 | 61 | def tokenize(self, text: str) -> List[int]: 62 | raise NotImplementedError() 63 | 64 | def save(self, output_path: str): 65 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 66 | json.dump(self.get_config_dict(), fOut, indent=2) 67 | 68 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 69 | 70 | def get_config_dict(self): 71 | return {key: self.__dict__[key] for key in self.config_keys} 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 76 | config = json.load(fIn) 77 | 78 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 79 | model = CNN(**config) 80 | model.load_state_dict(weights) 81 | return model -------------------------------------------------------------------------------- /sentence_transformers/models/ALBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import AlbertModel, AlbertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class ALBERT(nn.Module): 11 | """ALBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from BERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(ALBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.albert = AlbertModel.from_pretrained(model_name_or_path, **model_args) 29 | self.tokenizer = AlbertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | output_states = self.albert(**features) 34 | output_tokens = output_states[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 37 | 38 | if self.albert.config.output_hidden_states: 39 | hidden_states = output_states[2] 40 | features.update({'all_layer_embeddings': hidden_states}) 41 | 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.albert.config.hidden_size 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | """ 49 | Tokenizes a text and maps tokens to token-ids 50 | """ 51 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 52 | 53 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 54 | """ 55 | Convert tokenized sentence in its embedding ids, segment ids and mask 56 | 57 | :param tokens: 58 | a tokenized sentence 59 | :param pad_seq_length: 60 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 61 | :return: embedding ids, segment ids and mask for the sentence 62 | """ 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | def save(self, output_path: str): 71 | self.albert.save_pretrained(output_path) 72 | self.tokenizer.save_pretrained(output_path) 73 | 74 | with open(os.path.join(output_path, 'sentence_albert_config.json'), 'w') as fOut: 75 | json.dump(self.get_config_dict(), fOut, indent=2) 76 | 77 | @staticmethod 78 | def load(input_path: str): 79 | with open(os.path.join(input_path, 'sentence_albert_config.json')) as fIn: 80 | config = json.load(fIn) 81 | return ALBERT(model_name_or_path=input_path, **config) 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import BertModel, BertTokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class BERT(nn.Module): 10 | """BERT model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 15 | super(BERT, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 510: 20 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 21 | max_seq_length = 510 22 | self.max_seq_length = max_seq_length 23 | 24 | if self.do_lower_case is not None: 25 | tokenizer_args['do_lower_case'] = do_lower_case 26 | 27 | self.bert = BertModel.from_pretrained(model_name_or_path, **model_args) 28 | self.tokenizer = BertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 29 | 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | output_states = self.bert(**features) 34 | output_tokens = output_states[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 37 | 38 | if len(output_states) > 2: 39 | features.update({'all_layer_embeddings': output_states[2]}) 40 | 41 | return features 42 | 43 | def get_word_embedding_dimension(self) -> int: 44 | return self.bert.config.hidden_size 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | """ 48 | Tokenizes a text and maps tokens to token-ids 49 | """ 50 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 51 | 52 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 53 | """ 54 | Convert tokenized sentence in its embedding ids, segment ids and mask 55 | 56 | :param tokens: 57 | a tokenized sentence 58 | :param pad_seq_length: 59 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 60 | :return: embedding ids, segment ids and mask for the sentence 61 | """ 62 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token 63 | 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | def save(self, output_path: str): 71 | self.bert.save_pretrained(output_path) 72 | self.tokenizer.save_pretrained(output_path) 73 | 74 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 75 | json.dump(self.get_config_dict(), fOut, indent=2) 76 | 77 | @staticmethod 78 | def load(input_path: str): 79 | with open(os.path.join(input_path, 'sentence_bert_config.json')) as fIn: 80 | config = json.load(fIn) 81 | return BERT(model_name_or_path=input_path, **config) 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /sentence_transformers/models/BERT_LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | import numpy as np 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | a = [] 31 | for i in token_embeddings: 32 | a.append(len(i)) 33 | features.update({'sentence_lengths': torch.tensor(a, dtype=torch.long)}) 34 | 35 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 36 | # print(sentence_lengths) 37 | #print(features['sentence_lengths']) 38 | 39 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 40 | packed = self.encoder(packed) 41 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 42 | #print(unpack.size()) 43 | features.update({'token_embeddings': unpack}) 44 | return features 45 | 46 | def get_word_embedding_dimension(self) -> int: 47 | return self.embeddings_dimension 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | raise NotImplementedError() 51 | 52 | def save(self, output_path: str): 53 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 54 | json.dump(self.get_config_dict(), fOut, indent=2) 55 | 56 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 57 | 58 | def get_config_dict(self): 59 | return {key: self.__dict__[key] for key in self.config_keys} 60 | 61 | @staticmethod 62 | def load(input_path: str): 63 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 64 | config = json.load(fIn) 65 | 66 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 67 | model = LSTM(**config) 68 | model.load_state_dict(weights) 69 | return model 70 | 71 | -------------------------------------------------------------------------------- /sentence_transformers/models/BoW.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | import numpy as np 9 | from .tokenizer import WhitespaceTokenizer 10 | 11 | class BoW(nn.Module): 12 | """Implements a Bag-of-Words (BoW) model to derive sentence embeddings. 13 | 14 | A weighting can be added to allow the generation of tf-idf vectors. The output vector has the size of the vocab. 
15 | """ 16 | 17 | def __init__(self, vocab: List[str], word_weights: Dict[str, float] = {}, unknown_word_weight: float = 1, cumulative_term_frequency: bool = True): 18 | super(BoW, self).__init__() 19 | vocab = list(set(vocab)) #Ensure vocab is unique 20 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight', 'cumulative_term_frequency'] 21 | self.vocab = vocab 22 | self.word_weights = word_weights 23 | self.unknown_word_weight = unknown_word_weight 24 | self.cumulative_term_frequency = cumulative_term_frequency 25 | 26 | #Maps wordIdx -> word weight 27 | self.weights = [] 28 | num_unknown_words = 0 29 | for word in vocab: 30 | weight = unknown_word_weight 31 | if word in word_weights: 32 | weight = word_weights[word] 33 | elif word.lower() in word_weights: 34 | weight = word_weights[word.lower()] 35 | else: 36 | num_unknown_words += 1 37 | self.weights.append(weight) 38 | 39 | logging.info("{} out of {} words without a weighting value. Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 40 | 41 | self.tokenizer = WhitespaceTokenizer(vocab, stop_words=set(), do_lower_case=False) 42 | self.sentence_embedding_dimension = len(vocab) 43 | 44 | 45 | def forward(self, features: Dict[str, Tensor]): 46 | #Nothing to do, everything is done in get_sentence_features 47 | return features 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | return self.tokenizer.tokenize(text) 51 | 52 | def get_sentence_embedding_dimension(self): 53 | return self.sentence_embedding_dimension 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | vector = np.zeros(self.get_sentence_embedding_dimension(), dtype=np.float32) 57 | for token in tokens: 58 | if self.cumulative_term_frequency: 59 | vector[token] += self.weights[token] 60 | else: 61 | vector[token] = self.weights[token] 62 | 63 | return {'sentence_embedding': torch.tensor([vector], dtype=torch.float)} 64 | 65 | def get_config_dict(self): 66 | return {key: self.__dict__[key] for key in self.config_keys} 67 | 68 | def save(self, output_path): 69 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 70 | json.dump(self.get_config_dict(), fOut, indent=2) 71 | 72 | @staticmethod 73 | def load(input_path): 74 | with open(os.path.join(input_path, 'config.json')) as fIn: 75 | config = json.load(fIn) 76 | 77 | return BoW(**config) -------------------------------------------------------------------------------- /sentence_transformers/models/CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | from typing import Union, Tuple, List, Iterable, Dict 4 | import logging 5 | import gzip 6 | from tqdm import tqdm 7 | import numpy as np 8 | import os 9 | import json 10 | from ..util import import_from_string, fullname, http_get 11 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 12 | 13 | 14 | class CNN(nn.Module): 15 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 16 | 17 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 18 | nn.Module.__init__(self) 19 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 20 | self.in_word_embedding_dimension = in_word_embedding_dimension 21 | self.out_channels = out_channels 22 | self.kernel_sizes = kernel_sizes 23 | 24 | self.embeddings_dimension = out_channels*len(kernel_sizes) 25 | self.convs = nn.ModuleList() 26 | 27 | 
in_channels = in_word_embedding_dimension 28 | for kernel_size in kernel_sizes: 29 | padding_size = int((kernel_size - 1) / 2) 30 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 31 | padding=padding_size) 32 | self.convs.append(conv) 33 | 34 | def forward(self, features): 35 | token_embeddings = features['token_embeddings'] 36 | 37 | token_embeddings = token_embeddings.transpose(1, -1) 38 | vectors = [conv(token_embeddings) for conv in self.convs] 39 | out = torch.cat(vectors, 1).transpose(1, -1) 40 | 41 | features.update({'token_embeddings': out}) 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.embeddings_dimension 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | raise NotImplementedError() 49 | 50 | def save(self, output_path: str): 51 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 52 | json.dump(self.get_config_dict(), fOut, indent=2) 53 | 54 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 55 | 56 | def get_config_dict(self): 57 | return {key: self.__dict__[key] for key in self.config_keys} 58 | 59 | @staticmethod 60 | def load(input_path: str): 61 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 62 | config = json.load(fIn) 63 | 64 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 65 | model = CNN(**config) 66 | model.load_state_dict(weights) 67 | return model 68 | 69 | -------------------------------------------------------------------------------- /sentence_transformers/models/CamemBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import CamembertModel, CamembertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | 11 | class CamemBERT(nn.Module): 12 | """CamemBERT model to generate token embeddings. 13 | 14 | Each token is mapped to an output vector from CamemBERT. 15 | """ 16 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 17 | super(CamemBERT, self).__init__() 18 | self.config_keys = ['max_seq_length', 'do_lower_case'] 19 | self.do_lower_case = do_lower_case 20 | 21 | if max_seq_length > 511: 22 | logging.warning("CamemBERT only allows a max_seq_length of 511 (514 with special tokens). 
Value will be set to 511") 23 | max_seq_length = 511 24 | self.max_seq_length = max_seq_length 25 | 26 | if self.do_lower_case is not None: 27 | tokenizer_args['do_lower_case'] = do_lower_case 28 | 29 | self.camembert = CamembertModel.from_pretrained(model_name_or_path, **model_args) 30 | self.tokenizer = CamembertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | #CamemBERT does not use token_type_ids 35 | output_states = self.camembert(**features) 36 | output_tokens = output_states[0] 37 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 38 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 39 | 40 | if self.camembert.config.output_hidden_states: 41 | hidden_states = output_states[2] 42 | features.update({'all_layer_embeddings': hidden_states}) 43 | 44 | return features 45 | 46 | def get_word_embedding_dimension(self) -> int: 47 | return self.camembert.config.hidden_size 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | """ 51 | Tokenizes a text and maps tokens to token-ids 52 | """ 53 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | """ 57 | Convert tokenized sentence in its embedding ids, segment ids and mask 58 | 59 | :param tokens: 60 | a tokenized sentence 61 | :param pad_seq_length: 62 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 63 | :return: embedding ids, segment ids and mask for the sentence 64 | """ 65 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 66 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 67 | 68 | def get_config_dict(self): 69 | return {key: self.__dict__[key] for key in self.config_keys} 70 | 71 | def save(self, output_path: str): 72 | self.camembert.save_pretrained(output_path) 73 | self.tokenizer.save_pretrained(output_path) 74 | 75 | with open(os.path.join(output_path, 'sentence_camembert_config.json'), 'w') as fOut: 76 | json.dump(self.get_config_dict(), fOut, indent=2) 77 | 78 | @staticmethod 79 | def load(input_path: str): 80 | with open(os.path.join(input_path, 'sentence_camembert_config.json')) as fIn: 81 | config = json.load(fIn) 82 | return CamemBERT(model_name_or_path=input_path, **config) 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /sentence_transformers/models/Dense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from torch import functional as F 5 | from typing import Union, Tuple, List, Iterable, Dict 6 | import os 7 | import json 8 | from ..util import fullname, import_from_string 9 | 10 | 11 | class Dense(nn.Module): 12 | """Feed-forward function with activiation function. 13 | 14 | This layer takes a fixed-sized sentence embedding and passes it through a feed-forward layer. Can be used to generate deep averaging networs (DAN). 
15 | """ 16 | def __init__(self, in_features, out_features, bias=True, activation_function=nn.Tanh()): 17 | super(Dense, self).__init__() 18 | self.in_features = in_features 19 | self.out_features = out_features 20 | self.bias = bias 21 | self.activation_function = activation_function 22 | self.linear = nn.Linear(in_features, out_features, bias=bias) 23 | 24 | def forward(self, features: Dict[str, Tensor]): 25 | features.update({'sentence_embedding': self.activation_function(self.linear(features['sentence_embedding']))}) 26 | return features 27 | 28 | def get_sentence_embedding_dimension(self) -> int: 29 | return self.out_features 30 | 31 | def save(self, output_path): 32 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 33 | json.dump({'in_features': self.in_features, 'out_features': self.out_features, 'bias': self.bias, 'activation_function': fullname(self.activation_function)}, fOut) 34 | 35 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 36 | 37 | @staticmethod 38 | def load(input_path): 39 | with open(os.path.join(input_path, 'config.json')) as fIn: 40 | config = json.load(fIn) 41 | 42 | config['activation_function'] = import_from_string(config['activation_function'])() 43 | model = Dense(**config) 44 | model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 45 | return model 46 | -------------------------------------------------------------------------------- /sentence_transformers/models/DistilBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import DistilBertModel, DistilBertTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class DistilBERT(nn.Module): 11 | """DistilBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from DistilBERT. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(DistilBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 510: 21 | logging.warning("BERT only allows a max_seq_length of 510 (512 with special tokens). 
Value will be set to 510") 22 | max_seq_length = 510 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.bert = DistilBertModel.from_pretrained(model_name_or_path, **model_args) 29 | self.tokenizer = DistilBertTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | # DistilBERT does not use token_type_ids 34 | output_states = self.bert(**features) 35 | output_tokens = output_states[0] 36 | 37 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 38 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 39 | 40 | if len(output_states) > 1: 41 | features.update({'all_layer_embeddings': output_states[1]}) 42 | 43 | return features 44 | 45 | def get_word_embedding_dimension(self) -> int: 46 | return self.bert.config.hidden_size 47 | 48 | def tokenize(self, text: str) -> List[int]: 49 | """ 50 | Tokenizes a text and maps tokens to token-ids 51 | """ 52 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 53 | 54 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 55 | """ 56 | Convert tokenized sentence in its embedding ids, segment ids and mask 57 | 58 | :param tokens: 59 | a tokenized sentence 60 | :param pad_seq_length: 61 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 62 | :return: embedding ids, segment ids and mask for the sentence 63 | """ 64 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens 65 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | def save(self, output_path: str): 71 | self.bert.save_pretrained(output_path) 72 | self.tokenizer.save_pretrained(output_path) 73 | 74 | with open(os.path.join(output_path, 'sentence_distilbert_config.json'), 'w') as fOut: 75 | json.dump(self.get_config_dict(), fOut, indent=2) 76 | 77 | @staticmethod 78 | def load(input_path: str): 79 | with open(os.path.join(input_path, 'sentence_distilbert_config.json')) as fIn: 80 | config = json.load(fIn) 81 | return DistilBERT(model_name_or_path=input_path, **config) 82 | 83 | 84 | 85 | 86 | 87 | 88 | -------------------------------------------------------------------------------- /sentence_transformers/models/LSTM.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from typing import List 4 | import os 5 | import json 6 | 7 | 8 | 9 | class LSTM(nn.Module): 10 | """ 11 | Bidirectional LSTM running over word embeddings. 
12 | """ 13 | def __init__(self, word_embedding_dimension: int, hidden_dim: int, num_layers: int = 1, dropout: float = 0, bidirectional: bool = True): 14 | nn.Module.__init__(self) 15 | self.config_keys = ['word_embedding_dimension', 'hidden_dim', 'num_layers', 'dropout', 'bidirectional'] 16 | self.word_embedding_dimension = word_embedding_dimension 17 | self.hidden_dim = hidden_dim 18 | self.num_layers = num_layers 19 | self.dropout = dropout 20 | self.bidirectional = bidirectional 21 | 22 | self.embeddings_dimension = hidden_dim 23 | if self.bidirectional: 24 | self.embeddings_dimension *= 2 25 | 26 | self.encoder = nn.LSTM(word_embedding_dimension, hidden_dim, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True) 27 | 28 | def forward(self, features): 29 | token_embeddings = features['token_embeddings'] 30 | sentence_lengths = torch.clamp(features['sentence_lengths'], min=1) 31 | 32 | packed = nn.utils.rnn.pack_padded_sequence(token_embeddings, sentence_lengths, batch_first=True, enforce_sorted=False) 33 | packed = self.encoder(packed) 34 | unpack = nn.utils.rnn.pad_packed_sequence(packed[0], batch_first=True)[0] 35 | features.update({'token_embeddings': unpack}) 36 | return features 37 | 38 | def get_word_embedding_dimension(self) -> int: 39 | return self.embeddings_dimension 40 | 41 | def tokenize(self, text: str) -> List[int]: 42 | raise NotImplementedError() 43 | 44 | def save(self, output_path: str): 45 | with open(os.path.join(output_path, 'lstm_config.json'), 'w') as fOut: 46 | json.dump(self.get_config_dict(), fOut, indent=2) 47 | 48 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 49 | 50 | def get_config_dict(self): 51 | return {key: self.__dict__[key] for key in self.config_keys} 52 | 53 | @staticmethod 54 | def load(input_path: str): 55 | with open(os.path.join(input_path, 'lstm_config.json'), 'r') as fIn: 56 | config = json.load(fIn) 57 | 58 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 59 | model = LSTM(**config) 60 | model.load_state_dict(weights) 61 | return model 62 | 63 | -------------------------------------------------------------------------------- /sentence_transformers/models/PhoBERT.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import RobertaModel 4 | from .tokenizer.PhoTokenizer import PhoTokenizer 5 | import json 6 | from typing import Union, Tuple, List, Dict, Optional 7 | import os 8 | import logging 9 | 10 | class PhoBERT(nn.Module): 11 | """PhoBERT model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from PhoBERT. 14 | """ 15 | def __init__(self, model_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = False, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(PhoBERT, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 256: 21 | logging.warning("PhoBERT only allows a max_seq_length of 256 (258 with special tokens). 
Value will be set to 256") 22 | max_seq_length = 256 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.phobert = RobertaModel.from_pretrained(model_path, **model_args) 29 | self.tokenizer = PhoTokenizer.load(model_path, **tokenizer_args) 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | output_states = self.phobert(**features) 35 | output_tokens = output_states[0] 36 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 37 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 38 | 39 | if len(output_states) > 2: 40 | features.update({'all_layer_embeddings': output_states[2]}) 41 | 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.phobert.config.hidden_size 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | """ 49 | Tokenizes a text and maps tokens to token-ids 50 | """ 51 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 52 | 53 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 54 | """ 55 | Convert tokenized sentence in its embedding ids, segment ids and mask 56 | 57 | :param tokens: 58 | a tokenized sentence 59 | :param pad_seq_length: 60 | the maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 61 | :return: embedding ids, segment ids and mask for the sentence 62 | """ 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path: str): 70 | self.phobert.save_pretrained(output_path) 71 | self.tokenizer.save(output_path) 72 | 73 | with open(os.path.join(output_path, 'sentence_phobert_config.json'), 'w') as fOut: 74 | json.dump(self.get_config_dict(), fOut, indent=2) 75 | 76 | @staticmethod 77 | def load(input_path: str): 78 | with open(os.path.join(input_path, 'sentence_phobert_config.json')) as fIn: 79 | config = json.load(fIn) 80 | return PhoBERT(model_path=input_path, **config) -------------------------------------------------------------------------------- /sentence_transformers/models/Pooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | 8 | 9 | class Pooling(nn.Module): 10 | """Performs pooling (max or mean) on the token embeddings. 11 | 12 | Using pooling, it generates from a variable sized sentence a fixed sized sentence embedding. This layer also allows to use the CLS token if it is returned by the underlying word embedding model. 13 | You can concatenate multiple poolings together. 
14 | """ 15 | def __init__(self, 16 | word_embedding_dimension: int, 17 | pooling_mode_cls_token: bool = False, 18 | pooling_mode_max_tokens: bool = False, 19 | pooling_mode_mean_tokens: bool = True, 20 | pooling_mode_mean_sqrt_len_tokens: bool = False, 21 | ): 22 | super(Pooling, self).__init__() 23 | 24 | self.config_keys = ['word_embedding_dimension', 'pooling_mode_cls_token', 'pooling_mode_mean_tokens', 'pooling_mode_max_tokens', 'pooling_mode_mean_sqrt_len_tokens'] 25 | 26 | self.word_embedding_dimension = word_embedding_dimension 27 | self.pooling_mode_cls_token = pooling_mode_cls_token 28 | self.pooling_mode_mean_tokens = pooling_mode_mean_tokens 29 | self.pooling_mode_max_tokens = pooling_mode_max_tokens 30 | self.pooling_mode_mean_sqrt_len_tokens = pooling_mode_mean_sqrt_len_tokens 31 | 32 | pooling_mode_multiplier = sum([pooling_mode_cls_token, pooling_mode_max_tokens, pooling_mode_mean_tokens, pooling_mode_mean_sqrt_len_tokens]) 33 | self.pooling_output_dimension = (pooling_mode_multiplier * word_embedding_dimension) 34 | 35 | def forward(self, features: Dict[str, Tensor]): 36 | token_embeddings = features['token_embeddings'] 37 | cls_token = features['cls_token_embeddings'] 38 | attention_mask = features['attention_mask'] 39 | 40 | ## Pooling strategy 41 | output_vectors = [] 42 | if self.pooling_mode_cls_token: 43 | output_vectors.append(cls_token) 44 | if self.pooling_mode_max_tokens: 45 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 46 | token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value 47 | max_over_time = torch.max(token_embeddings, 1)[0] 48 | output_vectors.append(max_over_time) 49 | if self.pooling_mode_mean_tokens or self.pooling_mode_mean_sqrt_len_tokens: 50 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 51 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 52 | 53 | #If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present 54 | if 'token_weights_sum' in features: 55 | sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size()) 56 | else: 57 | sum_mask = input_mask_expanded.sum(1) 58 | 59 | sum_mask = torch.clamp(sum_mask, min=1e-9) 60 | 61 | if self.pooling_mode_mean_tokens: 62 | output_vectors.append(sum_embeddings / sum_mask) 63 | if self.pooling_mode_mean_sqrt_len_tokens: 64 | output_vectors.append(sum_embeddings / torch.sqrt(sum_mask)) 65 | 66 | output_vector = torch.cat(output_vectors, 1) 67 | features.update({'sentence_embedding': output_vector}) 68 | return features 69 | 70 | def get_sentence_embedding_dimension(self): 71 | return self.pooling_output_dimension 72 | 73 | def get_config_dict(self): 74 | return {key: self.__dict__[key] for key in self.config_keys} 75 | 76 | def save(self, output_path): 77 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 78 | json.dump(self.get_config_dict(), fOut, indent=2) 79 | 80 | @staticmethod 81 | def load(input_path): 82 | with open(os.path.join(input_path, 'config.json')) as fIn: 83 | config = json.load(fIn) 84 | 85 | return Pooling(**config) 86 | -------------------------------------------------------------------------------- /sentence_transformers/models/RoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import RobertaModel, RobertaTokenizer 4 | import json 5 | from typing 
import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class RoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(RoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if max_seq_length > 512: 21 | logging.warning("RoBERTa only allows a max_seq_length of 512 (514 with special tokens). Value will be set to 512") 22 | max_seq_length = 512 23 | self.max_seq_length = max_seq_length 24 | 25 | if self.do_lower_case is not None: 26 | tokenizer_args['do_lower_case'] = do_lower_case 27 | 28 | self.roberta = RobertaModel.from_pretrained(model_name_or_path, **model_args) 29 | self.tokenizer = RobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | output_states = self.roberta(**features) 35 | output_tokens = output_states[0] 36 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 37 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 38 | 39 | if len(output_states) > 2: 40 | features.update({'all_layer_embeddings': output_states[2]}) 41 | 42 | return features 43 | 44 | def get_word_embedding_dimension(self) -> int: 45 | return self.roberta.config.hidden_size 46 | 47 | def tokenize(self, text: str) -> List[int]: 48 | """ 49 | Tokenizes a text and maps tokens to token-ids 50 | """ 51 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 52 | 53 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 54 | """ 55 | Convert tokenized sentence in its embedding ids, segment ids and mask 56 | 57 | :param tokens: 58 | a tokenized sentence 59 | :param pad_seq_length: 60 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 61 | :return: embedding ids, segment ids and mask for the sentence 62 | """ 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 ##Add Space for CLS + SEP token 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path: str): 70 | self.roberta.save_pretrained(output_path) 71 | self.tokenizer.save_pretrained(output_path) 72 | 73 | with open(os.path.join(output_path, 'sentence_roberta_config.json'), 'w') as fOut: 74 | json.dump(self.get_config_dict(), fOut, indent=2) 75 | 76 | @staticmethod 77 | def load(input_path: str): 78 | with open(os.path.join(input_path, 'sentence_roberta_config.json')) as fIn: 79 | config = json.load(fIn) 80 | return RoBERTa(model_name_or_path=input_path, **config) 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /sentence_transformers/models/T5.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import T5Model, T5Tokenizer 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class T5(nn.Module): 10 | """T5 model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from BERT. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, task_identifier: str = 'stsb sentence1: ', model_args: Dict = {}, tokenizer_args: Dict = {}): 15 | super(T5, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case', 'task_identifier'] 17 | self.do_lower_case = do_lower_case 18 | 19 | if max_seq_length > 512: 20 | logging.warning("T5 only allows a max_seq_length of 512. Value will be set to 512") 21 | max_seq_length = 512 22 | self.max_seq_length = max_seq_length 23 | 24 | if self.do_lower_case is not None: 25 | tokenizer_args['do_lower_case'] = do_lower_case 26 | 27 | self.t5model = T5Model.from_pretrained(model_name_or_path, **model_args) 28 | self.tokenizer = T5Tokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 29 | self.task_identifier = task_identifier 30 | 31 | def forward(self, features): 32 | """Returns token_embeddings, cls_token""" 33 | output_states = self.t5model.encoder(input_ids=features['input_ids'], attention_mask=features['attention_mask']) 34 | output_tokens = output_states[0] 35 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 36 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens}) 37 | 38 | if len(output_states) > 1: 39 | features.update({'all_layer_embeddings': output_states[1]}) 40 | 41 | return features 42 | 43 | def get_word_embedding_dimension(self) -> int: 44 | return self.t5model.config.hidden_size 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | """ 48 | Tokenizes a text and maps tokens to token-ids 49 | """ 50 | return self.tokenizer.encode(self.task_identifier+text) 51 | 52 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 53 | """ 54 | Convert tokenized sentence in its embedding ids, segment ids and mask 55 | 56 | :param tokens: 57 | a tokenized sentence 58 | :param pad_seq_length: 59 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 60 | :return: embedding ids, segment ids and mask for the sentence 61 | """ 62 | 63 | pad_seq_length = min(pad_seq_length, self.max_seq_length) 64 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 65 | 66 | def get_config_dict(self): 67 | return {key: self.__dict__[key] for key in self.config_keys} 68 | 69 | def save(self, output_path: str): 70 | self.t5model.save_pretrained(output_path) 71 | self.tokenizer.save_pretrained(output_path) 72 | 73 | with open(os.path.join(output_path, 'sentence_T5_config.json'), 'w') as fOut: 74 | json.dump(self.get_config_dict(), fOut, indent=2) 75 | 76 | @staticmethod 77 | def load(input_path: str): 78 | with open(os.path.join(input_path, 'sentence_T5_config.json')) as fIn: 79 | config = json.load(fIn) 80 | return T5(model_name_or_path=input_path, **config) 81 | 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /sentence_transformers/models/Transformer.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | from transformers import AutoModel, AutoTokenizer, AutoConfig 3 | import json 4 | from typing import List, Dict, Optional 5 | import os 6 | import numpy as np 7 | import logging 8 | 9 | class Transformer(nn.Module): 10 | """Huggingface AutoModel to generate token embeddings. 11 | Loads the correct class, e.g. BERT / RoBERTa etc. 12 | """ 13 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, model_args: Dict = {}, cache_dir: Optional[str] = None ): 14 | super(Transformer, self).__init__() 15 | self.config_keys = ['max_seq_length'] 16 | self.max_seq_length = max_seq_length 17 | 18 | config = AutoConfig.from_pretrained(model_name_or_path, **model_args, cache_dir=cache_dir) 19 | self.auto_model = AutoModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir) 20 | self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir) 21 | 22 | 23 | def forward(self, features): 24 | """Returns token_embeddings, cls_token""" 25 | output_states = self.auto_model(**features) 26 | output_tokens = output_states[0] 27 | 28 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 29 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 30 | 31 | if self.auto_model.config.output_hidden_states: 32 | all_layer_idx = 2 33 | if len(output_states) < 3: #Some models only output last_hidden_states and all_hidden_states 34 | all_layer_idx = 1 35 | 36 | hidden_states = output_states[all_layer_idx] 37 | features.update({'all_layer_embeddings': hidden_states}) 38 | 39 | return features 40 | 41 | def get_word_embedding_dimension(self) -> int: 42 | return self.auto_model.config.hidden_size 43 | 44 | def tokenize(self, text: str) -> List[int]: 45 | """ 46 | Tokenizes a text and maps tokens to token-ids 47 | """ 48 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 49 | 50 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 51 | """ 52 | Convert tokenized sentence in its embedding ids, segment ids and mask 53 | 54 | :param tokens: 55 | a tokenized sentence 56 | :param pad_seq_length: 57 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 58 | :return: embedding ids, segment ids and mask for the sentence 59 | """ 60 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 61 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 62 | 63 | def get_config_dict(self): 64 | return {key: self.__dict__[key] for key in self.config_keys} 65 | 66 | def save(self, output_path: str): 67 | self.auto_model.save_pretrained(output_path) 68 | self.tokenizer.save_pretrained(output_path) 69 | 70 | with open(os.path.join(output_path, 'sentence_bert_config.json'), 'w') as fOut: 71 | json.dump(self.get_config_dict(), fOut, indent=2) 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'sentence_bert_config.json')) as fIn: 76 | config = json.load(fIn) 77 | return Transformer(model_name_or_path=input_path, **config) 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /sentence_transformers/models/WeightedLayerPooling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import numpy as np 8 | import torch.nn.functional as F 9 | 10 | from sklearn.metrics.pairwise import cosine_similarity 11 | from sklearn.preprocessing import normalize 12 | 13 | class WeightedLayerPooling(nn.Module): 14 | """ 15 | Token embeddings are weighted mean of their different hidden layer representations 16 | """ 17 | def __init__(self, word_embedding_dimension, num_hidden_layers: int = 12, layer_start: int = 4, layer_weights = None): 18 | super(WeightedLayerPooling, self).__init__() 19 | self.config_keys = ['word_embedding_dimension', 'layer_start', 'num_hidden_layers'] 20 | self.word_embedding_dimension = word_embedding_dimension 21 | self.layer_start = layer_start 22 | self.num_hidden_layers = num_hidden_layers 23 | self.layer_weights = layer_weights if layer_weights is not None else nn.Parameter(torch.tensor([1] * (num_hidden_layers+1 - layer_start), dtype=torch.float)) 24 | 25 | def forward(self, features: Dict[str, Tensor]): 26 | ft_all_layers = features['all_layer_embeddings'] 27 | 28 | all_layer_embedding = torch.stack(ft_all_layers) 29 | all_layer_embedding = all_layer_embedding[self.layer_start:, :, :, :] # Start from 4th layers output 30 | 31 | weight_factor = self.layer_weights.unsqueeze(-1).unsqueeze(-1).unsqueeze(-1).expand(all_layer_embedding.size()) 32 | weighted_average = (weight_factor*all_layer_embedding).sum(dim=0) / self.layer_weights.sum() 33 | 34 | features.update({'token_embeddings': weighted_average}) 35 | return features 36 | 37 | def get_word_embedding_dimension(self): 38 | return self.word_embedding_dimension 39 | 40 | def get_config_dict(self): 41 | return {key: self.__dict__[key] for key in self.config_keys} 42 | 43 | def save(self, output_path): 44 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 45 | json.dump(self.get_config_dict(), fOut, indent=2) 46 | 47 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 48 | 49 | 50 | @staticmethod 51 | def load(input_path): 52 | with open(os.path.join(input_path, 'config.json')) as fIn: 53 | config = json.load(fIn) 54 | 55 | model = WeightedLayerPooling(**config) 56 | 
model.load_state_dict(torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))) 57 | return model 58 | -------------------------------------------------------------------------------- /sentence_transformers/models/WordWeights.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import Tensor 3 | from torch import nn 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import os 6 | import json 7 | import logging 8 | 9 | class WordWeights(nn.Module): 10 | """This model can weight word embeddings, for example, with idf-values.""" 11 | 12 | def __init__(self, vocab: List[str], word_weights: Dict[str, float], unknown_word_weight: float = 1): 13 | """ 14 | 15 | :param vocab: 16 | Vocabulary of the tokenizer 17 | :param word_weights: 18 | Mapping of tokens to a float weight value. Words embeddings are multiplied by this float value. Tokens in word_weights must not be equal to the vocab (can contain more or less values) 19 | :param unknown_word_weight: 20 | Weight for words in vocab, that do not appear in the word_weights lookup. These can be for example rare words in the vocab, where no weight exists. 21 | """ 22 | super(WordWeights, self).__init__() 23 | self.config_keys = ['vocab', 'word_weights', 'unknown_word_weight'] 24 | self.vocab = vocab 25 | self.word_weights = word_weights 26 | self.unknown_word_weight = unknown_word_weight 27 | 28 | weights = [] 29 | num_unknown_words = 0 30 | for word in vocab: 31 | weight = unknown_word_weight 32 | if word in word_weights: 33 | weight = word_weights[word] 34 | elif word.lower() in word_weights: 35 | weight = word_weights[word.lower()] 36 | else: 37 | num_unknown_words += 1 38 | weights.append(weight) 39 | 40 | logging.info("{} of {} words without a weighting value. 
Set weight to {}".format(num_unknown_words, len(vocab), unknown_word_weight)) 41 | 42 | self.emb_layer = nn.Embedding(len(vocab), 1) 43 | self.emb_layer.load_state_dict({'weight': torch.FloatTensor(weights).unsqueeze(1)}) 44 | 45 | 46 | def forward(self, features: Dict[str, Tensor]): 47 | attention_mask = features['attention_mask'] 48 | token_embeddings = features['token_embeddings'] 49 | 50 | #Compute a weight value for each token 51 | token_weights_raw = self.emb_layer(features['input_ids']).squeeze(-1) 52 | token_weights = token_weights_raw * attention_mask.float() 53 | token_weights_sum = torch.sum(token_weights, 1) 54 | 55 | #Multiply embedding by token weight value 56 | token_weights_expanded = token_weights.unsqueeze(-1).expand(token_embeddings.size()) 57 | token_embeddings = token_embeddings * token_weights_expanded 58 | 59 | features.update({'token_embeddings': token_embeddings, 'token_weights_sum': token_weights_sum}) 60 | return features 61 | 62 | def get_config_dict(self): 63 | return {key: self.__dict__[key] for key in self.config_keys} 64 | 65 | def save(self, output_path): 66 | with open(os.path.join(output_path, 'config.json'), 'w') as fOut: 67 | json.dump(self.get_config_dict(), fOut, indent=2) 68 | 69 | @staticmethod 70 | def load(input_path): 71 | with open(os.path.join(input_path, 'config.json')) as fIn: 72 | config = json.load(fIn) 73 | 74 | return WordWeights(**config) -------------------------------------------------------------------------------- /sentence_transformers/models/XLMRoBERTa.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLMRobertaModel, XLMRobertaTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | import logging 9 | 10 | class XLMRoBERTa(nn.Module): 11 | """RoBERTa model to generate token embeddings. 12 | 13 | Each token is mapped to an output vector from RoBERTa. 
14 | """ 15 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 16 | super(XLMRoBERTa, self).__init__() 17 | self.config_keys = ['max_seq_length', 'do_lower_case'] 18 | self.do_lower_case = do_lower_case 19 | 20 | if self.do_lower_case is not None: 21 | tokenizer_args['do_lower_case'] = do_lower_case 22 | 23 | self.xlm_roberta = XLMRobertaModel.from_pretrained(model_name_or_path, **model_args) 24 | self.tokenizer = XLMRobertaTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 25 | 26 | if max_seq_length > self.tokenizer.max_len_single_sentence: 27 | logging.warning("XLM-RoBERTa only allows a max_seq_length of "+self.tokenizer.max_len_single_sentence) 28 | max_seq_length = self.tokenizer.max_len_single_sentence 29 | self.max_seq_length = max_seq_length 30 | 31 | 32 | def forward(self, features): 33 | """Returns token_embeddings, cls_token""" 34 | #RoBERTa does not use token_type_ids 35 | output_states = self.xlm_roberta(**features) 36 | output_tokens = output_states[0] 37 | cls_tokens = output_tokens[:, 0, :] # CLS token is first token 38 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 39 | 40 | if self.xlm_roberta.config.output_hidden_states: 41 | hidden_states = output_states[2] 42 | features.update({'all_layer_embeddings': hidden_states}) 43 | 44 | return features 45 | 46 | def get_word_embedding_dimension(self) -> int: 47 | return self.xlm_roberta.config.hidden_size 48 | 49 | def tokenize(self, text: str) -> List[int]: 50 | """ 51 | Tokenizes a text and maps tokens to token-ids 52 | """ 53 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 54 | 55 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int): 56 | """ 57 | Convert tokenized sentence in its embedding ids, segment ids and mask 58 | 59 | :param tokens: 60 | a tokenized sentence 61 | :param pad_seq_length: 62 | the maximal length of the sequence. 
Cannot be greater than self.sentence_transformer_config.max_seq_length 63 | :return: embedding ids, segment ids and mask for the sentence 64 | """ 65 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 2 #Add space for special tokens 66 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 67 | 68 | def get_config_dict(self): 69 | return {key: self.__dict__[key] for key in self.config_keys} 70 | 71 | def save(self, output_path: str): 72 | self.xlm_roberta.save_pretrained(output_path) 73 | self.tokenizer.save_pretrained(output_path) 74 | 75 | with open(os.path.join(output_path, 'sentence_xlm-roberta_config.json'), 'w') as fOut: 76 | json.dump(self.get_config_dict(), fOut, indent=2) 77 | 78 | @staticmethod 79 | def load(input_path: str): 80 | with open(os.path.join(input_path, 'sentence_xlm-roberta_config.json')) as fIn: 81 | config = json.load(fIn) 82 | return XLMRoBERTa(model_name_or_path=input_path, **config) 83 | 84 | 85 | 86 | 87 | 88 | 89 | -------------------------------------------------------------------------------- /sentence_transformers/models/XLNet.py: -------------------------------------------------------------------------------- 1 | from torch import Tensor 2 | from torch import nn 3 | from transformers import XLNetModel, XLNetTokenizer 4 | import json 5 | from typing import Union, Tuple, List, Dict, Optional 6 | import os 7 | import numpy as np 8 | 9 | class XLNet(nn.Module): 10 | """XLNet model to generate token embeddings. 11 | 12 | Each token is mapped to an output vector from XLNet. 13 | """ 14 | def __init__(self, model_name_or_path: str, max_seq_length: int = 128, do_lower_case: Optional[bool] = None, model_args: Dict = {}, tokenizer_args: Dict = {}): 15 | super(XLNet, self).__init__() 16 | self.config_keys = ['max_seq_length', 'do_lower_case'] 17 | self.max_seq_length = max_seq_length 18 | self.do_lower_case = do_lower_case 19 | 20 | if self.do_lower_case is not None: 21 | tokenizer_args['do_lower_case'] = do_lower_case 22 | 23 | self.xlnet = XLNetModel.from_pretrained(model_name_or_path, **model_args) 24 | self.tokenizer = XLNetTokenizer.from_pretrained(model_name_or_path, **tokenizer_args) 25 | self.cls_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.cls_token])[0] 26 | self.sep_token_id = self.tokenizer.convert_tokens_to_ids([self.tokenizer.sep_token])[0] 27 | 28 | def forward(self, features): 29 | """Returns token_embeddings, cls_token""" 30 | output_states = self.xlnet(**features) 31 | output_tokens = output_states[0] 32 | cls_tokens = output_tokens[:, -1, :] # CLS token is the last token 33 | features.update({'token_embeddings': output_tokens, 'cls_token_embeddings': cls_tokens, 'attention_mask': features['attention_mask']}) 34 | 35 | if self.xlnet.config.output_hidden_states: 36 | hidden_states = output_states[2] 37 | features.update({'all_layer_embeddings': hidden_states}) 38 | 39 | return features 40 | 41 | def get_word_embedding_dimension(self) -> int: 42 | return self.xlnet.config.d_model 43 | 44 | def tokenize(self, text: str) -> List[int]: 45 | """ 46 | Tokenizes a text and maps tokens to token-ids 47 | """ 48 | return self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text)) 49 | 50 | def get_sentence_features(self, tokens: List[int], pad_seq_length: int) -> Dict[str, Tensor]: 51 | """ 52 | Convert tokenized sentence in its embedding ids, segment ids and mask 53 | 54 | :param tokens: 55 | a tokenized sentence 56 | :param pad_seq_length: 57 | the 
maximal length of the sequence. Cannot be greater than self.sentence_transformer_config.max_seq_length 58 | :return: embedding ids, segment ids and mask for the sentence 59 | """ 60 | pad_seq_length = min(pad_seq_length, self.max_seq_length) + 3 #Add space for special tokens 61 | return self.tokenizer.prepare_for_model(tokens, max_length=pad_seq_length, pad_to_max_length=True, return_tensors='pt') 62 | 63 | def get_config_dict(self): 64 | return {key: self.__dict__[key] for key in self.config_keys} 65 | 66 | def save(self, output_path: str): 67 | self.xlnet.save_pretrained(output_path) 68 | self.tokenizer.save_pretrained(output_path) 69 | 70 | with open(os.path.join(output_path, 'sentence_xlnet_config.json'), 'w') as fOut: 71 | json.dump(self.get_config_dict(), fOut, indent=2) 72 | 73 | @staticmethod 74 | def load(input_path: str): 75 | with open(os.path.join(input_path, 'sentence_xlnet_config.json')) as fIn: 76 | config = json.load(fIn) 77 | return XLNet(model_name_or_path=input_path, **config) 78 | 79 | 80 | 81 | 82 | 83 | 84 | -------------------------------------------------------------------------------- /sentence_transformers/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .ALBERT import ALBERT 2 | from .Transformer import Transformer 3 | from .BERT import BERT 4 | from .BoW import BoW 5 | from .CNN import CNN 6 | from .CamemBERT import CamemBERT 7 | from .Dense import Dense 8 | from .DistilBERT import DistilBERT 9 | from .LSTM import LSTM 10 | from .Pooling import Pooling 11 | from .RoBERTa import RoBERTa 12 | from .T5 import T5 13 | from .WKPooling import WKPooling 14 | from .WeightedLayerPooling import WeightedLayerPooling 15 | from .WordEmbeddings import WordEmbeddings 16 | from .WordWeights import WordWeights 17 | from .XLMRoBERTa import XLMRoBERTa 18 | from .XLNet import XLNet 19 | from .PhoBERT import PhoBERT -------------------------------------------------------------------------------- /sentence_transformers/models/proposed_CNN.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn, Tensor 3 | import torch.nn.functional as F 4 | from typing import Union, Tuple, List, Iterable, Dict 5 | import logging 6 | import gzip 7 | from tqdm import tqdm 8 | import numpy as np 9 | import os 10 | import json 11 | from ..util import import_from_string, fullname, http_get 12 | from .tokenizer import WordTokenizer, WhitespaceTokenizer 13 | 14 | 15 | class CNN(nn.Module): 16 | """CNN-layer with multiple kernel-sizes over the word embeddings""" 17 | 18 | def __init__(self, in_word_embedding_dimension: int, out_channels: int = 256, kernel_sizes: List[int] = [1, 3, 5]): 19 | nn.Module.__init__(self) 20 | self.config_keys = ['in_word_embedding_dimension', 'out_channels', 'kernel_sizes'] 21 | self.in_word_embedding_dimension = in_word_embedding_dimension 22 | self.out_channels = out_channels 23 | self.kernel_sizes = kernel_sizes 24 | 25 | self.embeddings_dimension = out_channels*len(kernel_sizes) 26 | self.convs = nn.ModuleList() 27 | self.pooling = nn.AvgPool1d(2, stride=2) 28 | in_channels = in_word_embedding_dimension 29 | for kernel_size in kernel_sizes: 30 | padding_size = int((kernel_size - 1) / 2) 31 | conv = nn.Conv1d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, 32 | padding=padding_size) 33 | 34 | self.convs.append(conv) 35 | 36 | def forward(self, features): 37 | token_embeddings = features['token_embeddings'] 38 | 39 | 
token_embeddings = token_embeddings.transpose(1, -1) 40 | vectors = [] 41 | for conv in self.convs: 42 | a = F.tanh(conv(token_embeddings)) 43 | a = a.transpose(1, -1) 44 | a = self.pooling(a) 45 | a = a.transpose(1, -1) 46 | vectors.append(a) 47 | #vectors = [self.pooling(conv(token_embeddings)) for conv in self.convs] 48 | # for i in vectors: 49 | # print(np.shape(i)) 50 | out = torch.cat(vectors, 1).transpose(1, -1) 51 | 52 | features.update({'token_embeddings': out}) 53 | return features 54 | 55 | def get_word_embedding_dimension(self) -> int: 56 | return self.embeddings_dimension 57 | 58 | def tokenize(self, text: str) -> List[int]: 59 | raise NotImplementedError() 60 | 61 | def save(self, output_path: str): 62 | with open(os.path.join(output_path, 'cnn_config.json'), 'w') as fOut: 63 | json.dump(self.get_config_dict(), fOut, indent=2) 64 | 65 | torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin')) 66 | 67 | def get_config_dict(self): 68 | return {key: self.__dict__[key] for key in self.config_keys} 69 | 70 | @staticmethod 71 | def load(input_path: str): 72 | with open(os.path.join(input_path, 'cnn_config.json'), 'r') as fIn: 73 | config = json.load(fIn) 74 | 75 | weights = torch.load(os.path.join(input_path, 'pytorch_model.bin')) 76 | model = CNN(**config) 77 | model.load_state_dict(weights) 78 | return model 79 | 80 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/PhraseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | import logging 7 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 8 | import nltk 9 | 10 | class PhraseTokenizer(WordTokenizer): 11 | """Tokenizes the text with respect to existent phrases in the vocab. 12 | 13 | This tokenizers respects phrases that are in the vocab. Phrases are separated with 'ngram_separator', for example, 14 | in Google News word2vec file, ngrams are separated with a _ like New_York. These phrases are detected in text and merged as one special token. (New York is the ... => [New_York, is, the]) 15 | """ 16 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, ngram_separator: str = "_", max_ngram_length: int = 5): 17 | self.stop_words = set(stop_words) 18 | self.do_lower_case = do_lower_case 19 | self.ngram_separator = ngram_separator 20 | self.max_ngram_length = max_ngram_length 21 | self.set_vocab(vocab) 22 | 23 | def get_vocab(self): 24 | return self.vocab 25 | 26 | def set_vocab(self, vocab: Iterable[str]): 27 | self.vocab = vocab 28 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 29 | 30 | # Check for ngram in vocab 31 | self.ngram_lookup = set() 32 | self.ngram_lengths = set() 33 | for word in vocab: 34 | 35 | if self.ngram_separator is not None and self.ngram_separator in word: 36 | # Sum words might me malformed in e.g. 
google news word2vec, containing two or more _ after each other 37 | ngram_count = word.count(self.ngram_separator) + 1 38 | if self.ngram_separator + self.ngram_separator not in word and ngram_count <= self.max_ngram_length: 39 | self.ngram_lookup.add(word) 40 | self.ngram_lengths.add(ngram_count) 41 | 42 | if len(vocab) > 0: 43 | logging.info("PhraseTokenizer - Phrase ngram lengths: {}".format(self.ngram_lengths)) 44 | logging.info("PhraseTokenizer - Num phrases: {}".format(len(self.ngram_lookup))) 45 | 46 | def tokenize(self, text: str) -> List[int]: 47 | tokens = nltk.word_tokenize(text, preserve_line=True) 48 | 49 | #phrase detection 50 | for ngram_len in sorted(self.ngram_lengths, reverse=True): 51 | idx = 0 52 | while idx <= len(tokens) - ngram_len: 53 | ngram = self.ngram_separator.join(tokens[idx:idx + ngram_len]) 54 | if ngram in self.ngram_lookup: 55 | tokens[idx:idx + ngram_len] = [ngram] 56 | elif ngram.lower() in self.ngram_lookup: 57 | tokens[idx:idx + ngram_len] = [ngram.lower()] 58 | idx += 1 59 | 60 | #Map tokens to idx, filter stop words 61 | tokens_filtered = [] 62 | for token in tokens: 63 | if token in self.stop_words: 64 | continue 65 | elif token in self.word2idx: 66 | tokens_filtered.append(self.word2idx[token]) 67 | continue 68 | 69 | token = token.lower() 70 | if token in self.stop_words: 71 | continue 72 | elif token in self.word2idx: 73 | tokens_filtered.append(self.word2idx[token]) 74 | continue 75 | 76 | token = token.strip(string.punctuation) 77 | if token in self.stop_words: 78 | continue 79 | elif len(token) > 0 and token in self.word2idx: 80 | tokens_filtered.append(self.word2idx[token]) 81 | continue 82 | 83 | return tokens_filtered 84 | 85 | def save(self, output_path: str): 86 | with open(os.path.join(output_path, 'phrasetokenizer_config.json'), 'w') as fOut: 87 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'ngram_separator': self.ngram_separator, 'max_ngram_length': self.max_ngram_length}, fOut) 88 | 89 | @staticmethod 90 | def load(input_path: str): 91 | with open(os.path.join(input_path, 'phrasetokenizer_config.json'), 'r') as fIn: 92 | config = json.load(fIn) 93 | 94 | return PhraseTokenizer(**config) 95 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/VietnameseTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | from vncorenlp import VnCoreNLP 8 | import operator 9 | from functools import reduce 10 | class VietnameseTokenizer(WordTokenizer): 11 | """ 12 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 13 | Punctuation are stripped from tokens. 
14 | """ 15 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False, vncorenlp_path = None): 16 | self.stop_words = set(stop_words) 17 | self.do_lower_case = do_lower_case 18 | self.set_vocab(vocab) 19 | self.vncorenlp_path = vncorenlp_path 20 | self.rdrsegmenter = VnCoreNLP(vncorenlp_path, annotators="wseg", max_heap_size='-Xmx1g') 21 | 22 | def get_vocab(self): 23 | return self.vocab 24 | 25 | def set_vocab(self, vocab: Iterable[str]): 26 | self.vocab = vocab 27 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 28 | 29 | def segment(self, text: str) -> str: 30 | ''' Segment words in text and then flat the list ''' 31 | segmented_word = self.rdrsegmenter.tokenize(text) 32 | return ' '.join(reduce(operator.concat, segmented_word)) 33 | 34 | def tokenize(self, text: str) -> List[int]: 35 | #segment words in text 36 | text = self.segment(text) 37 | 38 | if self.do_lower_case: 39 | text = text.lower() 40 | 41 | tokens = text.split() 42 | 43 | tokens_filtered = [] 44 | for token in tokens: 45 | if token in self.stop_words: 46 | continue 47 | elif token in self.word2idx: 48 | tokens_filtered.append(self.word2idx[token]) 49 | continue 50 | 51 | token = token.strip(string.punctuation) 52 | if token in self.stop_words: 53 | continue 54 | elif len(token) > 0 and token in self.word2idx: 55 | tokens_filtered.append(self.word2idx[token]) 56 | continue 57 | 58 | token = token.lower() 59 | if token in self.stop_words: 60 | continue 61 | elif token in self.word2idx: 62 | tokens_filtered.append(self.word2idx[token]) 63 | continue 64 | tokens_filtered.append(0) 65 | return tokens_filtered 66 | 67 | def save(self, output_path: str): 68 | with open(os.path.join(output_path, 'VietnameseTokenizer_config.json'), 'w') as fOut: 69 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case, 'vncorenlp_path': self.vncorenlp_path}, fOut) 70 | 71 | @staticmethod 72 | def load(input_path: str): 73 | with open(os.path.join(input_path, 'VietnameseTokenizer_config.json'), 'r') as fIn: 74 | config = json.load(fIn) 75 | 76 | return VietnameseTokenizer(**config) 77 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/WhitespaceTokenizer.py: -------------------------------------------------------------------------------- 1 | from typing import Union, Tuple, List, Iterable, Dict 2 | import collections 3 | import string 4 | import os 5 | import json 6 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 7 | 8 | class WhitespaceTokenizer(WordTokenizer): 9 | """ 10 | Simple and fast white-space tokenizer. Splits sentence based on white spaces. 11 | Punctuation are stripped from tokens. 
12 | """ 13 | def __init__(self, vocab: Iterable[str] = [], stop_words: Iterable[str] = ENGLISH_STOP_WORDS, do_lower_case: bool = False): 14 | self.stop_words = set(stop_words) 15 | self.do_lower_case = do_lower_case 16 | self.set_vocab(vocab) 17 | 18 | def get_vocab(self): 19 | return self.vocab 20 | 21 | def set_vocab(self, vocab: Iterable[str]): 22 | self.vocab = vocab 23 | self.word2idx = collections.OrderedDict([(word, idx) for idx, word in enumerate(vocab)]) 24 | 25 | # def tokenize(self, text: str) -> List[int]: 26 | # if self.do_lower_case: 27 | # text = text.lower() 28 | 29 | # tokens = text.split() 30 | 31 | # tokens_filtered = [] 32 | # for token in tokens: 33 | # if token in self.stop_words: 34 | # continue 35 | # elif token in self.word2idx: 36 | # tokens_filtered.append(self.word2idx[token]) 37 | # continue 38 | 39 | # token = token.strip(string.punctuation) 40 | # if token in self.stop_words: 41 | # continue 42 | # elif len(token) > 0 and token in self.word2idx: 43 | # tokens_filtered.append(self.word2idx[token]) 44 | # continue 45 | 46 | # token = token.lower() 47 | # if token in self.stop_words: 48 | # continue 49 | # elif token in self.word2idx: 50 | # tokens_filtered.append(self.word2idx[token]) 51 | # continue 52 | 53 | # return tokens_filtered 54 | def tokenize(self, text: str) -> List[int]: 55 | if self.do_lower_case: 56 | text = text.lower() 57 | for stopword in self.stop_words: 58 | if stopword in text: 59 | text = text.replace(stopword,"") 60 | text = text.strip() 61 | tokens = text.split() 62 | 63 | tokens_filtered = [] 64 | for token in tokens: 65 | if token in self.word2idx: 66 | tokens_filtered.append(self.word2idx[token]) 67 | continue 68 | #string.punctuation == '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~' 69 | token = token.strip(string.punctuation) 70 | if token in self.stop_words: 71 | continue 72 | elif len(token) > 0 and token in self.word2idx: 73 | tokens_filtered.append(self.word2idx[token]) 74 | continue 75 | 76 | token = token.lower() 77 | if token in self.stop_words: 78 | continue 79 | elif token in self.word2idx: 80 | tokens_filtered.append(self.word2idx[token]) 81 | continue 82 | tokens_filtered.append(0) 83 | return tokens_filtered 84 | def save(self, output_path: str): 85 | with open(os.path.join(output_path, 'whitespacetokenizer_config.json'), 'w') as fOut: 86 | json.dump({'vocab': list(self.word2idx.keys()), 'stop_words': list(self.stop_words), 'do_lower_case': self.do_lower_case}, fOut) 87 | 88 | @staticmethod 89 | def load(input_path: str): 90 | with open(os.path.join(input_path, 'whitespacetokenizer_config.json'), 'r') as fIn: 91 | config = json.load(fIn) 92 | 93 | return WhitespaceTokenizer(**config) 94 | -------------------------------------------------------------------------------- /sentence_transformers/models/tokenizer/__init__.py: -------------------------------------------------------------------------------- 1 | from .WordTokenizer import WordTokenizer, ENGLISH_STOP_WORDS 2 | from .WhitespaceTokenizer import WhitespaceTokenizer 3 | from .WhitespaceTokenizer import WhitespaceTokenizer 4 | from .PhoTokenizer import PhoTokenizer -------------------------------------------------------------------------------- /sentence_transformers/readers/InputExample.py: -------------------------------------------------------------------------------- 1 | from typing import Union, List 2 | 3 | 4 | class InputExample: 5 | """ 6 | Structure for one input example with texts, the label and a unique id 7 | """ 8 | def __init__(self, guid: str, texts: List[str], 
label: Union[int, float]): 9 | """ 10 | Creates one InputExample with the given texts, guid and label 11 | 12 | str.strip() is called on both texts. 13 | 14 | :param guid 15 | id for the example 16 | :param texts 17 | the texts for the example 18 | :param label 19 | the label for the example 20 | """ 21 | self.guid = guid 22 | self.texts = [text.strip() for text in texts] 23 | self.label = label 24 | -------------------------------------------------------------------------------- /sentence_transformers/readers/LabelSentenceReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class LabelSentenceReader: 7 | """Reads in a file that has at least two columns: a label and a sentence. 8 | This reader can for example be used with the BatchHardTripletLoss. 9 | Maps labels automatically to integers""" 10 | def __init__(self, folder, label_col_idx=0, sentence_col_idx=1): 11 | self.folder = folder 12 | self.label_map = {} 13 | self.label_col_idx = label_col_idx 14 | self.sentence_col_idx = sentence_col_idx 15 | 16 | def get_examples(self, filename, max_examples=0): 17 | examples = [] 18 | 19 | id = 0 20 | for line in open(os.path.join(self.folder, filename), encoding="utf-8"): 21 | splits = line.strip().split('\t') 22 | label = splits[self.label_col_idx] 23 | sentence = splits[self.sentence_col_idx] 24 | 25 | if label not in self.label_map: 26 | self.label_map[label] = len(self.label_map) 27 | 28 | label_id = self.label_map[label] 29 | guid = "%s-%d" % (filename, id) 30 | id += 1 31 | examples.append(InputExample(guid=guid, texts=[sentence], label=label_id)) 32 | 33 | if 0 < max_examples <= id: 34 | break 35 | 36 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/NLIDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | 7 | class NLIDataReader(object): 8 | """ 9 | Reads in the Stanford NLI dataset and the MultiGenre NLI dataset 10 | """ 11 | def __init__(self, dataset_folder): 12 | self.dataset_folder = dataset_folder 13 | 14 | def get_examples(self, filename, max_examples=0): 15 | """ 16 | data_splits specified which data split to use (train, dev, test). 17 | Expects that self.dataset_folder contains the files s1.$data_split.gz, s2.$data_split.gz, 18 | labels.$data_split.gz, e.g., for the train split, s1.train.gz, s2.train.gz, labels.train.gz 19 | """ 20 | s1 = gzip.open(os.path.join(self.dataset_folder, 's1.' + filename), 21 | mode="rt", encoding="utf-8").readlines() 22 | s2 = gzip.open(os.path.join(self.dataset_folder, 's2.' + filename), 23 | mode="rt", encoding="utf-8").readlines() 24 | labels = gzip.open(os.path.join(self.dataset_folder, 'labels.' 
+ filename), 25 | mode="rt", encoding="utf-8").readlines() 26 | 27 | examples = [] 28 | id = 0 29 | for sentence_a, sentence_b, label in zip(s1, s2, labels): 30 | guid = "%s-%d" % (filename, id) 31 | id += 1 32 | examples.append(InputExample(guid=guid, texts=[sentence_a, sentence_b], label=self.map_label(label))) 33 | 34 | if 0 < max_examples <= len(examples): 35 | break 36 | 37 | return examples 38 | 39 | @staticmethod 40 | def get_labels(): 41 | return {"contradiction": 0, "entailment": 1, "neutral": 2} 42 | 43 | def get_num_labels(self): 44 | return len(self.get_labels()) 45 | 46 | def map_label(self, label): 47 | return self.get_labels()[label.strip().lower()] -------------------------------------------------------------------------------- /sentence_transformers/readers/PairedFilesReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | import gzip 6 | 7 | class PairedFilesReader(object): 8 | """ 9 | Reads in the a Pair Dataset, split in two files 10 | """ 11 | def __init__(self, filepaths): 12 | self.filepaths = filepaths 13 | 14 | 15 | def get_examples(self, max_examples=0): 16 | """ 17 | """ 18 | fIns = [] 19 | for filepath in self.filepaths: 20 | fIn = gzip.open(filepath, 'rt', encoding='utf-8') if filepath.endswith('.gz') else open(filepath, encoding='utf-8') 21 | fIns.append(fIn) 22 | 23 | examples = [] 24 | 25 | eof = False 26 | while not eof: 27 | texts = [] 28 | for fIn in fIns: 29 | text = fIn.readline() 30 | 31 | if text == '': 32 | eof = True 33 | break 34 | 35 | texts.append(text) 36 | 37 | if eof: 38 | break; 39 | 40 | examples.append(InputExample(guid=str(len(examples)), texts=texts, label=1)) 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/STSDataReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class STSDataReader: 7 | """ 8 | Reads in the STS dataset. Each line contains two sentences (s1_col_idx, s2_col_idx) and one label (score_col_idx) 9 | 10 | Default values expects a tab seperated file with the first & second column the sentence pair and third column the score (0...1). Default config normalizes scores from 0...5 to 0...1 11 | """ 12 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, score_col_idx=2, delimiter="\t", 13 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 14 | self.dataset_folder = dataset_folder 15 | self.score_col_idx = score_col_idx 16 | self.s1_col_idx = s1_col_idx 17 | self.s2_col_idx = s2_col_idx 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | self.normalize_scores = normalize_scores 21 | self.min_score = min_score 22 | self.max_score = max_score 23 | 24 | def get_examples(self, filename, max_examples=0): 25 | """ 26 | filename specified which data split to use (train.csv, dev.csv, test.csv). 
27 | """ 28 | filepath = os.path.join(self.dataset_folder, filename) 29 | with gzip.open(filepath, 'rt', encoding='utf8') if filename.endswith('.gz') else open(filepath, encoding="utf-8") as fIn: 30 | data = csv.reader(fIn, delimiter=self.delimiter, quoting=self.quoting) 31 | examples = [] 32 | for id, row in enumerate(data): 33 | score = float(row[self.score_col_idx]) 34 | if self.normalize_scores: # Normalize to a 0...1 value 35 | score = (score - self.min_score) / (self.max_score - self.min_score) 36 | 37 | s1 = row[self.s1_col_idx] 38 | s2 = row[self.s2_col_idx] 39 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2], label=score)) 40 | 41 | if max_examples > 0 and len(examples) >= max_examples: 42 | break 43 | 44 | return examples 45 | 46 | class STSBenchmarkDataReader(STSDataReader): 47 | """ 48 | Reader especially for the STS benchmark dataset. There, the sentences are in column 5 and 6, the score is in column 4. 49 | Scores are normalized from 0...5 to 0...1 50 | """ 51 | def __init__(self, dataset_folder, s1_col_idx=5, s2_col_idx=6, score_col_idx=4, delimiter="\t", 52 | quoting=csv.QUOTE_NONE, normalize_scores=True, min_score=0, max_score=5): 53 | super().__init__(dataset_folder=dataset_folder, s1_col_idx=s1_col_idx, s2_col_idx=s2_col_idx, score_col_idx=score_col_idx, delimiter="\t", 54 | quoting=quoting, normalize_scores=normalize_scores, min_score=min_score, max_score=max_score) -------------------------------------------------------------------------------- /sentence_transformers/readers/TripletReader.py: -------------------------------------------------------------------------------- 1 | from . import InputExample 2 | import csv 3 | import gzip 4 | import os 5 | 6 | class TripletReader(object): 7 | """ 8 | Reads in the a Triplet Dataset: Each line contains (at least) 3 columns, one anchor column (s1), 9 | one positive example (s2) and one negative example (s3) 10 | """ 11 | def __init__(self, dataset_folder, s1_col_idx=0, s2_col_idx=1, s3_col_idx=2, has_header=False, delimiter="\t", 12 | quoting=csv.QUOTE_NONE): 13 | self.dataset_folder = dataset_folder 14 | self.s1_col_idx = s1_col_idx 15 | self.s2_col_idx = s2_col_idx 16 | self.s3_col_idx = s3_col_idx 17 | self.has_header = has_header 18 | self.delimiter = delimiter 19 | self.quoting = quoting 20 | 21 | def get_examples(self, filename, max_examples=0): 22 | """ 23 | 24 | """ 25 | data = csv.reader(open(os.path.join(self.dataset_folder, filename), encoding="utf-8"), delimiter=self.delimiter, 26 | quoting=self.quoting) 27 | examples = [] 28 | if self.has_header: 29 | next(data) 30 | 31 | for id, row in enumerate(data): 32 | s1 = row[self.s1_col_idx] 33 | s2 = row[self.s2_col_idx] 34 | s3 = row[self.s3_col_idx] 35 | 36 | examples.append(InputExample(guid=filename+str(id), texts=[s1, s2, s3], label=1)) 37 | if max_examples > 0 and len(examples) >= max_examples: 38 | break 39 | 40 | return examples -------------------------------------------------------------------------------- /sentence_transformers/readers/__init__.py: -------------------------------------------------------------------------------- 1 | from .InputExample import InputExample 2 | from .LabelSentenceReader import LabelSentenceReader 3 | from .NLIDataReader import NLIDataReader 4 | from .STSDataReader import STSDataReader, STSBenchmarkDataReader 5 | from .TripletReader import TripletReader -------------------------------------------------------------------------------- /sentence_transformers/util.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | from torch import Tensor, device 3 | from typing import Tuple, List 4 | from tqdm import tqdm 5 | import sys 6 | import importlib 7 | 8 | 9 | def batch_to_device(batch, target_device: device): 10 | """ 11 | send a batch to a device 12 | 13 | :param batch: 14 | :param target_device: 15 | :return: the batch sent to the device 16 | """ 17 | features = batch['features'] 18 | for paired_sentence_idx in range(len(features)): 19 | for feature_name in features[paired_sentence_idx]: 20 | features[paired_sentence_idx][feature_name] = features[paired_sentence_idx][feature_name].to(target_device) 21 | 22 | labels = batch['labels'].to(target_device) 23 | return features, labels 24 | 25 | 26 | 27 | def http_get(url, path): 28 | with open(path, "wb") as file_binary: 29 | req = requests.get(url, stream=True) 30 | if req.status_code != 200: 31 | print("Exception when trying to download {}. Response {}".format(url, req.status_code), file=sys.stderr) 32 | req.raise_for_status() 33 | 34 | content_length = req.headers.get('Content-Length') 35 | total = int(content_length) if content_length is not None else None 36 | progress = tqdm(unit="B", total=total, unit_scale=True) 37 | for chunk in req.iter_content(chunk_size=1024): 38 | if chunk: # filter out keep-alive new chunks 39 | progress.update(len(chunk)) 40 | file_binary.write(chunk) 41 | progress.close() 42 | 43 | 44 | def fullname(o): 45 | # o.__module__ + "." + o.__class__.__qualname__ is an example in 46 | # this context of H.L. Mencken's "neat, plausible, and wrong." 47 | # Python makes no guarantees as to whether the __module__ special 48 | # attribute is defined, so we take a more circumspect approach. 49 | # Alas, the module name is explicitly excluded from __qualname__ 50 | # in Python 3. 51 | 52 | module = o.__class__.__module__ 53 | if module is None or module == str.__class__.__module__: 54 | return o.__class__.__name__ # Avoid reporting __builtin__ 55 | else: 56 | return module + '.' + o.__class__.__name__ 57 | 58 | def import_from_string(dotted_path): 59 | """ 60 | Import a dotted module path and return the attribute/class designated by the 61 | last name in the path. Raise ImportError if the import failed. 
62 | """ 63 | try: 64 | module_path, class_name = dotted_path.rsplit('.', 1) 65 | except ValueError: 66 | msg = "%s doesn't look like a module path" % dotted_path 67 | raise ImportError(msg) 68 | 69 | module = importlib.import_module(module_path) 70 | 71 | try: 72 | return getattr(module, class_name) 73 | except AttributeError: 74 | msg = 'Module "%s" does not define a "%s" attribute/class' % (module_path, class_name) 75 | raise ImportError(msg) -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | description-file = README.md -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | with open("README.md", mode="r", encoding="utf-8") as readme_file: 4 | readme = readme_file.read() 5 | 6 | 7 | 8 | setup( 9 | name="sentence-transformers", 10 | version="0.2.6.1", 11 | author="Nils Reimers, Gregor Geigle", 12 | author_email="Rnils@web.de", 13 | description="Sentence Embeddings using BERT / RoBERTa / XLNet", 14 | long_description=readme, 15 | long_description_content_type="text/markdown", 16 | license="Apache License 2.0", 17 | url="https://github.com/UKPLab/sentence-transformers", 18 | download_url="https://github.com/UKPLab/sentence-transformers/archive/v0.2.6.zip", 19 | packages=find_packages(), 20 | install_requires=[ 21 | 'transformers>=2.8.0', 22 | 'tqdm', 23 | 'torch>=1.0.1', 24 | 'numpy', 25 | 'scikit-learn', 26 | 'scipy', 27 | 'nltk' 28 | ], 29 | classifiers=[ 30 | "Development Status :: 4 - Beta", 31 | "Intended Audience :: Science/Research", 32 | "License :: OSI Approved :: Apache Software License", 33 | "Programming Language :: Python :: 3.6", 34 | "Topic :: Scientific/Engineering :: Artificial Intelligence" 35 | ], 36 | keywords="Transformer Networks BERT XLNet sentence embedding PyTorch NLP deep learning" 37 | ) 38 | -------------------------------------------------------------------------------- /tests/test_pretrained_stsb.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests that the pretrained models produce the correct scores on the STSbenchmark dataset 3 | """ 4 | from torch.utils.data import DataLoader 5 | from sentence_transformers import SentenceTransformer, SentencesDataset, LoggingHandler 6 | from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator 7 | from sentence_transformers.readers import STSDataReader 8 | import unittest 9 | 10 | 11 | class PretrainedSTSbTest(unittest.TestCase): 12 | def pretrained_model_score(self, model_name, expected_score): 13 | model = SentenceTransformer(model_name) 14 | sts_reader = STSDataReader('../examples/datasets/stsbenchmark') 15 | 16 | test_data = SentencesDataset(examples=sts_reader.get_examples("sts-test.csv"), model=model) 17 | test_dataloader = DataLoader(test_data, shuffle=False, batch_size=8) 18 | evaluator = EmbeddingSimilarityEvaluator(test_dataloader) 19 | 20 | score = model.evaluate(evaluator)*100 21 | print(model_name, "{:.2f} vs. 
exp: {:.2f}".format(score, expected_score)) 22 | assert abs(score-expected_score) < 0.1 23 | 24 | def test_bert_base(self): 25 | self.pretrained_model_score('bert-base-nli-mean-tokens', 77.12) 26 | self.pretrained_model_score('bert-base-nli-max-tokens', 77.21) 27 | self.pretrained_model_score('bert-base-nli-cls-token', 76.30) 28 | self.pretrained_model_score('bert-base-nli-stsb-mean-tokens', 85.14) 29 | 30 | 31 | def test_bert_large(self): 32 | self.pretrained_model_score('bert-large-nli-mean-tokens', 79.19) 33 | self.pretrained_model_score('bert-large-nli-max-tokens', 78.41) 34 | self.pretrained_model_score('bert-large-nli-cls-token', 78.29) 35 | self.pretrained_model_score('bert-large-nli-stsb-mean-tokens', 85.29) 36 | 37 | def test_roberta(self): 38 | self.pretrained_model_score('roberta-base-nli-mean-tokens', 77.49) 39 | self.pretrained_model_score('roberta-large-nli-mean-tokens', 78.69) 40 | self.pretrained_model_score('roberta-base-nli-stsb-mean-tokens', 85.44) 41 | self.pretrained_model_score('roberta-large-nli-stsb-mean-tokens', 86.39) 42 | 43 | def test_distilbert(self): 44 | self.pretrained_model_score('distilbert-base-nli-mean-tokens', 76.97) 45 | self.pretrained_model_score('distilbert-base-nli-stsb-mean-tokens', 84.38) 46 | 47 | def test_multiling(self): 48 | self.pretrained_model_score('distiluse-base-multilingual-cased', 80.62) 49 | 50 | if "__main__" == __name__: 51 | unittest.main() -------------------------------------------------------------------------------- /tests/test_wkpooling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Tests the WKPooling model 3 | """ 4 | import unittest 5 | from sentence_transformers import models, SentenceTransformer 6 | import scipy 7 | 8 | class WKPoolingTest(unittest.TestCase): 9 | sentence_pairs = [ 10 | ('Can you please. Send me the attachment.', 'I dont know. 
Where is it?'), 11 | ('My name is Paul', 'My name is Lisa'), 12 | ('The cat sits on the mat while the dog is barking', 'London is the capital of England'), 13 | ('BERT (Devlin et al., 2018) and RoBERTa (Liu et al., 2019) has set a new state-of-the-art performance on sentence-pair regression tasks like semantic textual similarity (STS)', 'However, it requires that both sentences are fed into the network, which causes a massive computational overhead: Finding the most similar pair in a collection of 10,000 sentences requires about 50 million inference computations (~65 hours) with BERT.'), 14 | ('In deep learning, each level learns to transform its input data into a slightly more abstract and composite representation.', 'London is considered to be one of the world\'s most important global cities.') 15 | ] 16 | 17 | def test_bert_wkpooling(self): 18 | word_embedding_model = models.BERT('bert-base-uncased', model_args={'output_hidden_states': True}) 19 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 20 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 21 | scores = [0.6906377742193329, 22 | 0.9910573945907297, 23 | 0.8395676755959804, 24 | 0.7569234597143, 25 | 0.8324509121875274] 26 | 27 | for sentences, score in zip(WKPoolingTest.sentence_pairs, scores): 28 | embedding = model.encode(sentences, convert_to_numpy=True) 29 | 30 | similarity = 1-scipy.spatial.distance.cosine(embedding[0], embedding[1]) 31 | assert abs(similarity-score) < 0.01 32 | 33 | def test_roberta_wkpooling(self): 34 | word_embedding_model = models.Auto('roberta-base', model_args={'output_hidden_states': True}) 35 | pooling_model = models.WKPooling(word_embedding_model.get_word_embedding_dimension()) 36 | model = SentenceTransformer(modules=[word_embedding_model, pooling_model]) 37 | scores = [0.9594874382019043, 38 | 0.9928674697875977, 39 | 0.9241214990615845, 40 | 0.9309519529342651, 41 | 0.9506515264511108] 42 | 43 | for sentences, score in zip(WKPoolingTest.sentence_pairs, scores): 44 | embedding = model.encode(sentences, convert_to_numpy=True) 45 | 46 | similarity = 1-scipy.spatial.distance.cosine(embedding[0], embedding[1]) 47 | assert abs(similarity-score) < 0.01 48 | 49 | 50 | if "__main__" == __name__: 51 | unittest.main() --------------------------------------------------------------------------------
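
For orientation, the modules listed in this repository (token-embedding models, pooling layers, readers, losses and evaluators) are meant to be composed into a single SentenceTransformer and fine-tuned end-to-end. Below is a minimal, untested sketch of such a pipeline. It assumes the v0.2.x-style API that the files above define (models.Transformer, models.Pooling, SentencesDataset, NLIDataReader, STSBenchmarkDataReader, losses.SoftmaxLoss, EmbeddingSimilarityEvaluator, SentenceTransformer.fit); the checkpoint id, data folders, file names and hyperparameters are illustrative placeholders, not values prescribed by this repository.

# Minimal training sketch (assumptions: checkpoint id, data paths and hyperparameters are placeholders).
from torch.utils.data import DataLoader

from sentence_transformers import SentenceTransformer, SentencesDataset, losses, models
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_transformers.readers import NLIDataReader, STSBenchmarkDataReader

# Token-embedding module: the Huggingface AutoModel wrapper from models/Transformer.py.
# The checkpoint name is only an example; any Huggingface model id could be used here.
word_embedding_model = models.Transformer('bert-base-multilingual-cased', max_seq_length=128)

# Mean pooling turns the per-token embeddings into one fixed-size sentence vector.
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension(),
                               pooling_mode_mean_tokens=True,
                               pooling_mode_cls_token=False,
                               pooling_mode_max_tokens=False)

model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# NLIDataReader expects s1.<split>.gz, s2.<split>.gz and labels.<split>.gz in the given folder.
nli_reader = NLIDataReader('DataNLI')
train_data = SentencesDataset(examples=nli_reader.get_examples('train.gz'), model=model)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=16)

# Softmax classification over the NLI labels (contradiction / entailment / neutral).
train_loss = losses.SoftmaxLoss(model=model,
                                sentence_embedding_dimension=model.get_sentence_embedding_dimension(),
                                num_labels=nli_reader.get_num_labels())

# Dev-time evaluation: correlation of embedding cosine similarity with STS gold scores.
# The STS folder and file name are placeholders; the reader's column defaults may need adjusting.
sts_reader = STSBenchmarkDataReader('examples/datasets/stsbenchmark')
dev_data = SentencesDataset(examples=sts_reader.get_examples('sts-dev_vi.csv'), model=model)
dev_dataloader = DataLoader(dev_data, shuffle=False, batch_size=16)
evaluator = EmbeddingSimilarityEvaluator(dev_dataloader)

model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=1,
          evaluation_steps=1000,
          warmup_steps=1000,
          output_path='output/training_nli_sketch')

# After training, sentences can be embedded directly and compared, e.g. with cosine similarity.
embeddings = model.encode(['The first sentence.', 'The second sentence.'])

The choice of mean pooling mirrors the SBERT setup used by the other modules above; swapping in WKPooling or WeightedLayerPooling only requires loading the transformer with output_hidden_states enabled, as the test files in this repository do.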