├── .gitignore ├── .idea ├── .gitignore ├── Contrastive-Tension.iml ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml └── vcs.xml ├── ContrastiveTension ├── ContrastiveTensionModel.py ├── Inference.py ├── Training.py └── __init__.py ├── ExampleBatchInference.py ├── ExampleTraining.py ├── LICENSE.txt ├── README.md ├── STSData ├── Dataset.py ├── Evaluation.py ├── __init__.py ├── sts-dev.csv ├── sts-test.csv └── sts-train.csv └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /.idea/.gitignore: -------------------------------------------------------------------------------- 1 | # Default ignored files 2 | /shelf/ 3 | /workspace.xml 4 | # Datasource local storage ignored files 5 | /dataSources/ 6 | /dataSources.local.xml 7 | # Editor-based HTTP Client requests 8 | /httpRequests/ 9 | -------------------------------------------------------------------------------- /.idea/Contrastive-Tension.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/profiles_settings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /ContrastiveTension/ContrastiveTensionModel.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Please note that this code is optimized towards comprehension and not performance. 
3 | ''' 4 | 5 | from tensorflow.python.keras import backend as K 6 | import tensorflow as tf 7 | import numpy as np 8 | import tqdm 9 | 10 | 11 | class ContrastivTensionModel(tf.keras.Model): 12 | 13 | def __init__(self, model1, model2, *args, **kwargs): 14 | super().__init__(*args, **kwargs) 15 | 16 | self.model1 = model1 17 | self.model2 = model2 18 | self.loss = tf.losses.BinaryCrossentropy(from_logits=True) 19 | self.nonReductionLoss = lambda y, x: K.binary_crossentropy(y, x, from_logits=True) 20 | 21 | def generateSingleEmbedding(self, model, inData, training=False): 22 | inds, att = inData 23 | embs = model({'input_ids': inds, 'attention_mask': att}, training=training)[0] 24 | outAtt = tf.cast(att, tf.float32) 25 | sampleLength = tf.reduce_sum(outAtt, axis=-1, keepdims=True) 26 | maskedEmbs = embs * tf.expand_dims(outAtt, axis=-1) 27 | return tf.reduce_sum(maskedEmbs, axis=1) / tf.cast(sampleLength, tf.float32) 28 | 29 | @tf.function 30 | def call(self, inputs, training=False, mask=None): 31 | emb1 = self.generateSingleEmbedding(self.model1, inputs, training) 32 | emb2 = self.generateSingleEmbedding(self.model2, inputs, training) 33 | return emb1, emb2 34 | 35 | @tf.function 36 | def predictandCompareSents(self, x1, x2, training=False): 37 | emb1 = self.generateSingleEmbedding(self.model1, x1, training) 38 | emb2 = self.generateSingleEmbedding(self.model2, x2, training) 39 | return self.compareSents(emb1, emb2), emb1, emb2 40 | 41 | def compareSents(self, emb1, emb2): 42 | return tf.reduce_sum(emb1 * emb2, axis=-1) 43 | 44 | def extractPositiveAndNegativeLoss(self, predValues, labels): 45 | losses = self.nonReductionLoss(labels, predValues) 46 | pLoss = tf.reduce_sum(losses * labels) 47 | nLoss = tf.reduce_sum(losses * (labels - 1) * -1) 48 | return pLoss, nLoss 49 | 50 | @tf.function 51 | def predictAndUpdate(self, inds1, att1, inds2, att2, labels): 52 | with tf.GradientTape() as tape: 53 | predValues, emb1, emb2 = self.predictandCompareSents((inds1, att1), (inds2, att2), 54 | training=False) 55 | 56 | cosineLoss = self.loss(labels, predValues) 57 | grad = tape.gradient(cosineLoss, self.trainable_variables) 58 | self.optimizer.apply_gradients(zip(grad, self.trainable_variables)) 59 | 60 | # Extract loss for Positive/Negative examples for later examination 61 | pLoss, nLoss = self.extractPositiveAndNegativeLoss(predValues, labels) 62 | 63 | return cosineLoss, pLoss, nLoss 64 | 65 | def fit(self, x=None, y=None, batch_size=None, epochs=1, verbose=1, callbacks=None, validation_split=0., 66 | validation_data=None, shuffle=True, class_weight=None, sample_weight=None, initial_epoch=0, 67 | steps_per_epoch=None, validation_steps=None, validation_freq=1, max_queue_size=10, workers=1, 68 | use_multiprocessing=False, **kwargs): 69 | contrastiveLosses, pLosses, nLosses = [], [], [] 70 | f = lambda x, i: x[i:i + batch_size] 71 | inds1, att1, inds2, att2 = x 72 | 73 | for i in tqdm.tqdm(range(0, len(inds1), batch_size)): 74 | # Main Training Loop 75 | batchInd1, batchInd2, batchAtt1, batchAtt2, = f(inds1, i), f(inds2, i), f(att1, i), f(att2, i) 76 | cLoss, pLoss, nLoss = self.predictAndUpdate(batchInd1, batchAtt1, batchInd2, batchAtt2, f(y, i)) 77 | 78 | # Convert Losses into numpy format, instead of TF tensors, for faster np operations 79 | contrastiveLosses.append(cLoss.numpy()) 80 | pLosses.append(pLoss.numpy()) 81 | nLosses.append(nLoss.numpy()) 82 | 83 | return np.mean(contrastiveLosses), np.mean(pLosses), np.mean(nLosses) 84 | 
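The class above is the heart of the CT objective: the two independent encoders each mean-pool their token embeddings, the dot product of the two sentence embeddings is treated as a logit, and binary cross-entropy pulls identical sentence pairs together while pushing random pairs apart. A minimal, self-contained sketch of that objective, with random tensors standing in for real encoder outputs purely for illustration:

```python
import tensorflow as tf

# Stand-ins for the mean-pooled outputs of model1 and model2 (batch of 8 sentence pairs, dim 16)
emb1 = tf.random.normal((8, 16))
emb2 = tf.random.normal((8, 16))
# As in the CT setup: 1 identical (positive) pair followed by 7 random (negative) pairs
labels = tf.constant([1.0] + [0.0] * 7)

# Dot-product similarity used directly as a logit, scored with binary cross-entropy
logits = tf.reduce_sum(emb1 * emb2, axis=-1)
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)(labels, logits)
print(float(loss))
```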
-------------------------------------------------------------------------------- /ContrastiveTension/Inference.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def applyOutMaskToEmbeddings(embs, mask): 5 | # Solve potential Padding issues. This can be removed if sufficient precautions are taken 6 | if (embs.shape[1] > mask.shape[1]): 7 | mask = np.concatenate( 8 | [mask, np.zeros((embs.shape[0], embs.shape[1] - mask.shape[1]))], axis=1) 9 | if (embs.shape[1] < mask.shape[1]): 10 | mask = mask[:, :embs.shape[1]] 11 | 12 | # Mask the output before calculating the final sentence embedding by taking the mean 13 | maskedEmbs = embs * np.expand_dims(mask, axis=-1) 14 | summedEmbs = np.sum(maskedEmbs, axis=1) 15 | lengths = np.sum(mask, axis=-1, keepdims=True) 16 | return summedEmbs / lengths 17 | 18 | 19 | def tensorflowGenerateSentenceEmbeddings(model, tokenizer, texts): 20 | inData = tokenizer(texts, padding=True, return_tensors='tf') 21 | inputIds, attentionMask = inData['input_ids'], inData['attention_mask'] 22 | 23 | preMaskEmbeddings = model(input_ids=inputIds, attention_mask=attentionMask)[0] 24 | return applyOutMaskToEmbeddings(preMaskEmbeddings, attentionMask) 25 | 26 | def torchGenerateSentenceEmbeddings(model, tokenizer, texts): 27 | inData = tokenizer(texts, padding=True, return_tensors='pt') 28 | inputIds, attentionMask = inData['input_ids'], inData['attention_mask'] 29 | 30 | f = lambda x: np.array(x.detach()) 31 | preMaskEmbeddings = model(input_ids=inputIds, attention_mask=attentionMask)[0] 32 | return applyOutMaskToEmbeddings(f(preMaskEmbeddings), f(attentionMask)) -------------------------------------------------------------------------------- /ContrastiveTension/Training.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Please note that this code is optimized towards comprehension and not performance. 
3 | ''' 4 | 5 | from ContrastiveTension import ContrastiveTensionModel 6 | import tensorflow as tf 7 | import transformers 8 | import numpy as np 9 | 10 | 11 | def _setOptimizerWithStepWiseLinearLearningRate(model): 12 | # Optimizer hyperparameters according to the original CT paper 13 | boundaries, values = [500, 1000, 1500, 2000], [1e-5, 8e-6, 6e-6, 4e-6, 2e-6] 14 | learning_rate_fn = tf.optimizers.schedules.PiecewiseConstantDecay(boundaries, values) 15 | model.optimizer = tf.optimizers.RMSprop(learning_rate_fn) 16 | 17 | 18 | def _orderBatchSamples(p_ids, p_att, n_ids, n_att, negativeK): 19 | inds_1, att_1, inds_2, att_2, labels, negCounter = [], [], [], [], [], 0 20 | for ind, att in zip(p_ids, p_att): 21 | # Add the positive sample for both models 22 | inds_1.append(ind) 23 | att_1.append(att) 24 | inds_2.append(ind) 25 | att_2.append(att) 26 | 27 | # Add Negative Samples for Model-1 28 | inds_1.extend(n_ids[negCounter:negCounter + negativeK]) 29 | att_1.extend(n_att[negCounter:negCounter + negativeK]) 30 | # Add Negative Samples for Model-2 31 | inds_2.extend(n_ids[negCounter + negativeK:negCounter + negativeK * 2]) 32 | att_2.extend(n_att[negCounter + negativeK:negCounter + negativeK * 2]) 33 | 34 | negCounter += negativeK * 2 35 | labels.extend([1] + [0] * negativeK) # Generate fitting labels 36 | 37 | f = lambda x: tf.convert_to_tensor(x, dtype=tf.int32) 38 | g = lambda x: tf.convert_to_tensor(x, dtype=tf.float32) 39 | return (f(inds_1), f(att_1), f(inds_2), f(att_2)), g(labels) 40 | 41 | 42 | def generateTrainingSamples(tokenizer, data, num_batches, negative_k=7, max_sent_len=200): 43 | pos_sents = [data[i] for i in np.random.randint(0, len(data), num_batches)] 44 | neg_sents = [data[i] for i in np.random.choice(range(len(data)), num_batches * negative_k * 2)] 45 | enc_sents = tokenizer.batch_encode_plus(pos_sents + neg_sents, add_special_tokens=True, max_length=max_sent_len, 46 | truncation=True, padding=True) 47 | 48 | ids, att = enc_sents['input_ids'], enc_sents['attention_mask'] 49 | p_ids, p_att = ids[:len(pos_sents)], att[:len(pos_sents)] 50 | n_ids, n_att = ids[len(pos_sents):], att[len(pos_sents):] 51 | 52 | return _orderBatchSamples(p_ids, p_att, n_ids, n_att, negative_k) 53 | 54 | 55 | def tensorflowContrastiveTension(model_name, tokenizer_name, corpus_data, evalFunc=None, epochs=10, 56 | batch_size=16, negative_k=7, fetch_size=500, max_sent_len=75): 57 | ''' 58 | :param model_name: Huggingface model path 59 | :param tokenizer_name: Huggingface tokenizer path 60 | :param corpus_data: List of strings 61 | :param evalFunc: Function expecting a model, tokenizer and a batch size 62 | :param epochs: (int) number of epochs 63 | :param batch_size: (int) 64 | :param negative_k: Number of negative pairs, per positive par. 
batch_size=16 & negative_k=7 -> 14 neg + 2 pos 65 | :param fetch_size: (int) number of batches that are included per "epoch" 66 | :param max_sent_len: (int) truncation length during tokenization 67 | ''' 68 | fetch_size = fetch_size * int(batch_size / (negative_k + 1)) 69 | 70 | tokenizer = transformers.AutoTokenizer.from_pretrained(tokenizer_name) 71 | m1 = transformers.TFAutoModel.from_pretrained(model_name, from_pt=True) 72 | m2 = transformers.TFAutoModel.from_pretrained(model_name, from_pt=True) 73 | model = ContrastiveTensionModel.ContrastivTensionModel(m1, m2) 74 | _setOptimizerWithStepWiseLinearLearningRate(model) 75 | 76 | bestEvalScore = 0 77 | for e in range(epochs): 78 | inData, labels = generateTrainingSamples(tokenizer, corpus_data, num_batches=fetch_size, 79 | negative_k=negative_k, 80 | max_sent_len=max_sent_len, 81 | ) 82 | 83 | loss, pLoss, nLoss = model.fit(inData, labels, batch_size=batch_size) 84 | print("Loss: {} pLoss: {} nLoss: {}".format(loss, pLoss, nLoss)) 85 | 86 | # Perform Evaluation of the models between Epochs, if we have passed an evaluation function 87 | if (evalFunc != None): 88 | eval_1 = evalFunc(model.model1, tokenizer, batch_size) 89 | print("Evaluation Scores Model-1:", eval_1) 90 | eval_2 = evalFunc(model.model2, tokenizer, batch_size) 91 | print("Evaluation Scores Model-2:", eval_2) 92 | 93 | # Save the model which performed best on the evaluation data 94 | eval_1, eval_2 = eval_1['Spearman'], eval_2['Spearman'] 95 | topEval = (eval_1, m1) if eval_1 > eval_2 else (eval_2, m2) 96 | if(topEval[0] >= bestEvalScore): 97 | bestEvalScore = topEval[0] 98 | print("New Best Eval Score:", bestEvalScore) 99 | topEval[-1].save_pretrained("Top-CT-Eval") 100 | 101 | -------------------------------------------------------------------------------- /ContrastiveTension/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreddeFrallan/Contrastive-Tension/75293a883344b389bfe726a3d43d6b48b29b55fb/ContrastiveTension/__init__.py -------------------------------------------------------------------------------- /ExampleBatchInference.py: -------------------------------------------------------------------------------- 1 | from ContrastiveTension import Inference 2 | import transformers 3 | 4 | random_texts = ["This is the first sentence within this example", 5 | "Here is a second sentence for this example, that is a bit different", 6 | "Did you know that all polar bears are left handed?", 7 | "It is a fact that every polar bear prefers the use of their left hand", 8 | ] 9 | 10 | def torchExample(): 11 | model = transformers.AutoModel.from_pretrained('Contrastive-Tension/BERT-Large-CT-STSb') 12 | tokenizer = transformers.AutoTokenizer.from_pretrained('Contrastive-Tension/BERT-Large-CT-STSb') 13 | 14 | embeddings = Inference.torchGenerateSentenceEmbeddings(model, tokenizer, random_texts) 15 | print(embeddings.shape) 16 | 17 | def tensorflowExample(): 18 | model = transformers.TFAutoModel.from_pretrained('Contrastive-Tension/RoBerta-Large-CT-STSb') 19 | tokenizer = transformers.AutoTokenizer.from_pretrained('Contrastive-Tension/RoBerta-Large-CT-STSb') 20 | 21 | embeddings = Inference.tensorflowGenerateSentenceEmbeddings(model, tokenizer, random_texts) 22 | print(embeddings.shape) 23 | 24 | 25 | if __name__ == '__main__': 26 | #torchExample() 27 | tensorflowExample() -------------------------------------------------------------------------------- /ExampleTraining.py: 
-------------------------------------------------------------------------------- 1 | from ContrastiveTension import Training 2 | from STSData import Evaluation 3 | import numpy as np 4 | import tqdm 5 | 6 | 7 | def generateDummyCorpus(num_sentences=10000, max_words_per_sent=15, max_word_len=8): 8 | import string 9 | 10 | sents = [] 11 | letters = [c for c in string.ascii_lowercase] 12 | words_in_sents = np.random.randint(1, max_words_per_sent, num_sentences) 13 | for num_words in tqdm.tqdm(words_in_sents, desc='Generating Dummy Dataset'): 14 | word_lens = np.random.randint(1, max_word_len, num_words) 15 | sents.append(' '.join([''.join(np.random.choice(letters, n)) for n in word_lens])) 16 | return sents 17 | 18 | if __name__ == '__main__': 19 | # Hyperparamters from the Original CT paper 20 | batch_size = 16 21 | negative_k = 7 22 | fetch_size = 500 23 | max_sent_len = 75 24 | 25 | model_name = "distilbert-base-uncased" 26 | tokenizer_name = "distilbert-base-uncased" 27 | # This generates a list of randomly generated sentences, change this into whatever text corpus you wish to use. 28 | corpus = generateDummyCorpus() 29 | 30 | eval_func = lambda m, t, b: Evaluation.evaluateSTS(m, t, b, use_dev_data=True) 31 | Training.tensorflowContrastiveTension(model_name, tokenizer_name, corpus, evalFunc=eval_func, 32 | batch_size=batch_size, negative_k=negative_k, fetch_size=fetch_size, 33 | max_sent_len=max_sent_len) 34 | -------------------------------------------------------------------------------- /LICENSE.txt: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2018 Othneil Drew 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |
2 |
3 | # Contrastive Tension
4 | ### Semantic Sentence Embeddings
5 |
6 |
7 | Published Paper
8 | ·
9 | Huggingface Models
10 | ·
11 | Report Bug
12 |
13 |
14 | 15 | 16 | 17 | ## Overview 18 | This is the official code accompanying the paper [Semantic Re-Tuning via Contrastive Tension](https://openreview.net/pdf?id=Ov_sMNau-PF).
19 | The paper was accepted at ICLR-2021 and the official reviews and responses can be found at [OpenReview](https://openreview.net/forum?id=Ov_sMNau-PF). 20 | 21 | Contrastive Tension (CT) is a fully self-supervised algorithm for re-tuning already pre-trained transformer Language Models, and achieves State-Of-The-Art (SOTA) sentence embeddings for Semantic Textual Similarity (STS). Hence, all that is required is a pre-trained model and a modestly large text corpus. The results presented in the paper sampled text data from Wikipedia. 22 | 23 | This repository contains: 24 | * Tensorflow 2 implementation of the CT algorithm 25 | * State of the art pre-trained STS models 26 | * Tensorflow 2 inference code 27 | * PyTorch inference code 28 | 29 | ### Requirements 30 | While it is possible that other versions work equally well, we have worked with the following: 31 | 32 | * Python = 3.6.9 33 | * Transformers = 4.1.1 34 | 35 | 36 | ## Usage 37 | All the models and tokenizers are available via the Huggingface interface, and can be loaded for both Tensorflow and PyTorch: 38 | ```python 39 | import transformers 40 | 41 | tokenizer = transformers.AutoTokenizer.from_pretrained('Contrastive-Tension/RoBerta-Large-CT-STSb') 42 | 43 | TF_model = transformers.TFAutoModel.from_pretrained('Contrastive-Tension/RoBerta-Large-CT-STSb') 44 | PT_model = transformers.AutoModel.from_pretrained('Contrastive-Tension/RoBerta-Large-CT-STSb') 45 | ``` 46 | 47 | ### Inference 48 | To perform inference with the pre-trained models (or other Huggingface models), please see the script [ExampleBatchInference.py](ExampleBatchInference.py).
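As a quick illustration, the snippet below is a minimal sketch of the masked mean pooling that the script performs (it mirrors `Inference.tensorflowGenerateSentenceEmbeddings`; the model name is the same one used in the Usage example above, and any Huggingface encoder can be substituted):

```python
import numpy as np
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained('Contrastive-Tension/RoBerta-Large-CT-STSb')
model = transformers.TFAutoModel.from_pretrained('Contrastive-Tension/RoBerta-Large-CT-STSb')

texts = ["A first example sentence", "And a second, slightly longer example sentence"]
enc = tokenizer(texts, padding=True, return_tensors='tf')

# Token-level embeddings from the encoder, plus the padding mask from the tokenizer
token_embs = model(input_ids=enc['input_ids'], attention_mask=enc['attention_mask'])[0].numpy()
mask = enc['attention_mask'].numpy()

# Zero out padding positions before averaging, so padding tokens do not dilute the embedding
masked = token_embs * np.expand_dims(mask, axis=-1)
sentence_embs = masked.sum(axis=1) / mask.sum(axis=1, keepdims=True)
print(sentence_embs.shape)  # (2, hidden_size)
```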
49 | The most important thing to remember when running inference is to apply the attention_mask to the batch output vectors before mean pooling, as is done in the example script and in the sketch above. 50 | 51 | ### CT Training 52 | To run CT on your own models and text data, see [ExampleTraining.py](ExampleTraining.py) for a comprehensive example. This file currently creates a dummy corpus of random text; simply replace it with whatever corpus you like. 53 | 54 | 55 | ## Pre-trained Models 56 | Note that these models are not trained with the exact hyperparameters disclosed in the original CT paper. Rather, the parameters are from a short follow-up paper currently under review, which once again pushes the SOTA. 57 | 58 | All evaluation is done using the [SentEval](https://github.com/facebookresearch/SentEval) framework, and reports (Pearson / Spearman) correlations. 59 | ### Unsupervised / Zero-Shot 60 | As both the training of BERT and CT itself are fully self-supervised, the models tuned only with CT require no labeled data whatsoever.
61 | The NLI models, however, are first fine-tuned towards a natural language inference task, which requires labeled data. 62 | 63 | | Model | Avg Unsupervised STS | STS-b | #Parameters | 64 | | ----------------------------------|:-----: |:-----: |:-----: | 65 | | **Fully Unsupervised** | | | | 66 | | [BERT-Distil-CT](https://huggingface.co/Contrastive-Tension/BERT-Distil-CT) | 75.12 / 75.04 | 78.63 / 77.91 | 66 M | 67 | | [BERT-Base-CT](https://huggingface.co/Contrastive-Tension/BERT-Base-CT) | 73.55 / 73.36 | 75.49 / 73.31 | 108 M | 68 | | [BERT-Large-CT](https://huggingface.co/Contrastive-Tension/BERT-Large-CT) | 77.12 / 76.93 | 80.75 / 79.82 | 334 M | 69 | | **Using NLI Data** | | | | 70 | | [BERT-Distil-NLI-CT](https://huggingface.co/Contrastive-Tension/BERT-Distil-NLI-CT) | 76.65 / 76.63 | 79.74 / 81.01 | 66 M | 71 | | [BERT-Base-NLI-CT](https://huggingface.co/Contrastive-Tension/BERT-Base-NLI-CT) | 76.05 / 76.28 | 79.98 / 81.47 | 108 M | 72 | | [BERT-Large-NLI-CT](https://huggingface.co/Contrastive-Tension/BERT-Large-NLI-CT) | 77.42 / 77.41 | 80.92 / 81.66 | 334 M | 73 | 74 | ### Supervised 75 | These models are fine-tuned directly with STS data, using a modified version of the supervised training objective proposed by [S-BERT](https://arxiv.org/abs/1908.10084).
76 | To our knowledge, our RoBerta-Large-CT-STSb is the current SOTA model for STS via sentence embeddings. 77 | 78 | | Model | STS-b | #Parameters | 79 | | ----------------------------------|:-----: |:-----: | 80 | | [BERT-Distil-CT-STSb](https://huggingface.co/Contrastive-Tension/BERT-Distil-CT-STSb) | 84.85 / 85.46 | 66 M | 81 | | [BERT-Base-CT-STSb](https://huggingface.co/Contrastive-Tension/BERT-Base-CT-STSb) | 85.31 / 85.76 | 108 M | 82 | | [BERT-Large-CT-STSb](https://huggingface.co/Contrastive-Tension/BERT-Large-CT-STSb) | 85.86 / 86.47 | 334 M | 83 | | [RoBerta-Large-CT-STSb](https://huggingface.co/Contrastive-Tension/RoBerta-Large-CT-STSb) | 87.56 / 88.42 | 334 M | 84 | 85 | ### Other Languages 86 | 87 | | Model | Language | #Parameters | 88 | | ----------------------------------|:-----: |:-----: | 89 | | [BERT-Base-Swe-CT-STSb](https://huggingface.co/Contrastive-Tension/BERT-Base-Swe-CT-STSb/tree/main) | Swedish | 108 M | 90 | 91 | 92 | 93 | 94 | ## License 95 | Distributed under the MIT License. See `LICENSE` for more information. 96 | 97 | 98 | 99 | ## Contact 100 | If you have questions regarding the paper, please consider creating a comment via the official [OpenReview submission](https://openreview.net/forum?id=Ov_sMNau-PF).
101 | If you have questions regarding the code or otherwise related to this Github page, please open an [issue](https://github.com/FreddeFrallan/Contrastive-Tension/issues). 102 | 103 | For other purposes, feel free to contact me directly at: Fredrik.Carlsson@ri.se 104 | 105 | 106 | ## Acknowledgements 107 | * [SentEval](https://github.com/facebookresearch/SentEval) 108 | * [Huggingface](https://huggingface.co/) 109 | * [Sentence-Transformer](https://github.com/UKPLab/sentence-transformers) 110 | * [Best Readme Template](https://github.com/othneildrew/Best-README-Template) 111 | 112 | 113 | 114 | 115 | [contributors-shield]: https://img.shields.io/github/contributors/othneildrew/Best-README-Template.svg?style=for-the-badge 116 | [contributors-url]: https://github.com/othneildrew/Best-README-Template/graphs/contributors 117 | [forks-shield]: https://img.shields.io/github/forks/othneildrew/Best-README-Template.svg?style=for-the-badge 118 | [forks-url]: https://github.com/othneildrew/Best-README-Template/network/members 119 | [stars-shield]: https://img.shields.io/github/stars/othneildrew/Best-README-Template.svg?style=for-the-badge 120 | [stars-url]: https://github.com/othneildrew/Best-README-Template/stargazers 121 | [issues-shield]: https://img.shields.io/github/issues/othneildrew/Best-README-Template.svg?style=for-the-badge 122 | [issues-url]: https://github.com/othneildrew/Best-README-Template/issues 123 | [license-shield]: https://img.shields.io/github/license/othneildrew/Best-README-Template.svg?style=for-the-badge 124 | [license-url]: https://github.com/othneildrew/Best-README-Template/blob/master/LICENSE.txt 125 | [linkedin-shield]: https://img.shields.io/badge/-LinkedIn-black.svg?style=for-the-badge&logo=linkedin&colorB=555 126 | [linkedin-url]: https://linkedin.com/in/othneildrew 127 | [product-screenshot]: images/screenshot.png 128 | -------------------------------------------------------------------------------- /STSData/Dataset.py: -------------------------------------------------------------------------------- 1 | def getUniqueCaptions(dataset, sortOnSize=True): 2 | captions = {} 3 | for s1, s2, _ in dataset: 4 | captions[s1] = 1 5 | captions[s2] = 1 6 | 7 | def getUniqueCaptions(dataset, sortOnSize=True): 8 | captions = set() 9 | for s1, s2, _ in dataset: 10 | captions.add(s1) 11 | captions.add(s2) 12 | if sortOnSize: 13 | return sorted(captions, key=len) 14 | else: 15 | return list(captions) 16 | 17 | 18 | def _readAndLoadSTSBData(name): 19 | data = [] 20 | with open("STSData/{}".format(name), 'r') as fp: 21 | for line in fp.readlines(): 22 | genre, filename, year, ids, score, sentence1, sentence2 = line.strip().split('\t')[:7] 23 | data.append((sentence1, sentence2, float(score))) 24 | return data 25 | 26 | 27 | def loadTestData(): 28 | return _readAndLoadSTSBData("sts-test.csv") 29 | 30 | 31 | def loadDevData(): 32 | return _readAndLoadSTSBData("sts-dev.csv") 33 | 34 | 35 | def loadTrainData(): 36 | return _readAndLoadSTSBData("sts-train.csv") 37 | -------------------------------------------------------------------------------- /STSData/Evaluation.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This is just a simple script to get you up and running. 3 | If you are aiming to publish your own results, please consider relying on SentEval for evaluation. 
4 | https://github.com/facebookresearch/SentEval 5 | ''' 6 | 7 | from sklearn.metrics.pairwise import cosine_similarity 8 | from scipy.stats import pearsonr, spearmanr 9 | from ContrastiveTension import Inference 10 | from STSData import Dataset 11 | import numpy as np 12 | import tqdm 13 | 14 | 15 | def evalCorrelationScores(sent2Vecs, dataset): 16 | similarityScores, humanScores = [], [] 17 | for i, data in enumerate(dataset): 18 | s1, s2, score = data 19 | humanScores.append(score) 20 | similarityScores.append(cosine_similarity([sent2Vecs[s1]], [sent2Vecs[s2]])[0][0]) 21 | 22 | x, y = np.array(similarityScores), np.array(humanScores) 23 | pearResults = pearsonr(x, y) 24 | spearResults = spearmanr(x, y) 25 | 26 | return {'Pearson': pearResults[0], 'Spearman': spearResults[0]} 27 | 28 | 29 | def evaluateSTS(model, tokenizer, batch_size=512, use_dev_data=False): 30 | data = Dataset.loadDevData() if use_dev_data else Dataset.loadTestData() 31 | texts = Dataset.getUniqueCaptions(data) 32 | 33 | sent2Vec = {} 34 | for i in tqdm.tqdm(range(0, len(texts), batch_size), "Generating Eval Embeddings"): 35 | batchTexts = texts[i:i + batch_size] 36 | embs = Inference.tensorflowGenerateSentenceEmbeddings(model, tokenizer, batchTexts) 37 | 38 | for txt, emb in zip(batchTexts, embs): 39 | sent2Vec[txt] = emb 40 | 41 | return evalCorrelationScores(sent2Vec, data) 42 | -------------------------------------------------------------------------------- /STSData/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FreddeFrallan/Contrastive-Tension/75293a883344b389bfe726a3d43d6b48b29b55fb/STSData/__init__.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | from setuptools import setup, find_packages 4 | 5 | setup(name='ContrastiveTension', 6 | version='0.0.1', 7 | description='ContrastiveTension', 8 | author='', 9 | author_email='', 10 | packages=find_packages(), 11 | ) 12 | --------------------------------------------------------------------------------