├── README.md ├── SentEval ├── LICENSE ├── README.md ├── data │ └── downstream │ │ └── download_dataset.sh ├── examples │ ├── bow.py │ ├── gensen.py │ ├── googleuse.py │ ├── infersent.py │ ├── models.py │ └── skipthought.py ├── senteval │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-36.pyc │ │ ├── __init__.cpython-38.pyc │ │ ├── binary.cpython-36.pyc │ │ ├── binary.cpython-38.pyc │ │ ├── engine.cpython-36.pyc │ │ ├── engine.cpython-38.pyc │ │ ├── mrpc.cpython-36.pyc │ │ ├── mrpc.cpython-38.pyc │ │ ├── probing.cpython-36.pyc │ │ ├── probing.cpython-38.pyc │ │ ├── rank.cpython-36.pyc │ │ ├── rank.cpython-38.pyc │ │ ├── sick.cpython-36.pyc │ │ ├── sick.cpython-38.pyc │ │ ├── snli.cpython-36.pyc │ │ ├── snli.cpython-38.pyc │ │ ├── sst.cpython-36.pyc │ │ ├── sst.cpython-38.pyc │ │ ├── sts.cpython-36.pyc │ │ ├── sts.cpython-38.pyc │ │ ├── trec.cpython-36.pyc │ │ ├── trec.cpython-38.pyc │ │ ├── utils.cpython-36.pyc │ │ └── utils.cpython-38.pyc │ ├── binary.py │ ├── engine.py │ ├── mrpc.py │ ├── probing.py │ ├── rank.py │ ├── sick.py │ ├── snli.py │ ├── sst.py │ ├── sts.py │ ├── tools │ │ ├── __init__.py │ │ ├── __pycache__ │ │ │ ├── __init__.cpython-36.pyc │ │ │ ├── __init__.cpython-38.pyc │ │ │ ├── classifier.cpython-36.pyc │ │ │ ├── classifier.cpython-38.pyc │ │ │ ├── ranking.cpython-36.pyc │ │ │ ├── ranking.cpython-38.pyc │ │ │ ├── relatedness.cpython-36.pyc │ │ │ ├── relatedness.cpython-38.pyc │ │ │ ├── validation.cpython-36.pyc │ │ │ └── validation.cpython-38.pyc │ │ ├── classifier.py │ │ ├── ranking.py │ │ ├── relatedness.py │ │ └── validation.py │ ├── trec.py │ └── utils.py └── setup.py ├── cl ├── models.py └── trainers.py ├── data └── download_nli.sh ├── requirements.txt ├── run_sup_layerattnpooler.sh └── train.py /README.md: -------------------------------------------------------------------------------- 1 | # Don't Judge a Language Model by Its Last Layer: Contrastive Learning with Layer-Wise Attention Pooling 2 | 3 | 4 | Paper link: https://aclanthology.org/2022.coling-1.405/ 5 | 6 | To be published in [**Coling 2022**](https://coling2022.org/) 7 | 8 | Our code is mainly based on the code of [SimCSE](https://arxiv.org/abs/2104.08821). Please refer to their [repository](https://github.com/princeton-nlp/SimCSE) for more detailed information. 9 | 10 | ### Requirements 11 | * Python 3.8 12 | 13 | ### Install other packages 14 | ``` 15 | pip install -r requirements.txt 16 | ``` 17 | 18 | ### Download the pretraining dataset 19 | ``` 20 | cd data 21 | bash download_nli.sh 22 | ``` 23 | 24 | ### Download the downstream dataset 25 | ``` 26 | cd SentEval/data/downstream/ 27 | bash download_dataset.sh 28 | ``` 29 | 30 | ## Training 31 | (Using Multi-GPU `run_sup_layerattnpooler.sh`) 32 | ```bash 33 | python train.py \ 34 | --model_name_or_path bert-base-uncased \ 35 | --train_file data/nli_for_simcse.csv \ 36 | --output_dir result/bert-base-uncased-cl-layerattnpooler \ 37 | --num_train_epochs 3 \ 38 | --per_device_train_batch_size 64 \ 39 | --learning_rate 2e-5 \ 40 | --max_seq_length 64 \ 41 | --evaluation_strategy steps \ 42 | --metric_for_best_model stsb_spearman \ 43 | --load_best_model_at_end \ 44 | --eval_steps 100 \ 45 | --pooler_type cls \ 46 | --overwrite_output_dir \ 47 | --temp 0.05 \ 48 | --do_train \ 49 | --do_eval \ 50 | --fp16 \ 51 | "$@" 52 | ``` 53 | 54 | ## Citations 55 | 56 | Please cite our paper if they are helpful to your work! 
57 | 58 | ```bibtex 59 | @inproceedings{oh2022don, 60 | title={Don’t Judge a Language Model by Its Last Layer: Contrastive Learning with Layer-Wise Attention Pooling}, 61 | author={Oh, Dongsuk and Kim, Yejin and Lee, Hodong and Huang, H Howie and Lim, Heui-Seok}, 62 | booktitle={Proceedings of the 29th International Conference on Computational Linguistics}, 63 | pages={4585--4592}, 64 | year={2022} 65 | } 66 | ``` 67 | -------------------------------------------------------------------------------- /SentEval/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For SentEval software 4 | 5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /SentEval/README.md: -------------------------------------------------------------------------------- 1 | Our modification to SentEval: 2 | 3 | 1. Add the `all` setting to all STS tasks. 4 | 2. Change STS-B and SICK-R to not use an additional regressor. 5 | 6 | # SentEval: evaluation toolkit for sentence embeddings 7 | 8 | SentEval is a library for evaluating the quality of sentence embeddings. We assess their generalization power by using them as features on a broad and diverse set of "transfer" tasks. **SentEval currently includes 17 downstream tasks**. We also include a suite of **10 probing tasks** which evaluate what linguistic properties are encoded in sentence embeddings. Our goal is to ease the study and the development of general-purpose fixed-size sentence representations. 
9 | 10 | 11 | **(04/22) SentEval new tasks: Added probing tasks for evaluating what linguistic properties are encoded in sentence embeddings** 12 | 13 | **(10/04) SentEval example scripts for three sentence encoders: [SkipThought-LN](https://github.com/ryankiros/layer-norm#skip-thoughts)/[GenSen](https://github.com/Maluuba/gensen)/[Google-USE](https://tfhub.dev/google/universal-sentence-encoder/1)** 14 | 15 | ## Dependencies 16 | 17 | This code is written in python. The dependencies are: 18 | 19 | * Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](http://www.scipy.org/) 20 | * [Pytorch](http://pytorch.org/)>=0.4 21 | * [scikit-learn](http://scikit-learn.org/stable/index.html)>=0.18.0 22 | 23 | ## Transfer tasks 24 | 25 | ### Downstream tasks 26 | SentEval allows you to evaluate your sentence embeddings as features for the following *downstream* tasks: 27 | 28 | | Task | Type | #train | #test | needs_train | set_classifier | 29 | |---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:| 30 | | [MR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | movie review | 11k | 11k | 1 | 1 | 31 | | [CR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | product review | 4k | 4k | 1 | 1 | 32 | | [SUBJ](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | subjectivity status | 10k | 10k | 1 | 1 | 33 | | [MPQA](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | opinion-polarity | 11k | 11k | 1 | 1 | 34 | | [SST](https://nlp.stanford.edu/sentiment/index.html) | binary sentiment analysis | 67k | 1.8k | 1 | 1 | 35 | | **[SST](https://nlp.stanford.edu/sentiment/index.html)** | **fine-grained sentiment analysis** | 8.5k | 2.2k | 1 | 1 | 36 | | [TREC](http://cogcomp.cs.illinois.edu/Data/QA/QC/) | question-type classification | 6k | 0.5k | 1 | 1 | 37 | | [SICK-E](http://clic.cimec.unitn.it/composes/sick.html) | natural language inference | 4.5k | 4.9k | 1 | 1 | 38 | | [SNLI](https://nlp.stanford.edu/projects/snli/) | natural language inference | 550k | 9.8k | 1 | 1 | 39 | | [MRPC](https://aclweb.org/aclwiki/Paraphrase_Identification_(State_of_the_art)) | paraphrase detection | 4.1k | 1.7k | 1 | 1 | 40 | | [STS 2012](https://www.cs.york.ac.uk/semeval-2012/task6/) | semantic textual similarity | N/A | 3.1k | 0 | 0 | 41 | | [STS 2013](http://ixa2.si.ehu.es/sts/) | semantic textual similarity | N/A | 1.5k | 0 | 0 | 42 | | [STS 2014](http://alt.qcri.org/semeval2014/task10/) | semantic textual similarity | N/A | 3.7k | 0 | 0 | 43 | | [STS 2015](http://alt.qcri.org/semeval2015/task2/) | semantic textual similarity | N/A | 8.5k | 0 | 0 | 44 | | [STS 2016](http://alt.qcri.org/semeval2016/task1/) | semantic textual similarity | N/A | 9.2k | 0 | 0 | 45 | | [STS B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results) | semantic textual similarity | 5.7k | 1.4k | 1 | 0 | 46 | | [SICK-R](http://clic.cimec.unitn.it/composes/sick.html) | semantic textual similarity | 4.5k | 4.9k | 1 | 0 | 47 | | [COCO](http://mscoco.org/) | image-caption retrieval | 567k | 5*1k | 1 | 0 | 48 | 49 | where **needs_train** means a model with parameters is learned on top of the sentence embeddings, and **set_classifier** means you can define the parameters of the classifier in the case of a classification task (see below). 50 | 51 | Note: COCO comes with ResNet-101 2048d image embeddings. 
[More details on the tasks.](https://arxiv.org/pdf/1705.02364.pdf) 52 | 53 | ### Probing tasks 54 | SentEval also includes a series of [*probing* tasks](https://github.com/facebookresearch/SentEval/tree/master/data/probing) to evaluate what linguistic properties are encoded in your sentence embeddings: 55 | 56 | | Task | Type | #train | #test | needs_train | set_classifier | 57 | |---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:| 58 | | [SentLen](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Length prediction | 100k | 10k | 1 | 1 | 59 | | [WC](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word Content analysis | 100k | 10k | 1 | 1 | 60 | | [TreeDepth](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Tree depth prediction | 100k | 10k | 1 | 1 | 61 | | [TopConst](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Top Constituents prediction | 100k | 10k | 1 | 1 | 62 | | [BShift](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word order analysis | 100k | 10k | 1 | 1 | 63 | | [Tense](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Verb tense prediction | 100k | 10k | 1 | 1 | 64 | | [SubjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Subject number prediction | 100k | 10k | 1 | 1 | 65 | | [ObjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Object number prediction | 100k | 10k | 1 | 1 | 66 | | [SOMO](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Semantic odd man out | 100k | 10k | 1 | 1 | 67 | | [CoordInv](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Coordination Inversion | 100k | 10k | 1 | 1 | 68 | 69 | ## Download datasets 70 | To get all the transfer tasks datasets, run (in data/downstream/): 71 | ```bash 72 | ./get_transfer_data.bash 73 | ``` 74 | This will automatically download and preprocess the downstream datasets, and store them in data/downstream (warning: for MacOS users, you may have to use p7zip instead of unzip). The probing tasks are already in data/probing by default. 75 | 76 | ## How to use SentEval: examples 77 | 78 | ### examples/bow.py 79 | 80 | In examples/bow.py, we evaluate the quality of the average of word embeddings. 81 | 82 | To download state-of-the-art fastText embeddings: 83 | 84 | ```bash 85 | curl -Lo glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip 86 | curl -Lo crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip 87 | ``` 88 | 89 | To reproduce the results for bag-of-vectors, run (in examples/): 90 | ```bash 91 | python bow.py 92 | ``` 93 | 94 | As required by SentEval, this script implements two functions: **prepare** (optional) and **batcher** (required) that turn text sentences into sentence embeddings. Then SentEval takes care of the evaluation on the transfer tasks using the embeddings as features. 
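For orientation, here is a minimal sketch of the two functions bow.py implements (simplified, not the actual bow.py code: the real script loads GloVe/fastText vectors from disk, while random vectors stand in here so the snippet is self-contained):

```python
import numpy as np

def prepare(params, samples):
    # Runs once per task over every sentence; store whatever batcher will need on `params`.
    # Random vectors stand in for real GloVe/fastText vectors loaded from disk.
    vocab = {w for sent in samples for w in sent}
    params.word_vec = {w: np.random.rand(300) for w in vocab}
    params.wvec_dim = 300

def batcher(params, batch):
    # `batch` is a list of tokenized sentences; return one fixed-size vector per sentence.
    embeddings = []
    for sent in batch:
        vecs = [params.word_vec[w] for w in sent if w in params.word_vec]
        if not vecs:  # no known words in the sentence: fall back to a zero vector
            vecs = [np.zeros(params.wvec_dim)]
        embeddings.append(np.mean(vecs, axis=0))
    return np.vstack(embeddings)
```

SentEval calls *prepare* once per task, then calls *batcher* repeatedly and uses the returned embedding matrices as features for the task-specific classifier or similarity evaluation.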
95 | 96 | ### examples/infersent.py 97 | 98 | To get the **[InferSent](https://www.github.com/facebookresearch/InferSent)** model and reproduce our results, download our best models and run infersent.py (in examples/): 99 | ```bash 100 | curl -Lo examples/infersent1.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl 101 | curl -Lo examples/infersent2.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl 102 | ``` 103 | 104 | ### examples/skipthought.py - examples/gensen.py - examples/googleuse.py 105 | 106 | We also provide example scripts for three other encoders: 107 | 108 | * [SkipThought with Layer-Normalization](https://github.com/ryankiros/layer-norm#skip-thoughts) in Theano 109 | * [GenSen encoder](https://github.com/Maluuba/gensen) in Pytorch 110 | * [Google encoder](https://tfhub.dev/google/universal-sentence-encoder/1) in TensorFlow 111 | 112 | Note that for SkipThought and GenSen, following the steps of the associated githubs is necessary. 113 | The Google encoder script should work as-is. 114 | 115 | ## How to use SentEval 116 | 117 | To evaluate your sentence embeddings, SentEval requires that you implement two functions: 118 | 119 | 1. **prepare** (sees the whole dataset of each task and can thus construct the word vocabulary, the dictionary of word vectors etc) 120 | 2. **batcher** (transforms a batch of text sentences into sentence embeddings) 121 | 122 | 123 | ### 1.) prepare(params, samples) (optional) 124 | 125 | *batcher* only sees one batch at a time while the *samples* argument of *prepare* contains all the sentences of a task. 126 | 127 | ``` 128 | prepare(params, samples) 129 | ``` 130 | * *params*: senteval parameters. 131 | * *samples*: list of all sentences from the tranfer task. 132 | * *output*: No output. Arguments stored in "params" can further be used by *batcher*. 133 | 134 | *Example*: in bow.py, prepare is is used to build the vocabulary of words and construct the "params.word_vect* dictionary of word vectors. 135 | 136 | 137 | ### 2.) batcher(params, batch) 138 | ``` 139 | batcher(params, batch) 140 | ``` 141 | * *params*: senteval parameters. 142 | * *batch*: numpy array of text sentences (of size params.batch_size) 143 | * *output*: numpy array of sentence embeddings (of size params.batch_size) 144 | 145 | *Example*: in bow.py, batcher is used to compute the mean of the word vectors for each sentence in the batch using params.word_vec. Use your own encoder in that function to encode sentences. 146 | 147 | ### 3.) evaluation on transfer tasks 148 | 149 | After having implemented the batch and prepare function for your own sentence encoder, 150 | 151 | 1) to perform the actual evaluation, first import senteval and set its parameters: 152 | ```python 153 | import senteval 154 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10} 155 | ``` 156 | 157 | 2) (optional) set the parameters of the classifier (when applicable): 158 | ```python 159 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 160 | 'tenacity': 5, 'epoch_size': 4} 161 | ``` 162 | You can choose **nhid=0** (Logistic Regression) or **nhid>0** (MLP) and define the parameters for training. 
163 | 164 | 3) Create an instance of the class SE: 165 | ```python 166 | se = senteval.engine.SE(params, batcher, prepare) 167 | ``` 168 | 169 | 4) define the set of transfer tasks and run the evaluation: 170 | ```python 171 | transfer_tasks = ['MR', 'SICKEntailment', 'STS14', 'STSBenchmark'] 172 | results = se.eval(transfer_tasks) 173 | ``` 174 | The current list of available tasks is: 175 | ```python 176 | ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SNLI', 177 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'ImageCaptionRetrieval', 178 | 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 179 | 'Length', 'WordContent', 'Depth', 'TopConstituents','BigramShift', 'Tense', 180 | 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion'] 181 | ``` 182 | 183 | ## SentEval parameters 184 | Global parameters of SentEval: 185 | ```bash 186 | # senteval parameters 187 | task_path # path to SentEval datasets (required) 188 | seed # seed 189 | usepytorch # use cuda-pytorch (else scikit-learn) where possible 190 | kfold # k-fold validation for MR/CR/SUB/MPQA. 191 | ``` 192 | 193 | Parameters of the classifier: 194 | ```bash 195 | nhid: # number of hidden units (0: Logistic Regression, >0: MLP); Default nonlinearity: Tanh 196 | optim: # optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..) 197 | tenacity: # how many times dev acc does not increase before training stops 198 | epoch_size: # each epoch corresponds to epoch_size pass on the train set 199 | max_epoch: # max number of epoches 200 | dropout: # dropout for MLP 201 | ``` 202 | 203 | Note that to get a proxy of the results while **dramatically reducing computation time**, 204 | we suggest the **prototyping config**: 205 | ```python 206 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 207 | params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 208 | 'tenacity': 3, 'epoch_size': 2} 209 | ``` 210 | which will results in a 5 times speedup for classification tasks. 211 | 212 | To produce results that are **comparable to the literature**, use the **default config**: 213 | ```python 214 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10} 215 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 216 | 'tenacity': 5, 'epoch_size': 4} 217 | ``` 218 | which takes longer but will produce better and comparable results. 219 | 220 | For probing tasks, we used an MLP with a Sigmoid nonlinearity and and tuned the nhid (in [50, 100, 200]) and dropout (in [0.0, 0.1, 0.2]) on the dev set. 221 | 222 | ## References 223 | 224 | Please considering citing [[1]](https://arxiv.org/abs/1803.05449) if using this code for evaluating sentence embedding methods. 225 | 226 | ### SentEval: An Evaluation Toolkit for Universal Sentence Representations 227 | 228 | [1] A. Conneau, D. Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449) 229 | 230 | ``` 231 | @article{conneau2018senteval, 232 | title={SentEval: An Evaluation Toolkit for Universal Sentence Representations}, 233 | author={Conneau, Alexis and Kiela, Douwe}, 234 | journal={arXiv preprint arXiv:1803.05449}, 235 | year={2018} 236 | } 237 | ``` 238 | 239 | Contact: [aconneau@fb.com](mailto:aconneau@fb.com), [dkiela@fb.com](mailto:dkiela@fb.com) 240 | 241 | ### Related work 242 | * [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726) 243 | * [S. Arora, Y. Liang, T. 
Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx) 244 | * [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207) 245 | * [A. Conneau, D. Kiela, L. Barrault, H. Schwenk, A. Bordes - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data, EMNLP 2017](https://arxiv.org/abs/1705.02364) 246 | * [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079) 247 | * [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334) 248 | * [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175) 249 | * [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070) 250 | -------------------------------------------------------------------------------- /SentEval/data/downstream/download_dataset.sh: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/senteval.tar 2 | tar xvf senteval.tar 3 | -------------------------------------------------------------------------------- /SentEval/examples/bow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import sys 11 | import io 12 | import numpy as np 13 | import logging 14 | 15 | 16 | # Set PATHs 17 | PATH_TO_SENTEVAL = '../' 18 | PATH_TO_DATA = '../data' 19 | # PATH_TO_VEC = 'glove/glove.840B.300d.txt' 20 | PATH_TO_VEC = 'fasttext/crawl-300d-2M.vec' 21 | 22 | # import SentEval 23 | sys.path.insert(0, PATH_TO_SENTEVAL) 24 | import senteval 25 | 26 | 27 | # Create dictionary 28 | def create_dictionary(sentences, threshold=0): 29 | words = {} 30 | for s in sentences: 31 | for word in s: 32 | words[word] = words.get(word, 0) + 1 33 | 34 | if threshold > 0: 35 | newwords = {} 36 | for word in words: 37 | if words[word] >= threshold: 38 | newwords[word] = words[word] 39 | words = newwords 40 | words[''] = 1e9 + 4 41 | words[''] = 1e9 + 3 42 | words['
<p>
'] = 1e9 + 2 43 | 44 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 45 | id2word = [] 46 | word2id = {} 47 | for i, (w, _) in enumerate(sorted_words): 48 | id2word.append(w) 49 | word2id[w] = i 50 | 51 | return id2word, word2id 52 | 53 | # Get word vectors from vocabulary (glove, word2vec, fasttext ..) 54 | def get_wordvec(path_to_vec, word2id): 55 | word_vec = {} 56 | 57 | with io.open(path_to_vec, 'r', encoding='utf-8') as f: 58 | # if word2vec or fasttext file : skip first line "next(f)" 59 | for line in f: 60 | word, vec = line.split(' ', 1) 61 | if word in word2id: 62 | word_vec[word] = np.fromstring(vec, sep=' ') 63 | 64 | logging.info('Found {0} words with word vectors, out of \ 65 | {1} words'.format(len(word_vec), len(word2id))) 66 | return word_vec 67 | 68 | 69 | # SentEval prepare and batcher 70 | def prepare(params, samples): 71 | _, params.word2id = create_dictionary(samples) 72 | params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id) 73 | params.wvec_dim = 300 74 | return 75 | 76 | def batcher(params, batch): 77 | batch = [sent if sent != [] else ['.'] for sent in batch] 78 | embeddings = [] 79 | 80 | for sent in batch: 81 | sentvec = [] 82 | for word in sent: 83 | if word in params.word_vec: 84 | sentvec.append(params.word_vec[word]) 85 | if not sentvec: 86 | vec = np.zeros(params.wvec_dim) 87 | sentvec.append(vec) 88 | sentvec = np.mean(sentvec, 0) 89 | embeddings.append(sentvec) 90 | 91 | embeddings = np.vstack(embeddings) 92 | return embeddings 93 | 94 | 95 | # Set params for SentEval 96 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 97 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 98 | 'tenacity': 3, 'epoch_size': 2} 99 | 100 | # Set up logger 101 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 102 | 103 | if __name__ == "__main__": 104 | se = senteval.engine.SE(params_senteval, batcher, prepare) 105 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 106 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 107 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 108 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 109 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 110 | 'OddManOut', 'CoordinationInversion'] 111 | results = se.eval(transfer_tasks) 112 | print(results) 113 | -------------------------------------------------------------------------------- /SentEval/examples/gensen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Clone GenSen repo here: https://github.com/Maluuba/gensen.git 10 | And follow instructions for loading the model used in batcher 11 | """ 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import sys 16 | import logging 17 | # import GenSen package 18 | from gensen import GenSen, GenSenSingle 19 | 20 | # Set PATHs 21 | PATH_TO_SENTEVAL = '../' 22 | PATH_TO_DATA = '../data' 23 | 24 | # import SentEval 25 | sys.path.insert(0, PATH_TO_SENTEVAL) 26 | import senteval 27 | 28 | # SentEval prepare and batcher 29 | def prepare(params, samples): 30 | return 31 | 32 | def batcher(params, batch): 33 | batch = [' '.join(sent) if sent != [] else '.' 
for sent in batch] 34 | _, reps_h_t = gensen.get_representation( 35 | sentences, pool='last', return_numpy=True, tokenize=True 36 | ) 37 | embeddings = reps_h_t 38 | return embeddings 39 | 40 | # Load GenSen model 41 | gensen_1 = GenSenSingle( 42 | model_folder='../data/models', 43 | filename_prefix='nli_large_bothskip', 44 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 45 | ) 46 | gensen_2 = GenSenSingle( 47 | model_folder='../data/models', 48 | filename_prefix='nli_large_bothskip_parse', 49 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 50 | ) 51 | gensen_encoder = GenSen(gensen_1, gensen_2) 52 | reps_h, reps_h_t = gensen.get_representation( 53 | sentences, pool='last', return_numpy=True, tokenize=True 54 | ) 55 | 56 | # Set params for SentEval 57 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 58 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 59 | 'tenacity': 3, 'epoch_size': 2} 60 | params_senteval['gensen'] = gensen_encoder 61 | 62 | # Set up logger 63 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 64 | 65 | if __name__ == "__main__": 66 | se = senteval.engine.SE(params_senteval, batcher, prepare) 67 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 68 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 69 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 70 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 71 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 72 | 'OddManOut', 'CoordinationInversion'] 73 | results = se.eval(transfer_tasks) 74 | print(results) 75 | -------------------------------------------------------------------------------- /SentEval/examples/googleuse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division 9 | 10 | import os 11 | import sys 12 | import logging 13 | import tensorflow as tf 14 | import tensorflow_hub as hub 15 | tf.logging.set_verbosity(0) 16 | 17 | # Set PATHs 18 | PATH_TO_SENTEVAL = '../' 19 | PATH_TO_DATA = '../data' 20 | 21 | # import SentEval 22 | sys.path.insert(0, PATH_TO_SENTEVAL) 23 | import senteval 24 | 25 | # tensorflow session 26 | session = tf.Session() 27 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 28 | 29 | # SentEval prepare and batcher 30 | def prepare(params, samples): 31 | return 32 | 33 | def batcher(params, batch): 34 | batch = [' '.join(sent) if sent != [] else '.' 
for sent in batch] 35 | embeddings = params['google_use'](batch) 36 | return embeddings 37 | 38 | def make_embed_fn(module): 39 | with tf.Graph().as_default(): 40 | sentences = tf.placeholder(tf.string) 41 | embed = hub.Module(module) 42 | embeddings = embed(sentences) 43 | session = tf.train.MonitoredSession() 44 | return lambda x: session.run(embeddings, {sentences: x}) 45 | 46 | # Start TF session and load Google Universal Sentence Encoder 47 | encoder = make_embed_fn("https://tfhub.dev/google/universal-sentence-encoder-large/2") 48 | 49 | # Set params for SentEval 50 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 51 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 52 | 'tenacity': 3, 'epoch_size': 2} 53 | params_senteval['google_use'] = encoder 54 | 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | se = senteval.engine.SE(params_senteval, batcher, prepare) 60 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 61 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 62 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 63 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 64 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 65 | 'OddManOut', 'CoordinationInversion'] 66 | results = se.eval(transfer_tasks) 67 | print(results) 68 | -------------------------------------------------------------------------------- /SentEval/examples/infersent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | InferSent models. See https://github.com/facebookresearch/InferSent. 
10 | """ 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import sys 15 | import os 16 | import torch 17 | import logging 18 | 19 | # get models.py from InferSent repo 20 | from models import InferSent 21 | 22 | # Set PATHs 23 | PATH_SENTEVAL = '../' 24 | PATH_TO_DATA = '../data' 25 | PATH_TO_W2V = 'PATH/TO/glove.840B.300d.txt' # or crawl-300d-2M.vec for V2 26 | MODEL_PATH = 'infersent1.pkl' 27 | V = 1 # version of InferSent 28 | 29 | assert os.path.isfile(MODEL_PATH) and os.path.isfile(PATH_TO_W2V), \ 30 | 'Set MODEL and GloVe PATHs' 31 | 32 | # import senteval 33 | sys.path.insert(0, PATH_SENTEVAL) 34 | import senteval 35 | 36 | 37 | def prepare(params, samples): 38 | params.infersent.build_vocab([' '.join(s) for s in samples], tokenize=False) 39 | 40 | 41 | def batcher(params, batch): 42 | sentences = [' '.join(s) for s in batch] 43 | embeddings = params.infersent.encode(sentences, bsize=params.batch_size, tokenize=False) 44 | return embeddings 45 | 46 | 47 | """ 48 | Evaluation of trained model on Transfer Tasks (SentEval) 49 | """ 50 | 51 | # define senteval params 52 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 53 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 54 | 'tenacity': 3, 'epoch_size': 2} 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | # Load InferSent model 60 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 61 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': V} 62 | model = InferSent(params_model) 63 | model.load_state_dict(torch.load(MODEL_PATH)) 64 | model.set_w2v_path(PATH_TO_W2V) 65 | 66 | params_senteval['infersent'] = model.cuda() 67 | 68 | se = senteval.engine.SE(params_senteval, batcher, prepare) 69 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 70 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 71 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 72 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 73 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 74 | 'OddManOut', 'CoordinationInversion'] 75 | results = se.eval(transfer_tasks) 76 | print(results) 77 | -------------------------------------------------------------------------------- /SentEval/examples/models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | """ 9 | This file contains the definition of encoders used in https://arxiv.org/pdf/1705.02364.pdf 10 | """ 11 | 12 | import numpy as np 13 | import time 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | 19 | class InferSent(nn.Module): 20 | 21 | def __init__(self, config): 22 | super(InferSent, self).__init__() 23 | self.bsize = config['bsize'] 24 | self.word_emb_dim = config['word_emb_dim'] 25 | self.enc_lstm_dim = config['enc_lstm_dim'] 26 | self.pool_type = config['pool_type'] 27 | self.dpout_model = config['dpout_model'] 28 | self.version = 1 if 'version' not in config else config['version'] 29 | 30 | self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1, 31 | bidirectional=True, dropout=self.dpout_model) 32 | 33 | assert self.version in [1, 2] 34 | if self.version == 1: 35 | self.bos = '' 36 | self.eos = '' 37 | self.max_pad = True 38 | self.moses_tok = False 39 | elif self.version == 2: 40 | self.bos = '
<p>' 41 | self.eos = '</p>
' 42 | self.max_pad = False 43 | self.moses_tok = True 44 | 45 | def is_cuda(self): 46 | # either all weights are on cpu or they are on gpu 47 | return self.enc_lstm.bias_hh_l0.data.is_cuda 48 | 49 | def forward(self, sent_tuple): 50 | # sent_len: [max_len, ..., min_len] (bsize) 51 | # sent: (seqlen x bsize x worddim) 52 | sent, sent_len = sent_tuple 53 | 54 | # Sort by length (keep idx) 55 | sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len) 56 | sent_len_sorted = sent_len_sorted.copy() 57 | idx_unsort = np.argsort(idx_sort) 58 | 59 | idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \ 60 | else torch.from_numpy(idx_sort) 61 | sent = sent.index_select(1, idx_sort) 62 | 63 | # Handling padding in Recurrent Networks 64 | sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted) 65 | sent_output = self.enc_lstm(sent_packed)[0] # seqlen x batch x 2*nhid 66 | sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0] 67 | 68 | # Un-sort by length 69 | idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \ 70 | else torch.from_numpy(idx_unsort) 71 | sent_output = sent_output.index_select(1, idx_unsort) 72 | 73 | # Pooling 74 | if self.pool_type == "mean": 75 | sent_len = torch.FloatTensor(sent_len.copy()).unsqueeze(1).cuda() 76 | emb = torch.sum(sent_output, 0).squeeze(0) 77 | emb = emb / sent_len.expand_as(emb) 78 | elif self.pool_type == "max": 79 | if not self.max_pad: 80 | sent_output[sent_output == 0] = -1e9 81 | emb = torch.max(sent_output, 0)[0] 82 | if emb.ndimension() == 3: 83 | emb = emb.squeeze(0) 84 | assert emb.ndimension() == 2 85 | 86 | return emb 87 | 88 | def set_w2v_path(self, w2v_path): 89 | self.w2v_path = w2v_path 90 | 91 | def get_word_dict(self, sentences, tokenize=True): 92 | # create vocab of words 93 | word_dict = {} 94 | sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences] 95 | for sent in sentences: 96 | for word in sent: 97 | if word not in word_dict: 98 | word_dict[word] = '' 99 | word_dict[self.bos] = '' 100 | word_dict[self.eos] = '' 101 | return word_dict 102 | 103 | def get_w2v(self, word_dict): 104 | assert hasattr(self, 'w2v_path'), 'w2v path not set' 105 | # create word_vec with w2v vectors 106 | word_vec = {} 107 | with open(self.w2v_path, encoding='utf-8') as f: 108 | for line in f: 109 | word, vec = line.split(' ', 1) 110 | if word in word_dict: 111 | word_vec[word] = np.fromstring(vec, sep=' ') 112 | print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict))) 113 | return word_vec 114 | 115 | def get_w2v_k(self, K): 116 | assert hasattr(self, 'w2v_path'), 'w2v path not set' 117 | # create word_vec with k first w2v vectors 118 | k = 0 119 | word_vec = {} 120 | with open(self.w2v_path, encoding='utf-8') as f: 121 | for line in f: 122 | word, vec = line.split(' ', 1) 123 | if k <= K: 124 | word_vec[word] = np.fromstring(vec, sep=' ') 125 | k += 1 126 | if k > K: 127 | if word in [self.bos, self.eos]: 128 | word_vec[word] = np.fromstring(vec, sep=' ') 129 | 130 | if k > K and all([w in word_vec for w in [self.bos, self.eos]]): 131 | break 132 | return word_vec 133 | 134 | def build_vocab(self, sentences, tokenize=True): 135 | assert hasattr(self, 'w2v_path'), 'w2v path not set' 136 | word_dict = self.get_word_dict(sentences, tokenize) 137 | self.word_vec = self.get_w2v(word_dict) 138 | print('Vocab size : %s' % (len(self.word_vec))) 139 | 140 | # build w2v vocab with k most frequent words 141 | def build_vocab_k_words(self, K): 142 | assert 
hasattr(self, 'w2v_path'), 'w2v path not set' 143 | self.word_vec = self.get_w2v_k(K) 144 | print('Vocab size : %s' % (K)) 145 | 146 | def update_vocab(self, sentences, tokenize=True): 147 | assert hasattr(self, 'w2v_path'), 'warning : w2v path not set' 148 | assert hasattr(self, 'word_vec'), 'build_vocab before updating it' 149 | word_dict = self.get_word_dict(sentences, tokenize) 150 | 151 | # keep only new words 152 | for word in self.word_vec: 153 | if word in word_dict: 154 | del word_dict[word] 155 | 156 | # udpate vocabulary 157 | if word_dict: 158 | new_word_vec = self.get_w2v(word_dict) 159 | self.word_vec.update(new_word_vec) 160 | else: 161 | new_word_vec = [] 162 | print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec))) 163 | 164 | def get_batch(self, batch): 165 | # sent in batch in decreasing order of lengths 166 | # batch: (bsize, max_len, word_dim) 167 | embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim)) 168 | 169 | for i in range(len(batch)): 170 | for j in range(len(batch[i])): 171 | embed[j, i, :] = self.word_vec[batch[i][j]] 172 | 173 | return torch.FloatTensor(embed) 174 | 175 | def tokenize(self, s): 176 | from nltk.tokenize import word_tokenize 177 | if self.moses_tok: 178 | s = ' '.join(word_tokenize(s)) 179 | s = s.replace(" n't ", "n 't ") # HACK to get ~MOSES tokenization 180 | return s.split() 181 | else: 182 | return word_tokenize(s) 183 | 184 | def prepare_samples(self, sentences, bsize, tokenize, verbose): 185 | sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else 186 | [self.bos] + self.tokenize(s) + [self.eos] for s in sentences] 187 | n_w = np.sum([len(x) for x in sentences]) 188 | 189 | # filters words without w2v vectors 190 | for i in range(len(sentences)): 191 | s_f = [word for word in sentences[i] if word in self.word_vec] 192 | if not s_f: 193 | import warnings 194 | warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \ 195 | Replacing by ""..' 
% (sentences[i], i)) 196 | s_f = [self.eos] 197 | sentences[i] = s_f 198 | 199 | lengths = np.array([len(s) for s in sentences]) 200 | n_wk = np.sum(lengths) 201 | if verbose: 202 | print('Nb words kept : %s/%s (%.1f%s)' % ( 203 | n_wk, n_w, 100.0 * n_wk / n_w, '%')) 204 | 205 | # sort by decreasing length 206 | lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths) 207 | sentences = np.array(sentences)[idx_sort] 208 | 209 | return sentences, lengths, idx_sort 210 | 211 | def encode(self, sentences, bsize=64, tokenize=True, verbose=False): 212 | tic = time.time() 213 | sentences, lengths, idx_sort = self.prepare_samples( 214 | sentences, bsize, tokenize, verbose) 215 | 216 | embeddings = [] 217 | for stidx in range(0, len(sentences), bsize): 218 | batch = self.get_batch(sentences[stidx:stidx + bsize]) 219 | if self.is_cuda(): 220 | batch = batch.cuda() 221 | with torch.no_grad(): 222 | batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy() 223 | embeddings.append(batch) 224 | embeddings = np.vstack(embeddings) 225 | 226 | # unsort 227 | idx_unsort = np.argsort(idx_sort) 228 | embeddings = embeddings[idx_unsort] 229 | 230 | if verbose: 231 | print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % ( 232 | len(embeddings)/(time.time()-tic), 233 | 'gpu' if self.is_cuda() else 'cpu', bsize)) 234 | return embeddings 235 | 236 | def visualize(self, sent, tokenize=True): 237 | 238 | sent = sent.split() if not tokenize else self.tokenize(sent) 239 | sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]] 240 | 241 | if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos): 242 | import warnings 243 | warnings.warn('No words in "%s" have w2v vectors. Replacing \ 244 | by "%s %s"..' % (sent, self.bos, self.eos)) 245 | batch = self.get_batch(sent) 246 | 247 | if self.is_cuda(): 248 | batch = batch.cuda() 249 | output = self.enc_lstm(batch)[0] 250 | output, idxs = torch.max(output, 0) 251 | # output, idxs = output.squeeze(), idxs.squeeze() 252 | idxs = idxs.data.cpu().numpy() 253 | argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))] 254 | 255 | # visualize model 256 | import matplotlib.pyplot as plt 257 | x = range(len(sent[0])) 258 | y = [100.0 * n / np.sum(argmaxs) for n in argmaxs] 259 | plt.xticks(x, sent[0], rotation=45) 260 | plt.bar(x, y) 261 | plt.ylabel('%') 262 | plt.title('Visualisation of words importance') 263 | plt.show() 264 | 265 | return output, idxs 266 | -------------------------------------------------------------------------------- /SentEval/examples/skipthought.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | """ 11 | Example of file for SkipThought in SentEval 12 | """ 13 | import logging 14 | import sys 15 | sys.setdefaultencoding('utf8') 16 | 17 | 18 | # Set PATHs 19 | PATH_TO_SENTEVAL = '../' 20 | PATH_TO_DATA = '../data/senteval_data/' 21 | PATH_TO_SKIPTHOUGHT = '' 22 | 23 | assert PATH_TO_SKIPTHOUGHT != '', 'Download skipthought and set correct PATH' 24 | 25 | # import skipthought and Senteval 26 | sys.path.insert(0, PATH_TO_SKIPTHOUGHT) 27 | import skipthoughts 28 | sys.path.insert(0, PATH_TO_SENTEVAL) 29 | import senteval 30 | 31 | 32 | def prepare(params, samples): 33 | return 34 | 35 | def batcher(params, batch): 36 | batch = [str(' '.join(sent), errors="ignore") if sent != [] else '.' for sent in batch] 37 | embeddings = skipthoughts.encode(params['encoder'], batch, 38 | verbose=False, use_eos=True) 39 | return embeddings 40 | 41 | 42 | # Set params for SentEval 43 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10, 'batch_size': 512} 44 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 45 | 'tenacity': 5, 'epoch_size': 4} 46 | # Set up logger 47 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 48 | 49 | if __name__ == "__main__": 50 | # Load SkipThought model 51 | params_senteval['encoder'] = skipthoughts.load_model() 52 | 53 | se = senteval.engine.SE(params_senteval, batcher, prepare) 54 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 55 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 56 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 57 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 58 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 59 | 'OddManOut', 'CoordinationInversion'] 60 | results = se.eval(transfer_tasks) 61 | print(results) 62 | -------------------------------------------------------------------------------- /SentEval/senteval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import 9 | 10 | from senteval.engine import SE 11 | -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/binary.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/binary.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/binary.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/binary.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/engine.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/engine.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/engine.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/engine.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/mrpc.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/mrpc.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/mrpc.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/mrpc.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/probing.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/probing.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/probing.cpython-38.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/probing.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/rank.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/rank.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/rank.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/rank.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/sick.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/sick.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/sick.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/sick.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/snli.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/snli.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/snli.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/snli.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/sst.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/sst.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/sst.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/sst.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/sts.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/sts.cpython-36.pyc -------------------------------------------------------------------------------- 
/SentEval/senteval/__pycache__/sts.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/sts.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/trec.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/trec.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/trec.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/trec.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/binary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import io 14 | import os 15 | import numpy as np 16 | import logging 17 | 18 | from senteval.tools.validation import InnerKFoldClassifier 19 | 20 | 21 | class BinaryClassifierEval(object): 22 | def __init__(self, pos, neg, seed=1111): 23 | self.seed = seed 24 | self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg) 25 | self.n_samples = len(self.samples) 26 | 27 | def do_prepare(self, params, prepare): 28 | # prepare is given the whole text 29 | return prepare(params, self.samples) 30 | # prepare puts everything it outputs in "params" : params.word2id etc 31 | # Those output will be further used by "batcher". 
32 | 33 | def loadFile(self, fpath): 34 | with io.open(fpath, 'r', encoding='latin-1') as f: 35 | return [line.split() for line in f.read().splitlines()] 36 | 37 | def run(self, params, batcher): 38 | enc_input = [] 39 | # Sort to reduce padding 40 | sorted_corpus = sorted(zip(self.samples, self.labels), 41 | key=lambda z: (len(z[0]), z[1])) 42 | sorted_samples = [x for (x, y) in sorted_corpus] 43 | sorted_labels = [y for (x, y) in sorted_corpus] 44 | logging.info('Generating sentence embeddings') 45 | for ii in range(0, self.n_samples, params.batch_size): 46 | batch = sorted_samples[ii:ii + params.batch_size] 47 | embeddings = batcher(params, batch) 48 | enc_input.append(embeddings) 49 | enc_input = np.vstack(enc_input) 50 | logging.info('Generated sentence embeddings') 51 | 52 | config = {'nclasses': 2, 'seed': self.seed, 53 | 'usepytorch': params.usepytorch, 54 | 'classifier': params.classifier, 55 | 'nhid': params.nhid, 'kfold': params.kfold} 56 | clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config) 57 | devacc, testacc = clf.run() 58 | logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc)) 59 | return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples, 60 | 'ntest': self.n_samples} 61 | 62 | 63 | class CREval(BinaryClassifierEval): 64 | def __init__(self, task_path, seed=1111): 65 | logging.debug('***** Transfer task : CR *****\n\n') 66 | pos = self.loadFile(os.path.join(task_path, 'custrev.pos')) 67 | neg = self.loadFile(os.path.join(task_path, 'custrev.neg')) 68 | super(self.__class__, self).__init__(pos, neg, seed) 69 | 70 | 71 | class MREval(BinaryClassifierEval): 72 | def __init__(self, task_path, seed=1111): 73 | logging.debug('***** Transfer task : MR *****\n\n') 74 | pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos')) 75 | neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg')) 76 | super(self.__class__, self).__init__(pos, neg, seed) 77 | 78 | 79 | class SUBJEval(BinaryClassifierEval): 80 | def __init__(self, task_path, seed=1111): 81 | logging.debug('***** Transfer task : SUBJ *****\n\n') 82 | obj = self.loadFile(os.path.join(task_path, 'subj.objective')) 83 | subj = self.loadFile(os.path.join(task_path, 'subj.subjective')) 84 | super(self.__class__, self).__init__(obj, subj, seed) 85 | 86 | 87 | class MPQAEval(BinaryClassifierEval): 88 | def __init__(self, task_path, seed=1111): 89 | logging.debug('***** Transfer task : MPQA *****\n\n') 90 | pos = self.loadFile(os.path.join(task_path, 'mpqa.pos')) 91 | neg = self.loadFile(os.path.join(task_path, 'mpqa.neg')) 92 | super(self.__class__, self).__init__(pos, neg, seed) 93 | -------------------------------------------------------------------------------- /SentEval/senteval/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | 10 | Generic sentence evaluation scripts wrapper 11 | 12 | ''' 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | from senteval import utils 16 | from senteval.binary import CREval, MREval, MPQAEval, SUBJEval 17 | from senteval.snli import SNLIEval 18 | from senteval.trec import TRECEval 19 | from senteval.sick import SICKEntailmentEval, SICKEval 20 | from senteval.mrpc import MRPCEval 21 | from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval, SICKRelatednessEval, STSBenchmarkFinetune 22 | from senteval.sst import SSTEval 23 | from senteval.rank import ImageCaptionRetrievalEval 24 | from senteval.probing import * 25 | 26 | class SE(object): 27 | def __init__(self, params, batcher, prepare=None): 28 | # parameters 29 | params = utils.dotdict(params) 30 | params.usepytorch = True if 'usepytorch' not in params else params.usepytorch 31 | params.seed = 1111 if 'seed' not in params else params.seed 32 | 33 | params.batch_size = 128 if 'batch_size' not in params else params.batch_size 34 | params.nhid = 0 if 'nhid' not in params else params.nhid 35 | params.kfold = 5 if 'kfold' not in params else params.kfold 36 | 37 | if 'classifier' not in params or not params['classifier']: 38 | params.classifier = {'nhid': 0} 39 | 40 | assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!' 41 | 42 | self.params = params 43 | 44 | # batcher and prepare 45 | self.batcher = batcher 46 | self.prepare = prepare if prepare else lambda x, y: None 47 | 48 | self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 49 | 'SICKRelatedness', 'SICKEntailment', 'STSBenchmark', 50 | 'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13', 51 | 'STS14', 'STS15', 'STS16', 52 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 53 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 54 | 'OddManOut', 'CoordinationInversion', 'SICKRelatedness-finetune', 'STSBenchmark-finetune', 'STSBenchmark-fix'] 55 | 56 | def eval(self, name): 57 | # evaluate on evaluation [name], either takes string or list of strings 58 | if (isinstance(name, list)): 59 | self.results = {x: self.eval(x) for x in name} 60 | return self.results 61 | 62 | tpath = self.params.task_path 63 | assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks) 64 | 65 | # Original SentEval tasks 66 | if name == 'CR': 67 | self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed) 68 | elif name == 'MR': 69 | self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed) 70 | elif name == 'MPQA': 71 | self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed) 72 | elif name == 'SUBJ': 73 | self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed) 74 | elif name == 'SST2': 75 | self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed) 76 | elif name == 'SST5': 77 | self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed) 78 | elif name == 'TREC': 79 | self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed) 80 | elif name == 'MRPC': 81 | self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed) 82 | elif name == 'SICKRelatedness': 83 | self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed) 84 | elif name == 'STSBenchmark': 85 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', 
seed=self.params.seed) 86 | elif name == 'STSBenchmark-fix': 87 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark-fix', seed=self.params.seed) 88 | elif name == 'STSBenchmark-finetune': 89 | self.evaluation = STSBenchmarkFinetune(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed) 90 | elif name == 'SICKRelatedness-finetune': 91 | self.evaluation = SICKEval(tpath + '/downstream/SICK', seed=self.params.seed) 92 | elif name == 'SICKEntailment': 93 | self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed) 94 | elif name == 'SNLI': 95 | self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed) 96 | elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: 97 | fpath = name + '-en-test' 98 | self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed) 99 | elif name == 'ImageCaptionRetrieval': 100 | self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed) 101 | 102 | # Probing Tasks 103 | elif name == 'Length': 104 | self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed) 105 | elif name == 'WordContent': 106 | self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed) 107 | elif name == 'Depth': 108 | self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed) 109 | elif name == 'TopConstituents': 110 | self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed) 111 | elif name == 'BigramShift': 112 | self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed) 113 | elif name == 'Tense': 114 | self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed) 115 | elif name == 'SubjNumber': 116 | self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed) 117 | elif name == 'ObjNumber': 118 | self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed) 119 | elif name == 'OddManOut': 120 | self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed) 121 | elif name == 'CoordinationInversion': 122 | self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed) 123 | 124 | self.params.current_task = name 125 | self.evaluation.do_prepare(self.params, self.prepare) 126 | 127 | self.results = self.evaluation.run(self.params, self.batcher) 128 | 129 | return self.results 130 | -------------------------------------------------------------------------------- /SentEval/senteval/mrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | MRPC : Microsoft Research Paraphrase (detection) Corpus 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import logging 15 | import numpy as np 16 | import io 17 | 18 | from senteval.tools.validation import KFoldClassifier 19 | 20 | from sklearn.metrics import f1_score 21 | 22 | 23 | class MRPCEval(object): 24 | def __init__(self, task_path, seed=1111): 25 | logging.info('***** Transfer task : MRPC *****\n\n') 26 | self.seed = seed 27 | train = self.loadFile(os.path.join(task_path, 28 | 'msr_paraphrase_train.txt')) 29 | test = self.loadFile(os.path.join(task_path, 30 | 'msr_paraphrase_test.txt')) 31 | self.mrpc_data = {'train': train, 'test': test} 32 | 33 | def do_prepare(self, params, prepare): 34 | # TODO : Should we separate samples in "train, test"? 35 | samples = self.mrpc_data['train']['X_A'] + \ 36 | self.mrpc_data['train']['X_B'] + \ 37 | self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B'] 38 | return prepare(params, samples) 39 | 40 | def loadFile(self, fpath): 41 | mrpc_data = {'X_A': [], 'X_B': [], 'y': []} 42 | with io.open(fpath, 'r', encoding='utf-8') as f: 43 | for line in f: 44 | text = line.strip().split('\t') 45 | mrpc_data['X_A'].append(text[3].split()) 46 | mrpc_data['X_B'].append(text[4].split()) 47 | mrpc_data['y'].append(text[0]) 48 | 49 | mrpc_data['X_A'] = mrpc_data['X_A'][1:] 50 | mrpc_data['X_B'] = mrpc_data['X_B'][1:] 51 | mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]] 52 | return mrpc_data 53 | 54 | def run(self, params, batcher): 55 | mrpc_embed = {'train': {}, 'test': {}} 56 | 57 | for key in self.mrpc_data: 58 | logging.info('Computing embedding for {0}'.format(key)) 59 | # Sort to reduce padding 60 | text_data = {} 61 | sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'], 62 | self.mrpc_data[key]['X_B'], 63 | self.mrpc_data[key]['y']), 64 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 65 | 66 | text_data['A'] = [x for (x, y, z) in sorted_corpus] 67 | text_data['B'] = [y for (x, y, z) in sorted_corpus] 68 | text_data['y'] = [z for (x, y, z) in sorted_corpus] 69 | 70 | for txt_type in ['A', 'B']: 71 | mrpc_embed[key][txt_type] = [] 72 | for ii in range(0, len(text_data['y']), params.batch_size): 73 | batch = text_data[txt_type][ii:ii + params.batch_size] 74 | embeddings = batcher(params, batch) 75 | mrpc_embed[key][txt_type].append(embeddings) 76 | mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type]) 77 | mrpc_embed[key]['y'] = np.array(text_data['y']) 78 | logging.info('Computed {0} embeddings'.format(key)) 79 | 80 | # Train 81 | trainA = mrpc_embed['train']['A'] 82 | trainB = mrpc_embed['train']['B'] 83 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 84 | trainY = mrpc_embed['train']['y'] 85 | 86 | # Test 87 | testA = mrpc_embed['test']['A'] 88 | testB = mrpc_embed['test']['B'] 89 | testF = np.c_[np.abs(testA - testB), testA * testB] 90 | testY = mrpc_embed['test']['y'] 91 | 92 | config = {'nclasses': 2, 'seed': self.seed, 93 | 'usepytorch': params.usepytorch, 94 | 'classifier': params.classifier, 95 | 'nhid': params.nhid, 'kfold': params.kfold} 96 | clf = KFoldClassifier(train={'X': trainF, 'y': trainY}, 97 | test={'X': testF, 'y': testY}, config=config) 98 | 99 | devacc, testacc, yhat = clf.run() 100 | testf1 = round(100*f1_score(testY, yhat), 2) 101 | logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n' 102 | .format(devacc, testacc, testf1)) 103 | return {'devacc': devacc, 'acc': testacc, 'f1': testf1, 104 | 'ndev': 
len(trainA), 'ntest': len(testA)} 105 | -------------------------------------------------------------------------------- /SentEval/senteval/probing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | probing tasks 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import copy 17 | import logging 18 | import numpy as np 19 | 20 | from senteval.tools.validation import SplitClassifier 21 | 22 | 23 | class PROBINGEval(object): 24 | def __init__(self, task, task_path, seed=1111): 25 | self.seed = seed 26 | self.task = task 27 | logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper()) 28 | self.task_data = {'train': {'X': [], 'y': []}, 29 | 'dev': {'X': [], 'y': []}, 30 | 'test': {'X': [], 'y': []}} 31 | self.loadFile(task_path) 32 | logging.info('Loaded %s train - %s dev - %s test for %s' % 33 | (len(self.task_data['train']['y']), len(self.task_data['dev']['y']), 34 | len(self.task_data['test']['y']), self.task)) 35 | 36 | def do_prepare(self, params, prepare): 37 | samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \ 38 | self.task_data['test']['X'] 39 | return prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'} 43 | with io.open(fpath, 'r', encoding='utf-8') as f: 44 | for line in f: 45 | line = line.rstrip().split('\t') 46 | self.task_data[self.tok2split[line[0]]]['X'].append(line[-1].split()) 47 | self.task_data[self.tok2split[line[0]]]['y'].append(line[1]) 48 | 49 | labels = sorted(np.unique(self.task_data['train']['y'])) 50 | self.tok2label = dict(zip(labels, range(len(labels)))) 51 | self.nclasses = len(self.tok2label) 52 | 53 | for split in self.task_data: 54 | for i, y in enumerate(self.task_data[split]['y']): 55 | self.task_data[split]['y'][i] = self.tok2label[y] 56 | 57 | def run(self, params, batcher): 58 | task_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | logging.info('Computing embeddings for train/dev/test') 61 | for key in self.task_data: 62 | # Sort to reduce padding 63 | sorted_data = sorted(zip(self.task_data[key]['X'], 64 | self.task_data[key]['y']), 65 | key=lambda z: (len(z[0]), z[1])) 66 | self.task_data[key]['X'], self.task_data[key]['y'] = map(list, zip(*sorted_data)) 67 | 68 | task_embed[key]['X'] = [] 69 | for ii in range(0, len(self.task_data[key]['y']), bsize): 70 | batch = self.task_data[key]['X'][ii:ii + bsize] 71 | embeddings = batcher(params, batch) 72 | task_embed[key]['X'].append(embeddings) 73 | task_embed[key]['X'] = np.vstack(task_embed[key]['X']) 74 | task_embed[key]['y'] = np.array(self.task_data[key]['y']) 75 | logging.info('Computed embeddings') 76 | 77 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed, 78 | 'usepytorch': params.usepytorch, 79 | 'classifier': params.classifier} 80 | 81 | if self.task == "WordContent" and params.classifier['nhid'] > 0: 82 | config_classifier = copy.deepcopy(config_classifier) 83 | config_classifier['classifier']['nhid'] = 0 84 | print(params.classifier['nhid']) 85 | 86 | clf = SplitClassifier(X={'train': task_embed['train']['X'], 87 | 'valid': task_embed['dev']['X'], 88 | 'test': task_embed['test']['X']}, 89 
| y={'train': task_embed['train']['y'], 90 | 'valid': task_embed['dev']['y'], 91 | 'test': task_embed['test']['y']}, 92 | config=config_classifier) 93 | 94 | devacc, testacc = clf.run() 95 | logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n' % (devacc, testacc, self.task.upper())) 96 | 97 | return {'devacc': devacc, 'acc': testacc, 98 | 'ndev': len(task_embed['dev']['X']), 99 | 'ntest': len(task_embed['test']['X'])} 100 | 101 | """ 102 | Surface Information 103 | """ 104 | class LengthEval(PROBINGEval): 105 | def __init__(self, task_path, seed=1111): 106 | task_path = os.path.join(task_path, 'sentence_length.txt') 107 | # labels: bins 108 | PROBINGEval.__init__(self, 'Length', task_path, seed) 109 | 110 | class WordContentEval(PROBINGEval): 111 | def __init__(self, task_path, seed=1111): 112 | task_path = os.path.join(task_path, 'word_content.txt') 113 | # labels: 200 target words 114 | PROBINGEval.__init__(self, 'WordContent', task_path, seed) 115 | 116 | """ 117 | Latent Structural Information 118 | """ 119 | class DepthEval(PROBINGEval): 120 | def __init__(self, task_path, seed=1111): 121 | task_path = os.path.join(task_path, 'tree_depth.txt') 122 | # labels: bins 123 | PROBINGEval.__init__(self, 'Depth', task_path, seed) 124 | 125 | class TopConstituentsEval(PROBINGEval): 126 | def __init__(self, task_path, seed=1111): 127 | task_path = os.path.join(task_path, 'top_constituents.txt') 128 | # labels: 'PP_NP_VP_.' .. (20 classes) 129 | PROBINGEval.__init__(self, 'TopConstituents', task_path, seed) 130 | 131 | class BigramShiftEval(PROBINGEval): 132 | def __init__(self, task_path, seed=1111): 133 | task_path = os.path.join(task_path, 'bigram_shift.txt') 134 | # labels: 0 or 1 135 | PROBINGEval.__init__(self, 'BigramShift', task_path, seed) 136 | 137 | # TODO: Voice? 138 | 139 | """ 140 | Latent Semantic Information 141 | """ 142 | 143 | class TenseEval(PROBINGEval): 144 | def __init__(self, task_path, seed=1111): 145 | task_path = os.path.join(task_path, 'past_present.txt') 146 | # labels: 'PRES', 'PAST' 147 | PROBINGEval.__init__(self, 'Tense', task_path, seed) 148 | 149 | class SubjNumberEval(PROBINGEval): 150 | def __init__(self, task_path, seed=1111): 151 | task_path = os.path.join(task_path, 'subj_number.txt') 152 | # labels: 'NN', 'NNS' 153 | PROBINGEval.__init__(self, 'SubjNumber', task_path, seed) 154 | 155 | class ObjNumberEval(PROBINGEval): 156 | def __init__(self, task_path, seed=1111): 157 | task_path = os.path.join(task_path, 'obj_number.txt') 158 | # labels: 'NN', 'NNS' 159 | PROBINGEval.__init__(self, 'ObjNumber', task_path, seed) 160 | 161 | class OddManOutEval(PROBINGEval): 162 | def __init__(self, task_path, seed=1111): 163 | task_path = os.path.join(task_path, 'odd_man_out.txt') 164 | # labels: 'O', 'C' 165 | PROBINGEval.__init__(self, 'OddManOut', task_path, seed) 166 | 167 | class CoordinationInversionEval(PROBINGEval): 168 | def __init__(self, task_path, seed=1111): 169 | task_path = os.path.join(task_path, 'coordination_inversion.txt') 170 | # labels: 'O', 'I' 171 | PROBINGEval.__init__(self, 'CoordinationInversion', task_path, seed) 172 | -------------------------------------------------------------------------------- /SentEval/senteval/rank.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | Image-Caption Retrieval with COCO dataset 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import sys 15 | import logging 16 | import numpy as np 17 | 18 | try: 19 | import cPickle as pickle 20 | except ImportError: 21 | import pickle 22 | 23 | from senteval.tools.ranking import ImageSentenceRankingPytorch 24 | 25 | 26 | class ImageCaptionRetrievalEval(object): 27 | def __init__(self, task_path, seed=1111): 28 | logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n') 29 | 30 | # Get captions and image features 31 | self.seed = seed 32 | train, dev, test = self.loadFile(task_path) 33 | self.coco_data = {'train': train, 'dev': dev, 'test': test} 34 | 35 | def do_prepare(self, params, prepare): 36 | samples = self.coco_data['train']['sent'] + \ 37 | self.coco_data['dev']['sent'] + \ 38 | self.coco_data['test']['sent'] 39 | prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | coco = {} 43 | 44 | for split in ['train', 'valid', 'test']: 45 | list_sent = [] 46 | list_img_feat = [] 47 | if sys.version_info < (3, 0): 48 | with open(os.path.join(fpath, split + '.pkl')) as f: 49 | cocodata = pickle.load(f) 50 | else: 51 | with open(os.path.join(fpath, split + '.pkl'), 'rb') as f: 52 | cocodata = pickle.load(f, encoding='latin1') 53 | 54 | for imgkey in range(len(cocodata['features'])): 55 | assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \ 56 | cocodata['image_to_caption_ids'][imgkey] 57 | for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]: 58 | sent = cocodata['captions'][captkey]['cleaned_caption'] 59 | sent += ' .' # add punctuation to end of sentence in COCO 60 | list_sent.append(sent.encode('utf-8').split()) 61 | list_img_feat.append(cocodata['features'][imgkey]) 62 | assert len(list_sent) == len(list_img_feat) and \ 63 | len(list_sent) % 5 == 0 64 | list_img_feat = np.array(list_img_feat).astype('float32') 65 | coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat} 66 | return coco['train'], coco['valid'], coco['test'] 67 | 68 | def run(self, params, batcher): 69 | coco_embed = {'train': {'sentfeat': [], 'imgfeat': []}, 70 | 'dev': {'sentfeat': [], 'imgfeat': []}, 71 | 'test': {'sentfeat': [], 'imgfeat': []}} 72 | 73 | for key in self.coco_data: 74 | logging.info('Computing embedding for {0}'.format(key)) 75 | # Sort to reduce padding 76 | self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent']) 77 | self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent']) 78 | idx_unsort = np.argsort(idx_sort) 79 | 80 | coco_embed[key]['X'] = [] 81 | nsent = len(self.coco_data[key]['sent']) 82 | for ii in range(0, nsent, params.batch_size): 83 | batch = self.coco_data[key]['sent'][ii:ii + params.batch_size] 84 | embeddings = batcher(params, batch) 85 | coco_embed[key]['sentfeat'].append(embeddings) 86 | coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort] 87 | coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat']) 88 | logging.info('Computed {0} embeddings'.format(key)) 89 | 90 | config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2} 91 | clf = ImageSentenceRankingPytorch(train=coco_embed['train'], 92 | valid=coco_embed['dev'], 93 | test=coco_embed['test'], 94 | config=config) 95 | 96 | bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \ 97 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run() 98 | 99 | logging.debug("\nTest scores | Image to text: \ 100 | {0}, 
{1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 101 | logging.debug("Test scores | Text to image: \ 102 | {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 103 | 104 | return {'devacc': bestdevscore, 105 | 'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t), 106 | (r1_t2i, r5_t2i, r10_t2i, medr_t2i)], 107 | 'ndev': len(coco_embed['dev']['sentfeat']), 108 | 'ntest': len(coco_embed['test']['sentfeat'])} 109 | -------------------------------------------------------------------------------- /SentEval/senteval/sick.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SICK Relatedness and Entailment 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import io 15 | import logging 16 | import numpy as np 17 | 18 | from sklearn.metrics import mean_squared_error 19 | from scipy.stats import pearsonr, spearmanr 20 | 21 | from senteval.tools.relatedness import RelatednessPytorch 22 | from senteval.tools.validation import SplitClassifier 23 | 24 | class SICKEval(object): 25 | def __init__(self, task_path, seed=1111): 26 | logging.debug('***** Transfer task : SICK-Relatedness*****\n\n') 27 | self.seed = seed 28 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt')) 29 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt')) 30 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt')) 31 | self.sick_data = {'train': train, 'dev': dev, 'test': test} 32 | 33 | def do_prepare(self, params, prepare): 34 | samples = self.sick_data['train']['X_A'] + \ 35 | self.sick_data['train']['X_B'] + \ 36 | self.sick_data['dev']['X_A'] + \ 37 | self.sick_data['dev']['X_B'] + \ 38 | self.sick_data['test']['X_A'] + self.sick_data['test']['X_B'] 39 | return prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | skipFirstLine = True 43 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 44 | with io.open(fpath, 'r', encoding='utf-8') as f: 45 | for line in f: 46 | if skipFirstLine: 47 | skipFirstLine = False 48 | else: 49 | text = line.strip().split('\t') 50 | sick_data['X_A'].append(text[1].split()) 51 | sick_data['X_B'].append(text[2].split()) 52 | sick_data['y'].append(text[3]) 53 | 54 | sick_data['y'] = [float(s) for s in sick_data['y']] 55 | return sick_data 56 | 57 | def run(self, params, batcher): 58 | sick_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | 61 | for key in self.sick_data: 62 | logging.info('Computing embedding for {0}'.format(key)) 63 | # Sort to reduce padding 64 | sorted_corpus = sorted(zip(self.sick_data[key]['X_A'], 65 | self.sick_data[key]['X_B'], 66 | self.sick_data[key]['y']), 67 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 68 | 69 | self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus] 70 | self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus] 71 | self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus] 72 | 73 | for txt_type in ['X_A', 'X_B']: 74 | sick_embed[key][txt_type] = [] 75 | for ii in range(0, len(self.sick_data[key]['y']), bsize): 76 | batch = self.sick_data[key][txt_type][ii:ii + bsize] 77 | embeddings = batcher(params, batch) 78 | sick_embed[key][txt_type].append(embeddings) 79 | sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type]) 80 | 
sick_embed[key]['y'] = np.array(self.sick_data[key]['y']) 81 | logging.info('Computed {0} embeddings'.format(key)) 82 | 83 | # Train 84 | trainA = sick_embed['train']['X_A'] 85 | trainB = sick_embed['train']['X_B'] 86 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 87 | trainY = self.encode_labels(self.sick_data['train']['y']) 88 | 89 | # Dev 90 | devA = sick_embed['dev']['X_A'] 91 | devB = sick_embed['dev']['X_B'] 92 | devF = np.c_[np.abs(devA - devB), devA * devB] 93 | devY = self.encode_labels(self.sick_data['dev']['y']) 94 | 95 | # Test 96 | testA = sick_embed['test']['X_A'] 97 | testB = sick_embed['test']['X_B'] 98 | testF = np.c_[np.abs(testA - testB), testA * testB] 99 | testY = self.encode_labels(self.sick_data['test']['y']) 100 | 101 | config = {'seed': self.seed, 'nclasses': 5} 102 | clf = RelatednessPytorch(train={'X': trainF, 'y': trainY}, 103 | valid={'X': devF, 'y': devY}, 104 | test={'X': testF, 'y': testY}, 105 | devscores=self.sick_data['dev']['y'], 106 | config=config) 107 | 108 | devspr, yhat = clf.run() 109 | 110 | pr = pearsonr(yhat, self.sick_data['test']['y'])[0] 111 | sr = spearmanr(yhat, self.sick_data['test']['y'])[0] 112 | pr = 0 if pr != pr else pr 113 | sr = 0 if sr != sr else sr 114 | se = mean_squared_error(yhat, self.sick_data['test']['y']) 115 | logging.debug('Dev : Spearman {0}'.format(devspr)) 116 | logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \ 117 | for SICK Relatedness\n'.format(pr, sr, se)) 118 | 119 | return {'devspearman': devspr, 'pearson': pr, 'spearman': sr, 'mse': se, 120 | 'yhat': yhat, 'ndev': len(devA), 'ntest': len(testA)} 121 | 122 | def encode_labels(self, labels, nclass=5): 123 | """ 124 | Label encoding from Tree LSTM paper (Tai, Socher, Manning) 125 | """ 126 | Y = np.zeros((len(labels), nclass)).astype('float32') 127 | for j, y in enumerate(labels): 128 | for i in range(nclass): 129 | if i+1 == np.floor(y) + 1: 130 | Y[j, i] = y - np.floor(y) 131 | if i+1 == np.floor(y): 132 | Y[j, i] = np.floor(y) - y + 1 133 | return Y 134 | 135 | 136 | class SICKEntailmentEval(SICKEval): 137 | def __init__(self, task_path, seed=1111): 138 | logging.debug('***** Transfer task : SICK-Entailment*****\n\n') 139 | self.seed = seed 140 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt')) 141 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt')) 142 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt')) 143 | self.sick_data = {'train': train, 'dev': dev, 'test': test} 144 | 145 | def loadFile(self, fpath): 146 | label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2} 147 | skipFirstLine = True 148 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 149 | with io.open(fpath, 'r', encoding='utf-8') as f: 150 | for line in f: 151 | if skipFirstLine: 152 | skipFirstLine = False 153 | else: 154 | text = line.strip().split('\t') 155 | sick_data['X_A'].append(text[1].split()) 156 | sick_data['X_B'].append(text[2].split()) 157 | sick_data['y'].append(text[4]) 158 | sick_data['y'] = [label2id[s] for s in sick_data['y']] 159 | return sick_data 160 | 161 | def run(self, params, batcher): 162 | sick_embed = {'train': {}, 'dev': {}, 'test': {}} 163 | bsize = params.batch_size 164 | 165 | for key in self.sick_data: 166 | logging.info('Computing embedding for {0}'.format(key)) 167 | # Sort to reduce padding 168 | sorted_corpus = sorted(zip(self.sick_data[key]['X_A'], 169 | self.sick_data[key]['X_B'], 170 | self.sick_data[key]['y']), 171 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 172 | 173 | 
self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus] 174 | self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus] 175 | self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus] 176 | 177 | for txt_type in ['X_A', 'X_B']: 178 | sick_embed[key][txt_type] = [] 179 | for ii in range(0, len(self.sick_data[key]['y']), bsize): 180 | batch = self.sick_data[key][txt_type][ii:ii + bsize] 181 | embeddings = batcher(params, batch) 182 | sick_embed[key][txt_type].append(embeddings) 183 | sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type]) 184 | logging.info('Computed {0} embeddings'.format(key)) 185 | 186 | # Train 187 | trainA = sick_embed['train']['X_A'] 188 | trainB = sick_embed['train']['X_B'] 189 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 190 | trainY = np.array(self.sick_data['train']['y']) 191 | 192 | # Dev 193 | devA = sick_embed['dev']['X_A'] 194 | devB = sick_embed['dev']['X_B'] 195 | devF = np.c_[np.abs(devA - devB), devA * devB] 196 | devY = np.array(self.sick_data['dev']['y']) 197 | 198 | # Test 199 | testA = sick_embed['test']['X_A'] 200 | testB = sick_embed['test']['X_B'] 201 | testF = np.c_[np.abs(testA - testB), testA * testB] 202 | testY = np.array(self.sick_data['test']['y']) 203 | 204 | config = {'nclasses': 3, 'seed': self.seed, 205 | 'usepytorch': params.usepytorch, 206 | 'classifier': params.classifier, 207 | 'nhid': params.nhid} 208 | clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF}, 209 | y={'train': trainY, 'valid': devY, 'test': testY}, 210 | config=config) 211 | 212 | devacc, testacc = clf.run() 213 | logging.debug('\nDev acc : {0} Test acc : {1} for \ 214 | SICK entailment\n'.format(devacc, testacc)) 215 | return {'devacc': devacc, 'acc': testacc, 216 | 'ndev': len(devA), 'ntest': len(testA)} 217 | -------------------------------------------------------------------------------- /SentEval/senteval/snli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | SNLI - Entailment 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import codecs 14 | import os 15 | import io 16 | import copy 17 | import logging 18 | import numpy as np 19 | 20 | from senteval.tools.validation import SplitClassifier 21 | 22 | 23 | class SNLIEval(object): 24 | def __init__(self, taskpath, seed=1111): 25 | logging.debug('***** Transfer task : SNLI Entailment*****\n\n') 26 | self.seed = seed 27 | train1 = self.loadFile(os.path.join(taskpath, 's1.train')) 28 | train2 = self.loadFile(os.path.join(taskpath, 's2.train')) 29 | 30 | trainlabels = io.open(os.path.join(taskpath, 'labels.train'), 31 | encoding='utf-8').read().splitlines() 32 | 33 | valid1 = self.loadFile(os.path.join(taskpath, 's1.dev')) 34 | valid2 = self.loadFile(os.path.join(taskpath, 's2.dev')) 35 | validlabels = io.open(os.path.join(taskpath, 'labels.dev'), 36 | encoding='utf-8').read().splitlines() 37 | 38 | test1 = self.loadFile(os.path.join(taskpath, 's1.test')) 39 | test2 = self.loadFile(os.path.join(taskpath, 's2.test')) 40 | testlabels = io.open(os.path.join(taskpath, 'labels.test'), 41 | encoding='utf-8').read().splitlines() 42 | 43 | # sort data (by s2 first) to reduce padding 44 | sorted_train = sorted(zip(train2, train1, trainlabels), 45 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 46 | train2, train1, trainlabels = map(list, zip(*sorted_train)) 47 | 48 | sorted_valid = sorted(zip(valid2, valid1, validlabels), 49 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 50 | valid2, valid1, validlabels = map(list, zip(*sorted_valid)) 51 | 52 | sorted_test = sorted(zip(test2, test1, testlabels), 53 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 54 | test2, test1, testlabels = map(list, zip(*sorted_test)) 55 | 56 | self.samples = train1 + train2 + valid1 + valid2 + test1 + test2 57 | self.data = {'train': (train1, train2, trainlabels), 58 | 'valid': (valid1, valid2, validlabels), 59 | 'test': (test1, test2, testlabels) 60 | } 61 | 62 | def do_prepare(self, params, prepare): 63 | return prepare(params, self.samples) 64 | 65 | def loadFile(self, fpath): 66 | with codecs.open(fpath, 'rb', 'latin-1') as f: 67 | return [line.split() for line in 68 | f.read().splitlines()] 69 | 70 | def run(self, params, batcher): 71 | self.X, self.y = {}, {} 72 | dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2} 73 | for key in self.data: 74 | if key not in self.X: 75 | self.X[key] = [] 76 | if key not in self.y: 77 | self.y[key] = [] 78 | 79 | input1, input2, mylabels = self.data[key] 80 | enc_input = [] 81 | n_labels = len(mylabels) 82 | for ii in range(0, n_labels, params.batch_size): 83 | batch1 = input1[ii:ii + params.batch_size] 84 | batch2 = input2[ii:ii + params.batch_size] 85 | 86 | if len(batch1) == len(batch2) and len(batch1) > 0: 87 | enc1 = batcher(params, batch1) 88 | enc2 = batcher(params, batch2) 89 | enc_input.append(np.hstack((enc1, enc2, enc1 * enc2, 90 | np.abs(enc1 - enc2)))) 91 | if (ii*params.batch_size) % (20000*params.batch_size) == 0: 92 | logging.info("PROGRESS (encoding): %.2f%%" % 93 | (100 * ii / n_labels)) 94 | self.X[key] = np.vstack(enc_input) 95 | self.y[key] = [dico_label[y] for y in mylabels] 96 | 97 | config = {'nclasses': 3, 'seed': self.seed, 98 | 'usepytorch': params.usepytorch, 99 | 'cudaEfficient': True, 100 | 'nhid': params.nhid, 'noreg': True} 101 | 102 | config_classifier = copy.deepcopy(params.classifier) 103 | config_classifier['max_epoch'] = 15 104 | config_classifier['epoch_size'] = 1 105 | 
config['classifier'] = config_classifier 106 | 107 | clf = SplitClassifier(self.X, self.y, config) 108 | devacc, testacc = clf.run() 109 | logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n' 110 | .format(devacc, testacc)) 111 | return {'devacc': devacc, 'acc': testacc, 112 | 'ndev': len(self.data['valid'][0]), 113 | 'ntest': len(self.data['test'][0])} 114 | -------------------------------------------------------------------------------- /SentEval/senteval/sst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SST - binary classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import SplitClassifier 20 | 21 | 22 | class SSTEval(object): 23 | def __init__(self, task_path, nclasses=2, seed=1111): 24 | self.seed = seed 25 | 26 | # binary of fine-grained 27 | assert nclasses in [2, 5] 28 | self.nclasses = nclasses 29 | self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained' 30 | logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name) 31 | 32 | train = self.loadFile(os.path.join(task_path, 'sentiment-train')) 33 | dev = self.loadFile(os.path.join(task_path, 'sentiment-dev')) 34 | test = self.loadFile(os.path.join(task_path, 'sentiment-test')) 35 | self.sst_data = {'train': train, 'dev': dev, 'test': test} 36 | 37 | def do_prepare(self, params, prepare): 38 | samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \ 39 | self.sst_data['test']['X'] 40 | return prepare(params, samples) 41 | 42 | def loadFile(self, fpath): 43 | sst_data = {'X': [], 'y': []} 44 | with io.open(fpath, 'r', encoding='utf-8') as f: 45 | for line in f: 46 | if self.nclasses == 2: 47 | sample = line.strip().split('\t') 48 | sst_data['y'].append(int(sample[1])) 49 | sst_data['X'].append(sample[0].split()) 50 | elif self.nclasses == 5: 51 | sample = line.strip().split(' ', 1) 52 | sst_data['y'].append(int(sample[0])) 53 | sst_data['X'].append(sample[1].split()) 54 | assert max(sst_data['y']) == self.nclasses - 1 55 | return sst_data 56 | 57 | def run(self, params, batcher): 58 | sst_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | 61 | for key in self.sst_data: 62 | logging.info('Computing embedding for {0}'.format(key)) 63 | # Sort to reduce padding 64 | sorted_data = sorted(zip(self.sst_data[key]['X'], 65 | self.sst_data[key]['y']), 66 | key=lambda z: (len(z[0]), z[1])) 67 | self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data)) 68 | 69 | sst_embed[key]['X'] = [] 70 | for ii in range(0, len(self.sst_data[key]['y']), bsize): 71 | batch = self.sst_data[key]['X'][ii:ii + bsize] 72 | embeddings = batcher(params, batch) 73 | sst_embed[key]['X'].append(embeddings) 74 | sst_embed[key]['X'] = np.vstack(sst_embed[key]['X']) 75 | sst_embed[key]['y'] = np.array(self.sst_data[key]['y']) 76 | logging.info('Computed {0} embeddings'.format(key)) 77 | 78 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed, 79 | 'usepytorch': params.usepytorch, 80 | 'classifier': params.classifier} 81 | 82 | clf = SplitClassifier(X={'train': sst_embed['train']['X'], 83 | 'valid': sst_embed['dev']['X'], 
84 | 'test': sst_embed['test']['X']}, 85 | y={'train': sst_embed['train']['y'], 86 | 'valid': sst_embed['dev']['y'], 87 | 'test': sst_embed['test']['y']}, 88 | config=config_classifier) 89 | 90 | devacc, testacc = clf.run() 91 | logging.debug('\nDev acc : {0} Test acc : {1} for \ 92 | SST {2} classification\n'.format(devacc, testacc, self.task_name)) 93 | 94 | return {'devacc': devacc, 'acc': testacc, 95 | 'ndev': len(sst_embed['dev']['X']), 96 | 'ntest': len(sst_embed['test']['X'])} 97 | -------------------------------------------------------------------------------- /SentEval/senteval/sts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | STS-{2012,2013,2014,2015,2016} (unsupervised) and 10 | STS-benchmark (supervised) tasks 11 | ''' 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import os 16 | import io 17 | import numpy as np 18 | import logging 19 | 20 | from scipy.stats import spearmanr, pearsonr 21 | 22 | from senteval.utils import cosine 23 | from senteval.sick import SICKEval 24 | 25 | 26 | class STSEval(object): 27 | def loadFile(self, fpath): 28 | self.data = {} 29 | self.samples = [] 30 | 31 | for dataset in self.datasets: 32 | sent1, sent2 = zip(*[l.split("\t") for l in 33 | io.open(fpath + '/STS.input.%s.txt' % dataset, 34 | encoding='utf8').read().splitlines()]) 35 | raw_scores = np.array([x for x in 36 | io.open(fpath + '/STS.gs.%s.txt' % dataset, 37 | encoding='utf8') 38 | .read().splitlines()]) 39 | not_empty_idx = raw_scores != '' 40 | 41 | gs_scores = [float(x) for x in raw_scores[not_empty_idx]] 42 | sent1 = np.array([s.split() for s in sent1])[not_empty_idx] 43 | sent2 = np.array([s.split() for s in sent2])[not_empty_idx] 44 | # sort data by length to minimize padding in batcher 45 | sorted_data = sorted(zip(sent1, sent2, gs_scores), 46 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 47 | sent1, sent2, gs_scores = map(list, zip(*sorted_data)) 48 | 49 | self.data[dataset] = (sent1, sent2, gs_scores) 50 | self.samples += sent1 + sent2 51 | 52 | def do_prepare(self, params, prepare): 53 | if 'similarity' in params: 54 | self.similarity = params.similarity 55 | else: # Default similarity is cosine 56 | self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2))) 57 | return prepare(params, self.samples) 58 | 59 | def run(self, params, batcher): 60 | results = {} 61 | all_sys_scores = [] 62 | all_gs_scores = [] 63 | for dataset in self.datasets: 64 | sys_scores = [] 65 | input1, input2, gs_scores = self.data[dataset] 66 | for ii in range(0, len(gs_scores), params.batch_size): 67 | batch1 = input1[ii:ii + params.batch_size] 68 | batch2 = input2[ii:ii + params.batch_size] 69 | 70 | # we assume get_batch already throws out the faulty ones 71 | if len(batch1) == len(batch2) and len(batch1) > 0: 72 | enc1 = batcher(params, batch1) 73 | enc2 = batcher(params, batch2) 74 | 75 | for kk in range(enc2.shape[0]): 76 | sys_score = self.similarity(enc1[kk], enc2[kk]) 77 | sys_scores.append(sys_score) 78 | all_sys_scores.extend(sys_scores) 79 | all_gs_scores.extend(gs_scores) 80 | results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores), 81 | 'spearman': spearmanr(sys_scores, gs_scores), 82 | 'nsamples': len(sys_scores)} 83 | logging.debug('%s : 
pearson = %.4f, spearman = %.4f' % 84 | (dataset, results[dataset]['pearson'][0], 85 | results[dataset]['spearman'][0])) 86 | 87 | weights = [results[dset]['nsamples'] for dset in results.keys()] 88 | list_prs = np.array([results[dset]['pearson'][0] for 89 | dset in results.keys()]) 90 | list_spr = np.array([results[dset]['spearman'][0] for 91 | dset in results.keys()]) 92 | 93 | avg_pearson = np.average(list_prs) 94 | avg_spearman = np.average(list_spr) 95 | wavg_pearson = np.average(list_prs, weights=weights) 96 | wavg_spearman = np.average(list_spr, weights=weights) 97 | all_pearson = pearsonr(all_sys_scores, all_gs_scores) 98 | all_spearman = spearmanr(all_sys_scores, all_gs_scores) 99 | results['all'] = {'pearson': {'all': all_pearson[0], 100 | 'mean': avg_pearson, 101 | 'wmean': wavg_pearson}, 102 | 'spearman': {'all': all_spearman[0], 103 | 'mean': avg_spearman, 104 | 'wmean': wavg_spearman}} 105 | logging.debug('ALL : Pearson = %.4f, \ 106 | Spearman = %.4f' % (all_pearson[0], all_spearman[0])) 107 | logging.debug('ALL (weighted average) : Pearson = %.4f, \ 108 | Spearman = %.4f' % (wavg_pearson, wavg_spearman)) 109 | logging.debug('ALL (average) : Pearson = %.4f, \ 110 | Spearman = %.4f\n' % (avg_pearson, avg_spearman)) 111 | 112 | return results 113 | 114 | 115 | class STS12Eval(STSEval): 116 | def __init__(self, taskpath, seed=1111): 117 | logging.debug('***** Transfer task : STS12 *****\n\n') 118 | self.seed = seed 119 | self.datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl', 120 | 'surprise.OnWN', 'surprise.SMTnews'] 121 | self.loadFile(taskpath) 122 | 123 | 124 | class STS13Eval(STSEval): 125 | # STS13 here does not contain the "SMT" subtask due to LICENSE issue 126 | def __init__(self, taskpath, seed=1111): 127 | logging.debug('***** Transfer task : STS13 (-SMT) *****\n\n') 128 | self.seed = seed 129 | self.datasets = ['FNWN', 'headlines', 'OnWN'] 130 | self.loadFile(taskpath) 131 | 132 | 133 | class STS14Eval(STSEval): 134 | def __init__(self, taskpath, seed=1111): 135 | logging.debug('***** Transfer task : STS14 *****\n\n') 136 | self.seed = seed 137 | self.datasets = ['deft-forum', 'deft-news', 'headlines', 138 | 'images', 'OnWN', 'tweet-news'] 139 | self.loadFile(taskpath) 140 | 141 | 142 | class STS15Eval(STSEval): 143 | def __init__(self, taskpath, seed=1111): 144 | logging.debug('***** Transfer task : STS15 *****\n\n') 145 | self.seed = seed 146 | self.datasets = ['answers-forums', 'answers-students', 147 | 'belief', 'headlines', 'images'] 148 | self.loadFile(taskpath) 149 | 150 | 151 | class STS16Eval(STSEval): 152 | def __init__(self, taskpath, seed=1111): 153 | logging.debug('***** Transfer task : STS16 *****\n\n') 154 | self.seed = seed 155 | self.datasets = ['answer-answer', 'headlines', 'plagiarism', 156 | 'postediting', 'question-question'] 157 | self.loadFile(taskpath) 158 | 159 | 160 | class STSBenchmarkEval(STSEval): 161 | def __init__(self, task_path, seed=1111): 162 | logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n') 163 | self.seed = seed 164 | self.samples = [] 165 | train = self.loadFile(os.path.join(task_path, 'sts-train.csv')) 166 | dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv')) 167 | test = self.loadFile(os.path.join(task_path, 'sts-test.csv')) 168 | self.datasets = ['train', 'dev', 'test'] 169 | self.data = {'train': train, 'dev': dev, 'test': test} 170 | 171 | def loadFile(self, fpath): 172 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 173 | with io.open(fpath, 'r', encoding='utf-8') as f: 174 | for line in f: 175 | text = 
line.strip().split('\t') 176 | sick_data['X_A'].append(text[5].split()) 177 | sick_data['X_B'].append(text[6].split()) 178 | sick_data['y'].append(text[4]) 179 | 180 | sick_data['y'] = [float(s) for s in sick_data['y']] 181 | self.samples += sick_data['X_A'] + sick_data["X_B"] 182 | return (sick_data['X_A'], sick_data["X_B"], sick_data['y']) 183 | 184 | class STSBenchmarkFinetune(SICKEval): 185 | def __init__(self, task_path, seed=1111): 186 | logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n') 187 | self.seed = seed 188 | train = self.loadFile(os.path.join(task_path, 'sts-train.csv')) 189 | dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv')) 190 | test = self.loadFile(os.path.join(task_path, 'sts-test.csv')) 191 | self.sick_data = {'train': train, 'dev': dev, 'test': test} 192 | 193 | def loadFile(self, fpath): 194 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 195 | with io.open(fpath, 'r', encoding='utf-8') as f: 196 | for line in f: 197 | text = line.strip().split('\t') 198 | sick_data['X_A'].append(text[5].split()) 199 | sick_data['X_B'].append(text[6].split()) 200 | sick_data['y'].append(text[4]) 201 | 202 | sick_data['y'] = [float(s) for s in sick_data['y']] 203 | return sick_data 204 | 205 | class SICKRelatednessEval(STSEval): 206 | def __init__(self, task_path, seed=1111): 207 | logging.debug('\n\n***** Transfer task : SICKRelatedness*****\n\n') 208 | self.seed = seed 209 | self.samples = [] 210 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt')) 211 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt')) 212 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt')) 213 | self.datasets = ['train', 'dev', 'test'] 214 | self.data = {'train': train, 'dev': dev, 'test': test} 215 | 216 | def loadFile(self, fpath): 217 | skipFirstLine = True 218 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 219 | with io.open(fpath, 'r', encoding='utf-8') as f: 220 | for line in f: 221 | if skipFirstLine: 222 | skipFirstLine = False 223 | else: 224 | text = line.strip().split('\t') 225 | sick_data['X_A'].append(text[1].split()) 226 | sick_data['X_B'].append(text[2].split()) 227 | sick_data['y'].append(text[3]) 228 | 229 | sick_data['y'] = [float(s) for s in sick_data['y']] 230 | self.samples += sick_data['X_A'] + sick_data["X_B"] 231 | return (sick_data['X_A'], sick_data["X_B"], sick_data['y']) 232 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__init__.py -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/__init__.cpython-38.pyc 
-------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/classifier.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/classifier.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/classifier.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/classifier.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/ranking.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/ranking.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/ranking.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/ranking.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/relatedness.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/relatedness.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/relatedness.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/relatedness.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/validation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/validation.cpython-36.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/__pycache__/validation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nlpods/LayerAttPooler/a500107e792171dfe9680cc60ea8907db042ed28/SentEval/senteval/tools/__pycache__/validation.cpython-38.pyc -------------------------------------------------------------------------------- /SentEval/senteval/tools/classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | """ 9 | Pytorch Classifier class in the style of scikit-learn 10 | Classifiers include Logistic Regression and MLP 11 | """ 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import numpy as np 16 | import copy 17 | from senteval import utils 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | 23 | 24 | class PyTorchClassifier(object): 25 | def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111, 26 | cudaEfficient=False): 27 | # fix seed 28 | np.random.seed(seed) 29 | torch.manual_seed(seed) 30 | torch.cuda.manual_seed(seed) 31 | 32 | self.inputdim = inputdim 33 | self.nclasses = nclasses 34 | self.l2reg = l2reg 35 | self.batch_size = batch_size 36 | self.cudaEfficient = cudaEfficient 37 | 38 | def prepare_split(self, X, y, validation_data=None, validation_split=None): 39 | # Preparing validation data 40 | assert validation_split or validation_data 41 | if validation_data is not None: 42 | trainX, trainy = X, y 43 | devX, devy = validation_data 44 | else: 45 | permutation = np.random.permutation(len(X)) 46 | trainidx = permutation[int(validation_split * len(X)):] 47 | devidx = permutation[0:int(validation_split * len(X))] 48 | trainX, trainy = X[trainidx], y[trainidx] 49 | devX, devy = X[devidx], y[devidx] 50 | 51 | device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda') 52 | 53 | trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32) 54 | trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64) 55 | devX = torch.from_numpy(devX).to(device, dtype=torch.float32) 56 | devy = torch.from_numpy(devy).to(device, dtype=torch.int64) 57 | 58 | return trainX, trainy, devX, devy 59 | 60 | def fit(self, X, y, validation_data=None, validation_split=None, 61 | early_stop=True): 62 | self.nepoch = 0 63 | bestaccuracy = -1 64 | stop_train = False 65 | early_stop_count = 0 66 | 67 | # Preparing validation data 68 | trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data, 69 | validation_split) 70 | 71 | # Training 72 | while not stop_train and self.nepoch <= self.max_epoch: 73 | self.trainepoch(trainX, trainy, epoch_size=self.epoch_size) 74 | accuracy = self.score(devX, devy) 75 | if accuracy > bestaccuracy: 76 | bestaccuracy = accuracy 77 | bestmodel = copy.deepcopy(self.model) 78 | elif early_stop: 79 | if early_stop_count >= self.tenacity: 80 | stop_train = True 81 | early_stop_count += 1 82 | self.model = bestmodel 83 | return bestaccuracy 84 | 85 | def trainepoch(self, X, y, epoch_size=1): 86 | self.model.train() 87 | for _ in range(self.nepoch, self.nepoch + epoch_size): 88 | permutation = np.random.permutation(len(X)) 89 | all_costs = [] 90 | for i in range(0, len(X), self.batch_size): 91 | # forward 92 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device) 93 | 94 | Xbatch = X[idx] 95 | ybatch = y[idx] 96 | 97 | if self.cudaEfficient: 98 | Xbatch = Xbatch.cuda() 99 | ybatch = ybatch.cuda() 100 | output = self.model(Xbatch) 101 | # loss 102 | loss = self.loss_fn(output, ybatch) 103 | all_costs.append(loss.data.item()) 104 | # backward 105 | self.optimizer.zero_grad() 106 | loss.backward() 107 | # Update parameters 108 | self.optimizer.step() 109 | self.nepoch += epoch_size 110 | 111 | def score(self, devX, devy): 112 | self.model.eval() 113 | correct = 0 114 | if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient: 115 | devX = torch.FloatTensor(devX).cuda() 116 | devy = torch.LongTensor(devy).cuda() 
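        # devX / devy are cast to CUDA float / long tensors above when needed;
        # scoring below runs batch-by-batch with gradients disabled, and
        # accuracy is the fraction of argmax predictions matching devy.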
117 | with torch.no_grad(): 118 | for i in range(0, len(devX), self.batch_size): 119 | Xbatch = devX[i:i + self.batch_size] 120 | ybatch = devy[i:i + self.batch_size] 121 | if self.cudaEfficient: 122 | Xbatch = Xbatch.cuda() 123 | ybatch = ybatch.cuda() 124 | output = self.model(Xbatch) 125 | pred = output.data.max(1)[1] 126 | correct += pred.long().eq(ybatch.data.long()).sum().item() 127 | accuracy = 1.0 * correct / len(devX) 128 | return accuracy 129 | 130 | def predict(self, devX): 131 | self.model.eval() 132 | if not isinstance(devX, torch.cuda.FloatTensor): 133 | devX = torch.FloatTensor(devX).cuda() 134 | yhat = np.array([]) 135 | with torch.no_grad(): 136 | for i in range(0, len(devX), self.batch_size): 137 | Xbatch = devX[i:i + self.batch_size] 138 | output = self.model(Xbatch) 139 | yhat = np.append(yhat, 140 | output.data.max(1)[1].cpu().numpy()) 141 | yhat = np.vstack(yhat) 142 | return yhat 143 | 144 | def predict_proba(self, devX): 145 | self.model.eval() 146 | probas = [] 147 | with torch.no_grad(): 148 | for i in range(0, len(devX), self.batch_size): 149 | Xbatch = devX[i:i + self.batch_size] 150 | vals = F.softmax(self.model(Xbatch).data.cpu().numpy()) 151 | if not probas: 152 | probas = vals 153 | else: 154 | probas = np.concatenate(probas, vals, axis=0) 155 | return probas 156 | 157 | 158 | """ 159 | MLP with Pytorch (nhid=0 --> Logistic Regression) 160 | """ 161 | 162 | class MLP(PyTorchClassifier): 163 | def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64, 164 | seed=1111, cudaEfficient=False): 165 | super(self.__class__, self).__init__(inputdim, nclasses, l2reg, 166 | batch_size, seed, cudaEfficient) 167 | """ 168 | PARAMETERS: 169 | -nhid: number of hidden units (0: Logistic Regression) 170 | -optim: optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..) 171 | -tenacity: how many times dev acc does not increase before stopping 172 | -epoch_size: each epoch corresponds to epoch_size pass on the train set 173 | -max_epoch: max number of epoches 174 | -dropout: dropout for MLP 175 | """ 176 | 177 | self.nhid = 0 if "nhid" not in params else params["nhid"] 178 | self.optim = "adam" if "optim" not in params else params["optim"] 179 | self.tenacity = 5 if "tenacity" not in params else params["tenacity"] 180 | self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"] 181 | self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"] 182 | self.dropout = 0. if "dropout" not in params else params["dropout"] 183 | self.batch_size = 64 if "batch_size" not in params else params["batch_size"] 184 | 185 | if params["nhid"] == 0: 186 | self.model = nn.Sequential( 187 | nn.Linear(self.inputdim, self.nclasses), 188 | ).cuda() 189 | else: 190 | self.model = nn.Sequential( 191 | nn.Linear(self.inputdim, params["nhid"]), 192 | nn.Dropout(p=self.dropout), 193 | nn.Sigmoid(), 194 | nn.Linear(params["nhid"], self.nclasses), 195 | ).cuda() 196 | 197 | self.loss_fn = nn.CrossEntropyLoss().cuda() 198 | self.loss_fn.size_average = False 199 | 200 | optim_fn, optim_params = utils.get_optimizer(self.optim) 201 | self.optimizer = optim_fn(self.model.parameters(), **optim_params) 202 | self.optimizer.param_groups[0]['weight_decay'] = self.l2reg 203 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/ranking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 
3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Image Annotation/Search for COCO with Pytorch 10 | """ 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import logging 14 | import copy 15 | import numpy as np 16 | 17 | import torch 18 | from torch import nn 19 | from torch.autograd import Variable 20 | import torch.optim as optim 21 | 22 | 23 | class COCOProjNet(nn.Module): 24 | def __init__(self, config): 25 | super(COCOProjNet, self).__init__() 26 | self.imgdim = config['imgdim'] 27 | self.sentdim = config['sentdim'] 28 | self.projdim = config['projdim'] 29 | self.imgproj = nn.Sequential( 30 | nn.Linear(self.imgdim, self.projdim), 31 | ) 32 | self.sentproj = nn.Sequential( 33 | nn.Linear(self.sentdim, self.projdim), 34 | ) 35 | 36 | def forward(self, img, sent, imgc, sentc): 37 | # imgc : (bsize, ncontrast, imgdim) 38 | # sentc : (bsize, ncontrast, sentdim) 39 | # img : (bsize, imgdim) 40 | # sent : (bsize, sentdim) 41 | img = img.unsqueeze(1).expand_as(imgc).contiguous() 42 | img = img.view(-1, self.imgdim) 43 | imgc = imgc.view(-1, self.imgdim) 44 | sent = sent.unsqueeze(1).expand_as(sentc).contiguous() 45 | sent = sent.view(-1, self.sentdim) 46 | sentc = sentc.view(-1, self.sentdim) 47 | 48 | imgproj = self.imgproj(img) 49 | imgproj = imgproj / torch.sqrt(torch.pow(imgproj, 2).sum(1, keepdim=True)).expand_as(imgproj) 50 | imgcproj = self.imgproj(imgc) 51 | imgcproj = imgcproj / torch.sqrt(torch.pow(imgcproj, 2).sum(1, keepdim=True)).expand_as(imgcproj) 52 | sentproj = self.sentproj(sent) 53 | sentproj = sentproj / torch.sqrt(torch.pow(sentproj, 2).sum(1, keepdim=True)).expand_as(sentproj) 54 | sentcproj = self.sentproj(sentc) 55 | sentcproj = sentcproj / torch.sqrt(torch.pow(sentcproj, 2).sum(1, keepdim=True)).expand_as(sentcproj) 56 | # (bsize*ncontrast, projdim) 57 | 58 | anchor1 = torch.sum((imgproj*sentproj), 1) 59 | anchor2 = torch.sum((sentproj*imgproj), 1) 60 | img_sentc = torch.sum((imgproj*sentcproj), 1) 61 | sent_imgc = torch.sum((sentproj*imgcproj), 1) 62 | 63 | # (bsize*ncontrast) 64 | return anchor1, anchor2, img_sentc, sent_imgc 65 | 66 | def proj_sentence(self, sent): 67 | output = self.sentproj(sent) 68 | output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output) 69 | return output # (bsize, projdim) 70 | 71 | def proj_image(self, img): 72 | output = self.imgproj(img) 73 | output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output) 74 | return output # (bsize, projdim) 75 | 76 | 77 | class PairwiseRankingLoss(nn.Module): 78 | """ 79 | Pairwise ranking loss 80 | """ 81 | def __init__(self, margin): 82 | super(PairwiseRankingLoss, self).__init__() 83 | self.margin = margin 84 | 85 | def forward(self, anchor1, anchor2, img_sentc, sent_imgc): 86 | 87 | cost_sent = torch.clamp(self.margin - anchor1 + img_sentc, 88 | min=0.0).sum() 89 | cost_img = torch.clamp(self.margin - anchor2 + sent_imgc, 90 | min=0.0).sum() 91 | loss = cost_sent + cost_img 92 | return loss 93 | 94 | 95 | class ImageSentenceRankingPytorch(object): 96 | # Image Sentence Ranking on COCO with Pytorch 97 | def __init__(self, train, valid, test, config): 98 | # fix seed 99 | self.seed = config['seed'] 100 | np.random.seed(self.seed) 101 | torch.manual_seed(self.seed) 102 | torch.cuda.manual_seed(self.seed) 103 | 104 | self.train = train 105 | self.valid = valid 106 | self.test = test 107 | 108 | self.imgdim = 
len(train['imgfeat'][0]) 109 | self.sentdim = len(train['sentfeat'][0]) 110 | self.projdim = config['projdim'] 111 | self.margin = config['margin'] 112 | 113 | self.batch_size = 128 114 | self.ncontrast = 30 115 | self.maxepoch = 20 116 | self.early_stop = True 117 | 118 | config_model = {'imgdim': self.imgdim,'sentdim': self.sentdim, 119 | 'projdim': self.projdim} 120 | self.model = COCOProjNet(config_model).cuda() 121 | 122 | self.loss_fn = PairwiseRankingLoss(margin=self.margin).cuda() 123 | 124 | self.optimizer = optim.Adam(self.model.parameters()) 125 | 126 | def prepare_data(self, trainTxt, trainImg, devTxt, devImg, 127 | testTxt, testImg): 128 | trainTxt = torch.FloatTensor(trainTxt) 129 | trainImg = torch.FloatTensor(trainImg) 130 | devTxt = torch.FloatTensor(devTxt).cuda() 131 | devImg = torch.FloatTensor(devImg).cuda() 132 | testTxt = torch.FloatTensor(testTxt).cuda() 133 | testImg = torch.FloatTensor(testImg).cuda() 134 | 135 | return trainTxt, trainImg, devTxt, devImg, testTxt, testImg 136 | 137 | def run(self): 138 | self.nepoch = 0 139 | bestdevscore = -1 140 | early_stop_count = 0 141 | stop_train = False 142 | 143 | # Preparing data 144 | logging.info('prepare data') 145 | trainTxt, trainImg, devTxt, devImg, testTxt, testImg = \ 146 | self.prepare_data(self.train['sentfeat'], self.train['imgfeat'], 147 | self.valid['sentfeat'], self.valid['imgfeat'], 148 | self.test['sentfeat'], self.test['imgfeat']) 149 | 150 | # Training 151 | while not stop_train and self.nepoch <= self.maxepoch: 152 | logging.info('start epoch') 153 | self.trainepoch(trainTxt, trainImg, devTxt, devImg, nepoches=1) 154 | logging.info('Epoch {0} finished'.format(self.nepoch)) 155 | 156 | results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0}, 157 | 't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0}, 158 | 'dev': bestdevscore} 159 | score = 0 160 | for i in range(5): 161 | devTxt_i = devTxt[i*5000:(i+1)*5000] 162 | devImg_i = devImg[i*5000:(i+1)*5000] 163 | # Compute dev ranks img2txt 164 | r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg_i, 165 | devTxt_i) 166 | results['i2t']['r1'] += r1_i2t / 5 167 | results['i2t']['r5'] += r5_i2t / 5 168 | results['i2t']['r10'] += r10_i2t / 5 169 | results['i2t']['medr'] += medr_i2t / 5 170 | logging.info("Image to text: {0}, {1}, {2}, {3}" 171 | .format(r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 172 | # Compute dev ranks txt2img 173 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg_i, 174 | devTxt_i) 175 | results['t2i']['r1'] += r1_t2i / 5 176 | results['t2i']['r5'] += r5_t2i / 5 177 | results['t2i']['r10'] += r10_t2i / 5 178 | results['t2i']['medr'] += medr_t2i / 5 179 | logging.info("Text to Image: {0}, {1}, {2}, {3}" 180 | .format(r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 181 | score += (r1_i2t + r5_i2t + r10_i2t + 182 | r1_t2i + r5_t2i + r10_t2i) / 5 183 | 184 | logging.info("Dev mean Text to Image: {0}, {1}, {2}, {3}".format( 185 | results['t2i']['r1'], results['t2i']['r5'], 186 | results['t2i']['r10'], results['t2i']['medr'])) 187 | logging.info("Dev mean Image to text: {0}, {1}, {2}, {3}".format( 188 | results['i2t']['r1'], results['i2t']['r5'], 189 | results['i2t']['r10'], results['i2t']['medr'])) 190 | 191 | # early stop on Pearson 192 | if score > bestdevscore: 193 | bestdevscore = score 194 | bestmodel = copy.deepcopy(self.model) 195 | elif self.early_stop: 196 | if early_stop_count >= 3: 197 | stop_train = True 198 | early_stop_count += 1 199 | self.model = bestmodel 200 | 201 | # Compute test for the 5 splits 202 | results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 
'medr': 0}, 203 | 't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0}, 204 | 'dev': bestdevscore} 205 | for i in range(5): 206 | testTxt_i = testTxt[i*5000:(i+1)*5000] 207 | testImg_i = testImg[i*5000:(i+1)*5000] 208 | # Compute test ranks img2txt 209 | r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(testImg_i, testTxt_i) 210 | results['i2t']['r1'] += r1_i2t / 5 211 | results['i2t']['r5'] += r5_i2t / 5 212 | results['i2t']['r10'] += r10_i2t / 5 213 | results['i2t']['medr'] += medr_i2t / 5 214 | # Compute test ranks txt2img 215 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(testImg_i, testTxt_i) 216 | results['t2i']['r1'] += r1_t2i / 5 217 | results['t2i']['r5'] += r5_t2i / 5 218 | results['t2i']['r10'] += r10_t2i / 5 219 | results['t2i']['medr'] += medr_t2i / 5 220 | 221 | return bestdevscore, results['i2t']['r1'], results['i2t']['r5'], \ 222 | results['i2t']['r10'], results['i2t']['medr'], \ 223 | results['t2i']['r1'], results['t2i']['r5'], \ 224 | results['t2i']['r10'], results['t2i']['medr'] 225 | 226 | def trainepoch(self, trainTxt, trainImg, devTxt, devImg, nepoches=1): 227 | self.model.train() 228 | for _ in range(self.nepoch, self.nepoch + nepoches): 229 | permutation = list(np.random.permutation(len(trainTxt))) 230 | all_costs = [] 231 | for i in range(0, len(trainTxt), self.batch_size): 232 | # forward 233 | if i % (self.batch_size*500) == 0 and i > 0: 234 | logging.info('samples : {0}'.format(i)) 235 | r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg, 236 | devTxt) 237 | logging.info("Image to text: {0}, {1}, {2}, {3}".format( 238 | r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 239 | # Compute test ranks txt2img 240 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg, 241 | devTxt) 242 | logging.info("Text to Image: {0}, {1}, {2}, {3}".format( 243 | r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 244 | idx = torch.LongTensor(permutation[i:i + self.batch_size]) 245 | imgbatch = Variable(trainImg.index_select(0, idx)).cuda() 246 | sentbatch = Variable(trainTxt.index_select(0, idx)).cuda() 247 | 248 | idximgc = np.random.choice(permutation[:i] + 249 | permutation[i + self.batch_size:], 250 | self.ncontrast*idx.size(0)) 251 | idxsentc = np.random.choice(permutation[:i] + 252 | permutation[i + self.batch_size:], 253 | self.ncontrast*idx.size(0)) 254 | idximgc = torch.LongTensor(idximgc) 255 | idxsentc = torch.LongTensor(idxsentc) 256 | # Get indexes for contrastive images and sentences 257 | imgcbatch = Variable(trainImg.index_select(0, idximgc)).view( 258 | -1, self.ncontrast, self.imgdim).cuda() 259 | sentcbatch = Variable(trainTxt.index_select(0, idxsentc)).view( 260 | -1, self.ncontrast, self.sentdim).cuda() 261 | 262 | anchor1, anchor2, img_sentc, sent_imgc = self.model( 263 | imgbatch, sentbatch, imgcbatch, sentcbatch) 264 | # loss 265 | loss = self.loss_fn(anchor1, anchor2, img_sentc, sent_imgc) 266 | all_costs.append(loss.data.item()) 267 | # backward 268 | self.optimizer.zero_grad() 269 | loss.backward() 270 | # Update parameters 271 | self.optimizer.step() 272 | self.nepoch += nepoches 273 | 274 | def t2i(self, images, captions): 275 | """ 276 | Images: (5N, imgdim) matrix of images 277 | Captions: (5N, sentdim) matrix of captions 278 | """ 279 | with torch.no_grad(): 280 | # Project images and captions 281 | img_embed, sent_embed = [], [] 282 | for i in range(0, len(images), self.batch_size): 283 | img_embed.append(self.model.proj_image( 284 | Variable(images[i:i + self.batch_size]))) 285 | sent_embed.append(self.model.proj_sentence( 286 | Variable(captions[i:i + self.batch_size]))) 287 | img_embed 
= torch.cat(img_embed, 0).data 288 | sent_embed = torch.cat(sent_embed, 0).data 289 | 290 | npts = int(img_embed.size(0) / 5) 291 | idxs = torch.cuda.LongTensor(range(0, len(img_embed), 5)) 292 | ims = img_embed.index_select(0, idxs) 293 | 294 | ranks = np.zeros(5 * npts) 295 | for index in range(npts): 296 | 297 | # Get query captions 298 | queries = sent_embed[5*index: 5*index + 5] 299 | 300 | # Compute scores 301 | scores = torch.mm(queries, ims.transpose(0, 1)).cpu().numpy() 302 | inds = np.zeros(scores.shape) 303 | for i in range(len(inds)): 304 | inds[i] = np.argsort(scores[i])[::-1] 305 | ranks[5 * index + i] = np.where(inds[i] == index)[0][0] 306 | 307 | # Compute metrics 308 | r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks) 309 | r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks) 310 | r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks) 311 | medr = np.floor(np.median(ranks)) + 1 312 | return (r1, r5, r10, medr) 313 | 314 | def i2t(self, images, captions): 315 | """ 316 | Images: (5N, imgdim) matrix of images 317 | Captions: (5N, sentdim) matrix of captions 318 | """ 319 | with torch.no_grad(): 320 | # Project images and captions 321 | img_embed, sent_embed = [], [] 322 | for i in range(0, len(images), self.batch_size): 323 | img_embed.append(self.model.proj_image( 324 | Variable(images[i:i + self.batch_size]))) 325 | sent_embed.append(self.model.proj_sentence( 326 | Variable(captions[i:i + self.batch_size]))) 327 | img_embed = torch.cat(img_embed, 0).data 328 | sent_embed = torch.cat(sent_embed, 0).data 329 | 330 | npts = int(img_embed.size(0) / 5) 331 | index_list = [] 332 | 333 | ranks = np.zeros(npts) 334 | for index in range(npts): 335 | 336 | # Get query image 337 | query_img = img_embed[5 * index] 338 | 339 | # Compute scores 340 | scores = torch.mm(query_img.view(1, -1), 341 | sent_embed.transpose(0, 1)).view(-1) 342 | scores = scores.cpu().numpy() 343 | inds = np.argsort(scores)[::-1] 344 | index_list.append(inds[0]) 345 | 346 | # Score 347 | rank = 1e20 348 | for i in range(5*index, 5*index + 5, 1): 349 | tmp = np.where(inds == i)[0][0] 350 | if tmp < rank: 351 | rank = tmp 352 | ranks[index] = rank 353 | 354 | # Compute metrics 355 | r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks) 356 | r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks) 357 | r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks) 358 | medr = np.floor(np.median(ranks)) + 1 359 | return (r1, r5, r10, medr) 360 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/relatedness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | """ 9 | Semantic Relatedness (supervised) with Pytorch 10 | """ 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import copy 14 | import numpy as np 15 | 16 | import torch 17 | from torch import nn 18 | import torch.optim as optim 19 | 20 | from scipy.stats import pearsonr, spearmanr 21 | 22 | 23 | class RelatednessPytorch(object): 24 | # Can be used for SICK-Relatedness, and STS14 25 | def __init__(self, train, valid, test, devscores, config): 26 | # fix seed 27 | np.random.seed(config['seed']) 28 | torch.manual_seed(config['seed']) 29 | assert torch.cuda.is_available(), 'torch.cuda required for Relatedness' 30 | torch.cuda.manual_seed(config['seed']) 31 | 32 | self.train = train 33 | self.valid = valid 34 | self.test = test 35 | self.devscores = devscores 36 | 37 | self.inputdim = train['X'].shape[1] 38 | self.nclasses = config['nclasses'] 39 | self.seed = config['seed'] 40 | self.l2reg = 0. 41 | self.batch_size = 64 42 | self.maxepoch = 1000 43 | self.early_stop = True 44 | 45 | self.model = nn.Sequential( 46 | nn.Linear(self.inputdim, self.nclasses), 47 | nn.Softmax(dim=-1), 48 | ) 49 | self.loss_fn = nn.MSELoss() 50 | 51 | if torch.cuda.is_available(): 52 | self.model = self.model.cuda() 53 | self.loss_fn = self.loss_fn.cuda() 54 | 55 | self.loss_fn.size_average = False 56 | self.optimizer = optim.Adam(self.model.parameters(), 57 | weight_decay=self.l2reg) 58 | 59 | def prepare_data(self, trainX, trainy, devX, devy, testX, testy): 60 | # Transform probs to log-probs for KL-divergence 61 | trainX = torch.from_numpy(trainX).float().cuda() 62 | trainy = torch.from_numpy(trainy).float().cuda() 63 | devX = torch.from_numpy(devX).float().cuda() 64 | devy = torch.from_numpy(devy).float().cuda() 65 | testX = torch.from_numpy(testX).float().cuda() 66 | testY = torch.from_numpy(testy).float().cuda() 67 | 68 | return trainX, trainy, devX, devy, testX, testy 69 | 70 | def run(self): 71 | self.nepoch = 0 72 | bestpr = -1 73 | early_stop_count = 0 74 | r = np.arange(1, 6) 75 | stop_train = False 76 | 77 | # Preparing data 78 | trainX, trainy, devX, devy, testX, testy = self.prepare_data( 79 | self.train['X'], self.train['y'], 80 | self.valid['X'], self.valid['y'], 81 | self.test['X'], self.test['y']) 82 | 83 | # Training 84 | while not stop_train and self.nepoch <= self.maxepoch: 85 | self.trainepoch(trainX, trainy, nepoches=50) 86 | yhat = np.dot(self.predict_proba(devX), r) 87 | pr = spearmanr(yhat, self.devscores)[0] 88 | pr = 0 if pr != pr else pr # if NaN bc std=0 89 | # early stop on Pearson 90 | if pr > bestpr: 91 | bestpr = pr 92 | bestmodel = copy.deepcopy(self.model) 93 | elif self.early_stop: 94 | if early_stop_count >= 3: 95 | stop_train = True 96 | early_stop_count += 1 97 | self.model = bestmodel 98 | 99 | yhat = np.dot(self.predict_proba(testX), r) 100 | 101 | return bestpr, yhat 102 | 103 | def trainepoch(self, X, y, nepoches=1): 104 | self.model.train() 105 | for _ in range(self.nepoch, self.nepoch + nepoches): 106 | permutation = np.random.permutation(len(X)) 107 | all_costs = [] 108 | for i in range(0, len(X), self.batch_size): 109 | # forward 110 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda() 111 | Xbatch = X[idx] 112 | ybatch = y[idx] 113 | output = self.model(Xbatch) 114 | # loss 115 | loss = self.loss_fn(output, ybatch) 116 | all_costs.append(loss.item()) 117 | # backward 118 | self.optimizer.zero_grad() 119 | loss.backward() 120 | # Update parameters 121 | self.optimizer.step() 122 | self.nepoch += 
nepoches 123 | 124 | def predict_proba(self, devX): 125 | self.model.eval() 126 | probas = [] 127 | with torch.no_grad(): 128 | for i in range(0, len(devX), self.batch_size): 129 | Xbatch = devX[i:i + self.batch_size] 130 | if len(probas) == 0: 131 | probas = self.model(Xbatch).data.cpu().numpy() 132 | else: 133 | probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0) 134 | return probas 135 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/validation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Validation and classification 10 | (train) : inner-kfold classifier 11 | (train, test) : kfold classifier 12 | (train, dev, test) : split classifier 13 | 14 | """ 15 | from __future__ import absolute_import, division, unicode_literals 16 | 17 | import logging 18 | import numpy as np 19 | from senteval.tools.classifier import MLP 20 | 21 | import sklearn 22 | assert(sklearn.__version__ >= "0.18.0"), \ 23 | "need to update sklearn to version >= 0.18.0" 24 | from sklearn.linear_model import LogisticRegression 25 | from sklearn.model_selection import StratifiedKFold 26 | 27 | 28 | def get_classif_name(classifier_config, usepytorch): 29 | if not usepytorch: 30 | modelname = 'sklearn-LogReg' 31 | else: 32 | nhid = classifier_config['nhid'] 33 | optim = 'adam' if 'optim' not in classifier_config else classifier_config['optim'] 34 | bs = 64 if 'batch_size' not in classifier_config else classifier_config['batch_size'] 35 | modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs) 36 | return modelname 37 | 38 | # Pytorch version 39 | class InnerKFoldClassifier(object): 40 | """ 41 | (train) split classifier : InnerKfold. 
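The l2 regularisation strength is selected with an inner k-fold on each outer training split; the outer folds provide the reported dev/test accuracies.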
42 | """ 43 | def __init__(self, X, y, config): 44 | self.X = X 45 | self.y = y 46 | self.featdim = X.shape[1] 47 | self.nclasses = config['nclasses'] 48 | self.seed = config['seed'] 49 | self.devresults = [] 50 | self.testresults = [] 51 | self.usepytorch = config['usepytorch'] 52 | self.classifier_config = config['classifier'] 53 | self.modelname = get_classif_name(self.classifier_config, self.usepytorch) 54 | 55 | self.k = 5 if 'kfold' not in config else config['kfold'] 56 | 57 | def run(self): 58 | logging.info('Training {0} with (inner) {1}-fold cross-validation' 59 | .format(self.modelname, self.k)) 60 | 61 | regs = [10**t for t in range(-5, -1)] if self.usepytorch else \ 62 | [2**t for t in range(-2, 4, 1)] 63 | skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111) 64 | innerskf = StratifiedKFold(n_splits=self.k, shuffle=True, 65 | random_state=1111) 66 | count = 0 67 | for train_idx, test_idx in skf.split(self.X, self.y): 68 | count += 1 69 | X_train, X_test = self.X[train_idx], self.X[test_idx] 70 | y_train, y_test = self.y[train_idx], self.y[test_idx] 71 | scores = [] 72 | for reg in regs: 73 | regscores = [] 74 | for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train): 75 | X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx] 76 | y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx] 77 | if self.usepytorch: 78 | clf = MLP(self.classifier_config, inputdim=self.featdim, 79 | nclasses=self.nclasses, l2reg=reg, 80 | seed=self.seed) 81 | clf.fit(X_in_train, y_in_train, 82 | validation_data=(X_in_test, y_in_test)) 83 | else: 84 | clf = LogisticRegression(C=reg, random_state=self.seed) 85 | clf.fit(X_in_train, y_in_train) 86 | regscores.append(clf.score(X_in_test, y_in_test)) 87 | scores.append(round(100*np.mean(regscores), 2)) 88 | optreg = regs[np.argmax(scores)] 89 | logging.info('Best param found at split {0}: l2reg = {1} \ 90 | with score {2}'.format(count, optreg, np.max(scores))) 91 | self.devresults.append(np.max(scores)) 92 | 93 | if self.usepytorch: 94 | clf = MLP(self.classifier_config, inputdim=self.featdim, 95 | nclasses=self.nclasses, l2reg=optreg, 96 | seed=self.seed) 97 | 98 | clf.fit(X_train, y_train, validation_split=0.05) 99 | else: 100 | clf = LogisticRegression(C=optreg, random_state=self.seed) 101 | clf.fit(X_train, y_train) 102 | 103 | self.testresults.append(round(100*clf.score(X_test, y_test), 2)) 104 | 105 | devaccuracy = round(np.mean(self.devresults), 2) 106 | testaccuracy = round(np.mean(self.testresults), 2) 107 | return devaccuracy, testaccuracy 108 | 109 | 110 | class KFoldClassifier(object): 111 | """ 112 | (train, test) split classifier : cross-validation on train. 
113 | """ 114 | def __init__(self, train, test, config): 115 | self.train = train 116 | self.test = test 117 | self.featdim = self.train['X'].shape[1] 118 | self.nclasses = config['nclasses'] 119 | self.seed = config['seed'] 120 | self.usepytorch = config['usepytorch'] 121 | self.classifier_config = config['classifier'] 122 | self.modelname = get_classif_name(self.classifier_config, self.usepytorch) 123 | 124 | self.k = 5 if 'kfold' not in config else config['kfold'] 125 | 126 | def run(self): 127 | # cross-validation 128 | logging.info('Training {0} with {1}-fold cross-validation' 129 | .format(self.modelname, self.k)) 130 | regs = [10**t for t in range(-5, -1)] if self.usepytorch else \ 131 | [2**t for t in range(-1, 6, 1)] 132 | skf = StratifiedKFold(n_splits=self.k, shuffle=True, 133 | random_state=self.seed) 134 | scores = [] 135 | 136 | for reg in regs: 137 | scanscores = [] 138 | for train_idx, test_idx in skf.split(self.train['X'], 139 | self.train['y']): 140 | # Split data 141 | X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx] 142 | 143 | X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx] 144 | 145 | # Train classifier 146 | if self.usepytorch: 147 | clf = MLP(self.classifier_config, inputdim=self.featdim, 148 | nclasses=self.nclasses, l2reg=reg, 149 | seed=self.seed) 150 | clf.fit(X_train, y_train, validation_data=(X_test, y_test)) 151 | else: 152 | clf = LogisticRegression(C=reg, random_state=self.seed) 153 | clf.fit(X_train, y_train) 154 | score = clf.score(X_test, y_test) 155 | scanscores.append(score) 156 | # Append mean score 157 | scores.append(round(100*np.mean(scanscores), 2)) 158 | 159 | # evaluation 160 | logging.info([('reg:' + str(regs[idx]), scores[idx]) 161 | for idx in range(len(scores))]) 162 | optreg = regs[np.argmax(scores)] 163 | devaccuracy = np.max(scores) 164 | logging.info('Cross-validation : best param found is reg = {0} \ 165 | with score {1}'.format(optreg, devaccuracy)) 166 | 167 | logging.info('Evaluating...') 168 | if self.usepytorch: 169 | clf = MLP(self.classifier_config, inputdim=self.featdim, 170 | nclasses=self.nclasses, l2reg=optreg, 171 | seed=self.seed) 172 | clf.fit(self.train['X'], self.train['y'], validation_split=0.05) 173 | else: 174 | clf = LogisticRegression(C=optreg, random_state=self.seed) 175 | clf.fit(self.train['X'], self.train['y']) 176 | yhat = clf.predict(self.test['X']) 177 | 178 | testaccuracy = clf.score(self.test['X'], self.test['y']) 179 | testaccuracy = round(100*testaccuracy, 2) 180 | 181 | return devaccuracy, testaccuracy, yhat 182 | 183 | 184 | class SplitClassifier(object): 185 | """ 186 | (train, valid, test) split classifier. 187 | """ 188 | def __init__(self, X, y, config): 189 | self.X = X 190 | self.y = y 191 | self.nclasses = config['nclasses'] 192 | self.featdim = self.X['train'].shape[1] 193 | self.seed = config['seed'] 194 | self.usepytorch = config['usepytorch'] 195 | self.classifier_config = config['classifier'] 196 | self.cudaEfficient = False if 'cudaEfficient' not in config else \ 197 | config['cudaEfficient'] 198 | self.modelname = get_classif_name(self.classifier_config, self.usepytorch) 199 | self.noreg = False if 'noreg' not in config else config['noreg'] 200 | self.config = config 201 | 202 | def run(self): 203 | logging.info('Training {0} with standard validation..' 
204 | .format(self.modelname)) 205 | regs = [10**t for t in range(-5, -1)] if self.usepytorch else \ 206 | [2**t for t in range(-2, 4, 1)] 207 | if self.noreg: 208 | regs = [1e-9 if self.usepytorch else 1e9] 209 | scores = [] 210 | for reg in regs: 211 | if self.usepytorch: 212 | clf = MLP(self.classifier_config, inputdim=self.featdim, 213 | nclasses=self.nclasses, l2reg=reg, 214 | seed=self.seed, cudaEfficient=self.cudaEfficient) 215 | 216 | # TODO: Find a hack for reducing nb epoches in SNLI 217 | clf.fit(self.X['train'], self.y['train'], 218 | validation_data=(self.X['valid'], self.y['valid'])) 219 | else: 220 | clf = LogisticRegression(C=reg, random_state=self.seed) 221 | clf.fit(self.X['train'], self.y['train']) 222 | scores.append(round(100*clf.score(self.X['valid'], 223 | self.y['valid']), 2)) 224 | logging.info([('reg:'+str(regs[idx]), scores[idx]) 225 | for idx in range(len(scores))]) 226 | optreg = regs[np.argmax(scores)] 227 | devaccuracy = np.max(scores) 228 | logging.info('Validation : best param found is reg = {0} with score \ 229 | {1}'.format(optreg, devaccuracy)) 230 | clf = LogisticRegression(C=optreg, random_state=self.seed) 231 | logging.info('Evaluating...') 232 | if self.usepytorch: 233 | clf = MLP(self.classifier_config, inputdim=self.featdim, 234 | nclasses=self.nclasses, l2reg=optreg, 235 | seed=self.seed, cudaEfficient=self.cudaEfficient) 236 | 237 | # TODO: Find a hack for reducing nb epoches in SNLI 238 | clf.fit(self.X['train'], self.y['train'], 239 | validation_data=(self.X['valid'], self.y['valid'])) 240 | else: 241 | clf = LogisticRegression(C=optreg, random_state=self.seed) 242 | clf.fit(self.X['train'], self.y['train']) 243 | 244 | testaccuracy = clf.score(self.X['test'], self.y['test']) 245 | testaccuracy = round(100*testaccuracy, 2) 246 | return devaccuracy, testaccuracy 247 | -------------------------------------------------------------------------------- /SentEval/senteval/trec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | TREC question-type classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import KFoldClassifier 20 | 21 | 22 | class TRECEval(object): 23 | def __init__(self, task_path, seed=1111): 24 | logging.info('***** Transfer task : TREC *****\n\n') 25 | self.seed = seed 26 | self.train = self.loadFile(os.path.join(task_path, 'train_5500.label')) 27 | self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label')) 28 | 29 | def do_prepare(self, params, prepare): 30 | samples = self.train['X'] + self.test['X'] 31 | return prepare(params, samples) 32 | 33 | def loadFile(self, fpath): 34 | trec_data = {'X': [], 'y': []} 35 | tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 36 | 'HUM': 3, 'LOC': 4, 'NUM': 5} 37 | with io.open(fpath, 'r', encoding='latin-1') as f: 38 | for line in f: 39 | target, sample = line.strip().split(':', 1) 40 | sample = sample.split(' ', 1)[1].split() 41 | assert target in tgt2idx, target 42 | trec_data['X'].append(sample) 43 | trec_data['y'].append(tgt2idx[target]) 44 | return trec_data 45 | 46 | def run(self, params, batcher): 47 | train_embeddings, test_embeddings = [], [] 48 | 49 | # Sort to reduce padding 50 | sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']), 51 | key=lambda z: (len(z[0]), z[1])) 52 | train_samples = [x for (x, y) in sorted_corpus_train] 53 | train_labels = [y for (x, y) in sorted_corpus_train] 54 | 55 | sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']), 56 | key=lambda z: (len(z[0]), z[1])) 57 | test_samples = [x for (x, y) in sorted_corpus_test] 58 | test_labels = [y for (x, y) in sorted_corpus_test] 59 | 60 | # Get train embeddings 61 | for ii in range(0, len(train_labels), params.batch_size): 62 | batch = train_samples[ii:ii + params.batch_size] 63 | embeddings = batcher(params, batch) 64 | train_embeddings.append(embeddings) 65 | train_embeddings = np.vstack(train_embeddings) 66 | logging.info('Computed train embeddings') 67 | 68 | # Get test embeddings 69 | for ii in range(0, len(test_labels), params.batch_size): 70 | batch = test_samples[ii:ii + params.batch_size] 71 | embeddings = batcher(params, batch) 72 | test_embeddings.append(embeddings) 73 | test_embeddings = np.vstack(test_embeddings) 74 | logging.info('Computed test embeddings') 75 | 76 | config_classifier = {'nclasses': 6, 'seed': self.seed, 77 | 'usepytorch': params.usepytorch, 78 | 'classifier': params.classifier, 79 | 'kfold': params.kfold} 80 | clf = KFoldClassifier({'X': train_embeddings, 81 | 'y': np.array(train_labels)}, 82 | {'X': test_embeddings, 83 | 'y': np.array(test_labels)}, 84 | config_classifier) 85 | devacc, testacc, _ = clf.run() 86 | logging.debug('\nDev acc : {0} Test acc : {1} \ 87 | for TREC\n'.format(devacc, testacc)) 88 | return {'devacc': devacc, 'acc': testacc, 89 | 'ndev': len(self.train['X']), 'ntest': len(self.test['X'])} 90 | -------------------------------------------------------------------------------- /SentEval/senteval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import numpy as np 11 | import re 12 | import inspect 13 | from torch import optim 14 | 15 | 16 | def create_dictionary(sentences): 17 | words = {} 18 | for s in sentences: 19 | for word in s: 20 | if word in words: 21 | words[word] += 1 22 | else: 23 | words[word] = 1 24 | words['<s>'] = 1e9 + 4 25 | words['</s>'] = 1e9 + 3 26 | words['<p>
'] = 1e9 + 2 27 | # words[''] = 1e9 + 1 28 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 29 | id2word = [] 30 | word2id = {} 31 | for i, (w, _) in enumerate(sorted_words): 32 | id2word.append(w) 33 | word2id[w] = i 34 | 35 | return id2word, word2id 36 | 37 | 38 | def cosine(u, v): 39 | return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)) 40 | 41 | 42 | class dotdict(dict): 43 | """ dot.notation access to dictionary attributes """ 44 | __getattr__ = dict.get 45 | __setattr__ = dict.__setitem__ 46 | __delattr__ = dict.__delitem__ 47 | 48 | 49 | def get_optimizer(s): 50 | """ 51 | Parse optimizer parameters. 52 | Input should be of the form: 53 | - "sgd,lr=0.01" 54 | - "adagrad,lr=0.1,lr_decay=0.05" 55 | """ 56 | if "," in s: 57 | method = s[:s.find(',')] 58 | optim_params = {} 59 | for x in s[s.find(',') + 1:].split(','): 60 | split = x.split('=') 61 | assert len(split) == 2 62 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None 63 | optim_params[split[0]] = float(split[1]) 64 | else: 65 | method = s 66 | optim_params = {} 67 | 68 | if method == 'adadelta': 69 | optim_fn = optim.Adadelta 70 | elif method == 'adagrad': 71 | optim_fn = optim.Adagrad 72 | elif method == 'adam': 73 | optim_fn = optim.Adam 74 | elif method == 'adamax': 75 | optim_fn = optim.Adamax 76 | elif method == 'asgd': 77 | optim_fn = optim.ASGD 78 | elif method == 'rmsprop': 79 | optim_fn = optim.RMSprop 80 | elif method == 'rprop': 81 | optim_fn = optim.Rprop 82 | elif method == 'sgd': 83 | optim_fn = optim.SGD 84 | assert 'lr' in optim_params 85 | else: 86 | raise Exception('Unknown optimization method: "%s"' % method) 87 | 88 | # check that we give good parameters to the optimizer 89 | expected_args = inspect.getargspec(optim_fn.__init__)[0] 90 | assert expected_args[:2] == ['self', 'params'] 91 | if not all(k in expected_args[2:] for k in optim_params.keys()): 92 | raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( 93 | str(expected_args[2:]), str(optim_params.keys()))) 94 | 95 | return optim_fn, optim_params 96 | -------------------------------------------------------------------------------- /SentEval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | import io 9 | from setuptools import setup, find_packages 10 | 11 | with io.open('./README.md', encoding='utf-8') as f: 12 | readme = f.read() 13 | 14 | setup( 15 | name='SentEval', 16 | version='0.1.0', 17 | url='https://github.com/facebookresearch/SentEval', 18 | packages=find_packages(exclude=['examples']), 19 | license='Attribution-NonCommercial 4.0 International', 20 | long_description=readme, 21 | ) 22 | -------------------------------------------------------------------------------- /cl/models.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import torch.distributed as dist 5 | 6 | import transformers 7 | from transformers import RobertaTokenizer 8 | from transformers.models.roberta.modeling_roberta import RobertaPreTrainedModel, RobertaModel, RobertaLMHead 9 | from transformers.models.bert.modeling_bert import BertPreTrainedModel, BertModel, BertLMPredictionHead 10 | from transformers.activations import gelu 11 | from transformers.file_utils import ( 12 | add_code_sample_docstrings, 13 | add_start_docstrings, 14 | add_start_docstrings_to_model_forward, 15 | replace_return_docstrings, 16 | ) 17 | from transformers.modeling_outputs import SequenceClassifierOutput, BaseModelOutputWithPoolingAndCrossAttentions 18 | 19 | class MLPLayer(nn.Module): 20 | """ 21 | Head for getting sentence representations over RoBERTa/BERT's CLS representation. 22 | """ 23 | 24 | def __init__(self, config): 25 | super().__init__() 26 | self.dense = nn.Linear(config.hidden_size * 2, config.hidden_size) 27 | self.activation = nn.Tanh() 28 | 29 | def forward(self, features, **kwargs): 30 | x = self.dense(features) 31 | x = self.activation(x) 32 | 33 | return x 34 | 35 | 36 | class Similarity(nn.Module): 37 | """ 38 | Dot product or cosine similarity 39 | """ 40 | 41 | def __init__(self, temp): 42 | super().__init__() 43 | self.temp = temp 44 | self.cos = nn.CosineSimilarity(dim=-1) 45 | 46 | def forward(self, x, y): 47 | return self.cos(x, y) / self.temp 48 | 49 | 50 | class AttentionLayer(nn.Module): 51 | def __init__(self, config): 52 | super().__init__() 53 | self.key = nn.Sequential( 54 | nn.Linear(config.hidden_size, config.hidden_size, bias=False), 55 | nn.RReLU() 56 | ) 57 | 58 | self.query = nn.Sequential( 59 | nn.Linear(config.hidden_size, config.hidden_size, bias=False), 60 | nn.RReLU() 61 | ) 62 | 63 | self.value = nn.Sequential( 64 | nn.Linear(config.hidden_size, config.hidden_size, bias=False), 65 | nn.RReLU() 66 | ) 67 | 68 | def forward(self, source_pooler_outputs, target_outputs): 69 | target_pooler_output_list = [] 70 | for idx, source_pooler_output in enumerate(source_pooler_outputs): 71 | concated = target_outputs 72 | num_target_outputs = len(concated) 73 | concated = torch.stack(concated, dim=1) 74 | 75 | K = self.key(concated) 76 | V = self.value(concated) 77 | Q = self.query(source_pooler_output) 78 | 79 | score_list = torch.bmm(K, Q.unsqueeze(dim=-1)).squeeze(dim=-1) 80 | score_list /= num_target_outputs 81 | score_list = F.softmax(score_list, dim=-1) 82 | V = torch.mul(V, score_list.unsqueeze(dim=-1)) 83 | 84 | target_pooler_output = torch.sum(V, dim=1) 85 | target_pooler_output_list.append(target_pooler_output) 86 | 87 | target_pooler_outputs = torch.stack(target_pooler_output_list, dim=1) 88 | target_pooler_output = torch.mean(target_pooler_outputs, dim=1) 89 | 90 | pooler_output = torch.cat([source_pooler_outputs.pop(-1), target_pooler_output], dim=1) 
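# The returned sentence vector concatenates the last layer's [CLS] state
# (popped from source_pooler_outputs) with the attention-pooled summary of
# every layer's mean-pooled hidden states, averaged over the per-layer [CLS]
# queries: a (batch, 2 * hidden_size) tensor that MLPLayer maps back to
# hidden_size.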
91 | return pooler_output 92 | 93 | 94 | 95 | class Pooler(nn.Module): 96 | """ 97 | Parameter-free poolers to get the sentence embedding 98 | 'cls': [CLS] representation with BERT/RoBERTa's MLP pooler. 99 | 'cls_before_pooler': [CLS] representation without the original MLP pooler. 100 | 'avg': average of the last layers' hidden states at each token. 101 | 'avg_top2': average of the last two layers. 102 | 'avg_first_last': average of the first and the last layers. 103 | """ 104 | def __init__(self, pooler_type): 105 | super().__init__() 106 | self.pooler_type = pooler_type 107 | assert self.pooler_type in ["cls", "cls_before_pooler", "avg", "avg_top2", "avg_first_last"], "unrecognized pooling type %s" % self.pooler_type 108 | 109 | def forward(self, attention_mask, outputs): 110 | last_hidden = outputs.last_hidden_state 111 | pooler_output = outputs.pooler_output 112 | hidden_states = outputs.hidden_states 113 | 114 | if self.pooler_type in ['cls_before_pooler', 'cls']: 115 | source_pooled_result = [hidden[:, 0] for hidden in hidden_states] 116 | target_pooled_result = [((hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)) for hidden in hidden_states] 117 | return source_pooled_result, target_pooled_result 118 | elif self.pooler_type == "avg": 119 | return ((last_hidden * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1)) 120 | elif self.pooler_type == "avg_first_last": 121 | first_hidden = hidden_states[0] 122 | last_hidden = hidden_states[-1] 123 | pooled_result = ((first_hidden + last_hidden) / 2.0 * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1) 124 | return pooled_result 125 | elif self.pooler_type == "avg_top2": 126 | second_last_hidden = hidden_states[-2] 127 | last_hidden = hidden_states[-1] 128 | pooled_result = ((last_hidden + second_last_hidden) / 2.0 * attention_mask.unsqueeze(-1)).sum(1) / attention_mask.sum(-1).unsqueeze(-1) 129 | return pooled_result 130 | else: 131 | raise NotImplementedError 132 | 133 | 134 | def cl_init(cls, config): 135 | """ 136 | Contrastive learning class init function. 
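Attaches the pooler, the layer-wise AttentionLayer, the 2*hidden -> hidden MLP head and the temperature-scaled cosine similarity used by the contrastive objective.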
137 | """ 138 | cls.pooler_type = cls.model_args.pooler_type 139 | cls.pooler = Pooler(cls.model_args.pooler_type) 140 | cls.attn = AttentionLayer(config) 141 | cls.mlp = MLPLayer(config) 142 | cls.sim = Similarity(temp=cls.model_args.temp) 143 | cls.init_weights() 144 | 145 | def cl_forward(cls, 146 | encoder, 147 | input_ids=None, 148 | attention_mask=None, 149 | token_type_ids=None, 150 | position_ids=None, 151 | head_mask=None, 152 | inputs_embeds=None, 153 | labels=None, 154 | output_attentions=None, 155 | output_hidden_states=None, 156 | return_dict=None, 157 | mlm_input_ids=None, 158 | mlm_labels=None, 159 | ): 160 | return_dict = return_dict if return_dict is not None else cls.config.use_return_dict 161 | ori_input_ids = input_ids 162 | batch_size = input_ids.size(0) 163 | # Number of sentences in one instance 164 | # 2: pair instance; 3: pair instance with a hard negative 165 | num_sent = input_ids.size(1) 166 | 167 | mlm_outputs = None 168 | # Flatten input for encoding 169 | input_ids = input_ids.view((-1, input_ids.size(-1))) # (bs * num_sent, len) 170 | attention_mask = attention_mask.view((-1, attention_mask.size(-1))) # (bs * num_sent len) 171 | if token_type_ids is not None: 172 | token_type_ids = token_type_ids.view((-1, token_type_ids.size(-1))) # (bs * num_sent, len) 173 | 174 | # Get raw embeddings 175 | outputs = encoder( 176 | input_ids, 177 | attention_mask=attention_mask, 178 | token_type_ids=token_type_ids, 179 | position_ids=position_ids, 180 | head_mask=head_mask, 181 | inputs_embeds=inputs_embeds, 182 | output_attentions=output_attentions, 183 | output_hidden_states=True if cls.model_args.pooler_type in ['cls', 'avg_top2', 'avg_first_last'] else False, 184 | return_dict=True, 185 | ) 186 | 187 | # MLM auxiliary objective 188 | if mlm_input_ids is not None: 189 | mlm_input_ids = mlm_input_ids.view((-1, mlm_input_ids.size(-1))) 190 | mlm_outputs = encoder( 191 | mlm_input_ids, 192 | attention_mask=attention_mask, 193 | token_type_ids=token_type_ids, 194 | position_ids=position_ids, 195 | head_mask=head_mask, 196 | inputs_embeds=inputs_embeds, 197 | output_attentions=output_attentions, 198 | output_hidden_states=True if cls.model_args.pooler_type in ['cls', 'avg_top2', 'avg_first_last'] else False, 199 | return_dict=True, 200 | ) 201 | 202 | source_pooler_output, target_outputs = cls.pooler(attention_mask, outputs) 203 | pooler_output = cls.attn(source_pooler_output, target_outputs) 204 | pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1))) # (bs, num_sent, hidden) 205 | 206 | pooler_output = pooler_output.view((batch_size * num_sent, pooler_output.size(-1))) # (bs, num_sent, hidden) 207 | pooler_output = cls.mlp(pooler_output) 208 | pooler_output = pooler_output.view((batch_size, num_sent, pooler_output.size(-1))) # (bs, num_sent, hidden) 209 | 210 | 211 | # Separate representation 212 | z1, z2 = pooler_output[:,0], pooler_output[:,1] 213 | 214 | # Hard negative 215 | z3 = pooler_output[:, 2] 216 | 217 | 218 | # Gather all embeddings if using distributed training 219 | if dist.is_initialized() and cls.training: 220 | # Gather hard negative 221 | z3_list = [torch.zeros_like(z3) for _ in range(dist.get_world_size())] 222 | dist.all_gather(tensor_list=z3_list, tensor=z3.contiguous()) 223 | z3_list[dist.get_rank()] = z3 224 | z3 = torch.cat(z3_list, 0) 225 | 226 | 227 | # Dummy vectors for allgather 228 | z1_list = [torch.zeros_like(z1) for _ in range(dist.get_world_size())] 229 | z2_list = [torch.zeros_like(z2) for _ in 
range(dist.get_world_size())] 230 | # Allgather 231 | dist.all_gather(tensor_list=z1_list, tensor=z1.contiguous()) 232 | dist.all_gather(tensor_list=z2_list, tensor=z2.contiguous()) 233 | 234 | # Since allgather results do not have gradients, we replace the 235 | # current process's corresponding embeddings with original tensors 236 | z1_list[dist.get_rank()] = z1 237 | z2_list[dist.get_rank()] = z2 238 | # Get full batch embeddings: (bs x N, hidden) 239 | z1 = torch.cat(z1_list, 0) 240 | z2 = torch.cat(z2_list, 0) 241 | 242 | cos_sim = cls.sim(z1.unsqueeze(1), z2.unsqueeze(0)) 243 | # Hard negative 244 | z1_z3_cos = cls.sim(z1.unsqueeze(1), z3.unsqueeze(0)) 245 | cos_sim = torch.cat([cos_sim, z1_z3_cos], 1) 246 | 247 | 248 | labels = torch.arange(cos_sim.size(0)).long().to(cls.device) 249 | loss_fct = nn.CrossEntropyLoss() 250 | 251 | # Calculate loss with hard negatives 252 | z3_weight = cls.model_args.hard_negative_weight 253 | weights = torch.tensor( 254 | [[0.0] * (cos_sim.size(-1) - z1_z3_cos.size(-1)) + [0.0] * i + [z3_weight] + [0.0] * ( 255 | z1_z3_cos.size(-1) - i - 1) for i in range(z1_z3_cos.size(-1))] 256 | ).to(cls.device) 257 | cos_sim = cos_sim + weights 258 | 259 | loss = loss_fct(cos_sim, labels) 260 | 261 | # Calculate loss for MLM 262 | if mlm_outputs is not None and mlm_labels is not None: 263 | mlm_labels = mlm_labels.view(-1, mlm_labels.size(-1)) 264 | prediction_scores = cls.lm_head(mlm_outputs.last_hidden_state) 265 | masked_lm_loss = loss_fct(prediction_scores.view(-1, cls.config.vocab_size), mlm_labels.view(-1)) 266 | loss = loss + cls.model_args.mlm_weight * masked_lm_loss 267 | 268 | if not return_dict: 269 | output = (cos_sim,) + outputs[2:] 270 | return ((loss,) + output) if loss is not None else output 271 | return SequenceClassifierOutput( 272 | loss=loss, 273 | logits=cos_sim, 274 | hidden_states=outputs.hidden_states, 275 | attentions=outputs.attentions, 276 | ) 277 | 278 | 279 | def sentemb_forward( 280 | cls, 281 | encoder, 282 | input_ids=None, 283 | attention_mask=None, 284 | token_type_ids=None, 285 | position_ids=None, 286 | head_mask=None, 287 | inputs_embeds=None, 288 | labels=None, 289 | output_attentions=None, 290 | output_hidden_states=None, 291 | return_dict=None, 292 | ): 293 | 294 | return_dict = return_dict if return_dict is not None else cls.config.use_return_dict 295 | 296 | outputs = encoder( 297 | input_ids, 298 | attention_mask=attention_mask, 299 | token_type_ids=token_type_ids, 300 | position_ids=position_ids, 301 | head_mask=head_mask, 302 | inputs_embeds=inputs_embeds, 303 | output_attentions=output_attentions, 304 | output_hidden_states=True if cls.pooler_type in ['cls', 'avg_top2', 'avg_first_last'] else False, 305 | return_dict=True, 306 | ) 307 | 308 | source_pooler_output, target_pooler_output = cls.pooler(attention_mask, outputs) 309 | pooler_output = cls.attn(source_pooler_output, target_pooler_output) 310 | pooler_output = cls.mlp(pooler_output) 311 | 312 | if not return_dict: 313 | return (outputs[0], pooler_output) + outputs[2:] 314 | 315 | return BaseModelOutputWithPoolingAndCrossAttentions( 316 | pooler_output=pooler_output, 317 | last_hidden_state=outputs.last_hidden_state, 318 | hidden_states=outputs.hidden_states, 319 | ) 320 | 321 | 322 | class BertForCL(BertPreTrainedModel): 323 | _keys_to_ignore_on_load_missing = [r"position_ids"] 324 | 325 | def __init__(self, config, *model_args, **model_kargs): 326 | super().__init__(config) 327 | self.model_args = model_kargs["model_args"] 328 | self.bert = BertModel(config, 
add_pooling_layer=False) 329 | 330 | if self.model_args.do_mlm: 331 | self.lm_head = BertLMPredictionHead(config) 332 | 333 | cl_init(self, config) 334 | 335 | def forward(self, 336 | input_ids=None, 337 | attention_mask=None, 338 | token_type_ids=None, 339 | position_ids=None, 340 | head_mask=None, 341 | inputs_embeds=None, 342 | labels=None, 343 | output_attentions=None, 344 | output_hidden_states=None, 345 | return_dict=None, 346 | sent_emb=False, 347 | mlm_input_ids=None, 348 | mlm_labels=None, 349 | ): 350 | if sent_emb: 351 | return sentemb_forward(self, self.bert, 352 | input_ids=input_ids, 353 | attention_mask=attention_mask, 354 | token_type_ids=token_type_ids, 355 | position_ids=position_ids, 356 | head_mask=head_mask, 357 | inputs_embeds=inputs_embeds, 358 | labels=labels, 359 | output_attentions=output_attentions, 360 | output_hidden_states=output_hidden_states, 361 | return_dict=return_dict, 362 | ) 363 | else: 364 | return cl_forward(self, self.bert, 365 | input_ids=input_ids, 366 | attention_mask=attention_mask, 367 | token_type_ids=token_type_ids, 368 | position_ids=position_ids, 369 | head_mask=head_mask, 370 | inputs_embeds=inputs_embeds, 371 | labels=labels, 372 | output_attentions=output_attentions, 373 | output_hidden_states=output_hidden_states, 374 | return_dict=return_dict, 375 | mlm_input_ids=mlm_input_ids, 376 | mlm_labels=mlm_labels, 377 | ) 378 | 379 | 380 | 381 | class RobertaForCL(RobertaPreTrainedModel): 382 | _keys_to_ignore_on_load_missing = [r"position_ids"] 383 | 384 | def __init__(self, config, *model_args, **model_kargs): 385 | super().__init__(config) 386 | self.model_args = model_kargs["model_args"] 387 | self.roberta = RobertaModel(config, add_pooling_layer=False) 388 | 389 | if self.model_args.do_mlm: 390 | self.lm_head = RobertaLMHead(config) 391 | 392 | cl_init(self, config) 393 | 394 | def forward(self, 395 | input_ids=None, 396 | attention_mask=None, 397 | token_type_ids=None, 398 | position_ids=None, 399 | head_mask=None, 400 | inputs_embeds=None, 401 | labels=None, 402 | output_attentions=None, 403 | output_hidden_states=None, 404 | return_dict=None, 405 | sent_emb=False, 406 | mlm_input_ids=None, 407 | mlm_labels=None, 408 | ): 409 | if sent_emb: 410 | return sentemb_forward(self, self.roberta, 411 | input_ids=input_ids, 412 | attention_mask=attention_mask, 413 | token_type_ids=token_type_ids, 414 | position_ids=position_ids, 415 | head_mask=head_mask, 416 | inputs_embeds=inputs_embeds, 417 | labels=labels, 418 | output_attentions=output_attentions, 419 | output_hidden_states=output_hidden_states, 420 | return_dict=return_dict, 421 | ) 422 | else: 423 | return cl_forward(self, self.roberta, 424 | input_ids=input_ids, 425 | attention_mask=attention_mask, 426 | token_type_ids=token_type_ids, 427 | position_ids=position_ids, 428 | head_mask=head_mask, 429 | inputs_embeds=inputs_embeds, 430 | labels=labels, 431 | output_attentions=output_attentions, 432 | output_hidden_states=output_hidden_states, 433 | return_dict=return_dict, 434 | mlm_input_ids=mlm_input_ids, 435 | mlm_labels=mlm_labels, 436 | ) 437 | -------------------------------------------------------------------------------- /data/download_nli.sh: -------------------------------------------------------------------------------- 1 | wget https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/nli_for_simcse.csv 2 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | transformers==4.2.1 2 | scipy 3 | datasets 4 | pandas 5 | scikit-learn 6 | prettytable 7 | gradio 8 | torch 9 | setuptools 10 | setproctitle -------------------------------------------------------------------------------- /run_sup_layerattnpooler.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | NUM_GPU=4 3 | PORT_ID=$(expr $RANDOM + 1000) 4 | export OMP_NUM_THREADS=8 5 | python -m torch.distributed.launch --nproc_per_node $NUM_GPU --master_port $PORT_ID train.py \ 6 | --model_name_or_path bert-base-uncased \ 7 | --train_file data/nli_for_simcse.csv \ 8 | --output_dir result/bert-based-uncased-cl-layerattnpooler \ 9 | --num_train_epochs 3 \ 10 | --per_device_train_batch_size 64 \ 11 | --learning_rate 2e-5 \ 12 | --max_seq_length 64 \ 13 | --evaluation_strategy steps \ 14 | --metric_for_best_model stsb_spearman \ 15 | --load_best_model_at_end \ 16 | --eval_steps 100 \ 17 | --pooler_type cls \ 18 | --overwrite_output_dir \ 19 | --temp 0.05 \ 20 | --do_train \ 21 | --do_eval \ 22 | --fp16 \ 23 | "$@" 24 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import math 3 | import os 4 | import sys 5 | from dataclasses import dataclass, field 6 | from typing import Optional, Union, List, Dict, Tuple 7 | import torch 8 | import collections 9 | import random 10 | 11 | from datasets import load_dataset 12 | 13 | os.environ['CUDA_VISIBLE_DEVICES'] = '0,1,2,3' 14 | 15 | import transformers 16 | from transformers import ( 17 | CONFIG_MAPPING, 18 | MODEL_FOR_MASKED_LM_MAPPING, 19 | AutoConfig, 20 | AutoModelForMaskedLM, 21 | AutoModelForSequenceClassification, 22 | AutoTokenizer, 23 | DataCollatorForLanguageModeling, 24 | DataCollatorWithPadding, 25 | HfArgumentParser, 26 | Trainer, 27 | TrainingArguments, 28 | default_data_collator, 29 | set_seed, 30 | EvalPrediction, 31 | BertModel, 32 | BertForPreTraining, 33 | RobertaModel 34 | ) 35 | from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTrainedTokenizerBase 36 | from transformers.trainer_utils import is_main_process 37 | from transformers.data.data_collator import DataCollatorForLanguageModeling 38 | from transformers.file_utils import cached_property, torch_required, is_torch_available, is_torch_tpu_available 39 | from cl.models import RobertaForCL, BertForCL 40 | from cl.trainers import CLTrainer 41 | 42 | logger = logging.getLogger(__name__) 43 | MODEL_CONFIG_CLASSES = list(MODEL_FOR_MASKED_LM_MAPPING.keys()) 44 | MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) 45 | 46 | @dataclass 47 | class ModelArguments: 48 | """ 49 | Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch. 50 | """ 51 | 52 | # Huggingface's original arguments 53 | model_name_or_path: Optional[str] = field( 54 | default=None, 55 | metadata={ 56 | "help": "The model checkpoint for weights initialization." 57 | "Don't set if you want to train a model from scratch." 
58 | }, 59 | ) 60 | model_type: Optional[str] = field( 61 | default=None, 62 | metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)}, 63 | ) 64 | config_name: Optional[str] = field( 65 | default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"} 66 | ) 67 | tokenizer_name: Optional[str] = field( 68 | default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"} 69 | ) 70 | cache_dir: Optional[str] = field( 71 | default=None, 72 | metadata={"help": "Where do you want to store the pretrained models downloaded from huggingface.co"}, 73 | ) 74 | use_fast_tokenizer: bool = field( 75 | default=True, 76 | metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."}, 77 | ) 78 | model_revision: str = field( 79 | default="main", 80 | metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, 81 | ) 82 | use_auth_token: bool = field( 83 | default=False, 84 | metadata={ 85 | "help": "Will use the token generated when running `transformers-cli login` (necessary to use this script " 86 | "with private models)." 87 | }, 88 | ) 89 | 90 | temp: float = field( 91 | default=0.05, 92 | metadata={ 93 | "help": "Temperature for softmax." 94 | } 95 | ) 96 | pooler_type: str = field( 97 | default="cls", 98 | metadata={ 99 | "help": "What kind of pooler to use (cls, cls_before_pooler, avg, avg_top2, avg_first_last)." 100 | } 101 | ) 102 | hard_negative_weight: float = field( 103 | default=1.0, 104 | metadata={ 105 | "help": "The **logit** of weight for hard negatives (only effective if hard negatives are used)." 106 | } 107 | ) 108 | do_mlm: bool = field( 109 | default=False, 110 | metadata={ 111 | "help": "Whether to use MLM auxiliary objective." 112 | } 113 | ) 114 | mlm_weight: float = field( 115 | default=0.15, 116 | metadata={ 117 | "help": "Weight for MLM auxiliary objective (only effective if --do_mlm)." 118 | } 119 | ) 120 | 121 | 122 | @dataclass 123 | class DataTrainingArguments: 124 | """ 125 | Arguments pertaining to what data we are going to input our model for training and eval. 126 | """ 127 | 128 | # Huggingface's original arguments. 129 | dataset_name: Optional[str] = field( 130 | default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."} 131 | ) 132 | dataset_config_name: Optional[str] = field( 133 | default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."} 134 | ) 135 | overwrite_cache: bool = field( 136 | default=False, metadata={"help": "Overwrite the cached training and evaluation sets"} 137 | ) 138 | validation_split_percentage: Optional[int] = field( 139 | default=5, 140 | metadata={ 141 | "help": "The percentage of the train set used as validation set in case there's no validation split" 142 | }, 143 | ) 144 | preprocessing_num_workers: Optional[int] = field( 145 | default=None, 146 | metadata={"help": "The number of processes to use for the preprocessing."}, 147 | ) 148 | 149 | # SimCSE's arguments 150 | train_file: Optional[str] = field( 151 | default=None, 152 | metadata={"help": "The training data file (.txt or .csv)."} 153 | ) 154 | max_seq_length: Optional[int] = field( 155 | default=32, 156 | metadata={ 157 | "help": "The maximum total input sequence length after tokenization. Sequences longer " 158 | "than this will be truncated." 
159 | }, 160 | ) 161 | pad_to_max_length: bool = field( 162 | default=False, 163 | metadata={ 164 | "help": "Whether to pad all samples to `max_seq_length`. " 165 | "If False, will pad the samples dynamically when batching to the maximum length in the batch." 166 | }, 167 | ) 168 | mlm_probability: float = field( 169 | default=0.15, 170 | metadata={"help": "Ratio of tokens to mask for MLM (only effective if --do_mlm)"} 171 | ) 172 | 173 | def __post_init__(self): 174 | if self.dataset_name is None and self.train_file is None and self.validation_file is None: 175 | raise ValueError("Need either a dataset name or a training/validation file.") 176 | else: 177 | if self.train_file is not None: 178 | extension = self.train_file.split(".")[-1] 179 | assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file." 180 | 181 | 182 | @dataclass 183 | class OurTrainingArguments(TrainingArguments): 184 | # Evaluation 185 | ## By default, we evaluate STS (dev) during training (for selecting best checkpoints) and evaluate 186 | ## both STS and transfer tasks (dev) at the end of training. Using --eval_transfer will allow evaluating 187 | ## both STS and transfer tasks (dev) during training. 188 | eval_transfer: bool = field( 189 | default=False, 190 | metadata={"help": "Evaluate transfer task dev sets (in validation)."} 191 | ) 192 | 193 | @cached_property 194 | @torch_required 195 | def _setup_devices(self) -> "torch.device": 196 | logger.info("PyTorch: setting up devices") 197 | if self.no_cuda: 198 | device = torch.device("cpu") 199 | self._n_gpu = 0 200 | elif is_torch_tpu_available(): 201 | device = xm.xla_device() 202 | self._n_gpu = 0 203 | elif self.local_rank == -1: 204 | # if n_gpu is > 1 we'll use nn.DataParallel. 205 | # If you only want to use a specific subset of GPUs use `CUDA_VISIBLE_DEVICES=0` 206 | # Explicitly set CUDA to the first (index 0) CUDA device, otherwise `set_device` will 207 | # trigger an error that a device index is missing. Index 0 takes into account the 208 | # GPUs available in the environment, so `CUDA_VISIBLE_DEVICES=1,2` with `cuda:0` 209 | # will use the first GPU in that env, i.e. GPU#1 210 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 211 | # Sometimes the line in the postinit has not been run before we end up here, so just checking we're not at 212 | # the default value. 213 | self._n_gpu = torch.cuda.device_count() 214 | else: 215 | # Here, we'll use torch.distributed. 216 | # Initializes the distributed backend which will take care of synchronizing nodes/GPUs 217 | # 218 | # deepspeed performs its own DDP internally, and requires the program to be started with: 219 | # deepspeed ./program.py 220 | # rather than: 221 | # python -m torch.distributed.launch --nproc_per_node=2 ./program.py 222 | if self.deepspeed: 223 | from .integrations import is_deepspeed_available 224 | 225 | if not is_deepspeed_available(): 226 | raise ImportError("--deepspeed requires deepspeed: `pip install deepspeed`.") 227 | import deepspeed 228 | 229 | deepspeed.init_distributed() 230 | else: 231 | torch.distributed.init_process_group(backend="nccl") 232 | device = torch.device("cuda", self.local_rank) 233 | self._n_gpu = 1 234 | 235 | if device.type == "cuda": 236 | torch.cuda.set_device(device) 237 | 238 | return device 239 | 240 | 241 | def main(): 242 | # See all possible arguments in src/transformers/training_args.py 243 | # or by passing the --help flag to this script. 
244 | # We now keep distinct sets of args, for a cleaner separation of concerns. 245 | 246 | parser = HfArgumentParser((ModelArguments, DataTrainingArguments, OurTrainingArguments)) 247 | if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): 248 | # If we pass only one argument to the script and it's the path to a json file, 249 | # let's parse it to get our arguments. 250 | model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1])) 251 | else: 252 | model_args, data_args, training_args = parser.parse_args_into_dataclasses() 253 | 254 | if ( 255 | os.path.exists(training_args.output_dir) 256 | and os.listdir(training_args.output_dir) 257 | and training_args.do_train 258 | and not training_args.overwrite_output_dir 259 | ): 260 | raise ValueError( 261 | f"Output directory ({training_args.output_dir}) already exists and is not empty." 262 | "Use --overwrite_output_dir to overcome." 263 | ) 264 | 265 | # Setup logging 266 | logging.basicConfig( 267 | format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", 268 | datefmt="%m/%d/%Y %H:%M:%S", 269 | level=logging.INFO if is_main_process(training_args.local_rank) else logging.WARN, 270 | ) 271 | 272 | # Log on each process the small summary: 273 | logger.warning( 274 | f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}" 275 | + f" distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}" 276 | ) 277 | # Set the verbosity to info of the Transformers logger (on main process only): 278 | if is_main_process(training_args.local_rank): 279 | transformers.utils.logging.set_verbosity_info() 280 | transformers.utils.logging.enable_default_handler() 281 | transformers.utils.logging.enable_explicit_format() 282 | logger.info("Training/evaluation parameters %s", training_args) 283 | 284 | # Set seed before initializing model. 285 | set_seed(training_args.seed) 286 | 287 | # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below) 288 | # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/ 289 | # (the dataset will be downloaded automatically from the datasets Hub 290 | # 291 | # For CSV/JSON files, this script will use the column called 'text' or the first column. You can easily tweak this 292 | # behavior (see below) 293 | # 294 | # In distributed training, the load_dataset function guarantee that only one local process can concurrently 295 | # download the dataset. 296 | data_files = {} 297 | if data_args.train_file is not None: 298 | data_files["train"] = data_args.train_file 299 | extension = data_args.train_file.split(".")[-1] 300 | if extension == "csv": 301 | datasets = load_dataset(extension, data_files=data_files, cache_dir="./data/", delimiter="\t" if "tsv" in data_args.train_file else ",") 302 | else: 303 | datasets = load_dataset(extension, data_files=data_files, cache_dir="./data/") 304 | 305 | # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at 306 | # https://huggingface.co/docs/datasets/loading_datasets.html. 307 | 308 | # Load pretrained model and tokenizer 309 | # 310 | # Distributed training: 311 | # The .from_pretrained methods guarantee that only one local process can concurrently 312 | # download model & vocab. 
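    # The config and tokenizer are loaded with the vanilla Auto* classes; the encoder itself is
    # wrapped by BertForCL / RobertaForCL (the contrastive-learning models used by this repo),
    # which receive model_args so that pooler_type, temp, do_mlm, etc. are visible inside the model.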
313 | config_kwargs = { 314 | "cache_dir": model_args.cache_dir, 315 | "revision": model_args.model_revision, 316 | "use_auth_token": True if model_args.use_auth_token else None, 317 | } 318 | if model_args.config_name: 319 | config = AutoConfig.from_pretrained(model_args.config_name, **config_kwargs) 320 | elif model_args.model_name_or_path: 321 | config = AutoConfig.from_pretrained(model_args.model_name_or_path, **config_kwargs) 322 | else: 323 | config = CONFIG_MAPPING[model_args.model_type]() 324 | logger.warning("You are instantiating a new config instance from scratch.") 325 | 326 | tokenizer_kwargs = { 327 | "cache_dir": model_args.cache_dir, 328 | "use_fast": model_args.use_fast_tokenizer, 329 | "revision": model_args.model_revision, 330 | "use_auth_token": True if model_args.use_auth_token else None, 331 | } 332 | if model_args.tokenizer_name: 333 | tokenizer = AutoTokenizer.from_pretrained(model_args.tokenizer_name, **tokenizer_kwargs) 334 | elif model_args.model_name_or_path: 335 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path, **tokenizer_kwargs) 336 | else: 337 | raise ValueError( 338 | "You are instantiating a new tokenizer from scratch. This is not supported by this script." 339 | "You can do it from another script, save it, and load it from here, using --tokenizer_name." 340 | ) 341 | 342 | if model_args.model_name_or_path: 343 | if 'roberta' in model_args.model_name_or_path: 344 | model = RobertaForCL.from_pretrained( 345 | model_args.model_name_or_path, 346 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 347 | config=config, 348 | cache_dir=model_args.cache_dir, 349 | revision=model_args.model_revision, 350 | use_auth_token=True if model_args.use_auth_token else None, 351 | model_args=model_args 352 | ) 353 | elif 'bert' in model_args.model_name_or_path: 354 | model = BertForCL.from_pretrained( 355 | model_args.model_name_or_path, 356 | from_tf=bool(".ckpt" in model_args.model_name_or_path), 357 | config=config, 358 | cache_dir=model_args.cache_dir, 359 | revision=model_args.model_revision, 360 | use_auth_token=True if model_args.use_auth_token else None, 361 | model_args=model_args 362 | ) 363 | if model_args.do_mlm: 364 | pretrained_model = BertForPreTraining.from_pretrained(model_args.model_name_or_path) 365 | model.lm_head.load_state_dict(pretrained_model.cls.predictions.state_dict()) 366 | else: 367 | raise NotImplementedError 368 | else: 369 | raise NotImplementedError 370 | logger.info("Training new model from scratch") 371 | model = AutoModelForMaskedLM.from_config(config) 372 | 373 | model.resize_token_embeddings(len(tokenizer)) 374 | 375 | # Prepare features 376 | column_names = datasets["train"].column_names 377 | if len(column_names) == 3: 378 | # Pair datasets with hard negatives 379 | sent0_cname = column_names[0] 380 | sent1_cname = column_names[1] 381 | sent2_cname = column_names[2] 382 | else: 383 | raise NotImplementedError 384 | 385 | def prepare_features(examples): 386 | # padding = longest (default) 387 | # If no sentence in the batch exceed the max length, then use 388 | # the max sentence length in the batch, otherwise use the 389 | # max sentence length in the argument and truncate those that 390 | # exceed the max length. 391 | # padding = max_length (when pad_to_max_length, for pressure test) 392 | # All sentences are padded/truncated to data_args.max_seq_length. 
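        # Each training example is a (sent0, sent1, hard negative) triplet. All three columns are
        # tokenized in a single call and regrouped below so that features[key][i] is a list of the
        # three encodings of example i; the data collator later flattens, pads, and reshapes them
        # to (batch_size, num_sent, seq_len).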
393 | total = len(examples[sent0_cname]) 394 | 395 | # Avoid "None" fields 396 | for idx in range(total): 397 | if examples[sent0_cname][idx] is None: 398 | examples[sent0_cname][idx] = " " 399 | if examples[sent1_cname][idx] is None: 400 | examples[sent1_cname][idx] = " " 401 | 402 | sentences = examples[sent0_cname] + examples[sent1_cname] 403 | 404 | for idx in range(total): 405 | if examples[sent2_cname][idx] is None: 406 | examples[sent2_cname][idx] = " " 407 | sentences += examples[sent2_cname] 408 | 409 | 410 | sent_features = tokenizer( 411 | sentences, 412 | max_length=data_args.max_seq_length, 413 | truncation=True, 414 | padding="max_length" if data_args.pad_to_max_length else False, 415 | ) 416 | 417 | features = {} 418 | for key in sent_features: 419 | features[key] = [[sent_features[key][i], sent_features[key][i + total], sent_features[key][i + total * 2]] 420 | for i in range(total)] 421 | 422 | return features 423 | 424 | if training_args.do_train: 425 | train_dataset = datasets["train"].map( 426 | prepare_features, 427 | batched=True, 428 | num_proc=data_args.preprocessing_num_workers, 429 | remove_columns=column_names, 430 | load_from_cache_file=not data_args.overwrite_cache, 431 | ) 432 | 433 | # Data collator 434 | @dataclass 435 | class OurDataCollatorWithPadding: 436 | 437 | tokenizer: PreTrainedTokenizerBase 438 | padding: Union[bool, str, PaddingStrategy] = True 439 | max_length: Optional[int] = None 440 | pad_to_multiple_of: Optional[int] = None 441 | mlm: bool = True 442 | mlm_probability: float = data_args.mlm_probability 443 | 444 | def __call__(self, features: List[Dict[str, Union[List[int], List[List[int]], torch.Tensor]]]) -> Dict[str, torch.Tensor]: 445 | special_keys = ['input_ids', 'attention_mask', 'token_type_ids', 'mlm_input_ids', 'mlm_labels'] 446 | bs = len(features) 447 | if bs > 0: 448 | num_sent = len(features[0]['input_ids']) 449 | else: 450 | return 451 | flat_features = [] 452 | for feature in features: 453 | for i in range(num_sent): 454 | flat_features.append({k: feature[k][i] if k in special_keys else feature[k] for k in feature}) 455 | 456 | batch = self.tokenizer.pad( 457 | flat_features, 458 | padding=self.padding, 459 | max_length=self.max_length, 460 | pad_to_multiple_of=self.pad_to_multiple_of, 461 | return_tensors="pt", 462 | ) 463 | if model_args.do_mlm: 464 | batch["mlm_input_ids"], batch["mlm_labels"] = self.mask_tokens(batch["input_ids"]) 465 | 466 | batch = {k: batch[k].view(bs, num_sent, -1) if k in special_keys else batch[k].view(bs, num_sent, -1)[:, 0] for k in batch} 467 | 468 | if "label" in batch: 469 | batch["labels"] = batch["label"] 470 | del batch["label"] 471 | if "label_ids" in batch: 472 | batch["labels"] = batch["label_ids"] 473 | del batch["label_ids"] 474 | 475 | return batch 476 | 477 | def mask_tokens( 478 | self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None 479 | ) -> Tuple[torch.Tensor, torch.Tensor]: 480 | """ 481 | Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original. 
482 | """ 483 | inputs = inputs.clone() 484 | labels = inputs.clone() 485 | # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`) 486 | probability_matrix = torch.full(labels.shape, self.mlm_probability) 487 | if special_tokens_mask is None: 488 | special_tokens_mask = [ 489 | self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist() 490 | ] 491 | special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool) 492 | else: 493 | special_tokens_mask = special_tokens_mask.bool() 494 | 495 | probability_matrix.masked_fill_(special_tokens_mask, value=0.0) 496 | masked_indices = torch.bernoulli(probability_matrix).bool() 497 | labels[~masked_indices] = -100 # We only compute loss on masked tokens 498 | 499 | # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK]) 500 | indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices 501 | inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token) 502 | 503 | # 10% of the time, we replace masked input tokens with random word 504 | indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced 505 | random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long) 506 | inputs[indices_random] = random_words[indices_random] 507 | 508 | # The rest of the time (10% of the time) we keep the masked input tokens unchanged 509 | return inputs, labels 510 | 511 | data_collator = default_data_collator if data_args.pad_to_max_length else OurDataCollatorWithPadding(tokenizer) 512 | 513 | trainer = CLTrainer( 514 | model=model, 515 | args=training_args, 516 | train_dataset=train_dataset if training_args.do_train else None, 517 | tokenizer=tokenizer, 518 | data_collator=data_collator, 519 | ) 520 | trainer.model_args = model_args 521 | 522 | # Training 523 | if training_args.do_train: 524 | model_path = ( 525 | model_args.model_name_or_path 526 | if (model_args.model_name_or_path is not None and os.path.isdir(model_args.model_name_or_path)) 527 | else None 528 | ) 529 | train_result = trainer.train(model_path=model_path) 530 | trainer.save_model() # Saves the tokenizer too for easy upload 531 | 532 | output_train_file = os.path.join(training_args.output_dir, "train_results.txt") 533 | if trainer.is_world_process_zero(): 534 | with open(output_train_file, "w") as writer: 535 | logger.info("***** Train results *****") 536 | for key, value in sorted(train_result.metrics.items()): 537 | logger.info(f" {key} = {value}") 538 | writer.write(f"{key} = {value}\n") 539 | 540 | # Need to save the state, since Trainer.save_model saves only the tokenizer with the model 541 | trainer.state.save_to_json(os.path.join(training_args.output_dir, "trainer_state.json")) 542 | 543 | # Evaluation 544 | results = {} 545 | if training_args.do_eval: 546 | logger.info("*** Evaluate ***") 547 | results = trainer.evaluate_testset() 548 | output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt") 549 | if trainer.is_world_process_zero(): 550 | with open(output_eval_file, "w") as writer: 551 | logger.info("***** Eval results *****") 552 | for key, value in sorted(results.items()): 553 | logger.info(f" {key} = {value}") 554 | writer.write(f"{key} = {value}\n") 555 | 556 | return results 557 | 558 | 559 | if __name__ == "__main__": 560 | main() 561 | --------------------------------------------------------------------------------
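For reference, a minimal standalone sketch of the feature layout that `prepare_features` and `OurDataCollatorWithPadding` build. This snippet is not part of the repository, and the example triplet below is invented:

```python
# Illustrative sketch only -- not part of train.py. It mimics what prepare_features
# and OurDataCollatorWithPadding do for one CSV row (sent0, sent1, hard negative),
# so the resulting (batch_size, num_sent, seq_len) layout is easy to see.
import torch
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# One made-up NLI triplet: premise, entailment (positive), contradiction (hard negative).
sent0 = ["A man is playing a guitar."]
sent1 = ["Someone is playing an instrument."]
sent2 = ["Nobody is making any music."]

total = len(sent0)
sentences = sent0 + sent1 + sent2  # 3 * total sentences tokenized in a single call

enc = tokenizer(sentences, max_length=32, truncation=True, padding="max_length")

# Regroup as in prepare_features: features[key][i] holds the three encodings of example i.
features = {
    key: [[enc[key][i], enc[key][i + total], enc[key][i + 2 * total]] for i in range(total)]
    for key in enc
}

# After padding, the collator reshapes the flattened sentences back to (bs, num_sent, seq_len).
input_ids = torch.tensor(features["input_ids"])
print(input_ids.shape)  # torch.Size([1, 3, 32])
```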