├── SentEval ├── senteval │ ├── tools │ │ ├── __init__.py │ │ ├── relatedness.py │ │ ├── classifier.py │ │ ├── validation.py │ │ └── ranking.py │ ├── __init__.py │ ├── utils.py │ ├── trec.py │ ├── binary.py │ ├── sst.py │ ├── mrpc.py │ ├── snli.py │ ├── rank.py │ ├── engine.py │ ├── probing.py │ ├── sick.py │ └── sts.py ├── .gitignore ├── setup.py ├── LICENSE ├── examples │ ├── skipthought.py │ ├── googleuse.py │ ├── gensen.py │ ├── infersent.py │ ├── bow.py │ └── models.py └── README.md ├── paper ├── paper.pdf └── appendix.pdf ├── .gitignore ├── requirements.txt ├── config.py ├── args.py ├── eval_mteb.py ├── README.md ├── trainer.py ├── train.py ├── eval_senteval.py └── model.py /SentEval/senteval/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /paper/paper.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinghaow99/DenoSent/HEAD/paper/paper.pdf -------------------------------------------------------------------------------- /paper/appendix.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/xinghaow99/DenoSent/HEAD/paper/appendix.pdf -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | __pycache__ 2 | results 3 | wandb 4 | mteb_results 5 | SentEval/data/downstream/* 6 | *.sh -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.30.2 2 | mteb==1.0.0 3 | numpy==1.23.4 4 | accelerate 5 | torch==2.0.1 6 | scipy==1.9.2 7 | scikit-learn==1.1.2 8 | datasets 9 | prettytable 10 | wandb -------------------------------------------------------------------------------- /SentEval/.gitignore: -------------------------------------------------------------------------------- 1 | # SentEval data and .pyc files 2 | 3 | 4 | 5 | # python 6 | __pycache__/ 7 | *.py[cod] 8 | *$py.class 9 | 10 | # log files 11 | *.log 12 | *.txt 13 | 14 | # data files 15 | data/senteval_data* 16 | data/downstream/ 17 | -------------------------------------------------------------------------------- /SentEval/senteval/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import 9 | 10 | from senteval.engine import SE 11 | -------------------------------------------------------------------------------- /SentEval/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | import io 9 | from setuptools import setup, find_packages 10 | 11 | with io.open('./README.md', encoding='utf-8') as f: 12 | readme = f.read() 13 | 14 | setup( 15 | name='SentEval', 16 | version='0.1.0', 17 | url='https://github.com/facebookresearch/SentEval', 18 | packages=find_packages(exclude=['examples']), 19 | license='Attribution-NonCommercial 4.0 International', 20 | long_description=readme, 21 | ) 22 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | from transformers import PretrainedConfig 2 | from typing import Optional 3 | 4 | class DenoSentConfig(PretrainedConfig): 5 | def __init__(self, 6 | encoder_name_or_path:Optional[str]=None, 7 | hidden_size:Optional[int]=768, 8 | max_length:Optional[int]=32, 9 | decoder_num_heads:Optional[int]=1, 10 | decoder_num_layers:Optional[int]=16, 11 | decoder_noise_dropout:Optional[float]=0.825, 12 | pooler:Optional[str]='mask', 13 | do_contrastive:Optional[bool]=False, 14 | do_generative:Optional[bool]=False, 15 | prompt_format:Optional[str]='[X] means [MASK]', 16 | contrastive_weight:Optional[float]=1.0, 17 | generative_weight:Optional[float]=1.0, 18 | contrastive_temp: Optional[float]=0.05, 19 | **kwargs): 20 | super().__init__(**kwargs) 21 | self.encoder_name_or_path = encoder_name_or_path 22 | self.hidden_size = hidden_size 23 | self.max_length = max_length 24 | self.decoder_num_heads = decoder_num_heads 25 | self.decoder_num_layers = decoder_num_layers 26 | self.decoder_noise_dropout = decoder_noise_dropout 27 | self.pooler = pooler 28 | self.do_contrastive = do_contrastive 29 | self.do_generative = do_generative 30 | self.prompt_format = prompt_format 31 | self.contrastive_weight = contrastive_weight 32 | self.generative_weight = generative_weight 33 | self.contrastive_temp = contrastive_temp -------------------------------------------------------------------------------- /SentEval/LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For SentEval software 4 | 5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /args.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass, field 2 | from typing import List, Optional, Tuple, Union, Dict 3 | 4 | @dataclass 5 | class ModelArguments: 6 | model_name_or_path: Optional[str] = field( 7 | default='bert-base-uncased' 8 | ) 9 | max_length: Optional[int] = field( 10 | default=32 11 | ) 12 | pooler: Optional[str] = field( 13 | default='cls' 14 | ) 15 | prompt_format: Optional[str] = field( 16 | default='"[X]" means [MASK].' 17 | ) 18 | decoder_num_layers: Optional[int] = field( 19 | default=16 20 | ) 21 | decoder_num_heads: Optional[int] = field( 22 | default=1 23 | ) 24 | decoder_target_dropout: Optional[float] = field( 25 | default=0.825 26 | ) 27 | 28 | do_contrastive: Optional[bool] = field( 29 | default=False 30 | ) 31 | do_generative: Optional[bool] = field( 32 | default=False 33 | ) 34 | contrastive_temp: Optional[float] = field( 35 | default=0.05 36 | ) 37 | contrastive_weight: Optional[float] = field( 38 | default=1.0 39 | ) 40 | generative_weight: Optional[float] = field( 41 | default=1.0 42 | ) 43 | 44 | 45 | @dataclass 46 | class DatasetArguments: 47 | train_dataset: Optional[str] = field( 48 | # Singhoo/stssickr, princeton-nlp/datasets-for-simcse, bookcorpus 49 | default='Singhoo/denosent_data', 50 | metadata={ 51 | 'help': 'Can be princeton-nlp/datasets-for-simcse, wiki1m-aug, wiki1m-aug-cleaned, Singhoo/wiki1m_translated, Singhoo/stssickr, bookcorpus.' 
52 | } 53 | ) 54 | split: Optional[str] = field( 55 | default='train' 56 | ) 57 | use_auth_token: Optional[bool] = field( 58 | default=False 59 | ) 60 | group: Optional[str] = field( 61 | default=None 62 | ) 63 | -------------------------------------------------------------------------------- /eval_mteb.py: -------------------------------------------------------------------------------- 1 | 2 | from mteb import MTEB 3 | import argparse 4 | import logging 5 | from model import DenoSentModel 6 | from config import DenoSentConfig 7 | 8 | logging.basicConfig(level=logging.INFO) 9 | 10 | TASK_CLASSIFICATION = [ 11 | "AmazonCounterfactualClassification", 12 | "AmazonReviewsClassification", 13 | "Banking77Classification", 14 | "EmotionClassification", 15 | "MassiveIntentClassification", 16 | "MassiveScenarioClassification", 17 | "MTOPDomainClassification", 18 | "MTOPIntentClassification", 19 | "ToxicConversationsClassification", 20 | "TweetSentimentExtractionClassification", 21 | ] 22 | 23 | 24 | 25 | TASK_RERANKING = [ 26 | "AskUbuntuDupQuestions", 27 | "MindSmallReranking", 28 | "SciDocsRR", 29 | "StackOverflowDupQuestions", 30 | ] 31 | 32 | TASK_RETRIEVAL = [ 33 | "QuoraRetrieval", 34 | ] 35 | 36 | TASK_STS = [ 37 | "SICK-R", 38 | "STS12", 39 | "STS13", 40 | "STS14", 41 | "STS15", 42 | "STS16", 43 | "STSBenchmark", 44 | ] 45 | 46 | TASK_LIST = TASK_CLASSIFICATION + TASK_RERANKING + TASK_RETRIEVAL + TASK_STS 47 | 48 | 49 | def main(): 50 | parser = argparse.ArgumentParser() 51 | parser.add_argument("--model_name_or_path", type=str, 52 | help="Transformers' model name or path") 53 | 54 | args = parser.parse_args() 55 | 56 | config = DenoSentConfig.from_pretrained(args.model_name_or_path) 57 | model = DenoSentModel.from_pretrained(args.model_name_or_path, config=config) 58 | model = model.to("cuda") 59 | model.eval() 60 | 61 | eval_splits = ["test"] 62 | evaluation = MTEB(tasks=TASK_LIST, task_langs=["en"], task_categories=['S2S']) 63 | evaluation.run(model, overwrite_results=True, batch_size=64, eval_splits=eval_splits, output_folder='mteb_results/'+args.model_name_or_path.split('/')[-1]) 64 | 65 | if __name__ == '__main__': 66 | main() -------------------------------------------------------------------------------- /SentEval/examples/skipthought.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | """ 11 | Example of file for SkipThought in SentEval 12 | """ 13 | import logging 14 | import sys 15 | sys.setdefaultencoding('utf8') 16 | 17 | 18 | # Set PATHs 19 | PATH_TO_SENTEVAL = '../' 20 | PATH_TO_DATA = '../data/senteval_data/' 21 | PATH_TO_SKIPTHOUGHT = '' 22 | 23 | assert PATH_TO_SKIPTHOUGHT != '', 'Download skipthought and set correct PATH' 24 | 25 | # import skipthought and Senteval 26 | sys.path.insert(0, PATH_TO_SKIPTHOUGHT) 27 | import skipthoughts 28 | sys.path.insert(0, PATH_TO_SENTEVAL) 29 | import senteval 30 | 31 | 32 | def prepare(params, samples): 33 | return 34 | 35 | def batcher(params, batch): 36 | batch = [str(' '.join(sent), errors="ignore") if sent != [] else '.' 
for sent in batch] 37 | embeddings = skipthoughts.encode(params['encoder'], batch, 38 | verbose=False, use_eos=True) 39 | return embeddings 40 | 41 | 42 | # Set params for SentEval 43 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10, 'batch_size': 512} 44 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 45 | 'tenacity': 5, 'epoch_size': 4} 46 | # Set up logger 47 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 48 | 49 | if __name__ == "__main__": 50 | # Load SkipThought model 51 | params_senteval['encoder'] = skipthoughts.load_model() 52 | 53 | se = senteval.engine.SE(params_senteval, batcher, prepare) 54 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 55 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 56 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 57 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 58 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 59 | 'OddManOut', 'CoordinationInversion'] 60 | results = se.eval(transfer_tasks) 61 | print(results) 62 | -------------------------------------------------------------------------------- /SentEval/examples/googleuse.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division 9 | 10 | import os 11 | import sys 12 | import logging 13 | import tensorflow as tf 14 | import tensorflow_hub as hub 15 | tf.logging.set_verbosity(0) 16 | 17 | # Set PATHs 18 | PATH_TO_SENTEVAL = '../' 19 | PATH_TO_DATA = '../data' 20 | 21 | # import SentEval 22 | sys.path.insert(0, PATH_TO_SENTEVAL) 23 | import senteval 24 | 25 | # tensorflow session 26 | session = tf.Session() 27 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 28 | 29 | # SentEval prepare and batcher 30 | def prepare(params, samples): 31 | return 32 | 33 | def batcher(params, batch): 34 | batch = [' '.join(sent) if sent != [] else '.' 
for sent in batch] 35 | embeddings = params['google_use'](batch) 36 | return embeddings 37 | 38 | def make_embed_fn(module): 39 | with tf.Graph().as_default(): 40 | sentences = tf.placeholder(tf.string) 41 | embed = hub.Module(module) 42 | embeddings = embed(sentences) 43 | session = tf.train.MonitoredSession() 44 | return lambda x: session.run(embeddings, {sentences: x}) 45 | 46 | # Start TF session and load Google Universal Sentence Encoder 47 | encoder = make_embed_fn("https://tfhub.dev/google/universal-sentence-encoder-large/2") 48 | 49 | # Set params for SentEval 50 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 51 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 52 | 'tenacity': 3, 'epoch_size': 2} 53 | params_senteval['google_use'] = encoder 54 | 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | se = senteval.engine.SE(params_senteval, batcher, prepare) 60 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 61 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 62 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 63 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 64 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 65 | 'OddManOut', 'CoordinationInversion'] 66 | results = se.eval(transfer_tasks) 67 | print(results) 68 | -------------------------------------------------------------------------------- /SentEval/examples/gensen.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Clone GenSen repo here: https://github.com/Maluuba/gensen.git 10 | And follow instructions for loading the model used in batcher 11 | """ 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import sys 16 | import logging 17 | # import GenSen package 18 | from gensen import GenSen, GenSenSingle 19 | 20 | # Set PATHs 21 | PATH_TO_SENTEVAL = '../' 22 | PATH_TO_DATA = '../data' 23 | 24 | # import SentEval 25 | sys.path.insert(0, PATH_TO_SENTEVAL) 26 | import senteval 27 | 28 | # SentEval prepare and batcher 29 | def prepare(params, samples): 30 | return 31 | 32 | def batcher(params, batch): 33 | batch = [' '.join(sent) if sent != [] else '.' 
for sent in batch] 34 | _, reps_h_t = gensen.get_representation( 35 | sentences, pool='last', return_numpy=True, tokenize=True 36 | ) 37 | embeddings = reps_h_t 38 | return embeddings 39 | 40 | # Load GenSen model 41 | gensen_1 = GenSenSingle( 42 | model_folder='../data/models', 43 | filename_prefix='nli_large_bothskip', 44 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 45 | ) 46 | gensen_2 = GenSenSingle( 47 | model_folder='../data/models', 48 | filename_prefix='nli_large_bothskip_parse', 49 | pretrained_emb='../data/embedding/glove.840B.300d.h5' 50 | ) 51 | gensen_encoder = GenSen(gensen_1, gensen_2) 52 | reps_h, reps_h_t = gensen.get_representation( 53 | sentences, pool='last', return_numpy=True, tokenize=True 54 | ) 55 | 56 | # Set params for SentEval 57 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 58 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 59 | 'tenacity': 3, 'epoch_size': 2} 60 | params_senteval['gensen'] = gensen_encoder 61 | 62 | # Set up logger 63 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 64 | 65 | if __name__ == "__main__": 66 | se = senteval.engine.SE(params_senteval, batcher, prepare) 67 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 68 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 69 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 70 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 71 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 72 | 'OddManOut', 'CoordinationInversion'] 73 | results = se.eval(transfer_tasks) 74 | print(results) 75 | -------------------------------------------------------------------------------- /SentEval/examples/infersent.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | InferSent models. See https://github.com/facebookresearch/InferSent. 
10 | """ 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import sys 15 | import os 16 | import torch 17 | import logging 18 | 19 | # get models.py from InferSent repo 20 | from models import InferSent 21 | 22 | # Set PATHs 23 | PATH_SENTEVAL = '../' 24 | PATH_TO_DATA = '../data' 25 | PATH_TO_W2V = 'PATH/TO/glove.840B.300d.txt' # or crawl-300d-2M.vec for V2 26 | MODEL_PATH = 'infersent1.pkl' 27 | V = 1 # version of InferSent 28 | 29 | assert os.path.isfile(MODEL_PATH) and os.path.isfile(PATH_TO_W2V), \ 30 | 'Set MODEL and GloVe PATHs' 31 | 32 | # import senteval 33 | sys.path.insert(0, PATH_SENTEVAL) 34 | import senteval 35 | 36 | 37 | def prepare(params, samples): 38 | params.infersent.build_vocab([' '.join(s) for s in samples], tokenize=False) 39 | 40 | 41 | def batcher(params, batch): 42 | sentences = [' '.join(s) for s in batch] 43 | embeddings = params.infersent.encode(sentences, bsize=params.batch_size, tokenize=False) 44 | return embeddings 45 | 46 | 47 | """ 48 | Evaluation of trained model on Transfer Tasks (SentEval) 49 | """ 50 | 51 | # define senteval params 52 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 53 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 54 | 'tenacity': 3, 'epoch_size': 2} 55 | # Set up logger 56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 57 | 58 | if __name__ == "__main__": 59 | # Load InferSent model 60 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 61 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': V} 62 | model = InferSent(params_model) 63 | model.load_state_dict(torch.load(MODEL_PATH)) 64 | model.set_w2v_path(PATH_TO_W2V) 65 | 66 | params_senteval['infersent'] = model.cuda() 67 | 68 | se = senteval.engine.SE(params_senteval, batcher, prepare) 69 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 70 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 71 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 72 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 73 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 74 | 'OddManOut', 'CoordinationInversion'] 75 | results = se.eval(transfer_tasks) 76 | print(results) 77 | -------------------------------------------------------------------------------- /SentEval/senteval/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import numpy as np 11 | import re 12 | import inspect 13 | from torch import optim 14 | 15 | 16 | def create_dictionary(sentences): 17 | words = {} 18 | for s in sentences: 19 | for word in s: 20 | if word in words: 21 | words[word] += 1 22 | else: 23 | words[word] = 1 24 | words[''] = 1e9 + 4 25 | words[''] = 1e9 + 3 26 | words['
<p>
'] = 1e9 + 2 27 | # words[''] = 1e9 + 1 28 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 29 | id2word = [] 30 | word2id = {} 31 | for i, (w, _) in enumerate(sorted_words): 32 | id2word.append(w) 33 | word2id[w] = i 34 | 35 | return id2word, word2id 36 | 37 | 38 | def cosine(u, v): 39 | return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v)) 40 | 41 | 42 | class dotdict(dict): 43 | """ dot.notation access to dictionary attributes """ 44 | __getattr__ = dict.get 45 | __setattr__ = dict.__setitem__ 46 | __delattr__ = dict.__delitem__ 47 | 48 | 49 | def get_optimizer(s): 50 | """ 51 | Parse optimizer parameters. 52 | Input should be of the form: 53 | - "sgd,lr=0.01" 54 | - "adagrad,lr=0.1,lr_decay=0.05" 55 | """ 56 | if "," in s: 57 | method = s[:s.find(',')] 58 | optim_params = {} 59 | for x in s[s.find(',') + 1:].split(','): 60 | split = x.split('=') 61 | assert len(split) == 2 62 | assert re.match("^[+-]?(\d+(\.\d*)?|\.\d+)$", split[1]) is not None 63 | optim_params[split[0]] = float(split[1]) 64 | else: 65 | method = s 66 | optim_params = {} 67 | 68 | if method == 'adadelta': 69 | optim_fn = optim.Adadelta 70 | elif method == 'adagrad': 71 | optim_fn = optim.Adagrad 72 | elif method == 'adam': 73 | optim_fn = optim.Adam 74 | elif method == 'adamax': 75 | optim_fn = optim.Adamax 76 | elif method == 'asgd': 77 | optim_fn = optim.ASGD 78 | elif method == 'rmsprop': 79 | optim_fn = optim.RMSprop 80 | elif method == 'rprop': 81 | optim_fn = optim.Rprop 82 | elif method == 'sgd': 83 | optim_fn = optim.SGD 84 | assert 'lr' in optim_params 85 | else: 86 | raise Exception('Unknown optimization method: "%s"' % method) 87 | 88 | # # check that we give good parameters to the optimizer 89 | # expected_args = inspect.getfullargspec(optim_fn.__init__) 90 | # assert expected_args.args[:2] == ['self', 'params'] 91 | # if not all(k in expected_args.args[2:] for k in optim_params.keys()): 92 | # raise Exception('Unexpected parameters: expected "%s", got "%s"' % ( 93 | # str(expected_args.args[2:]), str(optim_params.keys()))) 94 | 95 | return optim_fn, optim_params 96 | -------------------------------------------------------------------------------- /SentEval/senteval/trec.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | TREC question-type classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import KFoldClassifier 20 | 21 | 22 | class TRECEval(object): 23 | def __init__(self, task_path, seed=1111): 24 | logging.info('***** Transfer task : TREC *****\n\n') 25 | self.seed = seed 26 | self.train = self.loadFile(os.path.join(task_path, 'train_5500.label')) 27 | self.test = self.loadFile(os.path.join(task_path, 'TREC_10.label')) 28 | 29 | def do_prepare(self, params, prepare): 30 | samples = self.train['X'] + self.test['X'] 31 | return prepare(params, samples) 32 | 33 | def loadFile(self, fpath): 34 | trec_data = {'X': [], 'y': []} 35 | tgt2idx = {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 36 | 'HUM': 3, 'LOC': 4, 'NUM': 5} 37 | with io.open(fpath, 'r', encoding='latin-1') as f: 38 | for line in f: 39 | target, sample = line.strip().split(':', 1) 40 | sample = sample.split(' ', 1)[1].split() 41 | assert target in tgt2idx, target 42 | trec_data['X'].append(sample) 43 | trec_data['y'].append(tgt2idx[target]) 44 | return trec_data 45 | 46 | def run(self, params, batcher): 47 | train_embeddings, test_embeddings = [], [] 48 | 49 | # Sort to reduce padding 50 | sorted_corpus_train = sorted(zip(self.train['X'], self.train['y']), 51 | key=lambda z: (len(z[0]), z[1])) 52 | train_samples = [x for (x, y) in sorted_corpus_train] 53 | train_labels = [y for (x, y) in sorted_corpus_train] 54 | 55 | sorted_corpus_test = sorted(zip(self.test['X'], self.test['y']), 56 | key=lambda z: (len(z[0]), z[1])) 57 | test_samples = [x for (x, y) in sorted_corpus_test] 58 | test_labels = [y for (x, y) in sorted_corpus_test] 59 | 60 | # Get train embeddings 61 | for ii in range(0, len(train_labels), params.batch_size): 62 | batch = train_samples[ii:ii + params.batch_size] 63 | embeddings = batcher(params, batch) 64 | train_embeddings.append(embeddings) 65 | train_embeddings = np.vstack(train_embeddings) 66 | logging.info('Computed train embeddings') 67 | 68 | # Get test embeddings 69 | for ii in range(0, len(test_labels), params.batch_size): 70 | batch = test_samples[ii:ii + params.batch_size] 71 | embeddings = batcher(params, batch) 72 | test_embeddings.append(embeddings) 73 | test_embeddings = np.vstack(test_embeddings) 74 | logging.info('Computed test embeddings') 75 | 76 | config_classifier = {'nclasses': 6, 'seed': self.seed, 77 | 'usepytorch': params.usepytorch, 78 | 'classifier': params.classifier, 79 | 'kfold': params.kfold} 80 | clf = KFoldClassifier({'X': train_embeddings, 81 | 'y': np.array(train_labels)}, 82 | {'X': test_embeddings, 83 | 'y': np.array(test_labels)}, 84 | config_classifier) 85 | devacc, testacc, _ = clf.run() 86 | logging.debug('\nDev acc : {0} Test acc : {1} \ 87 | for TREC\n'.format(devacc, testacc)) 88 | return {'devacc': devacc, 'acc': testacc, 89 | 'ndev': len(self.train['X']), 'ntest': len(self.test['X'])} 90 | -------------------------------------------------------------------------------- /SentEval/examples/bow.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from __future__ import absolute_import, division, unicode_literals 9 | 10 | import sys 11 | import io 12 | import numpy as np 13 | import logging 14 | 15 | 16 | # Set PATHs 17 | PATH_TO_SENTEVAL = '../' 18 | PATH_TO_DATA = '../data' 19 | # PATH_TO_VEC = 'glove/glove.840B.300d.txt' 20 | PATH_TO_VEC = 'fasttext/crawl-300d-2M.vec' 21 | 22 | # import SentEval 23 | sys.path.insert(0, PATH_TO_SENTEVAL) 24 | import senteval 25 | 26 | 27 | # Create dictionary 28 | def create_dictionary(sentences, threshold=0): 29 | words = {} 30 | for s in sentences: 31 | for word in s: 32 | words[word] = words.get(word, 0) + 1 33 | 34 | if threshold > 0: 35 | newwords = {} 36 | for word in words: 37 | if words[word] >= threshold: 38 | newwords[word] = words[word] 39 | words = newwords 40 | words[''] = 1e9 + 4 41 | words[''] = 1e9 + 3 42 | words['
<p>
'] = 1e9 + 2 43 | 44 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort 45 | id2word = [] 46 | word2id = {} 47 | for i, (w, _) in enumerate(sorted_words): 48 | id2word.append(w) 49 | word2id[w] = i 50 | 51 | return id2word, word2id 52 | 53 | # Get word vectors from vocabulary (glove, word2vec, fasttext ..) 54 | def get_wordvec(path_to_vec, word2id): 55 | word_vec = {} 56 | 57 | with io.open(path_to_vec, 'r', encoding='utf-8') as f: 58 | # if word2vec or fasttext file : skip first line "next(f)" 59 | for line in f: 60 | word, vec = line.split(' ', 1) 61 | if word in word2id: 62 | word_vec[word] = np.fromstring(vec, sep=' ') 63 | 64 | logging.info('Found {0} words with word vectors, out of \ 65 | {1} words'.format(len(word_vec), len(word2id))) 66 | return word_vec 67 | 68 | 69 | # SentEval prepare and batcher 70 | def prepare(params, samples): 71 | _, params.word2id = create_dictionary(samples) 72 | params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id) 73 | params.wvec_dim = 300 74 | return 75 | 76 | def batcher(params, batch): 77 | batch = [sent if sent != [] else ['.'] for sent in batch] 78 | embeddings = [] 79 | 80 | for sent in batch: 81 | sentvec = [] 82 | for word in sent: 83 | if word in params.word_vec: 84 | sentvec.append(params.word_vec[word]) 85 | if not sentvec: 86 | vec = np.zeros(params.wvec_dim) 87 | sentvec.append(vec) 88 | sentvec = np.mean(sentvec, 0) 89 | embeddings.append(sentvec) 90 | 91 | embeddings = np.vstack(embeddings) 92 | return embeddings 93 | 94 | 95 | # Set params for SentEval 96 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 97 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 98 | 'tenacity': 3, 'epoch_size': 2} 99 | 100 | # Set up logger 101 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 102 | 103 | if __name__ == "__main__": 104 | se = senteval.engine.SE(params_senteval, batcher, prepare) 105 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 106 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 107 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 108 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 109 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 110 | 'OddManOut', 'CoordinationInversion'] 111 | results = se.eval(transfer_tasks) 112 | print(results) 113 | -------------------------------------------------------------------------------- /SentEval/senteval/binary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import io 14 | import os 15 | import numpy as np 16 | import logging 17 | 18 | from senteval.tools.validation import InnerKFoldClassifier 19 | 20 | 21 | class BinaryClassifierEval(object): 22 | def __init__(self, pos, neg, seed=1111): 23 | self.seed = seed 24 | self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg) 25 | self.n_samples = len(self.samples) 26 | 27 | def do_prepare(self, params, prepare): 28 | # prepare is given the whole text 29 | return prepare(params, self.samples) 30 | # prepare puts everything it outputs in "params" : params.word2id etc 31 | # Those output will be further used by "batcher". 32 | 33 | def loadFile(self, fpath): 34 | with io.open(fpath, 'r', encoding='latin-1') as f: 35 | return [line.split() for line in f.read().splitlines()] 36 | 37 | def run(self, params, batcher): 38 | enc_input = [] 39 | # Sort to reduce padding 40 | sorted_corpus = sorted(zip(self.samples, self.labels), 41 | key=lambda z: (len(z[0]), z[1])) 42 | sorted_samples = [x for (x, y) in sorted_corpus] 43 | sorted_labels = [y for (x, y) in sorted_corpus] 44 | logging.info('Generating sentence embeddings') 45 | for ii in range(0, self.n_samples, params.batch_size): 46 | batch = sorted_samples[ii:ii + params.batch_size] 47 | embeddings = batcher(params, batch) 48 | enc_input.append(embeddings) 49 | enc_input = np.vstack(enc_input) 50 | logging.info('Generated sentence embeddings') 51 | 52 | config = {'nclasses': 2, 'seed': self.seed, 53 | 'usepytorch': params.usepytorch, 54 | 'classifier': params.classifier, 55 | 'nhid': params.nhid, 'kfold': params.kfold} 56 | clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config) 57 | devacc, testacc = clf.run() 58 | logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc)) 59 | return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples, 60 | 'ntest': self.n_samples} 61 | 62 | 63 | class CREval(BinaryClassifierEval): 64 | def __init__(self, task_path, seed=1111): 65 | logging.debug('***** Transfer task : CR *****\n\n') 66 | pos = self.loadFile(os.path.join(task_path, 'custrev.pos')) 67 | neg = self.loadFile(os.path.join(task_path, 'custrev.neg')) 68 | super(self.__class__, self).__init__(pos, neg, seed) 69 | 70 | 71 | class MREval(BinaryClassifierEval): 72 | def __init__(self, task_path, seed=1111): 73 | logging.debug('***** Transfer task : MR *****\n\n') 74 | pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos')) 75 | neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg')) 76 | super(self.__class__, self).__init__(pos, neg, seed) 77 | 78 | 79 | class SUBJEval(BinaryClassifierEval): 80 | def __init__(self, task_path, seed=1111): 81 | logging.debug('***** Transfer task : SUBJ *****\n\n') 82 | obj = self.loadFile(os.path.join(task_path, 'subj.objective')) 83 | subj = self.loadFile(os.path.join(task_path, 'subj.subjective')) 84 | super(self.__class__, self).__init__(obj, subj, seed) 85 | 86 | 87 | class MPQAEval(BinaryClassifierEval): 88 | def __init__(self, task_path, seed=1111): 89 | logging.debug('***** Transfer task : MPQA *****\n\n') 90 | pos = self.loadFile(os.path.join(task_path, 'mpqa.pos')) 91 | neg = self.loadFile(os.path.join(task_path, 'mpqa.neg')) 92 | super(self.__class__, self).__init__(pos, neg, seed) 93 | -------------------------------------------------------------------------------- /README.md: 
-------------------------------------------------------------------------------- 1 | # DenoSent: A Denoising Objective for Self-Supervised Sentence Representation Learning 2 | 3 | Official repo for our AAAI 2024 paper: [DenoSent: A Denoising Objective for Self-Supervised Sentence Representation Learning](https://arxiv.org/abs/2401.13621). 4 | 5 | ## Getting Started 6 | 7 | Run `pip install -r requirements.txt` to prepare the environment. 8 | 9 | Use the script from the [SimCSE repo](https://github.com/princeton-nlp/SimCSE) to download the datasets for SentEval evaluation: 10 | ``` 11 | cd SentEval/data/downstream/ 12 | bash download_dataset.sh 13 | ``` 14 | 15 | ## Access Our Model and Dataset from Huggingface🤗 16 | Both our [model checkpoint](https://huggingface.co/Singhoo/denosent-bert-base) and [dataset](https://huggingface.co/datasets/Singhoo/denosent_data) are available on 🤗. 17 | 18 | Generate embeddings with DenoSent: 19 | ``` 20 | from transformers import AutoModel 21 | 22 | model = AutoModel.from_pretrained("Singhoo/denosent-bert-base", trust_remote_code=True) 23 | 24 | sentences = [ 25 | "The curious cat tiptoed across the creaky wooden floor, pausing to inspect a fluttering curtain.", 26 | "A lone hiker stood atop the misty mountain, marveling at the tapestry of stars unfolding above." 27 | ] 28 | 29 | embeddings = model.encode(sentences) 30 | print(embeddings) 31 | 32 | # Expected output 33 | # tensor([[ 0.3314, -0.2520, 0.4150, ..., 0.1575, -0.1235, -0.1226], 34 | # [ 0.5128, -0.0051, 0.2179, ..., 0.1010, 0.1654, -0.3872]]) 35 | ``` 36 | 37 | ## Evaluation 38 | 39 | ### Run Evaluation with SentEval 40 | ``` 41 | python eval_senteval.py \ 42 | --model_name_or_path Singhoo/denosent-bert-base \ 43 | --task_set sts \ 44 | --mode test \ 45 | ``` 46 | This checkpoint has slightly higher STS results than those reported in the paper. 47 | ``` 48 | ------ test ------ 49 | +-------+-------+-------+-------+-------+--------------+-----------------+-------+ 50 | | STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness | Avg. | 51 | +-------+-------+-------+-------+-------+--------------+-----------------+-------+ 52 | | 75.48 | 83.82 | 77.54 | 84.76 | 80.16 | 81.20 | 73.97 | 79.56 | 53 | +-------+-------+-------+-------+-------+--------------+-----------------+-------+ 54 | ``` 55 | 56 | ### Run Evaluation with MTEB 57 | ``` 58 | python eval_mteb.py \ 59 | --model_name_or_path Singhoo/denosent-bert-base \ 60 | ``` 61 | Evaluation results for MTEB will appear in a separate directory `mteb_results`. 62 | 63 | ## Train Your Own DenoSent Models 64 | Run the following command to train your own models. Try out different hyperparameters as you like. The dataset will be automatically downloaded from Huggingface. 65 | ``` 66 | python \ 67 | train.py \ 68 | --train_dataset Singhoo/denosent_data \ 69 | --torch_compile True \ 70 | --model_name_or_path bert-base-uncased \ 71 | --max_length 32 \ 72 | --decoder_num_layers 16 \ 73 | --decoder_num_heads 1 \ 74 | --decoder_target_dropout 0.825 \ 75 | --pooler mask \ 76 | --output_dir results \ 77 | --overwrite_output_dir \ 78 | --per_device_train_batch_size 64 \ 79 | --per_device_eval_batch_size 256 \ 80 | --learning_rate 4e-5 \ 81 | --lr_scheduler_type constant_with_warmup \ 82 | --do_train \ 83 | --do_eval \ 84 | --evaluation_strategy steps \ 85 | --eval_steps 50 \ 86 | --save_strategy steps \ 87 | --save_steps 50 \ 88 | --num_train_epochs 1 \ 89 | --metric_for_best_model eval_avg_sts \ 90 | --prompt_format '"[X]" means [MASK].'
\ 91 | --do_contrastive \ 92 | --do_generative \ 93 | --save_total_limit 1 \ 94 | --contrastive_temp 0.05 \ 95 | --warmup_steps 500 \ 96 | --contrastive_weight 5 \ 97 | --generative_weight 7 \ 98 | --max_steps 5000 \ 99 | --load_best_model_at_end \ 100 | ``` 101 | 102 | ## Acknowledgements 103 | 104 | We use the [SentEval toolkit](https://github.com/facebookresearch/SentEval) and the [MTEB toolkit](https://github.com/embeddings-benchmark/mteb) for evaluations, and we adopt the modified version of SentEval from the [SimCSE repository](https://github.com/princeton-nlp/SimCSE). 105 | -------------------------------------------------------------------------------- /SentEval/senteval/sst.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SST - binary classification 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import logging 17 | import numpy as np 18 | 19 | from senteval.tools.validation import SplitClassifier 20 | 21 | 22 | class SSTEval(object): 23 | def __init__(self, task_path, nclasses=2, seed=1111): 24 | self.seed = seed 25 | 26 | # binary of fine-grained 27 | assert nclasses in [2, 5] 28 | self.nclasses = nclasses 29 | self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained' 30 | logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name) 31 | 32 | train = self.loadFile(os.path.join(task_path, 'sentiment-train')) 33 | dev = self.loadFile(os.path.join(task_path, 'sentiment-dev')) 34 | test = self.loadFile(os.path.join(task_path, 'sentiment-test')) 35 | self.sst_data = {'train': train, 'dev': dev, 'test': test} 36 | 37 | def do_prepare(self, params, prepare): 38 | samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \ 39 | self.sst_data['test']['X'] 40 | return prepare(params, samples) 41 | 42 | def loadFile(self, fpath): 43 | sst_data = {'X': [], 'y': []} 44 | with io.open(fpath, 'r', encoding='utf-8') as f: 45 | for line in f: 46 | if self.nclasses == 2: 47 | sample = line.strip().split('\t') 48 | sst_data['y'].append(int(sample[1])) 49 | sst_data['X'].append(sample[0].split()) 50 | elif self.nclasses == 5: 51 | sample = line.strip().split(' ', 1) 52 | sst_data['y'].append(int(sample[0])) 53 | sst_data['X'].append(sample[1].split()) 54 | assert max(sst_data['y']) == self.nclasses - 1 55 | return sst_data 56 | 57 | def run(self, params, batcher): 58 | sst_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | 61 | for key in self.sst_data: 62 | logging.info('Computing embedding for {0}'.format(key)) 63 | # Sort to reduce padding 64 | sorted_data = sorted(zip(self.sst_data[key]['X'], 65 | self.sst_data[key]['y']), 66 | key=lambda z: (len(z[0]), z[1])) 67 | self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data)) 68 | 69 | sst_embed[key]['X'] = [] 70 | for ii in range(0, len(self.sst_data[key]['y']), bsize): 71 | batch = self.sst_data[key]['X'][ii:ii + bsize] 72 | embeddings = batcher(params, batch) 73 | sst_embed[key]['X'].append(embeddings) 74 | sst_embed[key]['X'] = np.vstack(sst_embed[key]['X']) 75 | sst_embed[key]['y'] = np.array(self.sst_data[key]['y']) 76 | logging.info('Computed {0} embeddings'.format(key)) 77 | 78 | config_classifier = 
{'nclasses': self.nclasses, 'seed': self.seed, 79 | 'usepytorch': params.usepytorch, 80 | 'classifier': params.classifier} 81 | 82 | clf = SplitClassifier(X={'train': sst_embed['train']['X'], 83 | 'valid': sst_embed['dev']['X'], 84 | 'test': sst_embed['test']['X']}, 85 | y={'train': sst_embed['train']['y'], 86 | 'valid': sst_embed['dev']['y'], 87 | 'test': sst_embed['test']['y']}, 88 | config=config_classifier) 89 | 90 | devacc, testacc = clf.run() 91 | logging.debug('\nDev acc : {0} Test acc : {1} for \ 92 | SST {2} classification\n'.format(devacc, testacc, self.task_name)) 93 | 94 | return {'devacc': devacc, 'acc': testacc, 95 | 'ndev': len(sst_embed['dev']['X']), 96 | 'ntest': len(sst_embed['test']['X'])} 97 | -------------------------------------------------------------------------------- /SentEval/senteval/mrpc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | MRPC : Microsoft Research Paraphrase (detection) Corpus 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import logging 15 | import numpy as np 16 | import io 17 | 18 | from senteval.tools.validation import KFoldClassifier 19 | 20 | from sklearn.metrics import f1_score 21 | 22 | 23 | class MRPCEval(object): 24 | def __init__(self, task_path, seed=1111): 25 | logging.info('***** Transfer task : MRPC *****\n\n') 26 | self.seed = seed 27 | train = self.loadFile(os.path.join(task_path, 28 | 'msr_paraphrase_train.txt')) 29 | test = self.loadFile(os.path.join(task_path, 30 | 'msr_paraphrase_test.txt')) 31 | self.mrpc_data = {'train': train, 'test': test} 32 | 33 | def do_prepare(self, params, prepare): 34 | # TODO : Should we separate samples in "train, test"? 
35 | samples = self.mrpc_data['train']['X_A'] + \ 36 | self.mrpc_data['train']['X_B'] + \ 37 | self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B'] 38 | return prepare(params, samples) 39 | 40 | def loadFile(self, fpath): 41 | mrpc_data = {'X_A': [], 'X_B': [], 'y': []} 42 | with io.open(fpath, 'r', encoding='utf-8') as f: 43 | for line in f: 44 | text = line.strip().split('\t') 45 | mrpc_data['X_A'].append(text[3].split()) 46 | mrpc_data['X_B'].append(text[4].split()) 47 | mrpc_data['y'].append(text[0]) 48 | 49 | mrpc_data['X_A'] = mrpc_data['X_A'][1:] 50 | mrpc_data['X_B'] = mrpc_data['X_B'][1:] 51 | mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]] 52 | return mrpc_data 53 | 54 | def run(self, params, batcher): 55 | mrpc_embed = {'train': {}, 'test': {}} 56 | 57 | for key in self.mrpc_data: 58 | logging.info('Computing embedding for {0}'.format(key)) 59 | # Sort to reduce padding 60 | text_data = {} 61 | sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'], 62 | self.mrpc_data[key]['X_B'], 63 | self.mrpc_data[key]['y']), 64 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 65 | 66 | text_data['A'] = [x for (x, y, z) in sorted_corpus] 67 | text_data['B'] = [y for (x, y, z) in sorted_corpus] 68 | text_data['y'] = [z for (x, y, z) in sorted_corpus] 69 | 70 | for txt_type in ['A', 'B']: 71 | mrpc_embed[key][txt_type] = [] 72 | for ii in range(0, len(text_data['y']), params.batch_size): 73 | batch = text_data[txt_type][ii:ii + params.batch_size] 74 | embeddings = batcher(params, batch) 75 | mrpc_embed[key][txt_type].append(embeddings) 76 | mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type]) 77 | mrpc_embed[key]['y'] = np.array(text_data['y']) 78 | logging.info('Computed {0} embeddings'.format(key)) 79 | 80 | # Train 81 | trainA = mrpc_embed['train']['A'] 82 | trainB = mrpc_embed['train']['B'] 83 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 84 | trainY = mrpc_embed['train']['y'] 85 | 86 | # Test 87 | testA = mrpc_embed['test']['A'] 88 | testB = mrpc_embed['test']['B'] 89 | testF = np.c_[np.abs(testA - testB), testA * testB] 90 | testY = mrpc_embed['test']['y'] 91 | 92 | config = {'nclasses': 2, 'seed': self.seed, 93 | 'usepytorch': params.usepytorch, 94 | 'classifier': params.classifier, 95 | 'nhid': params.nhid, 'kfold': params.kfold} 96 | clf = KFoldClassifier(train={'X': trainF, 'y': trainY}, 97 | test={'X': testF, 'y': testY}, config=config) 98 | 99 | devacc, testacc, yhat = clf.run() 100 | testf1 = round(100*f1_score(testY, yhat), 2) 101 | logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n' 102 | .format(devacc, testacc, testf1)) 103 | return {'devacc': devacc, 'acc': testacc, 'f1': testf1, 104 | 'ndev': len(trainA), 'ntest': len(testA)} 105 | -------------------------------------------------------------------------------- /trainer.py: -------------------------------------------------------------------------------- 1 | from transformers import Trainer 2 | from transformers.trainer import unwrap_model 3 | from typing import List, Optional, Dict 4 | import wandb 5 | 6 | import sys 7 | 8 | from torch.utils.data.dataset import Dataset 9 | 10 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES 11 | 12 | 13 | 14 | # Set path to SentEval 15 | PATH_TO_SENTEVAL = './SentEval' 16 | PATH_TO_DATA = './SentEval/data' 17 | 18 | # Import SentEval 19 | sys.path.insert(0, PATH_TO_SENTEVAL) 20 | import senteval 21 | 22 | from mteb import MTEB 23 | 24 | class MyTrainer(Trainer): 25 | def __init__(self, *args, **kwargs): 
26 | super().__init__(*args, **kwargs) 27 | self.best_stsb_spearman = 0 28 | self.best_sickr_spearman = 0 29 | self.best_avg_sts = 0 30 | 31 | def evaluate( 32 | self, 33 | eval_dataset: Optional[Dataset] = None, 34 | ignore_keys: Optional[List[str]] = None, 35 | metric_key_prefix: str = "eval", 36 | ) -> Dict[str, float]: 37 | metrics = {} 38 | # SentEval prepare and batcher 39 | def prepare(params, samples): 40 | return 41 | 42 | def batcher(params, batch): 43 | sentences = [' '.join(s) for s in batch] 44 | return self.model.encode(sentences, len(sentences)) 45 | 46 | # Set params for SentEval (fastmode) 47 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 48 | params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 49 | 'tenacity': 3, 'epoch_size': 2} 50 | 51 | se = senteval.engine.SE(params, batcher, prepare) 52 | tasks = ['STSBenchmark', 'SICKRelatedness'] 53 | self.model.eval() 54 | results = se.eval(tasks) 55 | 56 | stsb_spearman = results['STSBenchmark']['dev']['spearman'][0] 57 | sickr_spearman = results['SICKRelatedness']['dev']['spearman'][0] 58 | # evaluation = MTEB(tasks=['STSBenchmark'], task_langs=["en"], task_categories=['S2S']) 59 | # results = evaluation.run(self.model, verbosity=0, output_folder=None, eval_splits=['validation'], batch_size=self.args.eval_batch_size) 60 | # stsb_spearman = results['STSBenchmark']['validation']['cos_sim']['spearman'] 61 | # sickr_spearman = results['SICK-R']['validation']['cos_sim']['spearman'] 62 | metrics.update({"eval_stsb_spearman": stsb_spearman, "eval_sickr_spearman": sickr_spearman, "eval_avg_sts": (stsb_spearman + sickr_spearman) / 2}) 63 | # metrics.update({"eval_stsb_spearman": stsb_spearman}) 64 | if stsb_spearman > self.best_stsb_spearman: 65 | self.best_stsb_spearman = stsb_spearman 66 | if sickr_spearman > self.best_sickr_spearman: 67 | self.best_sickr_spearman = sickr_spearman 68 | if (stsb_spearman + sickr_spearman) / 2 > self.best_avg_sts: 69 | self.best_avg_sts = (stsb_spearman + sickr_spearman) / 2 70 | wandb.run.summary["best_stsb_spearman"] = self.best_stsb_spearman 71 | wandb.run.summary["best_sickr_spearman"] = self.best_sickr_spearman 72 | wandb.run.summary["best_avg_sts"] = self.best_avg_sts 73 | self.log(metrics) 74 | return metrics 75 | 76 | def compute_loss(self, model, inputs, return_outputs=False): 77 | """ 78 | How the loss is computed by Trainer. By default, all models return the loss in the first element. 79 | 80 | Subclass and override for custom behavior. 81 | """ 82 | if self.label_smoother is not None and "labels" in inputs: 83 | labels = inputs.pop("labels") 84 | else: 85 | labels = None 86 | outputs = model(**inputs, global_step=self.state.global_step, max_steps=self.state.max_steps) 87 | # Save past state if it exists 88 | # TODO: this needs to be fixed and made cleaner later. 89 | if self.args.past_index >= 0: 90 | self._past = outputs[self.args.past_index] 91 | 92 | if labels is not None: 93 | if unwrap_model(model)._get_name() in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES.values(): 94 | loss = self.label_smoother(outputs, labels, shift_labels=True) 95 | else: 96 | loss = self.label_smoother(outputs, labels) 97 | else: 98 | if isinstance(outputs, dict) and "loss" not in outputs: 99 | raise ValueError( 100 | "The model did not return a loss from the inputs, only the following keys: " 101 | f"{','.join(outputs.keys())}. For reference, the inputs it received are {','.join(inputs.keys())}." 
102 | ) 103 | # We don't use .loss here since the model may return tuples instead of ModelOutput. 104 | loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0] 105 | 106 | return (loss, outputs) if return_outputs else loss 107 | -------------------------------------------------------------------------------- /SentEval/senteval/snli.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | SNLI - Entailment 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import codecs 14 | import os 15 | import io 16 | import copy 17 | import logging 18 | import numpy as np 19 | 20 | from senteval.tools.validation import SplitClassifier 21 | 22 | 23 | class SNLIEval(object): 24 | def __init__(self, taskpath, seed=1111): 25 | logging.debug('***** Transfer task : SNLI Entailment*****\n\n') 26 | self.seed = seed 27 | train1 = self.loadFile(os.path.join(taskpath, 's1.train')) 28 | train2 = self.loadFile(os.path.join(taskpath, 's2.train')) 29 | 30 | trainlabels = io.open(os.path.join(taskpath, 'labels.train'), 31 | encoding='utf-8').read().splitlines() 32 | 33 | valid1 = self.loadFile(os.path.join(taskpath, 's1.dev')) 34 | valid2 = self.loadFile(os.path.join(taskpath, 's2.dev')) 35 | validlabels = io.open(os.path.join(taskpath, 'labels.dev'), 36 | encoding='utf-8').read().splitlines() 37 | 38 | test1 = self.loadFile(os.path.join(taskpath, 's1.test')) 39 | test2 = self.loadFile(os.path.join(taskpath, 's2.test')) 40 | testlabels = io.open(os.path.join(taskpath, 'labels.test'), 41 | encoding='utf-8').read().splitlines() 42 | 43 | # sort data (by s2 first) to reduce padding 44 | sorted_train = sorted(zip(train2, train1, trainlabels), 45 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 46 | train2, train1, trainlabels = map(list, zip(*sorted_train)) 47 | 48 | sorted_valid = sorted(zip(valid2, valid1, validlabels), 49 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 50 | valid2, valid1, validlabels = map(list, zip(*sorted_valid)) 51 | 52 | sorted_test = sorted(zip(test2, test1, testlabels), 53 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 54 | test2, test1, testlabels = map(list, zip(*sorted_test)) 55 | 56 | self.samples = train1 + train2 + valid1 + valid2 + test1 + test2 57 | self.data = {'train': (train1, train2, trainlabels), 58 | 'valid': (valid1, valid2, validlabels), 59 | 'test': (test1, test2, testlabels) 60 | } 61 | 62 | def do_prepare(self, params, prepare): 63 | return prepare(params, self.samples) 64 | 65 | def loadFile(self, fpath): 66 | with codecs.open(fpath, 'rb', 'latin-1') as f: 67 | return [line.split() for line in 68 | f.read().splitlines()] 69 | 70 | def run(self, params, batcher): 71 | self.X, self.y = {}, {} 72 | dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2} 73 | for key in self.data: 74 | if key not in self.X: 75 | self.X[key] = [] 76 | if key not in self.y: 77 | self.y[key] = [] 78 | 79 | input1, input2, mylabels = self.data[key] 80 | enc_input = [] 81 | n_labels = len(mylabels) 82 | for ii in range(0, n_labels, params.batch_size): 83 | batch1 = input1[ii:ii + params.batch_size] 84 | batch2 = input2[ii:ii + params.batch_size] 85 | 86 | if len(batch1) == len(batch2) and len(batch1) > 0: 87 | enc1 = batcher(params, batch1) 88 | enc2 = batcher(params, batch2) 89 | 
enc_input.append(np.hstack((enc1, enc2, enc1 * enc2, 90 | np.abs(enc1 - enc2)))) 91 | if (ii*params.batch_size) % (20000*params.batch_size) == 0: 92 | logging.info("PROGRESS (encoding): %.2f%%" % 93 | (100 * ii / n_labels)) 94 | self.X[key] = np.vstack(enc_input) 95 | self.y[key] = [dico_label[y] for y in mylabels] 96 | 97 | config = {'nclasses': 3, 'seed': self.seed, 98 | 'usepytorch': params.usepytorch, 99 | 'cudaEfficient': True, 100 | 'nhid': params.nhid, 'noreg': True} 101 | 102 | config_classifier = copy.deepcopy(params.classifier) 103 | config_classifier['max_epoch'] = 15 104 | config_classifier['epoch_size'] = 1 105 | config['classifier'] = config_classifier 106 | 107 | clf = SplitClassifier(self.X, self.y, config) 108 | devacc, testacc = clf.run() 109 | logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n' 110 | .format(devacc, testacc)) 111 | return {'devacc': devacc, 'acc': testacc, 112 | 'ndev': len(self.data['valid'][0]), 113 | 'ntest': len(self.data['test'][0])} 114 | -------------------------------------------------------------------------------- /SentEval/senteval/rank.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | Image-Caption Retrieval with COCO dataset 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import sys 15 | import logging 16 | import numpy as np 17 | 18 | try: 19 | import cPickle as pickle 20 | except ImportError: 21 | import pickle 22 | 23 | from senteval.tools.ranking import ImageSentenceRankingPytorch 24 | 25 | 26 | class ImageCaptionRetrievalEval(object): 27 | def __init__(self, task_path, seed=1111): 28 | logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n') 29 | 30 | # Get captions and image features 31 | self.seed = seed 32 | train, dev, test = self.loadFile(task_path) 33 | self.coco_data = {'train': train, 'dev': dev, 'test': test} 34 | 35 | def do_prepare(self, params, prepare): 36 | samples = self.coco_data['train']['sent'] + \ 37 | self.coco_data['dev']['sent'] + \ 38 | self.coco_data['test']['sent'] 39 | prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | coco = {} 43 | 44 | for split in ['train', 'valid', 'test']: 45 | list_sent = [] 46 | list_img_feat = [] 47 | if sys.version_info < (3, 0): 48 | with open(os.path.join(fpath, split + '.pkl')) as f: 49 | cocodata = pickle.load(f) 50 | else: 51 | with open(os.path.join(fpath, split + '.pkl'), 'rb') as f: 52 | cocodata = pickle.load(f, encoding='latin1') 53 | 54 | for imgkey in range(len(cocodata['features'])): 55 | assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \ 56 | cocodata['image_to_caption_ids'][imgkey] 57 | for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]: 58 | sent = cocodata['captions'][captkey]['cleaned_caption'] 59 | sent += ' .' 
# add punctuation to end of sentence in COCO 60 | list_sent.append(sent.encode('utf-8').split()) 61 | list_img_feat.append(cocodata['features'][imgkey]) 62 | assert len(list_sent) == len(list_img_feat) and \ 63 | len(list_sent) % 5 == 0 64 | list_img_feat = np.array(list_img_feat).astype('float32') 65 | coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat} 66 | return coco['train'], coco['valid'], coco['test'] 67 | 68 | def run(self, params, batcher): 69 | coco_embed = {'train': {'sentfeat': [], 'imgfeat': []}, 70 | 'dev': {'sentfeat': [], 'imgfeat': []}, 71 | 'test': {'sentfeat': [], 'imgfeat': []}} 72 | 73 | for key in self.coco_data: 74 | logging.info('Computing embedding for {0}'.format(key)) 75 | # Sort to reduce padding 76 | self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent']) 77 | self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent']) 78 | idx_unsort = np.argsort(idx_sort) 79 | 80 | coco_embed[key]['X'] = [] 81 | nsent = len(self.coco_data[key]['sent']) 82 | for ii in range(0, nsent, params.batch_size): 83 | batch = self.coco_data[key]['sent'][ii:ii + params.batch_size] 84 | embeddings = batcher(params, batch) 85 | coco_embed[key]['sentfeat'].append(embeddings) 86 | coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort] 87 | coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat']) 88 | logging.info('Computed {0} embeddings'.format(key)) 89 | 90 | config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2} 91 | clf = ImageSentenceRankingPytorch(train=coco_embed['train'], 92 | valid=coco_embed['dev'], 93 | test=coco_embed['test'], 94 | config=config) 95 | 96 | bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \ 97 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run() 98 | 99 | logging.debug("\nTest scores | Image to text: \ 100 | {0}, {1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 101 | logging.debug("Test scores | Text to image: \ 102 | {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 103 | 104 | return {'devacc': bestdevscore, 105 | 'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t), 106 | (r1_t2i, r5_t2i, r10_t2i, medr_t2i)], 107 | 'ndev': len(coco_embed['dev']['sentfeat']), 108 | 'ntest': len(coco_embed['test']['sentfeat'])} 109 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/relatedness.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | """ 9 | Semantic Relatedness (supervised) with Pytorch 10 | """ 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import copy 14 | import numpy as np 15 | 16 | import torch 17 | from torch import nn 18 | import torch.optim as optim 19 | 20 | from scipy.stats import pearsonr, spearmanr 21 | 22 | 23 | class RelatednessPytorch(object): 24 | # Can be used for SICK-Relatedness, and STS14 25 | def __init__(self, train, valid, test, devscores, config): 26 | # fix seed 27 | np.random.seed(config['seed']) 28 | torch.manual_seed(config['seed']) 29 | assert torch.cuda.is_available(), 'torch.cuda required for Relatedness' 30 | torch.cuda.manual_seed(config['seed']) 31 | 32 | self.train = train 33 | self.valid = valid 34 | self.test = test 35 | self.devscores = devscores 36 | 37 | self.inputdim = train['X'].shape[1] 38 | self.nclasses = config['nclasses'] 39 | self.seed = config['seed'] 40 | self.l2reg = 0. 41 | self.batch_size = 64 42 | self.maxepoch = 1000 43 | self.early_stop = True 44 | 45 | self.model = nn.Sequential( 46 | nn.Linear(self.inputdim, self.nclasses), 47 | nn.Softmax(dim=-1), 48 | ) 49 | self.loss_fn = nn.MSELoss() 50 | 51 | if torch.cuda.is_available(): 52 | self.model = self.model.cuda() 53 | self.loss_fn = self.loss_fn.cuda() 54 | 55 | self.loss_fn.size_average = False 56 | self.optimizer = optim.Adam(self.model.parameters(), 57 | weight_decay=self.l2reg) 58 | 59 | def prepare_data(self, trainX, trainy, devX, devy, testX, testy): 60 | # Transform probs to log-probs for KL-divergence 61 | trainX = torch.from_numpy(trainX).float().cuda() 62 | trainy = torch.from_numpy(trainy).float().cuda() 63 | devX = torch.from_numpy(devX).float().cuda() 64 | devy = torch.from_numpy(devy).float().cuda() 65 | testX = torch.from_numpy(testX).float().cuda() 66 | testY = torch.from_numpy(testy).float().cuda() 67 | 68 | return trainX, trainy, devX, devy, testX, testy 69 | 70 | def run(self): 71 | self.nepoch = 0 72 | bestpr = -1 73 | early_stop_count = 0 74 | r = np.arange(1, 6) 75 | stop_train = False 76 | 77 | # Preparing data 78 | trainX, trainy, devX, devy, testX, testy = self.prepare_data( 79 | self.train['X'], self.train['y'], 80 | self.valid['X'], self.valid['y'], 81 | self.test['X'], self.test['y']) 82 | 83 | # Training 84 | while not stop_train and self.nepoch <= self.maxepoch: 85 | self.trainepoch(trainX, trainy, nepoches=50) 86 | yhat = np.dot(self.predict_proba(devX), r) 87 | pr = spearmanr(yhat, self.devscores)[0] 88 | pr = 0 if pr != pr else pr # if NaN bc std=0 89 | # early stop on Pearson 90 | if pr > bestpr: 91 | bestpr = pr 92 | bestmodel = copy.deepcopy(self.model) 93 | elif self.early_stop: 94 | if early_stop_count >= 3: 95 | stop_train = True 96 | early_stop_count += 1 97 | self.model = bestmodel 98 | 99 | yhat = np.dot(self.predict_proba(testX), r) 100 | 101 | return bestpr, yhat 102 | 103 | def trainepoch(self, X, y, nepoches=1): 104 | self.model.train() 105 | for _ in range(self.nepoch, self.nepoch + nepoches): 106 | permutation = np.random.permutation(len(X)) 107 | all_costs = [] 108 | for i in range(0, len(X), self.batch_size): 109 | # forward 110 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda() 111 | Xbatch = X[idx] 112 | ybatch = y[idx] 113 | output = self.model(Xbatch) 114 | # loss 115 | loss = self.loss_fn(output, ybatch) 116 | all_costs.append(loss.item()) 117 | # backward 118 | self.optimizer.zero_grad() 119 | loss.backward() 120 | # Update parameters 121 | self.optimizer.step() 122 | self.nepoch += 
nepoches 123 | 124 | def predict_proba(self, devX): 125 | self.model.eval() 126 | probas = [] 127 | with torch.no_grad(): 128 | for i in range(0, len(devX), self.batch_size): 129 | Xbatch = devX[i:i + self.batch_size] 130 | if len(probas) == 0: 131 | probas = self.model(Xbatch).data.cpu().numpy() 132 | else: 133 | probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0) 134 | return probas 135 | -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | from transformers import set_seed, TrainingArguments, HfArgumentParser, PretrainedConfig 2 | from transformers import AutoTokenizer 3 | from datasets import load_dataset 4 | import torch 5 | import wandb 6 | from args import ModelArguments, DatasetArguments 7 | from model import DenoSentModel 8 | from trainer import MyTrainer 9 | from mteb import MTEB 10 | from prettytable import PrettyTable 11 | from config import DenoSentConfig 12 | 13 | def preprocess_logits_for_metrics(logits, labels): 14 | """ 15 | Original Trainer may have a memory leak. 16 | This is a workaround to avoid storing too many tensors that are not needed. 17 | """ 18 | pred_ids = torch.argmax(logits[0], dim=-1) 19 | return pred_ids 20 | 21 | 22 | def eval_mteb(model, batch_size): 23 | tasks = [ 24 | "STS12", 25 | "STS13", 26 | "STS14", 27 | "STS15", 28 | "STS16", 29 | "STSBenchmark", 30 | "SICK-R", 31 | ] 32 | evaluation = MTEB(tasks=tasks, task_langs=["en"], task_categories=['S2S']) 33 | results = evaluation.run(model, overwrite_results=True, batch_size=batch_size, eval_splits=['test'], output_folder='mteb_results/'+wandb.run.name) 34 | sts12 = results['STS12']['test']['cos_sim']['spearman'] 35 | sts13 = results['STS13']['test']['cos_sim']['spearman'] 36 | sts14 = results['STS14']['test']['cos_sim']['spearman'] 37 | sts15 = results['STS15']['test']['cos_sim']['spearman'] 38 | sts16 = results['STS16']['test']['cos_sim']['spearman'] 39 | sickr = results['SICK-R']['test']['cos_sim']['spearman'] 40 | stsb = results['STSBenchmark']['test']['cos_sim']['spearman'] 41 | avg_sts = (sts12 + sts13 + sts14 + sts15 + sts16 + sickr + stsb) / 7 42 | wandb.summary['STS12'] = sts12 43 | wandb.summary['STS13'] = sts13 44 | wandb.summary['STS14'] = sts14 45 | wandb.summary['STS15'] = sts15 46 | wandb.summary['STS16'] = sts16 47 | wandb.summary['SICK-R'] = sickr 48 | wandb.summary['STSBenchmark'] = stsb 49 | wandb.summary['mteb_avg_sts'] = avg_sts 50 | return results 51 | 52 | 53 | 54 | if __name__ == "__main__": 55 | parser = HfArgumentParser((ModelArguments, TrainingArguments, DatasetArguments)) 56 | model_args, training_args, dataset_args = parser.parse_args_into_dataclasses() 57 | wandb.init(project='DenoSent') 58 | set_seed(training_args.seed) 59 | wandb.config.update(model_args) 60 | wandb.config.update(training_args) 61 | wandb.config.update(dataset_args) 62 | training_args.output_dir = 'results/' + wandb.run.name 63 | tokenizer = AutoTokenizer.from_pretrained(model_args.model_name_or_path) 64 | config = DenoSentConfig( 65 | encoder_name_or_path=model_args.model_name_or_path, 66 | max_length=model_args.max_length, 67 | decoder_num_heads=model_args.decoder_num_heads, 68 | decoder_num_layers=model_args.decoder_num_layers, 69 | decoder_noise_dropout=model_args.decoder_target_dropout, 70 | pooler=model_args.pooler, 71 | do_contrastive=model_args.do_contrastive, 72 | do_generative=model_args.do_generative, 73 | prompt_format=model_args.prompt_format, 
74 | contrastive_weight=model_args.contrastive_weight, 75 | generative_weight=model_args.generative_weight, 76 | contrastive_temp=model_args.contrastive_temp, 77 | ) 78 | print(config) 79 | 80 | model = DenoSentModel(config) 81 | 82 | def map_fn(example): 83 | 84 | max_length = model_args.max_length 85 | if config.pooler == 'mask': 86 | prompt_len = len(tokenizer(config.prompt_format, add_special_tokens=False)['input_ids']) 87 | example['sent0'] = tokenizer.decode(tokenizer(example['sent0'], padding=True, truncation=True, max_length=config.max_length)['input_ids'], skip_special_tokens=True) 88 | example['sent1'] = tokenizer.decode(tokenizer(example['sent1'], padding=True, truncation=True, max_length=config.max_length)['input_ids'], skip_special_tokens=True) 89 | example['sent0'] = config.prompt_format.replace('[X]', example['sent0']).replace('[MASK]', tokenizer.mask_token) 90 | example['sent1'] = config.prompt_format.replace('[X]', example['sent1']).replace('[MASK]', tokenizer.mask_token) 91 | max_length = max_length + prompt_len 92 | original_inputs = tokenizer(example['sent0'], padding='max_length', truncation=True, max_length=max_length) 93 | example['input_ids'] = original_inputs['input_ids'] 94 | example['attention_mask'] = original_inputs['attention_mask'] 95 | 96 | positive_inputs = tokenizer(example['sent1'], padding='max_length', truncation=True, max_length=max_length) 97 | example['positive_input_ids'] = positive_inputs['input_ids'] 98 | example['positive_attention_mask'] = positive_inputs['attention_mask'] 99 | return example 100 | 101 | 102 | if dataset_args.train_dataset == "Singhoo/denosent_data": 103 | dataset = load_dataset(dataset_args.train_dataset, split='train') 104 | # dataset = load_dataset('csv', data_files='./augdata.csv', sep='\t', split='train') 105 | else: 106 | raise NotImplementedError() 107 | dataset = dataset.map(map_fn, batched=False, num_proc=12).train_test_split(0.1, seed=training_args.seed, shuffle=True) 108 | test_valid = dataset['test'].train_test_split(0.01) 109 | 110 | trainer = MyTrainer( 111 | model=model, 112 | args=training_args, 113 | tokenizer=tokenizer, 114 | train_dataset=dataset['train'], 115 | eval_dataset=test_valid['test'], 116 | preprocess_logits_for_metrics=preprocess_logits_for_metrics, 117 | ) 118 | trainer.train() 119 | mteb_results = eval_mteb(model, batch_size=training_args.eval_batch_size) 120 | table = PrettyTable(["Name", "Value"]) 121 | 122 | # Add rows 123 | table.add_row(["STS12", wandb.summary['STS12']]) 124 | table.add_row(["STS13", wandb.summary['STS13']]) 125 | table.add_row(["STS14", wandb.summary['STS14']]) 126 | table.add_row(["STS15", wandb.summary['STS15']]) 127 | table.add_row(["STS16", wandb.summary['STS16']]) 128 | table.add_row(["SICK-R", wandb.summary['SICK-R']]) 129 | table.add_row(["STSBenchmark", wandb.summary['STSBenchmark']]) 130 | table.add_row(["Avg.", wandb.summary['mteb_avg_sts']]) 131 | # Print the table 132 | print(table) 133 | 134 | wandb.finish() -------------------------------------------------------------------------------- /eval_senteval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import logging 3 | import argparse 4 | from prettytable import PrettyTable 5 | import torch 6 | from model import DenoSentModel 7 | from config import DenoSentConfig 8 | # Set up logger 9 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG) 10 | 11 | # Set PATHs 12 | PATH_TO_SENTEVAL = './SentEval' 13 | PATH_TO_DATA = './SentEval/data' 
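# NOTE: the SentEval evaluation datasets are expected under PATH_TO_DATA
# (e.g. ./SentEval/data/downstream) and usually have to be fetched separately,
# for instance with the data-preparation scripts shipped with the SentEval
# repository, before running this script.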
14 | 15 | # Import SentEval 16 | sys.path.insert(0, PATH_TO_SENTEVAL) 17 | import senteval 18 | 19 | def print_table(task_names, scores): 20 | tb = PrettyTable() 21 | tb.field_names = task_names 22 | tb.add_row(scores) 23 | print(tb) 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument("--model_name_or_path", type=str, 28 | help="Transformers' model name or path") 29 | parser.add_argument("--pooler", type=str, 30 | choices=['cls', 'mean', 'mask'], 31 | default='mask', 32 | help="Which pooler to use") 33 | parser.add_argument("--mode", type=str, 34 | choices=['dev', 'test', 'fasttest'], 35 | default='test', 36 | help="What evaluation mode to use (dev: fast mode, dev results; test: full mode, test results); fasttest: fast mode, test results") 37 | parser.add_argument("--task_set", type=str, 38 | choices=['sts', 'transfer', 'full', 'na'], 39 | default='sts', 40 | help="What set of tasks to evaluate on. If not 'na', this will override '--tasks'") 41 | parser.add_argument("--tasks", type=str, nargs='+', 42 | default=['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 43 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC', 44 | 'SICKRelatedness', 'STSBenchmark'], 45 | help="Tasks to evaluate on. If '--task_set' is specified, this will be overridden") 46 | args = parser.parse_args() 47 | # Load transformers' model checkpoint 48 | 49 | config = DenoSentConfig.from_pretrained(args.model_name_or_path) 50 | model = DenoSentModel.from_pretrained(args.model_name_or_path, config=config) 51 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") 52 | model = model.to(device) 53 | # Set up the tasks 54 | if args.task_set == 'sts': 55 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] 56 | elif args.task_set == 'transfer': 57 | args.tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] 58 | elif args.task_set == 'full': 59 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness'] 60 | args.tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC'] 61 | 62 | # Set params for SentEval 63 | if args.mode == 'dev' or args.mode == 'fasttest': 64 | # Fast mode 65 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5, 'cudaEfficient': True} 66 | params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 67 | 'tenacity': 3, 'epoch_size': 2} 68 | elif args.mode == 'test': 69 | # Full mode 70 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10, 'cudaEfficient': True} 71 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 72 | 'tenacity': 5, 'epoch_size': 4} 73 | else: 74 | raise NotImplementedError 75 | 76 | # SentEval prepare and batcher 77 | def prepare(params, samples): 78 | return 79 | 80 | def batcher(params, batch): 81 | sentences = [' '.join(s) for s in batch] 82 | return model.encode(sentences, len(sentences)) 83 | results = {} 84 | 85 | for task in args.tasks: 86 | se = senteval.engine.SE(params, batcher, prepare) 87 | result = se.eval(task) 88 | results[task] = result 89 | 90 | # Print evaluation results 91 | if args.mode == 'dev': 92 | print("------ %s ------" % (args.mode)) 93 | 94 | task_names = [] 95 | scores = [] 96 | for task in ['STSBenchmark', 'SICKRelatedness']: 97 | task_names.append(task) 98 | if task in results: 99 | scores.append("%.2f" % (results[task]['dev']['spearman'][0] * 100)) 100 | else: 101 | scores.append("0.00") 102 | print_table(task_names, scores) 103 | 104 | task_names = [] 
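        # In dev mode the transfer tasks report dev-set accuracy ('devacc');
        # the loop below collects those scores and appends an "Avg." column.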
105 | scores = [] 106 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: 107 | task_names.append(task) 108 | if task in results: 109 | scores.append("%.2f" % (results[task]['devacc'])) 110 | else: 111 | scores.append("0.00") 112 | task_names.append("Avg.") 113 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 114 | print_table(task_names, scores) 115 | 116 | elif args.mode == 'test' or args.mode == 'fasttest': 117 | print("------ %s ------" % (args.mode)) 118 | 119 | task_names = [] 120 | scores = [] 121 | for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']: 122 | task_names.append(task) 123 | if task in results: 124 | if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: 125 | scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100)) 126 | else: 127 | scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100)) 128 | else: 129 | scores.append("0.00") 130 | task_names.append("Avg.") 131 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 132 | print_table(task_names, scores) 133 | 134 | task_names = [] 135 | scores = [] 136 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']: 137 | task_names.append(task) 138 | if task in results: 139 | scores.append("%.2f" % (results[task]['acc'])) 140 | else: 141 | scores.append("0.00") 142 | task_names.append("Avg.") 143 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores))) 144 | print_table(task_names, scores) 145 | 146 | 147 | if __name__ == "__main__": 148 | main() -------------------------------------------------------------------------------- /SentEval/senteval/engine.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | 10 | Generic sentence evaluation scripts wrapper 11 | 12 | ''' 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | from senteval import utils 16 | from senteval.binary import CREval, MREval, MPQAEval, SUBJEval 17 | from senteval.snli import SNLIEval 18 | from senteval.trec import TRECEval 19 | from senteval.sick import SICKEntailmentEval, SICKEval 20 | from senteval.mrpc import MRPCEval 21 | from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval, SICKRelatednessEval, STSBenchmarkFinetune 22 | from senteval.sst import SSTEval 23 | from senteval.rank import ImageCaptionRetrievalEval 24 | from senteval.probing import * 25 | 26 | class SE(object): 27 | def __init__(self, params, batcher, prepare=None): 28 | # parameters 29 | params = utils.dotdict(params) 30 | params.usepytorch = True if 'usepytorch' not in params else params.usepytorch 31 | params.seed = 1111 if 'seed' not in params else params.seed 32 | 33 | params.batch_size = 128 if 'batch_size' not in params else params.batch_size 34 | params.nhid = 0 if 'nhid' not in params else params.nhid 35 | params.kfold = 5 if 'kfold' not in params else params.kfold 36 | 37 | if 'classifier' not in params or not params['classifier']: 38 | params.classifier = {'nhid': 0} 39 | 40 | assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!' 
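        # Unspecified settings fall back to the defaults set above
        # (PyTorch classifiers, seed 1111, encoding batch size 128, 5-fold CV,
        # and a logistic-regression head, i.e. nhid = 0).
        #
        # Minimal usage sketch (assuming the task data already sits in task_path):
        #   se = SE({'task_path': './data', 'usepytorch': True, 'kfold': 10},
        #           batcher, prepare)
        #   results = se.eval(['STSBenchmark', 'SICKRelatedness'])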
41 | 42 | self.params = params 43 | 44 | # batcher and prepare 45 | self.batcher = batcher 46 | self.prepare = prepare if prepare else lambda x, y: None 47 | 48 | self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 49 | 'SICKRelatedness', 'SICKEntailment', 'STSBenchmark', 50 | 'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13', 51 | 'STS14', 'STS15', 'STS16', 52 | 'Length', 'WordContent', 'Depth', 'TopConstituents', 53 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber', 54 | 'OddManOut', 'CoordinationInversion', 'SICKRelatedness-finetune', 'STSBenchmark-finetune', 'STSBenchmark-fix'] 55 | 56 | def eval(self, name): 57 | # evaluate on evaluation [name], either takes string or list of strings 58 | if (isinstance(name, list)): 59 | self.results = {x: self.eval(x) for x in name} 60 | return self.results 61 | 62 | tpath = self.params.task_path 63 | assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks) 64 | 65 | # Original SentEval tasks 66 | if name == 'CR': 67 | self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed) 68 | elif name == 'MR': 69 | self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed) 70 | elif name == 'MPQA': 71 | self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed) 72 | elif name == 'SUBJ': 73 | self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed) 74 | elif name == 'SST2': 75 | self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed) 76 | elif name == 'SST5': 77 | self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed) 78 | elif name == 'TREC': 79 | self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed) 80 | elif name == 'MRPC': 81 | self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed) 82 | elif name == 'SICKRelatedness': 83 | self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed) 84 | elif name == 'STSBenchmark': 85 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed) 86 | elif name == 'STSBenchmark-fix': 87 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark-fix', seed=self.params.seed) 88 | elif name == 'STSBenchmark-finetune': 89 | self.evaluation = STSBenchmarkFinetune(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed) 90 | elif name == 'SICKRelatedness-finetune': 91 | self.evaluation = SICKEval(tpath + '/downstream/SICK', seed=self.params.seed) 92 | elif name == 'SICKEntailment': 93 | self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed) 94 | elif name == 'SNLI': 95 | self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed) 96 | elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']: 97 | fpath = name + '-en-test' 98 | self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed) 99 | elif name == 'ImageCaptionRetrieval': 100 | self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed) 101 | 102 | # Probing Tasks 103 | elif name == 'Length': 104 | self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed) 105 | elif name == 'WordContent': 106 | self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed) 107 | elif name == 'Depth': 108 | self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed) 109 | elif name == 'TopConstituents': 
110 | self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed) 111 | elif name == 'BigramShift': 112 | self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed) 113 | elif name == 'Tense': 114 | self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed) 115 | elif name == 'SubjNumber': 116 | self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed) 117 | elif name == 'ObjNumber': 118 | self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed) 119 | elif name == 'OddManOut': 120 | self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed) 121 | elif name == 'CoordinationInversion': 122 | self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed) 123 | 124 | self.params.current_task = name 125 | self.evaluation.do_prepare(self.params, self.prepare) 126 | 127 | self.results = self.evaluation.run(self.params, self.batcher) 128 | 129 | return self.results 130 | -------------------------------------------------------------------------------- /SentEval/senteval/probing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | ''' 9 | probing tasks 10 | ''' 11 | 12 | from __future__ import absolute_import, division, unicode_literals 13 | 14 | import os 15 | import io 16 | import copy 17 | import logging 18 | import numpy as np 19 | 20 | from senteval.tools.validation import SplitClassifier 21 | 22 | 23 | class PROBINGEval(object): 24 | def __init__(self, task, task_path, seed=1111): 25 | self.seed = seed 26 | self.task = task 27 | logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper()) 28 | self.task_data = {'train': {'X': [], 'y': []}, 29 | 'dev': {'X': [], 'y': []}, 30 | 'test': {'X': [], 'y': []}} 31 | self.loadFile(task_path) 32 | logging.info('Loaded %s train - %s dev - %s test for %s' % 33 | (len(self.task_data['train']['y']), len(self.task_data['dev']['y']), 34 | len(self.task_data['test']['y']), self.task)) 35 | 36 | def do_prepare(self, params, prepare): 37 | samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \ 38 | self.task_data['test']['X'] 39 | return prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'} 43 | with io.open(fpath, 'r', encoding='utf-8') as f: 44 | for line in f: 45 | line = line.rstrip().split('\t') 46 | self.task_data[self.tok2split[line[0]]]['X'].append(line[-1].split()) 47 | self.task_data[self.tok2split[line[0]]]['y'].append(line[1]) 48 | 49 | labels = sorted(np.unique(self.task_data['train']['y'])) 50 | self.tok2label = dict(zip(labels, range(len(labels)))) 51 | self.nclasses = len(self.tok2label) 52 | 53 | for split in self.task_data: 54 | for i, y in enumerate(self.task_data[split]['y']): 55 | self.task_data[split]['y'][i] = self.tok2label[y] 56 | 57 | def run(self, params, batcher): 58 | task_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | logging.info('Computing embeddings for train/dev/test') 61 | for key in self.task_data: 62 | # Sort to reduce padding 63 | sorted_data = sorted(zip(self.task_data[key]['X'], 64 | self.task_data[key]['y']), 65 | key=lambda z: (len(z[0]), z[1])) 66 | self.task_data[key]['X'], 
self.task_data[key]['y'] = map(list, zip(*sorted_data)) 67 | 68 | task_embed[key]['X'] = [] 69 | for ii in range(0, len(self.task_data[key]['y']), bsize): 70 | batch = self.task_data[key]['X'][ii:ii + bsize] 71 | embeddings = batcher(params, batch) 72 | task_embed[key]['X'].append(embeddings) 73 | task_embed[key]['X'] = np.vstack(task_embed[key]['X']) 74 | task_embed[key]['y'] = np.array(self.task_data[key]['y']) 75 | logging.info('Computed embeddings') 76 | 77 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed, 78 | 'usepytorch': params.usepytorch, 79 | 'classifier': params.classifier} 80 | 81 | if self.task == "WordContent" and params.classifier['nhid'] > 0: 82 | config_classifier = copy.deepcopy(config_classifier) 83 | config_classifier['classifier']['nhid'] = 0 84 | print(params.classifier['nhid']) 85 | 86 | clf = SplitClassifier(X={'train': task_embed['train']['X'], 87 | 'valid': task_embed['dev']['X'], 88 | 'test': task_embed['test']['X']}, 89 | y={'train': task_embed['train']['y'], 90 | 'valid': task_embed['dev']['y'], 91 | 'test': task_embed['test']['y']}, 92 | config=config_classifier) 93 | 94 | devacc, testacc = clf.run() 95 | logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n' % (devacc, testacc, self.task.upper())) 96 | 97 | return {'devacc': devacc, 'acc': testacc, 98 | 'ndev': len(task_embed['dev']['X']), 99 | 'ntest': len(task_embed['test']['X'])} 100 | 101 | """ 102 | Surface Information 103 | """ 104 | class LengthEval(PROBINGEval): 105 | def __init__(self, task_path, seed=1111): 106 | task_path = os.path.join(task_path, 'sentence_length.txt') 107 | # labels: bins 108 | PROBINGEval.__init__(self, 'Length', task_path, seed) 109 | 110 | class WordContentEval(PROBINGEval): 111 | def __init__(self, task_path, seed=1111): 112 | task_path = os.path.join(task_path, 'word_content.txt') 113 | # labels: 200 target words 114 | PROBINGEval.__init__(self, 'WordContent', task_path, seed) 115 | 116 | """ 117 | Latent Structural Information 118 | """ 119 | class DepthEval(PROBINGEval): 120 | def __init__(self, task_path, seed=1111): 121 | task_path = os.path.join(task_path, 'tree_depth.txt') 122 | # labels: bins 123 | PROBINGEval.__init__(self, 'Depth', task_path, seed) 124 | 125 | class TopConstituentsEval(PROBINGEval): 126 | def __init__(self, task_path, seed=1111): 127 | task_path = os.path.join(task_path, 'top_constituents.txt') 128 | # labels: 'PP_NP_VP_.' .. (20 classes) 129 | PROBINGEval.__init__(self, 'TopConstituents', task_path, seed) 130 | 131 | class BigramShiftEval(PROBINGEval): 132 | def __init__(self, task_path, seed=1111): 133 | task_path = os.path.join(task_path, 'bigram_shift.txt') 134 | # labels: 0 or 1 135 | PROBINGEval.__init__(self, 'BigramShift', task_path, seed) 136 | 137 | # TODO: Voice? 
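# Each probing task class here only points PROBINGEval at its data file and
# label set; the shared run() above embeds the sentences with the provided
# batcher and trains a SplitClassifier probe on top of the frozen embeddings.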
138 | 139 | """ 140 | Latent Semantic Information 141 | """ 142 | 143 | class TenseEval(PROBINGEval): 144 | def __init__(self, task_path, seed=1111): 145 | task_path = os.path.join(task_path, 'past_present.txt') 146 | # labels: 'PRES', 'PAST' 147 | PROBINGEval.__init__(self, 'Tense', task_path, seed) 148 | 149 | class SubjNumberEval(PROBINGEval): 150 | def __init__(self, task_path, seed=1111): 151 | task_path = os.path.join(task_path, 'subj_number.txt') 152 | # labels: 'NN', 'NNS' 153 | PROBINGEval.__init__(self, 'SubjNumber', task_path, seed) 154 | 155 | class ObjNumberEval(PROBINGEval): 156 | def __init__(self, task_path, seed=1111): 157 | task_path = os.path.join(task_path, 'obj_number.txt') 158 | # labels: 'NN', 'NNS' 159 | PROBINGEval.__init__(self, 'ObjNumber', task_path, seed) 160 | 161 | class OddManOutEval(PROBINGEval): 162 | def __init__(self, task_path, seed=1111): 163 | task_path = os.path.join(task_path, 'odd_man_out.txt') 164 | # labels: 'O', 'C' 165 | PROBINGEval.__init__(self, 'OddManOut', task_path, seed) 166 | 167 | class CoordinationInversionEval(PROBINGEval): 168 | def __init__(self, task_path, seed=1111): 169 | task_path = os.path.join(task_path, 'coordination_inversion.txt') 170 | # labels: 'O', 'I' 171 | PROBINGEval.__init__(self, 'CoordinationInversion', task_path, seed) 172 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/classifier.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Pytorch Classifier class in the style of scikit-learn 10 | Classifiers include Logistic Regression and MLP 11 | """ 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import numpy as np 16 | import copy 17 | from senteval import utils 18 | 19 | import torch 20 | from torch import nn 21 | import torch.nn.functional as F 22 | 23 | 24 | class PyTorchClassifier(object): 25 | def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111, 26 | cudaEfficient=False): 27 | # fix seed 28 | np.random.seed(seed) 29 | torch.manual_seed(seed) 30 | torch.cuda.manual_seed(seed) 31 | 32 | self.inputdim = inputdim 33 | self.nclasses = nclasses 34 | self.l2reg = l2reg 35 | self.batch_size = batch_size 36 | self.cudaEfficient = cudaEfficient 37 | 38 | def prepare_split(self, X, y, validation_data=None, validation_split=None): 39 | # Preparing validation data 40 | assert validation_split or validation_data 41 | if validation_data is not None: 42 | trainX, trainy = X, y 43 | devX, devy = validation_data 44 | else: 45 | permutation = np.random.permutation(len(X)) 46 | trainidx = permutation[int(validation_split * len(X)):] 47 | devidx = permutation[0:int(validation_split * len(X))] 48 | trainX, trainy = X[trainidx], y[trainidx] 49 | devX, devy = X[devidx], y[devidx] 50 | 51 | device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda') 52 | 53 | trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32) 54 | trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64) 55 | devX = torch.from_numpy(devX).to(device, dtype=torch.float32) 56 | devy = torch.from_numpy(devy).to(device, dtype=torch.int64) 57 | 58 | return trainX, trainy, devX, devy 59 | 60 | def fit(self, X, y, validation_data=None, validation_split=None, 61 
| early_stop=True): 62 | self.nepoch = 0 63 | bestaccuracy = -1 64 | stop_train = False 65 | early_stop_count = 0 66 | 67 | # Preparing validation data 68 | trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data, 69 | validation_split) 70 | 71 | # Training 72 | while not stop_train and self.nepoch <= self.max_epoch: 73 | self.trainepoch(trainX, trainy, epoch_size=self.epoch_size) 74 | accuracy = self.score(devX, devy) 75 | if accuracy > bestaccuracy: 76 | bestaccuracy = accuracy 77 | bestmodel = copy.deepcopy(self.model) 78 | elif early_stop: 79 | if early_stop_count >= self.tenacity: 80 | stop_train = True 81 | early_stop_count += 1 82 | self.model = bestmodel 83 | return bestaccuracy 84 | 85 | def trainepoch(self, X, y, epoch_size=1): 86 | self.model.train() 87 | for _ in range(self.nepoch, self.nepoch + epoch_size): 88 | permutation = np.random.permutation(len(X)) 89 | all_costs = [] 90 | for i in range(0, len(X), self.batch_size): 91 | # forward 92 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device) 93 | 94 | Xbatch = X[idx] 95 | ybatch = y[idx] 96 | 97 | if self.cudaEfficient: 98 | Xbatch = Xbatch.cuda() 99 | ybatch = ybatch.cuda() 100 | output = self.model(Xbatch) 101 | # loss 102 | loss = self.loss_fn(output, ybatch) 103 | all_costs.append(loss.data.item()) 104 | # backward 105 | self.optimizer.zero_grad() 106 | loss.backward() 107 | # Update parameters 108 | self.optimizer.step() 109 | self.nepoch += epoch_size 110 | 111 | def score(self, devX, devy): 112 | self.model.eval() 113 | correct = 0 114 | if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient: 115 | devX = torch.FloatTensor(devX).cuda() 116 | devy = torch.LongTensor(devy).cuda() 117 | with torch.no_grad(): 118 | for i in range(0, len(devX), self.batch_size): 119 | Xbatch = devX[i:i + self.batch_size] 120 | ybatch = devy[i:i + self.batch_size] 121 | if self.cudaEfficient: 122 | Xbatch = Xbatch.cuda() 123 | ybatch = ybatch.cuda() 124 | output = self.model(Xbatch) 125 | pred = output.data.max(1)[1] 126 | correct += pred.long().eq(ybatch.data.long()).sum().item() 127 | accuracy = 1.0 * correct / len(devX) 128 | return accuracy 129 | 130 | def predict(self, devX): 131 | self.model.eval() 132 | if not isinstance(devX, torch.cuda.FloatTensor): 133 | devX = torch.FloatTensor(devX).cuda() 134 | yhat = np.array([]) 135 | with torch.no_grad(): 136 | for i in range(0, len(devX), self.batch_size): 137 | Xbatch = devX[i:i + self.batch_size] 138 | output = self.model(Xbatch) 139 | yhat = np.append(yhat, 140 | output.data.max(1)[1].cpu().numpy()) 141 | yhat = np.vstack(yhat) 142 | return yhat 143 | 144 | def predict_proba(self, devX): 145 | self.model.eval() 146 | probas = [] 147 | with torch.no_grad(): 148 | for i in range(0, len(devX), self.batch_size): 149 | Xbatch = devX[i:i + self.batch_size] 150 | vals = F.softmax(self.model(Xbatch).data.cpu().numpy()) 151 | if not probas: 152 | probas = vals 153 | else: 154 | probas = np.concatenate(probas, vals, axis=0) 155 | return probas 156 | 157 | 158 | """ 159 | MLP with Pytorch (nhid=0 --> Logistic Regression) 160 | """ 161 | 162 | class MLP(PyTorchClassifier): 163 | def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64, 164 | seed=1111, cudaEfficient=False): 165 | super(self.__class__, self).__init__(inputdim, nclasses, l2reg, 166 | batch_size, seed, cudaEfficient) 167 | """ 168 | PARAMETERS: 169 | -nhid: number of hidden units (0: Logistic Regression) 170 | -optim: optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..) 
171 | -tenacity: how many times dev acc does not increase before stopping 172 | -epoch_size: each epoch corresponds to epoch_size pass on the train set 173 | -max_epoch: max number of epoches 174 | -dropout: dropout for MLP 175 | """ 176 | 177 | self.nhid = 0 if "nhid" not in params else params["nhid"] 178 | self.optim = "adam" if "optim" not in params else params["optim"] 179 | self.tenacity = 5 if "tenacity" not in params else params["tenacity"] 180 | self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"] 181 | self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"] 182 | self.dropout = 0. if "dropout" not in params else params["dropout"] 183 | self.batch_size = 64 if "batch_size" not in params else params["batch_size"] 184 | 185 | if params["nhid"] == 0: 186 | self.model = nn.Sequential( 187 | nn.Linear(self.inputdim, self.nclasses), 188 | ).cuda() 189 | else: 190 | self.model = nn.Sequential( 191 | nn.Linear(self.inputdim, params["nhid"]), 192 | nn.Dropout(p=self.dropout), 193 | nn.Sigmoid(), 194 | nn.Linear(params["nhid"], self.nclasses), 195 | ).cuda() 196 | 197 | self.loss_fn = nn.CrossEntropyLoss().cuda() 198 | self.loss_fn.size_average = False 199 | 200 | optim_fn, optim_params = utils.get_optimizer(self.optim) 201 | self.optimizer = optim_fn(self.model.parameters(), **optim_params) 202 | self.optimizer.param_groups[0]['weight_decay'] = self.l2reg 203 | -------------------------------------------------------------------------------- /model.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoTokenizer, BertForMaskedLM 2 | from transformers.models.bert.modeling_bert import BertForMaskedLM 3 | from transformers.modeling_outputs import TokenClassifierOutput 4 | from transformers import PreTrainedModel 5 | import torch 6 | from torch import nn 7 | from torch.nn import TransformerDecoder, TransformerDecoderLayer 8 | 9 | from typing import Optional 10 | 11 | import wandb 12 | import numpy as np 13 | 14 | class DenoSentModel(PreTrainedModel): 15 | def __init__(self, config): 16 | super().__init__(config) 17 | self.pooler = config.pooler 18 | self.sent_embedding_projector = nn.Linear(config.hidden_size, config.hidden_size) 19 | self.decoder = TransformerDecoder(TransformerDecoderLayer(d_model=config.hidden_size, nhead=config.decoder_num_heads, batch_first=True, dropout=0.1), num_layers=config.decoder_num_layers) 20 | self.decoder_noise_dropout = nn.Dropout(config.decoder_noise_dropout) 21 | self.sim = nn.CosineSimilarity(dim=-1) 22 | self.init_weights() 23 | self.tokenizer = AutoTokenizer.from_pretrained(config.encoder_name_or_path) 24 | self.encoder = BertForMaskedLM.from_pretrained(config.encoder_name_or_path) 25 | self.prediction_head = self.encoder.cls 26 | self.encoder = self.encoder.bert 27 | self.post_init() 28 | 29 | def _init_weights(self, module): 30 | """Initialize the weights""" 31 | if isinstance(module, nn.Linear): 32 | # Slightly different from the TF version which uses truncated_normal for initialization 33 | # cf https://github.com/pytorch/pytorch/pull/5617 34 | module.weight.data.normal_(mean=0.0, std=0.02) 35 | if module.bias is not None: 36 | module.bias.data.zero_() 37 | elif isinstance(module, nn.Embedding): 38 | module.weight.data.normal_(mean=0.0, std=0.02) 39 | if module.padding_idx is not None: 40 | module.weight.data[module.padding_idx].zero_() 41 | elif isinstance(module, nn.LayerNorm): 42 | module.bias.data.zero_() 43 | module.weight.data.fill_(1.0) 44 | 45 
| def encode(self, sentences, batch_size=32, **kwargs): 46 | """ Returns a list of embeddings for the given sentences. 47 | Args: 48 | sentences (`List[str]`): List of sentences to encode 49 | batch_size (`int`): Batch size for the encoding 50 | 51 | Returns: 52 | `List[np.ndarray]` or `List[tensor]`: List of embeddings for the given sentences 53 | """ 54 | self.eval() 55 | all_embeddings = [] 56 | length_sorted_idx = np.argsort([len(sen) for sen in sentences]) 57 | sentences_sorted = [sentences[idx] for idx in length_sorted_idx] 58 | if self.config.pooler == 'mask': 59 | prompt_length = len(self.tokenizer(self.config.prompt_format, add_special_tokens=False)['input_ids']) 60 | sentences_sorted = self.tokenizer.batch_decode(self.tokenizer(sentences_sorted, padding=True, truncation=True, max_length=self.config.max_length, return_tensors='pt').input_ids, skip_special_tokens=True) 61 | sentences_sorted = [self.config.prompt_format.replace('[X]', s).replace('[MASK]', self.tokenizer.mask_token) for s in sentences_sorted] 62 | for start_index in range(0, len(sentences), batch_size): 63 | sentences_batch = sentences_sorted[start_index:start_index+batch_size] 64 | inputs = self.tokenizer(sentences_batch, padding='max_length', truncation=True, return_tensors="pt", max_length=self.config.max_length+prompt_length) 65 | inputs = {k: v.to(self.device) for k,v in inputs.items()} 66 | with torch.no_grad(): 67 | encoder_outputs = self.encoder(**inputs, output_hidden_states=True, output_attentions=True, return_dict=True) 68 | last_hidden_state = encoder_outputs.last_hidden_state 69 | if self.config.pooler == 'cls': 70 | embeddings = last_hidden_state[:, 0, :] 71 | elif self.config.pooler == 'mean': 72 | embeddings = (last_hidden_state * inputs['attention_mask'].unsqueeze(-1)).sum(1) / inputs['attention_mask'].sum(-1).unsqueeze(-1) 73 | elif self.pooler == 'mask': 74 | embeddings = last_hidden_state[inputs['input_ids'] == self.tokenizer.mask_token_id] 75 | else: 76 | raise NotImplementedError() 77 | all_embeddings.extend(embeddings.cpu().numpy()) 78 | all_embeddings = torch.tensor(np.array([all_embeddings[idx] for idx in np.argsort(length_sorted_idx)])) 79 | return all_embeddings 80 | 81 | def forward( 82 | self, 83 | input_ids: Optional[torch.LongTensor] = None, 84 | attention_mask: Optional[torch.LongTensor] = None, 85 | positive_input_ids: Optional[torch.LongTensor] = None, 86 | positive_attention_mask: Optional[torch.LongTensor] = None, 87 | negative_input_ids: Optional[torch.LongTensor] = None, 88 | negative_attention_mask: Optional[torch.LongTensor] = None, 89 | global_step: Optional[int] = None, 90 | max_steps: Optional[int] = None, 91 | ): 92 | batch_size = input_ids.size(0) 93 | if negative_input_ids is not None: 94 | encoder_input_ids = torch.cat([input_ids, positive_input_ids, negative_input_ids], dim=0).to(self.device) 95 | encoder_attention_mask = torch.cat([attention_mask, positive_attention_mask, negative_attention_mask], dim=0).to(self.device) 96 | elif positive_input_ids is not None: 97 | encoder_input_ids = torch.cat([input_ids, positive_input_ids], dim=0).to(self.device) 98 | encoder_attention_mask = torch.cat([attention_mask, positive_attention_mask], dim=0).to(self.device) 99 | elif self.config.do_contrastive: 100 | encoder_input_ids = torch.cat([input_ids, input_ids], dim=0).to(self.device) 101 | encoder_attention_mask = torch.cat([attention_mask, attention_mask], dim=0).to(self.device) 102 | elif self.config.do_generative and not self.config.do_contrastive: 103 | encoder_input_ids = 
input_ids.to(self.device) 104 | encoder_attention_mask = attention_mask.to(self.device) 105 | else: 106 | raise NotImplementedError() 107 | encoder_outputs = self.encoder(input_ids=encoder_input_ids, attention_mask=encoder_attention_mask, return_dict=True, output_hidden_states=True, output_attentions=True) 108 | if self.pooler == 'cls': 109 | sent_embedding = encoder_outputs.last_hidden_state[:, 0, :] 110 | elif self.pooler == 'mean': 111 | sent_embedding = ((encoder_outputs.last_hidden_state * encoder_attention_mask.unsqueeze(-1)).sum(1) / encoder_attention_mask.sum(-1).unsqueeze(-1)) 112 | elif self.pooler == 'mask': 113 | sent_embedding = encoder_outputs.last_hidden_state[encoder_input_ids == self.tokenizer.mask_token_id] 114 | else: 115 | raise NotImplementedError() 116 | sent_embedding = sent_embedding.unsqueeze(1) 117 | sent_embedding = self.sent_embedding_projector(sent_embedding) 118 | 119 | if self.config.do_generative: 120 | if positive_input_ids is not None: 121 | tgt = encoder_outputs.hidden_states[0][batch_size:2*batch_size].detach() 122 | tgt_key_padding_mask = (positive_input_ids == self.tokenizer.pad_token_id) 123 | labels = positive_input_ids 124 | else: 125 | tgt = encoder_outputs.hidden_states[0][:batch_size].detach() 126 | tgt_key_padding_mask = (input_ids == self.tokenizer.pad_token_id) 127 | labels = input_ids 128 | tgt = self.decoder_noise_dropout(tgt) 129 | decoder_outputs = self.decoder(tgt=tgt, memory=sent_embedding[:batch_size], tgt_mask=None, tgt_key_padding_mask=tgt_key_padding_mask) 130 | logits = self.prediction_head(decoder_outputs) 131 | loss_fct = nn.CrossEntropyLoss(ignore_index=self.tokenizer.pad_token_id) 132 | generative_loss = loss_fct(logits.view(-1, self.encoder.config.vocab_size), labels.view(-1)) 133 | wandb.log({'train/generative_loss': generative_loss}) 134 | 135 | if self.config.do_contrastive: 136 | positive_sim = self.sim(sent_embedding[:batch_size], sent_embedding[batch_size:2*batch_size].transpose(0, 1)) 137 | cos_sim = positive_sim 138 | if negative_attention_mask is not None: 139 | negative_sim = self.sim(sent_embedding[:batch_size], sent_embedding[2*batch_size:].transpose(0, 1)) 140 | cos_sim = torch.cat([positive_sim, negative_sim], dim=1) 141 | cos_sim = cos_sim / self.config.contrastive_temp 142 | contrastive_labels = torch.arange(batch_size, dtype=torch.long, device=self.device) 143 | contrastive_loss = nn.CrossEntropyLoss()(cos_sim, contrastive_labels) 144 | wandb.log({'train/contrastive_loss': contrastive_loss.item()}) 145 | logits = None 146 | loss = 0 147 | if self.config.do_contrastive: 148 | loss += self.config.contrastive_weight * contrastive_loss 149 | if self.config.do_generative: 150 | loss += self.config.generative_weight * generative_loss 151 | wandb.log({'train/loss': loss}) 152 | return TokenClassifierOutput( 153 | loss=loss, 154 | logits=logits, 155 | hidden_states=encoder_outputs.hidden_states, 156 | attentions=encoder_outputs.attentions, 157 | ) 158 | -------------------------------------------------------------------------------- /SentEval/senteval/sick.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | SICK Relatedness and Entailment 10 | ''' 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import os 14 | import io 15 | import logging 16 | import numpy as np 17 | 18 | from sklearn.metrics import mean_squared_error 19 | from scipy.stats import pearsonr, spearmanr 20 | 21 | from senteval.tools.relatedness import RelatednessPytorch 22 | from senteval.tools.validation import SplitClassifier 23 | 24 | class SICKEval(object): 25 | def __init__(self, task_path, seed=1111): 26 | logging.debug('***** Transfer task : SICK-Relatedness*****\n\n') 27 | self.seed = seed 28 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt')) 29 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt')) 30 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt')) 31 | self.sick_data = {'train': train, 'dev': dev, 'test': test} 32 | 33 | def do_prepare(self, params, prepare): 34 | samples = self.sick_data['train']['X_A'] + \ 35 | self.sick_data['train']['X_B'] + \ 36 | self.sick_data['dev']['X_A'] + \ 37 | self.sick_data['dev']['X_B'] + \ 38 | self.sick_data['test']['X_A'] + self.sick_data['test']['X_B'] 39 | return prepare(params, samples) 40 | 41 | def loadFile(self, fpath): 42 | skipFirstLine = True 43 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 44 | with io.open(fpath, 'r', encoding='utf-8') as f: 45 | for line in f: 46 | if skipFirstLine: 47 | skipFirstLine = False 48 | else: 49 | text = line.strip().split('\t') 50 | sick_data['X_A'].append(text[1].split()) 51 | sick_data['X_B'].append(text[2].split()) 52 | sick_data['y'].append(text[3]) 53 | 54 | sick_data['y'] = [float(s) for s in sick_data['y']] 55 | return sick_data 56 | 57 | def run(self, params, batcher): 58 | sick_embed = {'train': {}, 'dev': {}, 'test': {}} 59 | bsize = params.batch_size 60 | 61 | for key in self.sick_data: 62 | logging.info('Computing embedding for {0}'.format(key)) 63 | # Sort to reduce padding 64 | sorted_corpus = sorted(zip(self.sick_data[key]['X_A'], 65 | self.sick_data[key]['X_B'], 66 | self.sick_data[key]['y']), 67 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 68 | 69 | self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus] 70 | self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus] 71 | self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus] 72 | 73 | for txt_type in ['X_A', 'X_B']: 74 | sick_embed[key][txt_type] = [] 75 | for ii in range(0, len(self.sick_data[key]['y']), bsize): 76 | batch = self.sick_data[key][txt_type][ii:ii + bsize] 77 | embeddings = batcher(params, batch) 78 | sick_embed[key][txt_type].append(embeddings) 79 | sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type]) 80 | sick_embed[key]['y'] = np.array(self.sick_data[key]['y']) 81 | logging.info('Computed {0} embeddings'.format(key)) 82 | 83 | # Train 84 | trainA = sick_embed['train']['X_A'] 85 | trainB = sick_embed['train']['X_B'] 86 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 87 | trainY = self.encode_labels(self.sick_data['train']['y']) 88 | 89 | # Dev 90 | devA = sick_embed['dev']['X_A'] 91 | devB = sick_embed['dev']['X_B'] 92 | devF = np.c_[np.abs(devA - devB), devA * devB] 93 | devY = self.encode_labels(self.sick_data['dev']['y']) 94 | 95 | # Test 96 | testA = sick_embed['test']['X_A'] 97 | testB = sick_embed['test']['X_B'] 98 | testF = np.c_[np.abs(testA - testB), testA * testB] 99 | testY = self.encode_labels(self.sick_data['test']['y']) 100 | 101 | config = {'seed': self.seed, 'nclasses': 5} 102 | clf = 
RelatednessPytorch(train={'X': trainF, 'y': trainY}, 103 | valid={'X': devF, 'y': devY}, 104 | test={'X': testF, 'y': testY}, 105 | devscores=self.sick_data['dev']['y'], 106 | config=config) 107 | 108 | devspr, yhat = clf.run() 109 | 110 | pr = pearsonr(yhat, self.sick_data['test']['y'])[0] 111 | sr = spearmanr(yhat, self.sick_data['test']['y'])[0] 112 | pr = 0 if pr != pr else pr 113 | sr = 0 if sr != sr else sr 114 | se = mean_squared_error(yhat, self.sick_data['test']['y']) 115 | logging.debug('Dev : Spearman {0}'.format(devspr)) 116 | logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \ 117 | for SICK Relatedness\n'.format(pr, sr, se)) 118 | 119 | return {'devspearman': devspr, 'pearson': pr, 'spearman': sr, 'mse': se, 120 | 'yhat': yhat, 'ndev': len(devA), 'ntest': len(testA)} 121 | 122 | def encode_labels(self, labels, nclass=5): 123 | """ 124 | Label encoding from Tree LSTM paper (Tai, Socher, Manning) 125 | """ 126 | Y = np.zeros((len(labels), nclass)).astype('float32') 127 | for j, y in enumerate(labels): 128 | for i in range(nclass): 129 | if i+1 == np.floor(y) + 1: 130 | Y[j, i] = y - np.floor(y) 131 | if i+1 == np.floor(y): 132 | Y[j, i] = np.floor(y) - y + 1 133 | return Y 134 | 135 | 136 | class SICKEntailmentEval(SICKEval): 137 | def __init__(self, task_path, seed=1111): 138 | logging.debug('***** Transfer task : SICK-Entailment*****\n\n') 139 | self.seed = seed 140 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt')) 141 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt')) 142 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt')) 143 | self.sick_data = {'train': train, 'dev': dev, 'test': test} 144 | 145 | def loadFile(self, fpath): 146 | label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2} 147 | skipFirstLine = True 148 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 149 | with io.open(fpath, 'r', encoding='utf-8') as f: 150 | for line in f: 151 | if skipFirstLine: 152 | skipFirstLine = False 153 | else: 154 | text = line.strip().split('\t') 155 | sick_data['X_A'].append(text[1].split()) 156 | sick_data['X_B'].append(text[2].split()) 157 | sick_data['y'].append(text[4]) 158 | sick_data['y'] = [label2id[s] for s in sick_data['y']] 159 | return sick_data 160 | 161 | def run(self, params, batcher): 162 | sick_embed = {'train': {}, 'dev': {}, 'test': {}} 163 | bsize = params.batch_size 164 | 165 | for key in self.sick_data: 166 | logging.info('Computing embedding for {0}'.format(key)) 167 | # Sort to reduce padding 168 | sorted_corpus = sorted(zip(self.sick_data[key]['X_A'], 169 | self.sick_data[key]['X_B'], 170 | self.sick_data[key]['y']), 171 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 172 | 173 | self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus] 174 | self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus] 175 | self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus] 176 | 177 | for txt_type in ['X_A', 'X_B']: 178 | sick_embed[key][txt_type] = [] 179 | for ii in range(0, len(self.sick_data[key]['y']), bsize): 180 | batch = self.sick_data[key][txt_type][ii:ii + bsize] 181 | embeddings = batcher(params, batch) 182 | sick_embed[key][txt_type].append(embeddings) 183 | sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type]) 184 | logging.info('Computed {0} embeddings'.format(key)) 185 | 186 | # Train 187 | trainA = sick_embed['train']['X_A'] 188 | trainB = sick_embed['train']['X_B'] 189 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB] 190 | trainY = 
np.array(self.sick_data['train']['y']) 191 | 192 | # Dev 193 | devA = sick_embed['dev']['X_A'] 194 | devB = sick_embed['dev']['X_B'] 195 | devF = np.c_[np.abs(devA - devB), devA * devB] 196 | devY = np.array(self.sick_data['dev']['y']) 197 | 198 | # Test 199 | testA = sick_embed['test']['X_A'] 200 | testB = sick_embed['test']['X_B'] 201 | testF = np.c_[np.abs(testA - testB), testA * testB] 202 | testY = np.array(self.sick_data['test']['y']) 203 | 204 | config = {'nclasses': 3, 'seed': self.seed, 205 | 'usepytorch': params.usepytorch, 206 | 'classifier': params.classifier, 207 | 'nhid': params.nhid} 208 | clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF}, 209 | y={'train': trainY, 'valid': devY, 'test': testY}, 210 | config=config) 211 | 212 | devacc, testacc = clf.run() 213 | logging.debug('\nDev acc : {0} Test acc : {1} for \ 214 | SICK entailment\n'.format(devacc, testacc)) 215 | return {'devacc': devacc, 'acc': testacc, 216 | 'ndev': len(devA), 'ntest': len(testA)} 217 | -------------------------------------------------------------------------------- /SentEval/examples/models.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | This file contains the definition of encoders used in https://arxiv.org/pdf/1705.02364.pdf 10 | """ 11 | 12 | import numpy as np 13 | import time 14 | 15 | import torch 16 | import torch.nn as nn 17 | 18 | 19 | class InferSent(nn.Module): 20 | 21 | def __init__(self, config): 22 | super(InferSent, self).__init__() 23 | self.bsize = config['bsize'] 24 | self.word_emb_dim = config['word_emb_dim'] 25 | self.enc_lstm_dim = config['enc_lstm_dim'] 26 | self.pool_type = config['pool_type'] 27 | self.dpout_model = config['dpout_model'] 28 | self.version = 1 if 'version' not in config else config['version'] 29 | 30 | self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1, 31 | bidirectional=True, dropout=self.dpout_model) 32 | 33 | assert self.version in [1, 2] 34 | if self.version == 1: 35 | self.bos = '' 36 | self.eos = '' 37 | self.max_pad = True 38 | self.moses_tok = False 39 | elif self.version == 2: 40 | self.bos = '
<p>' 41 | self.eos = '</p>
' 42 | self.max_pad = False 43 | self.moses_tok = True 44 | 45 | def is_cuda(self): 46 | # either all weights are on cpu or they are on gpu 47 | return self.enc_lstm.bias_hh_l0.data.is_cuda 48 | 49 | def forward(self, sent_tuple): 50 | # sent_len: [max_len, ..., min_len] (bsize) 51 | # sent: (seqlen x bsize x worddim) 52 | sent, sent_len = sent_tuple 53 | 54 | # Sort by length (keep idx) 55 | sent_len_sorted, idx_sort = np.sort(sent_len)[::-1], np.argsort(-sent_len) 56 | sent_len_sorted = sent_len_sorted.copy() 57 | idx_unsort = np.argsort(idx_sort) 58 | 59 | idx_sort = torch.from_numpy(idx_sort).cuda() if self.is_cuda() \ 60 | else torch.from_numpy(idx_sort) 61 | sent = sent.index_select(1, idx_sort) 62 | 63 | # Handling padding in Recurrent Networks 64 | sent_packed = nn.utils.rnn.pack_padded_sequence(sent, sent_len_sorted) 65 | sent_output = self.enc_lstm(sent_packed)[0] # seqlen x batch x 2*nhid 66 | sent_output = nn.utils.rnn.pad_packed_sequence(sent_output)[0] 67 | 68 | # Un-sort by length 69 | idx_unsort = torch.from_numpy(idx_unsort).cuda() if self.is_cuda() \ 70 | else torch.from_numpy(idx_unsort) 71 | sent_output = sent_output.index_select(1, idx_unsort) 72 | 73 | # Pooling 74 | if self.pool_type == "mean": 75 | sent_len = torch.FloatTensor(sent_len.copy()).unsqueeze(1).cuda() 76 | emb = torch.sum(sent_output, 0).squeeze(0) 77 | emb = emb / sent_len.expand_as(emb) 78 | elif self.pool_type == "max": 79 | if not self.max_pad: 80 | sent_output[sent_output == 0] = -1e9 81 | emb = torch.max(sent_output, 0)[0] 82 | if emb.ndimension() == 3: 83 | emb = emb.squeeze(0) 84 | assert emb.ndimension() == 2 85 | 86 | return emb 87 | 88 | def set_w2v_path(self, w2v_path): 89 | self.w2v_path = w2v_path 90 | 91 | def get_word_dict(self, sentences, tokenize=True): 92 | # create vocab of words 93 | word_dict = {} 94 | sentences = [s.split() if not tokenize else self.tokenize(s) for s in sentences] 95 | for sent in sentences: 96 | for word in sent: 97 | if word not in word_dict: 98 | word_dict[word] = '' 99 | word_dict[self.bos] = '' 100 | word_dict[self.eos] = '' 101 | return word_dict 102 | 103 | def get_w2v(self, word_dict): 104 | assert hasattr(self, 'w2v_path'), 'w2v path not set' 105 | # create word_vec with w2v vectors 106 | word_vec = {} 107 | with open(self.w2v_path, encoding='utf-8') as f: 108 | for line in f: 109 | word, vec = line.split(' ', 1) 110 | if word in word_dict: 111 | word_vec[word] = np.fromstring(vec, sep=' ') 112 | print('Found %s(/%s) words with w2v vectors' % (len(word_vec), len(word_dict))) 113 | return word_vec 114 | 115 | def get_w2v_k(self, K): 116 | assert hasattr(self, 'w2v_path'), 'w2v path not set' 117 | # create word_vec with k first w2v vectors 118 | k = 0 119 | word_vec = {} 120 | with open(self.w2v_path, encoding='utf-8') as f: 121 | for line in f: 122 | word, vec = line.split(' ', 1) 123 | if k <= K: 124 | word_vec[word] = np.fromstring(vec, sep=' ') 125 | k += 1 126 | if k > K: 127 | if word in [self.bos, self.eos]: 128 | word_vec[word] = np.fromstring(vec, sep=' ') 129 | 130 | if k > K and all([w in word_vec for w in [self.bos, self.eos]]): 131 | break 132 | return word_vec 133 | 134 | def build_vocab(self, sentences, tokenize=True): 135 | assert hasattr(self, 'w2v_path'), 'w2v path not set' 136 | word_dict = self.get_word_dict(sentences, tokenize) 137 | self.word_vec = self.get_w2v(word_dict) 138 | print('Vocab size : %s' % (len(self.word_vec))) 139 | 140 | # build w2v vocab with k most frequent words 141 | def build_vocab_k_words(self, K): 142 | assert 
hasattr(self, 'w2v_path'), 'w2v path not set' 143 | self.word_vec = self.get_w2v_k(K) 144 | print('Vocab size : %s' % (K)) 145 | 146 | def update_vocab(self, sentences, tokenize=True): 147 | assert hasattr(self, 'w2v_path'), 'warning : w2v path not set' 148 | assert hasattr(self, 'word_vec'), 'build_vocab before updating it' 149 | word_dict = self.get_word_dict(sentences, tokenize) 150 | 151 | # keep only new words 152 | for word in self.word_vec: 153 | if word in word_dict: 154 | del word_dict[word] 155 | 156 | # udpate vocabulary 157 | if word_dict: 158 | new_word_vec = self.get_w2v(word_dict) 159 | self.word_vec.update(new_word_vec) 160 | else: 161 | new_word_vec = [] 162 | print('New vocab size : %s (added %s words)'% (len(self.word_vec), len(new_word_vec))) 163 | 164 | def get_batch(self, batch): 165 | # sent in batch in decreasing order of lengths 166 | # batch: (bsize, max_len, word_dim) 167 | embed = np.zeros((len(batch[0]), len(batch), self.word_emb_dim)) 168 | 169 | for i in range(len(batch)): 170 | for j in range(len(batch[i])): 171 | embed[j, i, :] = self.word_vec[batch[i][j]] 172 | 173 | return torch.FloatTensor(embed) 174 | 175 | def tokenize(self, s): 176 | from nltk.tokenize import word_tokenize 177 | if self.moses_tok: 178 | s = ' '.join(word_tokenize(s)) 179 | s = s.replace(" n't ", "n 't ") # HACK to get ~MOSES tokenization 180 | return s.split() 181 | else: 182 | return word_tokenize(s) 183 | 184 | def prepare_samples(self, sentences, bsize, tokenize, verbose): 185 | sentences = [[self.bos] + s.split() + [self.eos] if not tokenize else 186 | [self.bos] + self.tokenize(s) + [self.eos] for s in sentences] 187 | n_w = np.sum([len(x) for x in sentences]) 188 | 189 | # filters words without w2v vectors 190 | for i in range(len(sentences)): 191 | s_f = [word for word in sentences[i] if word in self.word_vec] 192 | if not s_f: 193 | import warnings 194 | warnings.warn('No words in "%s" (idx=%s) have w2v vectors. \ 195 | Replacing by ""..' 
% (sentences[i], i)) 196 | s_f = [self.eos] 197 | sentences[i] = s_f 198 | 199 | lengths = np.array([len(s) for s in sentences]) 200 | n_wk = np.sum(lengths) 201 | if verbose: 202 | print('Nb words kept : %s/%s (%.1f%s)' % ( 203 | n_wk, n_w, 100.0 * n_wk / n_w, '%')) 204 | 205 | # sort by decreasing length 206 | lengths, idx_sort = np.sort(lengths)[::-1], np.argsort(-lengths) 207 | sentences = np.array(sentences)[idx_sort] 208 | 209 | return sentences, lengths, idx_sort 210 | 211 | def encode(self, sentences, bsize=64, tokenize=True, verbose=False): 212 | tic = time.time() 213 | sentences, lengths, idx_sort = self.prepare_samples( 214 | sentences, bsize, tokenize, verbose) 215 | 216 | embeddings = [] 217 | for stidx in range(0, len(sentences), bsize): 218 | batch = self.get_batch(sentences[stidx:stidx + bsize]) 219 | if self.is_cuda(): 220 | batch = batch.cuda() 221 | with torch.no_grad(): 222 | batch = self.forward((batch, lengths[stidx:stidx + bsize])).data.cpu().numpy() 223 | embeddings.append(batch) 224 | embeddings = np.vstack(embeddings) 225 | 226 | # unsort 227 | idx_unsort = np.argsort(idx_sort) 228 | embeddings = embeddings[idx_unsort] 229 | 230 | if verbose: 231 | print('Speed : %.1f sentences/s (%s mode, bsize=%s)' % ( 232 | len(embeddings)/(time.time()-tic), 233 | 'gpu' if self.is_cuda() else 'cpu', bsize)) 234 | return embeddings 235 | 236 | def visualize(self, sent, tokenize=True): 237 | 238 | sent = sent.split() if not tokenize else self.tokenize(sent) 239 | sent = [[self.bos] + [word for word in sent if word in self.word_vec] + [self.eos]] 240 | 241 | if ' '.join(sent[0]) == '%s %s' % (self.bos, self.eos): 242 | import warnings 243 | warnings.warn('No words in "%s" have w2v vectors. Replacing \ 244 | by "%s %s"..' % (sent, self.bos, self.eos)) 245 | batch = self.get_batch(sent) 246 | 247 | if self.is_cuda(): 248 | batch = batch.cuda() 249 | output = self.enc_lstm(batch)[0] 250 | output, idxs = torch.max(output, 0) 251 | # output, idxs = output.squeeze(), idxs.squeeze() 252 | idxs = idxs.data.cpu().numpy() 253 | argmaxs = [np.sum((idxs == k)) for k in range(len(sent[0]))] 254 | 255 | # visualize model 256 | import matplotlib.pyplot as plt 257 | x = range(len(sent[0])) 258 | y = [100.0 * n / np.sum(argmaxs) for n in argmaxs] 259 | plt.xticks(x, sent[0], rotation=45) 260 | plt.bar(x, y) 261 | plt.ylabel('%') 262 | plt.title('Visualisation of words importance') 263 | plt.show() 264 | 265 | return output, idxs 266 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/validation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | """ 9 | Validation and classification 10 | (train) : inner-kfold classifier 11 | (train, test) : kfold classifier 12 | (train, dev, test) : split classifier 13 | 14 | """ 15 | from __future__ import absolute_import, division, unicode_literals 16 | 17 | import logging 18 | import numpy as np 19 | from senteval.tools.classifier import MLP 20 | 21 | import sklearn 22 | assert(sklearn.__version__ >= "0.18.0"), \ 23 | "need to update sklearn to version >= 0.18.0" 24 | from sklearn.linear_model import LogisticRegression 25 | from sklearn.model_selection import StratifiedKFold 26 | 27 | 28 | def get_classif_name(classifier_config, usepytorch): 29 | if not usepytorch: 30 | modelname = 'sklearn-LogReg' 31 | else: 32 | nhid = classifier_config['nhid'] 33 | optim = 'adam' if 'optim' not in classifier_config else classifier_config['optim'] 34 | bs = 64 if 'batch_size' not in classifier_config else classifier_config['batch_size'] 35 | modelname = 'pytorch-MLP-nhid%s-%s-bs%s' % (nhid, optim, bs) 36 | return modelname 37 | 38 | # Pytorch version 39 | class InnerKFoldClassifier(object): 40 | """ 41 | (train) split classifier : InnerKfold. 42 | """ 43 | def __init__(self, X, y, config): 44 | self.X = X 45 | self.y = y 46 | self.featdim = X.shape[1] 47 | self.nclasses = config['nclasses'] 48 | self.seed = config['seed'] 49 | self.devresults = [] 50 | self.testresults = [] 51 | self.usepytorch = config['usepytorch'] 52 | self.classifier_config = config['classifier'] 53 | self.modelname = get_classif_name(self.classifier_config, self.usepytorch) 54 | 55 | self.k = 5 if 'kfold' not in config else config['kfold'] 56 | 57 | def run(self): 58 | logging.info('Training {0} with (inner) {1}-fold cross-validation' 59 | .format(self.modelname, self.k)) 60 | 61 | regs = [10**t for t in range(-5, -1)] if self.usepytorch else \ 62 | [2**t for t in range(-2, 4, 1)] 63 | skf = StratifiedKFold(n_splits=self.k, shuffle=True, random_state=1111) 64 | innerskf = StratifiedKFold(n_splits=self.k, shuffle=True, 65 | random_state=1111) 66 | count = 0 67 | for train_idx, test_idx in skf.split(self.X, self.y): 68 | count += 1 69 | X_train, X_test = self.X[train_idx], self.X[test_idx] 70 | y_train, y_test = self.y[train_idx], self.y[test_idx] 71 | scores = [] 72 | for reg in regs: 73 | regscores = [] 74 | for inner_train_idx, inner_test_idx in innerskf.split(X_train, y_train): 75 | X_in_train, X_in_test = X_train[inner_train_idx], X_train[inner_test_idx] 76 | y_in_train, y_in_test = y_train[inner_train_idx], y_train[inner_test_idx] 77 | if self.usepytorch: 78 | clf = MLP(self.classifier_config, inputdim=self.featdim, 79 | nclasses=self.nclasses, l2reg=reg, 80 | seed=self.seed) 81 | clf.fit(X_in_train, y_in_train, 82 | validation_data=(X_in_test, y_in_test)) 83 | else: 84 | clf = LogisticRegression(C=reg, random_state=self.seed) 85 | clf.fit(X_in_train, y_in_train) 86 | regscores.append(clf.score(X_in_test, y_in_test)) 87 | scores.append(round(100*np.mean(regscores), 2)) 88 | optreg = regs[np.argmax(scores)] 89 | logging.info('Best param found at split {0}: l2reg = {1} \ 90 | with score {2}'.format(count, optreg, np.max(scores))) 91 | self.devresults.append(np.max(scores)) 92 | 93 | if self.usepytorch: 94 | clf = MLP(self.classifier_config, inputdim=self.featdim, 95 | nclasses=self.nclasses, l2reg=optreg, 96 | seed=self.seed) 97 | 98 | clf.fit(X_train, y_train, validation_split=0.05) 99 | else: 100 | clf = LogisticRegression(C=optreg, random_state=self.seed) 101 | clf.fit(X_train, y_train) 102 | 103 | 
self.testresults.append(round(100*clf.score(X_test, y_test), 2)) 104 | 105 | devaccuracy = round(np.mean(self.devresults), 2) 106 | testaccuracy = round(np.mean(self.testresults), 2) 107 | return devaccuracy, testaccuracy 108 | 109 | 110 | class KFoldClassifier(object): 111 | """ 112 | (train, test) split classifier : cross-validation on train. 113 | """ 114 | def __init__(self, train, test, config): 115 | self.train = train 116 | self.test = test 117 | self.featdim = self.train['X'].shape[1] 118 | self.nclasses = config['nclasses'] 119 | self.seed = config['seed'] 120 | self.usepytorch = config['usepytorch'] 121 | self.classifier_config = config['classifier'] 122 | self.modelname = get_classif_name(self.classifier_config, self.usepytorch) 123 | 124 | self.k = 5 if 'kfold' not in config else config['kfold'] 125 | 126 | def run(self): 127 | # cross-validation 128 | logging.info('Training {0} with {1}-fold cross-validation' 129 | .format(self.modelname, self.k)) 130 | regs = [10**t for t in range(-5, -1)] if self.usepytorch else \ 131 | [2**t for t in range(-1, 6, 1)] 132 | skf = StratifiedKFold(n_splits=self.k, shuffle=True, 133 | random_state=self.seed) 134 | scores = [] 135 | 136 | for reg in regs: 137 | scanscores = [] 138 | for train_idx, test_idx in skf.split(self.train['X'], 139 | self.train['y']): 140 | # Split data 141 | X_train, y_train = self.train['X'][train_idx], self.train['y'][train_idx] 142 | 143 | X_test, y_test = self.train['X'][test_idx], self.train['y'][test_idx] 144 | 145 | # Train classifier 146 | if self.usepytorch: 147 | clf = MLP(self.classifier_config, inputdim=self.featdim, 148 | nclasses=self.nclasses, l2reg=reg, 149 | seed=self.seed) 150 | clf.fit(X_train, y_train, validation_data=(X_test, y_test)) 151 | else: 152 | clf = LogisticRegression(C=reg, random_state=self.seed) 153 | clf.fit(X_train, y_train) 154 | score = clf.score(X_test, y_test) 155 | scanscores.append(score) 156 | # Append mean score 157 | scores.append(round(100*np.mean(scanscores), 2)) 158 | 159 | # evaluation 160 | logging.info([('reg:' + str(regs[idx]), scores[idx]) 161 | for idx in range(len(scores))]) 162 | optreg = regs[np.argmax(scores)] 163 | devaccuracy = np.max(scores) 164 | logging.info('Cross-validation : best param found is reg = {0} \ 165 | with score {1}'.format(optreg, devaccuracy)) 166 | 167 | logging.info('Evaluating...') 168 | if self.usepytorch: 169 | clf = MLP(self.classifier_config, inputdim=self.featdim, 170 | nclasses=self.nclasses, l2reg=optreg, 171 | seed=self.seed) 172 | clf.fit(self.train['X'], self.train['y'], validation_split=0.05) 173 | else: 174 | clf = LogisticRegression(C=optreg, random_state=self.seed) 175 | clf.fit(self.train['X'], self.train['y']) 176 | yhat = clf.predict(self.test['X']) 177 | 178 | testaccuracy = clf.score(self.test['X'], self.test['y']) 179 | testaccuracy = round(100*testaccuracy, 2) 180 | 181 | return devaccuracy, testaccuracy, yhat 182 | 183 | 184 | class SplitClassifier(object): 185 | """ 186 | (train, valid, test) split classifier. 
187 | """ 188 | def __init__(self, X, y, config): 189 | self.X = X 190 | self.y = y 191 | self.nclasses = config['nclasses'] 192 | self.featdim = self.X['train'].shape[1] 193 | self.seed = config['seed'] 194 | self.usepytorch = config['usepytorch'] 195 | self.classifier_config = config['classifier'] 196 | self.cudaEfficient = False if 'cudaEfficient' not in config else \ 197 | config['cudaEfficient'] 198 | self.modelname = get_classif_name(self.classifier_config, self.usepytorch) 199 | self.noreg = False if 'noreg' not in config else config['noreg'] 200 | self.config = config 201 | 202 | def run(self): 203 | logging.info('Training {0} with standard validation..' 204 | .format(self.modelname)) 205 | regs = [10**t for t in range(-5, -1)] if self.usepytorch else \ 206 | [2**t for t in range(-2, 4, 1)] 207 | if self.noreg: 208 | regs = [1e-9 if self.usepytorch else 1e9] 209 | scores = [] 210 | for reg in regs: 211 | if self.usepytorch: 212 | clf = MLP(self.classifier_config, inputdim=self.featdim, 213 | nclasses=self.nclasses, l2reg=reg, 214 | seed=self.seed, cudaEfficient=self.cudaEfficient) 215 | 216 | # TODO: Find a hack for reducing nb epoches in SNLI 217 | clf.fit(self.X['train'], self.y['train'], 218 | validation_data=(self.X['valid'], self.y['valid'])) 219 | else: 220 | clf = LogisticRegression(C=reg, random_state=self.seed) 221 | clf.fit(self.X['train'], self.y['train']) 222 | scores.append(round(100*clf.score(self.X['valid'], 223 | self.y['valid']), 2)) 224 | logging.info([('reg:'+str(regs[idx]), scores[idx]) 225 | for idx in range(len(scores))]) 226 | optreg = regs[np.argmax(scores)] 227 | devaccuracy = np.max(scores) 228 | logging.info('Validation : best param found is reg = {0} with score \ 229 | {1}'.format(optreg, devaccuracy)) 230 | clf = LogisticRegression(C=optreg, random_state=self.seed) 231 | logging.info('Evaluating...') 232 | if self.usepytorch: 233 | clf = MLP(self.classifier_config, inputdim=self.featdim, 234 | nclasses=self.nclasses, l2reg=optreg, 235 | seed=self.seed, cudaEfficient=self.cudaEfficient) 236 | 237 | # TODO: Find a hack for reducing nb epoches in SNLI 238 | clf.fit(self.X['train'], self.y['train'], 239 | validation_data=(self.X['valid'], self.y['valid'])) 240 | else: 241 | clf = LogisticRegression(C=optreg, random_state=self.seed) 242 | clf.fit(self.X['train'], self.y['train']) 243 | 244 | testaccuracy = clf.score(self.X['test'], self.y['test']) 245 | testaccuracy = round(100*testaccuracy, 2) 246 | return devaccuracy, testaccuracy 247 | -------------------------------------------------------------------------------- /SentEval/senteval/sts.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | ''' 9 | STS-{2012,2013,2014,2015,2016} (unsupervised) and 10 | STS-benchmark (supervised) tasks 11 | ''' 12 | 13 | from __future__ import absolute_import, division, unicode_literals 14 | 15 | import os 16 | import io 17 | import numpy as np 18 | import logging 19 | 20 | from scipy.stats import spearmanr, pearsonr 21 | 22 | from senteval.utils import cosine 23 | from senteval.sick import SICKEval 24 | import torch 25 | 26 | class STSEval(object): 27 | def loadFile(self, fpath): 28 | self.data = {} 29 | self.samples = [] 30 | 31 | for dataset in self.datasets: 32 | sent1, sent2 = zip(*[l.split("\t") for l in 33 | io.open(fpath + '/STS.input.%s.txt' % dataset, 34 | encoding='utf8').read().splitlines()]) 35 | raw_scores = np.array([x for x in 36 | io.open(fpath + '/STS.gs.%s.txt' % dataset, 37 | encoding='utf8') 38 | .read().splitlines()]) 39 | not_empty_idx = raw_scores != '' 40 | 41 | gs_scores = [float(x) for x in raw_scores[not_empty_idx]] 42 | sent1 = np.array([s.split() for s in sent1])[not_empty_idx] 43 | sent2 = np.array([s.split() for s in sent2])[not_empty_idx] 44 | # sort data by length to minimize padding in batcher 45 | sorted_data = sorted(zip(sent1, sent2, gs_scores), 46 | key=lambda z: (len(z[0]), len(z[1]), z[2])) 47 | sent1, sent2, gs_scores = map(list, zip(*sorted_data)) 48 | 49 | self.data[dataset] = (sent1, sent2, gs_scores) 50 | self.samples += sent1 + sent2 51 | 52 | def do_prepare(self, params, prepare): 53 | if 'similarity' in params: 54 | self.similarity = params.similarity 55 | else: # Default similarity is cosine 56 | self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2))) 57 | return prepare(params, self.samples) 58 | 59 | def run(self, params, batcher): 60 | results = {} 61 | all_sys_scores = [] 62 | all_gs_scores = [] 63 | for dataset in self.datasets: 64 | sys_scores = [] 65 | all_enc1 = [] 66 | all_enc2 = [] 67 | input1, input2, gs_scores = self.data[dataset] 68 | for ii in range(0, len(gs_scores), params.batch_size): 69 | batch1 = input1[ii:ii + params.batch_size] 70 | batch2 = input2[ii:ii + params.batch_size] 71 | # we assume get_batch already throws out the faulty ones 72 | if len(batch1) == len(batch2) and len(batch1) > 0: 73 | enc1 = batcher(params, batch1) 74 | enc2 = batcher(params, batch2) 75 | all_enc1.append(enc1.detach()) 76 | all_enc2.append(enc2.detach()) 77 | for kk in range(enc2.shape[0]): 78 | sys_score = self.similarity(enc1[kk], enc2[kk]) 79 | sys_scores.append(sys_score) 80 | all_sys_scores.extend(sys_scores) 81 | all_gs_scores.extend(gs_scores) 82 | results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores), 83 | 'spearman': spearmanr(sys_scores, gs_scores), 84 | 'nsamples': len(sys_scores)} 85 | logging.debug('%s : pearson = %.4f, spearman = %.4f' % 86 | (dataset, results[dataset]['pearson'][0], 87 | results[dataset]['spearman'][0])) 88 | # def _norm(x, eps=1e-8): 89 | # xnorm = torch.linalg.norm(x, dim=-1) 90 | # xnorm = torch.max(xnorm, torch.ones_like(xnorm) * eps) 91 | # return x / xnorm.unsqueeze(dim=-1) 92 | # from Wang and Isola (with a bit of modification) 93 | # only consider pairs with gs > 4 (from footnote 3) 94 | # def _lalign(x, y, ok, alpha=2): 95 | # return ((_norm(x) - _norm(y)).norm(dim=1).pow(alpha) * ok).sum() / ok.sum() 96 | # def _lunif(x, t=2): 97 | # sq_pdist = torch.pdist(_norm(x), p=2).pow(2) 98 | # return sq_pdist.mul(-t).exp().mean().log() 99 | # ok = (torch.Tensor(gs_scores) > 4).int() 100 | # align = _lalign( 101 | # torch.cat(all_enc1), 102 | # 
torch.cat(all_enc2), 103 | # ok).item() 104 | 105 | # consider all sentences (from footnote 3) 106 | # unif = _lunif(torch.cat(all_enc1 + all_enc2)).item() 107 | # logging.info(f'align {align}\t\t uniform {unif}') 108 | # results[dataset]['alignment'] = align 109 | # results[dataset]['uniformity'] = unif 110 | 111 | weights = [results[dset]['nsamples'] for dset in results.keys()] 112 | list_prs = np.array([results[dset]['pearson'][0] for 113 | dset in results.keys()]) 114 | list_spr = np.array([results[dset]['spearman'][0] for 115 | dset in results.keys()]) 116 | 117 | avg_pearson = np.average(list_prs) 118 | avg_spearman = np.average(list_spr) 119 | wavg_pearson = np.average(list_prs, weights=weights) 120 | wavg_spearman = np.average(list_spr, weights=weights) 121 | all_pearson = pearsonr(all_sys_scores, all_gs_scores) 122 | all_spearman = spearmanr(all_sys_scores, all_gs_scores) 123 | results['all'] = {'pearson': {'all': all_pearson[0], 124 | 'mean': avg_pearson, 125 | 'wmean': wavg_pearson}, 126 | 'spearman': {'all': all_spearman[0], 127 | 'mean': avg_spearman, 128 | 'wmean': wavg_spearman}} 129 | logging.debug('ALL : Pearson = %.4f, \ 130 | Spearman = %.4f' % (all_pearson[0], all_spearman[0])) 131 | logging.debug('ALL (weighted average) : Pearson = %.4f, \ 132 | Spearman = %.4f' % (wavg_pearson, wavg_spearman)) 133 | logging.debug('ALL (average) : Pearson = %.4f, \ 134 | Spearman = %.4f\n' % (avg_pearson, avg_spearman)) 135 | return results 136 | 137 | 138 | class STS12Eval(STSEval): 139 | def __init__(self, taskpath, seed=1111): 140 | logging.debug('***** Transfer task : STS12 *****\n\n') 141 | self.seed = seed 142 | self.datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl', 143 | 'surprise.OnWN', 'surprise.SMTnews'] 144 | self.loadFile(taskpath) 145 | 146 | 147 | class STS13Eval(STSEval): 148 | # STS13 here does not contain the "SMT" subtask due to LICENSE issue 149 | def __init__(self, taskpath, seed=1111): 150 | logging.debug('***** Transfer task : STS13 (-SMT) *****\n\n') 151 | self.seed = seed 152 | self.datasets = ['FNWN', 'headlines', 'OnWN'] 153 | self.loadFile(taskpath) 154 | 155 | 156 | class STS14Eval(STSEval): 157 | def __init__(self, taskpath, seed=1111): 158 | logging.debug('***** Transfer task : STS14 *****\n\n') 159 | self.seed = seed 160 | self.datasets = ['deft-forum', 'deft-news', 'headlines', 161 | 'images', 'OnWN', 'tweet-news'] 162 | self.loadFile(taskpath) 163 | 164 | 165 | class STS15Eval(STSEval): 166 | def __init__(self, taskpath, seed=1111): 167 | logging.debug('***** Transfer task : STS15 *****\n\n') 168 | self.seed = seed 169 | self.datasets = ['answers-forums', 'answers-students', 170 | 'belief', 'headlines', 'images'] 171 | self.loadFile(taskpath) 172 | 173 | 174 | class STS16Eval(STSEval): 175 | def __init__(self, taskpath, seed=1111): 176 | logging.debug('***** Transfer task : STS16 *****\n\n') 177 | self.seed = seed 178 | self.datasets = ['answer-answer', 'headlines', 'plagiarism', 179 | 'postediting', 'question-question'] 180 | self.loadFile(taskpath) 181 | 182 | 183 | class STSBenchmarkEval(STSEval): 184 | def __init__(self, task_path, seed=1111): 185 | logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n') 186 | self.seed = seed 187 | self.samples = [] 188 | train = self.loadFile(os.path.join(task_path, 'sts-train.csv')) 189 | dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv')) 190 | test = self.loadFile(os.path.join(task_path, 'sts-test.csv')) 191 | self.datasets = ['train', 'dev', 'test'] 192 | self.data = {'train': train, 'dev': dev, 
'test': test} 193 | 194 | def loadFile(self, fpath): 195 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 196 | with io.open(fpath, 'r', encoding='utf-8') as f: 197 | for line in f: 198 | text = line.strip().split('\t') 199 | sick_data['X_A'].append(text[5].split()) 200 | sick_data['X_B'].append(text[6].split()) 201 | sick_data['y'].append(text[4]) 202 | 203 | sick_data['y'] = [float(s) for s in sick_data['y']] 204 | self.samples += sick_data['X_A'] + sick_data["X_B"] 205 | return (sick_data['X_A'], sick_data["X_B"], sick_data['y']) 206 | 207 | class STSBenchmarkFinetune(SICKEval): 208 | def __init__(self, task_path, seed=1111): 209 | logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n') 210 | self.seed = seed 211 | train = self.loadFile(os.path.join(task_path, 'sts-train.csv')) 212 | dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv')) 213 | test = self.loadFile(os.path.join(task_path, 'sts-test.csv')) 214 | self.sick_data = {'train': train, 'dev': dev, 'test': test} 215 | 216 | def loadFile(self, fpath): 217 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 218 | with io.open(fpath, 'r', encoding='utf-8') as f: 219 | for line in f: 220 | text = line.strip().split('\t') 221 | sick_data['X_A'].append(text[5].split()) 222 | sick_data['X_B'].append(text[6].split()) 223 | sick_data['y'].append(text[4]) 224 | 225 | sick_data['y'] = [float(s) for s in sick_data['y']] 226 | return sick_data 227 | 228 | class SICKRelatednessEval(STSEval): 229 | def __init__(self, task_path, seed=1111): 230 | logging.debug('\n\n***** Transfer task : SICKRelatedness*****\n\n') 231 | self.seed = seed 232 | self.samples = [] 233 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt')) 234 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt')) 235 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt')) 236 | self.datasets = ['train', 'dev', 'test'] 237 | self.data = {'train': train, 'dev': dev, 'test': test} 238 | 239 | def loadFile(self, fpath): 240 | skipFirstLine = True 241 | sick_data = {'X_A': [], 'X_B': [], 'y': []} 242 | with io.open(fpath, 'r', encoding='utf-8') as f: 243 | for line in f: 244 | if skipFirstLine: 245 | skipFirstLine = False 246 | else: 247 | text = line.strip().split('\t') 248 | sick_data['X_A'].append(text[1].split()) 249 | sick_data['X_B'].append(text[2].split()) 250 | sick_data['y'].append(text[3]) 251 | 252 | sick_data['y'] = [float(s) for s in sick_data['y']] 253 | self.samples += sick_data['X_A'] + sick_data["X_B"] 254 | return (sick_data['X_A'], sick_data["X_B"], sick_data['y']) 255 | -------------------------------------------------------------------------------- /SentEval/README.md: -------------------------------------------------------------------------------- 1 | Our modification to SentEval: 2 | 3 | 1. Add the `all` setting to all STS tasks. 4 | 2. Change STS-B and SICK-R to not use an additional regressor. 5 | 6 | # SentEval: evaluation toolkit for sentence embeddings 7 | 8 | SentEval is a library for evaluating the quality of sentence embeddings. We assess their generalization power by using them as features on a broad and diverse set of "transfer" tasks. **SentEval currently includes 17 downstream tasks**. We also include a suite of **10 probing tasks** which evaluate what linguistic properties are encoded in sentence embeddings. Our goal is to ease the study and the development of general-purpose fixed-size sentence representations. 
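The `all` setting added to the STS tasks (modification 1 above) reports three aggregates over the per-subset results: a correlation computed over all concatenated (system, gold) score pairs, an unweighted mean of the per-subset correlations, and a mean weighted by subset size, mirroring the `results['all']` dictionary built in `senteval/sts.py`. A minimal sketch of that aggregation, using made-up scores purely for illustration:

```python
import numpy as np
from scipy.stats import spearmanr

# Hypothetical per-subset system predictions and gold scores (illustration only).
subsets = {
    'headlines': ([0.2, 0.8, 0.5, 0.9], [0.0, 1.0, 0.4, 0.8]),
    'images':    ([0.1, 0.7, 0.6],      [0.2, 0.9, 0.5]),
}

all_sys, all_gs, per_subset, weights = [], [], [], []
for name, (sys_scores, gs_scores) in subsets.items():
    per_subset.append(spearmanr(sys_scores, gs_scores)[0])
    weights.append(len(gs_scores))
    all_sys.extend(sys_scores)
    all_gs.extend(gs_scores)

print('all  :', spearmanr(all_sys, all_gs)[0])            # over concatenated pairs
print('mean :', np.average(per_subset))                   # unweighted mean
print('wmean:', np.average(per_subset, weights=weights))  # weighted by subset size
```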
9 | 10 | 11 | **(04/22) SentEval new tasks: Added probing tasks for evaluating what linguistic properties are encoded in sentence embeddings** 12 | 13 | **(10/04) SentEval example scripts for three sentence encoders: [SkipThought-LN](https://github.com/ryankiros/layer-norm#skip-thoughts)/[GenSen](https://github.com/Maluuba/gensen)/[Google-USE](https://tfhub.dev/google/universal-sentence-encoder/1)** 14 | 15 | ## Dependencies 16 | 17 | This code is written in python. The dependencies are: 18 | 19 | * Python 2/3 with [NumPy](http://www.numpy.org/)/[SciPy](http://www.scipy.org/) 20 | * [Pytorch](http://pytorch.org/)>=0.4 21 | * [scikit-learn](http://scikit-learn.org/stable/index.html)>=0.18.0 22 | 23 | ## Transfer tasks 24 | 25 | ### Downstream tasks 26 | SentEval allows you to evaluate your sentence embeddings as features for the following *downstream* tasks: 27 | 28 | | Task | Type | #train | #test | needs_train | set_classifier | 29 | |---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:| 30 | | [MR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | movie review | 11k | 11k | 1 | 1 | 31 | | [CR](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | product review | 4k | 4k | 1 | 1 | 32 | | [SUBJ](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | subjectivity status | 10k | 10k | 1 | 1 | 33 | | [MPQA](https://nlp.stanford.edu/~sidaw/home/projects:nbsvm) | opinion-polarity | 11k | 11k | 1 | 1 | 34 | | [SST](https://nlp.stanford.edu/sentiment/index.html) | binary sentiment analysis | 67k | 1.8k | 1 | 1 | 35 | | **[SST](https://nlp.stanford.edu/sentiment/index.html)** | **fine-grained sentiment analysis** | 8.5k | 2.2k | 1 | 1 | 36 | | [TREC](http://cogcomp.cs.illinois.edu/Data/QA/QC/) | question-type classification | 6k | 0.5k | 1 | 1 | 37 | | [SICK-E](http://clic.cimec.unitn.it/composes/sick.html) | natural language inference | 4.5k | 4.9k | 1 | 1 | 38 | | [SNLI](https://nlp.stanford.edu/projects/snli/) | natural language inference | 550k | 9.8k | 1 | 1 | 39 | | [MRPC](https://aclweb.org/aclwiki/Paraphrase_Identification_(State_of_the_art)) | paraphrase detection | 4.1k | 1.7k | 1 | 1 | 40 | | [STS 2012](https://www.cs.york.ac.uk/semeval-2012/task6/) | semantic textual similarity | N/A | 3.1k | 0 | 0 | 41 | | [STS 2013](http://ixa2.si.ehu.es/sts/) | semantic textual similarity | N/A | 1.5k | 0 | 0 | 42 | | [STS 2014](http://alt.qcri.org/semeval2014/task10/) | semantic textual similarity | N/A | 3.7k | 0 | 0 | 43 | | [STS 2015](http://alt.qcri.org/semeval2015/task2/) | semantic textual similarity | N/A | 8.5k | 0 | 0 | 44 | | [STS 2016](http://alt.qcri.org/semeval2016/task1/) | semantic textual similarity | N/A | 9.2k | 0 | 0 | 45 | | [STS B](http://ixa2.si.ehu.es/stswiki/index.php/STSbenchmark#Results) | semantic textual similarity | 5.7k | 1.4k | 1 | 0 | 46 | | [SICK-R](http://clic.cimec.unitn.it/composes/sick.html) | semantic textual similarity | 4.5k | 4.9k | 1 | 0 | 47 | | [COCO](http://mscoco.org/) | image-caption retrieval | 567k | 5*1k | 1 | 0 | 48 | 49 | where **needs_train** means a model with parameters is learned on top of the sentence embeddings, and **set_classifier** means you can define the parameters of the classifier in the case of a classification task (see below). 50 | 51 | Note: COCO comes with ResNet-101 2048d image embeddings. 
[More details on the tasks.](https://arxiv.org/pdf/1705.02364.pdf) 52 | 53 | ### Probing tasks 54 | SentEval also includes a series of [*probing* tasks](https://github.com/facebookresearch/SentEval/tree/master/data/probing) to evaluate what linguistic properties are encoded in your sentence embeddings: 55 | 56 | | Task | Type | #train | #test | needs_train | set_classifier | 57 | |---------- |------------------------------ |-----------:|----------:|:-----------:|:----------:| 58 | | [SentLen](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Length prediction | 100k | 10k | 1 | 1 | 59 | | [WC](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word Content analysis | 100k | 10k | 1 | 1 | 60 | | [TreeDepth](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Tree depth prediction | 100k | 10k | 1 | 1 | 61 | | [TopConst](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Top Constituents prediction | 100k | 10k | 1 | 1 | 62 | | [BShift](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Word order analysis | 100k | 10k | 1 | 1 | 63 | | [Tense](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Verb tense prediction | 100k | 10k | 1 | 1 | 64 | | [SubjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Subject number prediction | 100k | 10k | 1 | 1 | 65 | | [ObjNum](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Object number prediction | 100k | 10k | 1 | 1 | 66 | | [SOMO](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Semantic odd man out | 100k | 10k | 1 | 1 | 67 | | [CoordInv](https://github.com/facebookresearch/SentEval/tree/master/data/probing) | Coordination Inversion | 100k | 10k | 1 | 1 | 68 | 69 | ## Download datasets 70 | To get all the transfer tasks datasets, run (in data/downstream/): 71 | ```bash 72 | ./get_transfer_data.bash 73 | ``` 74 | This will automatically download and preprocess the downstream datasets, and store them in data/downstream (warning: for MacOS users, you may have to use p7zip instead of unzip). The probing tasks are already in data/probing by default. 75 | 76 | ## How to use SentEval: examples 77 | 78 | ### examples/bow.py 79 | 80 | In examples/bow.py, we evaluate the quality of the average of word embeddings. 81 | 82 | To download state-of-the-art fastText embeddings: 83 | 84 | ```bash 85 | curl -Lo glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip 86 | curl -Lo crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip 87 | ``` 88 | 89 | To reproduce the results for bag-of-vectors, run (in examples/): 90 | ```bash 91 | python bow.py 92 | ``` 93 | 94 | As required by SentEval, this script implements two functions: **prepare** (optional) and **batcher** (required) that turn text sentences into sentence embeddings. Then SentEval takes care of the evaluation on the transfer tasks using the embeddings as features. 
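As a rough illustration of that contract, here is a minimal bag-of-vectors sketch in the spirit of bow.py (the vector file path and the simplified parsing are assumptions; the actual script additionally handles tokenization and lets you pick GloVe or fastText vectors):

```python
import io
import numpy as np

def prepare(params, samples):
    # Build the task vocabulary and keep only the word vectors that occur in it.
    # 'crawl-300d-2M.vec' is the unzipped fastText file from above (assumed path).
    vocab = {word for sent in samples for word in sent}
    params.word_vec = {}
    with io.open('crawl-300d-2M.vec', encoding='utf-8') as f:
        for line in f:
            word, vec = line.split(' ', 1)
            if word in vocab:
                params.word_vec[word] = np.fromstring(vec, sep=' ')
    params.wvec_dim = 300

def batcher(params, batch):
    # Average the word vectors of each (already tokenized) sentence;
    # sentences with no known word fall back to a zero vector.
    embeddings = []
    for sent in batch:
        vecs = [params.word_vec[w] for w in sent if w in params.word_vec]
        if not vecs:
            vecs = [np.zeros(params.wvec_dim)]
        embeddings.append(np.mean(vecs, axis=0))
    return np.vstack(embeddings)
```

`prepare` is called once per task with all of its sentences, while `batcher` is called repeatedly on batches of `params.batch_size` sentences and must return one fixed-size vector per sentence.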
95 | 96 | ### examples/infersent.py 97 | 98 | To get the **[InferSent](https://www.github.com/facebookresearch/InferSent)** model and reproduce our results, download our best models and run infersent.py (in examples/): 99 | ```bash 100 | curl -Lo examples/infersent1.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent1.pkl 101 | curl -Lo examples/infersent2.pkl https://dl.fbaipublicfiles.com/senteval/infersent/infersent2.pkl 102 | ``` 103 | 104 | ### examples/skipthought.py - examples/gensen.py - examples/googleuse.py 105 | 106 | We also provide example scripts for three other encoders: 107 | 108 | * [SkipThought with Layer-Normalization](https://github.com/ryankiros/layer-norm#skip-thoughts) in Theano 109 | * [GenSen encoder](https://github.com/Maluuba/gensen) in Pytorch 110 | * [Google encoder](https://tfhub.dev/google/universal-sentence-encoder/1) in TensorFlow 111 | 112 | Note that for SkipThought and GenSen, following the steps in the associated GitHub repositories is necessary. 113 | The Google encoder script should work as-is. 114 | 115 | ## How to use SentEval 116 | 117 | To evaluate your sentence embeddings, SentEval requires that you implement two functions: 118 | 119 | 1. **prepare** (sees the whole dataset of each task and can thus construct the word vocabulary, the dictionary of word vectors etc) 120 | 2. **batcher** (transforms a batch of text sentences into sentence embeddings) 121 | 122 | 123 | ### 1.) prepare(params, samples) (optional) 124 | 125 | *batcher* only sees one batch at a time while the *samples* argument of *prepare* contains all the sentences of a task. 126 | 127 | ``` 128 | prepare(params, samples) 129 | ``` 130 | * *params*: senteval parameters. 131 | * *samples*: list of all sentences from the transfer task. 132 | * *output*: No output. Arguments stored in "params" can further be used by *batcher*. 133 | 134 | *Example*: in bow.py, prepare is used to build the vocabulary of words and construct the *params.word_vec* dictionary of word vectors. 135 | 136 | 137 | ### 2.) batcher(params, batch) 138 | ``` 139 | batcher(params, batch) 140 | ``` 141 | * *params*: senteval parameters. 142 | * *batch*: numpy array of text sentences (of size params.batch_size) 143 | * *output*: numpy array of sentence embeddings (of size params.batch_size) 144 | 145 | *Example*: in bow.py, batcher is used to compute the mean of the word vectors for each sentence in the batch using params.word_vec. Use your own encoder in that function to encode sentences. 146 | 147 | ### 3.) evaluation on transfer tasks 148 | 149 | After having implemented the batcher and prepare functions for your own sentence encoder, 150 | 151 | 1) to perform the actual evaluation, first import senteval and set its parameters: 152 | ```python 153 | import senteval 154 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10} 155 | ``` 156 | 157 | 2) (optional) set the parameters of the classifier (when applicable): 158 | ```python 159 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 160 | 'tenacity': 5, 'epoch_size': 4} 161 | ``` 162 | You can choose **nhid=0** (Logistic Regression) or **nhid>0** (MLP) and define the parameters for training. 
163 | 164 | 3) Create an instance of the class SE: 165 | ```python 166 | se = senteval.engine.SE(params, batcher, prepare) 167 | ``` 168 | 169 | 4) define the set of transfer tasks and run the evaluation: 170 | ```python 171 | transfer_tasks = ['MR', 'SICKEntailment', 'STS14', 'STSBenchmark'] 172 | results = se.eval(transfer_tasks) 173 | ``` 174 | The current list of available tasks is: 175 | ```python 176 | ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC', 'SNLI', 177 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark', 'ImageCaptionRetrieval', 178 | 'STS12', 'STS13', 'STS14', 'STS15', 'STS16', 179 | 'Length', 'WordContent', 'Depth', 'TopConstituents','BigramShift', 'Tense', 180 | 'SubjNumber', 'ObjNumber', 'OddManOut', 'CoordinationInversion'] 181 | ``` 182 | 183 | ## SentEval parameters 184 | Global parameters of SentEval: 185 | ```bash 186 | # senteval parameters 187 | task_path # path to SentEval datasets (required) 188 | seed # seed 189 | usepytorch # use cuda-pytorch (else scikit-learn) where possible 190 | kfold # k-fold validation for MR/CR/SUB/MPQA. 191 | ``` 192 | 193 | Parameters of the classifier: 194 | ```bash 195 | nhid: # number of hidden units (0: Logistic Regression, >0: MLP); Default nonlinearity: Tanh 196 | optim: # optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..) 197 | tenacity: # how many times dev acc does not increase before training stops 198 | epoch_size: # each epoch corresponds to epoch_size passes over the train set 199 | max_epoch: # max number of epochs 200 | dropout: # dropout for MLP 201 | ``` 202 | 203 | Note that to get a proxy of the results while **dramatically reducing computation time**, 204 | we suggest the **prototyping config**: 205 | ```python 206 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5} 207 | params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128, 208 | 'tenacity': 3, 'epoch_size': 2} 209 | ``` 210 | which will result in a 5 times speedup for classification tasks. 211 | 212 | To produce results that are **comparable to the literature**, use the **default config**: 213 | ```python 214 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10} 215 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64, 216 | 'tenacity': 5, 'epoch_size': 4} 217 | ``` 218 | which takes longer but will produce better and comparable results. 219 | 220 | For probing tasks, we used an MLP with a Sigmoid nonlinearity and tuned the nhid (in [50, 100, 200]) and dropout (in [0.0, 0.1, 0.2]) on the dev set. 221 | 222 | ## References 223 | 224 | Please consider citing [[1]](https://arxiv.org/abs/1803.05449) if using this code for evaluating sentence embedding methods. 225 | 226 | ### SentEval: An Evaluation Toolkit for Universal Sentence Representations 227 | 228 | [1] A. Conneau, D. Kiela, [*SentEval: An Evaluation Toolkit for Universal Sentence Representations*](https://arxiv.org/abs/1803.05449) 229 | 230 | ``` 231 | @article{conneau2018senteval, 232 | title={SentEval: An Evaluation Toolkit for Universal Sentence Representations}, 233 | author={Conneau, Alexis and Kiela, Douwe}, 234 | journal={arXiv preprint arXiv:1803.05449}, 235 | year={2018} 236 | } 237 | ``` 238 | 239 | Contact: [aconneau@fb.com](mailto:aconneau@fb.com), [dkiela@fb.com](mailto:dkiela@fb.com) 240 | 241 | ### Related work 242 | * [J. R Kiros, Y. Zhu, R. Salakhutdinov, R. S. Zemel, A. Torralba, R. Urtasun, S. Fidler - SkipThought Vectors, NIPS 2015](https://arxiv.org/abs/1506.06726) 243 | * [S. Arora, Y. Liang, T. 
Ma - A Simple but Tough-to-Beat Baseline for Sentence Embeddings, ICLR 2017](https://openreview.net/pdf?id=SyK00v5xx) 244 | * [Y. Adi, E. Kermany, Y. Belinkov, O. Lavi, Y. Goldberg - Fine-grained analysis of sentence embeddings using auxiliary prediction tasks, ICLR 2017](https://arxiv.org/abs/1608.04207) 245 | * [A. Conneau, D. Kiela, L. Barrault, H. Schwenk, A. Bordes - Supervised Learning of Universal Sentence Representations from Natural Language Inference Data, EMNLP 2017](https://arxiv.org/abs/1705.02364) 246 | * [S. Subramanian, A. Trischler, Y. Bengio, C. J Pal - Learning General Purpose Distributed Sentence Representations via Large Scale Multi-task Learning, ICLR 2018](https://arxiv.org/abs/1804.00079) 247 | * [A. Nie, E. D. Bennett, N. D. Goodman - DisSent: Sentence Representation Learning from Explicit Discourse Relations, 2018](https://arxiv.org/abs/1710.04334) 248 | * [D. Cer, Y. Yang, S. Kong, N. Hua, N. Limtiaco, R. St. John, N. Constant, M. Guajardo-Cespedes, S. Yuan, C. Tar, Y. Sung, B. Strope, R. Kurzweil - Universal Sentence Encoder, 2018](https://arxiv.org/abs/1803.11175) 249 | * [A. Conneau, G. Kruszewski, G. Lample, L. Barrault, M. Baroni - What you can cram into a single vector: Probing sentence embeddings for linguistic properties, ACL 2018](https://arxiv.org/abs/1805.01070) 250 | -------------------------------------------------------------------------------- /SentEval/senteval/tools/ranking.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2017-present, Facebook, Inc. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | """ 9 | Image Annotation/Search for COCO with Pytorch 10 | """ 11 | from __future__ import absolute_import, division, unicode_literals 12 | 13 | import logging 14 | import copy 15 | import numpy as np 16 | 17 | import torch 18 | from torch import nn 19 | from torch.autograd import Variable 20 | import torch.optim as optim 21 | 22 | 23 | class COCOProjNet(nn.Module): 24 | def __init__(self, config): 25 | super(COCOProjNet, self).__init__() 26 | self.imgdim = config['imgdim'] 27 | self.sentdim = config['sentdim'] 28 | self.projdim = config['projdim'] 29 | self.imgproj = nn.Sequential( 30 | nn.Linear(self.imgdim, self.projdim), 31 | ) 32 | self.sentproj = nn.Sequential( 33 | nn.Linear(self.sentdim, self.projdim), 34 | ) 35 | 36 | def forward(self, img, sent, imgc, sentc): 37 | # imgc : (bsize, ncontrast, imgdim) 38 | # sentc : (bsize, ncontrast, sentdim) 39 | # img : (bsize, imgdim) 40 | # sent : (bsize, sentdim) 41 | img = img.unsqueeze(1).expand_as(imgc).contiguous() 42 | img = img.view(-1, self.imgdim) 43 | imgc = imgc.view(-1, self.imgdim) 44 | sent = sent.unsqueeze(1).expand_as(sentc).contiguous() 45 | sent = sent.view(-1, self.sentdim) 46 | sentc = sentc.view(-1, self.sentdim) 47 | 48 | imgproj = self.imgproj(img) 49 | imgproj = imgproj / torch.sqrt(torch.pow(imgproj, 2).sum(1, keepdim=True)).expand_as(imgproj) 50 | imgcproj = self.imgproj(imgc) 51 | imgcproj = imgcproj / torch.sqrt(torch.pow(imgcproj, 2).sum(1, keepdim=True)).expand_as(imgcproj) 52 | sentproj = self.sentproj(sent) 53 | sentproj = sentproj / torch.sqrt(torch.pow(sentproj, 2).sum(1, keepdim=True)).expand_as(sentproj) 54 | sentcproj = self.sentproj(sentc) 55 | sentcproj = sentcproj / torch.sqrt(torch.pow(sentcproj, 2).sum(1, keepdim=True)).expand_as(sentcproj) 56 | # (bsize*ncontrast, projdim) 57 | 58 | 
anchor1 = torch.sum((imgproj*sentproj), 1) 59 | anchor2 = torch.sum((sentproj*imgproj), 1) 60 | img_sentc = torch.sum((imgproj*sentcproj), 1) 61 | sent_imgc = torch.sum((sentproj*imgcproj), 1) 62 | 63 | # (bsize*ncontrast) 64 | return anchor1, anchor2, img_sentc, sent_imgc 65 | 66 | def proj_sentence(self, sent): 67 | output = self.sentproj(sent) 68 | output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output) 69 | return output # (bsize, projdim) 70 | 71 | def proj_image(self, img): 72 | output = self.imgproj(img) 73 | output = output / torch.sqrt(torch.pow(output, 2).sum(1, keepdim=True)).expand_as(output) 74 | return output # (bsize, projdim) 75 | 76 | 77 | class PairwiseRankingLoss(nn.Module): 78 | """ 79 | Pairwise ranking loss 80 | """ 81 | def __init__(self, margin): 82 | super(PairwiseRankingLoss, self).__init__() 83 | self.margin = margin 84 | 85 | def forward(self, anchor1, anchor2, img_sentc, sent_imgc): 86 | 87 | cost_sent = torch.clamp(self.margin - anchor1 + img_sentc, 88 | min=0.0).sum() 89 | cost_img = torch.clamp(self.margin - anchor2 + sent_imgc, 90 | min=0.0).sum() 91 | loss = cost_sent + cost_img 92 | return loss 93 | 94 | 95 | class ImageSentenceRankingPytorch(object): 96 | # Image Sentence Ranking on COCO with Pytorch 97 | def __init__(self, train, valid, test, config): 98 | # fix seed 99 | self.seed = config['seed'] 100 | np.random.seed(self.seed) 101 | torch.manual_seed(self.seed) 102 | torch.cuda.manual_seed(self.seed) 103 | 104 | self.train = train 105 | self.valid = valid 106 | self.test = test 107 | 108 | self.imgdim = len(train['imgfeat'][0]) 109 | self.sentdim = len(train['sentfeat'][0]) 110 | self.projdim = config['projdim'] 111 | self.margin = config['margin'] 112 | 113 | self.batch_size = 128 114 | self.ncontrast = 30 115 | self.maxepoch = 20 116 | self.early_stop = True 117 | 118 | config_model = {'imgdim': self.imgdim,'sentdim': self.sentdim, 119 | 'projdim': self.projdim} 120 | self.model = COCOProjNet(config_model).cuda() 121 | 122 | self.loss_fn = PairwiseRankingLoss(margin=self.margin).cuda() 123 | 124 | self.optimizer = optim.Adam(self.model.parameters()) 125 | 126 | def prepare_data(self, trainTxt, trainImg, devTxt, devImg, 127 | testTxt, testImg): 128 | trainTxt = torch.FloatTensor(trainTxt) 129 | trainImg = torch.FloatTensor(trainImg) 130 | devTxt = torch.FloatTensor(devTxt).cuda() 131 | devImg = torch.FloatTensor(devImg).cuda() 132 | testTxt = torch.FloatTensor(testTxt).cuda() 133 | testImg = torch.FloatTensor(testImg).cuda() 134 | 135 | return trainTxt, trainImg, devTxt, devImg, testTxt, testImg 136 | 137 | def run(self): 138 | self.nepoch = 0 139 | bestdevscore = -1 140 | early_stop_count = 0 141 | stop_train = False 142 | 143 | # Preparing data 144 | logging.info('prepare data') 145 | trainTxt, trainImg, devTxt, devImg, testTxt, testImg = \ 146 | self.prepare_data(self.train['sentfeat'], self.train['imgfeat'], 147 | self.valid['sentfeat'], self.valid['imgfeat'], 148 | self.test['sentfeat'], self.test['imgfeat']) 149 | 150 | # Training 151 | while not stop_train and self.nepoch <= self.maxepoch: 152 | logging.info('start epoch') 153 | self.trainepoch(trainTxt, trainImg, devTxt, devImg, nepoches=1) 154 | logging.info('Epoch {0} finished'.format(self.nepoch)) 155 | 156 | results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0}, 157 | 't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0}, 158 | 'dev': bestdevscore} 159 | score = 0 160 | for i in range(5): 161 | devTxt_i = devTxt[i*5000:(i+1)*5000] 162 | devImg_i = 
devImg[i*5000:(i+1)*5000] 163 | # Compute dev ranks img2txt 164 | r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg_i, 165 | devTxt_i) 166 | results['i2t']['r1'] += r1_i2t / 5 167 | results['i2t']['r5'] += r5_i2t / 5 168 | results['i2t']['r10'] += r10_i2t / 5 169 | results['i2t']['medr'] += medr_i2t / 5 170 | logging.info("Image to text: {0}, {1}, {2}, {3}" 171 | .format(r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 172 | # Compute dev ranks txt2img 173 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg_i, 174 | devTxt_i) 175 | results['t2i']['r1'] += r1_t2i / 5 176 | results['t2i']['r5'] += r5_t2i / 5 177 | results['t2i']['r10'] += r10_t2i / 5 178 | results['t2i']['medr'] += medr_t2i / 5 179 | logging.info("Text to Image: {0}, {1}, {2}, {3}" 180 | .format(r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 181 | score += (r1_i2t + r5_i2t + r10_i2t + 182 | r1_t2i + r5_t2i + r10_t2i) / 5 183 | 184 | logging.info("Dev mean Text to Image: {0}, {1}, {2}, {3}".format( 185 | results['t2i']['r1'], results['t2i']['r5'], 186 | results['t2i']['r10'], results['t2i']['medr'])) 187 | logging.info("Dev mean Image to text: {0}, {1}, {2}, {3}".format( 188 | results['i2t']['r1'], results['i2t']['r5'], 189 | results['i2t']['r10'], results['i2t']['medr'])) 190 | 191 | # early stop on Pearson 192 | if score > bestdevscore: 193 | bestdevscore = score 194 | bestmodel = copy.deepcopy(self.model) 195 | elif self.early_stop: 196 | if early_stop_count >= 3: 197 | stop_train = True 198 | early_stop_count += 1 199 | self.model = bestmodel 200 | 201 | # Compute test for the 5 splits 202 | results = {'i2t': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0}, 203 | 't2i': {'r1': 0, 'r5': 0, 'r10': 0, 'medr': 0}, 204 | 'dev': bestdevscore} 205 | for i in range(5): 206 | testTxt_i = testTxt[i*5000:(i+1)*5000] 207 | testImg_i = testImg[i*5000:(i+1)*5000] 208 | # Compute test ranks img2txt 209 | r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(testImg_i, testTxt_i) 210 | results['i2t']['r1'] += r1_i2t / 5 211 | results['i2t']['r5'] += r5_i2t / 5 212 | results['i2t']['r10'] += r10_i2t / 5 213 | results['i2t']['medr'] += medr_i2t / 5 214 | # Compute test ranks txt2img 215 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(testImg_i, testTxt_i) 216 | results['t2i']['r1'] += r1_t2i / 5 217 | results['t2i']['r5'] += r5_t2i / 5 218 | results['t2i']['r10'] += r10_t2i / 5 219 | results['t2i']['medr'] += medr_t2i / 5 220 | 221 | return bestdevscore, results['i2t']['r1'], results['i2t']['r5'], \ 222 | results['i2t']['r10'], results['i2t']['medr'], \ 223 | results['t2i']['r1'], results['t2i']['r5'], \ 224 | results['t2i']['r10'], results['t2i']['medr'] 225 | 226 | def trainepoch(self, trainTxt, trainImg, devTxt, devImg, nepoches=1): 227 | self.model.train() 228 | for _ in range(self.nepoch, self.nepoch + nepoches): 229 | permutation = list(np.random.permutation(len(trainTxt))) 230 | all_costs = [] 231 | for i in range(0, len(trainTxt), self.batch_size): 232 | # forward 233 | if i % (self.batch_size*500) == 0 and i > 0: 234 | logging.info('samples : {0}'.format(i)) 235 | r1_i2t, r5_i2t, r10_i2t, medr_i2t = self.i2t(devImg, 236 | devTxt) 237 | logging.info("Image to text: {0}, {1}, {2}, {3}".format( 238 | r1_i2t, r5_i2t, r10_i2t, medr_i2t)) 239 | # Compute test ranks txt2img 240 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = self.t2i(devImg, 241 | devTxt) 242 | logging.info("Text to Image: {0}, {1}, {2}, {3}".format( 243 | r1_t2i, r5_t2i, r10_t2i, medr_t2i)) 244 | idx = torch.LongTensor(permutation[i:i + self.batch_size]) 245 | imgbatch = Variable(trainImg.index_select(0, idx)).cuda() 
246 | sentbatch = Variable(trainTxt.index_select(0, idx)).cuda() 247 | 248 | idximgc = np.random.choice(permutation[:i] + 249 | permutation[i + self.batch_size:], 250 | self.ncontrast*idx.size(0)) 251 | idxsentc = np.random.choice(permutation[:i] + 252 | permutation[i + self.batch_size:], 253 | self.ncontrast*idx.size(0)) 254 | idximgc = torch.LongTensor(idximgc) 255 | idxsentc = torch.LongTensor(idxsentc) 256 | # Get indexes for contrastive images and sentences 257 | imgcbatch = Variable(trainImg.index_select(0, idximgc)).view( 258 | -1, self.ncontrast, self.imgdim).cuda() 259 | sentcbatch = Variable(trainTxt.index_select(0, idxsentc)).view( 260 | -1, self.ncontrast, self.sentdim).cuda() 261 | 262 | anchor1, anchor2, img_sentc, sent_imgc = self.model( 263 | imgbatch, sentbatch, imgcbatch, sentcbatch) 264 | # loss 265 | loss = self.loss_fn(anchor1, anchor2, img_sentc, sent_imgc) 266 | all_costs.append(loss.data.item()) 267 | # backward 268 | self.optimizer.zero_grad() 269 | loss.backward() 270 | # Update parameters 271 | self.optimizer.step() 272 | self.nepoch += nepoches 273 | 274 | def t2i(self, images, captions): 275 | """ 276 | Images: (5N, imgdim) matrix of images 277 | Captions: (5N, sentdim) matrix of captions 278 | """ 279 | with torch.no_grad(): 280 | # Project images and captions 281 | img_embed, sent_embed = [], [] 282 | for i in range(0, len(images), self.batch_size): 283 | img_embed.append(self.model.proj_image( 284 | Variable(images[i:i + self.batch_size]))) 285 | sent_embed.append(self.model.proj_sentence( 286 | Variable(captions[i:i + self.batch_size]))) 287 | img_embed = torch.cat(img_embed, 0).data 288 | sent_embed = torch.cat(sent_embed, 0).data 289 | 290 | npts = int(img_embed.size(0) / 5) 291 | idxs = torch.cuda.LongTensor(range(0, len(img_embed), 5)) 292 | ims = img_embed.index_select(0, idxs) 293 | 294 | ranks = np.zeros(5 * npts) 295 | for index in range(npts): 296 | 297 | # Get query captions 298 | queries = sent_embed[5*index: 5*index + 5] 299 | 300 | # Compute scores 301 | scores = torch.mm(queries, ims.transpose(0, 1)).cpu().numpy() 302 | inds = np.zeros(scores.shape) 303 | for i in range(len(inds)): 304 | inds[i] = np.argsort(scores[i])[::-1] 305 | ranks[5 * index + i] = np.where(inds[i] == index)[0][0] 306 | 307 | # Compute metrics 308 | r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks) 309 | r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks) 310 | r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks) 311 | medr = np.floor(np.median(ranks)) + 1 312 | return (r1, r5, r10, medr) 313 | 314 | def i2t(self, images, captions): 315 | """ 316 | Images: (5N, imgdim) matrix of images 317 | Captions: (5N, sentdim) matrix of captions 318 | """ 319 | with torch.no_grad(): 320 | # Project images and captions 321 | img_embed, sent_embed = [], [] 322 | for i in range(0, len(images), self.batch_size): 323 | img_embed.append(self.model.proj_image( 324 | Variable(images[i:i + self.batch_size]))) 325 | sent_embed.append(self.model.proj_sentence( 326 | Variable(captions[i:i + self.batch_size]))) 327 | img_embed = torch.cat(img_embed, 0).data 328 | sent_embed = torch.cat(sent_embed, 0).data 329 | 330 | npts = int(img_embed.size(0) / 5) 331 | index_list = [] 332 | 333 | ranks = np.zeros(npts) 334 | for index in range(npts): 335 | 336 | # Get query image 337 | query_img = img_embed[5 * index] 338 | 339 | # Compute scores 340 | scores = torch.mm(query_img.view(1, -1), 341 | sent_embed.transpose(0, 1)).view(-1) 342 | scores = scores.cpu().numpy() 343 | inds = 
np.argsort(scores)[::-1] 344 | index_list.append(inds[0]) 345 | 346 | # Score 347 | rank = 1e20 348 | for i in range(5*index, 5*index + 5, 1): 349 | tmp = np.where(inds == i)[0][0] 350 | if tmp < rank: 351 | rank = tmp 352 | ranks[index] = rank 353 | 354 | # Compute metrics 355 | r1 = 100.0 * len(np.where(ranks < 1)[0]) / len(ranks) 356 | r5 = 100.0 * len(np.where(ranks < 5)[0]) / len(ranks) 357 | r10 = 100.0 * len(np.where(ranks < 10)[0]) / len(ranks) 358 | medr = np.floor(np.median(ranks)) + 1 359 | return (r1, r5, r10, medr) 360 | --------------------------------------------------------------------------------
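Both `i2t` and `t2i` above end with the same reduction from an array of ranks to recall-at-K and median rank. A small self-contained sketch of just that last step, with toy ranks for illustration:

```python
import numpy as np

def ranking_metrics(ranks):
    # ranks[i] is the 0-based position of the correct item for query i,
    # which is why recall@1 counts ranks < 1 and medr is shifted to be 1-based.
    ranks = np.asarray(ranks, dtype=float)
    r1 = 100.0 * np.mean(ranks < 1)
    r5 = 100.0 * np.mean(ranks < 5)
    r10 = 100.0 * np.mean(ranks < 10)
    medr = np.floor(np.median(ranks)) + 1
    return float(r1), float(r5), float(r10), float(medr)

# Toy example: 4 queries whose correct items were ranked 0, 3, 7 and 20.
print(ranking_metrics([0, 3, 7, 20]))  # (25.0, 50.0, 75.0, 6.0)
```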