├── SentEval
│ ├── senteval
│ │ ├── tools
│ │ │ ├── __init__.py
│ │ │ ├── relatedness.py
│ │ │ ├── classifier.py
│ │ │ └── validation.py
│ │ ├── __init__.py
│ │ ├── utils.py
│ │ ├── trec.py
│ │ ├── binary.py
│ │ ├── sst.py
│ │ ├── mrpc.py
│ │ ├── snli.py
│ │ ├── rank.py
│ │ ├── engine.py
│ │ ├── probing.py
│ │ ├── sick.py
│ │ └── sts.py
│ ├── data
│ │ └── downstream
│ │     └── download_dataset.sh
│ ├── setup.py
│ ├── LICENSE
│ ├── examples
│ │ ├── skipthought.py
│ │ ├── googleuse.py
│ │ ├── gensen.py
│ │ ├── infersent.py
│ │ ├── bow.py
│ │ └── models.py
│ └── README.md
├── simcse
│ ├── __init__.py
│ ├── tool.py
│ ├── models.py
│ ├── models_HSCL.py
│ └── models_aug.py
├── data
│ └── .gitignore
├── requirements.txt
├── scripts
│ ├── eval.sh
│ └── sup_train_mp.sh
├── LICENSE
├── simcse_to_huggingface.py
├── evaluation.py
└── README.md
/SentEval/senteval/tools/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/simcse/__init__.py:
--------------------------------------------------------------------------------
1 | from .tool import SimCSE
2 |
--------------------------------------------------------------------------------
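
The package's only export is the SimCSE wrapper defined in tool.py. Below is a minimal usage sketch, assuming the bundled tool.py keeps the upstream SimCSE tool interface (encode / similarity); the checkpoint name is the one referenced in scripts/eval.sh.

    from simcse import SimCSE

    # Load a trained SynCSE/SimCSE checkpoint from the Hugging Face Hub (or a local path).
    model = SimCSE("sjtu-lit/SynCSE-partial-RoBERTa-base")

    # Sentence embeddings and a pairwise similarity score.
    embeddings = model.encode(["A man is playing a guitar."])
    score = model.similarity("A man is playing a guitar.",
                             "A person is playing an instrument.")
    print(embeddings.shape, score)
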
/data/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore everything in this directory
2 | *
3 | # Except this file
4 | !.gitignore
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | transformers==4.2.1
2 | scipy
3 | datasets
4 | pandas
5 | scikit-learn
6 | prettytable
7 | gradio
8 | setuptools
--------------------------------------------------------------------------------
/SentEval/data/downstream/download_dataset.sh:
--------------------------------------------------------------------------------
1 | wget https://huggingface.co/datasets/princeton-nlp/datasets-for-simcse/resolve/main/senteval.tar
2 | tar xvf senteval.tar
3 |
--------------------------------------------------------------------------------
/scripts/eval.sh:
--------------------------------------------------------------------------------
1 | path=sjtu-lit/SynCSE-partial-RoBERTa-base
2 | python simcse_to_huggingface.py --path ${path}
3 | CUDA_VISIBLE_DEVICES=0 python evaluation.py \
4 | --model_name_or_path ${path} \
5 | --pooler cls \
6 | --task_set sts \
7 | --mode test
8 |
--------------------------------------------------------------------------------
/SentEval/senteval/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | from __future__ import absolute_import
9 |
10 | from senteval.engine import SE
11 |
--------------------------------------------------------------------------------
/SentEval/setup.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | import io
9 | from setuptools import setup, find_packages
10 |
11 | with io.open('./README.md', encoding='utf-8') as f:
12 | readme = f.read()
13 |
14 | setup(
15 | name='SentEval',
16 | version='0.1.0',
17 | url='https://github.com/facebookresearch/SentEval',
18 | packages=find_packages(exclude=['examples']),
19 | license='Attribution-NonCommercial 4.0 International',
20 | long_description=readme,
21 | )
22 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2023 Language Intelligence and Technology group @ SJTU
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/scripts/sup_train_mp.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # In this example, we show how to train SimCSE on the supervised NLI dataset using multiple GPUs and PyTorch's distributed data parallel.
4 | # Set how many GPUs to use
5 |
6 | NUM_GPU=1
7 |
8 | # Randomly set a port number
9 | # If you encounter an "address already in use" error, just run again or manually set an available port.
10 | PORT_ID=$(expr $RANDOM + 1000)
11 |
12 | # Limit each process to a single OpenMP thread
13 | export OMP_NUM_THREADS=1
14 |
15 | # Use distributed data parallel
16 | # If you only want to use one card, uncomment the following line and comment the line with "torch.distributed.launch"
17 | # python train.py \
18 |
19 | model=roberta-base
20 | dataset=sjtu-lit/SynCSE-partial-NLI
21 | CUDA_VISIBLE_DEVICES=0 python -m torch.distributed.launch --nproc_per_node $NUM_GPU --master_port $PORT_ID train.py \
22 | --model_name_or_path ${model} \
23 | --train_file ${dataset} \
24 | --output_dir result/my-sup-simcse-${model}_${dataset} \
25 | --num_train_epochs 3 \
26 | --per_device_train_batch_size 512 \
27 | --learning_rate 5e-5 \
28 | --max_seq_length 32 \
29 | --evaluation_strategy steps \
30 | --metric_for_best_model avg_sts \
31 | --load_best_model_at_end \
32 | --eval_steps 25 \
33 | --pooler_type cls \
34 | --overwrite_output_dir \
35 | --temp 0.05 \
36 | --do_train \
37 | --do_eval \
38 | --fp16 \
39 | --seed 42 \
40 | --do_mlm \
41 | --hard_negative_weight 0 \
42 | "$@"
43 |
--------------------------------------------------------------------------------
/simcse_to_huggingface.py:
--------------------------------------------------------------------------------
1 | """
2 | Convert SimCSE's checkpoints to Huggingface style.
3 | """
4 |
5 | import argparse
6 | import torch
7 | import os
8 | import json
9 |
10 |
11 | def main():
12 | parser = argparse.ArgumentParser()
13 | parser.add_argument("--path", type=str, help="Path of SimCSE checkpoint folder")
14 | args = parser.parse_args()
15 |
16 | print("SimCSE checkpoint -> Huggingface checkpoint for {}".format(args.path))
17 |
18 | state_dict = torch.load(os.path.join(args.path, "pytorch_model.bin"), map_location=torch.device("cpu"))
19 | new_state_dict = {}
20 | for key, param in state_dict.items():
21 |         # Replace "mlp" with "pooler"
22 | if "mlp" in key:
23 | key = key.replace("mlp", "pooler")
24 |
25 | # Delete "bert" or "roberta" prefix
26 | if "bert." in key:
27 | key = key.replace("bert.", "")
28 | if "roberta." in key:
29 | key = key.replace("roberta.", "")
30 |
31 | new_state_dict[key] = param
32 |
33 | torch.save(new_state_dict, os.path.join(args.path, "pytorch_model.bin"))
34 |
35 | # Change architectures in config.json
36 | config = json.load(open(os.path.join(args.path, "config.json")))
37 | for i in range(len(config["architectures"])):
38 | config["architectures"][i] = config["architectures"][i].replace("ForCL", "Model")
39 | json.dump(config, open(os.path.join(args.path, "config.json"), "w"), indent=2)
40 |
41 |
42 | if __name__ == "__main__":
43 | main()
44 |
--------------------------------------------------------------------------------
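
After conversion, the folder is a standard Huggingface checkpoint. A small loading sketch, assuming a hypothetical local path result/my-model that has already been processed by the script above:

    import torch
    from transformers import AutoModel, AutoTokenizer

    path = "result/my-model"  # hypothetical folder converted by simcse_to_huggingface.py
    tokenizer = AutoTokenizer.from_pretrained(path)
    model = AutoModel.from_pretrained(path)

    # Take the [CLS] token's last hidden state as a simple sentence embedding.
    inputs = tokenizer(["A man is playing a guitar."], return_tensors="pt")
    with torch.no_grad():
        cls_embedding = model(**inputs).last_hidden_state[:, 0]
    print(cls_embedding.shape)
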
/SentEval/LICENSE:
--------------------------------------------------------------------------------
1 | BSD License
2 |
3 | For SentEval software
4 |
5 | Copyright (c) 2017-present, Facebook, Inc. All rights reserved.
6 |
7 | Redistribution and use in source and binary forms, with or without modification,
8 | are permitted provided that the following conditions are met:
9 |
10 | * Redistributions of source code must retain the above copyright notice, this
11 | list of conditions and the following disclaimer.
12 |
13 | * Redistributions in binary form must reproduce the above copyright notice,
14 | this list of conditions and the following disclaimer in the documentation
15 | and/or other materials provided with the distribution.
16 |
17 | * Neither the name Facebook nor the names of its contributors may be used to
18 | endorse or promote products derived from this software without specific
19 | prior written permission.
20 |
21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 |
--------------------------------------------------------------------------------
/SentEval/examples/skipthought.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | from __future__ import absolute_import, division, unicode_literals
9 |
10 | """
11 | Example file for evaluating SkipThought with SentEval
12 | """
13 | import logging
14 | import sys
15 | sys.setdefaultencoding('utf8')
16 |
17 |
18 | # Set PATHs
19 | PATH_TO_SENTEVAL = '../'
20 | PATH_TO_DATA = '../data/senteval_data/'
21 | PATH_TO_SKIPTHOUGHT = ''
22 |
23 | assert PATH_TO_SKIPTHOUGHT != '', 'Download skipthought and set correct PATH'
24 |
25 | # import skipthought and Senteval
26 | sys.path.insert(0, PATH_TO_SKIPTHOUGHT)
27 | import skipthoughts
28 | sys.path.insert(0, PATH_TO_SENTEVAL)
29 | import senteval
30 |
31 |
32 | def prepare(params, samples):
33 | return
34 |
35 | def batcher(params, batch):
36 | batch = [str(' '.join(sent), errors="ignore") if sent != [] else '.' for sent in batch]
37 | embeddings = skipthoughts.encode(params['encoder'], batch,
38 | verbose=False, use_eos=True)
39 | return embeddings
40 |
41 |
42 | # Set params for SentEval
43 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10, 'batch_size': 512}
44 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
45 | 'tenacity': 5, 'epoch_size': 4}
46 | # Set up logger
47 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
48 |
49 | if __name__ == "__main__":
50 | # Load SkipThought model
51 | params_senteval['encoder'] = skipthoughts.load_model()
52 |
53 | se = senteval.engine.SE(params_senteval, batcher, prepare)
54 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
55 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
56 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
57 | 'Length', 'WordContent', 'Depth', 'TopConstituents',
58 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
59 | 'OddManOut', 'CoordinationInversion']
60 | results = se.eval(transfer_tasks)
61 | print(results)
62 |
--------------------------------------------------------------------------------
/SentEval/examples/googleuse.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | from __future__ import absolute_import, division
9 |
10 | import os
11 | import sys
12 | import logging
13 | import tensorflow as tf
14 | import tensorflow_hub as hub
15 | tf.logging.set_verbosity(0)
16 |
17 | # Set PATHs
18 | PATH_TO_SENTEVAL = '../'
19 | PATH_TO_DATA = '../data'
20 |
21 | # import SentEval
22 | sys.path.insert(0, PATH_TO_SENTEVAL)
23 | import senteval
24 |
25 | # tensorflow session
26 | session = tf.Session()
27 | os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
28 |
29 | # SentEval prepare and batcher
30 | def prepare(params, samples):
31 | return
32 |
33 | def batcher(params, batch):
34 | batch = [' '.join(sent) if sent != [] else '.' for sent in batch]
35 | embeddings = params['google_use'](batch)
36 | return embeddings
37 |
38 | def make_embed_fn(module):
39 | with tf.Graph().as_default():
40 | sentences = tf.placeholder(tf.string)
41 | embed = hub.Module(module)
42 | embeddings = embed(sentences)
43 | session = tf.train.MonitoredSession()
44 | return lambda x: session.run(embeddings, {sentences: x})
45 |
46 | # Start TF session and load Google Universal Sentence Encoder
47 | encoder = make_embed_fn("https://tfhub.dev/google/universal-sentence-encoder-large/2")
48 |
49 | # Set params for SentEval
50 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
51 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
52 | 'tenacity': 3, 'epoch_size': 2}
53 | params_senteval['google_use'] = encoder
54 |
55 | # Set up logger
56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
57 |
58 | if __name__ == "__main__":
59 | se = senteval.engine.SE(params_senteval, batcher, prepare)
60 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
61 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
62 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
63 | 'Length', 'WordContent', 'Depth', 'TopConstituents',
64 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
65 | 'OddManOut', 'CoordinationInversion']
66 | results = se.eval(transfer_tasks)
67 | print(results)
68 |
--------------------------------------------------------------------------------
/SentEval/examples/gensen.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | """
9 | Clone the GenSen repo from https://github.com/Maluuba/gensen.git
10 | and follow its instructions for loading the model used in the batcher.
11 | """
12 |
13 | from __future__ import absolute_import, division, unicode_literals
14 |
15 | import sys
16 | import logging
17 | # import GenSen package
18 | from gensen import GenSen, GenSenSingle
19 |
20 | # Set PATHs
21 | PATH_TO_SENTEVAL = '../'
22 | PATH_TO_DATA = '../data'
23 |
24 | # import SentEval
25 | sys.path.insert(0, PATH_TO_SENTEVAL)
26 | import senteval
27 |
28 | # SentEval prepare and batcher
29 | def prepare(params, samples):
30 | return
31 |
32 | def batcher(params, batch):
33 | batch = [' '.join(sent) if sent != [] else '.' for sent in batch]
34 |     _, reps_h_t = params['gensen'].get_representation(
35 |         batch, pool='last', return_numpy=True, tokenize=True
36 | )
37 | embeddings = reps_h_t
38 | return embeddings
39 |
40 | # Load GenSen model
41 | gensen_1 = GenSenSingle(
42 | model_folder='../data/models',
43 | filename_prefix='nli_large_bothskip',
44 | pretrained_emb='../data/embedding/glove.840B.300d.h5'
45 | )
46 | gensen_2 = GenSenSingle(
47 | model_folder='../data/models',
48 | filename_prefix='nli_large_bothskip_parse',
49 | pretrained_emb='../data/embedding/glove.840B.300d.h5'
50 | )
51 | gensen_encoder = GenSen(gensen_1, gensen_2)
52 | reps_h, reps_h_t = gensen_encoder.get_representation(  # sanity check on a toy sentence
53 |     ['test sentence .'], pool='last', return_numpy=True, tokenize=True
54 | )
55 |
56 | # Set params for SentEval
57 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
58 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
59 | 'tenacity': 3, 'epoch_size': 2}
60 | params_senteval['gensen'] = gensen_encoder
61 |
62 | # Set up logger
63 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
64 |
65 | if __name__ == "__main__":
66 | se = senteval.engine.SE(params_senteval, batcher, prepare)
67 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
68 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
69 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
70 | 'Length', 'WordContent', 'Depth', 'TopConstituents',
71 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
72 | 'OddManOut', 'CoordinationInversion']
73 | results = se.eval(transfer_tasks)
74 | print(results)
75 |
--------------------------------------------------------------------------------
/SentEval/examples/infersent.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | """
9 | InferSent models. See https://github.com/facebookresearch/InferSent.
10 | """
11 |
12 | from __future__ import absolute_import, division, unicode_literals
13 |
14 | import sys
15 | import os
16 | import torch
17 | import logging
18 |
19 | # get models.py from InferSent repo
20 | from models import InferSent
21 |
22 | # Set PATHs
23 | PATH_SENTEVAL = '../'
24 | PATH_TO_DATA = '../data'
25 | PATH_TO_W2V = 'PATH/TO/glove.840B.300d.txt' # or crawl-300d-2M.vec for V2
26 | MODEL_PATH = 'infersent1.pkl'
27 | V = 1 # version of InferSent
28 |
29 | assert os.path.isfile(MODEL_PATH) and os.path.isfile(PATH_TO_W2V), \
30 | 'Set MODEL and GloVe PATHs'
31 |
32 | # import senteval
33 | sys.path.insert(0, PATH_SENTEVAL)
34 | import senteval
35 |
36 |
37 | def prepare(params, samples):
38 | params.infersent.build_vocab([' '.join(s) for s in samples], tokenize=False)
39 |
40 |
41 | def batcher(params, batch):
42 | sentences = [' '.join(s) for s in batch]
43 | embeddings = params.infersent.encode(sentences, bsize=params.batch_size, tokenize=False)
44 | return embeddings
45 |
46 |
47 | """
48 | Evaluation of trained model on Transfer Tasks (SentEval)
49 | """
50 |
51 | # define senteval params
52 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
53 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
54 | 'tenacity': 3, 'epoch_size': 2}
55 | # Set up logger
56 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
57 |
58 | if __name__ == "__main__":
59 | # Load InferSent model
60 | params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
61 | 'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
62 | model = InferSent(params_model)
63 | model.load_state_dict(torch.load(MODEL_PATH))
64 | model.set_w2v_path(PATH_TO_W2V)
65 |
66 | params_senteval['infersent'] = model.cuda()
67 |
68 | se = senteval.engine.SE(params_senteval, batcher, prepare)
69 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
70 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
71 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
72 | 'Length', 'WordContent', 'Depth', 'TopConstituents',
73 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
74 | 'OddManOut', 'CoordinationInversion']
75 | results = se.eval(transfer_tasks)
76 | print(results)
77 |
--------------------------------------------------------------------------------
/SentEval/senteval/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | from __future__ import absolute_import, division, unicode_literals
9 |
10 | import numpy as np
11 | import re
12 | import inspect
13 | from torch import optim
14 |
15 |
16 | def create_dictionary(sentences):
17 | words = {}
18 | for s in sentences:
19 | for word in s:
20 | if word in words:
21 | words[word] += 1
22 | else:
23 | words[word] = 1
24 |     words['<s>'] = 1e9 + 4
25 |     words['</s>'] = 1e9 + 3
26 |     words['<p>'] = 1e9 + 2
27 |     # words['<p>'] = 1e9 + 2
43 |
44 | sorted_words = sorted(words.items(), key=lambda x: -x[1]) # inverse sort
45 | id2word = []
46 | word2id = {}
47 | for i, (w, _) in enumerate(sorted_words):
48 | id2word.append(w)
49 | word2id[w] = i
50 |
51 | return id2word, word2id
52 |
53 | # Get word vectors from vocabulary (glove, word2vec, fasttext ..)
54 | def get_wordvec(path_to_vec, word2id):
55 | word_vec = {}
56 |
57 | with io.open(path_to_vec, 'r', encoding='utf-8') as f:
58 | # if word2vec or fasttext file : skip first line "next(f)"
59 | for line in f:
60 | word, vec = line.split(' ', 1)
61 | if word in word2id:
62 | word_vec[word] = np.fromstring(vec, sep=' ')
63 |
64 | logging.info('Found {0} words with word vectors, out of \
65 | {1} words'.format(len(word_vec), len(word2id)))
66 | return word_vec
67 |
68 |
69 | # SentEval prepare and batcher
70 | def prepare(params, samples):
71 | _, params.word2id = create_dictionary(samples)
72 | params.word_vec = get_wordvec(PATH_TO_VEC, params.word2id)
73 | params.wvec_dim = 300
74 | return
75 |
76 | def batcher(params, batch):
77 | batch = [sent if sent != [] else ['.'] for sent in batch]
78 | embeddings = []
79 |
80 | for sent in batch:
81 | sentvec = []
82 | for word in sent:
83 | if word in params.word_vec:
84 | sentvec.append(params.word_vec[word])
85 | if not sentvec:
86 | vec = np.zeros(params.wvec_dim)
87 | sentvec.append(vec)
88 | sentvec = np.mean(sentvec, 0)
89 | embeddings.append(sentvec)
90 |
91 | embeddings = np.vstack(embeddings)
92 | return embeddings
93 |
94 |
95 | # Set params for SentEval
96 | params_senteval = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
97 | params_senteval['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
98 | 'tenacity': 3, 'epoch_size': 2}
99 |
100 | # Set up logger
101 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
102 |
103 | if __name__ == "__main__":
104 | se = senteval.engine.SE(params_senteval, batcher, prepare)
105 | transfer_tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
106 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
107 | 'SICKEntailment', 'SICKRelatedness', 'STSBenchmark',
108 | 'Length', 'WordContent', 'Depth', 'TopConstituents',
109 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
110 | 'OddManOut', 'CoordinationInversion']
111 | results = se.eval(transfer_tasks)
112 | print(results)
113 |
--------------------------------------------------------------------------------
/SentEval/senteval/binary.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | Binary classifier and corresponding datasets : MR, CR, SUBJ, MPQA
10 | '''
11 | from __future__ import absolute_import, division, unicode_literals
12 |
13 | import io
14 | import os
15 | import numpy as np
16 | import logging
17 |
18 | from senteval.tools.validation import InnerKFoldClassifier
19 |
20 |
21 | class BinaryClassifierEval(object):
22 | def __init__(self, pos, neg, seed=1111):
23 | self.seed = seed
24 | self.samples, self.labels = pos + neg, [1] * len(pos) + [0] * len(neg)
25 | self.n_samples = len(self.samples)
26 |
27 | def do_prepare(self, params, prepare):
28 | # prepare is given the whole text
29 | return prepare(params, self.samples)
30 | # prepare puts everything it outputs in "params" : params.word2id etc
31 | # Those output will be further used by "batcher".
32 |
33 | def loadFile(self, fpath):
34 | with io.open(fpath, 'r', encoding='latin-1') as f:
35 | return [line.split() for line in f.read().splitlines()]
36 |
37 | def run(self, params, batcher):
38 | enc_input = []
39 | # Sort to reduce padding
40 | sorted_corpus = sorted(zip(self.samples, self.labels),
41 | key=lambda z: (len(z[0]), z[1]))
42 | sorted_samples = [x for (x, y) in sorted_corpus]
43 | sorted_labels = [y for (x, y) in sorted_corpus]
44 | logging.info('Generating sentence embeddings')
45 | for ii in range(0, self.n_samples, params.batch_size):
46 | batch = sorted_samples[ii:ii + params.batch_size]
47 | embeddings = batcher(params, batch)
48 | enc_input.append(embeddings)
49 | enc_input = np.vstack(enc_input)
50 | logging.info('Generated sentence embeddings')
51 |
52 | config = {'nclasses': 2, 'seed': self.seed,
53 | 'usepytorch': params.usepytorch,
54 | 'classifier': params.classifier,
55 | 'nhid': params.nhid, 'kfold': params.kfold}
56 | clf = InnerKFoldClassifier(enc_input, np.array(sorted_labels), config)
57 | devacc, testacc = clf.run()
58 | logging.debug('Dev acc : {0} Test acc : {1}\n'.format(devacc, testacc))
59 | return {'devacc': devacc, 'acc': testacc, 'ndev': self.n_samples,
60 | 'ntest': self.n_samples}
61 |
62 |
63 | class CREval(BinaryClassifierEval):
64 | def __init__(self, task_path, seed=1111):
65 | logging.debug('***** Transfer task : CR *****\n\n')
66 | pos = self.loadFile(os.path.join(task_path, 'custrev.pos'))
67 | neg = self.loadFile(os.path.join(task_path, 'custrev.neg'))
68 | super(self.__class__, self).__init__(pos, neg, seed)
69 |
70 |
71 | class MREval(BinaryClassifierEval):
72 | def __init__(self, task_path, seed=1111):
73 | logging.debug('***** Transfer task : MR *****\n\n')
74 | pos = self.loadFile(os.path.join(task_path, 'rt-polarity.pos'))
75 | neg = self.loadFile(os.path.join(task_path, 'rt-polarity.neg'))
76 | super(self.__class__, self).__init__(pos, neg, seed)
77 |
78 |
79 | class SUBJEval(BinaryClassifierEval):
80 | def __init__(self, task_path, seed=1111):
81 | logging.debug('***** Transfer task : SUBJ *****\n\n')
82 | obj = self.loadFile(os.path.join(task_path, 'subj.objective'))
83 | subj = self.loadFile(os.path.join(task_path, 'subj.subjective'))
84 | super(self.__class__, self).__init__(obj, subj, seed)
85 |
86 |
87 | class MPQAEval(BinaryClassifierEval):
88 | def __init__(self, task_path, seed=1111):
89 | logging.debug('***** Transfer task : MPQA *****\n\n')
90 | pos = self.loadFile(os.path.join(task_path, 'mpqa.pos'))
91 | neg = self.loadFile(os.path.join(task_path, 'mpqa.neg'))
92 | super(self.__class__, self).__init__(pos, neg, seed)
93 |
--------------------------------------------------------------------------------
/SentEval/senteval/sst.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | SST - binary classification
10 | '''
11 |
12 | from __future__ import absolute_import, division, unicode_literals
13 |
14 | import os
15 | import io
16 | import logging
17 | import numpy as np
18 |
19 | from senteval.tools.validation import SplitClassifier
20 |
21 |
22 | class SSTEval(object):
23 | def __init__(self, task_path, nclasses=2, seed=1111):
24 | self.seed = seed
25 |
26 |         # binary or fine-grained
27 | assert nclasses in [2, 5]
28 | self.nclasses = nclasses
29 | self.task_name = 'Binary' if self.nclasses == 2 else 'Fine-Grained'
30 | logging.debug('***** Transfer task : SST %s classification *****\n\n', self.task_name)
31 |
32 | train = self.loadFile(os.path.join(task_path, 'sentiment-train'))
33 | dev = self.loadFile(os.path.join(task_path, 'sentiment-dev'))
34 | test = self.loadFile(os.path.join(task_path, 'sentiment-test'))
35 | self.sst_data = {'train': train, 'dev': dev, 'test': test}
36 |
37 | def do_prepare(self, params, prepare):
38 | samples = self.sst_data['train']['X'] + self.sst_data['dev']['X'] + \
39 | self.sst_data['test']['X']
40 | return prepare(params, samples)
41 |
42 | def loadFile(self, fpath):
43 | sst_data = {'X': [], 'y': []}
44 | with io.open(fpath, 'r', encoding='utf-8') as f:
45 | for line in f:
46 | if self.nclasses == 2:
47 | sample = line.strip().split('\t')
48 | sst_data['y'].append(int(sample[1]))
49 | sst_data['X'].append(sample[0].split())
50 | elif self.nclasses == 5:
51 | sample = line.strip().split(' ', 1)
52 | sst_data['y'].append(int(sample[0]))
53 | sst_data['X'].append(sample[1].split())
54 | assert max(sst_data['y']) == self.nclasses - 1
55 | return sst_data
56 |
57 | def run(self, params, batcher):
58 | sst_embed = {'train': {}, 'dev': {}, 'test': {}}
59 | bsize = params.batch_size
60 |
61 | for key in self.sst_data:
62 | logging.info('Computing embedding for {0}'.format(key))
63 | # Sort to reduce padding
64 | sorted_data = sorted(zip(self.sst_data[key]['X'],
65 | self.sst_data[key]['y']),
66 | key=lambda z: (len(z[0]), z[1]))
67 | self.sst_data[key]['X'], self.sst_data[key]['y'] = map(list, zip(*sorted_data))
68 |
69 | sst_embed[key]['X'] = []
70 | for ii in range(0, len(self.sst_data[key]['y']), bsize):
71 | batch = self.sst_data[key]['X'][ii:ii + bsize]
72 | embeddings = batcher(params, batch)
73 | sst_embed[key]['X'].append(embeddings)
74 | sst_embed[key]['X'] = np.vstack(sst_embed[key]['X'])
75 | sst_embed[key]['y'] = np.array(self.sst_data[key]['y'])
76 | logging.info('Computed {0} embeddings'.format(key))
77 |
78 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
79 | 'usepytorch': params.usepytorch,
80 | 'classifier': params.classifier}
81 |
82 | clf = SplitClassifier(X={'train': sst_embed['train']['X'],
83 | 'valid': sst_embed['dev']['X'],
84 | 'test': sst_embed['test']['X']},
85 | y={'train': sst_embed['train']['y'],
86 | 'valid': sst_embed['dev']['y'],
87 | 'test': sst_embed['test']['y']},
88 | config=config_classifier)
89 |
90 | devacc, testacc = clf.run()
91 | logging.debug('\nDev acc : {0} Test acc : {1} for \
92 | SST {2} classification\n'.format(devacc, testacc, self.task_name))
93 |
94 | return {'devacc': devacc, 'acc': testacc,
95 | 'ndev': len(sst_embed['dev']['X']),
96 | 'ntest': len(sst_embed['test']['X'])}
97 |
--------------------------------------------------------------------------------
/SentEval/senteval/mrpc.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | MRPC : Microsoft Research Paraphrase (detection) Corpus
10 | '''
11 | from __future__ import absolute_import, division, unicode_literals
12 |
13 | import os
14 | import logging
15 | import numpy as np
16 | import io
17 |
18 | from senteval.tools.validation import KFoldClassifier
19 |
20 | from sklearn.metrics import f1_score
21 |
22 |
23 | class MRPCEval(object):
24 | def __init__(self, task_path, seed=1111):
25 | logging.info('***** Transfer task : MRPC *****\n\n')
26 | self.seed = seed
27 | train = self.loadFile(os.path.join(task_path,
28 | 'msr_paraphrase_train.txt'))
29 | test = self.loadFile(os.path.join(task_path,
30 | 'msr_paraphrase_test.txt'))
31 | self.mrpc_data = {'train': train, 'test': test}
32 |
33 | def do_prepare(self, params, prepare):
34 | # TODO : Should we separate samples in "train, test"?
35 | samples = self.mrpc_data['train']['X_A'] + \
36 | self.mrpc_data['train']['X_B'] + \
37 | self.mrpc_data['test']['X_A'] + self.mrpc_data['test']['X_B']
38 | return prepare(params, samples)
39 |
40 | def loadFile(self, fpath):
41 | mrpc_data = {'X_A': [], 'X_B': [], 'y': []}
42 | with io.open(fpath, 'r', encoding='utf-8') as f:
43 | for line in f:
44 | text = line.strip().split('\t')
45 | mrpc_data['X_A'].append(text[3].split())
46 | mrpc_data['X_B'].append(text[4].split())
47 | mrpc_data['y'].append(text[0])
48 |
49 | mrpc_data['X_A'] = mrpc_data['X_A'][1:]
50 | mrpc_data['X_B'] = mrpc_data['X_B'][1:]
51 | mrpc_data['y'] = [int(s) for s in mrpc_data['y'][1:]]
52 | return mrpc_data
53 |
54 | def run(self, params, batcher):
55 | mrpc_embed = {'train': {}, 'test': {}}
56 |
57 | for key in self.mrpc_data:
58 | logging.info('Computing embedding for {0}'.format(key))
59 | # Sort to reduce padding
60 | text_data = {}
61 | sorted_corpus = sorted(zip(self.mrpc_data[key]['X_A'],
62 | self.mrpc_data[key]['X_B'],
63 | self.mrpc_data[key]['y']),
64 | key=lambda z: (len(z[0]), len(z[1]), z[2]))
65 |
66 | text_data['A'] = [x for (x, y, z) in sorted_corpus]
67 | text_data['B'] = [y for (x, y, z) in sorted_corpus]
68 | text_data['y'] = [z for (x, y, z) in sorted_corpus]
69 |
70 | for txt_type in ['A', 'B']:
71 | mrpc_embed[key][txt_type] = []
72 | for ii in range(0, len(text_data['y']), params.batch_size):
73 | batch = text_data[txt_type][ii:ii + params.batch_size]
74 | embeddings = batcher(params, batch)
75 | mrpc_embed[key][txt_type].append(embeddings)
76 | mrpc_embed[key][txt_type] = np.vstack(mrpc_embed[key][txt_type])
77 | mrpc_embed[key]['y'] = np.array(text_data['y'])
78 | logging.info('Computed {0} embeddings'.format(key))
79 |
80 | # Train
81 | trainA = mrpc_embed['train']['A']
82 | trainB = mrpc_embed['train']['B']
83 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
84 | trainY = mrpc_embed['train']['y']
85 |
86 | # Test
87 | testA = mrpc_embed['test']['A']
88 | testB = mrpc_embed['test']['B']
89 | testF = np.c_[np.abs(testA - testB), testA * testB]
90 | testY = mrpc_embed['test']['y']
91 |
92 | config = {'nclasses': 2, 'seed': self.seed,
93 | 'usepytorch': params.usepytorch,
94 | 'classifier': params.classifier,
95 | 'nhid': params.nhid, 'kfold': params.kfold}
96 | clf = KFoldClassifier(train={'X': trainF, 'y': trainY},
97 | test={'X': testF, 'y': testY}, config=config)
98 |
99 | devacc, testacc, yhat = clf.run()
100 | testf1 = round(100*f1_score(testY, yhat), 2)
101 | logging.debug('Dev acc : {0} Test acc {1}; Test F1 {2} for MRPC.\n'
102 | .format(devacc, testacc, testf1))
103 | return {'devacc': devacc, 'acc': testacc, 'f1': testf1,
104 | 'ndev': len(trainA), 'ntest': len(testA)}
105 |
--------------------------------------------------------------------------------
/SentEval/senteval/snli.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | SNLI - Entailment
10 | '''
11 | from __future__ import absolute_import, division, unicode_literals
12 |
13 | import codecs
14 | import os
15 | import io
16 | import copy
17 | import logging
18 | import numpy as np
19 |
20 | from senteval.tools.validation import SplitClassifier
21 |
22 |
23 | class SNLIEval(object):
24 | def __init__(self, taskpath, seed=1111):
25 | logging.debug('***** Transfer task : SNLI Entailment*****\n\n')
26 | self.seed = seed
27 | train1 = self.loadFile(os.path.join(taskpath, 's1.train'))
28 | train2 = self.loadFile(os.path.join(taskpath, 's2.train'))
29 |
30 | trainlabels = io.open(os.path.join(taskpath, 'labels.train'),
31 | encoding='utf-8').read().splitlines()
32 |
33 | valid1 = self.loadFile(os.path.join(taskpath, 's1.dev'))
34 | valid2 = self.loadFile(os.path.join(taskpath, 's2.dev'))
35 | validlabels = io.open(os.path.join(taskpath, 'labels.dev'),
36 | encoding='utf-8').read().splitlines()
37 |
38 | test1 = self.loadFile(os.path.join(taskpath, 's1.test'))
39 | test2 = self.loadFile(os.path.join(taskpath, 's2.test'))
40 | testlabels = io.open(os.path.join(taskpath, 'labels.test'),
41 | encoding='utf-8').read().splitlines()
42 |
43 | # sort data (by s2 first) to reduce padding
44 | sorted_train = sorted(zip(train2, train1, trainlabels),
45 | key=lambda z: (len(z[0]), len(z[1]), z[2]))
46 | train2, train1, trainlabels = map(list, zip(*sorted_train))
47 |
48 | sorted_valid = sorted(zip(valid2, valid1, validlabels),
49 | key=lambda z: (len(z[0]), len(z[1]), z[2]))
50 | valid2, valid1, validlabels = map(list, zip(*sorted_valid))
51 |
52 | sorted_test = sorted(zip(test2, test1, testlabels),
53 | key=lambda z: (len(z[0]), len(z[1]), z[2]))
54 | test2, test1, testlabels = map(list, zip(*sorted_test))
55 |
56 | self.samples = train1 + train2 + valid1 + valid2 + test1 + test2
57 | self.data = {'train': (train1, train2, trainlabels),
58 | 'valid': (valid1, valid2, validlabels),
59 | 'test': (test1, test2, testlabels)
60 | }
61 |
62 | def do_prepare(self, params, prepare):
63 | return prepare(params, self.samples)
64 |
65 | def loadFile(self, fpath):
66 | with codecs.open(fpath, 'rb', 'latin-1') as f:
67 | return [line.split() for line in
68 | f.read().splitlines()]
69 |
70 | def run(self, params, batcher):
71 | self.X, self.y = {}, {}
72 | dico_label = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
73 | for key in self.data:
74 | if key not in self.X:
75 | self.X[key] = []
76 | if key not in self.y:
77 | self.y[key] = []
78 |
79 | input1, input2, mylabels = self.data[key]
80 | enc_input = []
81 | n_labels = len(mylabels)
82 | for ii in range(0, n_labels, params.batch_size):
83 | batch1 = input1[ii:ii + params.batch_size]
84 | batch2 = input2[ii:ii + params.batch_size]
85 |
86 | if len(batch1) == len(batch2) and len(batch1) > 0:
87 | enc1 = batcher(params, batch1)
88 | enc2 = batcher(params, batch2)
89 | enc_input.append(np.hstack((enc1, enc2, enc1 * enc2,
90 | np.abs(enc1 - enc2))))
91 | if (ii*params.batch_size) % (20000*params.batch_size) == 0:
92 | logging.info("PROGRESS (encoding): %.2f%%" %
93 | (100 * ii / n_labels))
94 | self.X[key] = np.vstack(enc_input)
95 | self.y[key] = [dico_label[y] for y in mylabels]
96 |
97 | config = {'nclasses': 3, 'seed': self.seed,
98 | 'usepytorch': params.usepytorch,
99 | 'cudaEfficient': True,
100 | 'nhid': params.nhid, 'noreg': True}
101 |
102 | config_classifier = copy.deepcopy(params.classifier)
103 | config_classifier['max_epoch'] = 15
104 | config_classifier['epoch_size'] = 1
105 | config['classifier'] = config_classifier
106 |
107 | clf = SplitClassifier(self.X, self.y, config)
108 | devacc, testacc = clf.run()
109 | logging.debug('Dev acc : {0} Test acc : {1} for SNLI\n'
110 | .format(devacc, testacc))
111 | return {'devacc': devacc, 'acc': testacc,
112 | 'ndev': len(self.data['valid'][0]),
113 | 'ntest': len(self.data['test'][0])}
114 |
--------------------------------------------------------------------------------
/SentEval/senteval/rank.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | Image-Caption Retrieval with COCO dataset
10 | '''
11 | from __future__ import absolute_import, division, unicode_literals
12 |
13 | import os
14 | import sys
15 | import logging
16 | import numpy as np
17 |
18 | try:
19 | import cPickle as pickle
20 | except ImportError:
21 | import pickle
22 |
23 | from senteval.tools.ranking import ImageSentenceRankingPytorch
24 |
25 |
26 | class ImageCaptionRetrievalEval(object):
27 | def __init__(self, task_path, seed=1111):
28 | logging.debug('***** Transfer task: Image Caption Retrieval *****\n\n')
29 |
30 | # Get captions and image features
31 | self.seed = seed
32 | train, dev, test = self.loadFile(task_path)
33 | self.coco_data = {'train': train, 'dev': dev, 'test': test}
34 |
35 | def do_prepare(self, params, prepare):
36 | samples = self.coco_data['train']['sent'] + \
37 | self.coco_data['dev']['sent'] + \
38 | self.coco_data['test']['sent']
39 | prepare(params, samples)
40 |
41 | def loadFile(self, fpath):
42 | coco = {}
43 |
44 | for split in ['train', 'valid', 'test']:
45 | list_sent = []
46 | list_img_feat = []
47 | if sys.version_info < (3, 0):
48 | with open(os.path.join(fpath, split + '.pkl')) as f:
49 | cocodata = pickle.load(f)
50 | else:
51 | with open(os.path.join(fpath, split + '.pkl'), 'rb') as f:
52 | cocodata = pickle.load(f, encoding='latin1')
53 |
54 | for imgkey in range(len(cocodata['features'])):
55 | assert len(cocodata['image_to_caption_ids'][imgkey]) >= 5, \
56 | cocodata['image_to_caption_ids'][imgkey]
57 | for captkey in cocodata['image_to_caption_ids'][imgkey][0:5]:
58 | sent = cocodata['captions'][captkey]['cleaned_caption']
59 | sent += ' .' # add punctuation to end of sentence in COCO
60 | list_sent.append(sent.encode('utf-8').split())
61 | list_img_feat.append(cocodata['features'][imgkey])
62 | assert len(list_sent) == len(list_img_feat) and \
63 | len(list_sent) % 5 == 0
64 | list_img_feat = np.array(list_img_feat).astype('float32')
65 | coco[split] = {'sent': list_sent, 'imgfeat': list_img_feat}
66 | return coco['train'], coco['valid'], coco['test']
67 |
68 | def run(self, params, batcher):
69 | coco_embed = {'train': {'sentfeat': [], 'imgfeat': []},
70 | 'dev': {'sentfeat': [], 'imgfeat': []},
71 | 'test': {'sentfeat': [], 'imgfeat': []}}
72 |
73 | for key in self.coco_data:
74 | logging.info('Computing embedding for {0}'.format(key))
75 | # Sort to reduce padding
76 | self.coco_data[key]['sent'] = np.array(self.coco_data[key]['sent'])
77 | self.coco_data[key]['sent'], idx_sort = np.sort(self.coco_data[key]['sent']), np.argsort(self.coco_data[key]['sent'])
78 | idx_unsort = np.argsort(idx_sort)
79 |
80 | coco_embed[key]['X'] = []
81 | nsent = len(self.coco_data[key]['sent'])
82 | for ii in range(0, nsent, params.batch_size):
83 | batch = self.coco_data[key]['sent'][ii:ii + params.batch_size]
84 | embeddings = batcher(params, batch)
85 | coco_embed[key]['sentfeat'].append(embeddings)
86 | coco_embed[key]['sentfeat'] = np.vstack(coco_embed[key]['sentfeat'])[idx_unsort]
87 | coco_embed[key]['imgfeat'] = np.array(self.coco_data[key]['imgfeat'])
88 | logging.info('Computed {0} embeddings'.format(key))
89 |
90 | config = {'seed': self.seed, 'projdim': 1000, 'margin': 0.2}
91 | clf = ImageSentenceRankingPytorch(train=coco_embed['train'],
92 | valid=coco_embed['dev'],
93 | test=coco_embed['test'],
94 | config=config)
95 |
96 | bestdevscore, r1_i2t, r5_i2t, r10_i2t, medr_i2t, \
97 | r1_t2i, r5_t2i, r10_t2i, medr_t2i = clf.run()
98 |
99 | logging.debug("\nTest scores | Image to text: \
100 | {0}, {1}, {2}, {3}".format(r1_i2t, r5_i2t, r10_i2t, medr_i2t))
101 | logging.debug("Test scores | Text to image: \
102 | {0}, {1}, {2}, {3}\n".format(r1_t2i, r5_t2i, r10_t2i, medr_t2i))
103 |
104 | return {'devacc': bestdevscore,
105 | 'acc': [(r1_i2t, r5_i2t, r10_i2t, medr_i2t),
106 | (r1_t2i, r5_t2i, r10_t2i, medr_t2i)],
107 | 'ndev': len(coco_embed['dev']['sentfeat']),
108 | 'ntest': len(coco_embed['test']['sentfeat'])}
109 |
--------------------------------------------------------------------------------
/SentEval/senteval/tools/relatedness.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | """
9 | Semantic Relatedness (supervised) with Pytorch
10 | """
11 | from __future__ import absolute_import, division, unicode_literals
12 |
13 | import copy
14 | import numpy as np
15 |
16 | import torch
17 | from torch import nn
18 | import torch.optim as optim
19 |
20 | from scipy.stats import pearsonr, spearmanr
21 |
22 |
23 | class RelatednessPytorch(object):
24 | # Can be used for SICK-Relatedness, and STS14
25 | def __init__(self, train, valid, test, devscores, config):
26 | # fix seed
27 | np.random.seed(config['seed'])
28 | torch.manual_seed(config['seed'])
29 | assert torch.cuda.is_available(), 'torch.cuda required for Relatedness'
30 | torch.cuda.manual_seed(config['seed'])
31 |
32 | self.train = train
33 | self.valid = valid
34 | self.test = test
35 | self.devscores = devscores
36 |
37 | self.inputdim = train['X'].shape[1]
38 | self.nclasses = config['nclasses']
39 | self.seed = config['seed']
40 | self.l2reg = 0.
41 | self.batch_size = 64
42 | self.maxepoch = 1000
43 | self.early_stop = True
44 |
45 | self.model = nn.Sequential(
46 | nn.Linear(self.inputdim, self.nclasses),
47 | nn.Softmax(dim=-1),
48 | )
49 | self.loss_fn = nn.MSELoss()
50 |
51 | if torch.cuda.is_available():
52 | self.model = self.model.cuda()
53 | self.loss_fn = self.loss_fn.cuda()
54 |
55 | self.loss_fn.size_average = False
56 | self.optimizer = optim.Adam(self.model.parameters(),
57 | weight_decay=self.l2reg)
58 |
59 | def prepare_data(self, trainX, trainy, devX, devy, testX, testy):
60 |         # Move inputs and targets to GPU float tensors
61 | trainX = torch.from_numpy(trainX).float().cuda()
62 | trainy = torch.from_numpy(trainy).float().cuda()
63 | devX = torch.from_numpy(devX).float().cuda()
64 | devy = torch.from_numpy(devy).float().cuda()
65 | testX = torch.from_numpy(testX).float().cuda()
66 |         testy = torch.from_numpy(testy).float().cuda()
67 |
68 | return trainX, trainy, devX, devy, testX, testy
69 |
70 | def run(self):
71 | self.nepoch = 0
72 | bestpr = -1
73 | early_stop_count = 0
74 | r = np.arange(1, 6)
75 | stop_train = False
76 |
77 | # Preparing data
78 | trainX, trainy, devX, devy, testX, testy = self.prepare_data(
79 | self.train['X'], self.train['y'],
80 | self.valid['X'], self.valid['y'],
81 | self.test['X'], self.test['y'])
82 |
83 | # Training
84 | while not stop_train and self.nepoch <= self.maxepoch:
85 | self.trainepoch(trainX, trainy, nepoches=50)
86 | yhat = np.dot(self.predict_proba(devX), r)
87 | pr = spearmanr(yhat, self.devscores)[0]
88 | pr = 0 if pr != pr else pr # if NaN bc std=0
89 |             # early stop on Spearman correlation
90 | if pr > bestpr:
91 | bestpr = pr
92 | bestmodel = copy.deepcopy(self.model)
93 | elif self.early_stop:
94 | if early_stop_count >= 3:
95 | stop_train = True
96 | early_stop_count += 1
97 | self.model = bestmodel
98 |
99 | yhat = np.dot(self.predict_proba(testX), r)
100 |
101 | return bestpr, yhat
102 |
103 | def trainepoch(self, X, y, nepoches=1):
104 | self.model.train()
105 | for _ in range(self.nepoch, self.nepoch + nepoches):
106 | permutation = np.random.permutation(len(X))
107 | all_costs = []
108 | for i in range(0, len(X), self.batch_size):
109 | # forward
110 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().cuda()
111 | Xbatch = X[idx]
112 | ybatch = y[idx]
113 | output = self.model(Xbatch)
114 | # loss
115 | loss = self.loss_fn(output, ybatch)
116 | all_costs.append(loss.item())
117 | # backward
118 | self.optimizer.zero_grad()
119 | loss.backward()
120 | # Update parameters
121 | self.optimizer.step()
122 | self.nepoch += nepoches
123 |
124 | def predict_proba(self, devX):
125 | self.model.eval()
126 | probas = []
127 | with torch.no_grad():
128 | for i in range(0, len(devX), self.batch_size):
129 | Xbatch = devX[i:i + self.batch_size]
130 | if len(probas) == 0:
131 | probas = self.model(Xbatch).data.cpu().numpy()
132 | else:
133 | probas = np.concatenate((probas, self.model(Xbatch).data.cpu().numpy()), axis=0)
134 | return probas
135 |
--------------------------------------------------------------------------------
/SentEval/senteval/engine.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 |
10 | Generic sentence evaluation scripts wrapper
11 |
12 | '''
13 | from __future__ import absolute_import, division, unicode_literals
14 |
15 | from senteval import utils
16 | from senteval.binary import CREval, MREval, MPQAEval, SUBJEval
17 | from senteval.snli import SNLIEval
18 | from senteval.trec import TRECEval
19 | from senteval.sick import SICKEntailmentEval, SICKEval
20 | from senteval.mrpc import MRPCEval
21 | from senteval.sts import STS12Eval, STS13Eval, STS14Eval, STS15Eval, STS16Eval, STSBenchmarkEval, SICKRelatednessEval, STSBenchmarkFinetune
22 | from senteval.sst import SSTEval
23 | from senteval.rank import ImageCaptionRetrievalEval
24 | from senteval.probing import *
25 |
26 | class SE(object):
27 | def __init__(self, params, batcher, prepare=None):
28 | # parameters
29 | params = utils.dotdict(params)
30 | params.usepytorch = True if 'usepytorch' not in params else params.usepytorch
31 | params.seed = 1111 if 'seed' not in params else params.seed
32 |
33 | params.batch_size = 128 if 'batch_size' not in params else params.batch_size
34 | params.nhid = 0 if 'nhid' not in params else params.nhid
35 | params.kfold = 5 if 'kfold' not in params else params.kfold
36 |
37 | if 'classifier' not in params or not params['classifier']:
38 | params.classifier = {'nhid': 0}
39 |
40 | assert 'nhid' in params.classifier, 'Set number of hidden units in classifier config!!'
41 |
42 | self.params = params
43 |
44 | # batcher and prepare
45 | self.batcher = batcher
46 | self.prepare = prepare if prepare else lambda x, y: None
47 |
48 | self.list_tasks = ['CR', 'MR', 'MPQA', 'SUBJ', 'SST2', 'SST5', 'TREC', 'MRPC',
49 | 'SICKRelatedness', 'SICKEntailment', 'STSBenchmark',
50 | 'SNLI', 'ImageCaptionRetrieval', 'STS12', 'STS13',
51 | 'STS14', 'STS15', 'STS16',
52 | 'Length', 'WordContent', 'Depth', 'TopConstituents',
53 | 'BigramShift', 'Tense', 'SubjNumber', 'ObjNumber',
54 | 'OddManOut', 'CoordinationInversion', 'SICKRelatedness-finetune', 'STSBenchmark-finetune', 'STSBenchmark-fix']
55 |
56 | def eval(self, name):
57 |         # evaluate on task [name]; accepts either a string or a list of strings
58 | if (isinstance(name, list)):
59 | self.results = {x: self.eval(x) for x in name}
60 | return self.results
61 |
62 | tpath = self.params.task_path
63 | assert name in self.list_tasks, str(name) + ' not in ' + str(self.list_tasks)
64 |
65 | # Original SentEval tasks
66 | if name == 'CR':
67 | self.evaluation = CREval(tpath + '/downstream/CR', seed=self.params.seed)
68 | elif name == 'MR':
69 | self.evaluation = MREval(tpath + '/downstream/MR', seed=self.params.seed)
70 | elif name == 'MPQA':
71 | self.evaluation = MPQAEval(tpath + '/downstream/MPQA', seed=self.params.seed)
72 | elif name == 'SUBJ':
73 | self.evaluation = SUBJEval(tpath + '/downstream/SUBJ', seed=self.params.seed)
74 | elif name == 'SST2':
75 | self.evaluation = SSTEval(tpath + '/downstream/SST/binary', nclasses=2, seed=self.params.seed)
76 | elif name == 'SST5':
77 | self.evaluation = SSTEval(tpath + '/downstream/SST/fine', nclasses=5, seed=self.params.seed)
78 | elif name == 'TREC':
79 | self.evaluation = TRECEval(tpath + '/downstream/TREC', seed=self.params.seed)
80 | elif name == 'MRPC':
81 | self.evaluation = MRPCEval(tpath + '/downstream/MRPC', seed=self.params.seed)
82 | elif name == 'SICKRelatedness':
83 | self.evaluation = SICKRelatednessEval(tpath + '/downstream/SICK', seed=self.params.seed)
84 | elif name == 'STSBenchmark':
85 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed)
86 | elif name == 'STSBenchmark-fix':
87 | self.evaluation = STSBenchmarkEval(tpath + '/downstream/STS/STSBenchmark-fix', seed=self.params.seed)
88 | elif name == 'STSBenchmark-finetune':
89 | self.evaluation = STSBenchmarkFinetune(tpath + '/downstream/STS/STSBenchmark', seed=self.params.seed)
90 | elif name == 'SICKRelatedness-finetune':
91 | self.evaluation = SICKEval(tpath + '/downstream/SICK', seed=self.params.seed)
92 | elif name == 'SICKEntailment':
93 | self.evaluation = SICKEntailmentEval(tpath + '/downstream/SICK', seed=self.params.seed)
94 | elif name == 'SNLI':
95 | self.evaluation = SNLIEval(tpath + '/downstream/SNLI', seed=self.params.seed)
96 | elif name in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
97 | fpath = name + '-en-test'
98 | self.evaluation = eval(name + 'Eval')(tpath + '/downstream/STS/' + fpath, seed=self.params.seed)
99 | elif name == 'ImageCaptionRetrieval':
100 | self.evaluation = ImageCaptionRetrievalEval(tpath + '/downstream/COCO', seed=self.params.seed)
101 |
102 | # Probing Tasks
103 | elif name == 'Length':
104 | self.evaluation = LengthEval(tpath + '/probing', seed=self.params.seed)
105 | elif name == 'WordContent':
106 | self.evaluation = WordContentEval(tpath + '/probing', seed=self.params.seed)
107 | elif name == 'Depth':
108 | self.evaluation = DepthEval(tpath + '/probing', seed=self.params.seed)
109 | elif name == 'TopConstituents':
110 | self.evaluation = TopConstituentsEval(tpath + '/probing', seed=self.params.seed)
111 | elif name == 'BigramShift':
112 | self.evaluation = BigramShiftEval(tpath + '/probing', seed=self.params.seed)
113 | elif name == 'Tense':
114 | self.evaluation = TenseEval(tpath + '/probing', seed=self.params.seed)
115 | elif name == 'SubjNumber':
116 | self.evaluation = SubjNumberEval(tpath + '/probing', seed=self.params.seed)
117 | elif name == 'ObjNumber':
118 | self.evaluation = ObjNumberEval(tpath + '/probing', seed=self.params.seed)
119 | elif name == 'OddManOut':
120 | self.evaluation = OddManOutEval(tpath + '/probing', seed=self.params.seed)
121 | elif name == 'CoordinationInversion':
122 | self.evaluation = CoordinationInversionEval(tpath + '/probing', seed=self.params.seed)
123 |
124 | self.params.current_task = name
125 | self.evaluation.do_prepare(self.params, self.prepare)
126 |
127 | self.results = self.evaluation.run(self.params, self.batcher)
128 |
129 | return self.results
130 |
--------------------------------------------------------------------------------
/SentEval/senteval/probing.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | probing tasks
10 | '''
11 |
12 | from __future__ import absolute_import, division, unicode_literals
13 |
14 | import os
15 | import io
16 | import copy
17 | import logging
18 | import numpy as np
19 |
20 | from senteval.tools.validation import SplitClassifier
21 |
22 |
23 | class PROBINGEval(object):
24 | def __init__(self, task, task_path, seed=1111):
25 | self.seed = seed
26 | self.task = task
27 | logging.debug('***** (Probing) Transfer task : %s classification *****', self.task.upper())
28 | self.task_data = {'train': {'X': [], 'y': []},
29 | 'dev': {'X': [], 'y': []},
30 | 'test': {'X': [], 'y': []}}
31 | self.loadFile(task_path)
32 | logging.info('Loaded %s train - %s dev - %s test for %s' %
33 | (len(self.task_data['train']['y']), len(self.task_data['dev']['y']),
34 | len(self.task_data['test']['y']), self.task))
35 |
36 | def do_prepare(self, params, prepare):
37 | samples = self.task_data['train']['X'] + self.task_data['dev']['X'] + \
38 | self.task_data['test']['X']
39 | return prepare(params, samples)
40 |
41 | def loadFile(self, fpath):
42 | self.tok2split = {'tr': 'train', 'va': 'dev', 'te': 'test'}
43 | with io.open(fpath, 'r', encoding='utf-8') as f:
44 | for line in f:
45 | line = line.rstrip().split('\t')
46 | self.task_data[self.tok2split[line[0]]]['X'].append(line[-1].split())
47 | self.task_data[self.tok2split[line[0]]]['y'].append(line[1])
48 |
49 | labels = sorted(np.unique(self.task_data['train']['y']))
50 | self.tok2label = dict(zip(labels, range(len(labels))))
51 | self.nclasses = len(self.tok2label)
52 |
53 | for split in self.task_data:
54 | for i, y in enumerate(self.task_data[split]['y']):
55 | self.task_data[split]['y'][i] = self.tok2label[y]
56 |
57 | def run(self, params, batcher):
58 | task_embed = {'train': {}, 'dev': {}, 'test': {}}
59 | bsize = params.batch_size
60 | logging.info('Computing embeddings for train/dev/test')
61 | for key in self.task_data:
62 | # Sort to reduce padding
63 | sorted_data = sorted(zip(self.task_data[key]['X'],
64 | self.task_data[key]['y']),
65 | key=lambda z: (len(z[0]), z[1]))
66 | self.task_data[key]['X'], self.task_data[key]['y'] = map(list, zip(*sorted_data))
67 |
68 | task_embed[key]['X'] = []
69 | for ii in range(0, len(self.task_data[key]['y']), bsize):
70 | batch = self.task_data[key]['X'][ii:ii + bsize]
71 | embeddings = batcher(params, batch)
72 | task_embed[key]['X'].append(embeddings)
73 | task_embed[key]['X'] = np.vstack(task_embed[key]['X'])
74 | task_embed[key]['y'] = np.array(self.task_data[key]['y'])
75 | logging.info('Computed embeddings')
76 |
77 | config_classifier = {'nclasses': self.nclasses, 'seed': self.seed,
78 | 'usepytorch': params.usepytorch,
79 | 'classifier': params.classifier}
80 |
81 | if self.task == "WordContent" and params.classifier['nhid'] > 0:
82 | config_classifier = copy.deepcopy(config_classifier)
83 | config_classifier['classifier']['nhid'] = 0
84 | print(params.classifier['nhid'])
85 |
86 | clf = SplitClassifier(X={'train': task_embed['train']['X'],
87 | 'valid': task_embed['dev']['X'],
88 | 'test': task_embed['test']['X']},
89 | y={'train': task_embed['train']['y'],
90 | 'valid': task_embed['dev']['y'],
91 | 'test': task_embed['test']['y']},
92 | config=config_classifier)
93 |
94 | devacc, testacc = clf.run()
95 | logging.debug('\nDev acc : %.1f Test acc : %.1f for %s classification\n' % (devacc, testacc, self.task.upper()))
96 |
97 | return {'devacc': devacc, 'acc': testacc,
98 | 'ndev': len(task_embed['dev']['X']),
99 | 'ntest': len(task_embed['test']['X'])}
100 |
101 | """
102 | Surface Information
103 | """
104 | class LengthEval(PROBINGEval):
105 | def __init__(self, task_path, seed=1111):
106 | task_path = os.path.join(task_path, 'sentence_length.txt')
107 | # labels: bins
108 | PROBINGEval.__init__(self, 'Length', task_path, seed)
109 |
110 | class WordContentEval(PROBINGEval):
111 | def __init__(self, task_path, seed=1111):
112 | task_path = os.path.join(task_path, 'word_content.txt')
113 | # labels: 200 target words
114 | PROBINGEval.__init__(self, 'WordContent', task_path, seed)
115 |
116 | """
117 | Latent Structural Information
118 | """
119 | class DepthEval(PROBINGEval):
120 | def __init__(self, task_path, seed=1111):
121 | task_path = os.path.join(task_path, 'tree_depth.txt')
122 | # labels: bins
123 | PROBINGEval.__init__(self, 'Depth', task_path, seed)
124 |
125 | class TopConstituentsEval(PROBINGEval):
126 | def __init__(self, task_path, seed=1111):
127 | task_path = os.path.join(task_path, 'top_constituents.txt')
128 | # labels: 'PP_NP_VP_.' .. (20 classes)
129 | PROBINGEval.__init__(self, 'TopConstituents', task_path, seed)
130 |
131 | class BigramShiftEval(PROBINGEval):
132 | def __init__(self, task_path, seed=1111):
133 | task_path = os.path.join(task_path, 'bigram_shift.txt')
134 | # labels: 0 or 1
135 | PROBINGEval.__init__(self, 'BigramShift', task_path, seed)
136 |
137 | # TODO: Voice?
138 |
139 | """
140 | Latent Semantic Information
141 | """
142 |
143 | class TenseEval(PROBINGEval):
144 | def __init__(self, task_path, seed=1111):
145 | task_path = os.path.join(task_path, 'past_present.txt')
146 | # labels: 'PRES', 'PAST'
147 | PROBINGEval.__init__(self, 'Tense', task_path, seed)
148 |
149 | class SubjNumberEval(PROBINGEval):
150 | def __init__(self, task_path, seed=1111):
151 | task_path = os.path.join(task_path, 'subj_number.txt')
152 | # labels: 'NN', 'NNS'
153 | PROBINGEval.__init__(self, 'SubjNumber', task_path, seed)
154 |
155 | class ObjNumberEval(PROBINGEval):
156 | def __init__(self, task_path, seed=1111):
157 | task_path = os.path.join(task_path, 'obj_number.txt')
158 | # labels: 'NN', 'NNS'
159 | PROBINGEval.__init__(self, 'ObjNumber', task_path, seed)
160 |
161 | class OddManOutEval(PROBINGEval):
162 | def __init__(self, task_path, seed=1111):
163 | task_path = os.path.join(task_path, 'odd_man_out.txt')
164 | # labels: 'O', 'C'
165 | PROBINGEval.__init__(self, 'OddManOut', task_path, seed)
166 |
167 | class CoordinationInversionEval(PROBINGEval):
168 | def __init__(self, task_path, seed=1111):
169 | task_path = os.path.join(task_path, 'coordination_inversion.txt')
170 | # labels: 'O', 'I'
171 | PROBINGEval.__init__(self, 'CoordinationInversion', task_path, seed)
172 |
--------------------------------------------------------------------------------
/SentEval/senteval/tools/classifier.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | """
9 | Pytorch Classifier class in the style of scikit-learn
10 | Classifiers include Logistic Regression and MLP
11 | """
12 |
13 | from __future__ import absolute_import, division, unicode_literals
14 |
15 | import numpy as np
16 | import copy
17 | from senteval import utils
18 |
19 | import torch
20 | from torch import nn
21 | import torch.nn.functional as F
22 |
23 |
24 | class PyTorchClassifier(object):
25 | def __init__(self, inputdim, nclasses, l2reg=0., batch_size=64, seed=1111,
26 | cudaEfficient=False):
27 | # fix seed
28 | np.random.seed(seed)
29 | torch.manual_seed(seed)
30 | torch.cuda.manual_seed(seed)
31 |
32 | self.inputdim = inputdim
33 | self.nclasses = nclasses
34 | self.l2reg = l2reg
35 | self.batch_size = batch_size
36 | self.cudaEfficient = cudaEfficient
37 |
38 | def prepare_split(self, X, y, validation_data=None, validation_split=None):
39 | # Preparing validation data
40 | assert validation_split or validation_data
41 | if validation_data is not None:
42 | trainX, trainy = X, y
43 | devX, devy = validation_data
44 | else:
45 | permutation = np.random.permutation(len(X))
46 | trainidx = permutation[int(validation_split * len(X)):]
47 | devidx = permutation[0:int(validation_split * len(X))]
48 | trainX, trainy = X[trainidx], y[trainidx]
49 | devX, devy = X[devidx], y[devidx]
50 |
51 | device = torch.device('cpu') if self.cudaEfficient else torch.device('cuda')
52 |
53 | trainX = torch.from_numpy(trainX).to(device, dtype=torch.float32)
54 | trainy = torch.from_numpy(trainy).to(device, dtype=torch.int64)
55 | devX = torch.from_numpy(devX).to(device, dtype=torch.float32)
56 | devy = torch.from_numpy(devy).to(device, dtype=torch.int64)
57 |
58 | return trainX, trainy, devX, devy
59 |
60 | def fit(self, X, y, validation_data=None, validation_split=None,
61 | early_stop=True):
62 | self.nepoch = 0
63 | bestaccuracy = -1
64 | stop_train = False
65 | early_stop_count = 0
66 |
67 | # Preparing validation data
68 | trainX, trainy, devX, devy = self.prepare_split(X, y, validation_data,
69 | validation_split)
70 |
71 | # Training
72 | while not stop_train and self.nepoch <= self.max_epoch:
73 | self.trainepoch(trainX, trainy, epoch_size=self.epoch_size)
74 | accuracy = self.score(devX, devy)
75 | if accuracy > bestaccuracy:
76 | bestaccuracy = accuracy
77 | bestmodel = copy.deepcopy(self.model)
78 | elif early_stop:
79 | if early_stop_count >= self.tenacity:
80 | stop_train = True
81 | early_stop_count += 1
82 | self.model = bestmodel
83 | return bestaccuracy
84 |
85 | def trainepoch(self, X, y, epoch_size=1):
86 | self.model.train()
87 | for _ in range(self.nepoch, self.nepoch + epoch_size):
88 | permutation = np.random.permutation(len(X))
89 | all_costs = []
90 | for i in range(0, len(X), self.batch_size):
91 | # forward
92 | idx = torch.from_numpy(permutation[i:i + self.batch_size]).long().to(X.device)
93 |
94 | Xbatch = X[idx]
95 | ybatch = y[idx]
96 |
97 | if self.cudaEfficient:
98 | Xbatch = Xbatch.cuda()
99 | ybatch = ybatch.cuda()
100 | output = self.model(Xbatch)
101 | # loss
102 | loss = self.loss_fn(output, ybatch)
103 | all_costs.append(loss.data.item())
104 | # backward
105 | self.optimizer.zero_grad()
106 | loss.backward()
107 | # Update parameters
108 | self.optimizer.step()
109 | self.nepoch += epoch_size
110 |
111 | def score(self, devX, devy):
112 | self.model.eval()
113 | correct = 0
114 | if not isinstance(devX, torch.cuda.FloatTensor) or self.cudaEfficient:
115 | devX = torch.FloatTensor(devX).cuda()
116 | devy = torch.LongTensor(devy).cuda()
117 | with torch.no_grad():
118 | for i in range(0, len(devX), self.batch_size):
119 | Xbatch = devX[i:i + self.batch_size]
120 | ybatch = devy[i:i + self.batch_size]
121 | if self.cudaEfficient:
122 | Xbatch = Xbatch.cuda()
123 | ybatch = ybatch.cuda()
124 | output = self.model(Xbatch)
125 | pred = output.data.max(1)[1]
126 | correct += pred.long().eq(ybatch.data.long()).sum().item()
127 | accuracy = 1.0 * correct / len(devX)
128 | return accuracy
129 |
130 | def predict(self, devX):
131 | self.model.eval()
132 | if not isinstance(devX, torch.cuda.FloatTensor):
133 | devX = torch.FloatTensor(devX).cuda()
134 | yhat = np.array([])
135 | with torch.no_grad():
136 | for i in range(0, len(devX), self.batch_size):
137 | Xbatch = devX[i:i + self.batch_size]
138 | output = self.model(Xbatch)
139 | yhat = np.append(yhat,
140 | output.data.max(1)[1].cpu().numpy())
141 | yhat = np.vstack(yhat)
142 | return yhat
143 |
144 | def predict_proba(self, devX):
145 | self.model.eval()
146 | probas = []
147 | with torch.no_grad():
148 | for i in range(0, len(devX), self.batch_size):
149 | Xbatch = devX[i:i + self.batch_size]
150 | vals = F.softmax(self.model(Xbatch), dim=-1).data.cpu().numpy()
151 | if len(probas) == 0:
152 | probas = vals
153 | else:
154 | probas = np.concatenate((probas, vals), axis=0)
155 | return probas
156 |
157 |
158 | """
159 | MLP with Pytorch (nhid=0 --> Logistic Regression)
160 | """
161 |
162 | class MLP(PyTorchClassifier):
163 | def __init__(self, params, inputdim, nclasses, l2reg=0., batch_size=64,
164 | seed=1111, cudaEfficient=False):
165 | super(self.__class__, self).__init__(inputdim, nclasses, l2reg,
166 | batch_size, seed, cudaEfficient)
167 | """
168 | PARAMETERS:
169 | -nhid: number of hidden units (0: Logistic Regression)
170 | -optim: optimizer ("sgd,lr=0.1", "adam", "rmsprop" ..)
171 | -tenacity: how many times dev acc does not increase before stopping
172 | -epoch_size: each epoch corresponds to epoch_size pass on the train set
173 | -max_epoch: max number of epoches
174 | -dropout: dropout for MLP
175 | """
176 |
177 | self.nhid = 0 if "nhid" not in params else params["nhid"]
178 | self.optim = "adam" if "optim" not in params else params["optim"]
179 | self.tenacity = 5 if "tenacity" not in params else params["tenacity"]
180 | self.epoch_size = 4 if "epoch_size" not in params else params["epoch_size"]
181 | self.max_epoch = 200 if "max_epoch" not in params else params["max_epoch"]
182 | self.dropout = 0. if "dropout" not in params else params["dropout"]
183 | self.batch_size = 64 if "batch_size" not in params else params["batch_size"]
184 |
185 | if params["nhid"] == 0:
186 | self.model = nn.Sequential(
187 | nn.Linear(self.inputdim, self.nclasses),
188 | ).cuda()
189 | else:
190 | self.model = nn.Sequential(
191 | nn.Linear(self.inputdim, params["nhid"]),
192 | nn.Dropout(p=self.dropout),
193 | nn.Sigmoid(),
194 | nn.Linear(params["nhid"], self.nclasses),
195 | ).cuda()
196 |
197 | self.loss_fn = nn.CrossEntropyLoss().cuda()
198 | self.loss_fn.size_average = False
199 |
200 | optim_fn, optim_params = utils.get_optimizer(self.optim)
201 | self.optimizer = optim_fn(self.model.parameters(), **optim_params)
202 | self.optimizer.param_groups[0]['weight_decay'] = self.l2reg
203 |
--------------------------------------------------------------------------------
/evaluation.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import io, os
3 | import numpy as np
4 | import logging
5 | import argparse
6 | from prettytable import PrettyTable
7 | import torch
8 | import transformers
9 | from transformers import AutoModel, AutoTokenizer
10 |
11 | # Set up logger
12 | logging.basicConfig(format='%(asctime)s : %(message)s', level=logging.DEBUG)
13 |
14 | # Set PATHs
15 | PATH_TO_SENTEVAL = './SentEval'
16 | PATH_TO_DATA = './SentEval/data'
17 |
18 | # Import SentEval
19 | sys.path.insert(0, PATH_TO_SENTEVAL)
20 | import senteval
21 |
22 |
23 | def print_table(task_names, scores):
24 | tb = PrettyTable()
25 | tb.field_names = task_names
26 | tb.add_row(scores)
27 | print(tb)
28 |
29 |
30 | def main():
31 | parser = argparse.ArgumentParser()
32 | parser.add_argument("--model_name_or_path", type=str,
33 | help="Transformers' model name or path")
34 | parser.add_argument("--pooler", type=str,
35 | choices=['cls', 'cls_before_pooler', 'avg', 'avg_top2', 'avg_first_last'],
36 | default='cls',
37 | help="Which pooler to use")
38 | parser.add_argument("--mode", type=str,
39 | choices=['dev', 'test', 'fasttest'],
40 | default='test',
41 | help="What evaluation mode to use (dev: fast mode, dev results; test: full mode, test results); fasttest: fast mode, test results")
42 | parser.add_argument("--task_set", type=str,
43 | choices=['sts', 'transfer', 'full', 'na'],
44 | default='sts',
45 | help="What set of tasks to evaluate on. If not 'na', this will override '--tasks'")
46 | parser.add_argument("--tasks", type=str, nargs='+',
47 | default=['STS12', 'STS13', 'STS14', 'STS15', 'STS16',
48 | 'MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC',
49 | 'SICKRelatedness', 'STSBenchmark'],
50 | help="Tasks to evaluate on. If '--task_set' is specified, this will be overridden")
51 |
52 | args = parser.parse_args()
53 |
54 | # Load transformers' model checkpoint
55 | model = AutoModel.from_pretrained(args.model_name_or_path)
56 | tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
57 | device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
58 | model = model.to(device)
59 |
60 | # Set up the tasks
61 | if args.task_set == 'sts':
62 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
63 | elif args.task_set == 'transfer':
64 | args.tasks = ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC']
65 | elif args.task_set == 'full':
66 | args.tasks = ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']
67 | args.tasks += ['MR', 'CR', 'MPQA', 'SUBJ', 'SST2', 'TREC', 'MRPC']
68 |
69 | # Set params for SentEval
70 | if args.mode == 'dev' or args.mode == 'fasttest':
71 | # Fast mode
72 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 5}
73 | params['classifier'] = {'nhid': 0, 'optim': 'rmsprop', 'batch_size': 128,
74 | 'tenacity': 3, 'epoch_size': 2}
75 | elif args.mode == 'test':
76 | # Full mode
77 | params = {'task_path': PATH_TO_DATA, 'usepytorch': True, 'kfold': 10}
78 | params['classifier'] = {'nhid': 0, 'optim': 'adam', 'batch_size': 64,
79 | 'tenacity': 5, 'epoch_size': 4}
80 | else:
81 | raise NotImplementedError
82 |
83 | # SentEval prepare and batcher
84 | def prepare(params, samples):
85 | return
86 |
87 | def batcher(params, batch, max_length=None):
88 | # Handle rare token encoding issues in the dataset
89 | if len(batch) >= 1 and len(batch[0]) >= 1 and isinstance(batch[0][0], bytes):
90 | batch = [[word.decode('utf-8') for word in s] for s in batch]
91 |
92 | sentences = [' '.join(s) for s in batch]
93 |
94 | # Tokenization
95 | if max_length is not None:
96 | batch = tokenizer.batch_encode_plus(
97 | sentences,
98 | return_tensors='pt',
99 | padding=True,
100 | max_length=max_length,
101 | truncation=True
102 | )
103 | else:
104 | batch = tokenizer.batch_encode_plus(
105 | sentences,
106 | return_tensors='pt',
107 | padding=True,
108 | )
109 |
110 | # Move to the correct device
111 | for k in batch:
112 | batch[k] = batch[k].to(device)
113 |
114 | # Get raw embeddings
115 | with torch.no_grad():
116 | outputs = model(**batch, output_hidden_states=True, return_dict=True)
117 | last_hidden = outputs.last_hidden_state
118 | pooler_output = outputs.pooler_output
119 | hidden_states = outputs.hidden_states
120 |
121 | # Apply different poolers
122 | if args.pooler == 'cls':
123 | # There is a linear+activation layer after CLS representation
124 | return pooler_output.cpu()
125 | elif args.pooler == 'cls_before_pooler':
126 | return last_hidden[:, 0].cpu()
127 | elif args.pooler == "avg":
128 | return ((last_hidden * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch['attention_mask'].sum(
129 | -1).unsqueeze(-1)).cpu()
130 | elif args.pooler == "avg_first_last":
131 | first_hidden = hidden_states[1]
132 | last_hidden = hidden_states[-1]
133 | pooled_result = ((first_hidden + last_hidden) / 2.0 * batch['attention_mask'].unsqueeze(-1)).sum(1) / batch[
134 | 'attention_mask'].sum(-1).unsqueeze(-1)
135 | return pooled_result.cpu()
136 | elif args.pooler == "avg_top2":
137 | second_last_hidden = hidden_states[-2]
138 | last_hidden = hidden_states[-1]
139 | pooled_result = ((last_hidden + second_last_hidden) / 2.0 * batch['attention_mask'].unsqueeze(-1)).sum(1) / \
140 | batch['attention_mask'].sum(-1).unsqueeze(-1)
141 | return pooled_result.cpu()
142 | else:
143 | raise NotImplementedError
144 |
145 | results = {}
146 |
147 | for task in args.tasks:
148 | se = senteval.engine.SE(params, batcher, prepare)
149 | result = se.eval(task)
150 | results[task] = result
151 |
152 | # Print evaluation results
153 | if args.mode == 'dev':
154 | print("------ %s ------" % (args.mode))
155 |
156 | task_names = []
157 | scores = []
158 | for task in ['STSBenchmark', 'SICKRelatedness']:
159 | task_names.append(task)
160 | if task in results:
161 | scores.append("%.2f" % (results[task]['dev']['spearman'][0] * 100))
162 | else:
163 | scores.append("0.00")
164 | print_table(task_names, scores)
165 |
166 | task_names = []
167 | scores = []
168 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']:
169 | task_names.append(task)
170 | if task in results:
171 | scores.append("%.2f" % (results[task]['devacc']))
172 | else:
173 | scores.append("0.00")
174 | task_names.append("Avg.")
175 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
176 | print_table(task_names, scores)
177 |
178 | elif args.mode == 'test' or args.mode == 'fasttest':
179 | print("------ %s ------" % (args.mode))
180 |
181 | task_names = []
182 | scores = []
183 | for task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16', 'STSBenchmark', 'SICKRelatedness']:
184 | task_names.append(task)
185 | if task in results:
186 | if task in ['STS12', 'STS13', 'STS14', 'STS15', 'STS16']:
187 | scores.append("%.2f" % (results[task]['all']['spearman']['all'] * 100))
188 | else:
189 | scores.append("%.2f" % (results[task]['test']['spearman'].correlation * 100))
190 | else:
191 | scores.append("0.00")
192 | task_names.append("Avg.")
193 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
194 | print_table(task_names, scores)
195 |
196 | task_names = []
197 | scores = []
198 | for task in ['MR', 'CR', 'SUBJ', 'MPQA', 'SST2', 'TREC', 'MRPC']:
199 | task_names.append(task)
200 | if task in results:
201 | scores.append("%.2f" % (results[task]['acc']))
202 | else:
203 | scores.append("0.00")
204 | task_names.append("Avg.")
205 | scores.append("%.2f" % (sum([float(score) for score in scores]) / len(scores)))
206 | print_table(task_names, scores)
207 |
208 |
209 | if __name__ == "__main__":
210 | main()
--------------------------------------------------------------------------------
/SentEval/senteval/sick.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | SICK Relatedness and Entailment
10 | '''
11 | from __future__ import absolute_import, division, unicode_literals
12 |
13 | import os
14 | import io
15 | import logging
16 | import numpy as np
17 |
18 | from sklearn.metrics import mean_squared_error
19 | from scipy.stats import pearsonr, spearmanr
20 |
21 | from senteval.tools.relatedness import RelatednessPytorch
22 | from senteval.tools.validation import SplitClassifier
23 |
24 | class SICKEval(object):
25 | def __init__(self, task_path, seed=1111):
26 | logging.debug('***** Transfer task : SICK-Relatedness*****\n\n')
27 | self.seed = seed
28 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
29 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
30 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
31 | self.sick_data = {'train': train, 'dev': dev, 'test': test}
32 |
33 | def do_prepare(self, params, prepare):
34 | samples = self.sick_data['train']['X_A'] + \
35 | self.sick_data['train']['X_B'] + \
36 | self.sick_data['dev']['X_A'] + \
37 | self.sick_data['dev']['X_B'] + \
38 | self.sick_data['test']['X_A'] + self.sick_data['test']['X_B']
39 | return prepare(params, samples)
40 |
41 | def loadFile(self, fpath):
42 | skipFirstLine = True
43 | sick_data = {'X_A': [], 'X_B': [], 'y': []}
44 | with io.open(fpath, 'r', encoding='utf-8') as f:
45 | for line in f:
46 | if skipFirstLine:
47 | skipFirstLine = False
48 | else:
49 | text = line.strip().split('\t')
50 | sick_data['X_A'].append(text[1].split())
51 | sick_data['X_B'].append(text[2].split())
52 | sick_data['y'].append(text[3])
53 |
54 | sick_data['y'] = [float(s) for s in sick_data['y']]
55 | return sick_data
56 |
57 | def run(self, params, batcher):
58 | sick_embed = {'train': {}, 'dev': {}, 'test': {}}
59 | bsize = params.batch_size
60 |
61 | for key in self.sick_data:
62 | logging.info('Computing embedding for {0}'.format(key))
63 | # Sort to reduce padding
64 | sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
65 | self.sick_data[key]['X_B'],
66 | self.sick_data[key]['y']),
67 | key=lambda z: (len(z[0]), len(z[1]), z[2]))
68 |
69 | self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
70 | self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
71 | self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]
72 |
73 | for txt_type in ['X_A', 'X_B']:
74 | sick_embed[key][txt_type] = []
75 | for ii in range(0, len(self.sick_data[key]['y']), bsize):
76 | batch = self.sick_data[key][txt_type][ii:ii + bsize]
77 | embeddings = batcher(params, batch)
78 | sick_embed[key][txt_type].append(embeddings)
79 | sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
80 | sick_embed[key]['y'] = np.array(self.sick_data[key]['y'])
81 | logging.info('Computed {0} embeddings'.format(key))
82 |
83 | # Train
84 | trainA = sick_embed['train']['X_A']
85 | trainB = sick_embed['train']['X_B']
86 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
87 | trainY = self.encode_labels(self.sick_data['train']['y'])
88 |
89 | # Dev
90 | devA = sick_embed['dev']['X_A']
91 | devB = sick_embed['dev']['X_B']
92 | devF = np.c_[np.abs(devA - devB), devA * devB]
93 | devY = self.encode_labels(self.sick_data['dev']['y'])
94 |
95 | # Test
96 | testA = sick_embed['test']['X_A']
97 | testB = sick_embed['test']['X_B']
98 | testF = np.c_[np.abs(testA - testB), testA * testB]
99 | testY = self.encode_labels(self.sick_data['test']['y'])
100 |
101 | config = {'seed': self.seed, 'nclasses': 5}
102 | clf = RelatednessPytorch(train={'X': trainF, 'y': trainY},
103 | valid={'X': devF, 'y': devY},
104 | test={'X': testF, 'y': testY},
105 | devscores=self.sick_data['dev']['y'],
106 | config=config)
107 |
108 | devspr, yhat = clf.run()
109 |
110 | pr = pearsonr(yhat, self.sick_data['test']['y'])[0]
111 | sr = spearmanr(yhat, self.sick_data['test']['y'])[0]
112 | pr = 0 if pr != pr else pr
113 | sr = 0 if sr != sr else sr
114 | se = mean_squared_error(yhat, self.sick_data['test']['y'])
115 | logging.debug('Dev : Spearman {0}'.format(devspr))
116 | logging.debug('Test : Pearson {0} Spearman {1} MSE {2} \
117 | for SICK Relatedness\n'.format(pr, sr, se))
118 |
119 | return {'devspearman': devspr, 'pearson': pr, 'spearman': sr, 'mse': se,
120 | 'yhat': yhat, 'ndev': len(devA), 'ntest': len(testA)}
121 |
122 | def encode_labels(self, labels, nclass=5):
123 | """
124 | Label encoding from the Tree-LSTM paper (Tai, Socher, Manning): a gold score y in [1, 5] is turned into a distribution over the 5 relatedness classes, with mass y - floor(y) on class floor(y) + 1 and floor(y) + 1 - y on class floor(y), e.g. y = 3.6 -> [0, 0, 0.4, 0.6, 0].
125 | """
126 | Y = np.zeros((len(labels), nclass)).astype('float32')
127 | for j, y in enumerate(labels):
128 | for i in range(nclass):
129 | if i+1 == np.floor(y) + 1:
130 | Y[j, i] = y - np.floor(y)
131 | if i+1 == np.floor(y):
132 | Y[j, i] = np.floor(y) - y + 1
133 | return Y
134 |
135 |
136 | class SICKEntailmentEval(SICKEval):
137 | def __init__(self, task_path, seed=1111):
138 | logging.debug('***** Transfer task : SICK-Entailment*****\n\n')
139 | self.seed = seed
140 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
141 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
142 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
143 | self.sick_data = {'train': train, 'dev': dev, 'test': test}
144 |
145 | def loadFile(self, fpath):
146 | label2id = {'CONTRADICTION': 0, 'NEUTRAL': 1, 'ENTAILMENT': 2}
147 | skipFirstLine = True
148 | sick_data = {'X_A': [], 'X_B': [], 'y': []}
149 | with io.open(fpath, 'r', encoding='utf-8') as f:
150 | for line in f:
151 | if skipFirstLine:
152 | skipFirstLine = False
153 | else:
154 | text = line.strip().split('\t')
155 | sick_data['X_A'].append(text[1].split())
156 | sick_data['X_B'].append(text[2].split())
157 | sick_data['y'].append(text[4])
158 | sick_data['y'] = [label2id[s] for s in sick_data['y']]
159 | return sick_data
160 |
161 | def run(self, params, batcher):
162 | sick_embed = {'train': {}, 'dev': {}, 'test': {}}
163 | bsize = params.batch_size
164 | for key in self.sick_data:
165 | logging.info('Computing embedding for {0}'.format(key))
166 | # Sort to reduce padding
167 | sorted_corpus = sorted(zip(self.sick_data[key]['X_A'],
168 | self.sick_data[key]['X_B'],
169 | self.sick_data[key]['y']),
170 | key=lambda z: (len(z[0]), len(z[1]), z[2]))
171 |
172 | self.sick_data[key]['X_A'] = [x for (x, y, z) in sorted_corpus]
173 | self.sick_data[key]['X_B'] = [y for (x, y, z) in sorted_corpus]
174 | self.sick_data[key]['y'] = [z for (x, y, z) in sorted_corpus]
175 |
176 | for txt_type in ['X_A', 'X_B']:
177 | sick_embed[key][txt_type] = []
178 | for ii in range(0, len(self.sick_data[key]['y']), bsize):
179 | batch = self.sick_data[key][txt_type][ii:ii + bsize]
180 | embeddings = batcher(params, batch)
181 | sick_embed[key][txt_type].append(embeddings)
182 | sick_embed[key][txt_type] = np.vstack(sick_embed[key][txt_type])
183 | logging.info('Computed {0} embeddings'.format(key))
184 |
185 | # Train
186 | trainA = sick_embed['train']['X_A']
187 | trainB = sick_embed['train']['X_B']
188 | trainF = np.c_[np.abs(trainA - trainB), trainA * trainB]
189 | trainY = np.array(self.sick_data['train']['y'])
190 |
191 | # Dev
192 | devA = sick_embed['dev']['X_A']
193 | devB = sick_embed['dev']['X_B']
194 | devF = np.c_[np.abs(devA - devB), devA * devB]
195 | devY = np.array(self.sick_data['dev']['y'])
196 |
197 | # Test
198 | testA = sick_embed['test']['X_A']
199 | testB = sick_embed['test']['X_B']
200 | testF = np.c_[np.abs(testA - testB), testA * testB]
201 | testY = np.array(self.sick_data['test']['y'])
202 |
203 | config = {'nclasses': 3, 'seed': self.seed,
204 | 'usepytorch': params.usepytorch,
205 | 'classifier': params.classifier,
206 | 'nhid': params.nhid}
207 | clf = SplitClassifier(X={'train': trainF, 'valid': devF, 'test': testF},
208 | y={'train': trainY, 'valid': devY, 'test': testY},
209 | config=config)
210 |
211 | devacc, testacc = clf.run()
212 | logging.debug('\nDev acc : {0} Test acc : {1} for \
213 | SICK entailment\n'.format(devacc, testacc))
214 | return {'devacc': devacc, 'acc': testacc,
215 | 'ndev': len(devA), 'ntest': len(testA)}
216 |
--------------------------------------------------------------------------------
/SentEval/senteval/sts.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | '''
9 | STS-{2012,2013,2014,2015,2016} (unsupervised) and
10 | STS-benchmark (supervised) tasks
11 | '''
12 |
13 | from __future__ import absolute_import, division, unicode_literals
14 |
15 | import os
16 | import io
17 | import numpy as np
18 | import logging
19 |
20 | from scipy.stats import spearmanr, pearsonr
21 |
22 | from senteval.utils import cosine
23 | from senteval.sick import SICKEval
24 |
25 |
26 | class STSEval(object):
27 | def loadFile(self, fpath):
28 | self.data = {}
29 | self.samples = []
30 |
31 | for dataset in self.datasets:
32 | sent1, sent2 = zip(*[l.split("\t") for l in
33 | io.open(fpath + '/STS.input.%s.txt' % dataset,
34 | encoding='utf8').read().splitlines()])
35 | raw_scores = np.array([x for x in
36 | io.open(fpath + '/STS.gs.%s.txt' % dataset,
37 | encoding='utf8')
38 | .read().splitlines()])
39 | not_empty_idx = raw_scores != ''
40 |
41 | gs_scores = [float(x) for x in raw_scores[not_empty_idx]]
42 | sent1 = np.array([s.split() for s in sent1])[not_empty_idx]
43 | sent2 = np.array([s.split() for s in sent2])[not_empty_idx]
44 | # sort data by length to minimize padding in batcher
45 | sorted_data = sorted(zip(sent1, sent2, gs_scores),
46 | key=lambda z: (len(z[0]), len(z[1]), z[2]))
47 | sent1, sent2, gs_scores = map(list, zip(*sorted_data))
48 |
49 | self.data[dataset] = (sent1, sent2, gs_scores)
50 | self.samples += sent1 + sent2
51 |
52 | def do_prepare(self, params, prepare):
53 | if 'similarity' in params:
54 | self.similarity = params.similarity
55 | else: # Default similarity is cosine
56 | self.similarity = lambda s1, s2: np.nan_to_num(cosine(np.nan_to_num(s1), np.nan_to_num(s2)))
57 | return prepare(params, self.samples)
58 |
59 | def run(self, params, batcher):
60 | results = {}
61 | all_sys_scores = []
62 | all_gs_scores = []
63 | for dataset in self.datasets:
64 | sys_scores = []
65 | input1, input2, gs_scores = self.data[dataset]
66 | for ii in range(0, len(gs_scores), params.batch_size):
67 | batch1 = input1[ii:ii + params.batch_size]
68 | batch2 = input2[ii:ii + params.batch_size]
69 |
70 | # we assume get_batch already throws out the faulty ones
71 | if len(batch1) == len(batch2) and len(batch1) > 0:
72 | enc1 = batcher(params, batch1)
73 | enc2 = batcher(params, batch2)
74 |
75 | for kk in range(enc2.shape[0]):
76 | sys_score = self.similarity(enc1[kk], enc2[kk])
77 | sys_scores.append(sys_score)
78 | all_sys_scores.extend(sys_scores)
79 | all_gs_scores.extend(gs_scores)
80 | results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),
81 | 'spearman': spearmanr(sys_scores, gs_scores),
82 | 'nsamples': len(sys_scores)}
83 | logging.debug('%s : pearson = %.4f, spearman = %.4f' %
84 | (dataset, results[dataset]['pearson'][0],
85 | results[dataset]['spearman'][0]))
86 |
87 | weights = [results[dset]['nsamples'] for dset in results.keys()]
88 | list_prs = np.array([results[dset]['pearson'][0] for
89 | dset in results.keys()])
90 | list_spr = np.array([results[dset]['spearman'][0] for
91 | dset in results.keys()])
92 |
93 | avg_pearson = np.average(list_prs)
94 | avg_spearman = np.average(list_spr)
95 | wavg_pearson = np.average(list_prs, weights=weights)
96 | wavg_spearman = np.average(list_spr, weights=weights)
97 | all_pearson = pearsonr(all_sys_scores, all_gs_scores)
98 | all_spearman = spearmanr(all_sys_scores, all_gs_scores)
99 | results['all'] = {'pearson': {'all': all_pearson[0],
100 | 'mean': avg_pearson,
101 | 'wmean': wavg_pearson},
102 | 'spearman': {'all': all_spearman[0],
103 | 'mean': avg_spearman,
104 | 'wmean': wavg_spearman}}
105 | logging.debug('ALL : Pearson = %.4f, \
106 | Spearman = %.4f' % (all_pearson[0], all_spearman[0]))
107 | logging.debug('ALL (weighted average) : Pearson = %.4f, \
108 | Spearman = %.4f' % (wavg_pearson, wavg_spearman))
109 | logging.debug('ALL (average) : Pearson = %.4f, \
110 | Spearman = %.4f\n' % (avg_pearson, avg_spearman))
111 |
112 | return results
113 |
114 |
115 | class STS12Eval(STSEval):
116 | def __init__(self, taskpath, seed=1111):
117 | logging.debug('***** Transfer task : STS12 *****\n\n')
118 | self.seed = seed
119 | self.datasets = ['MSRpar', 'MSRvid', 'SMTeuroparl',
120 | 'surprise.OnWN', 'surprise.SMTnews']
121 | self.loadFile(taskpath)
122 |
123 |
124 | class STS13Eval(STSEval):
125 | # STS13 here does not contain the "SMT" subtask due to LICENSE issue
126 | def __init__(self, taskpath, seed=1111):
127 | logging.debug('***** Transfer task : STS13 (-SMT) *****\n\n')
128 | self.seed = seed
129 | self.datasets = ['FNWN', 'headlines', 'OnWN']
130 | self.loadFile(taskpath)
131 |
132 |
133 | class STS14Eval(STSEval):
134 | def __init__(self, taskpath, seed=1111):
135 | logging.debug('***** Transfer task : STS14 *****\n\n')
136 | self.seed = seed
137 | self.datasets = ['deft-forum', 'deft-news', 'headlines',
138 | 'images', 'OnWN', 'tweet-news']
139 | self.loadFile(taskpath)
140 |
141 |
142 | class STS15Eval(STSEval):
143 | def __init__(self, taskpath, seed=1111):
144 | logging.debug('***** Transfer task : STS15 *****\n\n')
145 | self.seed = seed
146 | self.datasets = ['answers-forums', 'answers-students',
147 | 'belief', 'headlines', 'images']
148 | self.loadFile(taskpath)
149 |
150 |
151 | class STS16Eval(STSEval):
152 | def __init__(self, taskpath, seed=1111):
153 | logging.debug('***** Transfer task : STS16 *****\n\n')
154 | self.seed = seed
155 | self.datasets = ['answer-answer', 'headlines', 'plagiarism',
156 | 'postediting', 'question-question']
157 | self.loadFile(taskpath)
158 |
159 |
160 | class STSBenchmarkEval(STSEval):
161 | def __init__(self, task_path, seed=1111):
162 | logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
163 | self.seed = seed
164 | self.samples = []
165 | train = self.loadFile(os.path.join(task_path, 'sts-train.csv'))
166 | dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv'))
167 | test = self.loadFile(os.path.join(task_path, 'sts-test.csv'))
168 | self.datasets = ['train', 'dev', 'test']
169 | self.data = {'train': train, 'dev': dev, 'test': test}
170 |
171 | def loadFile(self, fpath):
172 | sick_data = {'X_A': [], 'X_B': [], 'y': []}
173 | with io.open(fpath, 'r', encoding='utf-8') as f:
174 | for line in f:
175 | text = line.strip().split('\t')
176 | sick_data['X_A'].append(text[5].split())
177 | sick_data['X_B'].append(text[6].split())
178 | sick_data['y'].append(text[4])
179 |
180 | sick_data['y'] = [float(s) for s in sick_data['y']]
181 | self.samples += sick_data['X_A'] + sick_data["X_B"]
182 | return (sick_data['X_A'], sick_data["X_B"], sick_data['y'])
183 |
184 | class STSBenchmarkFinetune(SICKEval):
185 | def __init__(self, task_path, seed=1111):
186 | logging.debug('\n\n***** Transfer task : STSBenchmark*****\n\n')
187 | self.seed = seed
188 | train = self.loadFile(os.path.join(task_path, 'sts-train.csv'))
189 | dev = self.loadFile(os.path.join(task_path, 'sts-dev.csv'))
190 | test = self.loadFile(os.path.join(task_path, 'sts-test.csv'))
191 | self.sick_data = {'train': train, 'dev': dev, 'test': test}
192 |
193 | def loadFile(self, fpath):
194 | sick_data = {'X_A': [], 'X_B': [], 'y': []}
195 | with io.open(fpath, 'r', encoding='utf-8') as f:
196 | for line in f:
197 | text = line.strip().split('\t')
198 | sick_data['X_A'].append(text[5].split())
199 | sick_data['X_B'].append(text[6].split())
200 | sick_data['y'].append(text[4])
201 |
202 | sick_data['y'] = [float(s) for s in sick_data['y']]
203 | return sick_data
204 |
205 | class SICKRelatednessEval(STSEval):
206 | def __init__(self, task_path, seed=1111):
207 | logging.debug('\n\n***** Transfer task : SICKRelatedness*****\n\n')
208 | self.seed = seed
209 | self.samples = []
210 | train = self.loadFile(os.path.join(task_path, 'SICK_train.txt'))
211 | dev = self.loadFile(os.path.join(task_path, 'SICK_trial.txt'))
212 | test = self.loadFile(os.path.join(task_path, 'SICK_test_annotated.txt'))
213 | self.datasets = ['train', 'dev', 'test']
214 | self.data = {'train': train, 'dev': dev, 'test': test}
215 |
216 | def loadFile(self, fpath):
217 | skipFirstLine = True
218 | sick_data = {'X_A': [], 'X_B': [], 'y': []}
219 | with io.open(fpath, 'r', encoding='utf-8') as f:
220 | for line in f:
221 | if skipFirstLine:
222 | skipFirstLine = False
223 | else:
224 | text = line.strip().split('\t')
225 | sick_data['X_A'].append(text[1].split())
226 | sick_data['X_B'].append(text[2].split())
227 | sick_data['y'].append(text[3])
228 |
229 | sick_data['y'] = [float(s) for s in sick_data['y']]
230 | self.samples += sick_data['X_A'] + sick_data["X_B"]
231 | return (sick_data['X_A'], sick_data["X_B"], sick_data['y'])
232 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Contrastive Learning of Sentence Embeddings from Scratch
2 | This is the official repository for the [paper](https://arxiv.org/abs/2305.15077).
3 |
4 | ```
5 | Contrastive Learning of Sentence Embeddings from Scratch
6 | Junlei Zhang, Zhenzhong Lan, Junxian He
7 | Preprint 2023
8 | ```
9 |
10 | We propose SynCSE, an unsupervised sentence embedding learning approach that trains sentence embeddings from scratch, without any (unlabeled) data samples. Specifically, we use ChatGPT to synthesize the positive and hard negative samples (SynCSE-partial) given unlabeled sentences, or synthesize the unlabeled sentences, positive, and hard negative samples altogether (SynCSE-scratch). We release the synthetic SynCSE-partial and SynCSE-scratch datasets along with the model checkpoints.
11 |
12 | ## Updates
13 |
14 | * [2023-06-02]: We released our model checkpoints and datasets
15 | * [2023-05-23]: We released [our paper](https://arxiv.org/abs/2305.15077). Check it out!
16 |
17 |
18 | ## Quick Links
19 |
20 | - [Model Checkpoints](#model-checkpoints)
21 | - [Datasets](#datasets)
22 | - [Train SynCSE](#train-syncse)
23 | - [Requirements](#requirements)
24 | - [Training](#training)
25 | - [Evaluation](#evaluation)
26 | - [Acknowledgement](#acknowledgement)
27 | - [Citation](#citation)
28 |
29 | ## Model Checkpoints
30 |
31 | We release our model checkpoints on Hugging Face, as listed below:
32 | | Model | Avg. STS |
33 | |:-------------------------------|:--------:|
34 | | [sjtu-lit/SynCSE-partial-RoBERTa-base](https://huggingface.co/sjtu-lit/SynCSE-partial-RoBERTa-base) | 81.84 |
35 | | [sjtu-lit/SynCSE-partial-RoBERTa-large](https://huggingface.co/sjtu-lit/SynCSE-partial-RoBERTa-large) | 82.66 |
36 | | [sjtu-lit/SynCSE-scratch-RoBERTa-base](https://huggingface.co/sjtu-lit/SynCSE-scratch-RoBERTa-base) | 80.66 |
37 | | [sjtu-lit/SynCSE-scratch-RoBERTa-large](https://huggingface.co/sjtu-lit/SynCSE-scratch-RoBERTa-large) | 81.84 |
38 |
39 | The results differ slightly from those reported in the paper because we cleaned the dataset to remove failed generations such as: "I can not generate a paraphrased sentence because the input is ambiguous."
40 |
41 | ### Load and Use the checkpoints
42 | #### Encoding sentences into embeddings
43 | ```python
44 | from simcse import SimCSE
45 | # The SimCSE helper class (simcse/tool.py) provides encode / similarity / build_index / search.
46 | model = SimCSE("sjtu-lit/SynCSE-partial-RoBERTa-large")
47 | embeddings = model.encode("A woman is reading.")
48 | ```
49 |
50 | #### Compute the cosine similarities between two groups of sentences
51 | ```python
52 | sentences_a = ['A woman is reading.', 'A man is playing a guitar.']
53 | sentences_b = ['He plays guitar.', 'A woman is making a photo.']
54 | similarities = model.similarity(sentences_a, sentences_b)
55 | ```
56 |
57 | #### Build index for a group of sentences and search among them
58 | ```python
59 | sentences = ['A woman is reading.', 'A man is playing a guitar.']
60 | model.build_index(sentences)
61 | results = model.search("He plays guitar.")
62 | ```
63 | If you encounter any problems when loading the models directly through Hugging Face's API, you can also download them manually from the table above and use `model = AutoModel.from_pretrained({PATH_TO_THE_DOWNLOADED_MODEL})`.
64 |
65 |
66 | ## Datasets
67 | | Dataset |
68 | |:-------------------------------|
69 | | [sjtu-lit/SynCSE-partial-NLI](https://huggingface.co/datasets/sjtu-lit/SynCSE-partial-NLI) |
70 | | [sjtu-lit/SynCSE-scratch-NLI](https://huggingface.co/datasets/sjtu-lit/SynCSE-scratch-NLI) |
71 |
72 | These two synthetic datasets are respectively used for the SynCSE-partial and SynCSE-scratch experimental setups. For SynCSE-partial, we use the unlabeled data from the NLI dataset used by SimCSE and generate labels for them. For SynCSE-scratch, we generate unlabeled data and their corresponding labels.
73 |
74 | To download the data, take SynCSE-partial as an example:
75 | ```
76 | wget https://huggingface.co/datasets/sjtu-lit/SynCSE-partial-NLI/resolve/main/train.csv
77 | ```
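
If you prefer to inspect or load the file programmatically, here is a minimal sketch using the `datasets` library (the column names are whatever the CSV header defines; none are assumed here):

```python
from datasets import load_dataset

# Load the downloaded CSV as a Hugging Face dataset and inspect its schema.
dataset = load_dataset("csv", data_files="train.csv")["train"]
print(dataset.column_names)
print(dataset[0])
```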
78 |
79 | ## Train SynCSE
80 |
81 | ### Requirements
82 |
83 | First, install PyTorch by following the instructions from [the official website](https://pytorch.org). We use PyTorch `1.13.0+cu116` and train our models on a single A100-80G card.
84 |
85 | Then run the following script to install the remaining dependencies,
86 |
87 | ```bash
88 | pip install -r requirements.txt
89 | ```
90 |
91 | ### Training
92 |
93 | #### Data
94 | You can specify `sjtu-lit/SynCSE-partial-NLI` or `sjtu-lit/SynCSE-scratch-NLI` in `scripts/sup_train_mp.sh`, and the dataset will be downloaded automatically. Alternatively, you can download the SynCSE-partial-NLI or SynCSE-scratch-NLI [datasets](#datasets) yourself and put them into the data folder.
95 |
96 | #### Training scripts
97 |
98 | We provide an example training script for SynCSE in `scripts/sup_train_mp.sh`. Below are explanations of some of its arguments:
99 | * `--model_name_or_path`: Pre-trained checkpoints to start with. For now we support BERT-based models (`bert-base-uncased`, `bert-large-uncased`, etc.) and RoBERTa-based models (`roberta-base`, `roberta-large`, etc.).
100 | * `--temp`: Temperature for the contrastive loss.
101 | * `--pooler_type`: Pooling method. It's the same as the `--pooler_type` in the [evaluation part](#evaluation).
102 | * `--hard_negative_weight`: If using hard negatives (i.e., there are 3 columns in the training file), this is the logarithm of the weight. For example, if the weight is 1, then this argument should be set as 0 (default value).
103 | * `--do_mlm`: Whether to use the MLM auxiliary objective. If True:
104 | * `--mlm_weight`: Weight for the MLM objective.
105 | * `--mlm_probability`: Masking rate for the MLM objective.
106 |
107 | All the other arguments are standard Hugging Face `transformers` training arguments. Some frequently used ones are `--output_dir`, `--learning_rate`, and `--per_device_train_batch_size`.
108 |
109 | For the results in the paper, we use NVIDIA A100 (80G) GPUs with CUDA 11.6. Using different types of devices or different versions of CUDA/other software may lead to slightly different performance.
110 |
111 | #### Hyperparameters
112 |
113 | We use the following hyperparameters for training SynCSE:
114 | - Batch size: 512
115 | - Learning rate (base): 5e-5
116 | - Learning rate (large): 1e-5
117 |
118 | #### Convert models
119 |
120 | Our saved checkpoints are slightly different from Hugging Face's pre-trained checkpoints. Run `python simcse_to_huggingface.py --path {PATH_TO_CHECKPOINT_FOLDER}` to convert a saved checkpoint.
121 |
122 | ### Evaluation
123 | Our evaluation code for sentence embeddings is based on a modified version of [SentEval](https://github.com/facebookresearch/SentEval). It evaluates sentence embeddings on semantic textual similarity (STS) tasks and downstream transfer tasks. For STS tasks, our evaluation uses the "all" setting (system and gold scores from all sub-datasets are pooled before computing the correlation) and reports Spearman's correlation.
124 |
125 | Before evaluation, please download the evaluation datasets by running
126 | ```bash
127 | cd SentEval/data/downstream/
128 | bash download_dataset.sh
129 | ```
130 | Then, back in the root directory, you can evaluate any `transformers`-based pre-trained model with our evaluation code. For example,
131 | ```
132 | bash ./scripts/eval.sh
133 | ```
134 | which is expected to output the results in a tabular format:
135 | ```
136 | ------ test ------
137 | +-------+-------+-------+-------+-------+--------------+-----------------+-------+
138 | | STS12 | STS13 | STS14 | STS15 | STS16 | STSBenchmark | SICKRelatedness | Avg. |
139 | +-------+-------+-------+-------+-------+--------------+-----------------+-------+
140 | | 76.14 | 84.41 | 79.23 | 84.85 | 82.87 | 83.95 | 81.41 | 81.84 |
141 | +-------+-------+-------+-------+-------+--------------+-----------------+-------+
142 | ```
143 |
144 | Arguments for the evaluation script are as follows,
145 |
146 | * `--model_name_or_path`: The name or path of a `transformers`-based pre-trained checkpoint. You can directly use the models in the above table, e.g., `sjtu-lit/SynCSE-scratch-RoBERTa-base`.
147 | * `--pooler`: Pooling method (a short pooling sketch follows this argument list). Now we support
148 | * `cls` (default): Use the representation of `[CLS]` token.
149 | * `avg`: Average embeddings of the last layer. If you use checkpoints of SBERT/SRoBERTa ([paper](https://arxiv.org/abs/1908.10084)), you should use this option.
150 | * `avg_top2`: Average embeddings of the last two layers.
151 | * `avg_first_last`: Average embeddings of the first and last layers. If you use vanilla BERT or RoBERTa, this works the best.
152 | * `--mode`: Evaluation mode
153 | * `test` (default): The default test mode. To faithfully reproduce our results, you should use this option.
154 | * `dev`: Report the development set results. Note that in STS tasks, only `STS-B` and `SICK-R` have development sets, so we only report their numbers. It also takes a fast mode for transfer tasks, so the running time is much shorter than the `test` mode (though numbers are slightly lower).
155 | * `fasttest`: The same as `test`, but transfer tasks run in a fast mode, so the total running time is much shorter (the reported transfer numbers may be slightly lower).
156 | * `--task_set`: What set of tasks to evaluate on (if set, it will override `--tasks`)
157 | * `sts` (default): Evaluate on STS tasks, including `STS 12~16`, `STS-B` and `SICK-R`. This is the most commonly-used set of tasks to evaluate the quality of sentence embeddings.
158 | * `transfer`: Evaluate on transfer tasks.
159 | * `full`: Evaluate on both STS and transfer tasks.
160 | * `na`: Manually set tasks by `--tasks`.
161 | * `--tasks`: Specify which dataset(s) to evaluate on. Will be overridden if `--task_set` is not `na`. See the code for a full list of tasks.
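
For reference, the `avg*` poolers above are masked means over the encoder's hidden states. The sketch below is a compact restatement of the pooling logic in `evaluation.py` (included in this repo); `last_hidden`, `hidden_states`, `pooler_output`, and `attention_mask` come from a standard `transformers` forward pass with `output_hidden_states=True`, and the helper name is only for illustration:

```python
def masked_mean(hidden, attention_mask):
    # hidden: (batch, seq_len, dim); attention_mask: (batch, seq_len), 1 for real tokens
    mask = attention_mask.unsqueeze(-1).type_as(hidden)
    return (hidden * mask).sum(1) / mask.sum(1)

# 'avg'               -> masked_mean(last_hidden, attention_mask)
# 'avg_first_last'    -> masked_mean((hidden_states[1] + hidden_states[-1]) / 2.0, attention_mask)
# 'avg_top2'          -> masked_mean((hidden_states[-2] + hidden_states[-1]) / 2.0, attention_mask)
# 'cls'               -> pooler_output (linear + activation on top of the [CLS] state)
# 'cls_before_pooler' -> last_hidden[:, 0]
```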
162 |
163 | ## Acknowledgement
164 | Our training code is based on the [SimCSE repo](https://github.com/princeton-nlp/SimCSE), and the evaluation code is based on the [SentEval repo](https://github.com/facebookresearch/SentEval).
165 |
166 | ## Bugs or questions?
167 |
168 | If you have any questions related to the code or the paper, feel free to email Junlei (`zhangjunlei@westlake.edu.cn`). If you encounter any problems when using the code or want to report a bug, please open an issue. Try to describe the problem in detail so we can help you better and faster!
169 |
170 | ## Citation
171 |
172 | Please cite our paper if you use SynCSE:
173 |
174 | ```bibtex
175 | @article{zhang2023contrastive,
176 | title={Contrastive Learning of Sentence Embeddings from Scratch},
177 | author={Zhang, Junlei and Lan, Zhenzhong and He, Junxian},
178 | journal={arXiv preprint arXiv:2305.15077},
179 | year={2023}
180 | }
181 | ```
182 |
--------------------------------------------------------------------------------
/SentEval/examples/models.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2017-present, Facebook, Inc.
2 | # All rights reserved.
3 | #
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | #
7 |
8 | """
9 | This file contains the definition of encoders used in https://arxiv.org/pdf/1705.02364.pdf
10 | """
11 |
12 | import numpy as np
13 | import time
14 |
15 | import torch
16 | import torch.nn as nn
17 |
18 |
19 | class InferSent(nn.Module):
20 |
21 | def __init__(self, config):
22 | super(InferSent, self).__init__()
23 | self.bsize = config['bsize']
24 | self.word_emb_dim = config['word_emb_dim']
25 | self.enc_lstm_dim = config['enc_lstm_dim']
26 | self.pool_type = config['pool_type']
27 | self.dpout_model = config['dpout_model']
28 | self.version = 1 if 'version' not in config else config['version']
29 |
30 | self.enc_lstm = nn.LSTM(self.word_emb_dim, self.enc_lstm_dim, 1,
31 | bidirectional=True, dropout=self.dpout_model)
32 |
33 | assert self.version in [1, 2]
34 | if self.version == 1:
35 | self.bos = '<s>'
36 | self.eos = '</s>'
37 | self.max_pad = True
38 | self.moses_tok = False
39 | elif self.version == 2:
40 | self.bos = '<p>'