├── models ├── __init__.py ├── bword2vec.py └── support.py ├── libraries ├── tools │ ├── __init__.py │ ├── ordered_attrs.py │ ├── attr_order_preservation.py │ ├── log.py │ └── word_processor.py ├── utils │ ├── __init__.py │ ├── other.py │ └── paths_and_files.py ├── evaluation │ ├── __init__.py │ ├── GloVe │ │ ├── __init__.py │ │ ├── distance.py │ │ ├── word_analogy.py │ │ ├── README.md │ │ └── evaluate.py │ ├── entailment │ │ ├── __init__.py │ │ └── data │ │ │ └── bench │ │ │ └── baroni2012 │ │ │ ├── data_lex_val copy.tsv │ │ │ ├── data_lex_val.tsv │ │ │ └── data_rnd_val.tsv │ ├── lexsub │ │ ├── __init__.py │ │ ├── jcs │ │ │ ├── __init__.py │ │ │ ├── data │ │ │ │ ├── __init__.py │ │ │ │ ├── pos.py │ │ │ │ ├── tree2conll.py │ │ │ │ ├── conll_line.py │ │ │ │ ├── context_instance.py │ │ │ │ └── embedding.py │ │ │ ├── evaluation │ │ │ │ ├── __init__.py │ │ │ │ ├── lst │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── coinco_split_dev_test.py │ │ │ │ │ ├── special_word_marker.py │ │ │ │ │ ├── extract_lst_candidates.py │ │ │ │ │ ├── lst_gap.py │ │ │ │ │ └── preprocess_lst_test.py │ │ │ │ └── measures │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── generalized_average_precision.py │ │ │ ├── text2numpy.py │ │ │ ├── embedding_inferrer.py │ │ │ ├── jcs_io.py │ │ │ ├── cs_inferrer.py │ │ │ └── context2vec_inferrer.py │ │ ├── main │ │ │ ├── __init__.py │ │ │ ├── simulators_interfaces │ │ │ │ ├── __init__.py │ │ │ │ └── bsg_simulator_interface.py │ │ │ ├── pos.py │ │ │ ├── skipgram_embeddings.py │ │ │ ├── lex_sub.py │ │ │ ├── support.py │ │ │ └── context_instance.py │ │ ├── datasets │ │ │ ├── lst_all.xml │ │ │ ├── lst_all.preprocessed │ │ │ ├── lst_test.preprocessed │ │ │ └── coinco_all.no_problematic.gold │ │ ├── run_lexsub.py │ │ └── README.md │ ├── word_sim │ │ ├── __init__.py │ │ ├── .README.md.swp │ │ ├── data │ │ │ └── word-sim │ │ │ │ ├── EN-MC-30.txt │ │ │ │ ├── EN-RG-65.txt │ │ │ │ ├── EN-YP-130.txt │ │ │ │ ├── EN-VERB-143.txt │ │ │ │ ├── EN-WS-353-SIM.txt │ │ │ │ └── EN-WS-353-REL.txt │ │ ├── read_write.py │ │ ├── wordsim.py │ │ ├── all_wordsim.py │ │ ├── README.md │ │ └── ranking.py │ └── support.py ├── simulators │ ├── __init__.py │ ├── base_simulator.py │ ├── bsg_simulator.py │ └── support.py ├── data_iterators │ ├── __init__.py │ ├── support.py │ ├── base_data_iterator.py │ └── open_text_data_iterator.py ├── __init__.py ├── misc │ ├── __init__.py │ ├── non_linearity.py │ ├── optimizations.py │ └── initializers.py ├── tokenizers │ ├── __init__.py │ ├── standard_tokenizer.py │ └── bsg_tokenizer.py ├── batch_iterators │ ├── __init__.py │ ├── base_batch_iterator.py │ ├── support.py │ ├── sentence_batch_iterator.py │ └── window_batch_iterator.py └── theano_support │ ├── __init__.py │ └── extra.py ├── eval ├── __init__.py ├── example_word_pairs.txt ├── word_pairs_eval.py └── support.py ├── interfaces ├── __init__.py ├── support.py ├── i_bsg.py ├── interface_configurator.py └── i_base.py ├── layers ├── __init__.py ├── custom │ ├── __init__.py │ └── bsg_encoder.py ├── standard │ ├── __init__.py │ ├── embeddings.py │ └── dense.py ├── support.py └── layer.py ├── .gitignore ├── requirements.txt ├── run_bsg.py └── README.md /models/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- 
/libraries/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/simulators/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/evaluation/GloVe/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /eval/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/data_iterators/__init__.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /libraries/evaluation/entailment/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /layers/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/data/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .idea 2 | *.pyc 3 | output 4 | .DS_Store -------------------------------------------------------------------------------- /layers/custom/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | 
-------------------------------------------------------------------------------- /layers/standard/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/misc/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/lst/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/tokenizers/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/batch_iterators/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/measures/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /libraries/theano_support/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/simulators_interfaces/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'topLvl' 2 | -------------------------------------------------------------------------------- /libraries/evaluation/entailment/data/bench/baroni2012/data_lex_val copy.tsv: -------------------------------------------------------------------------------- 1 | mosque castle True 2 | -------------------------------------------------------------------------------- /eval/example_word_pairs.txt: -------------------------------------------------------------------------------- 1 | pet cat 2 | knight human 3 | coffee drink 4 | dog animal 5 | coffee espresso 6 | cow animal 7 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/.README.md.swp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/word_sim/.README.md.swp -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/datasets/lst_all.xml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/lst_all.xml -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | Theano==0.9.0 2 | numpy==1.14.2 3 | nltk==3.2.2, 4 | scipy==0.18.1, 5 | -e 
git+https://github.com/Lasagne/Lasagne.git#egg=Lasagne-0.2.dev1 -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/datasets/lst_all.preprocessed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/lst_all.preprocessed -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/datasets/lst_test.preprocessed: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/lst_test.preprocessed -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/datasets/coinco_all.no_problematic.gold: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/coinco_all.no_problematic.gold -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/pos.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import wordnet 2 | 3 | to_wordnet_pos = {'N':wordnet.NOUN,'J':wordnet.ADJ,'V':wordnet.VERB,'R':wordnet.ADV} 4 | from_lst_pos = {'j':'J','a':'J', 'v':'V', 'n':'N', 'r':'R'} -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/data/pos.py: -------------------------------------------------------------------------------- 1 | from nltk.corpus import wordnet 2 | 3 | to_wordnet_pos = {'N':wordnet.NOUN,'J':wordnet.ADJ,'V':wordnet.VERB,'R':wordnet.ADV} 4 | from_lst_pos = {'j':'J','a':'J', 'v':'V', 'n':'N', 'r':'R'} -------------------------------------------------------------------------------- /libraries/data_iterators/support.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import unicodedata 3 | 4 | # removes/replaces strange symbols like é 5 | def deal_with_accents(str): 6 | return unicodedata.normalize('NFD', str)#.encode('ascii', 'ignore') 7 | 8 | -------------------------------------------------------------------------------- /libraries/tools/ordered_attrs.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | 4 | class OrderedAttrs(): 5 | """ 6 | Makes sure that attributes are stored in the order of assignment. 
7 | 8 | """ 9 | def __init__(self): 10 | self.__dict__ = OrderedDict() 11 | 12 | def __setattr__(self, key, value): 13 | self.__dict__[key] = value -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/data/tree2conll.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from conll_line import ConllLine 3 | 4 | for tree_line in sys.stdin: 5 | tree_line_stripped = tree_line.strip() 6 | if len(tree_line_stripped) > 0: 7 | conll = ConllLine() 8 | conll.from_tree_line(tree_line_stripped) 9 | sys.stdout.write(str(conll)+'\n') 10 | else: 11 | sys.stdout.write('\n') -------------------------------------------------------------------------------- /libraries/tools/attr_order_preservation.py: -------------------------------------------------------------------------------- 1 | # A solution to preserve the order of attribute assignment 2 | from collections import OrderedDict 3 | 4 | 5 | class AttrOrderPreservation: 6 | def __init__(self, obj): 7 | self.obj = obj 8 | self.attr_order = OrderedDict() 9 | 10 | def add_attr(self, name, value): 11 | self.attr_order[name] = value 12 | setattr(self.obj,name, value) -------------------------------------------------------------------------------- /libraries/simulators/base_simulator.py: -------------------------------------------------------------------------------- 1 | from support import load 2 | 3 | 4 | class BaseSimulator: 5 | def __init__(self, vocab, model_file_path): 6 | """ 7 | The passed in the .pkl format model has to have "encode" and "compute_prior_params" methods. 8 | 9 | """ 10 | self.vocab = vocab 11 | self.model = load(model_file_path) 12 | assert hasattr(self.model, 'encode') 13 | -------------------------------------------------------------------------------- /libraries/data_iterators/base_data_iterator.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize as default_tokenizer 2 | 3 | 4 | class BaseDataIterator(): 5 | 6 | def __init__(self, tokenizer=None, input_encoding='utf-8'): 7 | """ 8 | Base data iterator that contains the general set_data_path method 9 | 10 | """ 11 | self.tokenizer = tokenizer if tokenizer else default_tokenizer 12 | self.data_path = None 13 | self.input_encoding = input_encoding 14 | 15 | def set_data_path(self, data_path): 16 | self.data_path = data_path 17 | -------------------------------------------------------------------------------- /libraries/misc/non_linearity.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | 3 | 4 | class NonLinearity(): 5 | def __init__(self, type="linear"): 6 | assert type in ['linear', 'relu', 'sigmoid', 'hard_sigmoid', 'tanh'] 7 | self.type = type 8 | 9 | def __call__(self, x): 10 | if self.type == 'linear': 11 | return x 12 | if self.type == "relu": 13 | return T.nnet.relu(x) 14 | if self.type == 'sigmoid': 15 | return T.nnet.sigmoid(x) 16 | if self.type == 'hard_sigmoid': 17 | return T.nnet.hard_sigmoid(x) 18 | if self.type == 'tanh': 19 | return T.tanh(x) 20 | 21 | 22 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/data/word-sim/EN-MC-30.txt: -------------------------------------------------------------------------------- 1 | car automobile 3.92 2 | gem jewel 3.84 3 | journey voyage 3.84 4 | boy lad 3.76 5 | coast shore 3.70 6 | asylum madhouse 3.61 7 | magician wizard 3.50 8 | 
midday noon 3.42 9 | furnace stove 3.11 10 | food fruit 3.08 11 | bird cock 3.05 12 | bird crane 2.97 13 | tool implement 2.95 14 | brother monk 2.82 15 | lad brother 1.66 16 | crane implement 1.68 17 | journey car 1.16 18 | monk oracle 1.10 19 | cemetery woodland 0.95 20 | food rooster 0.89 21 | coast hill 0.87 22 | forest graveyard 0.84 23 | shore woodland 0.63 24 | monk slave 0.55 25 | coast forest 0.42 26 | lad wizard 0.42 27 | chord smile 0.13 28 | glass magician 0.11 29 | rooster voyage 0.08 30 | noon string 0.08 31 | -------------------------------------------------------------------------------- /layers/support.py: -------------------------------------------------------------------------------- 1 | import theano.tensor as T 2 | import inspect 3 | 4 | def inverse_sigmoid(decay_rate, batch_nr): 5 | """ 6 | Computes inverse sigmoid function's value which is used in schedules sampling. 7 | :param decay_rate: hyper-parameter that controls the decay. 8 | 9 | """ 10 | return decay_rate / (decay_rate + T.exp(batch_nr / decay_rate)) 11 | 12 | 13 | def select_matching_args(func, arguments_dict): 14 | """ 15 | :return a hash with matching the function's arguments dictionary. 16 | """ 17 | func_args = inspect.getargspec(func)[0] 18 | matching_args = {} 19 | for func_arg in func_args: 20 | if func_arg in arguments_dict: 21 | matching_args[func_arg] = arguments_dict[func_arg] 22 | return matching_args -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/simulators_interfaces/bsg_simulator_interface.py: -------------------------------------------------------------------------------- 1 | from libraries.simulators.bsg_simulator import BsgSimulator 2 | from libraries.simulators.support import KL 3 | 4 | 5 | class BsgSimulatorInterface: 6 | """ 7 | Interface class to access simulators. 
8 | 9 | """ 10 | def __init__(self, vocab, model_file_path): 11 | self.simulator = BsgSimulator(vocab=vocab, model_file_path=model_file_path) 12 | 13 | def score(self, target, left_context, right_context, candidates, **kwargs): 14 | scores = {} 15 | mu_q, sigma_q = self.simulator.encode(context_words=left_context+right_context, center_word=target) 16 | for cand in candidates: 17 | mu_p, sigma_p = self.simulator.get_representation(cand) 18 | scores[cand] = -1*KL(mu_q, sigma_q, mu_p, sigma_p) 19 | return scores -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/read_write.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import gzip 3 | import numpy 4 | import math 5 | 6 | from collections import Counter 7 | from operator import itemgetter 8 | 9 | ''' Read all the word vectors and normalize them ''' 10 | def read_word_vectors(filename): 11 | word_vecs = {} 12 | if filename.endswith('.gz'): file_object = gzip.open(filename, 'r') 13 | else: file_object = open(filename, 'r') 14 | 15 | for line_num, line in enumerate(file_object): 16 | line = line.strip().lower() 17 | word = line.split()[0] 18 | word_vecs[word] = numpy.zeros(len(line.split())-1, dtype=float) 19 | for index, vec_val in enumerate(line.split()[1:]): 20 | word_vecs[word][index] = float(vec_val) 21 | ''' normalize weight vector ''' 22 | word_vecs[word] /= math.sqrt((word_vecs[word]**2).sum() + 1e-6) 23 | 24 | sys.stderr.write("Vectors read from: "+filename+" \n") 25 | return word_vecs 26 | -------------------------------------------------------------------------------- /libraries/tools/log.py: -------------------------------------------------------------------------------- 1 | import os 2 | from libraries.utils.paths_and_files import create_folders_if_not_exist 3 | from time import strftime 4 | 5 | 6 | # A general purpose class for logging 7 | class Log(): 8 | def __init__(self, folder): 9 | self.file_path = os.path.join(folder, "log_"+strftime("%b_%d_%H_%M_%S")+'.txt') 10 | create_folders_if_not_exist(self.file_path) 11 | 12 | def write(self, string, also_print=True, include_timestamp=True): 13 | """ 14 | :param string: what string to write to the log 15 | :param also_print: if set to True, will also print to the console 16 | :param include_timestamp: whether include the timestamp 17 | """ 18 | if include_timestamp: 19 | string = "%s [INFO]: %s" % (strftime("%H:%M:%S"), string) 20 | if also_print: 21 | print(string) 22 | with open(self.file_path, "a") as f: 23 | f.write(string+" \n") 24 | -------------------------------------------------------------------------------- /libraries/theano_support/extra.py: -------------------------------------------------------------------------------- 1 | # this file contains functions that are useful for Theano models. 2 | from theano import tensor as T 3 | 4 | def expand_dims(x, dim=-1): 5 | """Add a 1-sized dimension at index "dim". 6 | """ 7 | # TODO: `keras_shape` inference. 
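    # an 'x' entry in a Theano dimshuffle pattern inserts a new broadcastable
    # axis of size 1 at that position, which is how the extra dimension is added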
8 | pattern = [i for i in range(x.type.ndim)] 9 | if dim < 0: 10 | if x.type.ndim == 0: 11 | dim = 0 12 | else: 13 | dim = dim % x.type.ndim + 1 14 | pattern.insert(dim, 'x') 15 | return x.dimshuffle(pattern) 16 | 17 | 18 | def add_one_dim(x, dim=-1): 19 | pattern = list(x.shape) 20 | if dim < 0: 21 | if x.type.ndim == 0: 22 | dim = 0 23 | else: 24 | dim = dim % x.type.ndim + 1 25 | pattern.insert(dim, 1) 26 | return T.reshape(x, pattern) 27 | 28 | def squeeze(x, axis): 29 | """Remove a 1-dimension from the tensor at index "axis". 30 | """ 31 | # TODO: `keras_shape` inference. 32 | shape = list(x.shape) 33 | shape.pop(axis) 34 | return T.reshape(x, tuple(shape)) -------------------------------------------------------------------------------- /layers/standard/embeddings.py: -------------------------------------------------------------------------------- 1 | from layers.layer import Layer 2 | from libraries.theano_support.extra import expand_dims 3 | 4 | 5 | class Embeddings(Layer): 6 | def __init__(self, name, collection_size, output_dim, **kwargs): 7 | Layer.__init__(self, name=name, **kwargs) 8 | self.collection_size = collection_size 9 | self.output_dim = output_dim 10 | self.W = self.add_param("W", shape=(collection_size, output_dim), init_type=self.init_type, 11 | regularizable=self.regularizable) 12 | 13 | def __call__(self, x, mask=None, perform_dimshuffle=True): 14 | """ 15 | :return tensor [batch_size, output_dim, sequence_length] or [batch_size, output_dim] 16 | 17 | """ 18 | # x = Print("x")(x) 19 | res = self.W[x] 20 | if mask: 21 | mask = expand_dims(mask, 2) 22 | res = res * mask 23 | # we want to make sure that output_dims are rows, and words are columns 24 | if res.ndim == 3 and perform_dimshuffle: 25 | res = res.dimshuffle((0, 2, 1)) 26 | return res -------------------------------------------------------------------------------- /layers/standard/dense.py: -------------------------------------------------------------------------------- 1 | from theano import tensor as T 2 | from layers.layer import Layer 3 | from libraries.misc.non_linearity import NonLinearity 4 | 5 | 6 | class Dense(Layer): 7 | def __init__(self, name, input_dim, output_dim, non_linearity='linear', init_type='uniform', regularizable=True): 8 | """ 9 | A simple fully connected layer that performs affine transformations following by a non-linearity. 10 | 11 | """ 12 | Layer.__init__(self, name) 13 | self.non_linearity = NonLinearity(type=non_linearity) 14 | self.W = self.add_param(name="W", shape=(input_dim, output_dim), init_type=init_type, 15 | regularizable=regularizable) 16 | self.b = self.add_param(name="b", shape=(output_dim, ), init_type='zeros') 17 | 18 | def __call__(self, x): 19 | """ 20 | :param x: tensor [batch_size, input_dim] 21 | :return: tensor [batch_size, output_dim] 22 | 23 | """ 24 | output = T.dot(x, self.W) + self.b 25 | output = self.non_linearity(output) 26 | return output 27 | 28 | 29 | 30 | -------------------------------------------------------------------------------- /libraries/batch_iterators/base_batch_iterator.py: -------------------------------------------------------------------------------- 1 | from multiprocessing import Process, Queue 2 | 3 | 4 | class BaseBatchIterator(): 5 | 6 | def __init__(self): 7 | pass 8 | 9 | def __iter__(self): 10 | """ 11 | Separate process pre-loading of data and iteration over its batches. 
12 | 13 | """ 14 | process, queue = self.__i_parallel_load_data_batches() 15 | process.start() 16 | process.deamon = True 17 | while True: 18 | batch = queue.get() 19 | if batch is None: 20 | process.join() 21 | break # the file has ended 22 | yield batch 23 | 24 | def load_data_batches_to_queue(self, queue): 25 | raise NotImplementedError # this has to be assigned in a subclass 26 | 27 | def __i_parallel_load_data_batches(self, queue_size=5): 28 | """ 29 | Loads batches on a separate process. 30 | 31 | """ 32 | queue = Queue(queue_size) 33 | process = Process(target=self.load_data_batches_to_queue, args=(queue, )) 34 | return process, queue 35 | 36 | -------------------------------------------------------------------------------- /libraries/data_iterators/open_text_data_iterator.py: -------------------------------------------------------------------------------- 1 | from nltk import word_tokenize as default_tokenizer 2 | from support import deal_with_accents 3 | from libraries.utils.paths_and_files import get_file_paths 4 | 5 | 6 | class OpenTextDataIterator(): 7 | 8 | def __init__(self, tokenizer=None): 9 | """ 10 | Text data iterator for open text, that returns tokenized sentences. Assumes that each sentence is separated 11 | by a new line. 12 | 13 | """ 14 | self.tokenizer = tokenizer if tokenizer else default_tokenizer 15 | self.data_path = None 16 | 17 | def set_data_path(self, data_path): 18 | self.data_path = data_path 19 | 20 | def __iter__(self): 21 | if not self.data_path: 22 | raise ValueError("please specify the data_path first by calling set_data_path()") 23 | for filename in get_file_paths(self.data_path): 24 | with open(filename) as f: 25 | for line in f: 26 | tokens = self.tokenizer(deal_with_accents(line.strip().lower().decode('utf-8', 'ignore'))) 27 | yield tokens, 28 | -------------------------------------------------------------------------------- /libraries/misc/optimizations.py: -------------------------------------------------------------------------------- 1 | # This file contains learning rate optimizations 2 | import lasagne 3 | 4 | 5 | class LROpt: 6 | def __init__(self, learning_rate): 7 | self.alpha = learning_rate 8 | 9 | 10 | class Adam(LROpt): 11 | def __init__(self, learning_rate, beta1, beta2): 12 | LROpt.__init__(self, learning_rate=learning_rate) 13 | self.beta1 = beta1 14 | self.beta2 = beta2 15 | 16 | def __call__(self, cost, params): 17 | return lasagne.updates.adam(cost, params, learning_rate=self.alpha, beta1=self.beta1, beta2=self.beta2) 18 | 19 | 20 | class SGD(LROpt): 21 | def __init__(self, learning_rate): 22 | LROpt.__init__(self, learning_rate) 23 | 24 | def __call__(self, cost, params): 25 | return lasagne.updates.sgd(cost, params, learning_rate=self.alpha) 26 | 27 | 28 | class AdaGrad(LROpt): 29 | def __init__(self, learning_rate, eps): 30 | LROpt.__init__(self, learning_rate) 31 | self.eps = eps 32 | 33 | def __call__(self, cost, params): 34 | return lasagne.updates.adagrad(cost, params, learning_rate=self.alpha, epsilon=self.eps) 35 | 36 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/wordsim.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from read_write import read_word_vectors 5 | from ranking import * 6 | 7 | 8 | def word_sim(word_vec_file, word_sim_file): 9 | 10 | word_vecs = read_word_vectors(word_vec_file) 11 | print 
'=================================================================================' 12 | print "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" 13 | print '=================================================================================' 14 | 15 | manual_dict, auto_dict = ({}, {}) 16 | not_found, total_size = (0, 0) 17 | for line in open(word_sim_file,'r'): 18 | line = line.strip().lower() 19 | word1, word2, val = line.split() 20 | if word1 in word_vecs and word2 in word_vecs: 21 | manual_dict[(word1, word2)] = float(val) 22 | auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) 23 | else: 24 | not_found += 1 25 | total_size += 1 26 | print "%15s" % str(total_size), "%15s" % str(not_found), 27 | print "%15.4f" % spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict)) 28 | -------------------------------------------------------------------------------- /libraries/simulators/bsg_simulator.py: -------------------------------------------------------------------------------- 1 | from base_simulator import BaseSimulator 2 | import numpy as np 3 | 4 | 5 | class BsgSimulator(BaseSimulator): 6 | """ 7 | This class will both work for classical BSG and BSG with LSTM encoder. 8 | 9 | """ 10 | def __init__(self, **kwargs): 11 | BaseSimulator.__init__(self, **kwargs) 12 | 13 | def get_representation(self, word): 14 | word_id = self.vocab[word].id 15 | mu = self.model.get_word_mu_rep(word_id) 16 | sigma = self.model.get_word_sigma_rep(word_id) 17 | return mu, sigma 18 | 19 | def encode(self, center_word, context_words): 20 | """ 21 | :param center_word: int 22 | :param context_words: vector of ints 23 | """ 24 | # convert to vocab_ids 25 | center_word_id = np.int32(self.vocab[center_word].id) 26 | context_word_ids = np.array([obj.id for obj in self.vocab[context_words]], dtype="int32") 27 | # generate the mask of ones 28 | mask = np.ones([1, len(context_words)], dtype="float32") 29 | mu, sigma = self.model.encode([context_word_ids], [center_word_id], mask) 30 | return mu[0], sigma[0] 31 | -------------------------------------------------------------------------------- /libraries/tools/word_processor.py: -------------------------------------------------------------------------------- 1 | # word processors that use regular expressions to clean words and tokenization 2 | try: 3 | import re2 as re 4 | except ImportError: 5 | import re 6 | 7 | 8 | class WordProcessor: 9 | def __init__(self, word_processor_type='default'): 10 | self.__allowed_types = ['none', 'default', 'open_text'] 11 | # sanity checks for input 12 | assert word_processor_type in self.__allowed_types 13 | # assigning processing function 14 | if word_processor_type == 'none': 15 | self.__call__ = lambda x: x 16 | if word_processor_type == 'default': 17 | self.__call__ = lambda word: re.sub(r'[^\w_,.?@!$#\':\/\-()]|[,\'?@$#]{2,}', "", word) 18 | if word_processor_type == "open_text": 19 | self.__call__ = self.__open_text_cleaner 20 | 21 | @staticmethod 22 | def __open_text_cleaner(word): 23 | """ 24 | Direct copy from the original BSG setup. 
The tokens matching logic was moved to bsg_tokenizer.py 25 | 26 | """ 27 | word = re.sub(r'[^\w\'\-]|[\'\-\_]{2,}', "", word) 28 | if len(word) == 1: 29 | word = re.sub(r'[^\daiu]', '', word) 30 | return word -------------------------------------------------------------------------------- /libraries/tokenizers/standard_tokenizer.py: -------------------------------------------------------------------------------- 1 | from libraries.tools.word_processor import WordProcessor 2 | from nltk import word_tokenize as external_tokenizer 3 | 4 | 5 | # A more advanced tokenizer that both tokenizes and cleans textual data 6 | class StandardTokenizer(): 7 | 8 | def __init__(self, word_processor_type='none', use_external_tokenizer=True): 9 | """ 10 | :param use_external_tokenizer: whether to use or word_tokenize tokenizer or rely on the simple splitter(x.split()). 11 | """ 12 | if use_external_tokenizer: 13 | self.tokenizer = external_tokenizer 14 | else: 15 | self.tokenizer = lambda x: x.split() # assuming that data was already tokenized 16 | self.word_processor = WordProcessor(word_processor_type=word_processor_type) 17 | 18 | def __call__(self, sentence): 19 | """ 20 | :param sentence: a string of words 21 | :return: a list of clean tokens 22 | 23 | """ 24 | words = self.tokenizer(sentence) 25 | tokens = [] 26 | for word in words: 27 | token = self.word_processor(word) 28 | if token == "": 29 | continue 30 | tokens.append(token) 31 | return tokens -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/text2numpy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Convert word embeddings from text files to numpy-friendly format 3 | ''' 4 | 5 | import numpy as np 6 | import sys 7 | 8 | 9 | def readVectors(path, header=False): 10 | vectors = {} 11 | with open(path) as input_f: 12 | for i, line in enumerate(input_f.readlines()): 13 | if header and i==0: 14 | continue 15 | if line == "": 16 | continue 17 | tokens = line.strip().split(' ') 18 | vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]]) 19 | return vectors 20 | 21 | inpath = sys.argv[1] 22 | outpath = sys.argv[2] 23 | header = True if sys.argv[3]=="True" else False 24 | 25 | matrix = readVectors(inpath, header=header) 26 | 27 | 28 | print "done reading vectors" 29 | vocab = list(matrix.keys()) 30 | vocab.sort() 31 | with open(outpath+'.vocab', 'w') as output_f: 32 | for word in vocab: 33 | print >>output_f, word, 34 | 35 | new_matrix = np.zeros(shape=(len(vocab), len(matrix[vocab[0]])), dtype=np.float32) 36 | for i, word in enumerate(vocab): 37 | if i%1000 == 0: 38 | print(i) 39 | new_matrix[i, :] = matrix[word] 40 | 41 | print new_matrix.shape 42 | 43 | np.save(outpath+'.npy', new_matrix) -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/all_wordsim.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | 4 | from read_write import read_word_vectors 5 | from ranking import * 6 | 7 | def all_word_sim(word_vec_file, word_sim_dir): 8 | 9 | word_vecs = read_word_vectors(word_vec_file) 10 | print '=================================================================================' 11 | print "%6s" %"Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho" 12 | print '=================================================================================' 13 | 14 | total_rho = 0 15 | for i, 
filename in enumerate(os.listdir(word_sim_dir)): 16 | manual_dict, auto_dict = ({}, {}) 17 | not_found, total_size = (0, 0) 18 | for line in open(os.path.join(word_sim_dir, filename),'r'): 19 | line = line.strip().lower() 20 | word1, word2, val = line.split() 21 | if word1 in word_vecs and word2 in word_vecs: 22 | manual_dict[(word1, word2)] = float(val) 23 | auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2]) 24 | else: 25 | not_found += 1 26 | total_size += 1 27 | rho = spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict)) 28 | total_rho += rho 29 | print "%6s" % str(i+1), "%20s" % filename, "%15s" % str(total_size), 30 | print "%15s" % str(not_found), 31 | print "%15.4f" % rho 32 | print "Sum of scores: %15.4f" % total_rho 33 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/data/word-sim/EN-RG-65.txt: -------------------------------------------------------------------------------- 1 | gem jewel 3.94 2 | midday noon 3.94 3 | automobile car 3.92 4 | cemetery graveyard 3.88 5 | cushion pillow 3.84 6 | boy lad 3.82 7 | cock rooster 3.68 8 | implement tool 3.66 9 | forest woodland 3.65 10 | coast shore 3.60 11 | autograph signature 3.59 12 | journey voyage 3.58 13 | serf slave 3.46 14 | grin smile 3.46 15 | glass tumbler 3.45 16 | cord string 3.41 17 | hill mound 3.29 18 | magician wizard 3.21 19 | furnace stove 3.11 20 | asylum madhouse 3.04 21 | brother monk 2.74 22 | food fruit 2.69 23 | bird cock 2.63 24 | bird crane 2.63 25 | oracle sage 2.61 26 | sage wizard 2.46 27 | brother lad 2.41 28 | crane implement 2.37 29 | magician oracle 1.82 30 | glass jewel 1.78 31 | cemetery mound 1.69 32 | car journey 1.55 33 | hill woodland 1.48 34 | crane rooster 1.41 35 | furnace implement 1.37 36 | coast hill 1.26 37 | bird woodland 1.24 38 | shore voyage 1.22 39 | cemetery woodland 1.18 40 | food rooster 1.09 41 | forest graveyard 1.00 42 | lad wizard 0.99 43 | mound shore 0.97 44 | automobile cushion 0.97 45 | boy sage 0.96 46 | monk oracle 0.91 47 | shore woodland 0.90 48 | grin lad 0.88 49 | coast forest 0.85 50 | asylum cemetery 0.79 51 | monk slave 0.57 52 | cushion jewel 0.45 53 | boy rooster 0.44 54 | glass magician 0.44 55 | graveyard madhouse 0.44 56 | asylum monk 0.39 57 | asylum fruit 0.19 58 | grin implement 0.18 59 | mound stove 0.14 60 | automobile wizard 0.11 61 | autograph shore 0.06 62 | fruit furnace 0.05 63 | noon string 0.04 64 | rooster voyage 0.04 65 | chord smile 0.02 66 | -------------------------------------------------------------------------------- /run_bsg.py: -------------------------------------------------------------------------------- 1 | # this file contains an example on how to run the bayesian skip-gram model 2 | import os 3 | from interfaces.interface_configurator import InterfaceConfigurator 4 | from libraries.evaluation.support import evaluate 5 | from libraries.evaluation.lexsub.run_lexsub import run_lexsub 6 | 7 | train_data_path = '2M/' # change the path! 8 | vocab_file_path = 'vocabulary/2M.txt' # if the file does not exist - it will be created 9 | output_folder_path = "output/2M/" # change the path(optional) 10 | 11 | # obtain the interface to interact with the model. If one wants to change hyper-param the manual modification of the below class's method will be necessary! 
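# (assumption: the training data folder holds plain-text files with one sentence
#  per line, as read by OpenTextDataIterator in libraries/data_iterators)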
12 | i_model = InterfaceConfigurator.get_interface(train_data_path, vocab_file_path, output_folder_path) 13 | 14 | i_model.train_workflow() 15 | 16 | # store the temporary vocab, because it can be different from the original one(e.g. smaller number of words) 17 | vocab = i_model.vocab 18 | temp_vocab_file_path = os.path.join(i_model.output_path, "vocab.txt") 19 | vocab.write(temp_vocab_file_path) 20 | 21 | mu_vecs = [os.path.join(i_model.output_path, "mu.vectors")] 22 | sigma_vecs = [os.path.join(i_model.output_path, "sigma.vectors")] 23 | 24 | # a complex of word embedding evaluations(word similarity, entailment, directional entailment) 25 | evaluate(mu_vectors_files=mu_vecs, sigma_vectors_files=sigma_vecs, vocab_file=temp_vocab_file_path, log_sigmas=False, 26 | full_sim=True, vocab=vocab) 27 | 28 | # run additionally lexical substitution evaluation 29 | run_lexsub(input_folder=i_model.output_path, output_path=i_model.output_path) 30 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/README.md: -------------------------------------------------------------------------------- 1 | # eval-word-vectors 2 | Manaal Faruqui, mfaruqui@cs.cmu.edu 3 | 4 | Easy-to-use scripts for evaluating word vectors on a variety of tasks. 5 | These are the scripts that run behind the online tool on ```http://www.wordvectors.org/```. 6 | I will be adding more evaluation scripts here over the course of time. 7 | 8 | ### Requirements 9 | 1. Python 2.7 (+numpy package) 10 | 11 | ### Data you need 12 | 1. Word vector file 13 | 2. Any word similarity evaluation file (if you are not using the provided ones) 14 | 15 | Each vector file should have one word vector per line as follows (space delimited):- 16 | 17 | ```the -1.0 2.4 -0.3 ...``` 18 | 19 | ### Evaluating on multiple word sim tasks 20 | 21 | ```python all_wordsim.py word_vec_file word_sim_file_dir``` 22 | 23 | ```python all_wordsim.py skip-gram-vecs.txt data/word-sim/``` 24 | 25 | ### Evaluating on one word sim task 26 | 27 | ```python wordsim.py word_vec_file word_sim_file``` 28 | 29 | ```word_sim_file``` should be in the same format as files in ```data/word-sim/``` 30 | 31 | ### Reference 32 | 33 | Please make sure to cite the papers corresponding to the word similarity dataset that you are using. This 34 | list of citation can be found at ```http://www.wordvectors.org/```. 
35 | 36 | Please cite the following paper if you use this tool: 37 | ``` 38 | @InProceedings{faruqui-2014:SystemDemo, 39 | author = {Faruqui, Manaal and Dyer, Chris}, 40 | title = {Community Evaluation and Exchange of Word Vectors at wordvectors.org}, 41 | booktitle = {Proceedings of ACL: System Demonstrations}, 42 | year = {2014}, 43 | } 44 | ``` 45 | -------------------------------------------------------------------------------- /libraries/utils/other.py: -------------------------------------------------------------------------------- 1 | import operator 2 | from collections import OrderedDict 3 | 4 | # def create_special_symbols_hash(special_symbols): 5 | # new_special_symbols = {} 6 | # for ss in special_symbols: 7 | # new_special_symbols[ss] = "<"+ss+">" 8 | # return new_special_symbols 9 | 10 | 11 | # for float comparison 12 | def is_close(a, b, rel_tol=1e-09, abs_tol=0.0): 13 | return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol) 14 | 15 | 16 | def is_ascii(s): 17 | return all(ord(c) < 128 for c in s) 18 | 19 | 20 | def sort_hash(hash, by_key=True, reverse=True): 21 | if by_key: 22 | indx = 0 23 | else: 24 | indx = 1 25 | return sorted(hash.items(), key=operator.itemgetter(indx), reverse=reverse) 26 | 27 | 28 | def merge_two_dicts(x, y): 29 | z = x.copy() # start with x's keys and values 30 | z.update(y) # modifies z with y's keys and values & returns None 31 | return z 32 | 33 | 34 | def merge_ordered_dicts(*args): 35 | """ 36 | Assuming that each collection is an Ordered dictionary, merges them into one. 37 | 38 | """ 39 | new_params_dict = OrderedDict() 40 | for params in args: 41 | assert isinstance(params, OrderedDict) 42 | for key, value in params.items(): 43 | new_params_dict[key] = value 44 | return new_params_dict 45 | 46 | 47 | def append_to_ordered_dict(initial_dict, param_dict): 48 | """ 49 | Assuming that collection is an Ordered dictionary, appends parameters to the initial one. 
50 | 51 | """ 52 | assert isinstance(param_dict, OrderedDict) 53 | for key, value in param_dict.items(): 54 | initial_dict[key] = value 55 | return initial_dict -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/lst/coinco_split_dev_test.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | 4 | def read_ids(filename): 5 | 6 | ids = set() 7 | with open(filename) as f: 8 | for line in f: 9 | line_id = line.strip() 10 | if len(line_id) > 0: 11 | ids.add(line_id) 12 | return ids 13 | 14 | 15 | if __name__ == '__main__': 16 | 17 | if len(sys.argv)<4: 18 | print "Usage: %s " % sys.argv[0] 19 | sys.exit(1) 20 | 21 | coinco_all = open(sys.argv[1],'r') 22 | coinco_dev = open(sys.argv[1]+'.dev', 'w') 23 | coinco_test = open(sys.argv[1]+'.test', 'w') 24 | dev_ids = read_ids(sys.argv[2]) 25 | test_ids = read_ids(sys.argv[3]) 26 | format = sys.argv[4] 27 | 28 | 29 | ''' 30 | eval format: mission.N 4 1 a mission to end a war 31 | gold format: mission.N 4 :: task 2;plan 2; 32 | ''' 33 | for line in coinco_all: 34 | if len(line.strip()) > 0: 35 | if format == 'eval': 36 | line_id = line.split('\t')[1] 37 | elif format == 'gold': 38 | line_id = line.split('::')[0].strip().split()[-1] 39 | else: 40 | raise Exception('input format unknown: ' + format) 41 | if line_id in dev_ids: 42 | coinco_dev.write(line) 43 | elif line_id in test_ids: 44 | coinco_test.write(line) 45 | else: 46 | print "NOTICE: id {} is neither in dev nor in test".format(line_id) 47 | 48 | coinco_all.close() 49 | coinco_dev.close() 50 | coinco_test.close() 51 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/data/conll_line.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | class ConllLine(): 4 | 5 | 6 | 7 | def root_init(self): 8 | self.id = 0 9 | self.form = '*root*' 10 | self.lemma = '_' 11 | self.cpostag = '_' 12 | self.postag = '_' 13 | self.feats = '_' 14 | self.head = -1 15 | self.deptype = 'rroot' 16 | self.phead = -1 17 | self.pdeptype = '_' 18 | 19 | def __str__( self ): 20 | return '\t'.join([str(self.id), self.form, self.lemma, self.cpostag, self.postag, self.feats, str(self.head), self.deptype, str(self.phead), self.pdeptype]) 21 | 22 | def __init__(self, tokens=None): 23 | if tokens == None: 24 | self.root_init() 25 | else: 26 | self.id = int(tokens[0]) 27 | self.form = tokens[1] 28 | self.lemma = tokens[2] 29 | self.cpostag = tokens[3] 30 | self.postag = tokens[4] 31 | self.feats = tokens[5] 32 | self.head = int(tokens[6]) 33 | self.deptype = tokens[7] 34 | if len(tokens) > 8: 35 | self.phead = -1 if tokens[8] == '_' else int(tokens[8]) 36 | self.pdeptype = tokens[9] 37 | else: 38 | self.phead = -1 39 | self.pdeptype = '_' 40 | 41 | tree_line_extractor = re.compile('([a-z]+)\(.+-(\d+), (.+)-(\d+)\)') 42 | # stanford parser tree output: num(Years-3, Five-1) 43 | def from_tree_line(self, tree_line): 44 | self.root_init() 45 | tok = self.tree_line_extractor.match(tree_line) 46 | self.id = int(tok.group(4)) 47 | self.form = tok.group(3) 48 | self.head = int(tok.group(2)) 49 | self.deptype = tok.group(1) 50 | 51 | 52 | 53 | -------------------------------------------------------------------------------- /layers/layer.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | from libraries.misc.initializers 
import Initializers 4 | import theano 5 | 6 | 7 | class Parameter: 8 | def __init__(self, name, shape, regularizable=False, init_type='uniform'): 9 | self.name = name 10 | self.value = theano.shared(Initializers.init(shape, init_type), name) 11 | self.regularizable = regularizable 12 | 13 | 14 | class Layer: 15 | def __init__(self, name, init_type='xavier_uniform', regularizable=False): 16 | """ 17 | A basic layer parent class of all other layers. 18 | 19 | """ 20 | assert name is not None 21 | self.name = name 22 | self.init_type = init_type 23 | self.regularizable = regularizable 24 | self.params = OrderedDict() 25 | 26 | def get_params_to_reg(self): 27 | """ 28 | Helper function that returns parameters that are necessary to regularize. 29 | 30 | """ 31 | container = OrderedDict() 32 | for name in self.params.keys(): 33 | if self.params[name].regularizable: 34 | container[name] = self.params[name] 35 | return container 36 | 37 | # TODO: think if I need to pass reg, and init_type like that if they are set as attributes already. 38 | def add_param(self, name, shape, regularizable=False, init_type='uniform'): 39 | """ 40 | :param name: parameter's name 41 | :param shape: shape of the parameter 42 | :param regularizable: True/False depending on whether you want it be regularized 43 | :param init_type: what initialization to perform 44 | 45 | """ 46 | assert name is not None 47 | name = "_".join([self.name, name]) 48 | param = Parameter(name, shape=shape, regularizable=regularizable, init_type=init_type) 49 | self.params[name] = param 50 | return param.value -------------------------------------------------------------------------------- /eval/word_pairs_eval.py: -------------------------------------------------------------------------------- 1 | # a console application to evaluate word pairs 2 | from support import KL, cosine_sim, read_vectors_to_dict 3 | import argparse 4 | 5 | 6 | def word_pairs_eval(word_pairs_path, mu_vectors_path, sigma_vectors_path): 7 | """ 8 | :param word_pairs_path: file path that contains lines of the form word1 word2 (space separated) 9 | :param mu_vectors_path: path to the learned mu vectors 10 | :type mu_vectors_path: str 11 | :param sigma_vectors_path: path to the learned sigma vectors 12 | :type sigma_vectors_path: str 13 | 14 | """ 15 | mus_and_sigmas = read_vectors_to_dict(mu_vectors_path, sigma_vectors_path, log_sigmas=False) 16 | 17 | with open(word_pairs_path) as f: 18 | for line in f: 19 | word1, word2 = line.strip().split() 20 | 21 | mu_w1, sigma_w1 = mus_and_sigmas[word1] 22 | mu_w2, sigma_w2 = mus_and_sigmas[word2] 23 | kl1 = KL(mu_w1, sigma_w1, mu_w2, sigma_w2) 24 | kl2 = KL(mu_w2, sigma_w2, mu_w1, sigma_w1) 25 | 26 | print "cos_sim(%s, %s) = %f" % (word1, word2, cosine_sim(mu_w1, mu_w2)) 27 | print "kl(%s, %s) = %f" % (word1, word2, kl1) 28 | print "kl(%s, %s) = %f" % (word2, word1, kl2) 29 | 30 | my_str = "%s entails %s" 31 | if kl1 < kl2: 32 | print my_str % (word1, word2) 33 | else: 34 | print my_str % (word2, word1) 35 | print '---------------------------------------' 36 | 37 | if __name__ == '__main__': 38 | parser = argparse.ArgumentParser(description='Word pairs evaluation using BSG learned representations.') 39 | parser.add_argument('-wpp', '--word_pairs_path', type=str) 40 | parser.add_argument('-mup', '--mu_vectors_path', type=str) 41 | parser.add_argument('-sigmap', '--sigma_vectors_path', type=str) 42 | args = parser.parse_args() 43 | word_pairs_eval(args.word_pairs_path, args.mu_vectors_path, args.sigma_vectors_path) 44 | 
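# Illustrative invocation (file names follow the output written by run_bsg.py):
#   python eval/word_pairs_eval.py -wpp eval/example_word_pairs.txt \
#       -mup output/2M/mu.vectors -sigmap output/2M/sigma.vectors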
-------------------------------------------------------------------------------- /libraries/tokenizers/bsg_tokenizer.py: -------------------------------------------------------------------------------- 1 | from libraries.tools.word_processor import WordProcessor 2 | from nltk import word_tokenize as default_tokenizer 3 | try: 4 | import re2 as re 5 | except ImportError: 6 | import re 7 | 8 | TOKENS = {"URL_TOKEN": "", "FLOAT_TOKEN": ""} 9 | 10 | 11 | class BSGTokenizer: 12 | """ 13 | A more advanced tokenizer that both tokenizes and cleans textual data. It's specifically tailored for BSG models. 14 | 15 | """ 16 | def __init__(self, word_processor_type='none', use_external_tokenizer=True): 17 | """ 18 | :param use_external_tokenizer: whether to use NLTK tokenizer or rely on the simple splitter(x.split()). 19 | """ 20 | if use_external_tokenizer: 21 | self.tokenizer = default_tokenizer 22 | else: 23 | self.tokenizer = lambda x: x.split() # assuming that data was already tokenized 24 | self.word_processor = WordProcessor(word_processor_type=word_processor_type) 25 | 26 | def __call__(self, sentence): 27 | """ 28 | :param sentence: a string of words 29 | :return: a list of clean tokens 30 | 31 | """ 32 | words = self.tokenizer(sentence) 33 | tokens = [] 34 | for word in words: 35 | # check if the word matches some known token 36 | token = self.__match_to_known_token(word) 37 | if not token: 38 | # clean the word otherwise 39 | token = self.word_processor(word) 40 | if token == "": 41 | continue 42 | tokens.append(token) 43 | 44 | return tokens 45 | 46 | @staticmethod 47 | def __match_to_known_token(word): 48 | # URL 49 | if re.match(r"^(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})$", word): 50 | return TOKENS["URL_TOKEN"] 51 | # FLOAT 52 | if re.match(r'^([0-9]+\.)[0-9]+$', word): 53 | return TOKENS["FLOAT_TOKEN"] -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/data/context_instance.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Instance in the Lexical Substitution Task dataset 3 | 4 | ''' 5 | 6 | 7 | from pos import from_lst_pos 8 | 9 | CONTEXT_TEXT_BEGIN_INDEX = 3 10 | TARGET_INDEX = 2 11 | 12 | 13 | 14 | 15 | class ContextInstance(object): 16 | 17 | def __init__(self, line, no_pos_flag): 18 | ''' 19 | Constructor 20 | ''' 21 | self.line = line 22 | tokens1 = line.split("\t") 23 | self.target_ind = int(tokens1[TARGET_INDEX]) 24 | self.words = tokens1[3].split() 25 | self.target = self.words[self.target_ind] 26 | self.full_target_key = tokens1[0] 27 | self.pos = self.full_target_key.split('.')[-1] 28 | self.target_key = '.'.join(self.full_target_key.split('.')[:2]) # remove suffix in cases of bar.n.v 29 | self.target_lemma = self.full_target_key.split('.')[0] 30 | self.target_id = tokens1[1] 31 | 32 | # I don't see why I need this one? 
33 | # if self.pos in from_lst_pos: 34 | # self.pos = from_lst_pos[self.pos] 35 | 36 | self.target_pos = '.'.join([self.target, '*']) if no_pos_flag == True else '.'.join([self.target, self.pos]) 37 | 38 | def get_neighbors(self, window_size): 39 | tokens = self.line.split()[3:] 40 | 41 | if (window_size > 0): 42 | start_pos = max(self.target_ind-window_size, 0) 43 | end_pos = min(self.target_ind+window_size+1, len(tokens)) 44 | else: 45 | start_pos = 0 46 | end_pos = len(tokens) 47 | 48 | neighbors = tokens[start_pos:self.target_ind] + tokens[self.target_ind+1:end_pos] 49 | return neighbors 50 | 51 | def decorate_context(self): 52 | tokens = self.line.split('\t') 53 | words = tokens[CONTEXT_TEXT_BEGIN_INDEX].split() 54 | words[self.target_ind] = '__'+words[self.target_ind]+'__' 55 | tokens[CONTEXT_TEXT_BEGIN_INDEX] = ' '.join(words) 56 | return '\t'.join(tokens)+"\n" -------------------------------------------------------------------------------- /layers/custom/bsg_encoder.py: -------------------------------------------------------------------------------- 1 | from layers.layer import Layer 2 | from layers.standard.embeddings import Embeddings 3 | import theano.tensor as T 4 | from libraries.misc.non_linearity import NonLinearity 5 | from libraries.utils.other import merge_ordered_dicts 6 | 7 | 8 | class BSGEncoder(Layer): 9 | """ 10 | Encoder that is specific to the original BSG version. It uses one input representation of words, and performs 11 | transformation of context and center word representations. 12 | 13 | """ 14 | def __init__(self, name, input_dim, output_dim, collection_size, non_linearity='relu'): 15 | Layer.__init__(self, name=name,init_type='uniform', regularizable=True) 16 | self.non_linearity = NonLinearity(type=non_linearity) 17 | self.embeddings = Embeddings(name="emb_encoder", collection_size=collection_size, output_dim=input_dim) 18 | self.C = self.add_param(name="C", shape=(2*input_dim, output_dim), init_type=self.init_type, 19 | regularizable=self.regularizable) 20 | # store additional params 21 | self.params = merge_ordered_dicts(self.embeddings.params, self.params) 22 | 23 | def __call__(self, context_words, center_words, mask=None): 24 | """ 25 | :param context_words: tensor [batch_size, seq_length] 26 | :param center_words: tensor [batch_size] 27 | :param mask: tensor [batch_size, seq_length] 28 | :return: tensor [batch_size, output_dim] 29 | 30 | """ 31 | b, full_window_size = context_words.shape 32 | 33 | # 0. get representations 34 | repr_center = T.repeat(self.embeddings(center_words).dimshuffle([0, 'x', 1]), full_window_size, axis=1) \ 35 | * mask.dimshuffle([0, 1, "x"]) 36 | repr_context = self.embeddings(context_words, mask, perform_dimshuffle=False) 37 | 38 | # 1. combine representations 39 | repr_common = T.concatenate([repr_center, repr_context], axis=2) 40 | 41 | # 2. 
compute hidden layer by summing common representations 42 | hidden = T.sum(self.non_linearity(T.dot(repr_common, self.C)), axis=1) 43 | 44 | return hidden -------------------------------------------------------------------------------- /interfaces/support.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import sys 3 | from collections import OrderedDict 4 | sys.setrecursionlimit(10000) 5 | 6 | 7 | def save(obj, file_path): 8 | file = open(file_path, 'wb+') 9 | pickle.dump(obj=obj, file=file, protocol=pickle.HIGHEST_PROTOCOL) 10 | 11 | 12 | def load(file_path): 13 | file = open(file_path, 'rb+') 14 | return pickle.load(file) 15 | 16 | 17 | def metrics_to_str(metrics, prefix=""): 18 | return prefix + " " + ", ".join(["%s: %f" % (name, value) for name, value in metrics.items()]) 19 | 20 | 21 | def infer_attributes_to_log(model): 22 | """ 23 | Automatically infers parameters/attributes that should be logged. They are either ints/floats or strings. 24 | 25 | """ 26 | all_attr = model.__dict__ 27 | attr_to_log = OrderedDict() 28 | for attr_name, attr_value in all_attr.items(): 29 | if isinstance(attr_value, (int, str, float, list)): 30 | attr_to_log[attr_name] = attr_value 31 | return attr_to_log 32 | 33 | 34 | def format_experimental_setup(setup): 35 | """ 36 | A specific for experiments writing function, that formats them property and write to the log file. 37 | :param params: a hash of params 38 | """ 39 | st = "" 40 | st += '---------------------------- \n' 41 | st += '---- EXPERIMENT\'S SETUP ---- \n' 42 | for param_name, param_value in setup.iteritems(): 43 | st += param_name + ": " + str(param_value) + '\n' 44 | st += '--------------------------' 45 | return st 46 | 47 | 48 | def compute_loss(iterator, loss_func): 49 | """ 50 | Computes the average loss over the whole dataset that is loaded to the iterator. 51 | 52 | """ 53 | total_loss = 0. 54 | batch_size = iterator.batch_size 55 | datapoints_count = 0. 56 | # TODO: rethink if it's necessary to do all those mathematical manipulations 57 | for counter, batch in enumerate(iterator, 1): 58 | total_loss += loss_func(batch=batch) 59 | datapoints_count += len(batch) 60 | # rescale back as the loss was averaged over nr. 
of datapoints in each batch 61 | total_loss *= (batch_size / datapoints_count) 62 | return total_loss -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/ranking.py: -------------------------------------------------------------------------------- 1 | import math 2 | import numpy 3 | from operator import itemgetter 4 | from numpy.linalg import norm 5 | 6 | EPSILON = 1e-6 7 | 8 | def euclidean(vec1, vec2): 9 | diff = vec1 - vec2 10 | return math.sqrt(diff.dot(diff)) 11 | 12 | def cosine_sim(vec1, vec2): 13 | vec1 += EPSILON * numpy.ones(len(vec1)) 14 | vec2 += EPSILON * numpy.ones(len(vec1)) 15 | return vec1.dot(vec2)/(norm(vec1)*norm(vec2)) 16 | 17 | def assign_ranks(item_dict): 18 | ranked_dict = {} 19 | sorted_list = [(key, val) for (key, val) in sorted(item_dict.items(), 20 | key=itemgetter(1), 21 | reverse=True)] 22 | for i, (key, val) in enumerate(sorted_list): 23 | same_val_indices = [] 24 | for j, (key2, val2) in enumerate(sorted_list): 25 | if val2 == val: 26 | same_val_indices.append(j+1) 27 | if len(same_val_indices) == 1: 28 | ranked_dict[key] = i+1 29 | else: 30 | ranked_dict[key] = 1.*sum(same_val_indices)/len(same_val_indices) 31 | return ranked_dict 32 | 33 | def correlation(dict1, dict2): 34 | avg1 = 1.*sum([val for key, val in dict1.iteritems()])/len(dict1) 35 | avg2 = 1.*sum([val for key, val in dict2.iteritems()])/len(dict2) 36 | numr, den1, den2 = (0., 0., 0.) 37 | for val1, val2 in zip(dict1.itervalues(), dict2.itervalues()): 38 | numr += (val1 - avg1) * (val2 - avg2) 39 | den1 += (val1 - avg1) ** 2 40 | den2 += (val2 - avg2) ** 2 41 | return numr / math.sqrt(den1 * den2) 42 | 43 | def spearmans_rho(ranked_dict1, ranked_dict2): 44 | assert len(ranked_dict1) == len(ranked_dict2) 45 | if len(ranked_dict1) == 0 or len(ranked_dict2) == 0: 46 | return 0. 47 | x_avg = 1.*sum([val for val in ranked_dict1.values()])/len(ranked_dict1) 48 | y_avg = 1.*sum([val for val in ranked_dict2.values()])/len(ranked_dict2) 49 | num, d_x, d_y = (0., 0., 0.) 
50 | for key in ranked_dict1.keys(): 51 | xi = ranked_dict1[key] 52 | yi = ranked_dict2[key] 53 | num += (xi-x_avg)*(yi-y_avg) 54 | d_x += (xi-x_avg)**2 55 | d_y += (yi-y_avg)**2 56 | return num/(math.sqrt(d_x*d_y)) 57 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/lst/special_word_marker.py: -------------------------------------------------------------------------------- 1 | 2 | import sys 3 | import re 4 | 5 | RARE_WORD_TOKEN = '' 6 | NUMERIC_TOKEN = '' 7 | NAME_TOKEN = '' 8 | MAX_COUNT_FOR_NAME = 10000 9 | 10 | # very crude implementation 11 | num_re = re.compile('^[\+\/\:\-,\.\d]*\d[\+\/\:\-,\.\d]*$') 12 | def is_numeric(word_str): 13 | return num_re.match(word_str) != None 14 | 15 | def is_name(word, word_lower, vocab, begin_sentence): 16 | isname = False 17 | if not begin_sentence: 18 | if word[:1].isupper(): 19 | if word_lower not in vocab: 20 | isname = True 21 | else: 22 | count = vocab[word_lower] 23 | if count < MAX_COUNT_FOR_NAME: 24 | isname = True 25 | return isname 26 | 27 | def load_vocabulary(path): 28 | vocab = {} 29 | with open(path, 'r') as f: 30 | for line in f: 31 | if len(line) > 0: 32 | word = line.split('\t')[0].strip() 33 | count = int(line.split('\t')[1]) 34 | vocab[word] = count 35 | return vocab 36 | 37 | 38 | def mark_special_words(words, start_ind, vocab): 39 | for i in xrange(start_ind, len(words)): 40 | if is_numeric(words[i]): 41 | words[i] = NUMERIC_TOKEN 42 | elif is_name(words[i], words[i].lower(), vocab, i==start_ind): 43 | words[i] = NAME_TOKEN 44 | else: 45 | words[i] = words[i].lower() 46 | 47 | 48 | 49 | if __name__ == '__main__': 50 | 51 | if (len(sys.argv) < 2): 52 | print >> sys.stderr, "Usage: %s output" 53 | sys.exit(1) 54 | 55 | vocab = load_vocabulary(sys.argv[1]) 56 | 57 | for line in sys.stdin: 58 | try: 59 | segments = line.split('\t') 60 | words = segments[3].split() 61 | mark_special_words(words, 0, vocab) 62 | print '\t'.join(segments[:3]) + '\t' + ' '.join(words) 63 | except Exception as e: 64 | print >> sys.stderr, e 65 | sys.stderr.write("Can't parse line: %s" % line) 66 | -------------------------------------------------------------------------------- /interfaces/i_bsg.py: -------------------------------------------------------------------------------- 1 | from models.bsg import BSG 2 | from i_base import IBase 3 | from collections import OrderedDict 4 | from libraries.batch_iterators.window_batch_iterator import WindowBatchIterator as BatchIterator 5 | from support import compute_loss 6 | 7 | 8 | class IBSG(IBase): 9 | """ 10 | Interface class that builds on top of the BSG model. Specifically, it wraps the model's methods to easy user access. 
11 | 12 | """ 13 | def __init__(self, data_iterator, vocab, half_window_size=5, nr_neg_samples=5, batch_size=5, 14 | subsampling_threshold=None, **kwargs): 15 | # init the parent object 16 | IBase.__init__(self, vocab=vocab, model_class=BSG, **kwargs) 17 | 18 | # general attributes 19 | self.batch_size = batch_size 20 | self.half_window_size = half_window_size 21 | self.nr_neg_samples = nr_neg_samples 22 | self.subsampling_threshold = subsampling_threshold 23 | 24 | self.init_iterator = lambda data_path: BatchIterator(vocab, data_path, data_iterator, half_window_size=half_window_size, nr_neg_samples=nr_neg_samples, 25 | subsampling_threshold=subsampling_threshold, batch_size=batch_size) 26 | 27 | def _measure_performance(self, data_path): 28 | return {"loss": compute_loss(self.init_iterator(data_path), loss_func=self.loss_func)} 29 | 30 | def _train(self, batch): 31 | mean_margin, mean_kl, avg_log_det = self.model.train(batch.pos_context_words, batch.neg_context_words, 32 | batch.center_words, batch.mask) 33 | return OrderedDict((("margin", mean_margin), ("kl", mean_kl), ("log_det", avg_log_det))) 34 | 35 | def loss_func(self, batch): 36 | return self.model.loss(batch.pos_context_words, batch.neg_context_words, batch.center_words, batch.mask) 37 | 38 | def _post_training_logic(self): 39 | # will execute this code after the training workflow is finished, can contain custom functions, e.g. saving 40 | # of word embeddings 41 | self.model.save_word_vectors(self.vocab, vectors_folder=self.output_path) 42 | self.log.write("Word vectors are saved to: %s" % self.output_path) -------------------------------------------------------------------------------- /libraries/evaluation/entailment/data/bench/baroni2012/data_lex_val.tsv: -------------------------------------------------------------------------------- 1 | mosque castle False 2 | boar spokesperson False 3 | deanery dwelling True 4 | animal artillery False 5 | information waitress False 6 | contest vertebrate False 7 | inhabitant resident False 8 | animal panda False 9 | bookmark performer False 10 | term word True 11 | robin vertebrate True 12 | yesterday day True 13 | pizza food True 14 | nanny adult True 15 | stallion animal True 16 | monastery building True 17 | misfortune catastrophe False 18 | immunology science True 19 | cat drone False 20 | bug animal True 21 | netball game True 22 | building envy False 23 | farmhouse bargain False 24 | screwdriver tool True 25 | trait competitiveness False 26 | food gun False 27 | ruler algebra False 28 | pesticide beard False 29 | washer worker True 30 | oat cereal True 31 | shortcut feeling False 32 | checklist list True 33 | drummer performer True 34 | secret information True 35 | objectivity trait True 36 | arithmetic discipline True 37 | radius fluid False 38 | tutor teacher True 39 | hamster science False 40 | enzyme chick False 41 | gall disease True 42 | saxophonist performer True 43 | pilot worker True 44 | cellulose mineral False 45 | geometry discipline True 46 | folly trait True 47 | hardness consistency True 48 | sailor privateer False 49 | undertaking slalom False 50 | kindergarten institution True 51 | disease colt False 52 | platter publication False 53 | champagne etching False 54 | charity institution True 55 | magnitude radius False 56 | holly tree True 57 | animal robin False 58 | pigeon vertebrate True 59 | vertebrate asp False 60 | security fresco False 61 | building transaction False 62 | marker bookmark False 63 | dragonfly animal True 64 | tench fish True 65 | etching 
discipline False 66 | clothing shirt False 67 | antibiotic drug True 68 | animal affidavit False 69 | aneurysm disorder True 70 | term telly False 71 | trumpeter performer True 72 | clothing karaoke False 73 | love feeling True 74 | doctrine aesthetic False 75 | epistle letter True 76 | eagle animal True 77 | monastery house True 78 | bead jewelry True 79 | carbohydrate molecule True 80 | bridesmaid woman True 81 | overpayment therapist False 82 | infant investor False 83 | hare fillet False 84 | melanoma cancer True 85 | vertebrate worker False 86 | technician relative False 87 | radiotherapy treatment True 88 | -------------------------------------------------------------------------------- /interfaces/interface_configurator.py: -------------------------------------------------------------------------------- 1 | from libraries.data_iterators.open_text_data_iterator import OpenTextDataIterator 2 | from libraries.tools.vocabulary import Vocabulary 3 | from libraries.tokenizers.bsg_tokenizer import BSGTokenizer 4 | from interfaces.i_bsg import IBSG 5 | from libraries.misc.optimizations import Adam 6 | 7 | 8 | class InterfaceConfigurator: 9 | """ 10 | A class for configuring the model's interface. One can alter hyper-params in get_interface. 11 | 12 | """ 13 | def __init__(self): 14 | pass 15 | 16 | @staticmethod 17 | def get_interface(train_data_path, vocab_file_path, output_folder_path=None, params_file_path=None, model_file_path=None): 18 | 19 | # Hyper-parameters 20 | half_window_size = 5 # (one sided) 21 | input_dim = 100 22 | h_dim = 100 # the number of components in the first hidden layers 23 | z_dim = 100 # the number of dimensions of the latent vectors 24 | alpha = 0.0075 # learning rate 25 | subsampling_threshold = None 26 | nr_neg_samples = 10 27 | margin = 5.0 # margin in the hinge loss 28 | epochs = 1 29 | max_vocab_size = 10000 30 | batch_size = 500 31 | 32 | tokenizer = BSGTokenizer(word_processor_type='open_text', use_external_tokenizer=False) 33 | data_iterator = OpenTextDataIterator(tokenizer=tokenizer) 34 | 35 | vocab = Vocabulary(data_iterator, max_size=max_vocab_size, min_count=1) 36 | vocab.load_or_create(vocab_file_path, train_data_path) 37 | vocab.assign_distr() 38 | 39 | lr_opt = Adam(learning_rate=alpha, beta1=0.9, beta2=0.999) 40 | 41 | i_model = IBSG(vocab=vocab, data_iterator=data_iterator, train_data_path=train_data_path, epochs=epochs, 42 | half_window_size=half_window_size, nr_neg_samples=nr_neg_samples, subsampling_threshold=subsampling_threshold, 43 | batch_size=batch_size, output_dir=output_folder_path) 44 | 45 | if model_file_path: 46 | i_model.load_model(model_file_path) 47 | else: 48 | i_model.init_model(vocab_size=len(vocab), input_dim=input_dim, hidden_dim=h_dim, latent_dim=z_dim, lr_opt=lr_opt, margin=margin) 49 | 50 | # load params only if the model was not loaded already 51 | if params_file_path and not model_file_path: 52 | i_model.load_params(params_file_path) 53 | 54 | return i_model 55 | 56 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/lst/extract_lst_candidates.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Used to extract (or pool) substitute candidates for every target type in the LST dataset 3 | ''' 4 | import sys 5 | 6 | if __name__ == '__main__': 7 | 8 | if len(sys.argv)<3: 9 | print "Usage: %s [no-mwe]" % sys.argv[0] 10 | sys.exit(1) 11 | 12 | goldfile = open(sys.argv[1], 'r') 13 | outfile = open(sys.argv[2], 
'w') 14 | 15 | ignore_mwe = False 16 | if (len(sys.argv) > 3): 17 | sys.stderr.write("ignoring multi-word-expressions\n"); 18 | ignore_mwe = True 19 | 20 | good_oneword_inst = 0 21 | target2candidates = {} 22 | # bright.a 5 :: intelligent 3;clever 2;most able 1;capable 1;promising 1;sharp 1;motivated 1; 23 | for line in goldfile: 24 | if len(line)>0: 25 | oneword_in_line = 0 # e.g. ;most able 1; 26 | segments = line.split("::") 27 | if len(segments)>=2: 28 | target = segments[0][:segments[0].strip().rfind(' ')] 29 | target = '.'.join(target.split('.')[:2]) # remove suffix in cases of bar.n.v 30 | line_candidates = segments[1].strip().split(';') 31 | for candidate_count in line_candidates: 32 | if len(candidate_count) > 0: 33 | delimiter_ind = candidate_count.rfind(' ') 34 | candidate = candidate_count[:delimiter_ind] 35 | if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)): 36 | continue 37 | oneword_in_line += 1 38 | if target in target2candidates: 39 | candidates = target2candidates[target] 40 | else: 41 | candidates = set() 42 | target2candidates[target] = candidates 43 | candidates.add(candidate) 44 | if (oneword_in_line >= 1): 45 | good_oneword_inst += 1 46 | 47 | if ignore_mwe: 48 | sys.stderr.write("good_oneword_inst: " + str(good_oneword_inst) + "\n") 49 | for target, candidates in target2candidates.iteritems(): 50 | outfile.write(target + '::' + ';'.join(list(candidates)) + '\n') 51 | 52 | goldfile.close() 53 | outfile.close() 54 | -------------------------------------------------------------------------------- /libraries/evaluation/GloVe/distance.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | 5 | def generate(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 9 | args = parser.parse_args() 10 | 11 | with open(args.vocab_file, 'r') as f: 12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 13 | with open(args.vectors_file, 'r') as f: 14 | vectors = {} 15 | for line in f: 16 | vals = line.rstrip().split(' ') 17 | vectors[vals[0]] = [float(x) for x in vals[1:]] 18 | 19 | vocab_size = len(words) 20 | vocab = {w: idx for idx, w in enumerate(words)} 21 | ivocab = {idx: w for idx, w in enumerate(words)} 22 | 23 | vector_dim = len(vectors[ivocab[0]]) 24 | W = np.zeros((vocab_size, vector_dim)) 25 | for word, v in vectors.items(): 26 | if word == '': 27 | continue 28 | W[vocab[word], :] = v 29 | 30 | # normalize each word vector to unit variance 31 | W_norm = np.zeros(W.shape) 32 | d = (np.sum(W ** 2, 1) ** (0.5)) 33 | W_norm = (W.T / d).T 34 | return (W_norm, vocab, ivocab) 35 | 36 | 37 | def distance(W, vocab, ivocab, input_term): 38 | for idx, term in enumerate(input_term.split(' ')): 39 | if term in vocab: 40 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term])) 41 | if idx == 0: 42 | vec_result = W[vocab[term], :] 43 | else: 44 | vec_result += W[vocab[term], :] 45 | else: 46 | print('Word: %s Out of dictionary!\n' % term) 47 | return 48 | 49 | vec_norm = np.zeros(vec_result.shape) 50 | d = (np.sum(vec_result ** 2,) ** (0.5)) 51 | vec_norm = (vec_result.T / d).T 52 | 53 | dist = np.dot(W, vec_norm.T) 54 | 55 | for term in input_term.split(' '): 56 | index = vocab[term] 57 | dist[index] = -np.Inf 58 | 59 | a = np.argsort(-dist)[:N] 60 | 61 | print("\n Word Cosine distance\n") 62 | 
print("---------------------------------------------------------\n") 63 | for x in a: 64 | print("%35s\t\t%f\n" % (ivocab[x], dist[x])) 65 | 66 | 67 | if __name__ == "__main__": 68 | N = 100; # number of closest words that will be shown 69 | W, vocab, ivocab = generate() 70 | while True: 71 | input_term = raw_input("\nEnter word or sentence (EXIT to break): ") 72 | if input_term == 'EXIT': 73 | break 74 | else: 75 | distance(W, vocab, ivocab, input_term) 76 | 77 | -------------------------------------------------------------------------------- /libraries/misc/initializers.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | np.random.seed(42) 3 | 4 | class Initializers(): 5 | def __init__(self): 6 | pass 7 | 8 | @staticmethod 9 | def init(size, init_type="uniform"): 10 | """ 11 | :param init_type: uniform 12 | :return: initialized numpy matrix with float32: 13 | """ 14 | assert init_type in ['uniform', 'xavier_normal', 'xavier_uniform', 'zeros', 'bsg_log_sigmas'] 15 | fans = compute_fans(size) 16 | if init_type == 'zeros': 17 | return np.zeros(shape=size, dtype="float32") 18 | if init_type == 'uniform': 19 | return np.float32(np.random.uniform(low=-0.05, high=0.05, size=size)) 20 | if init_type == 'xavier_normal': 21 | return np.float32(np.random.normal(0.0, 2./np.sum(fans), size=size)) 22 | if init_type == 'xavier_uniform': 23 | lim = np.sqrt(6.0 / np.sum(fans)) 24 | return np.float32(np.random.uniform(low=-lim, high=lim, size=size)) 25 | if init_type == 'bsg_log_sigmas': 26 | return np.float32(np.random.uniform(low=-3.5, high=-1.5, size=size)) 27 | 28 | 29 | def compute_fans(shape, data_format='channels_last'): 30 | """Computes the number of input and output units for a weight shape. 31 | # Arguments 32 | shape: Integer shape tuple. 33 | data_format: Image data format to use for convolution kernels. 34 | Note that all kernels in Keras are standardized on the 35 | `channels_last` ordering (even when inputs are set 36 | to `channels_first`). 37 | # Returns 38 | A tuple of scalars, `(fan_in, fan_out)`. 39 | # Raises 40 | ValueError: in case of invalid `data_format` argument. 41 | """ 42 | if len(shape) == 2: 43 | fan_in = shape[0] 44 | fan_out = shape[1] 45 | elif len(shape) in {3, 4, 5}: 46 | # Assuming convolution kernels (1D, 2D or 3D). 47 | # TH kernel shape: (depth, input_depth, ...) 48 | # TF kernel shape: (..., input_depth, depth) 49 | if data_format == 'channels_first': 50 | receptive_field_size = np.prod(shape[2:]) 51 | fan_in = shape[1] * receptive_field_size 52 | fan_out = shape[0] * receptive_field_size 53 | elif data_format == 'channels_last': 54 | receptive_field_size = np.prod(shape[:2]) 55 | fan_in = shape[-2] * receptive_field_size 56 | fan_out = shape[-1] * receptive_field_size 57 | else: 58 | raise ValueError('Invalid data_format: ' + data_format) 59 | else: 60 | # No specific assumptions. 
61 | fan_in = np.sqrt(np.prod(shape)) 62 | fan_out = np.sqrt(np.prod(shape)) 63 | return fan_in, fan_out 64 | 65 | 66 | 67 | -------------------------------------------------------------------------------- /libraries/utils/paths_and_files.py: -------------------------------------------------------------------------------- 1 | # contains utility functions that are related to retrieving paths, file names, creating folders, etc 2 | import os 3 | import glob 4 | import errno 5 | 6 | 7 | def get_immediate_subdirectories(a_dir): 8 | return [name for name in os.listdir(a_dir) 9 | if os.path.isdir(os.path.join(a_dir, name))] 10 | 11 | 12 | def get_file_paths(path, return_file_names=False): 13 | """ 14 | :param path: 15 | :return: :rtype: a list of filepaths that are in the folder 16 | """ 17 | if os.path.isdir(path): 18 | paths = glob.glob(path + "/*") 19 | else: 20 | paths = [path] # that means there is only one file 21 | 22 | if return_file_names: 23 | paths = [(p, p.split('/')[-1]) for p in paths] 24 | return paths 25 | 26 | 27 | 28 | def get_subdir_number(path): 29 | """ 30 | Checks the number of subdirectories, and returns it. Useful for automatic output folders generation 31 | """ 32 | if not os.path.exists(path): 33 | return 0 34 | subdirectories = get_immediate_subdirectories(path) 35 | return len(subdirectories) 36 | 37 | 38 | def create_folders_if_not_exist(filename): 39 | if os.path.dirname(filename) and not os.path.exists(os.path.dirname(filename)): 40 | try: 41 | os.makedirs(os.path.dirname(filename)) 42 | except OSError as exc: # Guard against race condition 43 | if exc.errno != errno.EEXIST: 44 | raise 45 | 46 | 47 | def append_to_file(file_path, str): 48 | create_folders_if_not_exist(file_path) 49 | with open(file_path, "a") as f: 50 | f.write(str+" \n") 51 | 52 | 53 | def files_len(path): 54 | file_paths = get_file_paths(path) 55 | total = 0 56 | for file_path in file_paths: 57 | total += files_len(file_path) 58 | return total 59 | 60 | 61 | def file_len(file_path): 62 | with open(file_path) as f: 63 | for i, l in enumerate(f): 64 | pass 65 | return i + 1 66 | 67 | 68 | def count_number_of_tokens(folder_path): 69 | """ 70 | Counts the number of tokens in all files in the folder 71 | """ 72 | nr_tokens = 0 73 | filenames = glob.glob(folder_path + "/*") 74 | for fname in filenames: 75 | with open(fname) as f: 76 | print fname 77 | for sentence in f: 78 | words = sentence.lower().split() 79 | nr_tokens += len(words) 80 | return nr_tokens 81 | 82 | 83 | def merge_text_files(input_folder, output_file_path): 84 | with open(output_file_path, 'w') as output_file: 85 | for file_path in get_file_paths(input_folder): 86 | with open(file_path) as input_file: 87 | for line in input_file: 88 | output_file.write(line) -------------------------------------------------------------------------------- /libraries/evaluation/support.py: -------------------------------------------------------------------------------- 1 | import os 2 | from libraries.evaluation.entailment.entailment import test_entailment, test_directional_entailment 3 | from libraries.evaluation.word_sim.all_wordsim import all_word_sim 4 | from libraries.evaluation.word_sim.wordsim import word_sim 5 | from libraries.evaluation.GloVe.evaluate import glove_evaluate 6 | 7 | 8 | # evaluates vectors on Glove's benchmark and offline wordvectors.org benchmark 9 | # vectors_path : filename or a folder with vectors 10 | # vocab: vocab object, should be passed as there is currently a circular dependency TODO: fix! 
11 | def evaluate(mu_vectors_files, sigma_vectors_files=None, vocab_file=None, vocab=None, 12 | max_count=None, full_sim=False, log_sigmas=False): 13 | 14 | # all similarity tests are performed on mu vectors 15 | for mu_vectors_file in mu_vectors_files: 16 | # https://github.com/mfaruqui/eval-word-vectors 17 | # 1. similarity tests 18 | if full_sim: 19 | sim_folder = os.path.dirname(os.path.realpath(__file__))+"/word_sim/data/word-sim" 20 | # sim_folder = os.path.join(os.getcwd(), "../evaluation/word_sim/data/word-sim") 21 | all_word_sim(mu_vectors_file, sim_folder) 22 | else: 23 | all_sim_file = os.path.dirname(os.path.realpath(__file__))+"/word_sim/data/combined-word-sim/TEST.txt" 24 | # all_sim_file = os.path.join(os.getcwd(), "/../evaluation/word_sim/data/combined-word-sim/TEST.txt") 25 | word_sim(mu_vectors_file, all_sim_file) 26 | # https://github.com/stanfordnlp/GloVe 27 | 28 | # 2. analogical reasoning 29 | if vocab_file is not None: 30 | glove_evaluate(vocab_file, mu_vectors_file, max_count=max_count) 31 | if not sigma_vectors_files: 32 | return 33 | 34 | for mu_vectors_file, sigma_vectors_file in zip(mu_vectors_files, sigma_vectors_files): 35 | # print "mu_vectors_file: %s" % mu_vectors_file 36 | # print "sigma_vectors_file: %s"% sigma_vectors_file 37 | 38 | # 3. KL entailment 39 | for sf in ["kl", "cos", "l2"]: 40 | test_entailment(mu_vectors_path=mu_vectors_file, sigma_vectors_path=sigma_vectors_file, log_sigmas=log_sigmas, 41 | score_func=sf, normalize=False) 42 | 43 | # 4. directional entailment on Baroni 44 | test_directional_entailment(mu_vectors_path=mu_vectors_file, sigma_vectors_path=sigma_vectors_file, 45 | test_path='/data/bench/baroni2012_dir/data.tsv', header=True, vocab=vocab, 46 | log_sigmas=log_sigmas) 47 | # 5. directional entailment on Bless 48 | test_directional_entailment(mu_vectors_path=mu_vectors_file, sigma_vectors_path=sigma_vectors_file, 49 | test_path='/data/bench/bless2011_dir/data.tsv', header=True, vocab=vocab, 50 | log_sigmas=log_sigmas) -------------------------------------------------------------------------------- /libraries/evaluation/GloVe/word_analogy.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import sys 4 | 5 | def generate(): 6 | parser = argparse.ArgumentParser() 7 | parser.add_argument('--vocab_file', default='vocab.txt', type=str) 8 | parser.add_argument('--vectors_file', default='vectors.txt', type=str) 9 | args = parser.parse_args() 10 | 11 | with open(args.vocab_file, 'r') as f: 12 | words = [x.rstrip().split(' ')[0] for x in f.readlines()] 13 | with open(args.vectors_file, 'r') as f: 14 | vectors = {} 15 | for line in f: 16 | vals = line.rstrip().split(' ') 17 | vectors[vals[0]] = [float(x) for x in vals[1:]] 18 | 19 | vocab_size = len(words) 20 | vocab = {w: idx for idx, w in enumerate(words)} 21 | ivocab = {idx: w for idx, w in enumerate(words)} 22 | 23 | vector_dim = len(vectors[ivocab[0]]) 24 | W = np.zeros((vocab_size, vector_dim)) 25 | for word, v in vectors.items(): 26 | if word == '': 27 | continue 28 | W[vocab[word], :] = v 29 | 30 | # normalize each word vector to unit variance 31 | W_norm = np.zeros(W.shape) 32 | d = (np.sum(W ** 2, 1) ** (0.5)) 33 | W_norm = (W.T / d).T 34 | return (W_norm, vocab, ivocab) 35 | 36 | 37 | def distance(W, vocab, ivocab, input_term): 38 | vecs = {} 39 | if len(input_term.split(' ')) < 3: 40 | print("Only %i words were entered.. 
three words are needed at the input to perform the calculation\n" % len(input_term.split(' '))) 41 | return 42 | else: 43 | for idx, term in enumerate(input_term.split(' ')): 44 | if term in vocab: 45 | print('Word: %s Position in vocabulary: %i' % (term, vocab[term])) 46 | vecs[idx] = W[vocab[term], :] 47 | else: 48 | print('Word: %s Out of dictionary!\n' % term) 49 | return 50 | 51 | vec_result = vecs[1] - vecs[0] + vecs[2] 52 | 53 | vec_norm = np.zeros(vec_result.shape) 54 | d = (np.sum(vec_result ** 2,) ** (0.5)) 55 | vec_norm = (vec_result.T / d).T 56 | 57 | dist = np.dot(W, vec_norm.T) 58 | 59 | for term in input_term.split(' '): 60 | index = vocab[term] 61 | dist[index] = -np.Inf 62 | 63 | a = np.argsort(-dist)[:N] 64 | 65 | print("\n Word Cosine distance\n") 66 | print("---------------------------------------------------------\n") 67 | for x in a: 68 | print("%35s\t\t%f\n" % (ivocab[x], dist[x])) 69 | 70 | 71 | if __name__ == "__main__": 72 | N = 100; # number of closest words that will be shown 73 | W, vocab, ivocab = generate() 74 | while True: 75 | input_term = raw_input("\nEnter three words (EXIT to break): ") 76 | if input_term == 'EXIT': 77 | break 78 | else: 79 | distance(W, vocab, ivocab, input_term) 80 | 81 | -------------------------------------------------------------------------------- /eval/support.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | 4 | def KL(mu_q, sigma_q, mu_p, sigma_p, debug=False): 5 | """ 6 | Kullback Leibler divergence implementation in numpy. Assumes [batch_size x z_dimension ] or [latent_dim, ] inputs. 7 | 8 | """ 9 | # adjusting dimensions 10 | flag = False 11 | if len(mu_q.shape) == 1 and len(sigma_q.shape) == 1 and len(mu_p.shape) == 1 and len(sigma_p.shape) == 1: 12 | mu_q = mu_q.reshape((1, -1)) 13 | sigma_q = sigma_q.reshape((1, -1)) 14 | mu_p = mu_p.reshape((1, -1)) 15 | sigma_p = sigma_p.reshape((1, -1)) 16 | flag = True 17 | 18 | kl = KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=debug) 19 | if flag: 20 | kl = kl[0] 21 | return kl 22 | 23 | 24 | def KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=False, eps=1e-8): 25 | k = mu_q.shape[1] 26 | sigma_p_inv = 1./(sigma_p + eps) 27 | trace = k * sigma_p_inv * sigma_q 28 | quadr = sigma_p_inv*np.sum(((mu_p - mu_q)**2), axis=1) 29 | log_det_p = np.log(sigma_p + 1e-10) 30 | log_det_q = np.log(sigma_q + 1e-10) 31 | log_det = k*(log_det_p - log_det_q) 32 | res = 0.5 * (trace + quadr - k + log_det) 33 | 34 | if debug: 35 | print "trace : %s" % str(trace) 36 | print "quadr : %s" % str(quadr) 37 | print 'log_det_p : %s' % str(log_det_p) 38 | print 'log_det_q : %s' % str(log_det_q) 39 | print "log_det : %s" % str(log_det) 40 | print 'res : %s'% str(res) 41 | return res.reshape((-1, )) 42 | 43 | 44 | def cosine_sim(x, y): 45 | return float(np.sum(x*y))/float(np.sqrt(np.sum(x**2)*np.sum(y**2))) 46 | 47 | 48 | def read_vectors_to_dict(mus_file_path, sigmas_file_path, log_sigmas=False, vocab=None, header=False): 49 | dict = {} 50 | with open(mus_file_path) as f: 51 | for i, sentence in enumerate(f): 52 | if header and i==0: 53 | continue 54 | 55 | parts = sentence.strip().split(" ") 56 | word = parts[0] 57 | 58 | # filter words that are not in vocab 59 | if vocab is not None and word not in vocab.word_to_index: 60 | continue 61 | 62 | mu = np.array(parts[1:], dtype="float32") 63 | # normalize it 64 | # mu = mu / (np.sum(mu**2)**0.5) 65 | dict[word] = [mu] 66 | # print len(dict) 67 | with open(sigmas_file_path) as f: 68 | 
for i, sentence in enumerate(f): 69 | if header and i==0: 70 | continue 71 | 72 | parts = sentence.strip().split(" ") 73 | word = parts[0] 74 | 75 | # filter words that are not in vocab 76 | if vocab is not None and word not in vocab.word_to_index: 77 | continue 78 | 79 | sigma = np.array(parts[1:], dtype="float32") 80 | if log_sigmas: 81 | sigma = np.exp(sigma) 82 | dict[word].append(sigma) 83 | return dict -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/run_lexsub.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | sys.path.append(os.path.join(os.getcwd(), "../../")) 3 | from libraries.tools.vocabulary import Vocabulary 4 | from main.simulators_interfaces.bsg_simulator_interface import BsgSimulatorInterface 5 | from main.skipgram_embeddings import Skipgram_Embeddings 6 | from main.support import read_vectors 7 | from main.lex_sub import lex_sub 8 | 9 | 10 | def run_lexsub(input_folder, output_path, half_window_size=5, input_type="normal", embeddings_type="bsg", sg_files_prefix="", 11 | arithm_type="add"): 12 | assert embeddings_type in ['bsg', 'sg'] 13 | assert input_type in ["normal", "dependency"] 14 | assert arithm_type in ["add", "mult"] 15 | # data paths 16 | candidates_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst.gold.candidates" 17 | test_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst_all.preprocessed" 18 | conll_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst_all.conll" 19 | gold_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst_all.gold" 20 | 21 | vocab_file_path = os.path.join(input_folder, 'vocab.txt') 22 | model_file_path = os.path.join(input_folder, 'model.pkl') 23 | vocab = Vocabulary() 24 | vocab.load(vocab_file_path=vocab_file_path) 25 | if embeddings_type == "bsg": 26 | embeddings = BsgSimulatorInterface(model_file_path=model_file_path, vocab=vocab) 27 | else: 28 | # default skipgram 29 | # two cases : either we have mu.vectors or prefix_input.vectors and prefix_output.vectors 30 | input_file_name = "_".join([sg_files_prefix, 'input'])+'.vectors' if sg_files_prefix != "" else "input.vectors" 31 | output_file_name = "_".join([sg_files_prefix, 'output'])+'.vectors' if sg_files_prefix != "" else "output.vectors" 32 | input_vectors_file_path = os.path.join(input_folder, input_file_name) 33 | output_vectors_file_path = os.path.join(input_folder, output_file_name) 34 | if not os.path.exists(input_vectors_file_path) or not os.path.exists(output_vectors_file_path): 35 | input_vectors_file_path = os.path.join(input_folder, 'mu.vectors') 36 | output_vectors_file_path = input_vectors_file_path 37 | embeddings = Skipgram_Embeddings(target_word_embeddings=read_vectors(input_vectors_file_path), 38 | context_embeddings=read_vectors(output_vectors_file_path)) 39 | 40 | # extract word_to_index vectors from vocab 41 | word_to_index = {obj.token:obj.id for obj in vocab} 42 | target_words_vocab = word_to_index 43 | context_words_vocab = word_to_index 44 | 45 | print ' running lexical substitution evaluation' 46 | # run lexical substitution 47 | lex_sub(embeddings=embeddings, input_type=input_type, target_words_vocab=target_words_vocab, 48 | context_words_vocab=context_words_vocab, candidates_file=candidates_file, conll_file=conll_file, test_file=test_file, 49 | gold_file=gold_file, half_window_size=half_window_size, output_path=output_path, arithm_type=arithm_type) 50 | 51 | 52 | 
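Note that **run_lexsub.py** above only defines the `run_lexsub` entry function (no `__main__` block is shown), so, purely as a hedged illustration, it could be driven from a small script such as the sketch below. The model directory and output path are placeholders, and because the script appends a path relative to the current working directory to `sys.path`, the sketch assumes it is launched from `libraries/evaluation/lexsub/`.

```
# Hypothetical driver sketch (not part of the repository); paths are placeholders.
# Assumed to be run from libraries/evaluation/lexsub/ so that the local imports resolve.
from run_lexsub import run_lexsub

run_lexsub(input_folder="/path/to/trained_model_dir",  # assumed to contain vocab.txt and model.pkl
           output_path="/path/to/lexsub_results",
           half_window_size=5,
           input_type="normal",
           embeddings_type="bsg",   # "sg" switches to plain skip-gram vector files
           arithm_type="add")       # or "mult"
```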
-------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/skipgram_embeddings.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from support import cosine_sim, pos_cosine_sim_normed 3 | 4 | class Skipgram_Embeddings(): 5 | 6 | def __init__(self, target_word_embeddings, context_embeddings): 7 | self.w_emb = target_word_embeddings 8 | self.c_emb = context_embeddings 9 | 10 | # performs the scoring of candidates embeddings given 11 | # returns a dictionary of scores 12 | def score(self, target, context, candidates, ar_type="add", i=-1): 13 | assert ar_type in ["add", "mult"] 14 | scores = {} 15 | if ar_type == "add": 16 | cont_repr = self.__repr_context_add(target, context, avg_flag=True) 17 | for cand in candidates: 18 | if ar_type == "add": 19 | scores[cand] = self.__add(cont_repr, cand=self.w_emb[cand]) 20 | elif ar_type == "mult": 21 | scores[cand] = self.__mult(target, context, cand, geo_mean_flag=True) 22 | return scores 23 | 24 | # Oren's strange add operation 25 | def __add(self, repr, cand): 26 | return np.dot(repr, cand) 27 | 28 | def __repr_context_add(self, target, deps, avg_flag=True): 29 | target_vec = None if target is None else np.copy(self.w_emb[target]) 30 | dep_vec = None 31 | deps_found = 0 32 | for dep in deps: 33 | if dep in self.c_emb: 34 | deps_found += 1 35 | if dep_vec is None: 36 | dep_vec = np.copy(self.c_emb[dep]) 37 | else: 38 | dep_vec += self.c_emb[dep] 39 | 40 | ret_vec = None 41 | if target_vec is not None: 42 | ret_vec = target_vec 43 | if dep_vec is not None: 44 | if avg_flag: 45 | dep_vec /= deps_found 46 | if ret_vec is None: 47 | ret_vec = dep_vec 48 | else: 49 | ret_vec += dep_vec 50 | 51 | norm = (ret_vec.dot(ret_vec.transpose()))**0.5 52 | ret_vec /= norm 53 | 54 | return ret_vec 55 | 56 | def __mult(self, target, deps, subsitute, geo_mean_flag=True): 57 | target_vec = self.w_emb[target] 58 | subs_vec = self.w_emb[subsitute] 59 | score = pos_cosine_sim_normed(target_vec, subs_vec) 60 | for dep in deps: 61 | if dep in self.c_emb: 62 | dep_vec = self.c_emb[dep] 63 | mult_scores = pos_cosine_sim_normed(dep_vec, subs_vec) 64 | if geo_mean_flag: 65 | mult_scores = mult_scores**(1.0/len(deps)) # TODO: think if you need to fix it because len(deps) +1 should be here 66 | score = np.multiply(score, mult_scores) 67 | return score 68 | 69 | 70 | 71 | # def __add(self, center_word, context, substitute): 72 | # seen = 0 73 | # scr = 0 74 | # if center_word in self.w_emb: 75 | # scr += cosine_sim(self.w_emb[center_word], self.w_emb[substitute]) 76 | # seen += 1 77 | # for c in context: 78 | # if c in self.c_emb: 79 | # scr += cosine_sim(self.c_emb[c], self.w_emb[substitute]) 80 | # seen+=1 81 | # return scr/(seen) 82 | -------------------------------------------------------------------------------- /models/bword2vec.py: -------------------------------------------------------------------------------- 1 | import pickle 2 | import os 3 | import theano 4 | from support import load, write_vectors, kl_spher 5 | from pickle import UnpicklingError 6 | from libraries.tools.ordered_attrs import OrderedAttrs 7 | 8 | ## theano configuration 9 | theano.optimizer_including = 'cudnn' 10 | 11 | 12 | class BWord2Vec(OrderedAttrs): 13 | """ 14 | Base class for the Bayesian Skip-gram model, it contains methods that can be used for multiple variants of BSG. 
15 | 16 | """ 17 | def __init__(self): 18 | OrderedAttrs.__init__(self) 19 | # the following attributes will be initialized in a child object 20 | self.params = None 21 | self.params_full = None 22 | self.repr_types = None 23 | 24 | @staticmethod 25 | def kl(mu_q, sigma_q, mu_p, sigma_p): 26 | """ 27 | The generic Kullback Leibler function that passes arguments to the correct function 28 | 29 | """ 30 | return kl_spher(mu_q, sigma_q, mu_p, sigma_p) 31 | 32 | def save_word_vectors(self, index_to_word, vectors_folder): 33 | """ 34 | Extracts word vectors from different parameters and saves them to a desired vectors_folder destination 35 | :param index_to_word: an array of words from vocab object 36 | :param vectors_folder: a desired destination path where word vectors should be saved 37 | 38 | """ 39 | for name, func in self.repr_types.items(): 40 | write_vectors(index_to_word, os.path.join(vectors_folder, name+".vectors"), func) 41 | 42 | def save_params(self, output_dir, output_file_name='params.pkl'): 43 | """ 44 | Saves parameters via pickle to the output_dir under the specified name. 45 | 46 | """ 47 | f = open(os.path.join(output_dir, output_file_name), 'wb') 48 | for param_name, param in self.params_full.items(): 49 | # get_value() is necessary because param will be a tensor 50 | pickle.dump([param_name, param['values'].get_value()], f) 51 | f.close() 52 | 53 | def load_params(self, file_path, exclude_params=[]): 54 | """ 55 | Loads params from a pickle saved file. The format has to correspond to the one that is used in save_params() 56 | 57 | """ 58 | f = open(file_path, 'rb') 59 | initialized_params = [] 60 | while True: 61 | try: 62 | param_name, param = pickle.load(f) 63 | if param_name in exclude_params: 64 | continue 65 | self.initialize_param(param_name, param) 66 | initialized_params.append(param_name) 67 | except (EOFError, UnpicklingError): 68 | break 69 | f.close() 70 | return initialized_params 71 | 72 | def initialize_param(self, param_name, param_value): 73 | """ 74 | Initializes a parameter with the provided values 75 | :param param_value: a matrix(array) of parameters 76 | 77 | """ 78 | current_params = self.params_full 79 | if param_name not in current_params: 80 | raise ValueError("Could not find the parameter by '%s' name" % param_name) 81 | current_params[param_name]['values'].set_value(param_value) -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/data/word-sim/EN-YP-130.txt: -------------------------------------------------------------------------------- 1 | brag boast 4.000 2 | concoct devise 4.000 3 | divide split 4.000 4 | build construct 4.000 5 | end terminate 4.000 6 | accentuate highlight 4.000 7 | demonstrate show 3.833 8 | solve figure 3.833 9 | consume eat 3.833 10 | position situate 3.833 11 | swear vow 3.833 12 | furnish supply 3.833 13 | merit deserve 3.667 14 | submit yield 3.667 15 | seize take 3.667 16 | spin twirl 3.500 17 | enlarge swell 3.500 18 | swing sway 3.500 19 | circulate distribute 3.500 20 | recognize acknowledge 3.333 21 | resolve settle 3.333 22 | prolong sustain 3.333 23 | tap knock 3.333 24 | block hinder 3.167 25 | arrange plan 3.167 26 | twist curl 3.167 27 | hail acclaim 3.000 28 | dissipate disperse 3.000 29 | approve support 3.000 30 | impose levy 3.000 31 | hasten accelerate 2.833 32 | rap tap 2.833 33 | lean rest 2.833 34 | make earn 2.833 35 | show publish 2.833 36 | sell market 2.833 37 | weave intertwine 2.667 38 | refer direct 2.667 39 | distribute 
commercialize 2.500 40 | twist intertwine 2.500 41 | drain tap 2.500 42 | depict recognize 2.500 43 | build organize 2.500 44 | hail address 2.333 45 | call refer 2.167 46 | swing bounce 2.167 47 | yield seize 2.000 48 | split crush 2.000 49 | challenge yield 2.000 50 | hinder assist 2.000 51 | welcome recognize 2.000 52 | need deserve 1.833 53 | refer explain 1.833 54 | finance build 1.667 55 | expect deserve 1.667 56 | terminate postpone 1.667 57 | yell boast 1.667 58 | swell curl 1.667 59 | rotate situate 1.500 60 | seize request 1.500 61 | approve scorn 1.500 62 | supply consume 1.500 63 | clip twist 1.500 64 | divide figure 1.333 65 | advise furnish 1.333 66 | complain boast 1.333 67 | want deserve 1.333 68 | twist fasten 1.333 69 | swing crash 1.167 70 | make trade 1.167 71 | hinder yield 1.167 72 | build propose 1.167 73 | express figure 1.167 74 | resolve examine 1.167 75 | bruise split 1.167 76 | swing break 1.167 77 | catch consume 1.000 78 | swear explain 1.000 79 | request levy 1.000 80 | arrange study 1.000 81 | relieve hinder 1.000 82 | move swell 1.000 83 | weave print 0.833 84 | swear think 0.833 85 | forget resolve 0.833 86 | supervise concoct 0.833 87 | situate isolate 0.667 88 | explain boast 0.667 89 | ache spin 0.667 90 | evaluate terminate 0.667 91 | recognize succeed 0.667 92 | dilute market 0.667 93 | hasten permit 0.667 94 | scorn yield 0.667 95 | swear describe 0.667 96 | arrange explain 0.667 97 | discard arrange 0.667 98 | list figure 0.667 99 | stamp weave 0.500 100 | market sweeten 0.500 101 | boil tap 0.500 102 | sustain lower 0.500 103 | resolve publicize 0.500 104 | dissipate isolate 0.500 105 | anger approve 0.500 106 | approve boast 0.500 107 | research distribute 0.500 108 | request concoct 0.500 109 | boast yield 0.500 110 | furnish impress 0.333 111 | refine sustain 0.333 112 | acknowledge distribute 0.333 113 | clean concoct 0.333 114 | lean grate 0.333 115 | postpone show 0.333 116 | hail judge 0.333 117 | remember hail 0.333 118 | scrape lean 0.333 119 | sweat spin 0.333 120 | highlight restore 0.333 121 | seize refer 0.167 122 | levy believe 0.167 123 | alter highlight 0.167 124 | refer carry 0.167 125 | empty situate 0.167 126 | flush spin 0.167 127 | shake swell 0.167 128 | imitate highlight 0.167 129 | correlate levy 0.000 130 | refer lean 0.000 131 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/embedding_inferrer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Context insensitive inferrer, based on embeddings similarities 3 | ''' 4 | 5 | import time 6 | 7 | from jcs.cs_inferrer import CsInferrer 8 | from jcs.data.embedding import Embedding 9 | from jcs.jcs_io import vec_to_str 10 | from jcs.data.pos import to_wordnet_pos 11 | from jcs.jcs_io import load_vocabulary_counts 12 | 13 | from nltk.stem.wordnet import WordNetLemmatizer 14 | 15 | 16 | 17 | class EmbeddingInferrer(CsInferrer): 18 | ''' 19 | classdocs 20 | ''' 21 | 22 | 23 | def __init__(self, path, vocabfile, top_inferences_to_analyze): 24 | CsInferrer.__init__(self) 25 | self.embeddings = Embedding(path) 26 | self.top_inferences_to_analyze = top_inferences_to_analyze 27 | 28 | self.w2counts, ignore1, ignore2 = load_vocabulary_counts(vocabfile) 29 | 30 | def new_target_key(self, target_key): 31 | pass 32 | 33 | def find_inferred(self, lst_instance, tfo): 34 | 35 | if lst_instance.target in self.embeddings: 36 | result_vec, deltatime = 
self.embeddings.closest_with_time(lst_instance.target, -1) 37 | else: 38 | result_vec, deltatime = None, 0 39 | 40 | tfo.write("\nDeltatime: %f msec\n" % ((deltatime)*1000)) 41 | self.inference_time(deltatime) 42 | 43 | if (result_vec is not None): 44 | tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n') 45 | else: 46 | tfo.write("Top most similar embeddings: " + " contexts: None\n") 47 | 48 | return result_vec 49 | 50 | 51 | 52 | def filter_inferred(self, result_vec, candidates, pos): 53 | 54 | filtered_results = {} 55 | candidates_found = set() 56 | 57 | if result_vec is not None: 58 | for word, weight in result_vec: 59 | wn_pos = to_wordnet_pos[pos] 60 | lemma = WordNetLemmatizer().lemmatize(word, wn_pos) 61 | if lemma in candidates: 62 | self.add_inference_result(lemma, weight, filtered_results, candidates_found) 63 | if lemma.title() in candidates: # match also capitalized words 64 | self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found) 65 | if word in candidates: # there are a few cases where the candidates are not lemmatized 66 | self.add_inference_result(word, weight, filtered_results, candidates_found) 67 | if word.title() in candidates: # there are a few cases where the candidates are not lemmatized 68 | self.add_inference_result(word.title(), weight, filtered_results, candidates_found) 69 | 70 | 71 | # assign negative weights to candidates with no score; 72 | # they will appear last, sorted according to their unigram count 73 | candidates_left = candidates - candidates_found 74 | for candidate in candidates_left: 75 | count = self.w2counts[candidate] if candidate in self.w2counts else 1 76 | score = -1 - (1.0/count) # between (-1,-2] 77 | filtered_results[candidate] = score 78 | 79 | return filtered_results 80 | 81 | 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Bayesian Skip-gram (BSG) 2 | 3 | This repository contains Theano code for the Bayesian Skip-gram model, COLING 2018. 4 | 5 | [1] **Embedding Words as Distributions with a Bayesian Skip-gram Model**, Arthur Bražinskas, Serhii Havrylov, Ivan Titov, [arxiv](https://arxiv.org/abs/1711.11027) 6 | 7 | The model represents words as Gaussian distributions instead of point estimates, and is capable of learning additional word properties, such as generality, which is 8 | encoded in variances. The instructions below explain how to install and run the model, and how to evaluate word pairs. 9 | 10 | 11 | ## Requirements 12 | - Python 2.7 13 | - Theano 0.9.0 14 | - numpy 1.14.2 15 | - nltk 3.2.2 16 | - scipy 0.18.1 17 | - Lasagne 0.2.dev1 18 | 19 | ## Installation 20 | 21 | First, install the required Python modules, such as Theano and nltk. 22 | 23 | ``` 24 | pip install -r requirements.txt 25 | ``` 26 | 27 | Afterwards, install the necessary NLTK sub-packages. 28 | 29 | ``` 30 | python -m nltk.downloader wordnet 31 | 32 | python -m nltk.downloader punkt 33 | ``` 34 | 35 | ## Running the model 36 | To run the model, please refer to the **run_bsg.py** file, which contains example code for training and evaluating the model. Upon completion of training, 37 | word representations will be saved to the *output* folder. For example, one can use the trained word Gaussian representations (mus and sigmas) as input to the word pairs evaluation.
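For illustration, here is a minimal, hedged sketch (not part of the repository) of how the saved representations could be inspected once training has finished. It only relies on the plain `word value value ...` text format written by `save_word_vectors`; the `output/mu.vectors` and `output/sigma.vectors` paths and the probe words are placeholders.

```
# Sketch only: load saved BSG means and variances and compare two words by
# cosine similarity of their mean vectors. Paths and words are assumptions.
import numpy as np

def load_vectors(path):
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.strip().split(" ")
            vectors[parts[0]] = np.array(parts[1:], dtype="float32")
    return vectors

mus = load_vectors("output/mu.vectors")        # assumed location of the mean vectors
sigmas = load_vectors("output/sigma.vectors")  # variances; used for the KL/entailment scores

def cosine(x, y):
    return float(np.dot(x, y)) / float(np.sqrt(np.dot(x, x) * np.dot(y, y)))

print("cosine(dog, animal) = %f" % cosine(mus["dog"], mus["animal"]))
```

The same two vector files are what **eval/word_pairs_eval.py** (described below) consumes for the similarity, KL, and entailment-directionality scores.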
38 | 39 | ### Data 40 | A small dataset of [15 million tokens](https://drive.google.com/open?id=1QWC2x6qq8KyHFUCgyvVJJoGHexZrw7gO) is available for smoke tests of the setup. Alternatively, a dataset of approximately [1 billion tokens](http://www.statmt.org/lm-benchmark/) is also publicly available. 41 | The dataset originally used in the research is not publicly available, but can be [requested](http://wacky.sslmit.unibo.it/doku.php?id=corpora). 42 | 43 | ## Word pairs evaluation 44 | 45 | One can use the **eval/word_pairs_eval.py** console application as a playground for word pairs evaluation in terms of similarity, Kullback-Leibler divergence, 46 | and entailment directionality. The console application expects paths to the word pairs file and to the mu and sigma vectors (i.e. the word representations). 47 | A word pairs file should contain two words (order does not matter) per line, separated by a space. The latter two files are obtained from a trained BSG model. 48 | Alternatively, [word representations](https://drive.google.com/open?id=1YQQHFV215YjKLlvxpxsKWLm__TlQMw1Q) pre-trained on the 3B-token dataset are available. 49 | 50 | The example command below will evaluate the pairs stored in **eval/example_word_pairs.txt** and output the results to the console. 51 | ``` 52 | python eval/word_pairs_eval.py -wpp eval/example_word_pairs.txt -mup vectors/mu.vectors -sigmap vectors/sigma.vectors 53 | ``` 54 | 55 | 56 | 57 | ## Additional resources used in the project 58 | The lexical substitution benchmark is a modified version of https://github.com/orenmel/lexsub 59 | 60 | 61 | ## Citation 62 | 63 | ``` 64 | @inproceedings{brazinskas-etal-2018-embedding, 65 | title = "Embedding Words as Distributions with a {B}ayesian Skip-gram Model", 66 | author = "Bra{\v{z}}inskas, Arthur and 67 | Havrylov, Serhii and 68 | Titov, Ivan", 69 | booktitle = "Proceedings of the 27th International Conference on Computational Linguistics", 70 | month = aug, 71 | year = "2018", 72 | address = "Santa Fe, New Mexico, USA", 73 | publisher = "Association for Computational Linguistics", 74 | url = "https://www.aclweb.org/anthology/C18-1151", 75 | pages = "1775--1789", 76 | } 77 | 78 | ``` 79 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/jcs_io.py: -------------------------------------------------------------------------------- 1 | import math 2 | import heapq 3 | 4 | STOPWORD_TOP_THRESHOLD = 256 5 | SUBVEC_DIR_SUFFIX = ".DIR" 6 | VOCAB_TOTAL = "" 7 | 8 | def wf2ws(weight): 9 | return '{0:1.5f}'.format(weight) 10 | 11 | 12 | def vec_to_str(subvec, max_n): 13 | 14 | sub_list_sorted = heapq.nlargest(max_n, subvec, key=lambda x: x[1]) 15 | sub_strs = [' '.join([word, wf2ws(weight)]) for word, weight in sub_list_sorted] 16 | return '\t'.join(sub_strs) 17 | 18 | def vec_to_str_generated(subvec, max_n): 19 | 20 | sub_list_sorted = heapq.nlargest(max_n, subvec, key=lambda x: x[1]) 21 | sub_strs = [word for word, weight in sub_list_sorted] 22 | return ';'.join(sub_strs) 23 | 24 | def count_file_lines(filename): 25 | f = open(filename, 'r') 26 | lines_num = sum(1 for line in f) 27 | f.close() 28 | return lines_num 29 | 30 | def to_rank_weights(subvec): 31 | subvec_len = len(subvec) 32 | for i in xrange(0, subvec_len): 33 | subvec[i] = (subvec[i][0], 1.0-float(i)/subvec_len) 34 | 35 | def get_pmi_weights(subvec, w2counts, sum_counts, offset, threshold, normalize=False): 36 | subvec_pmi = [] 37 | norm = 0 38 | for word, prob in subvec: 39 | if prob != 0.0: 40 |
pmi = math.log(prob * sum_counts / w2counts[word])-offset 41 | if pmi>threshold: 42 | subvec_pmi.append((word, pmi)) 43 | norm += pmi**2 44 | 45 | if normalize: 46 | norm = norm**0.5 47 | for i in xrange(0,len(subvec_pmi)): 48 | subvec_pmi[i] = (subvec_pmi[i][0], subvec_pmi[i][1] / norm) 49 | 50 | return subvec_pmi 51 | 52 | 53 | 54 | def extract_word_weight(pair): 55 | tokens = pair.split(' ') 56 | return tokens[0], float(tokens[1]) 57 | 58 | 59 | def load_classes(path): 60 | w2c = {} 61 | max_class_id = 0 62 | with open(path) as f: 63 | for line in f: 64 | tokens = line.split() 65 | word = tokens[0] 66 | class_id = int(tokens[1]) 67 | w2c[word] = class_id 68 | max_class_id = max(max_class_id, class_id) 69 | return w2c, max_class_id+1 70 | 71 | def load_vocabulary_w2i(path): 72 | with open(path) as f: 73 | vocab = [line.split('\t')[0].strip() for line in f if len(line) > 0] 74 | return dict([(a, i) for i, a in enumerate(vocab)]), vocab 75 | 76 | def load_vocabulary_counts(path, factor=1.0): 77 | stop_words = set() 78 | counts = {} 79 | sum = 0 80 | with open(path) as f: 81 | i = 0 82 | for line in f: 83 | if len(line) > 0: 84 | tokens = line.split('\t') 85 | # tokens = line.split(' ') 86 | word = tokens[0].strip() 87 | count = int(tokens[1].strip()) 88 | if (factor != 1.0): 89 | factored_count = int(count**factor) 90 | else: 91 | factored_count = count 92 | counts[word] = factored_count 93 | sum += factored_count 94 | i += 1 95 | # What is this?! 96 | if (i <= STOPWORD_TOP_THRESHOLD): 97 | stop_words.add(word) 98 | total_size = sum #counts[VOCAB_TOTAL] 99 | return counts, total_size, stop_words 100 | 101 | def load_target_counts(path): 102 | counts = {} 103 | with open(path) as f: 104 | for line in f: 105 | if len(line) > 0: 106 | tokens = line.split('\t') 107 | word = tokens[0].strip() 108 | count = int(tokens[1].strip()) 109 | counts[word] = count 110 | return counts -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/README.md: -------------------------------------------------------------------------------- 1 | # Lexical Substitution Evaluation 2 | 3 | This code was used to perform the lexical substitution evaluation described in the following papers: 4 | 5 | **[1] A Simple Word Embedding Model for Lexical Substitution** 6 | Oren Melamud, Omer Levy, Ido Dagan. Workshop on Vector Space Modeling for NLP (VSM), 2015 [[pdf]](http://u.cs.biu.ac.il/~melamuo/publications/melamud_vsm15.pdf). 7 | 8 | **[2] context2vec: Learning Generic Context Embedding with Bidirectional LSTM** 9 | Oren Melamud, Jacob Goldberger, Ido Dagan. CoNLL, 2016 [[pdf]](http://u.cs.biu.ac.il/~melamuo/publications/context2vec_camera_ready.pdf). 10 | 11 | 12 | ## Requirements 13 | 14 | * Python 2.7 15 | * [NLTK 3.0](http://www.nltk.org/)) - optional (only required for the AWE baseline and MSCC evaluation) 16 | * Numpy 17 | * [context2vec](https://github.com/orenmel/context2vec) - for the context2vec evaluation 18 | 19 | ## Datasets 20 | 21 | This repository contains preprocessed data files based on the datasets introduced by the following papers: 22 | 23 | **[3] Semeval-2007 task 10: English lexical substitution task** 24 | Diana McCarthy, Roberto Navigli, SemEval 2007. 25 | (files with the prefix 'lst' under the 'dataset' directory) 26 | 27 | **[4] What substitutes tell us-analysis of an ”all-words” lexical substitution corpus.** 28 | Gerhard Kremer,Katrin Erk, Sebastian Pado, Stefan Thater. EACL, 2014. 
29 | (files with the prefix 'coinco' under the 'dataset' directory) 30 | 31 | ## Evaluating the word embedding model [1] 32 | 33 | * Download the word embeddings, context embeddings from [[here]](http://u.cs.biu.ac.il/~nlp/resources/downloads/lexsub_embeddings/) 34 | * Preprocess the embedding files: 35 | ``` 36 | python jcs/text2numpy.py 37 | python jcs/text2numpy.py 38 | ``` 39 | * To perform the lexical substitution evaluation run (replace the example datasets files and params below as you wish): 40 | ``` 41 | python jcs/jcs_main.py --inferrer emb -vocabfile datasets/ukwac.vocab.lower.min100 -testfile datasets/lst_all.preprocessed -testfileconll datasets/lst_all.conll -candidatesfile datasets/lst.gold.candidates -embeddingpath -embeddingpathc -contextmath mult --debug -resultsfile 42 | ``` 43 | * This will create the following output files: 44 | - \ 45 | - \.ranked 46 | - \.generate.oot 47 | - \.generate.best 48 | * Run the following to compute the candidate ranking GAP score. The results will be written to \. 49 | ``` 50 | python jcs/evaluation/lst/lst_gap.py ~/datasets/lst_all.gold .ranked no-mwe 51 | ``` 52 | * Run the following to compute the OOT and BEST substitute prediction scores. The results will be written to \. score.pl was distributed in [3]. 53 | ``` 54 | perl dataset/score.pl \.generate.oot datasets/lst_all.gold -t oot > \ 55 | ``` 56 | ``` 57 | perl dataset/score.pl \.generate.best datasets/lst_all.gold -t best > \ 58 | ``` 59 | 60 | ## Evaluating the context2vec model [2] 61 | 62 | * See [context2vec](https://github.com/orenmel/context2vec) for how to download or train a \. 63 | * To perform the lexical substitution evaluation run (replace the example datasets files and params below as you wish): 64 | ``` 65 | python jcs/jcs_main.py --inferrer lstm -lstm_config \.params -testfile datasets/lst_all.preprocessed -testfileconll datasets/lst_all.conll -candidatesfile datasets/lst.gold.candidates -contextmath mult -resultsfile --ignoretarget --debug 66 | ``` 67 | * From here, follow the same instructions as in the previous section. 
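As a purely illustrative aside on the GAP score used in the candidate-ranking step above: `jcs/evaluation/measures/generalized_average_precision.py` exposes a `GeneralizedAveragePrecision.calc(gold_vector, evaluated_vector)` static method that compares a gold list of (substitute, weight) pairs against the model's scored candidates. The toy values in the sketch below are made up, and the snippet assumes it is run from the `lexsub` directory so that the `jcs` package is importable.

```
# Toy illustration only: gold substitute counts vs. model scores for one instance.
from jcs.evaluation.measures.generalized_average_precision import GeneralizedAveragePrecision

gold = [["intelligent", 3], ["clever", 2], ["sharp", 1]]      # annotator counts
ranked = [("clever", 0.91), ("sharp", 0.40), ("dull", 0.10)]  # model candidate scores
gap = GeneralizedAveragePrecision.calc(gold, ranked)          # assumed to return the GAP value
print("GAP = %s" % gap)
```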
68 | 69 | 70 | ## License 71 | 72 | Apache 2.0 73 | 74 | 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/data/word-sim/EN-VERB-143.txt: -------------------------------------------------------------------------------- 1 | happen say 0.1900 2 | makes produced 0.7200 3 | established make 0.4800 4 | calling say 0.5300 5 | developed considers 0.2400 6 | organising developed 0.3500 7 | causes allows 0.4800 8 | requiring organising 0.2700 9 | made set 0.3900 10 | gave shows 0.4300 11 | giving produces 0.3600 12 | rise made 0.2300 13 | seemed find 0.3500 14 | form employ 0.1900 15 | refuses protect 0.2300 16 | produced reports 0.3100 17 | gives employed 0.1600 18 | sets starting 0.1900 19 | organising shown 0.2100 20 | works causes 0.3000 21 | using calls 0.1900 22 | causes used 0.1300 23 | affecting works 0.3100 24 | cause starts 0.5300 25 | strike form 0.1400 26 | considering leading 0.2000 27 | refuses refused 0.7900 28 | affected reported 0.2900 29 | show use 0.1800 30 | work allow 0.3100 31 | starts requires 0.1900 32 | gives produced 0.4000 33 | exist produced 0.1900 34 | starts set 0.3100 35 | employed helping 0.1850 36 | affected created 0.2200 37 | produces starts 0.3700 38 | happens produce 0.2300 39 | using employed 0.3300 40 | establishing increasing 0.3000 41 | seemed seems 0.7300 42 | increasing start 0.1800 43 | said considering 0.1600 44 | allow make 0.2700 45 | led established 0.3500 46 | calling reported 0.2600 47 | found taken 0.2500 48 | says set 0.3100 49 | happens continue 0.2200 50 | reducing works 0.1400 51 | shows produced 0.3000 52 | report creating 0.1400 53 | considers says 0.3300 54 | created work 0.1900 55 | affecting protect 0.2400 56 | providing showing 0.5800 57 | happening affected 0.2200 58 | says shows 0.4000 59 | given working 0.1800 60 | affect happen 0.2700 61 | requires exists 0.3200 62 | affecting helping 0.2800 63 | making establishing 0.5900 64 | applies cause 0.2300 65 | found take 0.2600 66 | organising produced 0.2400 67 | providing reported 0.3800 68 | calls help 0.2500 69 | refusing apply 0.1300 70 | setting finds 0.1600 71 | organise used 0.1600 72 | considers allows 0.3000 73 | allowed sets 0.2600 74 | led organised 0.2600 75 | helped starts 0.1800 76 | recognise considered 0.3400 77 | makes increases 0.2500 78 | given showing 0.2900 79 | creating exist 0.3300 80 | happened showed 0.3700 81 | starts refused 0.0700 82 | establish creating 0.3900 83 | employ affect 0.2600 84 | working developing 0.4600 85 | take providing 0.1100 86 | required considers 0.2000 87 | affecting use 0.1800 88 | recognise provide 0.1700 89 | produced provides 0.3000 90 | showed apply 0.2600 91 | setting showing 0.3500 92 | happening shown 0.2100 93 | given use 0.1700 94 | said allow 0.1400 95 | employ applied 0.2200 96 | works dismiss 0.1300 97 | showing showed 0.7000 98 | employ makes 0.3000 99 | take includes 0.1900 100 | refused provide 0.1500 101 | affected apply 0.2200 102 | concerned provides 0.1500 103 | included allows 0.2700 104 | produce lead 0.2300 105 | produce dismiss 0.0700 106 | find given 0.2600 107 | protected exist 0.1300 108 | dismiss finding 0.0900 109 | found happen 0.2100 110 | give working 0.1300 111 | reducing increased 0.0800 112 | take considered 0.1800 113 | employed applied 0.2000 114 | pay includes 0.1500 115 | including refuses 0.1000 116 | strike says 0.1900 117 | report starting 0.1400 118 | included increased 0.1500 119 | continued says 0.1200 120 | affect 
organising 0.1100 121 | establishing gives 0.1300 122 | provide including 0.1900 123 | said included 0.2100 124 | develop makes 0.5600 125 | refusing allows 0.1400 126 | paid seems 0.1300 127 | establishing including 0.2800 128 | seem said 0.2400 129 | seem called 0.1800 130 | happens require 0.2100 131 | use working 0.1900 132 | take leading 0.1300 133 | working worked 0.7800 134 | refusing exist 0.0900 135 | establish requires 0.1800 136 | allows employ 0.1700 137 | increased reported 0.2000 138 | seemed protects 0.1000 139 | seems allowed 0.1300 140 | lead require 0.2000 141 | affect showed 0.2100 142 | pay led 0.1800 143 | made affect 0.2300 144 | employ using 0.3300 145 | -------------------------------------------------------------------------------- /models/support.py: -------------------------------------------------------------------------------- 1 | # this file contains common functions that are used by models 2 | import pickle 3 | import numpy as np 4 | from theano import tensor as T 5 | from libraries.utils.paths_and_files import create_folders_if_not_exist 6 | from pickle import UnpicklingError 7 | from theano.tensor.shared_randomstreams import RandomStreams 8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as MRG_RandomStreams 9 | 10 | seed = 1 11 | r_stream = RandomStreams(seed=seed) 12 | r_gpu_stream = MRG_RandomStreams(seed=seed) 13 | 14 | 15 | def sample_words(self, batch_size, nr_neg_samples): 16 | """ 17 | a function that is used for negative sampling context words, draws sample based on unigram distribution 18 | :param batch_size: 19 | :return: a matrix of size [batch_size x nr_of_negative_samples] 20 | 21 | """ 22 | return self.r_stream.choice(size=(batch_size, nr_neg_samples), replace=True, 23 | a=self.vocab_size, p=self.uni_distr, dtype='int32') 24 | 25 | 26 | def kl_diag(mu_q, sigma_q, mu_p, sigma_p, eps): 27 | """ 28 | Kullback Leibler divergence between two diagonal Gaussians 29 | :return: tensor [batch_size x 1] 30 | 31 | """ 32 | d = mu_q.shape[1] 33 | sigma_p_inv = T.pow(sigma_p + 1e-6, -1) 34 | tra = T.sum(sigma_p_inv * sigma_q, axis=1) 35 | quadr = T.sum(sigma_p_inv * ((mu_p - mu_q)**2), axis=1) 36 | log_det_p = T.sum(T.log(sigma_p), axis=1) 37 | log_det_q = T.sum(T.log(sigma_q + eps), axis=1) 38 | log_det = log_det_p - log_det_q 39 | return 0.5 * (tra + quadr - d + log_det) 40 | 41 | 42 | def kl_spher(mu_q, sigma_q, mu_p, sigma_p): 43 | """ 44 | Kullback Leibler divergence between two spherical Gaussians 45 | :return: tensor [batch_size x 1] 46 | 47 | """ 48 | d = mu_q.shape[1] 49 | sigma_p_inv = (1.0/sigma_p) 50 | tra = d * sigma_q*sigma_p_inv 51 | quadr = sigma_p_inv * T.sum((mu_p - mu_q)**2, axis=1, keepdims=True) 52 | log_det = - d*T.log(sigma_q * sigma_p_inv) 53 | res = 0.5 * (tra + quadr - d + log_det) 54 | return res.reshape((-1, )) 55 | 56 | 57 | def l2_sqrd(x, axis=1): 58 | return T.sum(x**2, axis=axis) 59 | 60 | 61 | # uniform init so far only 62 | def init_weights(size, low_high_factor=100, scale_factor=1.): 63 | """ 64 | 65 | :param size: size of a matrix to initialize 66 | :param low_high_factor: a factor in the initialization (see code) 67 | :return: initialized matrix of the same size as "size" 68 | """ 69 | return np.float32(scale_factor)*np.float32(np.random.uniform(low=-low_high_factor**-0.5, high=low_high_factor**-0.5, size=size)) 70 | 71 | 72 | def init_weights2(size, low_factor=-1, high_factor=1, scale_factor=1.): 73 | """ 74 | similar to init_weights but with decoupled low and high factors 75 | :param size: size of a matrix to 
initialize 76 | :return: initialized matrix of the same size as "size" 77 | 78 | """ 79 | return np.float32(scale_factor)*np.float32(np.random.uniform(low=low_factor, high=high_factor, size=size)) 80 | 81 | 82 | def write_vectors(vocab, file_path, embeddings_function): 83 | """ 84 | # extracts word vectors via embeddings_function and writes them into a file 85 | :param vocab: vocabulary object 86 | :param file_path: where to write vectors 87 | :param embeddings_function: a function that takes word id as input and return a vector embedding 88 | """ 89 | create_folders_if_not_exist(file_path) 90 | with open(file_path, 'w') as output_file: 91 | for word_obj in vocab: 92 | word_vec = embeddings_function(word_obj.id) 93 | output_file.write(word_obj.token + " " + " ".join(str(f) for f in word_vec)+"\n") 94 | 95 | 96 | def load(file_path): 97 | """ 98 | a parameters loading function that is used to pre-loading pre-trained parameters to a model 99 | :param file_path: a path of a file that contains parameters in the format [parm_name:parm] 100 | 101 | """ 102 | f = open(file_path, 'rb') 103 | while True: 104 | try: 105 | name, param = pickle.load(f) 106 | yield name, param 107 | except (EOFError, UnpicklingError): 108 | break 109 | f.close() -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/measures/generalized_average_precision.py: -------------------------------------------------------------------------------- 1 | ''' 2 | See following paper for quick description of GAP: 3 | http://aclweb.org/anthology//P/P10/P10-1097.pdf 4 | ''' 5 | 6 | from operator import itemgetter 7 | from random import shuffle 8 | import copy 9 | 10 | class GeneralizedAveragePrecision(object): 11 | 12 | @staticmethod 13 | def accumulate_score(gold_vector): 14 | accumulated_vector = [] 15 | accumulated_score = 0 16 | for (key, score) in gold_vector: 17 | accumulated_score += float(score) 18 | accumulated_vector.append([key, accumulated_score]) 19 | return accumulated_vector 20 | 21 | 22 | 23 | ''' 24 | gold_vector: a vector of pairs (key, score) representing all valid results 25 | evaluated_vector: a vector of pairs (key, score) representing the results retrieved by the evaluated method 26 | gold_vector and evaluated vector don't need to include the same keys or be in the same length 27 | ''' 28 | 29 | @staticmethod 30 | def calc(gold_vector, evaluated_vector, random=False): 31 | gold_map = {} 32 | for [key, value] in gold_vector: 33 | gold_map[key]=value 34 | sorted_gold_vector = sorted(gold_vector, key=itemgetter(1), reverse=True) 35 | gold_vector_accumulated = GeneralizedAveragePrecision.accumulate_score(sorted_gold_vector) 36 | 37 | 38 | ''' first we use the eval score to sort the eval vector accordingly ''' 39 | if random is False: 40 | sorted_evaluated_vector = sorted(evaluated_vector, key=itemgetter(1), reverse=True) 41 | else: 42 | sorted_evaluated_vector = copy.copy(evaluated_vector) 43 | shuffle(sorted_evaluated_vector) 44 | sorted_evaluated_vector_with_gold_scores = [] 45 | ''' now we replace the eval score with the gold score ''' 46 | for (key, score) in sorted_evaluated_vector: 47 | if (key in gold_map.keys()): 48 | gold_score = gold_map.get(key) 49 | else: 50 | gold_score = 0 51 | sorted_evaluated_vector_with_gold_scores.append([key, gold_score]) 52 | evaluated_vector_accumulated = GeneralizedAveragePrecision.accumulate_score(sorted_evaluated_vector_with_gold_scores) 53 | 54 | ''' this is sum of precisions over all recall points ''' 55 | 
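        # (at each rank i where the retrieved key is a gold substitute with positive weight,
        #  the gold weight accumulated up to that rank, divided by i, is added; the analogous
        #  sum over the ideal gold ordering below serves as the normalizer)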
i = 0 56 | nominator = 0.0 57 | for (key, accum_score) in evaluated_vector_accumulated: 58 | i += 1 59 | if (key in gold_map.keys()) and (gold_map.get(key) > 0): 60 | nominator += accum_score/i 61 | 62 | ''' this is the optimal sum of precisions possible based on the gold standard ranking ''' 63 | i = 0 64 | denominator = 0 65 | for (key, accum_score) in gold_vector_accumulated: 66 | if gold_map.get(key) > 0: 67 | i += 1 68 | denominator += accum_score/i 69 | 70 | if (denominator == 0.0): 71 | gap = -1 72 | else: 73 | gap = nominator/denominator 74 | 75 | return gap 76 | 77 | 78 | @staticmethod 79 | def calcTopN(gold_vector, evaluated_vector, n, measure_type): 80 | gold_map = {} 81 | for [key, value] in gold_vector: 82 | gold_map[key]=value 83 | gold_vector_sorted = sorted(gold_vector, key=itemgetter(1), reverse=True) 84 | gold_top_score_sum = sum([float(score) for (key, score) in gold_vector_sorted[0:n]]) 85 | 86 | evaluated_top_score_sum = 0 87 | sorted_evaluated_vector = sorted(evaluated_vector, key=itemgetter(1), reverse=True) 88 | for (key, score) in sorted_evaluated_vector[0:n]: 89 | if key in gold_map: 90 | gold_score = gold_map[key] 91 | else: 92 | gold_score = 0 93 | evaluated_top_score_sum += float(gold_score) 94 | 95 | if measure_type == 'sap' or measure_type == 'wap': 96 | denominator = n 97 | else: 98 | denominator = gold_top_score_sum 99 | 100 | return evaluated_top_score_sum/denominator -------------------------------------------------------------------------------- /libraries/evaluation/GloVe/README.md: -------------------------------------------------------------------------------- 1 | ## GloVe: Global Vectors for Word Representation 2 | 3 | frog nearest neighbors | Litoria | Leptodactylidae | Rana | Eleutherodactylus 4 | -------------------------|:-------------------------:|:-------------------------:|:-------------------------:|:-------------------------:| 5 |
frogs, toad, litoria, leptodactylidae, rana, lizard, eleutherodactylus | ![](http://nlp.stanford.edu/projects/glove/images/litoria.jpg) | ![](http://nlp.stanford.edu/projects/glove/images/leptodactylidae.jpg) | ![](http://nlp.stanford.edu/projects/glove/images/rana.jpg) | ![](http://nlp.stanford.edu/projects/glove/images/eleutherodactylus.jpg)
6 | 
7 | We provide an implementation of the GloVe model for learning word representations. Please see the [project page](http://nlp.stanford.edu/projects/glove/) for more information.
8 | 
9 | man -> woman | city -> zip | comparative -> superlative
10 | :-------------------------:|:-------------------------:|:-------------------------:
11 | ![](http://nlp.stanford.edu/projects/glove/images/man_woman_small.jpg) | ![](http://nlp.stanford.edu/projects/glove/images/city_zip_small.jpg) | ![](http://nlp.stanford.edu/projects/glove/images/comparative_superlative_small.jpg)
12 | 
13 | ## Download pre-trained word vectors
14 | Pre-trained word vectors are made available under the Public Domain Dedication and License.
15 | 
16 | 
17 | * Wikipedia 2014 + Gigaword 5 (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, & 300d vectors, 822 MB download): glove.6B.zip
18 | * Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): glove.42B.300d.zip
19 | * Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): glove.840B.300d.zip
20 | * Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, & 200d vectors, 1.42 GB download): glove.twitter.27B.zip
21 | * Ruby script for preprocessing Twitter data
22 | 
23 | 
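Each archive above unpacks to plain-text files in which every line holds a word followed by its space-separated vector components. A minimal loading sketch (the file name in the usage comment is only an example of what the 6B archive contains):

```python
import numpy as np

def load_glove_txt(path):
    """Load GloVe vectors from a whitespace-separated text file into a dict of numpy arrays."""
    vectors = {}
    with open(path) as f:
        for line in f:
            parts = line.rstrip().split(' ')
            vectors[parts[0]] = np.array(parts[1:], dtype='float32')
    return vectors

# vecs = load_glove_txt('glove.6B.50d.txt')  # example file name
# print(len(vecs['frog']))                   # -> 50
```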
24 | 
25 | ## Train word vectors on a new corpus
26 | 
27 |     $ git clone http://github.com/stanfordnlp/glove
28 |     $ cd glove && make
29 |     $ ./demo.sh
30 | 
31 | The demo.sh script downloads a small corpus, consisting of the first 100M characters of Wikipedia. It collects unigram counts, constructs and shuffles cooccurrence data, and trains a simple version of the GloVe model. It also runs a word analogy evaluation script in Python. Continue reading for further usage details and instructions for how to run on your own corpus.
32 | 
33 | ### Package Contents
34 | This package includes four main tools:
35 | #### 1) vocab_count
36 | Constructs unigram counts from a corpus, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count. The corpus file should already consist of whitespace-separated tokens. Use something like the Stanford Tokenizer (http://nlp.stanford.edu/software/tokenizer.shtml) first on raw text.
37 | #### 2) cooccur
38 | Constructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by 'vocab_count', and may specify a variety of parameters, as described by running './build/cooccur'.
39 | #### 3) shuffle
40 | Shuffles the binary file of cooccurrence statistics produced by 'cooccur'. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled together. The user may specify a number of parameters, as described by running './build/shuffle'.
41 | #### 4) glove
42 | Trains the GloVe model on the specified cooccurrence data, which typically will be the output of the 'shuffle' tool. The user should supply a vocabulary file, as given by 'vocab_count', and may specify a number of other parameters, which are described by running './build/glove'.
43 | 
44 | ### License
45 | All work contained in this package is licensed under the Apache License, Version 2.0. See the included LICENSE file.
46 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/lex_sub.py: -------------------------------------------------------------------------------- 1 | import os, sys 2 | import operator 3 | from support import read_candidates, wf2ws, flatten, get_best_scores_for_candidates, conll_skip_sentence 4 | from context_instance import ContextInstance 5 | import numpy as np 6 | from libraries.utils.paths_and_files import create_folders_if_not_exist 7 | from libraries.evaluation.lexsub.jcs.evaluation.lst.lst_gap import compute_gap 8 | 9 | 10 | def lex_sub(embeddings, input_type, target_words_vocab, context_words_vocab, output_path, candidates_file, conll_file, 11 | test_file, gold_file, half_window_size=5, arithm_type=None): 12 | conll = None 13 | if input_type == "dependency": 14 | conll = open(conll_file, "r") 15 | 16 | # this is a very time consuming part 17 | all_candidates = read_candidates(candidates_file, allowed_words=target_words_vocab) # read candidates 18 | ranked_file_path = os.path.join(output_path, "ranked_bsg.txt") 19 | create_folders_if_not_exist(ranked_file_path) 20 | output = open(ranked_file_path, "w") 21 | with open(test_file, 'r') as f: 22 | for i, line in enumerate(f): 23 | # if i % 1 == 0: 24 | # print "----------------------" 25 | # print 'reading line # %d' % (i+1) 26 | lst_instance = ContextInstance(line) 27 | 28 | # find if the target(center) word in in the vocab ( either his lemma or word) 29 | target = None 30 | if lst_instance.target in target_words_vocab: 31 | target = lst_instance.target 32 | elif lst_instance.target_lemma in target_words_vocab: 33 | target = lst_instance.target_lemma 34 | 35 | # sometimes the target(center) word can be not in vocab, so we skip that instance 36 | if target and lst_instance.target_key in all_candidates: 37 | left_context, right_context = lst_instance.get_neighbors(half_window_size) if input_type == "normal" else\ 38 | lst_instance.get_dep_context(conll, lst_instance.target) 39 | 40 | # perform filtering by throwing away all words that do not appear in vocab 41 | # I do it after creating windows because of indexing problem 42 | left_context = [c for c in left_context if c in context_words_vocab] 43 | right_context = [c for c in right_context if c in context_words_vocab] 44 | 45 | # print "---------------------------------------" 46 | # print "left context for the word \"%s\" is : %s" % (target, str(left_context)) 47 | # print "right context for the word \"%s\" is : %s" % (target, str(right_context)) 48 | 49 | # grab candidates 50 | candidates = all_candidates[lst_instance.target_key] # note that candidates is a dictionary 51 | 52 | # format candidates properly 53 | formatted_candidates = np.unique(flatten(candidates.values())) 54 | 55 | # rank them 56 | scores = embeddings.score(target=target, left_context=left_context, right_context=right_context, 57 | candidates=formatted_candidates, ar_type=arithm_type) 58 | 59 | # print(candidates.items()) 60 | # perform mapping to all candidates 61 | scores = get_best_scores_for_candidates(candidates, scores) 62 | 63 | # sort in descending order 64 | sorted_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True) 65 | formatted_scores = [' '.join([word, wf2ws(score)]) for word, score in sorted_scores] 66 | formatted_scores = "\t" + "\t".join(formatted_scores) 67 | else: 68 | formatted_scores = "" 69 | # print 'center word %s is not in the vocabulary' %(target) 70 | # print "skipping start" 71 | if input_type == 
"dependency": 72 | conll_skip_sentence(conll) # to make sure that pointers are aligned 73 | # print "skipping end" 74 | 75 | # write to a file 76 | output.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + formatted_scores + "\n") 77 | if conll: 78 | conll.close() 79 | output.close() 80 | 81 | # compute gap 82 | compute_gap(gold_file_path=gold_file, eval_file_path=ranked_file_path, out_file_path=os.path.join(output_path, "gap.txt"), 83 | ignore_mwe=True) 84 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/evaluation/lst/lst_gap.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Used to compute GAP score for the LST ranking task 3 | 4 | ''' 5 | 6 | import sys 7 | import random 8 | import re 9 | 10 | from libraries.evaluation.lexsub.jcs.evaluation.measures.generalized_average_precision import GeneralizedAveragePrecision 11 | 12 | 13 | #take.v 25 :: consider 2;accept 1;include 1;think about 1; 14 | def read_gold_line(gold_line, ignore_mwe): 15 | segments = gold_line.split("::") 16 | instance_id = segments[0].strip() 17 | gold_weights = [] 18 | line_candidates = segments[1].strip().split(';') 19 | for candidate_count in line_candidates: 20 | if len(candidate_count) > 0: 21 | delimiter_ind = candidate_count.rfind(' ') 22 | candidate = candidate_count[:delimiter_ind] 23 | if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)): 24 | continue 25 | count = candidate_count[delimiter_ind:] 26 | try: 27 | gold_weights.append((candidate, int(count))) 28 | except ValueError as e: 29 | print e 30 | print gold_line 31 | print "cand=%s count=%s" % (candidate,count) 32 | sys.exit(1) 33 | 34 | return instance_id, gold_weights 35 | 36 | #RESULT find.v 71 show 0.34657 37 | def read_eval_line(eval_line, ignore_mwe): 38 | eval_weights = [] 39 | segments = eval_line.split("\t") 40 | instance_id = segments[1].strip() 41 | for candidate_weight in segments[2:]: 42 | if len(candidate_weight) > 0: 43 | delimiter_ind = candidate_weight.rfind(' ') 44 | candidate = candidate_weight[:delimiter_ind] 45 | weight = candidate_weight[delimiter_ind:] 46 | if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)): 47 | continue 48 | try: 49 | eval_weights.append((candidate, float(weight))) 50 | except: 51 | print "Error appending: %s %s" % (candidate, weight) 52 | 53 | return instance_id, eval_weights 54 | 55 | def compute_gap(gold_file_path, eval_file_path, out_file_path, ignore_mwe=False, randomize=False): 56 | 57 | gold_file = open(gold_file_path, 'r') 58 | eval_file = open(eval_file_path, 'r') 59 | out_file = open(out_file_path, 'w') 60 | 61 | gold_data = {} 62 | eval_data = {} 63 | 64 | i=0 65 | sum_gap = 0.0 66 | for eval_line in eval_file: 67 | eval_instance_id, eval_weights = read_eval_line(eval_line, ignore_mwe) 68 | eval_data[eval_instance_id] = eval_weights 69 | 70 | for gold_line in gold_file: 71 | gold_instance_id, gold_weights = read_gold_line(gold_line, ignore_mwe) 72 | gold_data[gold_instance_id] = gold_weights 73 | 74 | ignored = 0 75 | for gold_instance_id, gold_weights in gold_data.iteritems(): 76 | eval_weights = eval_data[gold_instance_id] 77 | gap = GeneralizedAveragePrecision.calc(gold_weights, eval_weights, randomize) 78 | if (gap < 0): # this happens when there is nothing left to rank after filtering the multi-word expressions 79 | ignored += 1 80 | continue 81 | out_file.write(gold_instance_id + "\t" + 
str(gap) + "\n") 82 | i += 1 83 | sum_gap += gap 84 | 85 | mean_gap = sum_gap/i 86 | out_file.write("\ngold_data %d eval_data %d\n" % (len(gold_data),len(eval_data))) 87 | out_file.write("\nRead %d test instances\n" % i) 88 | out_file.write("\nIgnored %d test instances (couldn't compute gap)\n" % ignored) 89 | out_file.write("\nMEAN_GAP\t" + str(mean_gap) + "\n") 90 | 91 | 92 | print "MEAN GAP is %f" %mean_gap 93 | 94 | gold_file.close() 95 | eval_file.close() 96 | out_file.close() 97 | 98 | if __name__ == '__main__': 99 | 100 | if len(sys.argv) < 4: 101 | print "usage: %s [no-mwe] [random]" % (sys.argv[0]) 102 | sys.exit(1) 103 | 104 | 105 | gold_file_path = sys.argv[1] 106 | eval_file_path = sys.argv[2] 107 | out_file_path = sys.argv[3] 108 | 109 | if len(sys.argv) > 4 and sys.argv[4] == 'no-mwe': 110 | ignore_mwe = True 111 | else: 112 | ignore_mwe = False 113 | 114 | if len(sys.argv) > 5 and sys.argv[5] == 'random': 115 | randomize = True 116 | else: 117 | randomize = False 118 | 119 | compute_gap(gold_file_path, eval_file_path, out_file_path, ignore_mwe, randomize) 120 | 121 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/data/embedding.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import heapq 3 | import math 4 | import time 5 | 6 | class Embedding: 7 | 8 | def __init__(self, path): 9 | self.m = self.normalize(np.load(path + '.npy')) 10 | self.dim = self.m.shape[1] 11 | self.wi, self.iw = self.readVocab(path + '.vocab') 12 | 13 | 14 | def zeros(self): 15 | return np.zeros(self.dim) 16 | 17 | def dimension(self): 18 | return self.dim 19 | 20 | def normalize(self, m): 21 | norm = np.sqrt(np.sum(m*m, axis=1)) 22 | norm[norm==0] = 1 23 | return m / norm[:, np.newaxis] 24 | 25 | def readVocab(self, path): 26 | vocab = [] 27 | with open(path) as f: 28 | for line in f: 29 | vocab.extend(line.strip().split()) 30 | return dict([(w, i) for i, w in enumerate(vocab)]), vocab 31 | 32 | def __contains__(self, w): 33 | return w in self.wi 34 | 35 | def represent(self, w): 36 | return self.m[self.wi[w], :] 37 | 38 | def scores(self, vec): 39 | return np.dot(self.m, vec) 40 | 41 | # why +1 .../2? 
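    # note: rows of self.m are length-normalized, so for a unit-length vec the dot product
    # is a cosine similarity in [-1, 1]; adding 1 and halving maps it into [0, 1],
    # presumably so the scores stay nonnegative and can be multiplied together.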
42 | def pos_scores(self, vec): 43 | return (np.dot(self.m, vec)+1)/2 44 | 45 | def pos_scores2(self, vec): 46 | scores = np.dot(self.m, vec) 47 | scores[scores < 0.0] = 0.0 48 | return scores 49 | 50 | 51 | def top_scores(self, scores, n=10): 52 | if n <= 0: 53 | n = len(scores) 54 | return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1]) 55 | 56 | def closest(self, w, n=10): 57 | scores = np.dot(self.m, self.represent(w)) 58 | return self.top_scores(scores,n) 59 | 60 | def closest_with_time(self, w, n=10): 61 | start = time.time() 62 | scores = np.dot(self.m, self.represent(w)) 63 | end = time.time() 64 | # print "\nDeltatime: %f msec\n" % ((end-start)*1000) 65 | return self.top_scores(scores,n), end-start 66 | 67 | def closest_vec(self, wordvec, n=10): 68 | #scores = self.m.dot(self.represent(w)) 69 | scores = np.dot(self.m, wordvec) 70 | return self.top_scores(scores, n) 71 | # if n <= 0: 72 | # n = len(scores) 73 | # return heapq.nlargest(n, zip(self.iw, scores)) 74 | 75 | def closest_vec_filtered(self, wordvec, vocab, n=10): 76 | scores = np.dot(self.m, wordvec) 77 | if n <= 0: 78 | n = len(scores) 79 | scores_words = zip(self.iw, scores) 80 | for i in xrange(0,len(scores_words)): 81 | if not scores_words[i][1] in vocab: 82 | scores_words[i] = (-1, scores_words[i][0]) 83 | return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1]) 84 | 85 | def closest_prefix(self, w, prefix, n=10): 86 | scores = np.dot(self.m, self.represent(w)) 87 | scores_words = zip(self.iw, scores) 88 | for i in xrange(0,len(scores_words)): 89 | if not scores_words[i][1].startswith(prefix): 90 | scores_words[i] = (-1, scores_words[i][0]) 91 | return heapq.nlargest(n, scores_words, key=lambda x: x[1]) 92 | 93 | def closest_filtered(self, w, vocab, n=10): 94 | scores = np.dot(self.m, self.represent(w)) 95 | scores_words = zip(self.iw, scores) 96 | for i in xrange(0,len(scores_words)): 97 | if not scores_words[i][1] in vocab: 98 | scores_words[i] = (-1, scores_words[i][0]) 99 | return heapq.nlargest(n, scores_words, key=lambda x: x[1]) 100 | 101 | def similarity(self, w1, w2): 102 | return self.represent(w1).dot(self.represent(w2)) 103 | 104 | def norm_vec(vec): 105 | length = 1.0 * math.sqrt(sum(val ** 2 for val in vec)) 106 | return [val/length for val in vec] 107 | 108 | def score2string(score): 109 | return score[1] + "\t" + '{0:1.3f}'.format(score[0]) 110 | 111 | 112 | def closest_sym_scores(targets, subs, w, n): 113 | w_target_vec = targets.represent(w) 114 | w_sub_vec = subs.represent(w) 115 | w2subs = subs.closest_vec(w_target_vec,0) 116 | w2subs2w = [] 117 | for entry in w2subs: 118 | score = (entry[0]+1)/2 119 | sub = entry[1] 120 | sub_target_vec = targets.represent(sub) 121 | rev_score = (np.dot(sub_target_vec, w_sub_vec)+1)/2 122 | w2subs2w.append((math.sqrt(score * rev_score), sub)) 123 | return heapq.nlargest(n, w2subs2w) 124 | 125 | 126 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/support.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import re 3 | from nltk.corpus import wordnet as wn 4 | from pos import to_wordnet_pos 5 | from nltk.stem.wordnet import WordNetLemmatizer 6 | 7 | 8 | def read_vectors(file, header=False): 9 | dict = {} 10 | with open(file, 'r') as f: 11 | for i, sentence in enumerate(f): 12 | if header and i == 0: 13 | continue 14 | parts = sentence.strip().split(" ") 15 | word = parts[0] 16 | vec = np.array(parts[1:], 
dtype="float32") 17 | # normalize 18 | vec = vec/np.sum(vec**2)**0.5 19 | dict[word] = vec 20 | return dict 21 | 22 | 23 | def conll_skip_sentence(conll, eos_symbol=""): 24 | while True: 25 | line = re.split(r'\t', conll.readline()) 26 | word = line[1] 27 | if word == eos_symbol: 28 | break 29 | conll.readline() 30 | 31 | 32 | 33 | def read_candidates(candidates_file, allowed_words): 34 | target2candidates = {} 35 | # finally.r::eventually;ultimately 36 | print "--- reading candidates ---" 37 | with open(candidates_file, 'r') as f: 38 | for i, line in enumerate(f): 39 | # if (i+1) % 1 == 0: 40 | # print 'read %d lines' %(i+1) 41 | segments = line.strip().split('::') 42 | target = segments[0] 43 | word, pos = target.split('.') 44 | # assuming that candidates are unique initially 45 | candidates = [str(c) for c in segments[1].split(';') if c.find(" ") == -1] # forbid composite words 46 | candidates = filter_candidate(pos, candidates, allowed_words) 47 | target2candidates[target] = candidates 48 | print '--- done ---' 49 | return target2candidates 50 | 51 | 52 | # performs a 3 step filtering by matching with vocabulary of allowed_words 53 | # note that candidates is a dictionary old_cand -> [words in vocab], where new_cands are candidates 54 | # we find matching our allowed_words 55 | def filter_candidate(pos, candidates, allowed_words): 56 | # this fixes the problem with non_wordnet pos tags 57 | if pos in to_wordnet_pos: 58 | pos = to_wordnet_pos[pos] 59 | new_candidates = {} 60 | for word in allowed_words.keys(): 61 | if not is_ascii(word): 62 | continue 63 | lemma = WordNetLemmatizer().lemmatize(word, pos) 64 | # we try 3 things 65 | tries = [lemma, lemma.title, word, word.title()] 66 | # tries = [lemma] 67 | for w in tries: 68 | if w in candidates: 69 | if w not in new_candidates: 70 | new_candidates[w] = [] 71 | new_candidates[w].append(word) 72 | return new_candidates 73 | 74 | def is_ascii(s): 75 | return all(ord(c) < 128 for c in s) 76 | 77 | def cosine_sim(x, y): 78 | return float(np.sum(x*y))/float(np.sqrt(np.sum(x**2)*np.sum(y**2))) 79 | 80 | # some wierd form of dot pos dot product 81 | # with normalized vectors 82 | def pos_cosine_sim_normed(x, y): 83 | return (np.dot(x, y)+1)/2 84 | 85 | def wf2ws(weight): 86 | return '{0:1.5f}'.format(weight) 87 | 88 | 89 | 90 | def morphify(word, pos): 91 | """ morph a word """ 92 | synsets = wn.synsets(word, pos=pos) 93 | 94 | # Word not found 95 | if not synsets: 96 | return [] 97 | 98 | # Get all lemmas of the word 99 | lemmas = [l for s in synsets for l in s.lemmas() if s.name().split('.')[1] == pos] 100 | 101 | # Get related forms 102 | derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas] 103 | 104 | # filter only the targeted pos 105 | related_lemmas = [l for drf in derivationally_related_forms \ 106 | for l in drf[1] if l.synset().name().split('.')[1] == pos] 107 | 108 | # Extract the words from the lemmas 109 | words = [l.name() for l in related_lemmas] 110 | len_words = len(words) 111 | 112 | # Build the result in the form of a list containing tuples (word, probability) 113 | result = [(w, float(words.count(w))/len_words) for w in set(words)] 114 | result.sort(key=lambda w: -w[1]) 115 | 116 | # return all the possibilities sorted by probability 117 | return result 118 | 119 | 120 | def flatten(list): 121 | return [item for sublist in list for item in sublist] 122 | 123 | # returns (candidate, best_score) 124 | def get_best_scores_for_candidates(candidates, scores): 125 | new_scores = {} 126 | for 
cand, words in candidates.items(): 127 | # now choose the best one 128 | new_scores[cand] = max([scores[w] for w in words]) 129 | return new_scores 130 | 131 | -------------------------------------------------------------------------------- /libraries/batch_iterators/support.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # contains helper functions that are used in different iterators 3 | import numpy as np 4 | np.random.seed(1) 5 | 6 | def pad_sents(sentences, max_length, pad_symbol, mask_current=False, padding_mode='both'): 7 | """ 8 | pads many sentences 9 | :param mask_current: whether sentence elements that have pad symbols should be masked 10 | :param padding_mode: whether pad only the left side(dead useful for LSTM) 11 | 12 | """ 13 | padded_sentences, masks = [], [] 14 | for sentence in sentences: 15 | x, m = pad_sent(sentence, max_length, pad_symbol, mask_current=mask_current, 16 | padding_mode=padding_mode) 17 | padded_sentences.append(x) 18 | masks.append(m) 19 | return np.array(padded_sentences, dtype="int32"), np.array(masks, dtype="float32") 20 | 21 | 22 | # pads sentence if necessary with some symbol's id 23 | # note that it returns a binary mask too 24 | # TODO: write a better documentation for this function! 25 | def pad_sent(sentence, max_length, pad_symbol, mask_current=False, padding_mode='both'): 26 | assert padding_mode in ['left', 'both', 'right'] 27 | pad_number = max_length - len(sentence) 28 | 29 | if mask_current: 30 | masked_sentence = [s != pad_symbol for s in sentence] 31 | 32 | # will perform truncation if the sentence is too long 33 | if pad_number < 0: 34 | if mask_current: 35 | mask = masked_sentence 36 | else: 37 | mask = np.ones((max_length, )) 38 | res = sentence[:max_length] 39 | return res, mask 40 | 41 | # padding only left side 42 | if padding_mode == 'left': 43 | res = [pad_symbol] * pad_number + sentence 44 | if mask_current: 45 | mask = [0]*pad_number + masked_sentence 46 | else: 47 | mask = [0]*pad_number + [1]*len(sentence) 48 | # padding both sides 49 | elif padding_mode == 'both': 50 | res = [pad_symbol] * (pad_number/2) + sentence + [pad_symbol] * (pad_number/2) 51 | if mask_current: 52 | mask = [0]*(pad_number/2) + masked_sentence + [0]*(pad_number/2) 53 | else: 54 | mask = [0]*(pad_number/2) + [1]*len(sentence) + [0]*(pad_number/2) 55 | if pad_number % 2 == 1: 56 | res += [pad_symbol] 57 | mask += [0] 58 | # pad only the right side 59 | elif padding_mode == "right": 60 | res = sentence + [pad_symbol] * pad_number 61 | if mask_current: 62 | mask = masked_sentence + [0]*pad_number 63 | else: 64 | mask = [1]*len(sentence) + [0]*pad_number 65 | 66 | return res, mask 67 | 68 | 69 | def allow_with_prob(word_count, total_words_count, subsampling_threshold=1e-5): 70 | """ 71 | Sub-sampling of frequent words: can improve both accuracy and speed for large data sets 72 | Source: "Distributed Representations of Words and Phrases and their Compositionality". 73 | 74 | """ 75 | freq = float(word_count) / float(total_words_count) 76 | removal_prob = 1.0 - np.sqrt(subsampling_threshold / freq) 77 | return np.random.random_sample() > removal_prob 78 | 79 | 80 | def create_context_windows(sentence, half_window_size=0): 81 | """ 82 | A generic function that either returns (center_word, window_context) or (center_word, left_context, right_context). 
83 | To switch from first to the second mode, set half_window_size=0 84 | 85 | """ 86 | n = len(sentence) 87 | for idx in range(half_window_size, n - half_window_size): 88 | center_word = sentence[idx] 89 | if half_window_size > 0: 90 | context = sentence[idx - half_window_size:idx] + sentence[idx + 1:idx + half_window_size + 1] 91 | yield (center_word, context) 92 | else: 93 | assert half_window_size == 0 94 | yield (center_word, sentence[0:idx], sentence[idx + 1:]) 95 | 96 | 97 | def create_continues_context_windows(sentence, special_center_words, half_window_size): 98 | """ 99 | Creates windows where center words are separately marked(by using differnt vocab_ids) from context words. 100 | This function was used for the BSG's LSTM data batcher. 101 | 102 | """ 103 | n = len(sentence) 104 | for idx in range(half_window_size, n - half_window_size): 105 | context = sentence[idx - half_window_size:idx] + sentence[idx + 1:idx + half_window_size + 1] 106 | context_and_center = sentence[idx - half_window_size:idx] + [special_center_words[idx]] + sentence[idx + 1:idx + half_window_size + 1] 107 | yield (sentence[idx], context, context_and_center) 108 | 109 | 110 | def sample_words(batch_size, vocab_size, distr, nr_neg_samples=1): 111 | return np.random.choice(size=(batch_size, nr_neg_samples), replace=True, a=vocab_size, p=distr) -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/cs_inferrer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Base class for context sensitive inference modules 3 | ''' 4 | 5 | import re 6 | from nltk.stem.wordnet import WordNetLemmatizer 7 | from jcs.data.pos import to_wordnet_pos 8 | 9 | # just something to return in case not enough words were generated 10 | default_generated_results = ['time', 'people', 'information', 'work', 'first', 'like', 'year', 'make', 'day', 'service'] 11 | 12 | #generated_word_re = re.compile('^[a-zA-Z]+(-[a-zA-Z]+)*$') 13 | generated_word_re = re.compile('^[a-zA-Z]+$') 14 | 15 | 16 | class CsInferrer(object): 17 | ''' 18 | classdocs 19 | ''' 20 | 21 | def __init__(self): 22 | ''' 23 | Constructor 24 | ''' 25 | self.time = [0.0, 0] 26 | 27 | 28 | def inference_time(self, seconds): 29 | self.time[0] += seconds 30 | self.time[1] += 1 31 | 32 | # processing time in msec 33 | def msec_per_word(self): 34 | return 1000*self.time[0]/self.time[1] if self.time[1] > 0 else 0.0 35 | 36 | def generate_inferred(self, result_vec, target_word, target_lemma, pos): 37 | 38 | generated_results = {} 39 | min_weight = None 40 | if result_vec is not None: 41 | for word, weight in result_vec: 42 | if generated_word_re.match(word) != None: # make sure this is not junk 43 | wn_pos = to_wordnet_pos[pos] 44 | lemma = WordNetLemmatizer().lemmatize(word, wn_pos) 45 | if word != target_word and lemma != target_lemma: 46 | if lemma in generated_results: 47 | weight = max(weight, generated_results[lemma]) 48 | generated_results[lemma] = weight 49 | if min_weight is None: 50 | min_weight = weight 51 | else: 52 | min_weight = min(min_weight, weight) 53 | 54 | if min_weight is None: 55 | min_weight = 0.0 56 | i = 0.0 57 | for lemma in default_generated_results: 58 | if len(generated_results) >= len(default_generated_results): 59 | break; 60 | i -= 1.0 61 | generated_results[lemma] = min_weight + i 62 | 63 | 64 | return generated_results 65 | 66 | 67 | 68 | def filter_inferred(self, result_vec, candidates, pos): 69 | 70 | filtered_results = {} 71 | candidates_found 
= set() 72 | # SO There is no way a composite word can appear?! 73 | if result_vec != None: 74 | 75 | # # TODO: this is my modification to test the difference hypothesis in our impls. 76 | # for word, weight in result_vec: 77 | # if word in candidates: 78 | # self.add_inference_result(word, weight, filtered_results, candidates_found) 79 | 80 | for word, weight in result_vec: 81 | wn_pos = to_wordnet_pos[pos] 82 | lemma = WordNetLemmatizer().lemmatize(word, wn_pos) 83 | if lemma in candidates: 84 | self.add_inference_result(lemma, weight, filtered_results, candidates_found) 85 | if lemma.title() in candidates: 86 | self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found) 87 | if word in candidates: # there are some few cases where the candidates are not lemmatized 88 | self.add_inference_result(word, weight, filtered_results, candidates_found) 89 | if word.title() in candidates: # there are some few cases where the candidates are not lemmatized 90 | self.add_inference_result(word.title(), weight, filtered_results, candidates_found) 91 | 92 | # assign negative weights for candidates with no score 93 | # they will appear last sorted according to their unigram count 94 | # candidates_left = candidates - candidates_found 95 | # for candidate in candidates_left: 96 | # count = self.w2counts[candidate] if candidate in self.w2counts else 1 97 | # score = -1 - (1.0/count) # between (-1,-2] 98 | # filtered_results[candidate] = score 99 | 100 | return filtered_results 101 | 102 | def add_inference_result(self, token, weight, filtered_results, candidates_found): 103 | candidates_found.add(token) 104 | best_last_weight = filtered_results[token] if token in filtered_results else None 105 | if best_last_weight == None or weight > best_last_weight: 106 | filtered_results[token] = weight 107 | 108 | 109 | 110 | -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/data/word-sim/EN-WS-353-SIM.txt: -------------------------------------------------------------------------------- 1 | tiger cat 7.35 2 | tiger tiger 10.00 3 | plane car 5.77 4 | train car 6.31 5 | television radio 6.77 6 | media radio 7.42 7 | bread butter 6.19 8 | cucumber potato 5.92 9 | doctor nurse 7.00 10 | professor doctor 6.62 11 | student professor 6.81 12 | smart stupid 5.81 13 | wood forest 7.73 14 | money cash 9.15 15 | king queen 8.58 16 | king rook 5.92 17 | bishop rabbi 6.69 18 | fuck sex 9.44 19 | football soccer 9.03 20 | football basketball 6.81 21 | football tennis 6.63 22 | Arafat Jackson 2.50 23 | physics chemistry 7.35 24 | vodka gin 8.46 25 | vodka brandy 8.13 26 | drink eat 6.87 27 | car automobile 8.94 28 | gem jewel 8.96 29 | journey voyage 9.29 30 | boy lad 8.83 31 | coast shore 9.10 32 | asylum madhouse 8.87 33 | magician wizard 9.02 34 | midday noon 9.29 35 | furnace stove 8.79 36 | food fruit 7.52 37 | bird cock 7.10 38 | bird crane 7.38 39 | food rooster 4.42 40 | money dollar 8.42 41 | money currency 9.04 42 | tiger jaguar 8.00 43 | tiger feline 8.00 44 | tiger carnivore 7.08 45 | tiger mammal 6.85 46 | tiger animal 7.00 47 | tiger organism 4.77 48 | tiger fauna 5.62 49 | psychology psychiatry 8.08 50 | psychology science 6.71 51 | psychology discipline 5.58 52 | planet star 8.45 53 | planet moon 8.08 54 | planet sun 8.02 55 | precedent example 5.85 56 | precedent antecedent 6.04 57 | cup tableware 6.85 58 | cup artifact 2.92 59 | cup object 3.69 60 | cup entity 2.15 61 | jaguar cat 7.42 62 | jaguar car 7.27 63 | mile kilometer 
8.66 64 | skin eye 6.22 65 | Japanese American 6.50 66 | century year 7.59 67 | announcement news 7.56 68 | doctor personnel 5.00 69 | Harvard Yale 8.13 70 | hospital infrastructure 4.63 71 | life death 7.88 72 | travel activity 5.00 73 | type kind 8.97 74 | street place 6.44 75 | street avenue 8.88 76 | street block 6.88 77 | cell phone 7.81 78 | dividend payment 7.63 79 | calculation computation 8.44 80 | profit loss 7.63 81 | dollar yen 7.78 82 | dollar buck 9.22 83 | phone equipment 7.13 84 | liquid water 7.89 85 | marathon sprint 7.47 86 | seafood food 8.34 87 | seafood lobster 8.70 88 | lobster food 7.81 89 | lobster wine 5.70 90 | championship tournament 8.36 91 | man woman 8.30 92 | man governor 5.25 93 | murder manslaughter 8.53 94 | opera performance 6.88 95 | Mexico Brazil 7.44 96 | glass metal 5.56 97 | aluminum metal 7.83 98 | rock jazz 7.59 99 | museum theater 7.19 100 | shower thunderstorm 6.31 101 | monk oracle 5.00 102 | cup food 5.00 103 | journal association 4.97 104 | street children 4.94 105 | car flight 4.94 106 | space chemistry 4.88 107 | situation conclusion 4.81 108 | word similarity 4.75 109 | peace plan 4.75 110 | consumer energy 4.75 111 | ministry culture 4.69 112 | smart student 4.62 113 | investigation effort 4.59 114 | image surface 4.56 115 | life term 4.50 116 | start match 4.47 117 | computer news 4.47 118 | board recommendation 4.47 119 | lad brother 4.46 120 | observation architecture 4.38 121 | coast hill 4.38 122 | deployment departure 4.25 123 | benchmark index 4.25 124 | attempt peace 4.25 125 | consumer confidence 4.13 126 | start year 4.06 127 | focus life 4.06 128 | development issue 3.97 129 | theater history 3.91 130 | situation isolation 3.88 131 | profit warning 3.88 132 | media trading 3.88 133 | chance credibility 3.88 134 | precedent information 3.85 135 | architecture century 3.78 136 | population development 3.75 137 | stock live 3.73 138 | peace atmosphere 3.69 139 | morality marriage 3.69 140 | minority peace 3.69 141 | atmosphere landscape 3.69 142 | report gain 3.63 143 | music project 3.63 144 | seven series 3.56 145 | experience music 3.47 146 | school center 3.44 147 | five month 3.38 148 | announcement production 3.38 149 | morality importance 3.31 150 | money operation 3.31 151 | delay news 3.31 152 | governor interview 3.25 153 | practice institution 3.19 154 | century nation 3.16 155 | coast forest 3.15 156 | shore woodland 3.08 157 | drink car 3.04 158 | president medal 3.00 159 | prejudice recognition 3.00 160 | viewer serial 2.97 161 | peace insurance 2.94 162 | Mars water 2.94 163 | media gain 2.88 164 | precedent cognition 2.81 165 | announcement effort 2.75 166 | line insurance 2.69 167 | crane implement 2.69 168 | drink mother 2.65 169 | opera industry 2.63 170 | volunteer motto 2.56 171 | listing proximity 2.56 172 | precedent collection 2.50 173 | cup article 2.40 174 | sign recess 2.38 175 | problem airport 2.38 176 | reason hypertension 2.31 177 | direction combination 2.25 178 | Wednesday news 2.22 179 | glass magician 2.08 180 | cemetery woodland 2.08 181 | possibility girl 1.94 182 | cup substance 1.92 183 | forest graveyard 1.85 184 | stock egg 1.81 185 | month hotel 1.81 186 | energy secretary 1.81 187 | precedent group 1.77 188 | production hike 1.75 189 | stock phone 1.62 190 | holy sex 1.62 191 | stock CD 1.31 192 | drink ear 1.31 193 | delay racism 1.19 194 | stock life 0.92 195 | stock jaguar 0.92 196 | monk slave 0.92 197 | lad wizard 0.92 198 | sugar approach 0.88 199 | rooster voyage 0.62 200 | noon 
string 0.54 201 | chord smile 0.54 202 | professor cucumber 0.31 203 | king cabbage 0.23 204 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/main/context_instance.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Instance in the Lexical Substitution Task dataset 3 | 4 | ''' 5 | 6 | 7 | from pos import from_lst_pos 8 | import re 9 | 10 | CONTEXT_TEXT_BEGIN_INDEX = 3 11 | TARGET_INDEX = 2 12 | 13 | def encode_utf8(str): 14 | new_str = "" 15 | for c in str: 16 | try: 17 | new_c = c.__encode('utf-8') 18 | new_str+=new_c 19 | except UnicodeError: 20 | print "can't encode in utf-8" 21 | 22 | return new_str 23 | 24 | 25 | 26 | class ContextInstance(object): 27 | 28 | def __init__(self, line): 29 | ''' 30 | Constructor 31 | ''' 32 | self.line = line 33 | tokens1 = line.split("\t") 34 | self.target_ind = int(tokens1[TARGET_INDEX]) 35 | self.words = [w for w in tokens1[3].split()] 36 | self.target = self.words[self.target_ind] 37 | 38 | self.full_target_key = tokens1[0] 39 | self.pos = self.full_target_key.split('.')[-1] # pos is last, but target_key contains the first pos 40 | self.target_key = '.'.join(self.full_target_key.split('.')[:2]) # remove suffix in cases of bar.n.v 41 | self.target_lemma = self.full_target_key.split('.')[0] 42 | self.target_id = tokens1[1] 43 | if self.pos in from_lst_pos: 44 | self.pos = from_lst_pos[self.pos] 45 | 46 | # for non-parsed test data 47 | # TODO: modified version 48 | def get_neighbors(self, half_window_size): 49 | tokens = self.line.split()[3:] 50 | 51 | if half_window_size > 0: 52 | start_pos = max(self.target_ind-half_window_size, 0) 53 | end_pos = min(self.target_ind+half_window_size+1, len(tokens)) 54 | else: 55 | start_pos = 0 56 | end_pos = len(tokens) 57 | 58 | left_neighbors = tokens[start_pos:self.target_ind] 59 | right_neighbors = tokens[self.target_ind+1:end_pos] 60 | 61 | neighbors = left_neighbors + right_neighbors 62 | return left_neighbors, right_neighbors 63 | 64 | # for parsed test data 65 | # returns all words that share a dependency label 66 | def get_dep_context(self, conll, target): 67 | id_to_word = [] # this one contains information for us to find out who point to the target 68 | ind_to_context_inds = {} # it's like where our target points {word_ind =>[word_ind,...]} 69 | ind_to_prep_dep_inds = {} 70 | target_ind = None 71 | while True: 72 | line = conll.readline() 73 | parts = re.split(r"\t", line) 74 | ind, word, dep_word_ind, dep_type = int(parts[0]), parts[1], int(parts[6]), parts[7] 75 | if word == "": 76 | conll.readline() # to just to the next sentence 77 | break 78 | id_to_word.append(word) 79 | 80 | # bidirectional context 81 | if dep_word_ind not in ind_to_context_inds: 82 | ind_to_context_inds[dep_word_ind] = [] 83 | if ind not in ind_to_context_inds: 84 | ind_to_context_inds[ind] = [] 85 | 86 | # we don't want to add to context prepositions 87 | if dep_type != "prep": 88 | ind_to_context_inds[dep_word_ind].append(ind) 89 | if dep_type != "pobj": 90 | ind_to_context_inds[ind].append(dep_word_ind) 91 | 92 | # this part is used later to collapse dependencies 93 | if dep_type == "prep": 94 | # prep(ind) <- word(dep_word_ind) 95 | if dep_word_ind not in ind_to_prep_dep_inds: 96 | ind_to_prep_dep_inds[dep_word_ind] = [] 97 | ind_to_prep_dep_inds[dep_word_ind].append(ind) 98 | 99 | if dep_type == "pobj": 100 | if ind not in ind_to_prep_dep_inds: 101 | ind_to_prep_dep_inds[ind] = [] 102 | 
ind_to_prep_dep_inds[ind].append(dep_word_ind) 103 | 104 | # the search is based on matching words in FIFO fashion 105 | if not target_ind and word == target: 106 | target_ind = ind 107 | 108 | # -1 because array indices start from 0 109 | # the last check is because we don't care about 110 | # now we're going to collect context ids and then convert them to words 111 | context_inds = [] 112 | # 1. grab all words ids that the target points to 113 | if target_ind in ind_to_context_inds: 114 | context_inds += ind_to_context_inds[target_ind] 115 | 116 | # 2. grab all words of prep. words 117 | if target_ind in ind_to_prep_dep_inds: 118 | for prep_ind in ind_to_prep_dep_inds[target_ind]: 119 | for w_ind in ind_to_context_inds[prep_ind]: 120 | if w_ind != target_ind: 121 | context_inds.append(w_ind) 122 | # convert to words 123 | context = [id_to_word[ind-1] for ind in context_inds if len(id_to_word) > ind-1] # we don't care out dep. 124 | return context 125 | 126 | def decorate_context(self): 127 | tokens = self.line.split('\t') 128 | words = tokens[CONTEXT_TEXT_BEGIN_INDEX].split() 129 | words[self.target_ind] = '__'+words[self.target_ind]+'__' 130 | tokens[CONTEXT_TEXT_BEGIN_INDEX] = ' '.join(words) 131 | return '\t'.join(tokens)+"\n" -------------------------------------------------------------------------------- /libraries/simulators/support.py: -------------------------------------------------------------------------------- 1 | # functions specific for simulation 2 | import numpy as np 3 | import operator 4 | import pickle 5 | import sys 6 | sys.setrecursionlimit(10000) 7 | 8 | 9 | def load(file_path): 10 | return pickle.load(open(file_path, 'rb+')) 11 | 12 | 13 | # returns KL divergence between two Gaussians or vMF 14 | # KL(q||p) 15 | # assumes [batch_size x z_dimension ] or [latent_dim, ] inputs 16 | def KL(mu_q, sigma_q, mu_p, sigma_p, kl_type="gauss", debug=False): 17 | 18 | # adjusting dimensions 19 | flag = False 20 | if len(mu_q.shape) == 1 and len(sigma_q.shape) == 1 and len(mu_p.shape) == 1 and len(sigma_p.shape) == 1: 21 | mu_q = mu_q.reshape((1, -1)) 22 | sigma_q = sigma_q.reshape((1, -1)) 23 | mu_p = mu_p.reshape((1, -1)) 24 | sigma_p = sigma_p.reshape((1, -1)) 25 | flag = True 26 | 27 | if kl_type == "gauss": 28 | kl = KL_gauss(mu_q, sigma_q, mu_p, sigma_p, debug=debug) 29 | if kl_type == "vMF": 30 | kl = KL_vMF(mu_q, sigma_q, mu_p, sigma_p, debug=debug) 31 | if flag: 32 | kl = kl[0] 33 | return kl 34 | 35 | 36 | # standard KL for Gaussians 37 | def KL_gauss(mu_q, sigma_q, mu_p, sigma_p, debug=False): 38 | if sigma_q.shape[1] == 1 and sigma_p.shape[1] == 1: 39 | return KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=debug) 40 | else: 41 | return KL_gauss_diagonal(mu_q, sigma_q, mu_p, sigma_p, debug=debug) 42 | 43 | def KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=False, eps=1e-8): 44 | k = mu_q.shape[1] 45 | sigma_p_inv = 1./(sigma_p + eps) 46 | trace = k * sigma_p_inv * sigma_q 47 | quadr = sigma_p_inv*np.sum(((mu_p - mu_q)**2), axis=1) 48 | log_det_p = np.log(sigma_p + 1e-10) 49 | log_det_q = np.log(sigma_q + 1e-10) 50 | log_det = k*(log_det_p - log_det_q) 51 | res = 0.5 * (trace + quadr - k + log_det) 52 | 53 | if debug: 54 | print "trace : %s" % str(trace) 55 | print "quadr : %s" % str(quadr) 56 | print 'log_det_p : %s' % str(log_det_p) 57 | print 'log_det_q : %s' % str(log_det_q) 58 | print "log_det : %s" % str(log_det) 59 | print 'res : %s'% str(res) 60 | return res.reshape((-1, )) 61 | 62 | 63 | def KL_gauss_diagonal(mu_q, sigma_q, mu_p, sigma_p, 
debug=False, eps=1e-8): 64 | k = mu_q.shape[1] 65 | sigma_p_inv = 1./(sigma_p + eps) 66 | trace = np.sum(sigma_p_inv * sigma_q, axis=1) 67 | quadr = np.sum(sigma_p_inv * ((mu_p - mu_q)**2), axis=1) 68 | 69 | log_det_p = np.sum(np.log(sigma_p + eps), axis=1) 70 | log_det_q = np.sum(np.log(sigma_q + eps), axis=1) 71 | 72 | log_det = log_det_p - log_det_q 73 | 74 | if debug: 75 | print "trace : %f" % trace 76 | print "quadr : %f" % quadr 77 | print 'log_det_p : %f' % log_det_p 78 | print 'log_det_q : %f' % log_det_q 79 | print "log_det : %f" % log_det 80 | 81 | return 0.5 * (trace + quadr - k + log_det) 82 | 83 | 84 | # KL for vMF 85 | def KL_vMF(mu1, kappa1, mu2, kappa2, debug=False): 86 | return kl_vMF(mu1, kappa1, mu2, kappa2) 87 | 88 | def pad_window_size(X, desired_window_size): 89 | current_height = X.shape[0] 90 | if desired_window_size <= current_height: 91 | return X 92 | pad_height = (desired_window_size- current_height)/2 93 | if current_height % 2 == 1: 94 | return np.pad(X, [(pad_height, pad_height+1), (0, 0)], 'constant') 95 | else: 96 | return np.pad(X, [(pad_height, pad_height), (0,0)], 'constant') 97 | 98 | 99 | 100 | 101 | def sigmoid(x): 102 | return 1./(1 + np.exp(-x)) 103 | 104 | def simple_pmf(x): 105 | return 1./(1. + x) 106 | 107 | def relu(x): 108 | return np.maximum(0, x) 109 | 110 | def l2(x, axis=0): 111 | return np.sqrt(np.sum(x**2, axis=axis)) 112 | 113 | def cosine_sim(x, y): 114 | return float(np.sum(x*y))/float(np.sqrt(np.sum(x**2)*np.sum(y**2))) 115 | 116 | # search_position: over what position to search KL(position 1 || ... ) or KL( ... || position 2) 117 | def argmin_score(mu_q, sigma_q, mus_and_sigmas, num=1, type="kl", search_position=1): 118 | assert type in ["kl", "l2"] 119 | scores = {} 120 | for word, (mu_p, sigma_p) in mus_and_sigmas.items(): 121 | if type == "kl": 122 | if search_position==1: 123 | scores[word] = KL(mu_p, sigma_p, mu_q, sigma_q) 124 | else: 125 | scores[word] = KL(mu_q, sigma_q, mu_p, sigma_p) 126 | 127 | else: 128 | scores[word] = l2(mu_q - mu_p, axis=0) 129 | sorted_scores = sorted(scores.items(), key=operator.itemgetter(1)) 130 | return sorted_scores[0:num] 131 | 132 | # search_position: over what position to search KL(position 1 || ... ) or KL( ... || position 2) 133 | def closest_score(score, mu_q, sigma_q, mus_and_sigmas, num=1, type="kl", search_position=1): 134 | assert type in ["kl"] 135 | assert search_position in [1, 2] 136 | scores = {} 137 | 138 | for word, (mu_p, sigma_p) in mus_and_sigmas.items(): 139 | if type == "kl": 140 | if search_position == 1: 141 | scores[word] = abs(score - KL(mu_p, sigma_p, mu_q, sigma_q)) 142 | else: 143 | scores[word] = abs(score - KL(mu_q, sigma_q, mu_p, sigma_p)) 144 | 145 | sorted_scores = sorted(scores.items(), key=operator.itemgetter(1)) 146 | return sorted_scores[0:num] 147 | 148 | -------------------------------------------------------------------------------- /libraries/evaluation/lexsub/jcs/context2vec_inferrer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Context-sensitive inferrer based on context2vec (bidirectional lsmt) 3 | Used in the paper: 4 | context2vec: Learning Generic Context Embedding with Bidirectional LSTM. CoNLL, 2016. 
5 | ''' 6 | 7 | from cs_inferrer import CsInferrer 8 | from jcs.jcs_io import vec_to_str 9 | import numpy as np 10 | 11 | # from context2vec.common.model_reader import ModelReader 12 | 13 | # 14 | # class Context2vecInferrer(CsInferrer): 15 | # 16 | # def __init__(self, lstm_model_params_filename, ignore_target, context_math, top_inferences_to_analyze): 17 | # 18 | # CsInferrer.__init__(self) 19 | # self.ignore_target = ignore_target 20 | # self.context_math = context_math 21 | # self.top_inferences_to_analyze = top_inferences_to_analyze 22 | # 23 | # model_reader = ModelReader(lstm_model_params_filename) 24 | # self.context_model = model_reader.model 25 | # self.target_words = model_reader.w 26 | # self.word2index = model_reader.word2index 27 | # self.index2word = model_reader.index2word 28 | # 29 | # def represent_target_and_context(self, lst_instance, tfo): 30 | # 31 | # sent_words = lst_instance.words 32 | # target_ind = lst_instance.target_ind 33 | # 34 | # ignore_target = self.ignore_target and len(sent_words) > 1 # if there's only the target word in the sentence then we don't ignore it... 35 | # 36 | # if not ignore_target: 37 | # if lst_instance.target not in self.word2index: 38 | # tfo.write("ERROR: %s not in word embeddings.Trying lemma.\n" % lst_instance.target) 39 | # if lst_instance.target_lemma not in self.word2index: 40 | # tfo.write("ERROR: lemma %s also not in word embeddings.\n" % lst_instance.target_lemma) 41 | # else: 42 | # sent_words[target_ind] = lst_instance.target_lemma 43 | # 44 | # target_v = self.target_words[self.word2index[sent_words[target_ind]]] if not ignore_target else None 45 | # 46 | # if len(sent_words) > 1: 47 | # context_v = self.context_model.context2vec(sent_words, target_ind) 48 | # context_v = context_v / np.sqrt((context_v * context_v).sum()) 49 | # else: 50 | # context_v = None # just target with no context 51 | # 52 | # return target_v, context_v 53 | # 54 | # 55 | # def find_inferred(self, lst_instance, tfo): 56 | # 57 | # target_v, context_v = self.represent_target_and_context(lst_instance, tfo) 58 | # 59 | # if target_v is not None and context_v is not None: 60 | # 61 | # #This is not working very well at the moment. Requires more research. 
62 | # 63 | # # ZERO-TO-HALF 64 | # target_sim = (self.target_words.dot(target_v)+1.0)/2 65 | # context_sim = (self.target_words.dot(context_v)+1.0)/2 66 | # similarity = target_sim*context_sim 67 | # 68 | # # RANKS 69 | # # target_sim = self.target_words.dot(target_v) 70 | # # context_sim = self.target_words.dot(context_v) 71 | # # for rank, i in enumerate(target_sim.argsort()): 72 | # # target_sim[i] = float(rank) 73 | # # for rank, i in enumerate(context_sim.argsort()): 74 | # # context_sim[i] = float(rank) 75 | # # 76 | # # similarity = (target_sim*context_sim)/(len(target_sim)**2) 77 | # 78 | # # POSITIVE SCORES 79 | # # target_sim = self.target_words.dot(target_v) 80 | # # context_sim = self.target_words.dot(context_v) 81 | # # target_sim[target_sim<0.0] = 0.0 82 | # # context_sim[context_sim<0.0] = 0.0 83 | # # similarity = target_sim*context_sim 84 | # 85 | # # EXP 86 | # # target_sim = self.target_words.dot(target_v) 87 | # # target_sim = np.exp(target_sim) 88 | # # context_sim = self.target_words.dot(context_v) 89 | # # context_sim = np.exp(context_sim) 90 | # 91 | # 92 | # # NORMALIZE 93 | # # target_sim = self.target_words.dot(target_v) 94 | # # target_sim_mean = np.mean(target_sim) 95 | # # target_sim_std = np.sqrt(np.var(target_sim)) 96 | # # target_sim = (target_sim - target_sim_mean)/target_sim_std 97 | # ## target_sim[target_sim<0.0] = 0.0 98 | # # context_sim = self.target_words.dot(context_v) 99 | # # context_sim_mean = np.mean(context_sim) 100 | # # context_sim_std = np.sqrt(np.var(context_sim)) 101 | # # context_sim = (context_sim - context_sim_mean)/context_sim_std 102 | # ## context_sim[context_sim<0.0] = 0.0 103 | # # 104 | # # similarity = target_sim + context_sim 105 | # 106 | # else: 107 | # if target_v is not None: 108 | # similarity = (self.target_words.dot(target_v)+1.0)/2 109 | # elif context_v is not None: 110 | # similarity = (self.target_words.dot(context_v)+1.0)/2 111 | # else: 112 | # raise Exception("Can't find a target nor context.") 113 | # 114 | # result_vec = sorted(zip(self.index2word, similarity), reverse=True, key=lambda x: x[1]) 115 | # 116 | # tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n') 117 | # 118 | # return result_vec 119 | 120 | 121 | 122 | -------------------------------------------------------------------------------- /libraries/batch_iterators/sentence_batch_iterator.py: -------------------------------------------------------------------------------- 1 | from support import allow_with_prob, sample_words, create_context_windows, pad_sents 2 | from base_batch_iterator import BaseBatchIterator 3 | from libraries.data_iterators.open_text_data_iterator import OpenTextDataIterator 4 | from window_batch_iterator import Batch 5 | from libraries.tools.vocabulary import PAD_TOKEN, UNK_TOKEN 6 | import numpy as np 7 | try: 8 | import re2 as re 9 | except ImportError: 10 | import re 11 | 12 | 13 | class SentenceBatchIterator(BaseBatchIterator): 14 | """ 15 | Specific for LSTM based BSG iterator over batches. 16 | 17 | """ 18 | 19 | def __init__(self, vocab, data_path, data_iterator, subsampling_threshold=None, batch_size=50, 20 | max_sentence_length=None): 21 | """ 22 | :param data_path: a path to data, can be a folder or a file path. 23 | :param subsampling_threshold: used in computation of words removal probability. The smaller the threshold 24 | the larger is the removal probability. In the original paper it was 1e-5. 25 | If None is passed, the subsampling will not be applied. 
26 | 27 | """ 28 | assert all([symbol in vocab for symbol in [PAD_TOKEN, UNK_TOKEN]]) 29 | assert isinstance(data_iterator, OpenTextDataIterator) 30 | 31 | self.vocab = vocab 32 | self.data_path = data_path 33 | self.subsampling_threshold = subsampling_threshold 34 | self.batch_size = batch_size 35 | self.max_sentence_length = max_sentence_length 36 | 37 | self.data_iterator = data_iterator 38 | self.data_iterator.set_data_path(data_path) 39 | 40 | BaseBatchIterator.__init__(self) 41 | 42 | def load_data_batches_to_queue(self, queue): 43 | """ 44 | Loads batches sequentially to a queue. 45 | 46 | """ 47 | # create data holders(containers) 48 | left_context_tokens = [] 49 | right_context_tokens = [] 50 | center_tokens = [] 51 | containers_current_size = 0 52 | max_length = 0 53 | for sentence, in self.data_iterator: 54 | # apply subsampling 55 | if self.subsampling_threshold: 56 | sentence = [token for token in sentence if allow_with_prob(self.vocab[token].count, 57 | self.vocab.total_count, 58 | subsampling_threshold=self.subsampling_threshold)] 59 | # convert to word_ids 60 | sentence_ids = [obj.id for obj in self.vocab[sentence]] 61 | 62 | # trim the sentence 63 | if self.max_sentence_length: 64 | sentence_ids = sentence_ids[:self.max_sentence_length] 65 | 66 | for center_token, left_context, right_context in create_context_windows(sentence_ids, half_window_size=0): 67 | # add to the data holders 68 | center_tokens.append(center_token) 69 | left_context_tokens.append(left_context) 70 | right_context_tokens.append(right_context) 71 | containers_current_size += 1 72 | max_length = max(max_length, len(left_context), len(right_context)) 73 | # return the chunk/batch when the container gets full 74 | if containers_current_size >= self.batch_size: 75 | batch = self.__create_batch(center_words=center_tokens, left_context=left_context_tokens, 76 | right_context=right_context_tokens, max_length=max_length) 77 | queue.put(batch) 78 | 79 | # reset 80 | left_context_tokens = [] 81 | right_context_tokens = [] 82 | center_tokens = [] 83 | containers_current_size = 0 84 | max_length = 0 85 | 86 | # return what has been collected if iteration is finished 87 | if containers_current_size > 0: 88 | batch = self.__create_batch(center_words=center_tokens, left_context=left_context_tokens, 89 | right_context=right_context_tokens, max_length=max_length) 90 | queue.put(batch) 91 | queue.put(None) # to indicate that loading is finished 92 | 93 | def __create_batch(self, center_words, left_context, right_context, max_length): 94 | 95 | left_context, left_mask = pad_sents(left_context, max_length, pad_symbol=self.vocab[PAD_TOKEN].id, 96 | padding_mode='left') 97 | right_context, right_mask = pad_sents(right_context, max_length, pad_symbol=self.vocab[PAD_TOKEN].id, 98 | padding_mode='right') 99 | 100 | context = np.concatenate((left_context, right_context), axis=1) 101 | mask = np.concatenate((left_mask, right_mask), axis=1) 102 | 103 | # generate negative samples 104 | neg_context = sample_words(context.shape[0], len(self.vocab), self.vocab.uni_distr, 105 | nr_neg_samples=context.shape[1]) 106 | neg_context = np.array(neg_context, dtype="int32") 107 | 108 | batch = Batch(pos_context_words=context, neg_context_words=neg_context, center_words=center_words, mask=mask) 109 | return batch -------------------------------------------------------------------------------- /libraries/evaluation/entailment/data/bench/baroni2012/data_rnd_val.tsv: -------------------------------------------------------------------------------- 
1 | conservatism mesothelioma False 2 | arithmetic discipline True 3 | insect librarian False 4 | holly official False 5 | scourge instrument True 6 | odyssey travel True 7 | snake animal True 8 | mackerel seafood True 9 | statement ship False 10 | dad relative True 11 | agony adult False 12 | loo room True 13 | velocity rate True 14 | volleyball game True 15 | fork ware True 16 | payment royalty False 17 | drummer performer True 18 | pudding food True 19 | bridesmaid woman True 20 | tyrant ruler True 21 | ore mineral True 22 | sailor privateer False 23 | playing show True 24 | heat ticket False 25 | snooker game True 26 | pram vehicle True 27 | trait goose False 28 | privateer sailor True 29 | lamb animal True 30 | petroleum payment False 31 | roadway road True 32 | sherry rodent False 33 | relative musician False 34 | vertebrate serpent False 35 | turbine engine True 36 | yesterday day True 37 | polyethylene resin True 38 | consumer drinker False 39 | vocalist performer True 40 | lesbian dyke False 41 | gymnasium school True 42 | barrier obstruction True 43 | psychotherapy science True 44 | sack apostle False 45 | turkey animal True 46 | diplomat ambassador False 47 | president leader True 48 | dogma doctrine True 49 | stag eagle False 50 | sunlight statement False 51 | mesothelioma maker False 52 | disorder insomnia False 53 | baritone adrenaline False 54 | pope leader True 55 | golf sport True 56 | sedan car True 57 | cottage house True 58 | airplane chalet False 59 | checklist information True 60 | ape animal True 61 | mesothelioma disease True 62 | reelection ache False 63 | hedgehog vertebrate True 64 | barrier fender False 65 | solid ape False 66 | penguin animal True 67 | sport garment False 68 | conduit writer False 69 | algebra mathematics True 70 | diabetes furniture False 71 | rig equipment True 72 | travel algebra False 73 | missile weapon True 74 | pathologist doctor True 75 | monastery housing True 76 | massage treatment True 77 | biotechnology discipline True 78 | computer vertebrate False 79 | vehicle auto False 80 | trait maker False 81 | whiskey liquor True 82 | country organization True 83 | spokesman spokesperson True 84 | school gymnasium False 85 | fish shark False 86 | equipment princess False 87 | building instrument False 88 | monoclonal protein True 89 | animal vertebrate False 90 | grandmother relative True 91 | horseman rider True 92 | toad animal True 93 | cheese food True 94 | cynicism feeling True 95 | asthma motel False 96 | performer comedian False 97 | castle house True 98 | seaweed alga True 99 | salmon fish True 100 | dinghy boat True 101 | heterogeneity discipline False 102 | etching art True 103 | grandchild offspring True 104 | transaction investing False 105 | karaoke entertainment True 106 | booze drug True 107 | ambassador diplomat True 108 | twelve integer True 109 | alcohol fluid True 110 | auto trait False 111 | passion feeling True 112 | section preamble False 113 | kindergarten symbol False 114 | leaf petal False 115 | patience ceremony False 116 | produce raisin False 117 | food reimbursement False 118 | collagen shopkeeper False 119 | kangaroo animal True 120 | diabetes corridor False 121 | dog annals False 122 | toast antibiotic False 123 | sport badminton False 124 | reptile vertebrate True 125 | garment jean False 126 | enzymology biochemistry True 127 | apprehension bird False 128 | contestant building False 129 | narrative fairytale False 130 | aircrew playlist False 131 | cow chocolate False 132 | tern vertebrate True 133 | performer 
spear False 134 | leader statesman False 135 | ferry vehicle True 136 | shark fish True 137 | traveler performer False 138 | mistress woman True 139 | sweetness sensation True 140 | fourteen integer True 141 | sis relative True 142 | slogan saying True 143 | fluorescence phenomenon True 144 | jealousy feeling True 145 | skating sport True 146 | fillet glider False 147 | animal kestrel False 148 | cost overhead False 149 | infusion animal False 150 | aristocrat baroness False 151 | animal dorm False 152 | slogan mixture False 153 | calf animal True 154 | pigeon vertebrate True 155 | panda woodpecker False 156 | edge boundary True 157 | organization math False 158 | furniture desk False 159 | cellulose carbohydrate True 160 | boat cost False 161 | observatory building True 162 | alloy bronze False 163 | insect animal True 164 | theatre building True 165 | instrument voice False 166 | apprehension skating False 167 | psoriasis disease True 168 | payment hostess False 169 | liquor beverage True 170 | supercomputer computer True 171 | goldfish vertebrate True 172 | vegetable vehicle False 173 | panic fear True 174 | feeling nostalgia False 175 | grandchild drug False 176 | aspirin drug True 177 | pipe conduit True 178 | clubhouse building True 179 | firm business True 180 | icon disease False 181 | monastery building True 182 | pathology science True 183 | velvet fabric True 184 | textbook book True 185 | berry solid True 186 | statesman leader True 187 | cassette container True 188 | robbery crime True 189 | endurance reindeer False 190 | terror emotion True 191 | bag review False 192 | ban decree True 193 | wheat pain False 194 | offspring arithmetic False 195 | graft tissue True 196 | technician worker True 197 | potato food True 198 | virus microorganism True 199 | captivity internment False 200 | uprising conflict True 201 | snake chemical False 202 | bookmark marker True 203 | mare vertebrate True 204 | cool temperature True 205 | margarine food True 206 | asp invertebrate False 207 | beverage strut False 208 | benzene chemical True 209 | champagne etching False 210 | mammal cat False 211 | walk vehicle False 212 | biplane aircraft True 213 | farmhouse bargain False 214 | deanery house True 215 | concert performance True 216 | book fruit False 217 | vertebrate mare False 218 | horse science False 219 | studio workplace True 220 | beech artillery False 221 | humanist scholar True 222 | strut walk True 223 | chant music True 224 | -------------------------------------------------------------------------------- /libraries/batch_iterators/window_batch_iterator.py: -------------------------------------------------------------------------------- 1 | from support import pad_sents, allow_with_prob, sample_words, create_context_windows 2 | from base_batch_iterator import BaseBatchIterator 3 | from libraries.tools.vocabulary import PAD_TOKEN, UNK_TOKEN 4 | import numpy as np 5 | try: 6 | import re2 as re 7 | except ImportError: 8 | import re 9 | 10 | 11 | class Batch: 12 | def __init__(self, pos_context_words, neg_context_words, center_words, mask): 13 | self.pos_context_words = pos_context_words 14 | self.neg_context_words = neg_context_words 15 | self.center_words = center_words 16 | self.mask = mask 17 | 18 | def __len__(self): 19 | return self.pos_context_words.shape[0] 20 | 21 | 22 | class WindowBatchIterator(BaseBatchIterator): 23 | 24 | def __init__(self, vocab, data_path, data_iterator, half_window_size=5, nr_neg_samples=5, 25 | subsampling_threshold=None, batch_size=50): 26 | """ 27 | 
:param data_path: a path to data, can be a folder or a file path. 28 | :param subsampling_threshold: used in computation of words removal probability. The smaller the threshold 29 | the larger is the removal probability. In the original paper it was 1e-5. 30 | If None is passed, the subsampling will not be applied. 31 | 32 | """ 33 | assert all([symbol in vocab for symbol in [PAD_TOKEN, UNK_TOKEN]]) 34 | self.vocab = vocab 35 | self.data_path = data_path 36 | self.half_window_size = half_window_size 37 | self.nr_neg_samples = nr_neg_samples 38 | self.subsampling_threshold = subsampling_threshold 39 | self.batch_size = batch_size 40 | 41 | self.data_iterator = data_iterator 42 | self.data_iterator.set_data_path(data_path) 43 | 44 | BaseBatchIterator.__init__(self) 45 | 46 | def load_data_batches_to_queue(self, queue): 47 | """ 48 | Loads batches sequentially to a queue. 49 | 50 | """ 51 | # create data placeholders 52 | pos_context_words = [] 53 | center_words = [] 54 | batch_current_size = 0 55 | max_length = 0 56 | 57 | for sentence, in self.data_iterator: 58 | 59 | # apply subsampling 60 | if self.subsampling_threshold: 61 | sentence = [token for token in sentence if allow_with_prob(self.vocab[token].count, 62 | self.vocab.total_count, 63 | subsampling_threshold=self.subsampling_threshold)] 64 | # convert to word_ids 65 | sentence_ids = [obj.id for obj in self.vocab[sentence]] 66 | 67 | # pad corners 68 | sentence_ids = [self.vocab[PAD_TOKEN].id] * self.half_window_size + sentence_ids + \ 69 | [self.vocab[PAD_TOKEN].id] * self.half_window_size 70 | 71 | # create windows 72 | for center_token, context_tokens in create_context_windows(sentence_ids, self.half_window_size): 73 | # add to the data holders 74 | center_words.append(center_token) 75 | pos_context_words.append(context_tokens) 76 | batch_current_size += 1 77 | max_length = max(max_length, len(context_tokens)) 78 | 79 | # return the chunk when the container gets full 80 | if batch_current_size >= self.batch_size: 81 | # generate negative samples 82 | neg_context_words = sample_words(batch_current_size, len(self.vocab), self.vocab.uni_distr, 83 | nr_neg_samples=self.nr_neg_samples) 84 | batch = self.__create_batch(pos_context_words=pos_context_words, 85 | neg_context_words=neg_context_words, 86 | center_words=center_words, max_length=max_length) 87 | queue.put(batch) 88 | 89 | # reset 90 | pos_context_words = [] 91 | center_words = [] 92 | batch_current_size = 0 93 | max_length = 0 94 | 95 | # return what has been collected if iteration is finished 96 | if batch_current_size > 0: 97 | # generate negative samples 98 | neg_context_words = sample_words(batch_current_size, len(self.vocab), self.vocab.uni_distr, 99 | nr_neg_samples=self.nr_neg_samples) 100 | batch = self.__create_batch(pos_context_words=pos_context_words, 101 | neg_context_words=neg_context_words, 102 | center_words=center_words, max_length=max_length) 103 | queue.put(batch) 104 | queue.put(None) # to indicate that loading is finished 105 | 106 | def __create_batch(self, pos_context_words, neg_context_words, center_words, max_length): 107 | 108 | neg_context_words = np.array(neg_context_words, dtype="int32") 109 | pos_context_words, mask = pad_sents(pos_context_words, max_length=max_length, 110 | pad_symbol=self.vocab[PAD_TOKEN].id, mask_current=True) 111 | center_words = np.array(center_words, dtype="int32") 112 | 113 | # convert all to numpy arrays 114 | # neg_context_words = np.array(neg_context_words, dtype="int32") 115 | pos_context_words = 
np.array(pos_context_words, dtype="int32") 116 | 117 | batch = Batch(pos_context_words=pos_context_words, neg_context_words=neg_context_words, 118 | center_words=center_words, mask=mask) 119 | return batch -------------------------------------------------------------------------------- /libraries/evaluation/word_sim/data/word-sim/EN-WS-353-REL.txt: -------------------------------------------------------------------------------- 1 | computer keyboard 7.62 2 | Jerusalem Israel 8.46 3 | planet galaxy 8.11 4 | canyon landscape 7.53 5 | OPEC country 5.63 6 | day summer 3.94 7 | day dawn 7.53 8 | country citizen 7.31 9 | planet people 5.75 10 | environment ecology 8.81 11 | Maradona football 8.62 12 | OPEC oil 8.59 13 | money bank 8.50 14 | computer software 8.50 15 | law lawyer 8.38 16 | weather forecast 8.34 17 | network hardware 8.31 18 | nature environment 8.31 19 | FBI investigation 8.31 20 | money wealth 8.27 21 | psychology Freud 8.21 22 | news report 8.16 23 | war troops 8.13 24 | physics proton 8.12 25 | bank money 8.12 26 | stock market 8.08 27 | planet constellation 8.06 28 | credit card 8.06 29 | hotel reservation 8.03 30 | closet clothes 8.00 31 | soap opera 7.94 32 | planet astronomer 7.94 33 | planet space 7.92 34 | movie theater 7.92 35 | treatment recovery 7.91 36 | baby mother 7.85 37 | money deposit 7.73 38 | television film 7.72 39 | psychology mind 7.69 40 | game team 7.69 41 | admission ticket 7.69 42 | Jerusalem Palestinian 7.65 43 | Arafat terror 7.65 44 | boxing round 7.61 45 | computer internet 7.58 46 | money property 7.57 47 | tennis racket 7.56 48 | telephone communication 7.50 49 | currency market 7.50 50 | psychology cognition 7.48 51 | seafood sea 7.47 52 | book paper 7.46 53 | book library 7.46 54 | psychology depression 7.42 55 | fighting defeating 7.41 56 | movie star 7.38 57 | hundred percent 7.38 58 | dollar profit 7.38 59 | money possession 7.29 60 | cup drink 7.25 61 | psychology health 7.23 62 | summer drought 7.16 63 | investor earning 7.13 64 | company stock 7.08 65 | stroke hospital 7.03 66 | liability insurance 7.03 67 | game victory 7.03 68 | psychology anxiety 7.00 69 | game defeat 6.97 70 | FBI fingerprint 6.94 71 | money withdrawal 6.88 72 | psychology fear 6.85 73 | drug abuse 6.85 74 | concert virtuoso 6.81 75 | computer laboratory 6.78 76 | love sex 6.77 77 | problem challenge 6.75 78 | movie critic 6.73 79 | Arafat peace 6.73 80 | bed closet 6.72 81 | lawyer evidence 6.69 82 | fertility egg 6.69 83 | precedent law 6.65 84 | minister party 6.63 85 | psychology clinic 6.58 86 | cup coffee 6.58 87 | water seepage 6.56 88 | government crisis 6.56 89 | space world 6.53 90 | dividend calculation 6.48 91 | victim emergency 6.47 92 | luxury car 6.47 93 | tool implement 6.46 94 | competition price 6.44 95 | psychology doctor 6.42 96 | gender equality 6.41 97 | listing category 6.38 98 | video archive 6.34 99 | oil stock 6.34 100 | governor office 6.34 101 | discovery space 6.34 102 | record number 6.31 103 | brother monk 6.27 104 | production crew 6.25 105 | nature man 6.25 106 | family planning 6.25 107 | disaster area 6.25 108 | food preparation 6.22 109 | preservation world 6.19 110 | movie popcorn 6.19 111 | lover quarrel 6.19 112 | game series 6.19 113 | dollar loss 6.09 114 | weapon secret 6.06 115 | shower flood 6.03 116 | registration arrangement 6.00 117 | arrival hotel 6.00 118 | announcement warning 6.00 119 | game round 5.97 120 | baseball season 5.97 121 | drink mouth 5.96 122 | life lesson 5.94 123 | grocery money 5.94 124 | energy crisis 
5.94 125 | reason criterion 5.91 126 | equipment maker 5.91 127 | cup liquid 5.90 128 | deployment withdrawal 5.88 129 | tiger zoo 5.87 130 | journey car 5.85 131 | money laundering 5.65 132 | summer nature 5.63 133 | decoration valor 5.63 134 | Mars scientist 5.63 135 | alcohol chemistry 5.54 136 | disability death 5.47 137 | change attitude 5.44 138 | arrangement accommodation 5.41 139 | territory surface 5.34 140 | size prominence 5.31 141 | exhibit memorabilia 5.31 142 | credit information 5.31 143 | territory kilometer 5.28 144 | death row 5.25 145 | doctor liability 5.19 146 | impartiality interest 5.16 147 | energy laboratory 5.09 148 | secretary senate 5.06 149 | death inmate 5.03 150 | monk oracle 5.00 151 | cup food 5.00 152 | journal association 4.97 153 | street children 4.94 154 | car flight 4.94 155 | space chemistry 4.88 156 | situation conclusion 4.81 157 | word similarity 4.75 158 | peace plan 4.75 159 | consumer energy 4.75 160 | ministry culture 4.69 161 | smart student 4.62 162 | investigation effort 4.59 163 | image surface 4.56 164 | life term 4.50 165 | start match 4.47 166 | computer news 4.47 167 | board recommendation 4.47 168 | lad brother 4.46 169 | observation architecture 4.38 170 | coast hill 4.38 171 | deployment departure 4.25 172 | benchmark index 4.25 173 | attempt peace 4.25 174 | consumer confidence 4.13 175 | start year 4.06 176 | focus life 4.06 177 | development issue 3.97 178 | theater history 3.91 179 | situation isolation 3.88 180 | profit warning 3.88 181 | media trading 3.88 182 | chance credibility 3.88 183 | precedent information 3.85 184 | architecture century 3.78 185 | population development 3.75 186 | stock live 3.73 187 | peace atmosphere 3.69 188 | morality marriage 3.69 189 | minority peace 3.69 190 | atmosphere landscape 3.69 191 | report gain 3.63 192 | music project 3.63 193 | seven series 3.56 194 | experience music 3.47 195 | school center 3.44 196 | five month 3.38 197 | announcement production 3.38 198 | morality importance 3.31 199 | money operation 3.31 200 | delay news 3.31 201 | governor interview 3.25 202 | practice institution 3.19 203 | century nation 3.16 204 | coast forest 3.15 205 | shore woodland 3.08 206 | drink car 3.04 207 | president medal 3.00 208 | prejudice recognition 3.00 209 | viewer serial 2.97 210 | peace insurance 2.94 211 | Mars water 2.94 212 | media gain 2.88 213 | precedent cognition 2.81 214 | announcement effort 2.75 215 | line insurance 2.69 216 | crane implement 2.69 217 | drink mother 2.65 218 | opera industry 2.63 219 | volunteer motto 2.56 220 | listing proximity 2.56 221 | precedent collection 2.50 222 | cup article 2.40 223 | sign recess 2.38 224 | problem airport 2.38 225 | reason hypertension 2.31 226 | direction combination 2.25 227 | Wednesday news 2.22 228 | glass magician 2.08 229 | cemetery woodland 2.08 230 | possibility girl 1.94 231 | cup substance 1.92 232 | forest graveyard 1.85 233 | stock egg 1.81 234 | month hotel 1.81 235 | energy secretary 1.81 236 | precedent group 1.77 237 | production hike 1.75 238 | stock phone 1.62 239 | holy sex 1.62 240 | stock CD 1.31 241 | drink ear 1.31 242 | delay racism 1.19 243 | stock life 0.92 244 | stock jaguar 0.92 245 | monk slave 0.92 246 | lad wizard 0.92 247 | sugar approach 0.88 248 | rooster voyage 0.62 249 | noon string 0.54 250 | chord smile 0.54 251 | professor cucumber 0.31 252 | king cabbage 0.23 253 | -------------------------------------------------------------------------------- 
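A note on the subsampling used by the batch iterators above (sentence_batch_iterator.py and window_batch_iterator.py): frequent tokens are dropped via allow_with_prob before context windows are built, and smaller subsampling_threshold values remove more tokens. The exact implementation lives in libraries/batch_iterators/support.py, which is not part of this excerpt; the sketch below shows the standard word2vec-style keep probability that the docstrings describe, and should be read as an assumption about that helper rather than its actual code.

import random
from math import sqrt

def keep_probability(word_count, total_count, subsampling_threshold=1e-5):
    # relative corpus frequency of the word
    freq = float(word_count) / total_count
    # frequent words get a low keep probability; words rarer than the threshold are always kept
    return min(1.0, sqrt(subsampling_threshold / freq))

def allow_with_prob_sketch(word_count, total_count, subsampling_threshold=1e-5):
    # hypothetical stand-in for support.allow_with_prob: keep the token if a
    # uniform draw falls below its keep probability
    return random.random() < keep_probability(word_count, total_count, subsampling_threshold)

With a threshold of 1e-5 (the value mentioned in the docstrings), a word making up 1% of the corpus is kept only about 3% of the time, while words rarer than the threshold are never dropped.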
/libraries/evaluation/lexsub/jcs/evaluation/lst/preprocess_lst_test.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ''' 3 | 4 | # – - 5 | # — - 6 | # “ " 7 | # ” " 8 | # ‘ ' 9 | # ’ ' 10 | 11 | # – ; - 12 | # — ; - 13 | # “ ; " 14 | # ” ; " 15 | # ‘ ; ' 16 | # ’ ; ' 17 | 18 | # & & 19 | 20 | import sys 21 | import re 22 | 23 | from nltk.stem.wordnet import WordNetLemmatizer 24 | from nltk.corpus import wordnet 25 | from nltk.tag import pos_tag 26 | 27 | 28 | first_quoted_re = re.compile('.*"(.*)".*') 29 | context_re = re.compile('.*<context>(.*)</context>.*') 30 | head_re = re.compile('.*<head>(.*)</head>.*') 31 | 32 | target_prefix = '<lexelt' 74 | if (len(pos)>=2): 75 | pos_prefix = pos[:2] 76 | if (pos_prefix in to_wordnet_pos): 77 | wordnet_pos = to_wordnet_pos[pos_prefix] 78 | lemma = WordNetLemmatizer().lemmatize(word, wordnet_pos).lower(); 79 | triples.append([word, wordnet_pos, lemma]) 80 | return triples 81 | 82 | def parse_context(context): 83 | target = head_re.match(context).group(1) 84 | tokens = context.split() 85 | target_ind = tokens.index('<head>'+target+'</head>') 86 | tokens[target_ind] = target 87 | 88 | return tokens, target_ind 89 | 90 | 91 | def add_target(targets, target_with_pos, actual_target): 92 | wn_pos = target_with_pos.split('.')[-1] 93 | pos = from_wordnet_pos[wn_pos] 94 | targets.add(actual_target + "." + pos) 95 | 96 | def is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets): 97 | mwe_count = 0 98 | for synset in synsets: 99 | gloss_lemmas = set([WordNetLemmatizer().lemmatize(word) for word in synset.definition.split()]) 100 | if verb_lemma in gloss_lemmas or complement_lemma in gloss_lemmas: 101 | return False 102 | for syn_lemma in synset.lemmas: 103 | if syn_lemma.name != mwe: 104 | tokens = syn_lemma.name.split('_') 105 | for token in tokens: 106 | if token == verb_lemma: 107 | return False 108 | if len(tokens) == 2 and tokens[1] == complement_lemma: 109 | return False 110 | else: 111 | mwe_count += syn_lemma.count() 112 | return True 113 | 114 | 115 | def detect_mwe(text_tokens, target_ind, wordnet_pos): 116 | if (target_ind < len(text_tokens)-1): 117 | verb_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind], wordnet_pos) 118 | complement_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind+1]) 119 | mwe = '_'.join([verb_lemma, complement_lemma]) 120 | synsets = wordnet.synsets(mwe, wordnet.VERB) 121 | if len(synsets) > 0: 122 | if (target_ind+1 < len(text_tokens)-1): 123 | mwe_right = '_'.join([WordNetLemmatizer().lemmatize(text_tokens[target_ind+1]), WordNetLemmatizer().lemmatize(text_tokens[target_ind+2])]) 124 | if len(wordnet.synsets(mwe_right)) > 0: 125 | return 126 | if is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets) == True: 127 | mwe = '='.join([text_tokens[target_ind], text_tokens[target_ind+1]]) 128 | text_tokens[target_ind] = mwe 129 | del text_tokens[target_ind+1] 130 | 131 | 132 | if __name__ == '__main__': 133 | 134 | if (len(sys.argv) > 1): 135 | input = open(sys.argv[1], 'r') 136 | output = open(sys.argv[2], 'w') 137 | detect_mwe_flag = False 138 | if (len(sys.argv) > 4): 139 | if sys.argv[4] == 'mwe': 140 | detect_mwe_flag = True 141 | else: 142 | input = sys.stdin 143 | output = sys.stdout 144 | 145 | targets = set() 146 | 147 | target = None 148 | for line in input: 149 | 150 | line = line.strip() 151 | if line.startswith(target_prefix): 152 | target = first_quoted_re.match(line).group(1) 153 | if line.startswith(instance_prefix): 154 | instance_id = first_quoted_re.match(line).group(1) 155 | continue 156 | if
line.startswith(context_prefix): 157 | context = context_re.match(line).group(1) 158 | context = html_to_text(context) 159 | text_tokens, target_ind = parse_context(context) 160 | wn_pos = target.split('.')[-1] 161 | if wn_pos == wordnet.VERB and detect_mwe_flag: 162 | detect_mwe(text_tokens, target_ind, wordnet.VERB) 163 | add_target(targets, target, text_tokens[target_ind]) 164 | text = ' '.join(text_tokens) 165 | output_line = '\t'.join([target, instance_id, str(target_ind), text]) 166 | print >> output, output_line 167 | continue 168 | 169 | if (len(sys.argv) > 1): 170 | input.close() 171 | output.close() 172 | 173 | if (len(sys.argv) > 3): 174 | target_file = open(sys.argv[3], 'w') 175 | for target in targets: 176 | target_file.write(target + "\n") 177 | target_file.close() 178 | -------------------------------------------------------------------------------- /interfaces/i_base.py: -------------------------------------------------------------------------------- 1 | from libraries.utils.paths_and_files import get_subdir_number 2 | import sys, os 3 | from support import load, save, metrics_to_str 4 | from libraries.tools.log import Log 5 | from libraries.utils.other import merge_ordered_dicts 6 | from support import infer_attributes_to_log, format_experimental_setup 7 | from libraries.tools.ordered_attrs import OrderedAttrs 8 | 9 | # a dirty hack from: 10 | # http://stackoverflow.com/questions/24171725/scikit-learn-multicore-attributeerror-stdin-instance-has-no-attribute-close 11 | # to avoid iPython in PyCharm crashing 12 | if not hasattr(sys.stdin, 'close'): 13 | def dummy_close(): 14 | pass 15 | sys.stdin.close = dummy_close 16 | 17 | 18 | class IBase(OrderedAttrs): 19 | """ 20 | Base interface class that contains methods that must be implemented by children classes and methods that can be used directly. 21 | 22 | """ 23 | def __init__(self, model_class, vocab, train_data_path=None, val_data_path=None, test_data_path=None, epochs=5, 24 | output_dir=None): 25 | OrderedAttrs.__init__(self) 26 | 27 | # will be assigned later on in the child class 28 | self.model = None 29 | self.init_iterator = None 30 | 31 | self.model_class = model_class 32 | self.vocab = vocab 33 | self.train_data_path = train_data_path 34 | self.val_data_path = val_data_path 35 | self.test_data_path = test_data_path 36 | self.epochs = epochs 37 | 38 | output_dir = output_dir if output_dir else os.path.join(os.getcwd(), 'output') 39 | self.output_path = os.path.join(output_dir, str(get_subdir_number(output_dir))) 40 | self.log = Log(self.output_path) # writes the log under the current working dir. if output_dir is not provided 41 | 42 | def init_model(self, **kwargs): 43 | """ 44 | Initializes the actual model. 45 | 46 | """ 47 | self.model = self.model_class(**kwargs) 48 | self.record_experimental_setup() 49 | 50 | def train_workflow(self, evaluate=True, save_model=True): 51 | """ 52 | Runs a workflow of steps such as training and evaluation. One could modify it in order to create other procedures. 53 | :param evaluate: if True, evaluation is performed; otherwise it is skipped. 54 | 55 | """ 56 | assert self.train_data_path 57 | 58 | for epoch in range(1, self.epochs+1): 59 | 60 | self.log.write('epoch %d' % epoch) 61 | self.train(data_path=self.train_data_path) 62 | 63 | # evaluate training and validation accuracy and loss 64 | if evaluate: 65 | # FIXME: at the moment the training evaluation is disabled, as it's too expensive to perform evaluation over the whole large dataset.
66 | # metrics = self._measure_performance(data_path=self.train_data_path) 67 | # if metrics: 68 | # self.log.write(metrics_to_str(metrics, "training")) 69 | 70 | if self.val_data_path: 71 | metrics = self._measure_performance(data_path=self.val_data_path) 72 | self.log.write(metrics_to_str(metrics, "validation")) 73 | 74 | if evaluate and self.test_data_path: 75 | metrics = self._measure_performance(data_path=self.test_data_path) 76 | self.log.write(metrics_to_str(metrics, "test")) 77 | 78 | # save the actual model 79 | if save_model: 80 | self.save_model(os.path.join(self.output_path, 'model.pkl')) 81 | self.log.write("model is saved to: %s" % self.output_path) 82 | 83 | # run post training functions 84 | self._post_training_logic() 85 | 86 | def train(self, data_path): 87 | """ 88 | A user-accessible train function that wraps the model's train function. 89 | :type data_path: str 90 | 91 | """ 92 | iterator = self.init_iterator(data_path) 93 | for counter, batch in enumerate(iterator, 1): 94 | metrics = self._train(batch=batch) 95 | if counter % 10 == 0: 96 | self.log.write(metrics_to_str(metrics, prefix="chunk's # %d" % counter)) 97 | 98 | def load_model(self, model_file_path): 99 | """ 100 | :param model_file_path: pre-saved pkl file with a model. 101 | 102 | """ 103 | self.model = load(model_file_path) 104 | self.record_experimental_setup() 105 | self.log.write("loaded the model from: %s" % model_file_path) 106 | 107 | def save_model(self, file_path): 108 | assert self.model # the model should be initialized 109 | save(self.model, file_path) 110 | 111 | def load_params(self, params_dump_file_path=None, exclude_params=[]): 112 | """ 113 | Loads parameters from a dump and/or embeddings from a file. 114 | :param exclude_params: an array of parameter names that should NOT be initialized. 115 | 116 | """ 117 | assert self.model # the model should be initialized 118 | 119 | # general parameters loading 120 | if params_dump_file_path: 121 | init_params = self.model.load_params(file_path=params_dump_file_path, exclude_params=exclude_params) 122 | self.log.write("loaded parameters from: %s" % params_dump_file_path) 123 | self.log.write("initialized the following parameters: %s" % (", ".join(init_params))) 124 | 125 | def record_experimental_setup(self): 126 | """ 127 | Records the experimental setup (basic and model-specific) to a log file. 128 | 129 | """ 130 | setup = merge_ordered_dicts(infer_attributes_to_log(self.model), infer_attributes_to_log(self)) 131 | self.log.write(format_experimental_setup(setup), include_timestamp=False) 132 | 133 | # the following functions will be implemented in children classes 134 | # TODO: write a basic documentation for those functions 135 | 136 | def _train(self, **kwargs): 137 | """ 138 | A specific wrapper over the model's training function. 139 | 140 | """ 141 | raise NotImplementedError 142 | 143 | def _measure_performance(self, **kwargs): 144 | """ 145 | Computes the performance of the model and returns a dictionary with names and values. 146 | 147 | """ 148 | raise NotImplementedError 149 | 150 | def _post_training_logic(self, **kwargs): 151 | """ 152 | Logic to be executed in train_workflow after the model has finished training.
153 | 154 | """ 155 | pass 156 | -------------------------------------------------------------------------------- /libraries/evaluation/GloVe/evaluate.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import numpy as np 3 | import os 4 | 5 | 6 | def glove_evaluate(vocab_file, vectors_file, bins=None, max_count=None): 7 | counts = {} 8 | words = [] 9 | print("---------------------------------------------------------") 10 | print 'Reading %s' % vectors_file 11 | if max_count is not None: 12 | print "maximum frequency is %d (more freq. words are discarded)" % max_count 13 | with open(vocab_file, 'r') as f: 14 | # words = [x.rstrip().split(' ')[0] for x in f.readlines()] 15 | for line in f: 16 | word, count = line.split(' ') 17 | count = int(count) 18 | if max_count is not None and max_count < count: 19 | continue 20 | counts[word] = count 21 | words.append(word) 22 | vocab = {w: idx for idx, w in enumerate(words)} 23 | with open(vectors_file, 'r') as f: 24 | vectors = {} 25 | for line in f: 26 | vals = line.rstrip().split(' ') 27 | if vals[0] in vocab: 28 | vectors[vals[0]] = [float(x) for x in vals[1:]] 29 | if bins is not None: 30 | hist, bin_edges = np.histogram(counts.values(), bins=bins) 31 | 32 | vocab_size = len(vocab) 33 | ivocab = {idx: w for idx, w in enumerate(words)} 34 | 35 | vector_dim = len(vectors[ivocab[0]]) 36 | W = np.zeros((vocab_size, vector_dim)) 37 | for word, v in vectors.items(): 38 | W[vocab[word], :] = v 39 | 40 | # normalize each word vector to unit length (L2 norm) 41 | W_norm = np.zeros(W.shape) 42 | d = (np.sum(W ** 2, 1) ** (0.5)) 43 | W_norm = (W.T / d).T 44 | 45 | # print "total vocab size is %d" % len(vocab) 46 | 47 | 48 | if bins is None: 49 | evaluate_vectors(W_norm, vocab) 50 | else: 51 | for i in range(len(bin_edges)-1): 52 | 53 | if hist[i] == 0: 54 | continue 55 | temp_vocab = {} 56 | temp_vectors = np.zeros((hist[i], vector_dim)) 57 | edge1 = bin_edges[i] 58 | edge2 = bin_edges[i+1] 59 | j = 0 # current idx 60 | # collecting words that are between two edges 61 | for word, idx in vocab.items(): 62 | if counts[word] >= edge1 and counts[word] <= edge2: 63 | temp_vocab[word] = j 64 | temp_vectors[j, :] = W_norm[idx] 65 | j += 1 66 | # run evaluation 67 | print("---------------------------------------------------------") 68 | print "bin frequency limits are [%f, %f]" % (edge1, edge2) 69 | print "temp vocab's size is %d " % len(temp_vocab) 70 | print "temp vectors size is %d" % len(temp_vectors) 71 | evaluate_vectors(temp_vectors, temp_vocab, short=True) 72 | 73 | 74 | def evaluate_vectors(W, vocab, short=False): 75 | """Evaluate the trained word vectors on a variety of tasks""" 76 | 77 | filenames = [ 78 | 'capital-common-countries.txt', 'capital-world.txt', 'currency.txt', 79 | 'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt', 80 | 'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt', 81 | 'gram5-present-participle.txt', 'gram6-nationality-adjective.txt', 82 | 'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt', 83 | ] 84 | #prefix = './eval/question-data/' 85 | prefix = os.path.dirname(os.path.realpath(__file__))+'/question-data' 86 | 87 | # to avoid memory overflow, could be increased/decreased 88 | # depending on system and vocab size 89 | split_size = 100 90 | 91 | correct_sem = 0; # count correct semantic questions 92 | correct_syn = 0; # count correct syntactic questions 93 | correct_tot = 0 # count correct questions 94 | count_sem = 0; # count all
semantic questions 95 | count_syn = 0; # count all syntactic questions 96 | count_tot = 0 # count all questions 97 | full_count = 0 # count all questions, including those with unknown words 98 | 99 | for i in range(len(filenames)): 100 | with open('%s/%s' % (prefix, filenames[i]), 'r') as f: 101 | full_data = [line.rstrip().split(' ') for line in f] 102 | full_count += len(full_data) 103 | data = [x for x in full_data if all(word in vocab for word in x)] 104 | 105 | indices = np.array([[vocab[word] for word in row] for row in data]) 106 | if len(indices)==0: continue 107 | ind1, ind2, ind3, ind4 = indices.T 108 | 109 | predictions = np.zeros((len(indices),)) 110 | num_iter = int(np.ceil(len(indices) / float(split_size))) 111 | for j in range(num_iter): 112 | subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1))) 113 | pred_vec = (W[ind2[subset], :] - W[ind1[subset], :] 114 | + W[ind3[subset], :]) 115 | #cosine similarity if input W has been normalized 116 | dist = np.dot(W, pred_vec.T) 117 | 118 | for k in range(len(subset)): 119 | dist[ind1[subset[k]], k] = -np.Inf 120 | dist[ind2[subset[k]], k] = -np.Inf 121 | dist[ind3[subset[k]], k] = -np.Inf 122 | 123 | # predicted word index 124 | predictions[subset] = np.argmax(dist, 0).flatten() 125 | 126 | val = (ind4 == predictions) # correct predictions 127 | count_tot = count_tot + len(ind1) 128 | correct_tot = correct_tot + sum(val) 129 | if i < 5: 130 | count_sem = count_sem + len(ind1) 131 | correct_sem = correct_sem + sum(val) 132 | else: 133 | count_syn = count_syn + len(ind1) 134 | correct_syn = correct_syn + sum(val) 135 | if not short: 136 | print("%s:" % filenames[i]) 137 | print('ACCURACY TOP1: %.2f%% (%d/%d)' % 138 | (np.mean(val) * 100, np.sum(val), len(val))) 139 | 140 | print('Questions seen/total: %.2f%% (%d/%d)' % 141 | (100 * count_tot / float(full_count), count_tot, full_count)) 142 | if count_sem != 0: 143 | print('Semantic accuracy: %.2f%% (%i/%i)' % 144 | (100 * correct_sem / float(count_sem), correct_sem, count_sem)) 145 | if count_syn: 146 | print('Syntactic accuracy: %.2f%% (%i/%i)' % 147 | (100 * correct_syn / float(count_syn), correct_syn, count_syn)) 148 | if count_tot != 0: 149 | print('Total accuracy: %.2f%% (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot)) 150 | 151 | --------------------------------------------------------------------------------
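The analogy evaluation in evaluate_vectors reduces to a nearest-neighbour search over unit-length vectors: for a question (a, b, c, d) it predicts the word whose row in W is closest, by cosine similarity, to W[b] - W[a] + W[c], masking out the three question words, and counts a hit when the prediction equals d. Below is a minimal single-question sketch of that prediction step; the W_norm, vocab and ivocab arguments are hypothetical inputs in the same layout as glove_evaluate builds, not code from this repository.

import numpy as np

def answer_analogy(W_norm, vocab, ivocab, a, b, c):
    # rows of W_norm are unit length, so a dot product equals cosine similarity
    ia, ib, ic = vocab[a], vocab[b], vocab[c]
    pred_vec = W_norm[ib] - W_norm[ia] + W_norm[ic]
    sims = W_norm.dot(pred_vec)
    # never answer with one of the question words, mirroring the -Inf masking above
    sims[[ia, ib, ic]] = -np.inf
    return ivocab[int(np.argmax(sims))]

# e.g. answer_analogy(W_norm, vocab, ivocab, 'man', 'king', 'woman') should ideally return 'queen'

evaluate_vectors performs the same computation in batches of split_size questions so that the full similarity matrix never has to be materialised at once.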