├── models
    ├── __init__.py
    ├── bword2vec.py
    └── support.py
├── libraries
    ├── tools
    │   ├── __init__.py
    │   ├── ordered_attrs.py
    │   ├── attr_order_preservation.py
    │   ├── log.py
    │   └── word_processor.py
    ├── utils
    │   ├── __init__.py
    │   ├── other.py
    │   └── paths_and_files.py
    ├── evaluation
    │   ├── __init__.py
    │   ├── GloVe
    │   │   ├── __init__.py
    │   │   ├── distance.py
    │   │   ├── word_analogy.py
    │   │   ├── README.md
    │   │   └── evaluate.py
    │   ├── entailment
    │   │   ├── __init__.py
    │   │   └── data
    │   │   │   └── bench
    │   │   │       └── baroni2012
    │   │   │           ├── data_lex_val copy.tsv
    │   │   │           ├── data_lex_val.tsv
    │   │   │           └── data_rnd_val.tsv
    │   ├── lexsub
    │   │   ├── __init__.py
    │   │   ├── jcs
    │   │   │   ├── __init__.py
    │   │   │   ├── data
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── pos.py
    │   │   │   │   ├── tree2conll.py
    │   │   │   │   ├── conll_line.py
    │   │   │   │   ├── context_instance.py
    │   │   │   │   └── embedding.py
    │   │   │   ├── evaluation
    │   │   │   │   ├── __init__.py
    │   │   │   │   ├── lst
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   ├── coinco_split_dev_test.py
    │   │   │   │   │   ├── special_word_marker.py
    │   │   │   │   │   ├── extract_lst_candidates.py
    │   │   │   │   │   ├── lst_gap.py
    │   │   │   │   │   └── preprocess_lst_test.py
    │   │   │   │   └── measures
    │   │   │   │   │   ├── __init__.py
    │   │   │   │   │   └── generalized_average_precision.py
    │   │   │   ├── text2numpy.py
    │   │   │   ├── embedding_inferrer.py
    │   │   │   ├── jcs_io.py
    │   │   │   ├── cs_inferrer.py
    │   │   │   └── context2vec_inferrer.py
    │   │   ├── main
    │   │   │   ├── __init__.py
    │   │   │   ├── simulators_interfaces
    │   │   │   │   ├── __init__.py
    │   │   │   │   └── bsg_simulator_interface.py
    │   │   │   ├── pos.py
    │   │   │   ├── skipgram_embeddings.py
    │   │   │   ├── lex_sub.py
    │   │   │   ├── support.py
    │   │   │   └── context_instance.py
    │   │   ├── datasets
    │   │   │   ├── lst_all.xml
    │   │   │   ├── lst_all.preprocessed
    │   │   │   ├── lst_test.preprocessed
    │   │   │   └── coinco_all.no_problematic.gold
    │   │   ├── run_lexsub.py
    │   │   └── README.md
    │   ├── word_sim
    │   │   ├── __init__.py
    │   │   ├── .README.md.swp
    │   │   ├── data
    │   │   │   └── word-sim
    │   │   │   │   ├── EN-MC-30.txt
    │   │   │   │   ├── EN-RG-65.txt
    │   │   │   │   ├── EN-YP-130.txt
    │   │   │   │   ├── EN-VERB-143.txt
    │   │   │   │   ├── EN-WS-353-SIM.txt
    │   │   │   │   └── EN-WS-353-REL.txt
    │   │   ├── read_write.py
    │   │   ├── wordsim.py
    │   │   ├── all_wordsim.py
    │   │   ├── README.md
    │   │   └── ranking.py
    │   └── support.py
    ├── simulators
    │   ├── __init__.py
    │   ├── base_simulator.py
    │   ├── bsg_simulator.py
    │   └── support.py
    ├── data_iterators
    │   ├── __init__.py
    │   ├── support.py
    │   ├── base_data_iterator.py
    │   └── open_text_data_iterator.py
    ├── __init__.py
    ├── misc
    │   ├── __init__.py
    │   ├── non_linearity.py
    │   ├── optimizations.py
    │   └── initializers.py
    ├── tokenizers
    │   ├── __init__.py
    │   ├── standard_tokenizer.py
    │   └── bsg_tokenizer.py
    ├── batch_iterators
    │   ├── __init__.py
    │   ├── base_batch_iterator.py
    │   ├── support.py
    │   ├── sentence_batch_iterator.py
    │   └── window_batch_iterator.py
    └── theano_support
    │   ├── __init__.py
    │   └── extra.py
├── eval
    ├── __init__.py
    ├── example_word_pairs.txt
    ├── word_pairs_eval.py
    └── support.py
├── interfaces
    ├── __init__.py
    ├── support.py
    ├── i_bsg.py
    ├── interface_configurator.py
    └── i_base.py
├── layers
    ├── __init__.py
    ├── custom
    │   ├── __init__.py
    │   └── bsg_encoder.py
    ├── standard
    │   ├── __init__.py
    │   ├── embeddings.py
    │   └── dense.py
    ├── support.py
    └── layer.py
├── .gitignore
├── requirements.txt
├── run_bsg.py
└── README.md


/models/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/tools/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/simulators/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/evaluation/GloVe/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/eval/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/data_iterators/__init__.py:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/libraries/evaluation/entailment/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/interfaces/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/layers/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/data/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | .idea
2 | *.pyc
3 | output
4 | .DS_Store


--------------------------------------------------------------------------------
/layers/custom/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/layers/standard/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/misc/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/lst/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/tokenizers/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/batch_iterators/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/measures/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/libraries/theano_support/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/simulators_interfaces/__init__.py:
--------------------------------------------------------------------------------
1 | __author__ = 'topLvl'
2 | 


--------------------------------------------------------------------------------
/libraries/evaluation/entailment/data/bench/baroni2012/data_lex_val copy.tsv:
--------------------------------------------------------------------------------
1 | mosque	castle	True
2 | 


--------------------------------------------------------------------------------
/eval/example_word_pairs.txt:
--------------------------------------------------------------------------------
1 | pet cat
2 | knight human
3 | coffee drink
4 | dog animal
5 | coffee espresso
6 | cow animal
7 | 


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/.README.md.swp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/word_sim/.README.md.swp


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/datasets/lst_all.xml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/lst_all.xml


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | Theano==0.9.0
2 | numpy==1.14.2
3 | nltk==3.2.2,
4 | scipy==0.18.1,
5 | -e git+https://github.com/Lasagne/Lasagne.git#egg=Lasagne-0.2.dev1


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/datasets/lst_all.preprocessed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/lst_all.preprocessed


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/datasets/lst_test.preprocessed:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/lst_test.preprocessed


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/datasets/coinco_all.no_problematic.gold:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/abrazinskas/BSG/HEAD/libraries/evaluation/lexsub/datasets/coinco_all.no_problematic.gold


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/pos.py:
--------------------------------------------------------------------------------
1 | from nltk.corpus import wordnet
2 | 
3 | to_wordnet_pos = {'N':wordnet.NOUN,'J':wordnet.ADJ,'V':wordnet.VERB,'R':wordnet.ADV}
4 | from_lst_pos = {'j':'J','a':'J', 'v':'V', 'n':'N', 'r':'R'}


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/data/pos.py:
--------------------------------------------------------------------------------
1 | from nltk.corpus import wordnet
2 | 
3 | to_wordnet_pos = {'N':wordnet.NOUN,'J':wordnet.ADJ,'V':wordnet.VERB,'R':wordnet.ADV}
4 | from_lst_pos = {'j':'J','a':'J', 'v':'V', 'n':'N', 'r':'R'}


--------------------------------------------------------------------------------
/libraries/data_iterators/support.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import unicodedata
3 | 
4 | # removes/replaces strange symbols like é
5 | def deal_with_accents(str):
6 |     return unicodedata.normalize('NFD', str)#.encode('ascii', 'ignore')
7 | 
8 | 


--------------------------------------------------------------------------------
/libraries/tools/ordered_attrs.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | 
 4 | class OrderedAttrs():
 5 |     """
 6 |     Makes sure that attributes are stored in the order of assignment. 
 7 |     
 8 |     """
 9 |     def __init__(self):
10 |         self.__dict__ = OrderedDict()
11 | 
12 |     def __setattr__(self, key, value):
13 |         self.__dict__[key] = value


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/data/tree2conll.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | from conll_line import ConllLine
 3 | 
 4 | for tree_line in sys.stdin:
 5 |     tree_line_stripped = tree_line.strip()
 6 |     if len(tree_line_stripped) > 0:        
 7 |         conll = ConllLine()
 8 |         conll.from_tree_line(tree_line_stripped)
 9 |         sys.stdout.write(str(conll)+'\n')
10 |     else:
11 |         sys.stdout.write('\n')


--------------------------------------------------------------------------------
/libraries/tools/attr_order_preservation.py:
--------------------------------------------------------------------------------
 1 | # A solution to preserve the order of attribute assignment
 2 | from collections import OrderedDict
 3 | 
 4 | 
 5 | class AttrOrderPreservation:
 6 |     def __init__(self, obj):
 7 |         self.obj = obj
 8 |         self.attr_order = OrderedDict()
 9 | 
10 |     def add_attr(self, name, value):
11 |         self.attr_order[name] = value
12 |         setattr(self.obj,name, value)


--------------------------------------------------------------------------------
/libraries/simulators/base_simulator.py:
--------------------------------------------------------------------------------
 1 | from support import load
 2 | 
 3 | 
 4 | class BaseSimulator:
 5 |     def __init__(self, vocab, model_file_path):
 6 |         """
 7 |         The passed in the .pkl format model has to have "encode" and "compute_prior_params" methods.
 8 | 
 9 |         """
10 |         self.vocab = vocab
11 |         self.model = load(model_file_path)
12 |         assert hasattr(self.model, 'encode')
13 | 


--------------------------------------------------------------------------------
/libraries/data_iterators/base_data_iterator.py:
--------------------------------------------------------------------------------
 1 | from nltk import word_tokenize as default_tokenizer
 2 | 
 3 | 
 4 | class BaseDataIterator():
 5 | 
 6 |     def __init__(self, tokenizer=None, input_encoding='utf-8'):
 7 |         """
 8 |         Base data iterator that contains the general set_data_path method
 9 | 
10 |         """
11 |         self.tokenizer = tokenizer if tokenizer else default_tokenizer
12 |         self.data_path = None
13 |         self.input_encoding = input_encoding
14 | 
15 |     def set_data_path(self, data_path):
16 |         self.data_path = data_path
17 | 


--------------------------------------------------------------------------------
/libraries/misc/non_linearity.py:
--------------------------------------------------------------------------------
 1 | import theano.tensor as T
 2 | 
 3 | 
 4 | class NonLinearity():
 5 |     def __init__(self, type="linear"):
 6 |         assert type in ['linear', 'relu', 'sigmoid', 'hard_sigmoid', 'tanh']
 7 |         self.type = type
 8 | 
 9 |     def __call__(self, x):
10 |         if self.type == 'linear':
11 |             return x
12 |         if self.type == "relu":
13 |             return T.nnet.relu(x)
14 |         if self.type == 'sigmoid':
15 |             return T.nnet.sigmoid(x)
16 |         if self.type == 'hard_sigmoid':
17 |             return T.nnet.hard_sigmoid(x)
18 |         if self.type == 'tanh':
19 |             return T.tanh(x)
20 | 
21 | 
22 | 


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/data/word-sim/EN-MC-30.txt:
--------------------------------------------------------------------------------
 1 | car	automobile	3.92
 2 | gem	jewel	3.84
 3 | journey	voyage	3.84
 4 | boy	lad	3.76
 5 | coast	shore	3.70
 6 | asylum	madhouse	3.61
 7 | magician	wizard	3.50
 8 | midday	noon	3.42
 9 | furnace	stove	3.11
10 | food	fruit	3.08
11 | bird	cock	3.05
12 | bird	crane	2.97
13 | tool	implement	2.95
14 | brother	monk	2.82
15 | lad	brother	1.66
16 | crane	implement	1.68
17 | journey	car	1.16
18 | monk	oracle	1.10
19 | cemetery	woodland	0.95
20 | food	rooster	0.89
21 | coast	hill	0.87
22 | forest	graveyard	0.84
23 | shore	woodland	0.63
24 | monk	slave	0.55
25 | coast	forest	0.42
26 | lad	wizard	0.42
27 | chord	smile	0.13
28 | glass	magician	0.11
29 | rooster	voyage	0.08
30 | noon	string	0.08
31 | 


--------------------------------------------------------------------------------
/layers/support.py:
--------------------------------------------------------------------------------
 1 | import theano.tensor as T
 2 | import inspect
 3 | 
 4 | def inverse_sigmoid(decay_rate, batch_nr):
 5 |     """
 6 |     Computes inverse sigmoid function's value which is used in schedules sampling.
 7 |     :param decay_rate: hyper-parameter that controls the decay.
 8 | 
 9 |     """
10 |     return decay_rate / (decay_rate + T.exp(batch_nr / decay_rate))
11 | 
12 | 
13 | def select_matching_args(func, arguments_dict):
14 |     """
15 |     :return a hash with matching the function's arguments dictionary.
16 |     """
17 |     func_args = inspect.getargspec(func)[0]
18 |     matching_args = {}
19 |     for func_arg in func_args:
20 |         if func_arg in arguments_dict:
21 |             matching_args[func_arg] = arguments_dict[func_arg]
22 |     return matching_args


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/simulators_interfaces/bsg_simulator_interface.py:
--------------------------------------------------------------------------------
 1 | from libraries.simulators.bsg_simulator import BsgSimulator
 2 | from libraries.simulators.support import KL
 3 | 
 4 | 
 5 | class BsgSimulatorInterface:
 6 |     """
 7 |     Interface class to access simulators.
 8 | 
 9 |     """
10 |     def __init__(self, vocab, model_file_path):
11 |         self.simulator = BsgSimulator(vocab=vocab, model_file_path=model_file_path)
12 | 
13 |     def score(self, target, left_context, right_context, candidates, **kwargs):
14 |             scores = {}
15 |             mu_q, sigma_q = self.simulator.encode(context_words=left_context+right_context, center_word=target)
16 |             for cand in candidates:
17 |                 mu_p, sigma_p = self.simulator.get_representation(cand)
18 |                 scores[cand] = -1*KL(mu_q, sigma_q, mu_p, sigma_p)
19 |             return scores


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/read_write.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import gzip
 3 | import numpy
 4 | import math
 5 | 
 6 | from collections import Counter
 7 | from operator import itemgetter
 8 | 
 9 | ''' Read all the word vectors and normalize them '''
10 | def read_word_vectors(filename):    
11 |   word_vecs = {}
12 |   if filename.endswith('.gz'): file_object = gzip.open(filename, 'r')
13 |   else: file_object = open(filename, 'r')
14 | 
15 |   for line_num, line in enumerate(file_object):
16 |     line = line.strip().lower()
17 |     word = line.split()[0]
18 |     word_vecs[word] = numpy.zeros(len(line.split())-1, dtype=float)
19 |     for index, vec_val in enumerate(line.split()[1:]):
20 |       word_vecs[word][index] = float(vec_val)      
21 |     ''' normalize weight vector '''
22 |     word_vecs[word] /= math.sqrt((word_vecs[word]**2).sum() + 1e-6)        
23 | 
24 |   sys.stderr.write("Vectors read from: "+filename+" \n")
25 |   return word_vecs
26 | 


--------------------------------------------------------------------------------
/libraries/tools/log.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from libraries.utils.paths_and_files import create_folders_if_not_exist
 3 | from time import strftime
 4 | 
 5 | 
 6 | # A general purpose class for logging
 7 | class Log():
 8 |     def __init__(self, folder):
 9 |         self.file_path = os.path.join(folder, "log_"+strftime("%b_%d_%H_%M_%S")+'.txt')
10 |         create_folders_if_not_exist(self.file_path)
11 | 
12 |     def write(self, string, also_print=True, include_timestamp=True):
13 |         """
14 |         :param string: what string to write to the log
15 |         :param also_print: if set to True, will also print to the console
16 |         :param include_timestamp: whether include the timestamp
17 |         """
18 |         if include_timestamp:
19 |             string = "%s [INFO]: %s" % (strftime("%H:%M:%S"), string)
20 |         if also_print:
21 |             print(string)
22 |         with open(self.file_path, "a") as f:
23 |             f.write(string+" \n")
24 | 


--------------------------------------------------------------------------------
/libraries/theano_support/extra.py:
--------------------------------------------------------------------------------
 1 | # this file contains functions that are useful for Theano models.
 2 | from theano import tensor as T
 3 | 
 4 | def expand_dims(x, dim=-1):
 5 |     """Add a 1-sized dimension at index "dim".
 6 |     """
 7 |     # TODO: `keras_shape` inference.
 8 |     pattern = [i for i in range(x.type.ndim)]
 9 |     if dim < 0:
10 |         if x.type.ndim == 0:
11 |             dim = 0
12 |         else:
13 |             dim = dim % x.type.ndim + 1
14 |     pattern.insert(dim, 'x')
15 |     return x.dimshuffle(pattern)
16 | 
17 | 
18 | def add_one_dim(x, dim=-1):
19 |     pattern = list(x.shape)
20 |     if dim < 0:
21 |         if x.type.ndim == 0:
22 |             dim = 0
23 |         else:
24 |             dim = dim % x.type.ndim + 1
25 |     pattern.insert(dim, 1)
26 |     return T.reshape(x, pattern)
27 | 
28 | def squeeze(x, axis):
29 |     """Remove a 1-dimension from the tensor at index "axis".
30 |     """
31 |     # TODO: `keras_shape` inference.
32 |     shape = list(x.shape)
33 |     shape.pop(axis)
34 |     return T.reshape(x, tuple(shape))


--------------------------------------------------------------------------------
/layers/standard/embeddings.py:
--------------------------------------------------------------------------------
 1 | from layers.layer import Layer
 2 | from libraries.theano_support.extra import expand_dims
 3 | 
 4 | 
 5 | class Embeddings(Layer):
 6 |     def __init__(self, name, collection_size, output_dim, **kwargs):
 7 |         Layer.__init__(self, name=name, **kwargs)
 8 |         self.collection_size = collection_size
 9 |         self.output_dim = output_dim
10 |         self.W = self.add_param("W", shape=(collection_size, output_dim), init_type=self.init_type,
11 |                                 regularizable=self.regularizable)
12 | 
13 |     def __call__(self, x, mask=None, perform_dimshuffle=True):
14 |         """
15 |         :return tensor [batch_size, output_dim, sequence_length] or [batch_size, output_dim]
16 | 
17 |         """
18 |         # x = Print("x")(x)
19 |         res = self.W[x]
20 |         if mask:
21 |             mask = expand_dims(mask, 2)
22 |             res = res * mask
23 |         # we want to make sure that output_dims are rows, and words are columns
24 |         if res.ndim == 3 and perform_dimshuffle:
25 |             res = res.dimshuffle((0, 2, 1))
26 |         return res


--------------------------------------------------------------------------------
/layers/standard/dense.py:
--------------------------------------------------------------------------------
 1 | from theano import tensor as T
 2 | from layers.layer import Layer
 3 | from libraries.misc.non_linearity import NonLinearity
 4 | 
 5 | 
 6 | class Dense(Layer):
 7 |     def __init__(self, name, input_dim, output_dim, non_linearity='linear', init_type='uniform', regularizable=True):
 8 |         """
 9 |         A simple fully connected layer that performs affine transformations following by a non-linearity.
10 | 
11 |         """
12 |         Layer.__init__(self, name)
13 |         self.non_linearity = NonLinearity(type=non_linearity)
14 |         self.W = self.add_param(name="W", shape=(input_dim, output_dim), init_type=init_type,
15 |                                 regularizable=regularizable)
16 |         self.b = self.add_param(name="b", shape=(output_dim, ), init_type='zeros')
17 | 
18 |     def __call__(self, x):
19 |         """
20 |         :param x: tensor [batch_size, input_dim]
21 |         :return: tensor [batch_size, output_dim]
22 | 
23 |         """
24 |         output = T.dot(x, self.W) + self.b
25 |         output = self.non_linearity(output)
26 |         return output
27 | 
28 | 
29 | 
30 | 


--------------------------------------------------------------------------------
/libraries/batch_iterators/base_batch_iterator.py:
--------------------------------------------------------------------------------
 1 | from multiprocessing import Process, Queue
 2 | 
 3 | 
 4 | class BaseBatchIterator():
 5 | 
 6 |     def __init__(self):
 7 |         pass
 8 | 
 9 |     def __iter__(self):
10 |         """
11 |         Separate process pre-loading of data and iteration over its batches.
12 | 
13 |         """
14 |         process, queue = self.__i_parallel_load_data_batches()
15 |         process.start()
16 |         process.deamon = True
17 |         while True:
18 |             batch = queue.get()
19 |             if batch is None:
20 |                 process.join()
21 |                 break  # the file has ended
22 |             yield batch
23 | 
24 |     def load_data_batches_to_queue(self, queue):
25 |         raise NotImplementedError  # this has to be assigned in a subclass
26 | 
27 |     def __i_parallel_load_data_batches(self, queue_size=5):
28 |         """
29 |         Loads batches on a separate process.
30 | 
31 |         """
32 |         queue = Queue(queue_size)
33 |         process = Process(target=self.load_data_batches_to_queue, args=(queue, ))
34 |         return process, queue
35 | 
36 | 


--------------------------------------------------------------------------------
/libraries/data_iterators/open_text_data_iterator.py:
--------------------------------------------------------------------------------
 1 | from nltk import word_tokenize as default_tokenizer
 2 | from support import deal_with_accents
 3 | from libraries.utils.paths_and_files import get_file_paths
 4 | 
 5 | 
 6 | class OpenTextDataIterator():
 7 | 
 8 |     def __init__(self, tokenizer=None):
 9 |         """
10 |         Text data iterator for open text, that returns tokenized sentences. Assumes that each sentence is separated
11 |         by a new line.
12 | 
13 |         """
14 |         self.tokenizer = tokenizer if tokenizer else default_tokenizer
15 |         self.data_path = None
16 | 
17 |     def set_data_path(self, data_path):
18 |         self.data_path = data_path
19 | 
20 |     def __iter__(self):
21 |         if not self.data_path:
22 |             raise ValueError("please specify the data_path first by calling set_data_path()")
23 |         for filename in get_file_paths(self.data_path):
24 |             with open(filename) as f:
25 |                 for line in f:
26 |                     tokens = self.tokenizer(deal_with_accents(line.strip().lower().decode('utf-8', 'ignore')))
27 |                     yield tokens,
28 | 


--------------------------------------------------------------------------------
/libraries/misc/optimizations.py:
--------------------------------------------------------------------------------
 1 | # This file contains learning rate optimizations
 2 | import lasagne
 3 | 
 4 | 
 5 | class LROpt:
 6 |     def __init__(self, learning_rate):
 7 |         self.alpha = learning_rate
 8 | 
 9 | 
10 | class Adam(LROpt):
11 |     def __init__(self, learning_rate, beta1, beta2):
12 |         LROpt.__init__(self, learning_rate=learning_rate)
13 |         self.beta1 = beta1
14 |         self.beta2 = beta2
15 | 
16 |     def __call__(self, cost, params):
17 |         return lasagne.updates.adam(cost, params, learning_rate=self.alpha, beta1=self.beta1, beta2=self.beta2)
18 | 
19 | 
20 | class SGD(LROpt):
21 |     def __init__(self, learning_rate):
22 |         LROpt.__init__(self, learning_rate)
23 | 
24 |     def __call__(self, cost, params):
25 |         return lasagne.updates.sgd(cost, params, learning_rate=self.alpha)
26 | 
27 | 
28 | class AdaGrad(LROpt):
29 |     def __init__(self, learning_rate, eps):
30 |         LROpt.__init__(self, learning_rate)
31 |         self.eps = eps
32 | 
33 |     def __call__(self, cost, params):
34 |         return lasagne.updates.adagrad(cost, params, learning_rate=self.alpha, epsilon=self.eps)
35 | 
36 | 


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/wordsim.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | 
 4 | from read_write import read_word_vectors
 5 | from ranking import *
 6 | 
 7 | 
 8 | def word_sim(word_vec_file, word_sim_file):
 9 | 
10 |       word_vecs = read_word_vectors(word_vec_file)
11 |       print '================================================================================='
12 |       print "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
13 |       print '================================================================================='
14 | 
15 |       manual_dict, auto_dict = ({}, {})
16 |       not_found, total_size = (0, 0)
17 |       for line in open(word_sim_file,'r'):
18 |         line = line.strip().lower()
19 |         word1, word2, val = line.split()
20 |         if word1 in word_vecs and word2 in word_vecs:
21 |           manual_dict[(word1, word2)] = float(val)
22 |           auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2])
23 |         else:
24 |           not_found += 1
25 |         total_size += 1
26 |       print "%15s" % str(total_size), "%15s" % str(not_found),
27 |       print "%15.4f" % spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict))
28 | 


--------------------------------------------------------------------------------
/libraries/simulators/bsg_simulator.py:
--------------------------------------------------------------------------------
 1 | from base_simulator import BaseSimulator
 2 | import numpy as np
 3 | 
 4 | 
 5 | class BsgSimulator(BaseSimulator):
 6 |     """
 7 |     This class will both work for classical BSG and BSG with LSTM encoder.
 8 | 
 9 |     """
10 |     def __init__(self, **kwargs):
11 |         BaseSimulator.__init__(self, **kwargs)
12 | 
13 |     def get_representation(self, word):
14 |         word_id = self.vocab[word].id
15 |         mu = self.model.get_word_mu_rep(word_id)
16 |         sigma = self.model.get_word_sigma_rep(word_id)
17 |         return mu, sigma
18 | 
19 |     def encode(self, center_word, context_words):
20 |         """
21 |         :param center_word: int
22 |         :param context_words: vector of ints
23 |         """
24 |         # convert to vocab_ids
25 |         center_word_id = np.int32(self.vocab[center_word].id)
26 |         context_word_ids = np.array([obj.id for obj in self.vocab[context_words]], dtype="int32")
27 |         # generate the mask of ones
28 |         mask = np.ones([1, len(context_words)], dtype="float32")
29 |         mu, sigma = self.model.encode([context_word_ids], [center_word_id],  mask)
30 |         return mu[0], sigma[0]
31 | 


--------------------------------------------------------------------------------
/libraries/tools/word_processor.py:
--------------------------------------------------------------------------------
 1 | # word processors that use regular expressions to clean words and tokenization
 2 | try:
 3 |     import re2 as re
 4 | except ImportError:
 5 |     import re
 6 | 
 7 | 
 8 | class WordProcessor:
 9 |     def __init__(self, word_processor_type='default'):
10 |         self.__allowed_types = ['none', 'default', 'open_text']
11 |         # sanity checks for input
12 |         assert word_processor_type in self.__allowed_types
13 |         # assigning processing function
14 |         if word_processor_type == 'none':
15 |             self.__call__ = lambda x: x
16 |         if word_processor_type == 'default':
17 |             self.__call__ = lambda word: re.sub(r'[^\w_,.?@!$#\':\/\-()]|[,\'?@$#]{2,}', "", word)
18 |         if word_processor_type == "open_text":
19 |             self.__call__ = self.__open_text_cleaner
20 | 
21 |     @staticmethod
22 |     def __open_text_cleaner(word):
23 |         """
24 |         Direct copy from the original BSG setup. The tokens matching logic was moved to bsg_tokenizer.py
25 | 
26 |         """
27 |         word = re.sub(r'[^\w\'\-]|[\'\-\_]{2,}', "", word)
28 |         if len(word) == 1:
29 |             word = re.sub(r'[^\daiu]', '', word)
30 |         return word


--------------------------------------------------------------------------------
/libraries/tokenizers/standard_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from libraries.tools.word_processor import WordProcessor
 2 | from nltk import word_tokenize as external_tokenizer
 3 | 
 4 | 
 5 | # A more advanced tokenizer that both tokenizes and cleans textual data
 6 | class StandardTokenizer():
 7 | 
 8 |     def __init__(self, word_processor_type='none', use_external_tokenizer=True):
 9 |         """
10 |         :param use_external_tokenizer: whether to use or word_tokenize tokenizer or rely on the simple splitter(x.split()).
11 |         """
12 |         if use_external_tokenizer:
13 |             self.tokenizer = external_tokenizer
14 |         else:
15 |             self.tokenizer = lambda x: x.split()  # assuming that data was already tokenized
16 |         self.word_processor = WordProcessor(word_processor_type=word_processor_type)
17 | 
18 |     def __call__(self, sentence):
19 |         """
20 |         :param sentence: a string of words
21 |         :return: a list of clean tokens
22 | 
23 |         """
24 |         words = self.tokenizer(sentence)
25 |         tokens = []
26 |         for word in words:
27 |             token = self.word_processor(word)
28 |             if token == "":
29 |                 continue
30 |             tokens.append(token)
31 |         return tokens


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/text2numpy.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Convert word embeddings from text files to numpy-friendly format
 3 | '''
 4 | 
 5 | import numpy as np
 6 | import sys
 7 | 
 8 | 
 9 | def readVectors(path, header=False):
10 |     vectors = {}
11 |     with open(path) as input_f:
12 |         for i, line in enumerate(input_f.readlines()):
13 |             if header and i==0:
14 |                 continue
15 |             if line == "":
16 |                 continue
17 |             tokens = line.strip().split(' ')
18 |             vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
19 |     return vectors
20 | 
21 | inpath = sys.argv[1]
22 | outpath = sys.argv[2]
23 | header = True if sys.argv[3]=="True" else False
24 | 
25 | matrix = readVectors(inpath, header=header)
26 | 
27 | 
28 | print "done reading vectors"
29 | vocab = list(matrix.keys())
30 | vocab.sort()
31 | with open(outpath+'.vocab', 'w') as output_f:
32 |     for word in vocab:
33 |         print >>output_f, word,
34 | 
35 | new_matrix = np.zeros(shape=(len(vocab), len(matrix[vocab[0]])), dtype=np.float32)
36 | for i, word in enumerate(vocab):
37 |     if i%1000 == 0:
38 |         print(i)
39 |     new_matrix[i, :] = matrix[word]
40 | 
41 | print new_matrix.shape
42 | 
43 | np.save(outpath+'.npy', new_matrix)


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/all_wordsim.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import os
 3 | 
 4 | from read_write import read_word_vectors
 5 | from ranking import *
 6 | 
 7 | def all_word_sim(word_vec_file, word_sim_dir):
 8 | 
 9 |   word_vecs = read_word_vectors(word_vec_file)
10 |   print '================================================================================='
11 |   print "%6s" %"Serial", "%20s" % "Dataset", "%15s" % "Num Pairs", "%15s" % "Not found", "%15s" % "Rho"
12 |   print '================================================================================='
13 | 
14 |   total_rho = 0
15 |   for i, filename in enumerate(os.listdir(word_sim_dir)):
16 |     manual_dict, auto_dict = ({}, {})
17 |     not_found, total_size = (0, 0)
18 |     for line in open(os.path.join(word_sim_dir, filename),'r'):
19 |       line = line.strip().lower()
20 |       word1, word2, val = line.split()
21 |       if word1 in word_vecs and word2 in word_vecs:
22 |         manual_dict[(word1, word2)] = float(val)
23 |         auto_dict[(word1, word2)] = cosine_sim(word_vecs[word1], word_vecs[word2])
24 |       else:
25 |         not_found += 1
26 |       total_size += 1
27 |     rho = spearmans_rho(assign_ranks(manual_dict), assign_ranks(auto_dict))
28 |     total_rho += rho
29 |     print "%6s" % str(i+1), "%20s" % filename, "%15s" % str(total_size),
30 |     print "%15s" % str(not_found),
31 |     print "%15.4f" % rho
32 |   print "Sum of scores: %15.4f" % total_rho
33 | 


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/data/word-sim/EN-RG-65.txt:
--------------------------------------------------------------------------------
 1 | gem	jewel	3.94
 2 | midday	noon	3.94
 3 | automobile	car	3.92
 4 | cemetery	graveyard	3.88
 5 | cushion	pillow	3.84
 6 | boy	lad	3.82
 7 | cock	rooster	3.68
 8 | implement	tool	3.66
 9 | forest	woodland	3.65
10 | coast	shore	3.60
11 | autograph	signature	3.59
12 | journey	voyage	3.58
13 | serf	slave	3.46
14 | grin	smile	3.46
15 | glass	tumbler	3.45
16 | cord	string	3.41
17 | hill	mound	3.29
18 | magician	wizard	3.21
19 | furnace	stove	3.11
20 | asylum	madhouse	3.04
21 | brother	monk	2.74
22 | food	fruit	2.69
23 | bird	cock	2.63
24 | bird	crane	2.63
25 | oracle	sage	2.61
26 | sage	wizard	2.46
27 | brother	lad	2.41
28 | crane	implement	2.37
29 | magician	oracle	1.82
30 | glass	jewel	1.78
31 | cemetery	mound	1.69
32 | car	journey	1.55
33 | hill	woodland	1.48
34 | crane	rooster	1.41
35 | furnace	implement	1.37
36 | coast	hill	1.26
37 | bird	woodland	1.24
38 | shore	voyage	1.22
39 | cemetery	woodland	1.18
40 | food	rooster	1.09
41 | forest	graveyard	1.00
42 | lad	wizard	0.99
43 | mound	shore	0.97
44 | automobile	cushion	0.97
45 | boy	sage	0.96
46 | monk	oracle	0.91
47 | shore	woodland	0.90
48 | grin	lad	0.88
49 | coast	forest	0.85
50 | asylum	cemetery	0.79
51 | monk	slave	0.57
52 | cushion	jewel	0.45
53 | boy	rooster	0.44
54 | glass	magician	0.44
55 | graveyard	madhouse	0.44
56 | asylum	monk	0.39
57 | asylum	fruit	0.19
58 | grin	implement	0.18
59 | mound	stove	0.14
60 | automobile	wizard	0.11
61 | autograph	shore	0.06
62 | fruit	furnace	0.05
63 | noon	string	0.04
64 | rooster	voyage	0.04
65 | chord	smile	0.02
66 | 


--------------------------------------------------------------------------------
/run_bsg.py:
--------------------------------------------------------------------------------
 1 | # this file contains an example on how to run the bayesian skip-gram model
 2 | import os
 3 | from interfaces.interface_configurator import InterfaceConfigurator
 4 | from libraries.evaluation.support import evaluate
 5 | from libraries.evaluation.lexsub.run_lexsub import run_lexsub
 6 | 
 7 | train_data_path = '2M/' # change the path!
 8 | vocab_file_path = 'vocabulary/2M.txt' # if the file does not exist - it will be created
 9 | output_folder_path = "output/2M/"  # change the path(optional)
10 | 
11 | # obtain the interface to interact with the model. If one wants to change hyper-param the manual modification of the below class's method will be necessary!
12 | i_model = InterfaceConfigurator.get_interface(train_data_path, vocab_file_path, output_folder_path)
13 | 
14 | i_model.train_workflow()
15 | 
16 | # store the temporary vocab, because it can be different from the original one(e.g. smaller number of words)
17 | vocab = i_model.vocab
18 | temp_vocab_file_path = os.path.join(i_model.output_path, "vocab.txt")
19 | vocab.write(temp_vocab_file_path)
20 | 
21 | mu_vecs = [os.path.join(i_model.output_path, "mu.vectors")]
22 | sigma_vecs = [os.path.join(i_model.output_path, "sigma.vectors")]
23 | 
24 | # a complex of word embedding evaluations(word similarity, entailment, directional entailment)
25 | evaluate(mu_vectors_files=mu_vecs, sigma_vectors_files=sigma_vecs, vocab_file=temp_vocab_file_path, log_sigmas=False,
26 |          full_sim=True, vocab=vocab)
27 | 
28 | # run additionally lexical substitution evaluation
29 | run_lexsub(input_folder=i_model.output_path, output_path=i_model.output_path)
30 | 


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/README.md:
--------------------------------------------------------------------------------
 1 | # eval-word-vectors
 2 | Manaal Faruqui, mfaruqui@cs.cmu.edu
 3 | 
 4 | Easy-to-use scripts for evaluating word vectors on a variety of tasks.
 5 | These are the scripts that run behind the online tool on ```http://www.wordvectors.org/```.
 6 | I will be adding more evaluation scripts here over the course of time.
 7 | 
 8 | ### Requirements
 9 | 1. Python 2.7 (+numpy package)
10 | 
11 | ### Data you need
12 | 1. Word vector file
13 | 2. Any word similarity evaluation file (if you are not using the provided ones)
14 | 
15 | Each vector file should have one word vector per line as follows (space delimited):-
16 | 
17 | ```the -1.0 2.4 -0.3 ...```
18 | 
19 | ### Evaluating on multiple word sim tasks
20 | 
21 | ```python all_wordsim.py word_vec_file word_sim_file_dir```
22 | 
23 | ```python all_wordsim.py skip-gram-vecs.txt data/word-sim/```
24 | 
25 | ### Evaluating on one word sim task
26 | 
27 | ```python wordsim.py word_vec_file word_sim_file```
28 | 
29 | ```word_sim_file``` should be in the same format as files in ```data/word-sim/```
30 | 
31 | ### Reference
32 | 
33 | Please make sure to cite the papers corresponding to the word similarity dataset that you are using. This
34 | list of citation can be found at ```http://www.wordvectors.org/```.
35 | 
36 | Please cite the following paper if you use this tool:
37 | ```
38 | @InProceedings{faruqui-2014:SystemDemo,
39 |   author    = {Faruqui, Manaal  and  Dyer, Chris},
40 |   title     = {Community Evaluation and Exchange of Word Vectors at wordvectors.org},
41 |   booktitle = {Proceedings of ACL: System Demonstrations},
42 |   year      = {2014},
43 | }
44 | ```
45 | 


--------------------------------------------------------------------------------
/libraries/utils/other.py:
--------------------------------------------------------------------------------
 1 | import operator
 2 | from collections import OrderedDict
 3 | 
 4 | # def create_special_symbols_hash(special_symbols):
 5 | #     new_special_symbols = {}
 6 | #     for ss in special_symbols:
 7 | #         new_special_symbols[ss] = "<"+ss+">"
 8 | #     return new_special_symbols
 9 | 
10 | 
11 | # for float comparison
12 | def is_close(a, b, rel_tol=1e-09, abs_tol=0.0):
13 |     return abs(a-b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
14 | 
15 | 
16 | def is_ascii(s):
17 |     return all(ord(c) < 128 for c in s)
18 | 
19 | 
20 | def sort_hash(hash, by_key=True, reverse=True):
21 |     if by_key:
22 |         indx = 0
23 |     else:
24 |         indx = 1
25 |     return sorted(hash.items(), key=operator.itemgetter(indx), reverse=reverse)
26 | 
27 | 
28 | def merge_two_dicts(x, y):
29 |     z = x.copy()   # start with x's keys and values
30 |     z.update(y)    # modifies z with y's keys and values & returns None
31 |     return z
32 | 
33 | 
34 | def merge_ordered_dicts(*args):
35 |     """
36 |     Assuming that each collection is an Ordered dictionary, merges them into one.
37 | 
38 |     """
39 |     new_params_dict = OrderedDict()
40 |     for params in args:
41 |         assert isinstance(params, OrderedDict)
42 |         for key, value in params.items():
43 |             new_params_dict[key] = value
44 |     return new_params_dict
45 | 
46 | 
47 | def append_to_ordered_dict(initial_dict, param_dict):
48 |     """
49 |     Assuming that collection is an Ordered dictionary, appends parameters to the initial one.
50 | 
51 |     """
52 |     assert isinstance(param_dict, OrderedDict)
53 |     for key, value in param_dict.items():
54 |         initial_dict[key] = value
55 |     return initial_dict


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/lst/coinco_split_dev_test.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | 
 3 | 
 4 | def read_ids(filename):
 5 |     
 6 |     ids = set()
 7 |     with open(filename) as f:
 8 |         for line in f:
 9 |             line_id = line.strip()
10 |             if len(line_id) > 0:
11 |                 ids.add(line_id)
12 |     return ids
13 |         
14 | 
15 | if __name__ == '__main__':
16 |     
17 |     if len(sys.argv)<4:
18 |         print "Usage: %s <coinco-filename> <dev-ids-filename> <test-ids-filename> <eval|gold>" % sys.argv[0]
19 |         sys.exit(1)
20 |         
21 |     coinco_all = open(sys.argv[1],'r')
22 |     coinco_dev = open(sys.argv[1]+'.dev', 'w')
23 |     coinco_test = open(sys.argv[1]+'.test', 'w')
24 |     dev_ids = read_ids(sys.argv[2])
25 |     test_ids = read_ids(sys.argv[3])
26 |     format = sys.argv[4]
27 |     
28 |     
29 |     '''
30 |     eval format:    mission.N       4       1       a mission to end a war
31 |     gold format:    mission.N 4 :: task 2;plan 2;
32 |     '''
33 |     for line in coinco_all:
34 |         if len(line.strip()) > 0:
35 |             if format == 'eval':
36 |                 line_id = line.split('\t')[1]
37 |             elif format == 'gold':
38 |                 line_id = line.split('::')[0].strip().split()[-1]
39 |             else:
40 |                 raise Exception('input format unknown: ' + format)
41 |             if line_id in dev_ids:
42 |                 coinco_dev.write(line)
43 |             elif  line_id in test_ids:
44 |                 coinco_test.write(line)
45 |             else:
46 |                 print "NOTICE: id {} is neither in dev nor in test".format(line_id)
47 |         
48 |     coinco_all.close()
49 |     coinco_dev.close()
50 |     coinco_test.close()
51 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/data/conll_line.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | class ConllLine():
 4 |  
 5 | 
 6 | 
 7 |     def root_init(self):
 8 |         self.id = 0
 9 |         self.form = '*root*'
10 |         self.lemma = '_'
11 |         self.cpostag = '_'
12 |         self.postag = '_'
13 |         self.feats = '_'
14 |         self.head = -1
15 |         self.deptype = 'rroot'
16 |         self.phead = -1 
17 |         self.pdeptype = '_'
18 | 
19 |     def __str__( self ):
20 |         return '\t'.join([str(self.id), self.form, self.lemma, self.cpostag, self.postag, self.feats, str(self.head), self.deptype, str(self.phead), self.pdeptype])
21 |     
22 |     def __init__(self, tokens=None):
23 |         if tokens == None:
24 |             self.root_init()
25 |         else:
26 |             self.id = int(tokens[0])
27 |             self.form = tokens[1]
28 |             self.lemma = tokens[2]
29 |             self.cpostag = tokens[3]
30 |             self.postag = tokens[4]
31 |             self.feats = tokens[5]
32 |             self.head = int(tokens[6])
33 |             self.deptype = tokens[7]
34 |             if len(tokens) > 8:
35 |                 self.phead = -1 if tokens[8] == '_' else int(tokens[8]) 
36 |                 self.pdeptype = tokens[9]
37 |             else:
38 |                 self.phead = -1 
39 |                 self.pdeptype = '_'
40 |         
41 |     tree_line_extractor = re.compile('([a-z]+)\(.+-(\d+), (.+)-(\d+)\)') 
42 |     # stanford parser tree output:  num(Years-3, Five-1) 
43 |     def from_tree_line(self, tree_line):
44 |         self.root_init()
45 |         tok = self.tree_line_extractor.match(tree_line)
46 |         self.id = int(tok.group(4))
47 |         self.form = tok.group(3)
48 |         self.head = int(tok.group(2))
49 |         self.deptype = tok.group(1)
50 |         
51 | 
52 |         
53 | 


--------------------------------------------------------------------------------
/layers/layer.py:
--------------------------------------------------------------------------------
 1 | from collections import OrderedDict
 2 | 
 3 | from libraries.misc.initializers import Initializers
 4 | import theano
 5 | 
 6 | 
 7 | class Parameter:
 8 |     def __init__(self, name, shape, regularizable=False, init_type='uniform'):
 9 |         self.name = name
10 |         self.value = theano.shared(Initializers.init(shape, init_type), name)
11 |         self.regularizable = regularizable
12 | 
13 | 
14 | class Layer:
15 |     def __init__(self, name, init_type='xavier_uniform', regularizable=False):
16 |         """
17 |         A basic layer parent class of all other layers.
18 | 
19 |         """
20 |         assert name is not None
21 |         self.name = name
22 |         self.init_type = init_type
23 |         self.regularizable = regularizable
24 |         self.params = OrderedDict()
25 | 
26 |     def get_params_to_reg(self):
27 |         """
28 |         Helper function that returns parameters that are necessary to regularize.
29 | 
30 |         """
31 |         container = OrderedDict()
32 |         for name in self.params.keys():
33 |             if self.params[name].regularizable:
34 |                 container[name] = self.params[name]
35 |         return container
36 | 
37 |     # TODO: think if I need to pass reg, and init_type like that if they are set as attributes already.
38 |     def add_param(self, name, shape, regularizable=False, init_type='uniform'):
39 |         """
40 |         :param name: parameter's name
41 |         :param shape: shape of the parameter
42 |         :param regularizable: True/False depending on whether you want it be regularized
43 |         :param init_type: what initialization to perform
44 | 
45 |         """
46 |         assert name is not None
47 |         name = "_".join([self.name, name])
48 |         param = Parameter(name, shape=shape, regularizable=regularizable, init_type=init_type)
49 |         self.params[name] = param
50 |         return param.value


--------------------------------------------------------------------------------
/eval/word_pairs_eval.py:
--------------------------------------------------------------------------------
 1 | # a console application to evaluate word pairs
 2 | from support import KL, cosine_sim, read_vectors_to_dict
 3 | import argparse
 4 | 
 5 | 
 6 | def word_pairs_eval(word_pairs_path, mu_vectors_path, sigma_vectors_path):
 7 |     """
 8 |     :param word_pairs_path: file path that contains lines of the form word1 word2 (space separated)
 9 |     :param mu_vectors_path: path to the learned mu vectors
10 |     :type mu_vectors_path: str
11 |     :param sigma_vectors_path: path to the learned sigma vectors
12 |     :type sigma_vectors_path: str
13 | 
14 |     """
15 |     mus_and_sigmas = read_vectors_to_dict(mu_vectors_path, sigma_vectors_path, log_sigmas=False)
16 | 
17 |     with open(word_pairs_path) as f:
18 |         for line in f:
19 |             word1, word2 = line.strip().split()
20 | 
21 |             mu_w1, sigma_w1 = mus_and_sigmas[word1]
22 |             mu_w2, sigma_w2 = mus_and_sigmas[word2]
23 |             kl1 = KL(mu_w1, sigma_w1, mu_w2, sigma_w2)
24 |             kl2 = KL(mu_w2, sigma_w2, mu_w1, sigma_w1)
25 | 
26 |             print "cos_sim(%s, %s) = %f" % (word1, word2, cosine_sim(mu_w1, mu_w2))
27 |             print "kl(%s, %s) = %f" % (word1, word2, kl1)
28 |             print "kl(%s, %s) = %f" % (word2, word1, kl2)
29 | 
30 |             my_str = "%s entails %s"
31 |             if kl1 < kl2:
32 |                 print my_str % (word1, word2)
33 |             else:
34 |                 print my_str % (word2, word1)
35 |             print '---------------------------------------'
36 | 
37 | if __name__ == '__main__':
38 |     parser = argparse.ArgumentParser(description='Word pairs evaluation using BSG learned representations.')
39 |     parser.add_argument('-wpp', '--word_pairs_path', type=str)
40 |     parser.add_argument('-mup', '--mu_vectors_path', type=str)
41 |     parser.add_argument('-sigmap', '--sigma_vectors_path', type=str)
42 |     args = parser.parse_args()
43 |     word_pairs_eval(args.word_pairs_path, args.mu_vectors_path, args.sigma_vectors_path)
44 | 


--------------------------------------------------------------------------------
/libraries/tokenizers/bsg_tokenizer.py:
--------------------------------------------------------------------------------
 1 | from libraries.tools.word_processor import WordProcessor
 2 | from nltk import word_tokenize as default_tokenizer
 3 | try:
 4 |     import re2 as re
 5 | except ImportError:
 6 |     import re
 7 | 
 8 | TOKENS = {"URL_TOKEN": "<URL>", "FLOAT_TOKEN": "<FLOAT>"}
 9 | 
10 | 
11 | class BSGTokenizer:
12 |     """
13 |     A more advanced tokenizer that both tokenizes and cleans textual data. It's specifically tailored for BSG models.
14 | 
15 |     """
16 |     def __init__(self, word_processor_type='none', use_external_tokenizer=True):
17 |         """
18 |         :param use_external_tokenizer: whether to use NLTK tokenizer or rely on the simple splitter(x.split()).
19 |         """
20 |         if use_external_tokenizer:
21 |             self.tokenizer = default_tokenizer
22 |         else:
23 |             self.tokenizer = lambda x: x.split()  # assuming that data was already tokenized
24 |         self.word_processor = WordProcessor(word_processor_type=word_processor_type)
25 | 
26 |     def __call__(self, sentence):
27 |         """
28 |         :param sentence: a string of words
29 |         :return: a list of clean tokens
30 | 
31 |         """
32 |         words = self.tokenizer(sentence)
33 |         tokens = []
34 |         for word in words:
35 |             # check if the word matches some known token
36 |             token = self.__match_to_known_token(word)
37 |             if not token:
38 |                 # clean the word otherwise
39 |                 token = self.word_processor(word)
40 |                 if token == "":
41 |                     continue
42 |             tokens.append(token)
43 | 
44 |         return tokens
45 | 
46 |     @staticmethod
47 |     def __match_to_known_token(word):
48 |         # URL
49 |         if re.match(r"^(https?:\/\/(?:www\.|(?!www))[^\s\.]+\.[^\s]{2,}|www\.[^\s]+\.[^\s]{2,})$", word):
50 |             return TOKENS["URL_TOKEN"]
51 |         # FLOAT
52 |         if re.match(r'^([0-9]+\.)[0-9]+$', word):
53 |             return TOKENS["FLOAT_TOKEN"]


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/data/context_instance.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Instance in the Lexical Substitution Task dataset
 3 | 
 4 | '''
 5 | 
 6 | 
 7 | from pos import from_lst_pos
 8 | 
 9 | CONTEXT_TEXT_BEGIN_INDEX = 3
10 | TARGET_INDEX = 2
11 | 
12 | 
13 | 
14 | 
15 | class ContextInstance(object):
16 |  
17 |     def __init__(self, line, no_pos_flag):
18 |         '''
19 |         Constructor
20 |         '''
21 |         self.line = line
22 |         tokens1 = line.split("\t")
23 |         self.target_ind = int(tokens1[TARGET_INDEX])
24 |         self.words = tokens1[3].split()
25 |         self.target = self.words[self.target_ind]       
26 |         self.full_target_key = tokens1[0]
27 |         self.pos = self.full_target_key.split('.')[-1]
28 |         self.target_key = '.'.join(self.full_target_key.split('.')[:2]) # remove suffix in cases of bar.n.v
29 |         self.target_lemma = self.full_target_key.split('.')[0]       
30 |         self.target_id = tokens1[1]
31 | 
32 |         # I don't see why I need this one?
33 |         # if self.pos in from_lst_pos:
34 |         #     self.pos = from_lst_pos[self.pos]
35 | 
36 |         self.target_pos = '.'.join([self.target, '*']) if no_pos_flag == True else '.'.join([self.target, self.pos])
37 |    
38 |     def get_neighbors(self, window_size):
39 |         tokens = self.line.split()[3:]
40 |         
41 |         if (window_size > 0):                                    
42 |             start_pos = max(self.target_ind-window_size, 0)
43 |             end_pos = min(self.target_ind+window_size+1, len(tokens))
44 |         else:
45 |             start_pos = 0
46 |             end_pos = len(tokens)
47 |             
48 |         neighbors = tokens[start_pos:self.target_ind] + tokens[self.target_ind+1:end_pos]
49 |         return neighbors 
50 |    
51 |     def decorate_context(self):
52 |         tokens = self.line.split('\t')
53 |         words = tokens[CONTEXT_TEXT_BEGIN_INDEX].split()
54 |         words[self.target_ind] = '__'+words[self.target_ind]+'__'
55 |         tokens[CONTEXT_TEXT_BEGIN_INDEX] = ' '.join(words)
56 |         return '\t'.join(tokens)+"\n"     


--------------------------------------------------------------------------------
/layers/custom/bsg_encoder.py:
--------------------------------------------------------------------------------
 1 | from layers.layer import Layer
 2 | from layers.standard.embeddings import Embeddings
 3 | import theano.tensor as T
 4 | from libraries.misc.non_linearity import NonLinearity
 5 | from libraries.utils.other import merge_ordered_dicts
 6 | 
 7 | 
 8 | class BSGEncoder(Layer):
 9 |     """
10 |     Encoder that is specific to the original BSG version. It uses one input representation of words, and performs
11 |     transformation of context and center word representations.
12 | 
13 |     """
14 |     def __init__(self, name, input_dim, output_dim, collection_size, non_linearity='relu'):
15 |         Layer.__init__(self, name=name,init_type='uniform', regularizable=True)
16 |         self.non_linearity = NonLinearity(type=non_linearity)
17 |         self.embeddings = Embeddings(name="emb_encoder", collection_size=collection_size, output_dim=input_dim)
18 |         self.C = self.add_param(name="C", shape=(2*input_dim, output_dim), init_type=self.init_type,
19 |                                 regularizable=self.regularizable)
20 |         # store additional params
21 |         self.params = merge_ordered_dicts(self.embeddings.params, self.params)
22 | 
23 |     def __call__(self, context_words, center_words, mask=None):
24 |         """
25 |         :param context_words: tensor [batch_size, seq_length]
26 |         :param center_words: tensor [batch_size]
27 |         :param mask: tensor [batch_size, seq_length]
28 |         :return: tensor [batch_size, output_dim]
29 | 
30 |         """
31 |         b, full_window_size = context_words.shape
32 | 
33 |         # 0. get representations
34 |         repr_center = T.repeat(self.embeddings(center_words).dimshuffle([0, 'x', 1]), full_window_size, axis=1) \
35 |                       * mask.dimshuffle([0, 1, "x"])
36 |         repr_context = self.embeddings(context_words, mask, perform_dimshuffle=False)
37 | 
38 |         # 1. combine representations
39 |         repr_common = T.concatenate([repr_center, repr_context], axis=2)
40 | 
41 |         # 2. compute hidden layer by summing common representations
42 |         hidden = T.sum(self.non_linearity(T.dot(repr_common, self.C)), axis=1)
43 | 
44 |         return hidden


--------------------------------------------------------------------------------
/interfaces/support.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import sys
 3 | from collections import OrderedDict
 4 | sys.setrecursionlimit(10000)
 5 | 
 6 | 
 7 | def save(obj, file_path):
 8 |     file = open(file_path, 'wb+')
 9 |     pickle.dump(obj=obj, file=file, protocol=pickle.HIGHEST_PROTOCOL)
10 | 
11 | 
12 | def load(file_path):
13 |     file = open(file_path, 'rb+')
14 |     return pickle.load(file)
15 | 
16 | 
17 | def metrics_to_str(metrics, prefix=""):
18 |     return prefix + " " + ", ".join(["%s: %f" % (name, value) for name, value in metrics.items()])
19 | 
20 | 
21 | def infer_attributes_to_log(model):
22 |     """
23 |     Automatically infers parameters/attributes that should be logged. They are either ints/floats or strings.
24 | 
25 |     """
26 |     all_attr = model.__dict__
27 |     attr_to_log = OrderedDict()
28 |     for attr_name, attr_value in all_attr.items():
29 |         if isinstance(attr_value, (int, str, float, list)):
30 |             attr_to_log[attr_name] = attr_value
31 |     return attr_to_log
32 | 
33 | 
34 | def format_experimental_setup(setup):
35 |     """
36 |     A specific for experiments writing function, that formats them property and write to the log file.
37 |     :param params: a hash of params
38 |     """
39 |     st = ""
40 |     st += '---------------------------- \n'
41 |     st += '---- EXPERIMENT\'S SETUP ---- \n'
42 |     for param_name, param_value in setup.iteritems():
43 |         st += param_name + ": " + str(param_value) + '\n'
44 |     st += '--------------------------'
45 |     return st
46 | 
47 | 
48 | def compute_loss(iterator, loss_func):
49 |     """
50 |     Computes the average loss over the whole dataset that is loaded to the iterator.
51 | 
52 |     """
53 |     total_loss = 0.
54 |     batch_size = iterator.batch_size
55 |     datapoints_count = 0.
56 |     # TODO: rethink if it's necessary to do all those mathematical manipulations
57 |     for counter, batch in enumerate(iterator, 1):
58 |         total_loss += loss_func(batch=batch)
59 |         datapoints_count += len(batch)
60 |     # rescale back as the loss was averaged over nr. of datapoints in each batch
61 |     total_loss *= (batch_size / datapoints_count)
62 |     return total_loss


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/ranking.py:
--------------------------------------------------------------------------------
 1 | import math
 2 | import numpy
 3 | from operator import itemgetter
 4 | from numpy.linalg import norm
 5 | 
 6 | EPSILON = 1e-6
 7 | 
 8 | def euclidean(vec1, vec2):
 9 |   diff = vec1 - vec2
10 |   return math.sqrt(diff.dot(diff))
11 | 
12 | def cosine_sim(vec1, vec2):
13 |   vec1 += EPSILON * numpy.ones(len(vec1))
14 |   vec2 += EPSILON * numpy.ones(len(vec1))
15 |   return vec1.dot(vec2)/(norm(vec1)*norm(vec2))
16 | 
17 | def assign_ranks(item_dict):
18 |   ranked_dict = {}
19 |   sorted_list = [(key, val) for (key, val) in sorted(item_dict.items(),
20 |                                                      key=itemgetter(1),
21 |                                                      reverse=True)]
22 |   for i, (key, val) in enumerate(sorted_list):
23 |     same_val_indices = []
24 |     for j, (key2, val2) in enumerate(sorted_list):
25 |       if val2 == val:
26 |         same_val_indices.append(j+1)
27 |     if len(same_val_indices) == 1:
28 |       ranked_dict[key] = i+1
29 |     else:
30 |       ranked_dict[key] = 1.*sum(same_val_indices)/len(same_val_indices)
31 |   return ranked_dict
32 | 
33 | def correlation(dict1, dict2):
34 |   avg1 = 1.*sum([val for key, val in dict1.iteritems()])/len(dict1)
35 |   avg2 = 1.*sum([val for key, val in dict2.iteritems()])/len(dict2)
36 |   numr, den1, den2 = (0., 0., 0.)
37 |   for val1, val2 in zip(dict1.itervalues(), dict2.itervalues()):
38 |     numr += (val1 - avg1) * (val2 - avg2)
39 |     den1 += (val1 - avg1) ** 2
40 |     den2 += (val2 - avg2) ** 2
41 |   return numr / math.sqrt(den1 * den2)
42 | 
43 | def spearmans_rho(ranked_dict1, ranked_dict2):
44 |   assert len(ranked_dict1) == len(ranked_dict2)
45 |   if len(ranked_dict1) == 0 or len(ranked_dict2) == 0:
46 |     return 0.
47 |   x_avg = 1.*sum([val for val in ranked_dict1.values()])/len(ranked_dict1)
48 |   y_avg = 1.*sum([val for val in ranked_dict2.values()])/len(ranked_dict2)
49 |   num, d_x, d_y = (0., 0., 0.)
50 |   for key in ranked_dict1.keys():
51 |     xi = ranked_dict1[key]
52 |     yi = ranked_dict2[key]
53 |     num += (xi-x_avg)*(yi-y_avg)
54 |     d_x += (xi-x_avg)**2
55 |     d_y += (yi-y_avg)**2
56 |   return num/(math.sqrt(d_x*d_y))
57 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/lst/special_word_marker.py:
--------------------------------------------------------------------------------
 1 | 
 2 | import sys
 3 | import re
 4 | 
 5 | RARE_WORD_TOKEN = '<RW>'
 6 | NUMERIC_TOKEN = '<NUM>'
 7 | NAME_TOKEN = '<NAME>'
 8 | MAX_COUNT_FOR_NAME = 10000
 9 | 
10 | # very crude implementation
11 | num_re = re.compile('^[\+\/\:\-,\.\d]*\d[\+\/\:\-,\.\d]*$')
12 | def is_numeric(word_str):
13 |     return num_re.match(word_str) != None
14 | 
15 | def is_name(word, word_lower, vocab, begin_sentence):
16 |     isname = False
17 |     if not begin_sentence:
18 |         if word[:1].isupper():
19 |             if word_lower not in vocab:
20 |                 isname = True
21 |             else:
22 |                 count = vocab[word_lower]
23 |                 if count < MAX_COUNT_FOR_NAME:
24 |                     isname = True
25 |     return isname
26 |     
27 | def load_vocabulary(path):
28 |     vocab = {}
29 |     with open(path, 'r') as f:
30 |         for line in f:
31 |             if len(line) > 0:
32 |                 word = line.split('\t')[0].strip()
33 |                 count = int(line.split('\t')[1])
34 |                 vocab[word] = count                
35 |     return vocab
36 | 
37 |             
38 | def mark_special_words(words, start_ind, vocab):
39 |     for i in xrange(start_ind, len(words)):
40 |         if is_numeric(words[i]):
41 |             words[i] = NUMERIC_TOKEN
42 |         elif is_name(words[i], words[i].lower(), vocab, i==start_ind):
43 |             words[i] = NAME_TOKEN
44 |         else:
45 |             words[i] = words[i].lower()
46 |             
47 |             
48 | 
49 | if __name__ == '__main__':
50 |  
51 |     if (len(sys.argv) < 2):
52 |         print >> sys.stderr, "Usage: %s <vocab-file> <text >output"
53 |         sys.exit(1)
54 |         
55 |     vocab = load_vocabulary(sys.argv[1])
56 |     
57 |     for line in sys.stdin:
58 |         try:     
59 |             segments = line.split('\t')
60 |             words = segments[3].split()
61 |             mark_special_words(words, 0, vocab)
62 |             print '\t'.join(segments[:3]) + '\t' + ' '.join(words)
63 |         except Exception as e:
64 |             print >> sys.stderr, e
65 |             sys.stderr.write("Can't parse line: %s" % line)
66 | 


--------------------------------------------------------------------------------
/interfaces/i_bsg.py:
--------------------------------------------------------------------------------
 1 | from models.bsg import BSG
 2 | from i_base import IBase
 3 | from collections import OrderedDict
 4 | from libraries.batch_iterators.window_batch_iterator import WindowBatchIterator as BatchIterator
 5 | from support import compute_loss
 6 | 
 7 | 
 8 | class IBSG(IBase):
 9 |     """
10 |     Interface class that builds on top of the BSG model. Specifically, it wraps the model's methods to easy user access.
11 | 
12 |     """
13 |     def __init__(self, data_iterator, vocab, half_window_size=5, nr_neg_samples=5, batch_size=5,
14 |                  subsampling_threshold=None, **kwargs):
15 |         # init the parent object
16 |         IBase.__init__(self, vocab=vocab, model_class=BSG, **kwargs)
17 | 
18 |         # general attributes
19 |         self.batch_size = batch_size
20 |         self.half_window_size = half_window_size
21 |         self.nr_neg_samples = nr_neg_samples
22 |         self.subsampling_threshold = subsampling_threshold
23 | 
24 |         self.init_iterator = lambda data_path: BatchIterator(vocab, data_path, data_iterator, half_window_size=half_window_size, nr_neg_samples=nr_neg_samples,
25 |                                                              subsampling_threshold=subsampling_threshold, batch_size=batch_size)
26 | 
27 |     def _measure_performance(self, data_path):
28 |         return {"loss": compute_loss(self.init_iterator(data_path), loss_func=self.loss_func)}
29 | 
30 |     def _train(self, batch):
31 |         mean_margin, mean_kl, avg_log_det = self.model.train(batch.pos_context_words, batch.neg_context_words,
32 |                                                              batch.center_words, batch.mask)
33 |         return OrderedDict((("margin", mean_margin), ("kl", mean_kl), ("log_det", avg_log_det)))
34 | 
35 |     def loss_func(self, batch):
36 |         return self.model.loss(batch.pos_context_words, batch.neg_context_words, batch.center_words, batch.mask)
37 | 
38 |     def _post_training_logic(self):
39 |         # will execute this code after the training workflow is finished, can contain custom functions, e.g. saving
40 |         # of word embeddings
41 |         self.model.save_word_vectors(self.vocab, vectors_folder=self.output_path)
42 |         self.log.write("Word vectors are saved to: %s" % self.output_path)


--------------------------------------------------------------------------------
/libraries/evaluation/entailment/data/bench/baroni2012/data_lex_val.tsv:
--------------------------------------------------------------------------------
 1 | mosque	castle	False
 2 | boar	spokesperson	False
 3 | deanery	dwelling	True
 4 | animal	artillery	False
 5 | information	waitress	False
 6 | contest	vertebrate	False
 7 | inhabitant	resident	False
 8 | animal	panda	False
 9 | bookmark	performer	False
10 | term	word	True
11 | robin	vertebrate	True
12 | yesterday	day	True
13 | pizza	food	True
14 | nanny	adult	True
15 | stallion	animal	True
16 | monastery	building	True
17 | misfortune	catastrophe	False
18 | immunology	science	True
19 | cat	drone	False
20 | bug	animal	True
21 | netball	game	True
22 | building	envy	False
23 | farmhouse	bargain	False
24 | screwdriver	tool	True
25 | trait	competitiveness	False
26 | food	gun	False
27 | ruler	algebra	False
28 | pesticide	beard	False
29 | washer	worker	True
30 | oat	cereal	True
31 | shortcut	feeling	False
32 | checklist	list	True
33 | drummer	performer	True
34 | secret	information	True
35 | objectivity	trait	True
36 | arithmetic	discipline	True
37 | radius	fluid	False
38 | tutor	teacher	True
39 | hamster	science	False
40 | enzyme	chick	False
41 | gall	disease	True
42 | saxophonist	performer	True
43 | pilot	worker	True
44 | cellulose	mineral	False
45 | geometry	discipline	True
46 | folly	trait	True
47 | hardness	consistency	True
48 | sailor	privateer	False
49 | undertaking	slalom	False
50 | kindergarten	institution	True
51 | disease	colt	False
52 | platter	publication	False
53 | champagne	etching	False
54 | charity	institution	True
55 | magnitude	radius	False
56 | holly	tree	True
57 | animal	robin	False
58 | pigeon	vertebrate	True
59 | vertebrate	asp	False
60 | security	fresco	False
61 | building	transaction	False
62 | marker	bookmark	False
63 | dragonfly	animal	True
64 | tench	fish	True
65 | etching	discipline	False
66 | clothing	shirt	False
67 | antibiotic	drug	True
68 | animal	affidavit	False
69 | aneurysm	disorder	True
70 | term	telly	False
71 | trumpeter	performer	True
72 | clothing	karaoke	False
73 | love	feeling	True
74 | doctrine	aesthetic	False
75 | epistle	letter	True
76 | eagle	animal	True
77 | monastery	house	True
78 | bead	jewelry	True
79 | carbohydrate	molecule	True
80 | bridesmaid	woman	True
81 | overpayment	therapist	False
82 | infant	investor	False
83 | hare	fillet	False
84 | melanoma	cancer	True
85 | vertebrate	worker	False
86 | technician	relative	False
87 | radiotherapy	treatment	True
88 | 


--------------------------------------------------------------------------------
/interfaces/interface_configurator.py:
--------------------------------------------------------------------------------
 1 | from libraries.data_iterators.open_text_data_iterator import OpenTextDataIterator
 2 | from libraries.tools.vocabulary import Vocabulary
 3 | from libraries.tokenizers.bsg_tokenizer import BSGTokenizer
 4 | from interfaces.i_bsg import IBSG
 5 | from libraries.misc.optimizations import Adam
 6 | 
 7 | 
 8 | class InterfaceConfigurator:
 9 |     """
10 |     A class for configuring the model's interface. One can alter hyper-params in get_interface.
11 | 
12 |     """
13 |     def __init__(self):
14 |         pass
15 | 
16 |     @staticmethod
17 |     def get_interface(train_data_path, vocab_file_path, output_folder_path=None, params_file_path=None, model_file_path=None):
18 | 
19 |         # Hyper-parameters
20 |         half_window_size = 5  # (one sided)
21 |         input_dim = 100
22 |         h_dim = 100  # the number of components in the first hidden layers
23 |         z_dim = 100  # the number of dimensions of the latent vectors
24 |         alpha = 0.0075  # learning rate
25 |         subsampling_threshold = None
26 |         nr_neg_samples = 10
27 |         margin = 5.0  # margin in the hinge loss
28 |         epochs = 1
29 |         max_vocab_size = 10000
30 |         batch_size = 500
31 | 
32 |         tokenizer = BSGTokenizer(word_processor_type='open_text', use_external_tokenizer=False)
33 |         data_iterator = OpenTextDataIterator(tokenizer=tokenizer)
34 | 
35 |         vocab = Vocabulary(data_iterator, max_size=max_vocab_size, min_count=1)
36 |         vocab.load_or_create(vocab_file_path, train_data_path)
37 |         vocab.assign_distr()
38 | 
39 |         lr_opt = Adam(learning_rate=alpha, beta1=0.9, beta2=0.999)
40 | 
41 |         i_model = IBSG(vocab=vocab, data_iterator=data_iterator, train_data_path=train_data_path, epochs=epochs,
42 |                        half_window_size=half_window_size, nr_neg_samples=nr_neg_samples, subsampling_threshold=subsampling_threshold,
43 |                        batch_size=batch_size, output_dir=output_folder_path)
44 | 
45 |         if model_file_path:
46 |             i_model.load_model(model_file_path)
47 |         else:
48 |             i_model.init_model(vocab_size=len(vocab), input_dim=input_dim, hidden_dim=h_dim, latent_dim=z_dim, lr_opt=lr_opt, margin=margin)
49 | 
50 |         # load params only if the model was not loaded already
51 |         if params_file_path and not model_file_path:
52 |             i_model.load_params(params_file_path)
53 | 
54 |         return i_model
55 | 
56 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/lst/extract_lst_candidates.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Used to extract (or pool) substitute candidates for every target type in the LST dataset
 3 | '''
 4 | import sys
 5 | 
 6 | if __name__ == '__main__':
 7 |     
 8 |     if len(sys.argv)<3:
 9 |         print "Usage: %s <lst-gold-file> <candidates-file> [no-mwe]" % sys.argv[0]
10 |         sys.exit(1)
11 |         
12 |     goldfile = open(sys.argv[1], 'r')
13 |     outfile = open(sys.argv[2], 'w')
14 |     
15 |     ignore_mwe = False
16 |     if (len(sys.argv) > 3):
17 |         sys.stderr.write("ignoring multi-word-expressions\n");
18 |         ignore_mwe = True        
19 |     
20 |     good_oneword_inst = 0
21 |     target2candidates = {}
22 |     # bright.a 5 :: intelligent 3;clever 2;most able 1;capable 1;promising 1;sharp 1;motivated 1;
23 |     for line in goldfile:
24 |         if len(line)>0:
25 |             oneword_in_line = 0 # e.g. ;most able 1;
26 |             segments = line.split("::")
27 |             if len(segments)>=2:
28 |                 target = segments[0][:segments[0].strip().rfind(' ')]
29 |                 target = '.'.join(target.split('.')[:2]) # remove suffix in cases of bar.n.v
30 |                 line_candidates = segments[1].strip().split(';')
31 |                 for candidate_count in line_candidates:
32 |                     if len(candidate_count) > 0:
33 |                         delimiter_ind = candidate_count.rfind(' ')
34 |                         candidate = candidate_count[:delimiter_ind]
35 |                         if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)):
36 |                             continue
37 |                         oneword_in_line += 1                       
38 |                         if target in target2candidates:
39 |                             candidates = target2candidates[target]
40 |                         else:
41 |                             candidates = set()
42 |                             target2candidates[target] = candidates
43 |                         candidates.add(candidate)
44 |             if (oneword_in_line >= 1):
45 |                 good_oneword_inst += 1
46 |     
47 |     if ignore_mwe:
48 |         sys.stderr.write("good_oneword_inst: " + str(good_oneword_inst) + "\n")        
49 |     for target, candidates in target2candidates.iteritems():
50 |         outfile.write(target + '::' + ';'.join(list(candidates)) + '\n')
51 |     
52 |     goldfile.close()
53 |     outfile.close()
54 |         


--------------------------------------------------------------------------------
/libraries/evaluation/GloVe/distance.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import numpy as np
 3 | import sys
 4 | 
 5 | def generate():
 6 |     parser = argparse.ArgumentParser()
 7 |     parser.add_argument('--vocab_file', default='vocab.txt', type=str)
 8 |     parser.add_argument('--vectors_file', default='vectors.txt', type=str)
 9 |     args = parser.parse_args()
10 | 
11 |     with open(args.vocab_file, 'r') as f:
12 |         words = [x.rstrip().split(' ')[0] for x in f.readlines()]
13 |     with open(args.vectors_file, 'r') as f:
14 |         vectors = {}
15 |         for line in f:
16 |             vals = line.rstrip().split(' ')
17 |             vectors[vals[0]] = [float(x) for x in vals[1:]]
18 | 
19 |     vocab_size = len(words)
20 |     vocab = {w: idx for idx, w in enumerate(words)}
21 |     ivocab = {idx: w for idx, w in enumerate(words)}
22 | 
23 |     vector_dim = len(vectors[ivocab[0]])
24 |     W = np.zeros((vocab_size, vector_dim))
25 |     for word, v in vectors.items():
26 |         if word == '<unk>':
27 |             continue
28 |         W[vocab[word], :] = v
29 | 
30 |     # normalize each word vector to unit variance
31 |     W_norm = np.zeros(W.shape)
32 |     d = (np.sum(W ** 2, 1) ** (0.5))
33 |     W_norm = (W.T / d).T
34 |     return (W_norm, vocab, ivocab)
35 | 
36 | 
37 | def distance(W, vocab, ivocab, input_term):
38 |     for idx, term in enumerate(input_term.split(' ')):
39 |         if term in vocab:
40 |             print('Word: %s  Position in vocabulary: %i' % (term, vocab[term]))
41 |             if idx == 0:
42 |                 vec_result = W[vocab[term], :] 
43 |             else:
44 |                 vec_result += W[vocab[term], :] 
45 |         else:
46 |             print('Word: %s  Out of dictionary!\n' % term)
47 |             return
48 |     
49 |     vec_norm = np.zeros(vec_result.shape)
50 |     d = (np.sum(vec_result ** 2,) ** (0.5))
51 |     vec_norm = (vec_result.T / d).T
52 | 
53 |     dist = np.dot(W, vec_norm.T)
54 | 
55 |     for term in input_term.split(' '):
56 |         index = vocab[term]
57 |         dist[index] = -np.Inf
58 | 
59 |     a = np.argsort(-dist)[:N]
60 | 
61 |     print("\n                               Word       Cosine distance\n")
62 |     print("---------------------------------------------------------\n")
63 |     for x in a:
64 |         print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
65 | 
66 | 
67 | if __name__ == "__main__":
68 |     N = 100;          # number of closest words that will be shown
69 |     W, vocab, ivocab = generate()
70 |     while True:
71 |         input_term = raw_input("\nEnter word or sentence (EXIT to break): ")
72 |         if input_term == 'EXIT':
73 |             break
74 |         else:
75 |             distance(W, vocab, ivocab, input_term)
76 | 
77 | 


--------------------------------------------------------------------------------
/libraries/misc/initializers.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | np.random.seed(42)
 3 | 
 4 | class Initializers():
 5 |     def __init__(self):
 6 |         pass
 7 | 
 8 |     @staticmethod
 9 |     def init(size, init_type="uniform"):
10 |         """
11 |         :param init_type: uniform
12 |         :return: initialized numpy matrix with float32:
13 |         """
14 |         assert init_type in ['uniform', 'xavier_normal', 'xavier_uniform', 'zeros', 'bsg_log_sigmas']
15 |         fans = compute_fans(size)
16 |         if init_type == 'zeros':
17 |             return np.zeros(shape=size, dtype="float32")
18 |         if init_type == 'uniform':
19 |             return np.float32(np.random.uniform(low=-0.05, high=0.05, size=size))
20 |         if init_type == 'xavier_normal':
21 |             return np.float32(np.random.normal(0.0, 2./np.sum(fans), size=size))
22 |         if init_type == 'xavier_uniform':
23 |             lim = np.sqrt(6.0 / np.sum(fans))
24 |             return np.float32(np.random.uniform(low=-lim, high=lim, size=size))
25 |         if init_type == 'bsg_log_sigmas':
26 |             return np.float32(np.random.uniform(low=-3.5, high=-1.5, size=size))
27 | 
28 | 
29 | def compute_fans(shape, data_format='channels_last'):
30 |     """Computes the number of input and output units for a weight shape.
31 |     # Arguments
32 |         shape: Integer shape tuple.
33 |         data_format: Image data format to use for convolution kernels.
34 |             Note that all kernels in Keras are standardized on the
35 |             `channels_last` ordering (even when inputs are set
36 |             to `channels_first`).
37 |     # Returns
38 |         A tuple of scalars, `(fan_in, fan_out)`.
39 |     # Raises
40 |         ValueError: in case of invalid `data_format` argument.
41 |     """
42 |     if len(shape) == 2:
43 |         fan_in = shape[0]
44 |         fan_out = shape[1]
45 |     elif len(shape) in {3, 4, 5}:
46 |         # Assuming convolution kernels (1D, 2D or 3D).
47 |         # TH kernel shape: (depth, input_depth, ...)
48 |         # TF kernel shape: (..., input_depth, depth)
49 |         if data_format == 'channels_first':
50 |             receptive_field_size = np.prod(shape[2:])
51 |             fan_in = shape[1] * receptive_field_size
52 |             fan_out = shape[0] * receptive_field_size
53 |         elif data_format == 'channels_last':
54 |             receptive_field_size = np.prod(shape[:2])
55 |             fan_in = shape[-2] * receptive_field_size
56 |             fan_out = shape[-1] * receptive_field_size
57 |         else:
58 |             raise ValueError('Invalid data_format: ' + data_format)
59 |     else:
60 |         # No specific assumptions.
61 |         fan_in = np.sqrt(np.prod(shape))
62 |         fan_out = np.sqrt(np.prod(shape))
63 |     return fan_in, fan_out
64 | 
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/libraries/utils/paths_and_files.py:
--------------------------------------------------------------------------------
 1 | # contains utility functions that are related to retrieving paths, file names, creating folders, etc
 2 | import os
 3 | import glob
 4 | import errno
 5 | 
 6 | 
 7 | def get_immediate_subdirectories(a_dir):
 8 |     return [name for name in os.listdir(a_dir)
 9 |             if os.path.isdir(os.path.join(a_dir, name))]
10 | 
11 | 
12 | def get_file_paths(path, return_file_names=False):
13 |     """
14 |     :param path:
15 |     :return: :rtype: a list of filepaths that are in the folder
16 |     """
17 |     if os.path.isdir(path):
18 |         paths = glob.glob(path + "/*")
19 |     else:
20 |         paths = [path]  # that means there is only one file
21 | 
22 |     if return_file_names:
23 |         paths = [(p, p.split('/')[-1]) for p in paths]
24 |     return paths
25 | 
26 | 
27 | 
28 | def get_subdir_number(path):
29 |     """
30 |     Checks the number of subdirectories, and returns it. Useful for automatic output folders generation
31 |     """
32 |     if not os.path.exists(path):
33 |         return 0
34 |     subdirectories = get_immediate_subdirectories(path)
35 |     return len(subdirectories)
36 | 
37 | 
38 | def create_folders_if_not_exist(filename):
39 |     if os.path.dirname(filename) and not os.path.exists(os.path.dirname(filename)):
40 |         try:
41 |             os.makedirs(os.path.dirname(filename))
42 |         except OSError as exc: # Guard against race condition
43 |             if exc.errno != errno.EEXIST:
44 |                 raise
45 | 
46 | 
47 | def append_to_file(file_path, str):
48 |     create_folders_if_not_exist(file_path)
49 |     with open(file_path, "a") as f:
50 |         f.write(str+" \n")
51 | 
52 | 
53 | def files_len(path):
54 |     file_paths = get_file_paths(path)
55 |     total = 0
56 |     for file_path in file_paths:
57 |       total += files_len(file_path)
58 |     return total
59 | 
60 | 
61 | def file_len(file_path):
62 |     with open(file_path) as f:
63 |             for i, l in enumerate(f):
64 |                 pass
65 |     return i + 1
66 | 
67 | 
68 | def count_number_of_tokens(folder_path):
69 |     """
70 |     Counts the number of tokens in all files in the folder
71 |     """
72 |     nr_tokens = 0
73 |     filenames = glob.glob(folder_path + "/*")
74 |     for fname in filenames:
75 |        with open(fname) as f:
76 |             print fname
77 |             for sentence in f:
78 |                 words = sentence.lower().split()
79 |                 nr_tokens += len(words)
80 |     return nr_tokens
81 | 
82 | 
83 | def merge_text_files(input_folder, output_file_path):
84 |     with open(output_file_path, 'w') as output_file:
85 |         for file_path in get_file_paths(input_folder):
86 |             with open(file_path) as input_file:
87 |                 for line in input_file:
88 |                     output_file.write(line)


--------------------------------------------------------------------------------
/libraries/evaluation/support.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | from libraries.evaluation.entailment.entailment import test_entailment, test_directional_entailment
 3 | from libraries.evaluation.word_sim.all_wordsim import all_word_sim
 4 | from libraries.evaluation.word_sim.wordsim import word_sim
 5 | from libraries.evaluation.GloVe.evaluate import glove_evaluate
 6 | 
 7 | 
 8 | # evaluates vectors on Glove's benchmark and offline wordvectors.org benchmark
 9 | # vectors_path : filename or a folder with vectors
10 | # vocab: vocab object, should be passed as there is currently a circular dependency TODO: fix!
11 | def evaluate(mu_vectors_files, sigma_vectors_files=None, vocab_file=None, vocab=None,
12 |              max_count=None, full_sim=False, log_sigmas=False):
13 | 
14 |     # all similarity tests are performed on mu vectors
15 |     for mu_vectors_file in mu_vectors_files:
16 |         # https://github.com/mfaruqui/eval-word-vectors
17 |         # 1. similarity tests
18 |         if full_sim:
19 |             sim_folder = os.path.dirname(os.path.realpath(__file__))+"/word_sim/data/word-sim"
20 |             # sim_folder = os.path.join(os.getcwd(), "../evaluation/word_sim/data/word-sim")
21 |             all_word_sim(mu_vectors_file, sim_folder)
22 |         else:
23 |             all_sim_file = os.path.dirname(os.path.realpath(__file__))+"/word_sim/data/combined-word-sim/TEST.txt"
24 |             # all_sim_file = os.path.join(os.getcwd(), "/../evaluation/word_sim/data/combined-word-sim/TEST.txt")
25 |             word_sim(mu_vectors_file, all_sim_file)
26 |         # https://github.com/stanfordnlp/GloVe
27 | 
28 |         # 2. analogical reasoning
29 |         if vocab_file is not None:
30 |             glove_evaluate(vocab_file, mu_vectors_file, max_count=max_count)
31 |     if not sigma_vectors_files:
32 |         return
33 | 
34 |     for mu_vectors_file, sigma_vectors_file in zip(mu_vectors_files, sigma_vectors_files):
35 |         # print "mu_vectors_file: %s" % mu_vectors_file
36 |         # print "sigma_vectors_file: %s"% sigma_vectors_file
37 | 
38 |         # 3. KL entailment
39 |         for sf in ["kl", "cos", "l2"]:
40 |             test_entailment(mu_vectors_path=mu_vectors_file, sigma_vectors_path=sigma_vectors_file, log_sigmas=log_sigmas,
41 |                             score_func=sf, normalize=False)
42 | 
43 |         # 4. directional entailment on Baroni
44 |         test_directional_entailment(mu_vectors_path=mu_vectors_file, sigma_vectors_path=sigma_vectors_file,
45 |                                     test_path='/data/bench/baroni2012_dir/data.tsv', header=True, vocab=vocab,
46 |                                     log_sigmas=log_sigmas)
47 |         # 5. directional entailment on Bless
48 |         test_directional_entailment(mu_vectors_path=mu_vectors_file, sigma_vectors_path=sigma_vectors_file,
49 |                                     test_path='/data/bench/bless2011_dir/data.tsv', header=True, vocab=vocab,
50 |                                     log_sigmas=log_sigmas)


--------------------------------------------------------------------------------
/libraries/evaluation/GloVe/word_analogy.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | import numpy as np
 3 | import sys
 4 | 
 5 | def generate():
 6 |     parser = argparse.ArgumentParser()
 7 |     parser.add_argument('--vocab_file', default='vocab.txt', type=str)
 8 |     parser.add_argument('--vectors_file', default='vectors.txt', type=str)
 9 |     args = parser.parse_args()
10 | 
11 |     with open(args.vocab_file, 'r') as f:
12 |         words = [x.rstrip().split(' ')[0] for x in f.readlines()]
13 |     with open(args.vectors_file, 'r') as f:
14 |         vectors = {}
15 |         for line in f:
16 |             vals = line.rstrip().split(' ')
17 |             vectors[vals[0]] = [float(x) for x in vals[1:]]
18 | 
19 |     vocab_size = len(words)
20 |     vocab = {w: idx for idx, w in enumerate(words)}
21 |     ivocab = {idx: w for idx, w in enumerate(words)}
22 | 
23 |     vector_dim = len(vectors[ivocab[0]])
24 |     W = np.zeros((vocab_size, vector_dim))
25 |     for word, v in vectors.items():
26 |         if word == '<unk>':
27 |             continue
28 |         W[vocab[word], :] = v
29 | 
30 |     # normalize each word vector to unit variance
31 |     W_norm = np.zeros(W.shape)
32 |     d = (np.sum(W ** 2, 1) ** (0.5))
33 |     W_norm = (W.T / d).T
34 |     return (W_norm, vocab, ivocab)
35 | 
36 | 
37 | def distance(W, vocab, ivocab, input_term):
38 |     vecs = {}
39 |     if len(input_term.split(' ')) < 3:
40 |         print("Only %i words were entered.. three words are needed at the input to perform the calculation\n" % len(input_term.split(' ')))
41 |         return 
42 |     else:
43 |         for idx, term in enumerate(input_term.split(' ')):
44 |             if term in vocab:
45 |                 print('Word: %s  Position in vocabulary: %i' % (term, vocab[term]))
46 |                 vecs[idx] = W[vocab[term], :] 
47 |             else:
48 |                 print('Word: %s  Out of dictionary!\n' % term)
49 |                 return
50 | 
51 |         vec_result = vecs[1] - vecs[0] + vecs[2]
52 |         
53 |         vec_norm = np.zeros(vec_result.shape)
54 |         d = (np.sum(vec_result ** 2,) ** (0.5))
55 |         vec_norm = (vec_result.T / d).T
56 | 
57 |         dist = np.dot(W, vec_norm.T)
58 | 
59 |         for term in input_term.split(' '):
60 |             index = vocab[term]
61 |             dist[index] = -np.Inf
62 | 
63 |         a = np.argsort(-dist)[:N]
64 | 
65 |         print("\n                               Word       Cosine distance\n")
66 |         print("---------------------------------------------------------\n")
67 |         for x in a:
68 |             print("%35s\t\t%f\n" % (ivocab[x], dist[x]))
69 | 
70 | 
71 | if __name__ == "__main__":
72 |     N = 100;          # number of closest words that will be shown
73 |     W, vocab, ivocab = generate()
74 |     while True:
75 |         input_term = raw_input("\nEnter three words (EXIT to break): ")
76 |         if input_term == 'EXIT':
77 |             break
78 |         else:
79 |             distance(W, vocab, ivocab, input_term)
80 | 
81 | 


--------------------------------------------------------------------------------
/eval/support.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | 
 3 | 
 4 | def KL(mu_q, sigma_q, mu_p, sigma_p, debug=False):
 5 |     """
 6 |     Kullback Leibler divergence implementation in numpy. Assumes [batch_size x z_dimension ] or [latent_dim, ] inputs.
 7 | 
 8 |     """
 9 |     # adjusting dimensions
10 |     flag = False
11 |     if len(mu_q.shape) == 1 and len(sigma_q.shape) == 1 and len(mu_p.shape) == 1 and len(sigma_p.shape) == 1:
12 |         mu_q = mu_q.reshape((1, -1))
13 |         sigma_q = sigma_q.reshape((1, -1))
14 |         mu_p = mu_p.reshape((1, -1))
15 |         sigma_p = sigma_p.reshape((1, -1))
16 |         flag = True
17 | 
18 |     kl = KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=debug)
19 |     if flag:
20 |         kl = kl[0]
21 |     return kl
22 | 
23 | 
24 | def KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=False, eps=1e-8):
25 |     k = mu_q.shape[1]
26 |     sigma_p_inv = 1./(sigma_p + eps)
27 |     trace = k * sigma_p_inv * sigma_q
28 |     quadr = sigma_p_inv*np.sum(((mu_p - mu_q)**2), axis=1)
29 |     log_det_p = np.log(sigma_p + 1e-10)
30 |     log_det_q = np.log(sigma_q + 1e-10)
31 |     log_det = k*(log_det_p - log_det_q)
32 |     res = 0.5 * (trace + quadr - k + log_det)
33 | 
34 |     if debug:
35 |         print "trace : %s" % str(trace)
36 |         print "quadr : %s" % str(quadr)
37 |         print 'log_det_p : %s' % str(log_det_p)
38 |         print 'log_det_q : %s' % str(log_det_q)
39 |         print "log_det : %s" % str(log_det)
40 |         print 'res : %s'% str(res)
41 |     return res.reshape((-1, ))
42 | 
43 | 
44 | def cosine_sim(x, y):
45 |     return float(np.sum(x*y))/float(np.sqrt(np.sum(x**2)*np.sum(y**2)))
46 | 
47 | 
48 | def read_vectors_to_dict(mus_file_path, sigmas_file_path, log_sigmas=False, vocab=None, header=False):
49 |     dict = {}
50 |     with open(mus_file_path) as f:
51 |             for i, sentence in enumerate(f):
52 |                 if header and i==0:
53 |                     continue
54 | 
55 |                 parts = sentence.strip().split(" ")
56 |                 word = parts[0]
57 | 
58 |                 # filter words that are not in vocab
59 |                 if vocab is not None and word not in vocab.word_to_index:
60 |                     continue
61 | 
62 |                 mu = np.array(parts[1:], dtype="float32")
63 |                 # normalize it
64 |                 # mu = mu / (np.sum(mu**2)**0.5)
65 |                 dict[word] = [mu]
66 |                 # print len(dict)
67 |     with open(sigmas_file_path) as f:
68 |             for i, sentence in enumerate(f):
69 |                 if header and i==0:
70 |                     continue
71 | 
72 |                 parts = sentence.strip().split(" ")
73 |                 word = parts[0]
74 | 
75 |                 # filter words that are not in vocab
76 |                 if vocab is not None and word not in vocab.word_to_index:
77 |                     continue
78 | 
79 |                 sigma = np.array(parts[1:], dtype="float32")
80 |                 if log_sigmas:
81 |                     sigma = np.exp(sigma)
82 |                 dict[word].append(sigma)
83 |     return dict


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/run_lexsub.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | sys.path.append(os.path.join(os.getcwd(), "../../"))
 3 | from libraries.tools.vocabulary import Vocabulary
 4 | from main.simulators_interfaces.bsg_simulator_interface import BsgSimulatorInterface
 5 | from main.skipgram_embeddings import Skipgram_Embeddings
 6 | from main.support import read_vectors
 7 | from main.lex_sub import lex_sub
 8 | 
 9 | 
10 | def run_lexsub(input_folder, output_path, half_window_size=5, input_type="normal", embeddings_type="bsg", sg_files_prefix="",
11 |                    arithm_type="add"):
12 |     assert embeddings_type in ['bsg', 'sg']
13 |     assert input_type in ["normal", "dependency"]
14 |     assert arithm_type in ["add", "mult"]
15 |     # data paths
16 |     candidates_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst.gold.candidates"
17 |     test_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst_all.preprocessed"
18 |     conll_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst_all.conll"
19 |     gold_file = os.path.dirname(os.path.realpath(__file__)) + "/datasets/lst_all.gold"
20 | 
21 |     vocab_file_path = os.path.join(input_folder, 'vocab.txt')
22 |     model_file_path = os.path.join(input_folder, 'model.pkl')
23 |     vocab = Vocabulary()
24 |     vocab.load(vocab_file_path=vocab_file_path)
25 |     if embeddings_type == "bsg":
26 |         embeddings = BsgSimulatorInterface(model_file_path=model_file_path, vocab=vocab)
27 |     else:
28 |         # default skipgram
29 |         # two cases : either we have mu.vectors or prefix_input.vectors and prefix_output.vectors
30 |         input_file_name = "_".join([sg_files_prefix, 'input'])+'.vectors' if sg_files_prefix != "" else "input.vectors"
31 |         output_file_name = "_".join([sg_files_prefix, 'output'])+'.vectors' if sg_files_prefix != "" else "output.vectors"
32 |         input_vectors_file_path = os.path.join(input_folder, input_file_name)
33 |         output_vectors_file_path = os.path.join(input_folder, output_file_name)
34 |         if not os.path.exists(input_vectors_file_path) or not os.path.exists(output_vectors_file_path):
35 |             input_vectors_file_path = os.path.join(input_folder,  'mu.vectors')
36 |             output_vectors_file_path = input_vectors_file_path
37 |         embeddings = Skipgram_Embeddings(target_word_embeddings=read_vectors(input_vectors_file_path),
38 |                                          context_embeddings=read_vectors(output_vectors_file_path))
39 | 
40 |     # extract word_to_index vectors from vocab
41 |     word_to_index = {obj.token:obj.id for obj in vocab}
42 |     target_words_vocab = word_to_index
43 |     context_words_vocab = word_to_index
44 | 
45 |     print ' running  lexical substitution evaluation'
46 |     # run lexical substitution
47 |     lex_sub(embeddings=embeddings, input_type=input_type, target_words_vocab=target_words_vocab,
48 |             context_words_vocab=context_words_vocab, candidates_file=candidates_file, conll_file=conll_file, test_file=test_file,
49 |             gold_file=gold_file, half_window_size=half_window_size, output_path=output_path, arithm_type=arithm_type)
50 | 
51 | 
52 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/skipgram_embeddings.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from support import cosine_sim, pos_cosine_sim_normed
 3 | 
 4 | class Skipgram_Embeddings():
 5 | 
 6 |     def __init__(self, target_word_embeddings, context_embeddings):
 7 |         self.w_emb = target_word_embeddings
 8 |         self.c_emb = context_embeddings
 9 | 
10 |     # performs the scoring of candidates embeddings given
11 |     # returns a dictionary of scores
12 |     def score(self, target, context, candidates, ar_type="add", i=-1):
13 |         assert ar_type in ["add", "mult"]
14 |         scores = {}
15 |         if ar_type == "add":
16 |             cont_repr = self.__repr_context_add(target, context, avg_flag=True)
17 |         for cand in candidates:
18 |             if ar_type == "add":
19 |                 scores[cand] = self.__add(cont_repr, cand=self.w_emb[cand])
20 |             elif ar_type == "mult":
21 |                 scores[cand] = self.__mult(target, context, cand, geo_mean_flag=True)
22 |         return scores
23 | 
24 |     # Oren's strange add operation
25 |     def __add(self, repr, cand):
26 |         return np.dot(repr, cand)
27 | 
28 |     def __repr_context_add(self, target, deps, avg_flag=True):
29 |         target_vec = None if target is None else np.copy(self.w_emb[target])
30 |         dep_vec = None
31 |         deps_found = 0
32 |         for dep in deps:
33 |             if dep in self.c_emb:
34 |                 deps_found += 1
35 |                 if dep_vec is None:
36 |                     dep_vec = np.copy(self.c_emb[dep])
37 |                 else:
38 |                     dep_vec += self.c_emb[dep]
39 | 
40 |         ret_vec = None
41 |         if target_vec is not None:
42 |             ret_vec = target_vec
43 |         if dep_vec is not None:
44 |             if avg_flag:
45 |                 dep_vec /= deps_found
46 |             if ret_vec is None:
47 |                 ret_vec = dep_vec
48 |             else:
49 |                 ret_vec += dep_vec
50 | 
51 |         norm = (ret_vec.dot(ret_vec.transpose()))**0.5
52 |         ret_vec /= norm
53 | 
54 |         return ret_vec
55 | 
56 |     def __mult(self, target, deps, subsitute, geo_mean_flag=True):
57 |         target_vec = self.w_emb[target]
58 |         subs_vec = self.w_emb[subsitute]
59 |         score = pos_cosine_sim_normed(target_vec, subs_vec)
60 |         for dep in deps:
61 |             if dep in self.c_emb:
62 |                 dep_vec = self.c_emb[dep]
63 |                 mult_scores = pos_cosine_sim_normed(dep_vec, subs_vec)
64 |                 if geo_mean_flag:
65 |                     mult_scores = mult_scores**(1.0/len(deps)) # TODO: think if you need to fix it because len(deps) +1 should be here
66 |                 score = np.multiply(score, mult_scores)
67 |         return score
68 | 
69 | 
70 | 
71 |     # def __add(self, center_word, context, substitute):
72 |     #     seen = 0
73 |     #     scr = 0
74 |     #     if center_word in self.w_emb:
75 |     #         scr += cosine_sim(self.w_emb[center_word], self.w_emb[substitute])
76 |     #         seen += 1
77 |     #     for c in context:
78 |     #         if c in self.c_emb:
79 |     #             scr += cosine_sim(self.c_emb[c], self.w_emb[substitute])
80 |     #             seen+=1
81 |     #     return scr/(seen)
82 | 


--------------------------------------------------------------------------------
/models/bword2vec.py:
--------------------------------------------------------------------------------
 1 | import pickle
 2 | import os
 3 | import theano
 4 | from support import load, write_vectors, kl_spher
 5 | from pickle import UnpicklingError
 6 | from libraries.tools.ordered_attrs import OrderedAttrs
 7 | 
 8 | ## theano configuration
 9 | theano.optimizer_including = 'cudnn'
10 | 
11 | 
12 | class BWord2Vec(OrderedAttrs):
13 |     """
14 |     Base class for the Bayesian Skip-gram model, it contains methods that can be used for multiple variants of BSG.
15 | 
16 |     """
17 |     def __init__(self):
18 |         OrderedAttrs.__init__(self)
19 |         # the following attributes will be initialized in a child object
20 |         self.params = None
21 |         self.params_full = None
22 |         self.repr_types = None
23 | 
24 |     @staticmethod
25 |     def kl(mu_q, sigma_q, mu_p, sigma_p):
26 |         """
27 |         The generic Kullback Leibler function that passes arguments to the correct function
28 | 
29 |         """
30 |         return kl_spher(mu_q, sigma_q, mu_p, sigma_p)
31 | 
32 |     def save_word_vectors(self, index_to_word, vectors_folder):
33 |         """
34 |         Extracts word vectors from different parameters and saves them to a desired vectors_folder destination
35 |         :param index_to_word:  an array of words from vocab object
36 |         :param vectors_folder: a desired destination path where word vectors should be saved
37 | 
38 |         """
39 |         for name, func in self.repr_types.items():
40 |             write_vectors(index_to_word, os.path.join(vectors_folder, name+".vectors"), func)
41 | 
42 |     def save_params(self, output_dir, output_file_name='params.pkl'):
43 |         """
44 |         Saves parameters via pickle to the output_dir under the specified name.
45 | 
46 |         """
47 |         f = open(os.path.join(output_dir, output_file_name), 'wb')
48 |         for param_name, param in self.params_full.items():
49 |             # get_value() is necessary because param will be a tensor
50 |             pickle.dump([param_name, param['values'].get_value()], f)
51 |         f.close()
52 | 
53 |     def load_params(self, file_path, exclude_params=[]):
54 |         """
55 |         Loads params from a pickle saved file. The format has to correspond to the one that is used in save_params()
56 | 
57 |         """
58 |         f = open(file_path, 'rb')
59 |         initialized_params = []
60 |         while True:
61 |             try:
62 |                 param_name, param = pickle.load(f)
63 |                 if param_name in exclude_params:
64 |                     continue
65 |                 self.initialize_param(param_name, param)
66 |                 initialized_params.append(param_name)
67 |             except (EOFError, UnpicklingError):
68 |                 break
69 |         f.close()
70 |         return initialized_params
71 | 
72 |     def initialize_param(self, param_name, param_value):
73 |         """
74 |         Initializes a parameter with the provided values
75 |         :param param_value: a matrix(array) of parameters
76 | 
77 |         """
78 |         current_params = self.params_full
79 |         if param_name not in current_params:
80 |             raise ValueError("Could not find the parameter by '%s' name" % param_name)
81 |         current_params[param_name]['values'].set_value(param_value)


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/data/word-sim/EN-YP-130.txt:
--------------------------------------------------------------------------------
  1 | brag boast 4.000
  2 | concoct devise 4.000
  3 | divide split 4.000
  4 | build construct 4.000
  5 | end terminate 4.000
  6 | accentuate highlight 4.000
  7 | demonstrate show 3.833
  8 | solve figure 3.833
  9 | consume eat 3.833
 10 | position situate 3.833
 11 | swear vow 3.833
 12 | furnish supply 3.833
 13 | merit deserve 3.667
 14 | submit yield 3.667
 15 | seize take 3.667
 16 | spin twirl 3.500
 17 | enlarge swell 3.500
 18 | swing sway 3.500
 19 | circulate distribute 3.500
 20 | recognize acknowledge 3.333
 21 | resolve settle 3.333
 22 | prolong sustain 3.333
 23 | tap knock 3.333
 24 | block hinder 3.167
 25 | arrange plan 3.167
 26 | twist curl 3.167
 27 | hail acclaim 3.000
 28 | dissipate disperse 3.000
 29 | approve support 3.000
 30 | impose levy 3.000
 31 | hasten accelerate 2.833
 32 | rap tap 2.833
 33 | lean rest 2.833
 34 | make earn 2.833
 35 | show publish 2.833
 36 | sell market 2.833
 37 | weave intertwine 2.667
 38 | refer direct 2.667
 39 | distribute commercialize 2.500
 40 | twist intertwine 2.500
 41 | drain tap 2.500
 42 | depict recognize 2.500
 43 | build organize 2.500
 44 | hail address 2.333
 45 | call refer 2.167
 46 | swing bounce 2.167
 47 | yield seize 2.000
 48 | split crush 2.000
 49 | challenge yield 2.000
 50 | hinder assist 2.000
 51 | welcome recognize 2.000
 52 | need deserve 1.833
 53 | refer explain 1.833
 54 | finance build 1.667
 55 | expect deserve 1.667
 56 | terminate postpone 1.667
 57 | yell boast 1.667
 58 | swell curl 1.667
 59 | rotate situate 1.500
 60 | seize request 1.500
 61 | approve scorn 1.500
 62 | supply consume 1.500
 63 | clip twist 1.500
 64 | divide figure 1.333
 65 | advise furnish 1.333
 66 | complain boast 1.333
 67 | want deserve 1.333
 68 | twist fasten 1.333
 69 | swing crash 1.167
 70 | make trade 1.167
 71 | hinder yield 1.167
 72 | build propose 1.167
 73 | express figure 1.167
 74 | resolve examine 1.167
 75 | bruise split 1.167
 76 | swing break 1.167
 77 | catch consume 1.000
 78 | swear explain 1.000
 79 | request levy 1.000
 80 | arrange study 1.000
 81 | relieve hinder 1.000
 82 | move swell 1.000
 83 | weave print 0.833
 84 | swear think 0.833
 85 | forget resolve 0.833
 86 | supervise concoct 0.833
 87 | situate isolate 0.667
 88 | explain boast 0.667
 89 | ache spin 0.667
 90 | evaluate terminate 0.667
 91 | recognize succeed 0.667
 92 | dilute market 0.667
 93 | hasten permit 0.667
 94 | scorn yield 0.667
 95 | swear describe 0.667
 96 | arrange explain 0.667
 97 | discard arrange 0.667
 98 | list figure 0.667
 99 | stamp weave 0.500
100 | market sweeten 0.500
101 | boil tap 0.500
102 | sustain lower 0.500
103 | resolve publicize 0.500
104 | dissipate isolate 0.500
105 | anger approve 0.500
106 | approve boast 0.500
107 | research distribute 0.500
108 | request concoct 0.500
109 | boast yield 0.500
110 | furnish impress 0.333
111 | refine sustain 0.333
112 | acknowledge distribute 0.333
113 | clean concoct 0.333
114 | lean grate 0.333
115 | postpone show 0.333
116 | hail judge 0.333
117 | remember hail 0.333
118 | scrape lean 0.333
119 | sweat spin 0.333
120 | highlight restore 0.333
121 | seize refer 0.167
122 | levy believe 0.167
123 | alter highlight 0.167
124 | refer carry 0.167
125 | empty situate 0.167
126 | flush spin 0.167
127 | shake swell 0.167
128 | imitate highlight 0.167
129 | correlate levy 0.000
130 | refer lean 0.000
131 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/embedding_inferrer.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Context insensitive inferrer, based on embeddings similarities
 3 | '''
 4 | 
 5 | import time
 6 | 
 7 | from jcs.cs_inferrer import CsInferrer
 8 | from jcs.data.embedding import Embedding
 9 | from jcs.jcs_io import vec_to_str
10 | from jcs.data.pos import to_wordnet_pos
11 | from jcs.jcs_io import load_vocabulary_counts
12 | 
13 | from nltk.stem.wordnet import WordNetLemmatizer
14 | 
15 | 
16 | 
17 | class EmbeddingInferrer(CsInferrer):
18 |     '''
19 |     classdocs
20 |     '''
21 | 
22 | 
23 |     def __init__(self, path, vocabfile, top_inferences_to_analyze):
24 |         CsInferrer.__init__(self)
25 |         self.embeddings = Embedding(path)
26 |         self.top_inferences_to_analyze = top_inferences_to_analyze
27 |         
28 |         self.w2counts, ignore1, ignore2 = load_vocabulary_counts(vocabfile)
29 |         
30 |     def new_target_key(self, target_key):
31 |         pass
32 |     
33 |     def find_inferred(self, lst_instance, tfo):
34 |         
35 |         if lst_instance.target in self.embeddings:
36 |             result_vec, deltatime = self.embeddings.closest_with_time(lst_instance.target, -1)
37 |         else:
38 |             result_vec, deltatime = None, 0
39 |         
40 |         tfo.write("\nDeltatime: %f msec\n" % ((deltatime)*1000))
41 |         self.inference_time(deltatime)
42 |             
43 |         if (result_vec is not None):
44 |             tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')
45 |         else:
46 |             tfo.write("Top most similar embeddings: " + " contexts: None\n") 
47 |             
48 |         return result_vec
49 |             
50 | 
51 | 
52 |     def filter_inferred(self, result_vec, candidates, pos):
53 |     
54 |         filtered_results = {}
55 |         candidates_found = set()
56 |         
57 |         if result_vec != None:
58 |             for word, weight in result_vec:
59 |                 wn_pos = to_wordnet_pos[pos]
60 |                 lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
61 |                 if lemma in candidates:
62 |                     self.add_inference_result(lemma, weight, filtered_results, candidates_found)
63 |                 if lemma.title() in candidates: # match also capitalized words
64 |                     self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found)
65 |                 if word in candidates: # there are some few cases where the candidates are not lemmatized
66 |                     self.add_inference_result(word, weight, filtered_results, candidates_found)
67 |                 if word.title() in candidates: # there are some few cases where the candidates are not lemmatized
68 |                     self.add_inference_result(word.title(), weight, filtered_results, candidates_found)    
69 |                                     
70 |                     
71 |         # assign negative weights for candidates with no score
72 |         # they will appear last sorted according to their unigram count        
73 |         candidates_left = candidates - candidates_found
74 |         for candidate in candidates_left:            
75 |             count = self.w2counts[candidate] if candidate in self.w2counts else 1
76 |             score = -1 - (1.0/count) # between (-1,-2] 
77 |             filtered_results[candidate] = score        
78 |          
79 |         return filtered_results
80 |     
81 | 
82 |         


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Bayesian Skip-gram(BSG)
 2 | 
 3 | This repository contains Theano code for the Bayesian Skip-gram model, COLING 2018.
 4 |  
 5 |  [1] **Embedding Words as Distributions with a Bayesian Skip-gram Model**, Arthur Bražinskas, Serhii Havrylov, Ivan Titov, [arxiv](https://arxiv.org/abs/1711.11027)
 6 |  
 7 | The model represents words are Gaussian distributions instead of point estimates, and is capable of learning addition word properties, such as generality that is
 8 | encoded in variances. The instructions below provide a guide on how to install and run the model, also how to evaluate word pairs. 
 9 | 
10 | 
11 | ## Requirements
12 | - Python 2.7
13 | - Theano 0.9.0
14 | - numpy 1.14.2
15 | - nltk 3.2.2
16 | - scipy 0.18.1
17 | - Lasagne 0.2.dev1
18 | 
19 | ## Installation
20 | 
21 | First of all, install the dependency Python modules, such as Theano and nltk. 
22 | 
23 | ```
24 | pip install requirements.txt
25 | ```
26 | 
27 | Afterwards, install the necessary NLTK sub-packages. 
28 | 
29 | ```
30 | python -m nltk.downloader wordnet
31 | 
32 | python -m nltk.downloader punkt
33 | ```
34 | 
35 | ## Runing the model 
36 | In order to run the model, please refer to **run_bsg.py** file that contains an example code on how to train and evaluate the model. Upon completion of training,
37 | word representations will be saved to the *output* folder. For example, one can use trained word Gaussian representations(mus and sigmas) as input to word pairs evaluation. 
38 | 
39 | ### Data
40 | A small dataset consisting of [15 million tokens](https://drive.google.com/open?id=1QWC2x6qq8KyHFUCgyvVJJoGHexZrw7gO) dataset is available for smoke tests of the setup. Alternatively, a dataset consisting of approximately [1 billion tokens](http://www.statmt.org/lm-benchmark/) is also available for the public use.
41 | The dataset that was used originally in the research is not publicly available, but can be (requested)[http://wacky.sslmit.unibo.it/doku.php?id=corpora]. 
42 | 
43 | ## Word pairs evaluation
44 | 
45 | One can use the **eval/word_pairs_eval.py** console application as a playground for word pairs evaluation in terms of similarity, Kullback-Leibler divergence,
46 | and entailment directionality. The console application expects paths for word pairs, mu and sigma vectors(i.e. representations of word).
47 | A word pairs file should contain two words(order does not matter) per line separated by space. The latter two files are obtained from a trained BSG model.
48 | Alternative, pre-trained on the 3B tokens dataset [word representations](https://drive.google.com/open?id=1YQQHFV215YjKLlvxpxsKWLm__TlQMw1Q).
49 | 
50 | The example command below will evaluate pairs stored in **eval/example_word_pairs.txt**, and output results to the console. 
51 | ```
52 | python eval/word_pairs_eval.py -wpp eval/example_word_pairs.txt -mup vectors/mu.vectors -sigmap vectors/sigma.vectors
53 | ```
54 | 
55 | 
56 | 
57 | ## Additional resourced used in the project
58 | Lexical substitution benchmark is a modified version of https://github.com/orenmel/lexsub 
59 | 
60 | 
61 | ## Citation
62 | 
63 | ```
64 | @inproceedings{brazinskas-etal-2018-embedding,
65 |     title = "Embedding Words as Distributions with a {B}ayesian Skip-gram Model",
66 |     author = "Bra{\v{z}}inskas, Arthur  and
67 |       Havrylov, Serhii  and
68 |       Titov, Ivan",
69 |     booktitle = "Proceedings of the 27th International Conference on Computational Linguistics",
70 |     month = aug,
71 |     year = "2018",
72 |     address = "Santa Fe, New Mexico, USA",
73 |     publisher = "Association for Computational Linguistics",
74 |     url = "https://www.aclweb.org/anthology/C18-1151",
75 |     pages = "1775--1789",
76 | }
77 | 
78 | ```
79 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/jcs_io.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import heapq
  3 | 
  4 | STOPWORD_TOP_THRESHOLD = 256
  5 | SUBVEC_DIR_SUFFIX = ".DIR"
  6 | VOCAB_TOTAL = "<TOTAL>"
  7 | 
  8 | def wf2ws(weight):
  9 |         return '{0:1.5f}'.format(weight)
 10 |     
 11 | 
 12 | def vec_to_str(subvec, max_n):
 13 |     
 14 |     sub_list_sorted = heapq.nlargest(max_n, subvec, key=lambda x: x[1])
 15 |     sub_strs = [' '.join([word, wf2ws(weight)]) for word, weight in sub_list_sorted]
 16 |     return '\t'.join(sub_strs)
 17 | 
 18 | def vec_to_str_generated(subvec, max_n):
 19 |     
 20 |     sub_list_sorted = heapq.nlargest(max_n, subvec, key=lambda x: x[1])
 21 |     sub_strs = [word for word, weight in sub_list_sorted]
 22 |     return ';'.join(sub_strs)
 23 | 
 24 | def count_file_lines(filename):
 25 |     f = open(filename, 'r')
 26 |     lines_num = sum(1 for line in f)
 27 |     f.close()
 28 |     return lines_num
 29 | 
 30 | def to_rank_weights(subvec):
 31 |     subvec_len = len(subvec)
 32 |     for i in xrange(0, subvec_len):
 33 |         subvec[i] = (subvec[i][0], 1.0-float(i)/subvec_len)
 34 |                         
 35 | def get_pmi_weights(subvec, w2counts, sum_counts, offset, threshold, normalize=False):
 36 |     subvec_pmi = []
 37 |     norm = 0
 38 |     for word, prob in subvec:
 39 |         if prob != 0.0:
 40 |             pmi = math.log(prob * sum_counts / w2counts[word])-offset
 41 |             if pmi>threshold:
 42 |                 subvec_pmi.append((word, pmi))
 43 |                 norm += pmi**2
 44 |             
 45 |     if normalize:
 46 |         norm = norm**0.5
 47 |         for i in xrange(0,len(subvec_pmi)):
 48 |             subvec_pmi[i] = (subvec_pmi[i][0], subvec_pmi[i][1] / norm)       
 49 |             
 50 |     return subvec_pmi
 51 | 
 52 |         
 53 |     
 54 | def extract_word_weight(pair):
 55 |     tokens = pair.split(' ')
 56 |     return tokens[0], float(tokens[1])
 57 | 
 58 | 
 59 | def load_classes(path):
 60 |     w2c = {}
 61 |     max_class_id = 0
 62 |     with open(path) as f:
 63 |         for line in f:
 64 |             tokens = line.split()
 65 |             word = tokens[0]
 66 |             class_id = int(tokens[1])  
 67 |             w2c[word] = class_id
 68 |             max_class_id = max(max_class_id, class_id) 
 69 |     return w2c, max_class_id+1
 70 | 
 71 | def load_vocabulary_w2i(path):
 72 |     with open(path) as f:
 73 |         vocab = [line.split('\t')[0].strip() for line in f if len(line) > 0]
 74 |     return dict([(a, i) for i, a in enumerate(vocab)]), vocab
 75 | 
 76 | def load_vocabulary_counts(path, factor=1.0):
 77 |     stop_words = set()
 78 |     counts = {}
 79 |     sum = 0
 80 |     with open(path) as f:
 81 |         i = 0
 82 |         for line in f:
 83 |             if len(line) > 0:
 84 |                 tokens = line.split('\t')
 85 |                 # tokens = line.split(' ')
 86 |                 word = tokens[0].strip()
 87 |                 count = int(tokens[1].strip())
 88 |                 if (factor != 1.0):           
 89 |                     factored_count = int(count**factor)
 90 |                 else:
 91 |                     factored_count = count
 92 |                 counts[word] = factored_count
 93 |                 sum += factored_count
 94 |                 i += 1
 95 |                 # What is this?!
 96 |                 if (i <= STOPWORD_TOP_THRESHOLD):
 97 |                     stop_words.add(word)
 98 |     total_size = sum #counts[VOCAB_TOTAL]
 99 |     return counts, total_size, stop_words
100 | 
101 | def load_target_counts(path):
102 |     counts = {}
103 |     with open(path) as f:
104 |         for line in f:
105 |             if len(line) > 0:
106 |                 tokens = line.split('\t') 
107 |                 word = tokens[0].strip() 
108 |                 count = int(tokens[1].strip())
109 |                 counts[word] = count
110 |     return counts


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/README.md:
--------------------------------------------------------------------------------
 1 | # Lexical Substitution Evaluation
 2 | 
 3 | This code was used to perform the lexical substitution evaluation described in the following papers:
 4 | 
 5 | **[1] A Simple Word Embedding Model for Lexical Substitution**
 6 | Oren Melamud, Omer Levy, Ido Dagan.  Workshop on Vector Space Modeling for NLP (VSM), 2015 [[pdf]](http://u.cs.biu.ac.il/~melamuo/publications/melamud_vsm15.pdf).
 7 | 
 8 | **[2] context2vec: Learning Generic Context Embedding with Bidirectional LSTM**  
 9 | Oren Melamud, Jacob Goldberger, Ido Dagan. CoNLL, 2016 [[pdf]](http://u.cs.biu.ac.il/~melamuo/publications/context2vec_camera_ready.pdf).
10 | 
11 | 
12 | ## Requirements
13 | 
14 | * Python 2.7
15 | * [NLTK 3.0](http://www.nltk.org/))  - optional (only required for the AWE baseline and MSCC evaluation)
16 | * Numpy
17 | * [context2vec](https://github.com/orenmel/context2vec) - for the context2vec evaluation
18 | 
19 | ## Datasets
20 | 
21 | This repository contains preprocessed data files based on the datasets introduced by the following papers:
22 | 
23 | **[3] Semeval-2007 task 10: English lexical substitution task**
24 | Diana McCarthy, Roberto Navigli, SemEval 2007.  
25 | (files with the prefix 'lst' under the 'dataset' directory)
26 | 
27 | **[4] What substitutes tell us-analysis of an ”all-words” lexical substitution corpus.**
28 | Gerhard Kremer,Katrin Erk, Sebastian Pado,  Stefan Thater. EACL, 2014.  
29 | (files with the prefix 'coinco' under the 'dataset' directory)
30 | 
31 | ## Evaluating the word embedding model [1]
32 | 
33 | * Download the word embeddings, context embeddings from [[here]](http://u.cs.biu.ac.il/~nlp/resources/downloads/lexsub_embeddings/)
34 | * Preprocess the embedding files:
35 | ```
36 | python jcs/text2numpy.py <word-embeddings-filename> <word-embeddings-filename>
37 | python jcs/text2numpy.py <context-embeddings-filename> <context-embeddings-filename>
38 | ```
39 | * To perform the lexical substitution evaluation run (replace the example datasets files and params below as you wish):
40 | ```
41 | python jcs/jcs_main.py --inferrer emb -vocabfile datasets/ukwac.vocab.lower.min100 -testfile datasets/lst_all.preprocessed -testfileconll datasets/lst_all.conll -candidatesfile datasets/lst.gold.candidates -embeddingpath <word-embeddings-filename> -embeddingpathc <context-embeddings-filename> -contextmath mult --debug -resultsfile <result-file>
42 | ```
43 | * This will create the following output files: 
44 | 	- \<result-file\>
45 | 	- \<result-file\>.ranked
46 | 	- \<result-file\>.generate.oot
47 | 	- \<result-file\>.generate.best
48 | * Run the following to compute the candidate ranking GAP score. The results will be written to \<gap-score-file\>.
49 | ```
50 | python jcs/evaluation/lst/lst_gap.py ~/datasets/lst_all.gold <result-file>.ranked <gap-score-file> no-mwe
51 | ```
52 | * Run the following to compute the OOT and BEST substitute prediction scores. The results will be written to \<xxx-score-file\>. score.pl was distributed in [3].
53 | ```
54 | perl dataset/score.pl \<result-file\>.generate.oot datasets/lst_all.gold -t oot > \<oot-score-file\>
55 | ```
56 | ```
57 | perl dataset/score.pl \<result-file\>.generate.best datasets/lst_all.gold -t best > \<best-score-file\>
58 | ```
59 | 
60 | ## Evaluating the context2vec model [2]
61 | 
62 | * See [context2vec](https://github.com/orenmel/context2vec) for how to download or train a \<context2vec-model\>.
63 | * To perform the lexical substitution evaluation run (replace the example datasets files and params below as you wish):
64 | ```
65 | python jcs/jcs_main.py --inferrer lstm -lstm_config \<context2vec-model\>.params -testfile datasets/lst_all.preprocessed -testfileconll datasets/lst_all.conll -candidatesfile datasets/lst.gold.candidates -contextmath mult -resultsfile <result-file> --ignoretarget --debug
66 | ```
67 | * From here, follow the same instructions as in the previous section.
68 | 
69 | 
70 | ## License
71 | 
72 | Apache 2.0
73 | 
74 | 
75 | 
76 | 
77 | 
78 | 


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/data/word-sim/EN-VERB-143.txt:
--------------------------------------------------------------------------------
  1 | happen	say	0.1900
  2 | makes	produced	0.7200
  3 | established	make	0.4800
  4 | calling	say	0.5300
  5 | developed	considers	0.2400
  6 | organising	developed	0.3500
  7 | causes	allows	0.4800
  8 | requiring	organising	0.2700
  9 | made	set	0.3900
 10 | gave	shows	0.4300
 11 | giving	produces	0.3600
 12 | rise	made	0.2300
 13 | seemed	find	0.3500
 14 | form	employ	0.1900
 15 | refuses	protect	0.2300
 16 | produced	reports	0.3100
 17 | gives	employed	0.1600
 18 | sets	starting	0.1900
 19 | organising	shown	0.2100
 20 | works	causes	0.3000
 21 | using	calls	0.1900
 22 | causes	used	0.1300
 23 | affecting	works	0.3100
 24 | cause	starts	0.5300
 25 | strike	form	0.1400
 26 | considering	leading	0.2000
 27 | refuses	refused	0.7900
 28 | affected	reported	0.2900
 29 | show	use	0.1800
 30 | work	allow	0.3100
 31 | starts	requires	0.1900
 32 | gives	produced	0.4000
 33 | exist	produced	0.1900
 34 | starts	set	0.3100
 35 | employed	helping	0.1850
 36 | affected	created	0.2200
 37 | produces	starts	0.3700
 38 | happens	produce	0.2300
 39 | using	employed	0.3300
 40 | establishing	increasing	0.3000
 41 | seemed	seems	0.7300
 42 | increasing	start	0.1800
 43 | said	considering	0.1600
 44 | allow	make	0.2700
 45 | led	established	0.3500
 46 | calling	reported	0.2600
 47 | found	taken	0.2500
 48 | says	set	0.3100
 49 | happens	continue	0.2200
 50 | reducing	works	0.1400
 51 | shows	produced	0.3000
 52 | report	creating	0.1400
 53 | considers	says	0.3300
 54 | created	work	0.1900
 55 | affecting	protect	0.2400
 56 | providing	showing	0.5800
 57 | happening	affected	0.2200
 58 | says	shows	0.4000
 59 | given	working	0.1800
 60 | affect	happen	0.2700
 61 | requires	exists	0.3200
 62 | affecting	helping	0.2800
 63 | making	establishing	0.5900
 64 | applies	cause	0.2300
 65 | found	take	0.2600
 66 | organising	produced	0.2400
 67 | providing	reported	0.3800
 68 | calls	help	0.2500
 69 | refusing	apply	0.1300
 70 | setting	finds	0.1600
 71 | organise	used	0.1600
 72 | considers	allows	0.3000
 73 | allowed	sets	0.2600
 74 | led	organised	0.2600
 75 | helped	starts	0.1800
 76 | recognise	considered	0.3400
 77 | makes	increases	0.2500
 78 | given	showing	0.2900
 79 | creating	exist	0.3300
 80 | happened	showed	0.3700
 81 | starts	refused	0.0700
 82 | establish	creating	0.3900
 83 | employ	affect	0.2600
 84 | working	developing	0.4600
 85 | take	providing	0.1100
 86 | required	considers	0.2000
 87 | affecting	use	0.1800
 88 | recognise	provide	0.1700
 89 | produced	provides	0.3000
 90 | showed	apply	0.2600
 91 | setting	showing	0.3500
 92 | happening	shown	0.2100
 93 | given	use	0.1700
 94 | said	allow	0.1400
 95 | employ	applied	0.2200
 96 | works	dismiss	0.1300
 97 | showing	showed	0.7000
 98 | employ	makes	0.3000
 99 | take	includes	0.1900
100 | refused	provide	0.1500
101 | affected	apply	0.2200
102 | concerned	provides	0.1500
103 | included	allows	0.2700
104 | produce	lead	0.2300
105 | produce	dismiss	0.0700
106 | find	given	0.2600
107 | protected	exist	0.1300
108 | dismiss	finding	0.0900
109 | found	happen	0.2100
110 | give	working	0.1300
111 | reducing	increased	0.0800
112 | take	considered	0.1800
113 | employed	applied	0.2000
114 | pay	includes	0.1500
115 | including	refuses	0.1000
116 | strike	says	0.1900
117 | report	starting	0.1400
118 | included	increased	0.1500
119 | continued	says	0.1200
120 | affect	organising	0.1100
121 | establishing	gives	0.1300
122 | provide	including	0.1900
123 | said	included	0.2100
124 | develop	makes	0.5600
125 | refusing	allows	0.1400
126 | paid	seems	0.1300
127 | establishing	including	0.2800
128 | seem	said	0.2400
129 | seem	called	0.1800
130 | happens	require	0.2100
131 | use	working	0.1900
132 | take	leading	0.1300
133 | working	worked	0.7800
134 | refusing	exist	0.0900
135 | establish	requires	0.1800
136 | allows	employ	0.1700
137 | increased	reported	0.2000
138 | seemed	protects	0.1000
139 | seems	allowed	0.1300
140 | lead	require	0.2000
141 | affect	showed	0.2100
142 | pay	led	0.1800
143 | made	affect	0.2300
144 | employ	using	0.3300
145 | 


--------------------------------------------------------------------------------
/models/support.py:
--------------------------------------------------------------------------------
  1 | # this file contains common functions that are used by models
  2 | import pickle
  3 | import numpy as np
  4 | from theano import tensor as T
  5 | from libraries.utils.paths_and_files import create_folders_if_not_exist
  6 | from pickle import UnpicklingError
  7 | from theano.tensor.shared_randomstreams import RandomStreams
  8 | from theano.sandbox.rng_mrg import MRG_RandomStreams as MRG_RandomStreams
  9 | 
 10 | seed = 1
 11 | r_stream = RandomStreams(seed=seed)
 12 | r_gpu_stream = MRG_RandomStreams(seed=seed)
 13 | 
 14 | 
 15 | def sample_words(self, batch_size, nr_neg_samples):
 16 |     """
 17 |     a function that is used for negative sampling context words, draws sample based on unigram distribution
 18 |     :param batch_size:
 19 |     :return: a matrix of size [batch_size x nr_of_negative_samples]
 20 | 
 21 |     """
 22 |     return self.r_stream.choice(size=(batch_size, nr_neg_samples), replace=True,
 23 |                                 a=self.vocab_size, p=self.uni_distr, dtype='int32')
 24 | 
 25 | 
 26 | def kl_diag(mu_q, sigma_q, mu_p, sigma_p, eps):
 27 |     """
 28 |     Kullback Leibler divergence between two diagonal Gaussians
 29 |     :return: tensor [batch_size x 1]
 30 | 
 31 |     """
 32 |     d = mu_q.shape[1]
 33 |     sigma_p_inv = T.pow(sigma_p + 1e-6, -1)
 34 |     tra = T.sum(sigma_p_inv * sigma_q, axis=1)
 35 |     quadr = T.sum(sigma_p_inv * ((mu_p - mu_q)**2), axis=1)
 36 |     log_det_p = T.sum(T.log(sigma_p), axis=1)
 37 |     log_det_q = T.sum(T.log(sigma_q + eps), axis=1)
 38 |     log_det = log_det_p - log_det_q
 39 |     return 0.5 * (tra + quadr - d + log_det)
 40 | 
 41 | 
 42 | def kl_spher(mu_q, sigma_q, mu_p, sigma_p):
 43 |     """
 44 |     Kullback Leibler divergence between two spherical Gaussians
 45 |     :return: tensor [batch_size x 1]
 46 | 
 47 |     """
 48 |     d = mu_q.shape[1]
 49 |     sigma_p_inv = (1.0/sigma_p)
 50 |     tra = d * sigma_q*sigma_p_inv
 51 |     quadr = sigma_p_inv * T.sum((mu_p - mu_q)**2, axis=1, keepdims=True)
 52 |     log_det = - d*T.log(sigma_q * sigma_p_inv)
 53 |     res = 0.5 * (tra + quadr - d + log_det)
 54 |     return res.reshape((-1, ))
 55 | 
 56 | 
 57 | def l2_sqrd(x, axis=1):
 58 |     return T.sum(x**2, axis=axis)
 59 | 
 60 | 
 61 | # uniform init so far only
 62 | def init_weights(size, low_high_factor=100, scale_factor=1.):
 63 |     """
 64 | 
 65 |     :param size: size of a matrix to initialize
 66 |     :param low_high_factor: a factor in the initialization (see code)
 67 |     :return: initialized matrix of the same size as "size"
 68 |     """
 69 |     return np.float32(scale_factor)*np.float32(np.random.uniform(low=-low_high_factor**-0.5, high=low_high_factor**-0.5, size=size))
 70 | 
 71 | 
 72 | def init_weights2(size, low_factor=-1, high_factor=1, scale_factor=1.):
 73 |     """
 74 |     similar to init_weights but with decoupled low and high factors
 75 |     :param size: size of a matrix to initialize
 76 |     :return: initialized matrix of the same size as "size"
 77 | 
 78 |     """
 79 |     return np.float32(scale_factor)*np.float32(np.random.uniform(low=low_factor, high=high_factor, size=size))
 80 | 
 81 | 
 82 | def write_vectors(vocab, file_path, embeddings_function):
 83 |     """
 84 |     # extracts word vectors via embeddings_function and writes them into a file
 85 |     :param vocab: vocabulary object
 86 |     :param file_path: where to write vectors
 87 |     :param embeddings_function: a function that takes word id as input and return a vector embedding
 88 |     """
 89 |     create_folders_if_not_exist(file_path)
 90 |     with open(file_path, 'w') as output_file:
 91 |         for word_obj in vocab:
 92 |             word_vec = embeddings_function(word_obj.id)
 93 |             output_file.write(word_obj.token + " " + " ".join(str(f) for f in word_vec)+"\n")
 94 | 
 95 | 
 96 | def load(file_path):
 97 |     """
 98 |     a parameters loading function that is used to pre-loading pre-trained parameters to a model
 99 |     :param file_path: a path of a file that contains parameters in the format [parm_name:parm]
100 | 
101 |     """
102 |     f = open(file_path, 'rb')
103 |     while True:
104 |         try:
105 |             name, param = pickle.load(f)
106 |             yield name, param
107 |         except (EOFError, UnpicklingError):
108 |             break
109 |     f.close()


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/measures/generalized_average_precision.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | See following paper for quick description of GAP:
  3 | http://aclweb.org/anthology//P/P10/P10-1097.pdf
  4 | '''
  5 | 
  6 | from operator import itemgetter
  7 | from random import shuffle
  8 | import copy
  9 | 
 10 | class GeneralizedAveragePrecision(object):
 11 |        
 12 |     @staticmethod
 13 |     def accumulate_score(gold_vector):
 14 |         accumulated_vector = []
 15 |         accumulated_score = 0
 16 |         for (key, score) in gold_vector:
 17 |             accumulated_score += float(score)
 18 |             accumulated_vector.append([key, accumulated_score])
 19 |         return accumulated_vector
 20 |     
 21 | 
 22 |     
 23 |     '''        
 24 |     gold_vector: a vector of pairs (key, score) representing all valid results
 25 |     evaluated_vector: a vector of pairs (key, score) representing the results retrieved by the evaluated method
 26 |     gold_vector and evaluated vector don't need to include the same keys or be in the same length
 27 |     '''
 28 |             
 29 |     @staticmethod
 30 |     def calc(gold_vector, evaluated_vector, random=False):  
 31 |         gold_map = {}
 32 |         for [key, value] in gold_vector:
 33 |             gold_map[key]=value
 34 |         sorted_gold_vector = sorted(gold_vector, key=itemgetter(1), reverse=True)          
 35 |         gold_vector_accumulated = GeneralizedAveragePrecision.accumulate_score(sorted_gold_vector)
 36 | 
 37 | 
 38 |         ''' first we use the eval score to sort the eval vector accordingly '''
 39 |         if random is False:
 40 |             sorted_evaluated_vector = sorted(evaluated_vector, key=itemgetter(1), reverse=True)
 41 |         else:
 42 |             sorted_evaluated_vector = copy.copy(evaluated_vector)
 43 |             shuffle(sorted_evaluated_vector)
 44 |         sorted_evaluated_vector_with_gold_scores = []
 45 |         ''' now we replace the eval score with the gold score '''
 46 |         for (key, score) in sorted_evaluated_vector:
 47 |             if (key in gold_map.keys()):
 48 |                 gold_score = gold_map.get(key)
 49 |             else:
 50 |                 gold_score = 0
 51 |             sorted_evaluated_vector_with_gold_scores.append([key, gold_score])
 52 |         evaluated_vector_accumulated = GeneralizedAveragePrecision.accumulate_score(sorted_evaluated_vector_with_gold_scores)   
 53 |                                   
 54 |         ''' this is sum of precisions over all recall points '''                          
 55 |         i = 0
 56 |         nominator = 0.0
 57 |         for (key, accum_score) in evaluated_vector_accumulated:
 58 |             i += 1
 59 |             if (key in gold_map.keys()) and (gold_map.get(key) > 0):
 60 |                 nominator += accum_score/i
 61 |                 
 62 |         ''' this is the optimal sum of precisions possible based on the gold standard ranking '''        
 63 |         i = 0
 64 |         denominator = 0
 65 |         for (key, accum_score) in gold_vector_accumulated:
 66 |             if gold_map.get(key) > 0:
 67 |                 i += 1
 68 |                 denominator += accum_score/i
 69 |                 
 70 |         if (denominator == 0.0):
 71 |             gap = -1
 72 |         else:            
 73 |             gap = nominator/denominator
 74 |         
 75 |         return gap
 76 |                 
 77 |                 
 78 |     @staticmethod
 79 |     def calcTopN(gold_vector, evaluated_vector, n, measure_type):  
 80 |         gold_map = {}
 81 |         for [key, value] in gold_vector:
 82 |             gold_map[key]=value
 83 |         gold_vector_sorted = sorted(gold_vector, key=itemgetter(1), reverse=True)
 84 |         gold_top_score_sum = sum([float(score) for (key, score) in gold_vector_sorted[0:n]])
 85 |                   
 86 |         evaluated_top_score_sum = 0
 87 |         sorted_evaluated_vector = sorted(evaluated_vector, key=itemgetter(1), reverse=True)
 88 |         for (key, score) in sorted_evaluated_vector[0:n]:
 89 |             if key in gold_map:
 90 |                 gold_score = gold_map[key]
 91 |             else:
 92 |                 gold_score = 0
 93 |             evaluated_top_score_sum += float(gold_score)
 94 |         
 95 |         if measure_type == 'sap' or measure_type == 'wap':
 96 |             denominator = n
 97 |         else:
 98 |             denominator = gold_top_score_sum
 99 |                 
100 |         return evaluated_top_score_sum/denominator


--------------------------------------------------------------------------------
/libraries/evaluation/GloVe/README.md:
--------------------------------------------------------------------------------
 1 | ## GloVe: Global Vectors for Word Representation
 2 | 
 3 | <em>frog</em> nearest neighbors | Litoria             |  Leptodactylidae | Rana | Eleutherodactylus
 4 | -------------------------|:-------------------------:|:-------------------------:|:-------------------------:|:-------------------------:|
 5 | <li> frogs <li> toad <li> litoria <li> leptodactylidae <li> rana <li> lizard <li> eleutherodactylus | ![](http://nlp.stanford.edu/projects/glove/images/litoria.jpg)  |  ![](http://nlp.stanford.edu/projects/glove/images/leptodactylidae.jpg) |  ![](http://nlp.stanford.edu/projects/glove/images/rana.jpg) |  ![](http://nlp.stanford.edu/projects/glove/images/eleutherodactylus.jpg)
 6 | 
 7 | We provide an implementation of the GloVe model for learning word representations. Please see the [project page](http://nlp.stanford.edu/projects/glove/) for more information.
 8 | 
 9 | man -> woman             |  city -> zip | comparative -> superlative
10 | :-------------------------:|:-------------------------:|:-------------------------:|:-------------------------:|
11 | ![](http://nlp.stanford.edu/projects/glove/images/man_woman_small.jpg)  |   ![](http://nlp.stanford.edu/projects/glove/images/city_zip_small.jpg) |  ![](http://nlp.stanford.edu/projects/glove/images/comparative_superlative_small.jpg)
12 | 
13 | ## Download pre-trained word vectors
14 | Pre-trained word vectors are made available under the <a href="http://opendatacommons.org/licenses/pddl/">Public Domain Dedication
15 | and License</a>
16 | <div class="entry">
17 | <ul style="padding-left:0px; margin-top:0px; margin-bottom:0px">
18 |   <li> <a href="http://dumps.wikimedia.org/enwiki/20140102/">Wikipedia 2014</a> + <a href="https://catalog.ldc.upenn.edu/LDC2011T07">Gigaword 5</a> (6B tokens, 400K vocab, uncased, 50d, 100d, 200d, &amp; 300d vectors, 822 MB download): <a href="http://nlp.stanford.edu/data/glove.6B.zip">glove.6B.zip</a> </li>
19 |   <li> Common Crawl (42B tokens, 1.9M vocab, uncased, 300d vectors, 1.75 GB download): <a href="http://nlp.stanford.edu/data/glove.42B.300d.zip">glove.42B.300d.zip</a> </li>
20 |   <li> Common Crawl (840B tokens, 2.2M vocab, cased, 300d vectors, 2.03 GB download): <a href="http://nlp.stanford.edu/data/glove.840B.300d.zip">glove.840B.300d.zip</a> </li>
21 |   <li> Twitter (2B tweets, 27B tokens, 1.2M vocab, uncased, 25d, 50d, 100d, &amp; 200d vectors, 1.42 GB download): <a href="http://nlp.stanford.edu/data/glove.twitter.27B.zip">glove.twitter.27B.zip</a> Ruby <a href="preprocess-twitter.rb">script</a> for preprocessing Twitter data </li>
22 | </ul>
23 | </div>
24 | 
25 | ## Train word vectors on a new corpus
26 | 
27 |     $ git clone http://github.com/stanfordnlp/glove
28 |     $ cd glove && make
29 |     $ ./demo.sh
30 | 
31 | The demo.sh scipt downloads a small corpus, consisting of the first 100M characters of Wikipedia. It collects unigram counts, constructs and shuffles cooccurrence data, and trains a simple version of the GloVe model. It also runs a word analogy evaluation script in python. Continue reading for further usage details and instructions for how to run on your own corpus.
32 | 
33 | ### Package Contents
34 | This package includes four main tools:
35 | #### 1) vocab_count
36 | Constructs unigram counts from a corpus, and optionally thresholds the resulting vocabulary based on total vocabulary size or minimum frequency count. This file should already consist of whitespace-separated tokens. Use something like the Stanford Tokenizer (http://nlp.stanford.edu/software/tokenizer.shtml) first on raw text.
37 | #### 2) cooccur
38 | Constructs word-word cooccurrence statistics from a corpus. The user should supply a vocabulary file, as produced by 'vocab_count', and may specify a variety of parameters, as described by running './build/cooccur'.
39 | #### 3) shuffle
40 | Shuffles the binary file of cooccurrence statistics produced by 'cooccur'. For large files, the file is automatically split into chunks, each of which is shuffled and stored on disk before being merged and shuffled togther. The user may specify a number of parameters, as described by running './build/shuffle'.
41 | #### 4) glove
42 | Train the GloVe model on the specified cooccurrence data, which typically will be the output of the 'shuffle' tool. The user should supply a vocabulary file, as given by 'vocab_count', and may specify a number of other parameters, which are described by running './build/glove'.
43 | 
44 | ### License
45 | All work contained in this package is licensed under the Apache License, Version 2.0. See the include LICENSE file.
46 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/lex_sub.py:
--------------------------------------------------------------------------------
 1 | import os, sys
 2 | import operator
 3 | from support import read_candidates, wf2ws, flatten, get_best_scores_for_candidates, conll_skip_sentence
 4 | from context_instance import ContextInstance
 5 | import numpy as np
 6 | from libraries.utils.paths_and_files import create_folders_if_not_exist
 7 | from libraries.evaluation.lexsub.jcs.evaluation.lst.lst_gap import compute_gap
 8 | 
 9 | 
10 | def lex_sub(embeddings, input_type, target_words_vocab, context_words_vocab, output_path, candidates_file, conll_file,
11 |             test_file, gold_file, half_window_size=5, arithm_type=None):
12 |     conll = None
13 |     if input_type == "dependency":
14 |         conll = open(conll_file, "r")
15 | 
16 |     # this is a very time consuming part
17 |     all_candidates = read_candidates(candidates_file, allowed_words=target_words_vocab)  # read candidates
18 |     ranked_file_path = os.path.join(output_path, "ranked_bsg.txt")
19 |     create_folders_if_not_exist(ranked_file_path)
20 |     output = open(ranked_file_path, "w")
21 |     with open(test_file, 'r') as f:
22 |         for i, line in enumerate(f):
23 |             # if i % 1 == 0:
24 |             #     print "----------------------"
25 |             #     print 'reading line # %d' % (i+1)
26 |             lst_instance = ContextInstance(line)
27 | 
28 |             # find if the target(center) word in in the vocab ( either his lemma or word)
29 |             target = None
30 |             if lst_instance.target in target_words_vocab:
31 |                 target = lst_instance.target
32 |             elif lst_instance.target_lemma in target_words_vocab:
33 |                 target = lst_instance.target_lemma
34 | 
35 |             # sometimes the target(center) word can be not in vocab, so we skip that instance
36 |             if target and lst_instance.target_key in all_candidates:
37 |                 left_context, right_context = lst_instance.get_neighbors(half_window_size) if input_type == "normal" else\
38 |                     lst_instance.get_dep_context(conll, lst_instance.target)
39 | 
40 |                 # perform filtering by throwing away all words that do not appear in vocab
41 |                 # I do it after creating windows because of indexing problem
42 |                 left_context = [c for c in left_context if c in context_words_vocab]
43 |                 right_context = [c for c in right_context if c in context_words_vocab]
44 | 
45 |                 # print "---------------------------------------"
46 |                 # print "left context for the word \"%s\" is : %s" % (target, str(left_context))
47 |                 # print "right context for the word \"%s\" is : %s" % (target, str(right_context))
48 | 
49 |                 # grab candidates
50 |                 candidates = all_candidates[lst_instance.target_key]  # note that candidates is a dictionary
51 | 
52 |                 # format candidates properly
53 |                 formatted_candidates = np.unique(flatten(candidates.values()))
54 | 
55 |                 # rank them
56 |                 scores = embeddings.score(target=target, left_context=left_context, right_context=right_context,
57 |                                           candidates=formatted_candidates, ar_type=arithm_type)
58 | 
59 |                 # print(candidates.items())
60 |                 # perform mapping to all candidates
61 |                 scores = get_best_scores_for_candidates(candidates, scores)
62 | 
63 |                 # sort in descending order
64 |                 sorted_scores = sorted(scores.items(), key=operator.itemgetter(1), reverse=True)
65 |                 formatted_scores = [' '.join([word, wf2ws(score)]) for word, score in sorted_scores]
66 |                 formatted_scores = "\t" + "\t".join(formatted_scores)
67 |             else:
68 |                 formatted_scores = ""
69 |                 # print 'center word %s is not in the vocabulary' %(target)
70 |                 # print "skipping start"
71 |                 if input_type == "dependency":
72 |                     conll_skip_sentence(conll)  # to make sure that pointers are aligned
73 |                 # print "skipping end"
74 | 
75 |             # write to a file
76 |             output.write("RANKED\t" + ' '.join([lst_instance.full_target_key, lst_instance.target_id]) + formatted_scores + "\n")
77 |     if conll:
78 |         conll.close()
79 |     output.close()
80 | 
81 |     # compute gap
82 |     compute_gap(gold_file_path=gold_file, eval_file_path=ranked_file_path, out_file_path=os.path.join(output_path, "gap.txt"),
83 |                 ignore_mwe=True)
84 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/lst/lst_gap.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Used to compute GAP score for the LST ranking task
  3 | 
  4 | '''
  5 | 
  6 | import sys
  7 | import random
  8 | import re
  9 | 
 10 | from libraries.evaluation.lexsub.jcs.evaluation.measures.generalized_average_precision import GeneralizedAveragePrecision
 11 | 
 12 | 
 13 | #take.v 25 :: consider 2;accept 1;include 1;think about 1;
 14 | def read_gold_line(gold_line, ignore_mwe):
 15 |     segments = gold_line.split("::")
 16 |     instance_id = segments[0].strip()
 17 |     gold_weights = []
 18 |     line_candidates = segments[1].strip().split(';')
 19 |     for candidate_count in line_candidates:
 20 |         if len(candidate_count) > 0:
 21 |             delimiter_ind = candidate_count.rfind(' ')
 22 |             candidate = candidate_count[:delimiter_ind]
 23 |             if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)):
 24 |                 continue
 25 |             count = candidate_count[delimiter_ind:]
 26 |             try:
 27 |                 gold_weights.append((candidate, int(count)))
 28 |             except ValueError as e:
 29 |                 print e
 30 |                 print gold_line
 31 |                 print "cand=%s count=%s" % (candidate,count)
 32 |                 sys.exit(1)
 33 |         
 34 |     return instance_id, gold_weights
 35 | 
 36 | #RESULT    find.v 71    show 0.34657
 37 | def read_eval_line(eval_line, ignore_mwe):
 38 |     eval_weights = []
 39 |     segments = eval_line.split("\t")
 40 |     instance_id = segments[1].strip()
 41 |     for candidate_weight in segments[2:]:
 42 |         if len(candidate_weight) > 0:
 43 |             delimiter_ind = candidate_weight.rfind(' ')
 44 |             candidate = candidate_weight[:delimiter_ind]
 45 |             weight = candidate_weight[delimiter_ind:]
 46 |             if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)):
 47 |                 continue
 48 |             try:
 49 |                 eval_weights.append((candidate, float(weight)))
 50 |             except:
 51 |                 print "Error appending: %s %s" % (candidate, weight)
 52 | 
 53 |     return instance_id, eval_weights
 54 | 
 55 | def compute_gap(gold_file_path, eval_file_path, out_file_path, ignore_mwe=False, randomize=False):
 56 | 
 57 |     gold_file = open(gold_file_path, 'r')
 58 |     eval_file = open(eval_file_path, 'r')
 59 |     out_file = open(out_file_path, 'w')
 60 | 
 61 |     gold_data = {}
 62 |     eval_data = {}
 63 | 
 64 |     i=0
 65 |     sum_gap = 0.0
 66 |     for eval_line in eval_file:
 67 |         eval_instance_id, eval_weights = read_eval_line(eval_line, ignore_mwe)
 68 |         eval_data[eval_instance_id] = eval_weights
 69 | 
 70 |     for gold_line in gold_file:
 71 |         gold_instance_id, gold_weights = read_gold_line(gold_line, ignore_mwe)
 72 |         gold_data[gold_instance_id] = gold_weights
 73 | 
 74 |     ignored = 0
 75 |     for gold_instance_id, gold_weights in gold_data.iteritems():
 76 |         eval_weights = eval_data[gold_instance_id]
 77 |         gap = GeneralizedAveragePrecision.calc(gold_weights, eval_weights, randomize)
 78 |         if (gap < 0): # this happens when there is nothing left to rank after filtering the multi-word expressions
 79 |             ignored += 1
 80 |             continue
 81 |         out_file.write(gold_instance_id + "\t" + str(gap) + "\n")
 82 |         i += 1
 83 |         sum_gap += gap
 84 | 
 85 |     mean_gap = sum_gap/i
 86 |     out_file.write("\ngold_data %d eval_data %d\n" % (len(gold_data),len(eval_data)))
 87 |     out_file.write("\nRead %d test instances\n" % i)
 88 |     out_file.write("\nIgnored %d test instances (couldn't compute gap)\n" % ignored)
 89 |     out_file.write("\nMEAN_GAP\t" + str(mean_gap) + "\n")
 90 | 
 91 | 
 92 |     print "MEAN GAP is %f" %mean_gap
 93 | 
 94 |     gold_file.close()
 95 |     eval_file.close()
 96 |     out_file.close()
 97 | 
 98 | if __name__ == '__main__':
 99 |     
100 |     if len(sys.argv) < 4:
101 |         print "usage: %s <gold-filename> <eval-filename> <output-filename> [no-mwe] [random]"  % (sys.argv[0])
102 |         sys.exit(1)
103 |         
104 | 
105 |     gold_file_path = sys.argv[1]
106 |     eval_file_path = sys.argv[2]
107 |     out_file_path = sys.argv[3]
108 | 
109 |     if len(sys.argv) > 4 and sys.argv[4] == 'no-mwe':
110 |         ignore_mwe = True
111 |     else:
112 |         ignore_mwe = False
113 |         
114 |     if len(sys.argv) > 5 and sys.argv[5] == 'random':
115 |         randomize = True
116 |     else:
117 |         randomize = False
118 | 
119 |     compute_gap(gold_file_path, eval_file_path, out_file_path, ignore_mwe, randomize)
120 |     
121 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/data/embedding.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import heapq
  3 | import math
  4 | import time
  5 | 
  6 | class Embedding:
  7 |     
  8 |     def __init__(self, path):
  9 |         self.m = self.normalize(np.load(path + '.npy'))
 10 |         self.dim = self.m.shape[1]
 11 |         self.wi, self.iw = self.readVocab(path + '.vocab')
 12 |         
 13 |     
 14 |     def zeros(self):
 15 |         return np.zeros(self.dim)
 16 |     
 17 |     def dimension(self):
 18 |         return self.dim
 19 |     
 20 |     def normalize(self, m):
 21 |         norm = np.sqrt(np.sum(m*m, axis=1))
 22 |         norm[norm==0] = 1
 23 |         return m / norm[:, np.newaxis]
 24 |     
 25 |     def readVocab(self, path):
 26 |         vocab = []
 27 |         with open(path) as f:
 28 |             for line in f:
 29 |                 vocab.extend(line.strip().split())
 30 |         return dict([(w, i) for i, w in enumerate(vocab)]), vocab
 31 |     
 32 |     def __contains__(self, w):
 33 |         return w in self.wi
 34 |         
 35 |     def represent(self, w):
 36 |         return self.m[self.wi[w], :]
 37 |     
 38 |     def scores(self, vec):
 39 |         return np.dot(self.m, vec)
 40 | 
 41 |     # why +1 .../2?
 42 |     def pos_scores(self, vec):
 43 |         return (np.dot(self.m, vec)+1)/2
 44 | 
 45 |     def pos_scores2(self, vec):
 46 |         scores = np.dot(self.m, vec)
 47 |         scores[scores < 0.0] = 0.0
 48 |         return scores
 49 | 
 50 | 
 51 |     def top_scores(self, scores, n=10):
 52 |         if n <= 0:
 53 |             n = len(scores)
 54 |         return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1])
 55 |     
 56 |     def closest(self, w, n=10):                        
 57 |         scores = np.dot(self.m, self.represent(w))               
 58 |         return self.top_scores(scores,n)
 59 | 
 60 |     def closest_with_time(self, w, n=10):        
 61 |         start = time.time()                
 62 |         scores = np.dot(self.m, self.represent(w))
 63 |         end = time.time()        
 64 | #        print "\nDeltatime: %f msec\n" % ((end-start)*1000)
 65 |         return self.top_scores(scores,n), end-start
 66 | 
 67 |     def closest_vec(self, wordvec, n=10):
 68 |         #scores = self.m.dot(self.represent(w))
 69 |         scores = np.dot(self.m, wordvec)
 70 |         return self.top_scores(scores, n)
 71 | #        if n <= 0:
 72 | #            n = len(scores)
 73 | #        return heapq.nlargest(n, zip(self.iw, scores))
 74 |     
 75 |     def closest_vec_filtered(self, wordvec, vocab, n=10):
 76 |         scores = np.dot(self.m, wordvec)
 77 |         if n <= 0:
 78 |             n = len(scores)
 79 |         scores_words = zip(self.iw, scores)
 80 |         for i in xrange(0,len(scores_words)):
 81 |             if not scores_words[i][1] in vocab: 
 82 |                 scores_words[i] = (-1, scores_words[i][0])
 83 |         return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1])
 84 |       
 85 |     def closest_prefix(self, w, prefix, n=10):
 86 |         scores = np.dot(self.m, self.represent(w))
 87 |         scores_words = zip(self.iw, scores)
 88 |         for i in xrange(0,len(scores_words)):
 89 |             if not scores_words[i][1].startswith(prefix): 
 90 |                 scores_words[i] = (-1, scores_words[i][0])
 91 |         return heapq.nlargest(n, scores_words, key=lambda x: x[1])
 92 |     
 93 |     def closest_filtered(self, w, vocab, n=10):
 94 |         scores = np.dot(self.m, self.represent(w))
 95 |         scores_words = zip(self.iw, scores)
 96 |         for i in xrange(0,len(scores_words)):
 97 |             if not scores_words[i][1] in vocab: 
 98 |                 scores_words[i] = (-1, scores_words[i][0])
 99 |         return heapq.nlargest(n, scores_words, key=lambda x: x[1])
100 |  
101 |     def similarity(self, w1, w2):
102 |         return self.represent(w1).dot(self.represent(w2))    
103 | 
104 | def norm_vec(vec):
105 |     length = 1.0 * math.sqrt(sum(val ** 2 for val in vec))
106 |     return [val/length for val in vec]
107 | 
108 | def score2string(score):
109 |     return score[1] + "\t" + '{0:1.3f}'.format(score[0])
110 | 
111 | 
112 | def closest_sym_scores(targets, subs, w, n):
113 |     w_target_vec = targets.represent(w)
114 |     w_sub_vec = subs.represent(w)
115 |     w2subs = subs.closest_vec(w_target_vec,0)
116 |     w2subs2w = []
117 |     for entry in w2subs:
118 |         score = (entry[0]+1)/2
119 |         sub = entry[1]
120 |         sub_target_vec = targets.represent(sub)       
121 |         rev_score = (np.dot(sub_target_vec, w_sub_vec)+1)/2
122 |         w2subs2w.append((math.sqrt(score * rev_score), sub))
123 |     return heapq.nlargest(n, w2subs2w)
124 |     
125 |     
126 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/support.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import re
  3 | from nltk.corpus import wordnet as wn
  4 | from pos import to_wordnet_pos
  5 | from nltk.stem.wordnet import WordNetLemmatizer
  6 | 
  7 | 
  8 | def read_vectors(file, header=False):
  9 |     dict = {}
 10 |     with open(file, 'r') as f:
 11 |             for i, sentence in enumerate(f):
 12 |                 if header and i == 0:
 13 |                     continue
 14 |                 parts = sentence.strip().split(" ")
 15 |                 word = parts[0]
 16 |                 vec = np.array(parts[1:], dtype="float32")
 17 |                 # normalize
 18 |                 vec = vec/np.sum(vec**2)**0.5
 19 |                 dict[word] = vec
 20 |     return dict
 21 | 
 22 | 
 23 | def conll_skip_sentence(conll, eos_symbol="<eol>"):
 24 |     while True:
 25 |         line = re.split(r'\t', conll.readline())
 26 |         word = line[1]
 27 |         if word == eos_symbol:
 28 |             break
 29 |     conll.readline()
 30 | 
 31 | 
 32 | 
 33 | def read_candidates(candidates_file, allowed_words):
 34 |     target2candidates = {}
 35 |     # finally.r::eventually;ultimately
 36 |     print "--- reading candidates ---"
 37 |     with open(candidates_file, 'r') as f:
 38 |         for i, line in enumerate(f):
 39 |             # if (i+1) % 1 == 0:
 40 |             #     print 'read %d lines' %(i+1)
 41 |             segments = line.strip().split('::')
 42 |             target = segments[0]
 43 |             word, pos = target.split('.')
 44 |             # assuming that candidates are unique initially
 45 |             candidates = [str(c) for c in segments[1].split(';') if c.find(" ") == -1]  # forbid composite words
 46 |             candidates = filter_candidate(pos, candidates, allowed_words)
 47 |             target2candidates[target] = candidates
 48 |     print '--- done ---'
 49 |     return target2candidates
 50 | 
 51 | 
 52 | # performs a 3 step filtering by matching with vocabulary of allowed_words
 53 | # note that candidates is a dictionary old_cand -> [words in vocab], where new_cands are candidates
 54 | # we find matching our allowed_words
 55 | def filter_candidate(pos, candidates, allowed_words):
 56 |     # this fixes the problem with non_wordnet pos tags
 57 |     if pos in to_wordnet_pos:
 58 |         pos = to_wordnet_pos[pos]
 59 |     new_candidates = {}
 60 |     for word in allowed_words.keys():
 61 |         if not is_ascii(word):
 62 |             continue
 63 |         lemma = WordNetLemmatizer().lemmatize(word, pos)
 64 |         # we try 3 things
 65 |         tries = [lemma, lemma.title, word, word.title()]
 66 |         # tries = [lemma]
 67 |         for w in tries:
 68 |             if w in candidates:
 69 |                 if w not in new_candidates:
 70 |                     new_candidates[w] = []
 71 |                 new_candidates[w].append(word)
 72 |     return new_candidates
 73 | 
 74 | def is_ascii(s):
 75 |     return all(ord(c) < 128 for c in s)
 76 | 
 77 | def cosine_sim(x, y):
 78 |     return float(np.sum(x*y))/float(np.sqrt(np.sum(x**2)*np.sum(y**2)))
 79 | 
 80 | # some wierd form of dot pos dot product
 81 | # with normalized vectors
 82 | def pos_cosine_sim_normed(x, y):
 83 |         return (np.dot(x, y)+1)/2
 84 | 
 85 | def wf2ws(weight):
 86 |         return '{0:1.5f}'.format(weight)
 87 | 
 88 | 
 89 | 
 90 | def morphify(word, pos):
 91 |     """ morph a word """
 92 |     synsets = wn.synsets(word, pos=pos)
 93 | 
 94 |     # Word not found
 95 |     if not synsets:
 96 |         return []
 97 | 
 98 |     # Get all  lemmas of the word
 99 |     lemmas = [l for s in synsets for l in s.lemmas() if s.name().split('.')[1] == pos]
100 | 
101 |     # Get related forms
102 |     derivationally_related_forms = [(l, l.derivationally_related_forms()) for l in lemmas]
103 | 
104 |     # filter only the targeted pos
105 |     related_lemmas = [l for drf in derivationally_related_forms \
106 |                            for l in drf[1] if l.synset().name().split('.')[1] == pos]
107 | 
108 |     # Extract the words from the lemmas
109 |     words = [l.name() for l in related_lemmas]
110 |     len_words = len(words)
111 | 
112 |     # Build the result in the form of a list containing tuples (word, probability)
113 |     result = [(w, float(words.count(w))/len_words) for w in set(words)]
114 |     result.sort(key=lambda w: -w[1])
115 | 
116 |     # return all the possibilities sorted by probability
117 |     return result
118 | 
119 | 
120 | def flatten(list):
121 |     return [item for sublist in list for item in sublist]
122 | 
123 | # returns (candidate, best_score)
124 | def get_best_scores_for_candidates(candidates, scores):
125 |     new_scores = {}
126 |     for cand, words in candidates.items():
127 |         # now choose the best one
128 |         new_scores[cand] = max([scores[w] for w in words])
129 |     return new_scores
130 | 
131 | 


--------------------------------------------------------------------------------
/libraries/batch_iterators/support.py:
--------------------------------------------------------------------------------
  1 | # -*- coding: utf-8 -*-
  2 | # contains helper functions that are used in different iterators
  3 | import numpy as np
  4 | np.random.seed(1)
  5 | 
  6 | def pad_sents(sentences, max_length, pad_symbol, mask_current=False, padding_mode='both'):
  7 |     """
  8 |     pads many sentences
  9 |     :param mask_current: whether sentence elements that have pad symbols should be masked
 10 |     :param padding_mode: whether pad only the left side(dead useful for LSTM)
 11 | 
 12 |     """
 13 |     padded_sentences, masks = [], []
 14 |     for sentence in sentences:
 15 |         x, m = pad_sent(sentence, max_length, pad_symbol, mask_current=mask_current,
 16 |                         padding_mode=padding_mode)
 17 |         padded_sentences.append(x)
 18 |         masks.append(m)
 19 |     return np.array(padded_sentences, dtype="int32"), np.array(masks, dtype="float32")
 20 | 
 21 | 
 22 | # pads sentence if necessary with some symbol's id
 23 | # note that it returns a binary mask too
 24 | # TODO: write a better documentation for this function!
 25 | def pad_sent(sentence, max_length, pad_symbol, mask_current=False, padding_mode='both'):
 26 |     assert padding_mode in ['left', 'both', 'right']
 27 |     pad_number = max_length - len(sentence)
 28 | 
 29 |     if mask_current:
 30 |         masked_sentence = [s != pad_symbol for s in sentence]
 31 | 
 32 |     # will perform truncation if the sentence is too long
 33 |     if pad_number < 0:
 34 |         if mask_current:
 35 |             mask = masked_sentence
 36 |         else:
 37 |             mask = np.ones((max_length, ))
 38 |         res = sentence[:max_length]
 39 |         return res, mask
 40 | 
 41 |     # padding only left side
 42 |     if padding_mode == 'left':
 43 |         res = [pad_symbol] * pad_number + sentence
 44 |         if mask_current:
 45 |             mask = [0]*pad_number + masked_sentence
 46 |         else:
 47 |             mask = [0]*pad_number + [1]*len(sentence)
 48 |     # padding both sides
 49 |     elif padding_mode == 'both':
 50 |         res = [pad_symbol] * (pad_number/2) + sentence + [pad_symbol] * (pad_number/2)
 51 |         if mask_current:
 52 |             mask = [0]*(pad_number/2) + masked_sentence + [0]*(pad_number/2)
 53 |         else:
 54 |             mask = [0]*(pad_number/2) + [1]*len(sentence) + [0]*(pad_number/2)
 55 |         if pad_number % 2 == 1:
 56 |             res += [pad_symbol]
 57 |             mask += [0]
 58 |     # pad only the right side
 59 |     elif padding_mode == "right":
 60 |         res = sentence + [pad_symbol] * pad_number
 61 |         if mask_current:
 62 |             mask = masked_sentence + [0]*pad_number
 63 |         else:
 64 |             mask = [1]*len(sentence) + [0]*pad_number
 65 | 
 66 |     return res, mask
 67 | 
 68 | 
 69 | def allow_with_prob(word_count, total_words_count, subsampling_threshold=1e-5):
 70 |     """
 71 |     Sub-sampling of frequent words: can improve both accuracy and speed for large data sets
 72 |     Source: "Distributed Representations of Words and Phrases and their Compositionality".
 73 | 
 74 |     """
 75 |     freq = float(word_count) / float(total_words_count)
 76 |     removal_prob = 1.0 - np.sqrt(subsampling_threshold / freq)
 77 |     return np.random.random_sample() > removal_prob
 78 | 
 79 | 
 80 | def create_context_windows(sentence, half_window_size=0):
 81 |     """
 82 |     A generic function that either returns (center_word, window_context) or (center_word, left_context, right_context).
 83 |     To switch from first to the second mode, set half_window_size=0
 84 | 
 85 |     """
 86 |     n = len(sentence)
 87 |     for idx in range(half_window_size, n - half_window_size):
 88 |         center_word = sentence[idx]
 89 |         if half_window_size > 0:
 90 |             context = sentence[idx - half_window_size:idx] + sentence[idx + 1:idx + half_window_size + 1]
 91 |             yield (center_word, context)
 92 |         else:
 93 |             assert half_window_size == 0
 94 |             yield (center_word, sentence[0:idx], sentence[idx + 1:])
 95 | 
 96 | 
 97 | def create_continues_context_windows(sentence, special_center_words, half_window_size):
 98 |     """
 99 |     Creates windows where center words are separately marked(by using differnt vocab_ids) from context words.
100 |     This function was used for the BSG's LSTM data batcher.
101 | 
102 |     """
103 |     n = len(sentence)
104 |     for idx in range(half_window_size, n - half_window_size):
105 |         context = sentence[idx - half_window_size:idx] + sentence[idx + 1:idx + half_window_size + 1]
106 |         context_and_center = sentence[idx - half_window_size:idx] + [special_center_words[idx]] + sentence[idx + 1:idx + half_window_size + 1]
107 |         yield (sentence[idx], context, context_and_center)
108 | 
109 | 
110 | def sample_words(batch_size,  vocab_size, distr, nr_neg_samples=1):
111 |     return np.random.choice(size=(batch_size, nr_neg_samples), replace=True, a=vocab_size, p=distr)


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/cs_inferrer.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Base class for context sensitive inference modules
  3 | '''
  4 | 
  5 | import re
  6 | from nltk.stem.wordnet import WordNetLemmatizer
  7 | from jcs.data.pos import to_wordnet_pos
  8 | 
  9 | # just something to return in case not enough words were generated
 10 | default_generated_results = ['time', 'people', 'information', 'work', 'first', 'like', 'year', 'make', 'day', 'service']
 11 | 
 12 | #generated_word_re = re.compile('^[a-zA-Z]+(-[a-zA-Z]+)*$')
 13 | generated_word_re = re.compile('^[a-zA-Z]+$')
 14 | 
 15 | 
 16 | class CsInferrer(object):
 17 |     '''
 18 |     classdocs
 19 |     '''
 20 | 
 21 |     def __init__(self):
 22 |         '''
 23 |         Constructor
 24 |         '''
 25 |         self.time = [0.0, 0]
 26 |      
 27 |      
 28 |     def inference_time(self, seconds):
 29 |         self.time[0] += seconds
 30 |         self.time[1] += 1
 31 |                    
 32 |      # processing time in msec
 33 |     def msec_per_word(self):
 34 |         return 1000*self.time[0]/self.time[1] if self.time[1] > 0 else 0.0
 35 |     
 36 |     def generate_inferred(self, result_vec, target_word, target_lemma, pos):
 37 |     
 38 |         generated_results = {}
 39 |         min_weight = None
 40 |         if result_vec is not None:
 41 |             for word, weight in result_vec:
 42 |                 if generated_word_re.match(word) != None: # make sure this is not junk
 43 |                     wn_pos = to_wordnet_pos[pos]
 44 |                     lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
 45 |                     if word != target_word and lemma != target_lemma:
 46 |                         if lemma in generated_results:
 47 |                             weight = max(weight, generated_results[lemma])
 48 |                         generated_results[lemma] = weight
 49 |                         if min_weight is None:
 50 |                             min_weight = weight
 51 |                         else:
 52 |                             min_weight = min(min_weight, weight)
 53 |                             
 54 |         if min_weight is None:
 55 |             min_weight = 0.0
 56 |         i = 0.0                
 57 |         for lemma in default_generated_results:
 58 |             if len(generated_results) >= len(default_generated_results):
 59 |                 break;
 60 |             i -= 1.0
 61 |             generated_results[lemma] = min_weight + i
 62 |             
 63 |                 
 64 |         return generated_results
 65 |     
 66 |     
 67 |     
 68 |     def filter_inferred(self, result_vec, candidates, pos):
 69 |     
 70 |         filtered_results = {}
 71 |         candidates_found = set()
 72 |         # SO There is no way a composite word can appear?!
 73 |         if result_vec != None:
 74 | 
 75 |             # # TODO: this is my modification to test the difference hypothesis in our impls.
 76 |             # for word, weight in result_vec:
 77 |             #     if word in candidates:
 78 |             #         self.add_inference_result(word, weight, filtered_results, candidates_found)
 79 | 
 80 |             for word, weight in result_vec:
 81 |                 wn_pos = to_wordnet_pos[pos]
 82 |                 lemma = WordNetLemmatizer().lemmatize(word, wn_pos)
 83 |                 if lemma in candidates:
 84 |                     self.add_inference_result(lemma, weight, filtered_results, candidates_found)
 85 |                 if lemma.title() in candidates:
 86 |                     self.add_inference_result(lemma.title(), weight, filtered_results, candidates_found)
 87 |                 if word in candidates:  # there are some few cases where the candidates are not lemmatized
 88 |                     self.add_inference_result(word, weight, filtered_results, candidates_found)
 89 |                 if word.title() in candidates:  # there are some few cases where the candidates are not lemmatized
 90 |                     self.add_inference_result(word.title(), weight, filtered_results, candidates_found)
 91 |                     
 92 |         # assign negative weights for candidates with no score
 93 |         # they will appear last sorted according to their unigram count        
 94 | #        candidates_left = candidates - candidates_found
 95 | #        for candidate in candidates_left:            
 96 | #            count = self.w2counts[candidate] if candidate in self.w2counts else 1
 97 | #            score = -1 - (1.0/count) # between (-1,-2] 
 98 | #            filtered_results[candidate] = score   
 99 |          
100 |         return filtered_results
101 |     
102 |     def add_inference_result(self, token, weight, filtered_results, candidates_found):
103 |         candidates_found.add(token)
104 |         best_last_weight = filtered_results[token] if token in filtered_results else None
105 |         if best_last_weight == None or weight > best_last_weight:
106 |             filtered_results[token] = weight
107 |         
108 |     
109 |     
110 |     


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/data/word-sim/EN-WS-353-SIM.txt:
--------------------------------------------------------------------------------
  1 | tiger	cat	7.35
  2 | tiger	tiger	10.00
  3 | plane	car	5.77
  4 | train	car	6.31
  5 | television	radio	6.77
  6 | media	radio	7.42
  7 | bread	butter	6.19
  8 | cucumber	potato	5.92
  9 | doctor	nurse	7.00
 10 | professor	doctor	6.62
 11 | student	professor	6.81
 12 | smart	stupid	5.81
 13 | wood	forest	7.73
 14 | money	cash	9.15
 15 | king	queen	8.58
 16 | king	rook	5.92
 17 | bishop	rabbi	6.69
 18 | fuck	sex	9.44
 19 | football	soccer	9.03
 20 | football	basketball	6.81
 21 | football	tennis	6.63
 22 | Arafat	Jackson	2.50
 23 | physics	chemistry	7.35
 24 | vodka	gin	8.46
 25 | vodka	brandy	8.13
 26 | drink	eat	6.87
 27 | car	automobile	8.94
 28 | gem	jewel	8.96
 29 | journey	voyage	9.29
 30 | boy	lad	8.83
 31 | coast	shore	9.10
 32 | asylum	madhouse	8.87
 33 | magician	wizard	9.02
 34 | midday	noon	9.29
 35 | furnace	stove	8.79
 36 | food	fruit	7.52
 37 | bird	cock	7.10
 38 | bird	crane	7.38
 39 | food	rooster	4.42
 40 | money	dollar	8.42
 41 | money	currency	9.04
 42 | tiger	jaguar	8.00
 43 | tiger	feline	8.00
 44 | tiger	carnivore	7.08
 45 | tiger	mammal	6.85
 46 | tiger	animal	7.00
 47 | tiger	organism	4.77
 48 | tiger	fauna	5.62
 49 | psychology	psychiatry	8.08
 50 | psychology	science	6.71
 51 | psychology	discipline	5.58
 52 | planet	star	8.45
 53 | planet	moon	8.08
 54 | planet	sun	8.02
 55 | precedent	example	5.85
 56 | precedent	antecedent	6.04
 57 | cup	tableware	6.85
 58 | cup	artifact	2.92
 59 | cup	object	3.69
 60 | cup	entity	2.15
 61 | jaguar	cat	7.42
 62 | jaguar	car	7.27
 63 | mile	kilometer	8.66
 64 | skin	eye	6.22
 65 | Japanese	American	6.50
 66 | century	year	7.59
 67 | announcement	news	7.56
 68 | doctor	personnel	5.00
 69 | Harvard	Yale	8.13
 70 | hospital	infrastructure	4.63
 71 | life	death	7.88
 72 | travel	activity	5.00
 73 | type	kind	8.97
 74 | street	place	6.44
 75 | street	avenue	8.88
 76 | street	block	6.88
 77 | cell	phone	7.81
 78 | dividend	payment	7.63
 79 | calculation	computation	8.44
 80 | profit	loss	7.63
 81 | dollar	yen	7.78
 82 | dollar	buck	9.22
 83 | phone	equipment	7.13
 84 | liquid	water	7.89
 85 | marathon	sprint	7.47
 86 | seafood	food	8.34
 87 | seafood	lobster	8.70
 88 | lobster	food	7.81
 89 | lobster	wine	5.70
 90 | championship	tournament	8.36
 91 | man	woman	8.30
 92 | man	governor	5.25
 93 | murder	manslaughter	8.53
 94 | opera	performance	6.88
 95 | Mexico	Brazil	7.44
 96 | glass	metal	5.56
 97 | aluminum	metal	7.83
 98 | rock	jazz	7.59
 99 | museum	theater	7.19
100 | shower	thunderstorm	6.31
101 | monk	oracle	5.00
102 | cup	food	5.00
103 | journal	association	4.97
104 | street	children	4.94
105 | car	flight	4.94
106 | space	chemistry	4.88
107 | situation	conclusion	4.81
108 | word	similarity	4.75
109 | peace	plan	4.75
110 | consumer	energy	4.75
111 | ministry	culture	4.69
112 | smart	student	4.62
113 | investigation	effort	4.59
114 | image	surface	4.56
115 | life	term	4.50
116 | start	match	4.47
117 | computer	news	4.47
118 | board	recommendation	4.47
119 | lad	brother	4.46
120 | observation	architecture	4.38
121 | coast	hill	4.38
122 | deployment	departure	4.25
123 | benchmark	index	4.25
124 | attempt	peace	4.25
125 | consumer	confidence	4.13
126 | start	year	4.06
127 | focus	life	4.06
128 | development	issue	3.97
129 | theater	history	3.91
130 | situation	isolation	3.88
131 | profit	warning	3.88
132 | media	trading	3.88
133 | chance	credibility	3.88
134 | precedent	information	3.85
135 | architecture	century	3.78
136 | population	development	3.75
137 | stock	live	3.73
138 | peace	atmosphere	3.69
139 | morality	marriage	3.69
140 | minority	peace	3.69
141 | atmosphere	landscape	3.69
142 | report	gain	3.63
143 | music	project	3.63
144 | seven	series	3.56
145 | experience	music	3.47
146 | school	center	3.44
147 | five	month	3.38
148 | announcement	production	3.38
149 | morality	importance	3.31
150 | money	operation	3.31
151 | delay	news	3.31
152 | governor	interview	3.25
153 | practice	institution	3.19
154 | century	nation	3.16
155 | coast	forest	3.15
156 | shore	woodland	3.08
157 | drink	car	3.04
158 | president	medal	3.00
159 | prejudice	recognition	3.00
160 | viewer	serial	2.97
161 | peace	insurance	2.94
162 | Mars	water	2.94
163 | media	gain	2.88
164 | precedent	cognition	2.81
165 | announcement	effort	2.75
166 | line	insurance	2.69
167 | crane	implement	2.69
168 | drink	mother	2.65
169 | opera	industry	2.63
170 | volunteer	motto	2.56
171 | listing	proximity	2.56
172 | precedent	collection	2.50
173 | cup	article	2.40
174 | sign	recess	2.38
175 | problem	airport	2.38
176 | reason	hypertension	2.31
177 | direction	combination	2.25
178 | Wednesday	news	2.22
179 | glass	magician	2.08
180 | cemetery	woodland	2.08
181 | possibility	girl	1.94
182 | cup	substance	1.92
183 | forest	graveyard	1.85
184 | stock	egg	1.81
185 | month	hotel	1.81
186 | energy	secretary	1.81
187 | precedent	group	1.77
188 | production	hike	1.75
189 | stock	phone	1.62
190 | holy	sex	1.62
191 | stock	CD	1.31
192 | drink	ear	1.31
193 | delay	racism	1.19
194 | stock	life	0.92
195 | stock	jaguar	0.92
196 | monk	slave	0.92
197 | lad	wizard	0.92
198 | sugar	approach	0.88
199 | rooster	voyage	0.62
200 | noon	string	0.54
201 | chord	smile	0.54
202 | professor	cucumber	0.31
203 | king	cabbage	0.23
204 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/main/context_instance.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Instance in the Lexical Substitution Task dataset
  3 | 
  4 | '''
  5 | 
  6 | 
  7 | from pos import from_lst_pos
  8 | import re
  9 | 
 10 | CONTEXT_TEXT_BEGIN_INDEX = 3
 11 | TARGET_INDEX = 2
 12 | 
 13 | def encode_utf8(str):
 14 |     new_str = ""
 15 |     for c in str:
 16 |         try:
 17 |             new_c = c.__encode('utf-8')
 18 |             new_str+=new_c
 19 |         except UnicodeError:
 20 |             print "can't encode in utf-8"
 21 | 
 22 |     return new_str
 23 | 
 24 | 
 25 | 
 26 | class ContextInstance(object):
 27 | 
 28 |     def __init__(self, line):
 29 |         '''
 30 |         Constructor
 31 |         '''
 32 |         self.line = line
 33 |         tokens1 = line.split("\t")
 34 |         self.target_ind = int(tokens1[TARGET_INDEX])
 35 |         self.words = [w for w in tokens1[3].split()]
 36 |         self.target = self.words[self.target_ind]
 37 | 
 38 |         self.full_target_key = tokens1[0]
 39 |         self.pos = self.full_target_key.split('.')[-1]  # pos is last, but target_key contains the first pos
 40 |         self.target_key = '.'.join(self.full_target_key.split('.')[:2])  # remove suffix in cases of bar.n.v
 41 |         self.target_lemma = self.full_target_key.split('.')[0]       
 42 |         self.target_id = tokens1[1]
 43 |         if self.pos in from_lst_pos:
 44 |             self.pos = from_lst_pos[self.pos]
 45 | 
 46 |     # for non-parsed test data
 47 |     # TODO: modified version
 48 |     def get_neighbors(self, half_window_size):
 49 |         tokens = self.line.split()[3:]
 50 |         
 51 |         if half_window_size > 0:
 52 |             start_pos = max(self.target_ind-half_window_size, 0)
 53 |             end_pos = min(self.target_ind+half_window_size+1, len(tokens))
 54 |         else:
 55 |             start_pos = 0
 56 |             end_pos = len(tokens)
 57 | 
 58 |         left_neighbors = tokens[start_pos:self.target_ind]
 59 |         right_neighbors = tokens[self.target_ind+1:end_pos]
 60 |             
 61 |         neighbors = left_neighbors + right_neighbors
 62 |         return left_neighbors, right_neighbors
 63 | 
 64 |     # for parsed test data
 65 |     # returns all words that share a dependency label
 66 |     def get_dep_context(self, conll, target):
 67 |         id_to_word = []  # this one contains information for us to find out who point to the target
 68 |         ind_to_context_inds = {}  # it's like where our target points {word_ind =>[word_ind,...]}
 69 |         ind_to_prep_dep_inds = {}
 70 |         target_ind = None
 71 |         while True:
 72 |             line = conll.readline()
 73 |             parts = re.split(r"\t", line)
 74 |             ind, word, dep_word_ind, dep_type = int(parts[0]), parts[1], int(parts[6]), parts[7]
 75 |             if word == "<eol>":
 76 |                 conll.readline()  # to just to the next sentence
 77 |                 break
 78 |             id_to_word.append(word)
 79 | 
 80 |             # bidirectional context
 81 |             if dep_word_ind not in ind_to_context_inds:
 82 |                 ind_to_context_inds[dep_word_ind] = []
 83 |             if ind not in ind_to_context_inds:
 84 |                 ind_to_context_inds[ind] = []
 85 | 
 86 |             # we don't want to add to context prepositions
 87 |             if dep_type != "prep":
 88 |                 ind_to_context_inds[dep_word_ind].append(ind)
 89 |             if dep_type != "pobj":
 90 |                 ind_to_context_inds[ind].append(dep_word_ind)
 91 | 
 92 |             # this part is used later to collapse dependencies
 93 |             if dep_type == "prep":
 94 |                 # prep(ind) <- word(dep_word_ind)
 95 |                 if dep_word_ind not in ind_to_prep_dep_inds:
 96 |                     ind_to_prep_dep_inds[dep_word_ind] = []
 97 |                 ind_to_prep_dep_inds[dep_word_ind].append(ind)
 98 | 
 99 |             if dep_type == "pobj":
100 |                 if ind not in ind_to_prep_dep_inds:
101 |                     ind_to_prep_dep_inds[ind] = []
102 |                 ind_to_prep_dep_inds[ind].append(dep_word_ind)
103 | 
104 |             # the search is based on matching words in FIFO fashion
105 |             if not target_ind and word == target:
106 |                 target_ind = ind
107 | 
108 |         # -1 because array indices start from 0
109 |         # the last check is because we don't care about <eos>
110 |         # now we're going to collect context ids and then convert them to words
111 |         context_inds = []
112 |         # 1. grab all words ids that the target points to
113 |         if target_ind in ind_to_context_inds:
114 |             context_inds += ind_to_context_inds[target_ind]
115 | 
116 |         # 2. grab all words of prep. words
117 |         if target_ind in ind_to_prep_dep_inds:
118 |             for prep_ind in ind_to_prep_dep_inds[target_ind]:
119 |                 for w_ind in ind_to_context_inds[prep_ind]:
120 |                     if w_ind != target_ind:
121 |                         context_inds.append(w_ind)
122 |         # convert to words
123 |         context = [id_to_word[ind-1] for ind in context_inds if len(id_to_word) > ind-1] # we don't care out <eos> dep.
124 |         return context
125 | 
126 |     def decorate_context(self):
127 |         tokens = self.line.split('\t')
128 |         words = tokens[CONTEXT_TEXT_BEGIN_INDEX].split()
129 |         words[self.target_ind] = '__'+words[self.target_ind]+'__'
130 |         tokens[CONTEXT_TEXT_BEGIN_INDEX] = ' '.join(words)
131 |         return '\t'.join(tokens)+"\n"     


--------------------------------------------------------------------------------
/libraries/simulators/support.py:
--------------------------------------------------------------------------------
  1 | # functions specific for simulation
  2 | import numpy as np
  3 | import operator
  4 | import pickle
  5 | import sys
  6 | sys.setrecursionlimit(10000)
  7 | 
  8 | 
  9 | def load(file_path):
 10 |     return pickle.load(open(file_path, 'rb+'))
 11 | 
 12 | 
 13 | # returns KL divergence between two Gaussians or vMF
 14 | # KL(q||p)
 15 | # assumes [batch_size x z_dimension ] or [latent_dim, ] inputs
 16 | def KL(mu_q, sigma_q, mu_p, sigma_p, kl_type="gauss", debug=False):
 17 | 
 18 |     # adjusting dimensions
 19 |     flag = False
 20 |     if len(mu_q.shape) == 1 and len(sigma_q.shape) == 1 and len(mu_p.shape) == 1 and len(sigma_p.shape) == 1:
 21 |         mu_q = mu_q.reshape((1, -1))
 22 |         sigma_q = sigma_q.reshape((1, -1))
 23 |         mu_p = mu_p.reshape((1, -1))
 24 |         sigma_p = sigma_p.reshape((1, -1))
 25 |         flag = True
 26 | 
 27 |     if kl_type == "gauss":
 28 |         kl = KL_gauss(mu_q, sigma_q, mu_p, sigma_p, debug=debug)
 29 |     if kl_type == "vMF":
 30 |         kl = KL_vMF(mu_q, sigma_q, mu_p, sigma_p, debug=debug)
 31 |     if flag:
 32 |         kl = kl[0]
 33 |     return kl
 34 | 
 35 | 
 36 | # standard KL for Gaussians
 37 | def KL_gauss(mu_q, sigma_q, mu_p, sigma_p, debug=False):
 38 |     if sigma_q.shape[1] == 1 and sigma_p.shape[1] == 1:
 39 |         return KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=debug)
 40 |     else:
 41 |         return KL_gauss_diagonal(mu_q, sigma_q, mu_p, sigma_p, debug=debug)
 42 | 
 43 | def KL_gauss_spherical(mu_q, sigma_q, mu_p, sigma_p, debug=False, eps=1e-8):
 44 |     k = mu_q.shape[1]
 45 |     sigma_p_inv = 1./(sigma_p + eps)
 46 |     trace = k * sigma_p_inv * sigma_q
 47 |     quadr = sigma_p_inv*np.sum(((mu_p - mu_q)**2), axis=1)
 48 |     log_det_p = np.log(sigma_p + 1e-10)
 49 |     log_det_q = np.log(sigma_q + 1e-10)
 50 |     log_det = k*(log_det_p - log_det_q)
 51 |     res = 0.5 * (trace + quadr - k + log_det)
 52 | 
 53 |     if debug:
 54 |         print "trace : %s" % str(trace)
 55 |         print "quadr : %s" % str(quadr)
 56 |         print 'log_det_p : %s' % str(log_det_p)
 57 |         print 'log_det_q : %s' % str(log_det_q)
 58 |         print "log_det : %s" % str(log_det)
 59 |         print 'res : %s'% str(res)
 60 |     return res.reshape((-1, ))
 61 | 
 62 | 
 63 | def KL_gauss_diagonal(mu_q, sigma_q, mu_p, sigma_p, debug=False, eps=1e-8):
 64 |     k = mu_q.shape[1]
 65 |     sigma_p_inv = 1./(sigma_p + eps)
 66 |     trace = np.sum(sigma_p_inv * sigma_q, axis=1)
 67 |     quadr = np.sum(sigma_p_inv * ((mu_p - mu_q)**2), axis=1)
 68 | 
 69 |     log_det_p = np.sum(np.log(sigma_p + eps), axis=1)
 70 |     log_det_q = np.sum(np.log(sigma_q + eps), axis=1)
 71 | 
 72 |     log_det = log_det_p - log_det_q
 73 | 
 74 |     if debug:
 75 |         print "trace : %f" % trace
 76 |         print "quadr : %f" % quadr
 77 |         print 'log_det_p : %f' % log_det_p
 78 |         print 'log_det_q : %f' % log_det_q
 79 |         print "log_det : %f" % log_det
 80 | 
 81 |     return 0.5 * (trace + quadr - k + log_det)
 82 | 
 83 | 
 84 | # KL for vMF
 85 | def KL_vMF(mu1, kappa1, mu2, kappa2, debug=False):
 86 |     return kl_vMF(mu1, kappa1, mu2, kappa2)
 87 | 
 88 | def pad_window_size(X, desired_window_size):
 89 |     current_height = X.shape[0]
 90 |     if desired_window_size <= current_height:
 91 |         return X
 92 |     pad_height = (desired_window_size- current_height)/2
 93 |     if current_height % 2 == 1:
 94 |         return np.pad(X, [(pad_height, pad_height+1), (0, 0)], 'constant')
 95 |     else:
 96 |         return np.pad(X, [(pad_height, pad_height), (0,0)], 'constant')
 97 | 
 98 | 
 99 | 
100 | 
101 | def sigmoid(x):
102 |     return 1./(1 + np.exp(-x))
103 | 
104 | def simple_pmf(x):
105 |     return 1./(1. + x)
106 | 
107 | def relu(x):
108 |     return np.maximum(0, x)
109 | 
110 | def l2(x, axis=0):
111 |     return np.sqrt(np.sum(x**2, axis=axis))
112 | 
113 | def cosine_sim(x, y):
114 |     return float(np.sum(x*y))/float(np.sqrt(np.sum(x**2)*np.sum(y**2)))
115 | 
116 | # search_position: over what position to search KL(position 1 || ... ) or KL( ... || position 2)
117 | def argmin_score(mu_q, sigma_q, mus_and_sigmas, num=1, type="kl", search_position=1):
118 |         assert type in ["kl", "l2"]
119 |         scores = {}
120 |         for word, (mu_p, sigma_p) in mus_and_sigmas.items():
121 |             if type == "kl":
122 |                 if search_position==1:
123 |                     scores[word] = KL(mu_p, sigma_p, mu_q, sigma_q)
124 |                 else:
125 |                     scores[word] = KL(mu_q, sigma_q, mu_p, sigma_p)
126 | 
127 |             else:
128 |                 scores[word] = l2(mu_q - mu_p, axis=0)
129 |         sorted_scores = sorted(scores.items(), key=operator.itemgetter(1))
130 |         return sorted_scores[0:num]
131 | 
132 | # search_position: over what position to search KL(position 1 || ... ) or KL( ... || position 2)
133 | def closest_score(score, mu_q, sigma_q, mus_and_sigmas, num=1, type="kl", search_position=1):
134 |     assert type in ["kl"]
135 |     assert search_position in [1, 2]
136 |     scores = {}
137 | 
138 |     for word, (mu_p, sigma_p) in mus_and_sigmas.items():
139 |         if type == "kl":
140 |             if search_position == 1:
141 |                 scores[word] = abs(score - KL(mu_p, sigma_p, mu_q, sigma_q))
142 |             else:
143 |                 scores[word] = abs(score - KL(mu_q, sigma_q, mu_p, sigma_p))
144 | 
145 |     sorted_scores = sorted(scores.items(), key=operator.itemgetter(1))
146 |     return sorted_scores[0:num]
147 | 
148 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/context2vec_inferrer.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Context-sensitive inferrer based on context2vec (bidirectional lsmt)
  3 | Used in the paper:
  4 | context2vec: Learning Generic Context Embedding with Bidirectional LSTM. CoNLL, 2016.
  5 | '''
  6 | 
  7 | from cs_inferrer import CsInferrer
  8 | from jcs.jcs_io import vec_to_str
  9 | import numpy as np
 10 | 
 11 | # from context2vec.common.model_reader import ModelReader
 12 |     
 13 | #
 14 | # class Context2vecInferrer(CsInferrer):
 15 | #
 16 | #     def __init__(self, lstm_model_params_filename, ignore_target, context_math, top_inferences_to_analyze):
 17 | #
 18 | #         CsInferrer.__init__(self)
 19 | #         self.ignore_target = ignore_target
 20 | #         self.context_math = context_math
 21 | #         self.top_inferences_to_analyze = top_inferences_to_analyze
 22 | #
 23 | #         model_reader = ModelReader(lstm_model_params_filename)
 24 | #         self.context_model = model_reader.model
 25 | #         self.target_words = model_reader.w
 26 | #         self.word2index = model_reader.word2index
 27 | #         self.index2word = model_reader.index2word
 28 | #
 29 | #     def represent_target_and_context(self, lst_instance, tfo):
 30 | #
 31 | #         sent_words = lst_instance.words
 32 | #         target_ind = lst_instance.target_ind
 33 | #
 34 | #         ignore_target = self.ignore_target and len(sent_words) > 1 # if there's only the target word in the sentence then we don't ignore it...
 35 | #
 36 | #         if not ignore_target:
 37 | #             if lst_instance.target not in self.word2index:
 38 | #                 tfo.write("ERROR: %s not in word embeddings.Trying lemma.\n" % lst_instance.target)
 39 | #                 if lst_instance.target_lemma not in self.word2index:
 40 | #                         tfo.write("ERROR: lemma %s also not in word embeddings.\n" % lst_instance.target_lemma)
 41 | #                 else:
 42 | #                     sent_words[target_ind] = lst_instance.target_lemma
 43 | #
 44 | #         target_v = self.target_words[self.word2index[sent_words[target_ind]]] if not ignore_target else None
 45 | #
 46 | #         if len(sent_words) > 1:
 47 | #             context_v = self.context_model.context2vec(sent_words, target_ind)
 48 | #             context_v = context_v / np.sqrt((context_v * context_v).sum())
 49 | #         else:
 50 | #             context_v = None # just target with no context
 51 | #
 52 | #         return target_v, context_v
 53 | #
 54 | #
 55 | #     def find_inferred(self, lst_instance, tfo):
 56 | #
 57 | #         target_v, context_v = self.represent_target_and_context(lst_instance, tfo)
 58 | #
 59 | #         if target_v is not None and context_v is not None:
 60 | #
 61 | #         #This is not working very well at the moment. Requires more research.
 62 | #
 63 | #             # ZERO-TO-HALF
 64 | #             target_sim = (self.target_words.dot(target_v)+1.0)/2
 65 | #             context_sim = (self.target_words.dot(context_v)+1.0)/2
 66 | #             similarity = target_sim*context_sim
 67 | #
 68 | #             # RANKS
 69 | # #            target_sim = self.target_words.dot(target_v)
 70 | # #            context_sim = self.target_words.dot(context_v)
 71 | # #            for rank, i in enumerate(target_sim.argsort()):
 72 | # #                target_sim[i] = float(rank)
 73 | # #            for rank, i in enumerate(context_sim.argsort()):
 74 | # #                context_sim[i] = float(rank)
 75 | # #
 76 | # #            similarity = (target_sim*context_sim)/(len(target_sim)**2)
 77 | #
 78 | #             # POSITIVE SCORES
 79 | # #            target_sim = self.target_words.dot(target_v)
 80 | # #            context_sim = self.target_words.dot(context_v)
 81 | # #            target_sim[target_sim<0.0] = 0.0
 82 | # #            context_sim[context_sim<0.0] = 0.0
 83 | # #            similarity = target_sim*context_sim
 84 | #
 85 | #             # EXP
 86 | # #            target_sim = self.target_words.dot(target_v)
 87 | # #            target_sim = np.exp(target_sim)
 88 | # #            context_sim = self.target_words.dot(context_v)
 89 | # #            context_sim = np.exp(context_sim)
 90 | #
 91 | #
 92 | #             # NORMALIZE
 93 | # #            target_sim = self.target_words.dot(target_v)
 94 | # #            target_sim_mean = np.mean(target_sim)
 95 | # #            target_sim_std = np.sqrt(np.var(target_sim))
 96 | # #            target_sim = (target_sim - target_sim_mean)/target_sim_std
 97 | # ##            target_sim[target_sim<0.0] = 0.0
 98 | # #            context_sim = self.target_words.dot(context_v)
 99 | # #            context_sim_mean = np.mean(context_sim)
100 | # #            context_sim_std = np.sqrt(np.var(context_sim))
101 | # #            context_sim = (context_sim - context_sim_mean)/context_sim_std
102 | # ##            context_sim[context_sim<0.0] = 0.0
103 | # #
104 | # #            similarity = target_sim + context_sim
105 | #
106 | #         else:
107 | #             if target_v is not None:
108 | #                 similarity = (self.target_words.dot(target_v)+1.0)/2
109 | #             elif context_v is not None:
110 | #                 similarity = (self.target_words.dot(context_v)+1.0)/2
111 | #             else:
112 | #                 raise Exception("Can't find a target nor context.")
113 | #
114 | #         result_vec = sorted(zip(self.index2word, similarity), reverse=True, key=lambda x: x[1])
115 | #
116 | #         tfo.write("Top most similar embeddings: " + vec_to_str(result_vec, self.top_inferences_to_analyze) + '\n')
117 | #
118 | #         return result_vec
119 |         
120 | 
121 | 
122 | 


--------------------------------------------------------------------------------
/libraries/batch_iterators/sentence_batch_iterator.py:
--------------------------------------------------------------------------------
  1 | from support import allow_with_prob, sample_words, create_context_windows, pad_sents
  2 | from base_batch_iterator import BaseBatchIterator
  3 | from libraries.data_iterators.open_text_data_iterator import OpenTextDataIterator
  4 | from window_batch_iterator import Batch
  5 | from libraries.tools.vocabulary import PAD_TOKEN, UNK_TOKEN
  6 | import numpy as np
  7 | try:
  8 |     import re2 as re
  9 | except ImportError:
 10 |     import re
 11 | 
 12 | 
 13 | class SentenceBatchIterator(BaseBatchIterator):
 14 |     """
 15 |     Specific for LSTM based BSG iterator over batches.
 16 | 
 17 |     """
 18 | 
 19 |     def __init__(self, vocab, data_path, data_iterator, subsampling_threshold=None, batch_size=50,
 20 |                  max_sentence_length=None):
 21 |         """
 22 |         :param data_path: a path to data, can be a folder or a file path.
 23 |         :param subsampling_threshold: used in computation of words removal probability. The smaller the threshold
 24 |                                       the larger is the removal probability. In the original paper it was 1e-5.
 25 |                                       If None is passed, the subsampling will not be applied.
 26 | 
 27 |         """
 28 |         assert all([symbol in vocab for symbol in [PAD_TOKEN, UNK_TOKEN]])
 29 |         assert isinstance(data_iterator, OpenTextDataIterator)
 30 | 
 31 |         self.vocab = vocab
 32 |         self.data_path = data_path
 33 |         self.subsampling_threshold = subsampling_threshold
 34 |         self.batch_size = batch_size
 35 |         self.max_sentence_length = max_sentence_length
 36 | 
 37 |         self.data_iterator = data_iterator
 38 |         self.data_iterator.set_data_path(data_path)
 39 | 
 40 |         BaseBatchIterator.__init__(self)
 41 | 
 42 |     def load_data_batches_to_queue(self, queue):
 43 |         """
 44 |         Loads batches sequentially to a queue.
 45 | 
 46 |         """
 47 |         # create data holders(containers)
 48 |         left_context_tokens = []
 49 |         right_context_tokens = []
 50 |         center_tokens = []
 51 |         containers_current_size = 0
 52 |         max_length = 0
 53 |         for sentence, in self.data_iterator:
 54 |             # apply subsampling
 55 |             if self.subsampling_threshold:
 56 |                 sentence = [token for token in sentence if allow_with_prob(self.vocab[token].count,
 57 |                                                                            self.vocab.total_count,
 58 |                                                                            subsampling_threshold=self.subsampling_threshold)]
 59 |             # convert to word_ids
 60 |             sentence_ids = [obj.id for obj in self.vocab[sentence]]
 61 | 
 62 |             # trim the sentence
 63 |             if self.max_sentence_length:
 64 |                 sentence_ids = sentence_ids[:self.max_sentence_length]
 65 | 
 66 |             for center_token, left_context, right_context in create_context_windows(sentence_ids, half_window_size=0):
 67 |                 # add to the data holders
 68 |                 center_tokens.append(center_token)
 69 |                 left_context_tokens.append(left_context)
 70 |                 right_context_tokens.append(right_context)
 71 |                 containers_current_size += 1
 72 |                 max_length = max(max_length, len(left_context), len(right_context))
 73 |                 # return the chunk/batch when the container gets full
 74 |                 if containers_current_size >= self.batch_size:
 75 |                     batch = self.__create_batch(center_words=center_tokens, left_context=left_context_tokens,
 76 |                                                 right_context=right_context_tokens, max_length=max_length)
 77 |                     queue.put(batch)
 78 | 
 79 |                     # reset
 80 |                     left_context_tokens = []
 81 |                     right_context_tokens = []
 82 |                     center_tokens = []
 83 |                     containers_current_size = 0
 84 |                     max_length = 0
 85 | 
 86 |         # return what has been collected if iteration is finished
 87 |         if containers_current_size > 0:
 88 |             batch = self.__create_batch(center_words=center_tokens, left_context=left_context_tokens,
 89 |                                         right_context=right_context_tokens, max_length=max_length)
 90 |             queue.put(batch)
 91 |         queue.put(None)  # to indicate that loading is finished
 92 | 
 93 |     def __create_batch(self, center_words, left_context, right_context, max_length):
 94 | 
 95 |         left_context, left_mask = pad_sents(left_context, max_length, pad_symbol=self.vocab[PAD_TOKEN].id,
 96 |                                             padding_mode='left')
 97 |         right_context, right_mask = pad_sents(right_context, max_length, pad_symbol=self.vocab[PAD_TOKEN].id,
 98 |                                               padding_mode='right')
 99 | 
100 |         context = np.concatenate((left_context, right_context), axis=1)
101 |         mask = np.concatenate((left_mask, right_mask), axis=1)
102 | 
103 |         # generate negative samples
104 |         neg_context = sample_words(context.shape[0], len(self.vocab), self.vocab.uni_distr,
105 |                                    nr_neg_samples=context.shape[1])
106 |         neg_context = np.array(neg_context, dtype="int32")
107 | 
108 |         batch = Batch(pos_context_words=context, neg_context_words=neg_context, center_words=center_words, mask=mask)
109 |         return batch


--------------------------------------------------------------------------------
/libraries/evaluation/entailment/data/bench/baroni2012/data_rnd_val.tsv:
--------------------------------------------------------------------------------
  1 | conservatism	mesothelioma	False
  2 | arithmetic	discipline	True
  3 | insect	librarian	False
  4 | holly	official	False
  5 | scourge	instrument	True
  6 | odyssey	travel	True
  7 | snake	animal	True
  8 | mackerel	seafood	True
  9 | statement	ship	False
 10 | dad	relative	True
 11 | agony	adult	False
 12 | loo	room	True
 13 | velocity	rate	True
 14 | volleyball	game	True
 15 | fork	ware	True
 16 | payment	royalty	False
 17 | drummer	performer	True
 18 | pudding	food	True
 19 | bridesmaid	woman	True
 20 | tyrant	ruler	True
 21 | ore	mineral	True
 22 | sailor	privateer	False
 23 | playing	show	True
 24 | heat	ticket	False
 25 | snooker	game	True
 26 | pram	vehicle	True
 27 | trait	goose	False
 28 | privateer	sailor	True
 29 | lamb	animal	True
 30 | petroleum	payment	False
 31 | roadway	road	True
 32 | sherry	rodent	False
 33 | relative	musician	False
 34 | vertebrate	serpent	False
 35 | turbine	engine	True
 36 | yesterday	day	True
 37 | polyethylene	resin	True
 38 | consumer	drinker	False
 39 | vocalist	performer	True
 40 | lesbian	dyke	False
 41 | gymnasium	school	True
 42 | barrier	obstruction	True
 43 | psychotherapy	science	True
 44 | sack	apostle	False
 45 | turkey	animal	True
 46 | diplomat	ambassador	False
 47 | president	leader	True
 48 | dogma	doctrine	True
 49 | stag	eagle	False
 50 | sunlight	statement	False
 51 | mesothelioma	maker	False
 52 | disorder	insomnia	False
 53 | baritone	adrenaline	False
 54 | pope	leader	True
 55 | golf	sport	True
 56 | sedan	car	True
 57 | cottage	house	True
 58 | airplane	chalet	False
 59 | checklist	information	True
 60 | ape	animal	True
 61 | mesothelioma	disease	True
 62 | reelection	ache	False
 63 | hedgehog	vertebrate	True
 64 | barrier	fender	False
 65 | solid	ape	False
 66 | penguin	animal	True
 67 | sport	garment	False
 68 | conduit	writer	False
 69 | algebra	mathematics	True
 70 | diabetes	furniture	False
 71 | rig	equipment	True
 72 | travel	algebra	False
 73 | missile	weapon	True
 74 | pathologist	doctor	True
 75 | monastery	housing	True
 76 | massage	treatment	True
 77 | biotechnology	discipline	True
 78 | computer	vertebrate	False
 79 | vehicle	auto	False
 80 | trait	maker	False
 81 | whiskey	liquor	True
 82 | country	organization	True
 83 | spokesman	spokesperson	True
 84 | school	gymnasium	False
 85 | fish	shark	False
 86 | equipment	princess	False
 87 | building	instrument	False
 88 | monoclonal	protein	True
 89 | animal	vertebrate	False
 90 | grandmother	relative	True
 91 | horseman	rider	True
 92 | toad	animal	True
 93 | cheese	food	True
 94 | cynicism	feeling	True
 95 | asthma	motel	False
 96 | performer	comedian	False
 97 | castle	house	True
 98 | seaweed	alga	True
 99 | salmon	fish	True
100 | dinghy	boat	True
101 | heterogeneity	discipline	False
102 | etching	art	True
103 | grandchild	offspring	True
104 | transaction	investing	False
105 | karaoke	entertainment	True
106 | booze	drug	True
107 | ambassador	diplomat	True
108 | twelve	integer	True
109 | alcohol	fluid	True
110 | auto	trait	False
111 | passion	feeling	True
112 | section	preamble	False
113 | kindergarten	symbol	False
114 | leaf	petal	False
115 | patience	ceremony	False
116 | produce	raisin	False
117 | food	reimbursement	False
118 | collagen	shopkeeper	False
119 | kangaroo	animal	True
120 | diabetes	corridor	False
121 | dog	annals	False
122 | toast	antibiotic	False
123 | sport	badminton	False
124 | reptile	vertebrate	True
125 | garment	jean	False
126 | enzymology	biochemistry	True
127 | apprehension	bird	False
128 | contestant	building	False
129 | narrative	fairytale	False
130 | aircrew	playlist	False
131 | cow	chocolate	False
132 | tern	vertebrate	True
133 | performer	spear	False
134 | leader	statesman	False
135 | ferry	vehicle	True
136 | shark	fish	True
137 | traveler	performer	False
138 | mistress	woman	True
139 | sweetness	sensation	True
140 | fourteen	integer	True
141 | sis	relative	True
142 | slogan	saying	True
143 | fluorescence	phenomenon	True
144 | jealousy	feeling	True
145 | skating	sport	True
146 | fillet	glider	False
147 | animal	kestrel	False
148 | cost	overhead	False
149 | infusion	animal	False
150 | aristocrat	baroness	False
151 | animal	dorm	False
152 | slogan	mixture	False
153 | calf	animal	True
154 | pigeon	vertebrate	True
155 | panda	woodpecker	False
156 | edge	boundary	True
157 | organization	math	False
158 | furniture	desk	False
159 | cellulose	carbohydrate	True
160 | boat	cost	False
161 | observatory	building	True
162 | alloy	bronze	False
163 | insect	animal	True
164 | theatre	building	True
165 | instrument	voice	False
166 | apprehension	skating	False
167 | psoriasis	disease	True
168 | payment	hostess	False
169 | liquor	beverage	True
170 | supercomputer	computer	True
171 | goldfish	vertebrate	True
172 | vegetable	vehicle	False
173 | panic	fear	True
174 | feeling	nostalgia	False
175 | grandchild	drug	False
176 | aspirin	drug	True
177 | pipe	conduit	True
178 | clubhouse	building	True
179 | firm	business	True
180 | icon	disease	False
181 | monastery	building	True
182 | pathology	science	True
183 | velvet	fabric	True
184 | textbook	book	True
185 | berry	solid	True
186 | statesman	leader	True
187 | cassette	container	True
188 | robbery	crime	True
189 | endurance	reindeer	False
190 | terror	emotion	True
191 | bag	review	False
192 | ban	decree	True
193 | wheat	pain	False
194 | offspring	arithmetic	False
195 | graft	tissue	True
196 | technician	worker	True
197 | potato	food	True
198 | virus	microorganism	True
199 | captivity	internment	False
200 | uprising	conflict	True
201 | snake	chemical	False
202 | bookmark	marker	True
203 | mare	vertebrate	True
204 | cool	temperature	True
205 | margarine	food	True
206 | asp	invertebrate	False
207 | beverage	strut	False
208 | benzene	chemical	True
209 | champagne	etching	False
210 | mammal	cat	False
211 | walk	vehicle	False
212 | biplane	aircraft	True
213 | farmhouse	bargain	False
214 | deanery	house	True
215 | concert	performance	True
216 | book	fruit	False
217 | vertebrate	mare	False
218 | horse	science	False
219 | studio	workplace	True
220 | beech	artillery	False
221 | humanist	scholar	True
222 | strut	walk	True
223 | chant	music	True
224 | 


--------------------------------------------------------------------------------
/libraries/batch_iterators/window_batch_iterator.py:
--------------------------------------------------------------------------------
  1 | from support import pad_sents, allow_with_prob, sample_words, create_context_windows
  2 | from base_batch_iterator import BaseBatchIterator
  3 | from libraries.tools.vocabulary import PAD_TOKEN, UNK_TOKEN
  4 | import numpy as np
  5 | try:
  6 |     import re2 as re
  7 | except ImportError:
  8 |     import re
  9 | 
 10 | 
 11 | class Batch:
 12 |     def __init__(self, pos_context_words, neg_context_words, center_words, mask):
 13 |         self.pos_context_words = pos_context_words
 14 |         self.neg_context_words = neg_context_words
 15 |         self.center_words = center_words
 16 |         self.mask = mask
 17 | 
 18 |     def __len__(self):
 19 |         return self.pos_context_words.shape[0]
 20 | 
 21 | 
 22 | class WindowBatchIterator(BaseBatchIterator):
 23 | 
 24 |     def __init__(self, vocab, data_path, data_iterator, half_window_size=5, nr_neg_samples=5,
 25 |                  subsampling_threshold=None, batch_size=50):
 26 |         """
 27 |         :param data_path: a path to data, can be a folder or a file path.
 28 |         :param subsampling_threshold: used in computation of words removal probability. The smaller the threshold
 29 |                                       the larger is the removal probability. In the original paper it was 1e-5.
 30 |                                       If None is passed, the subsampling will not be applied.
 31 | 
 32 |         """
 33 |         assert all([symbol in vocab for symbol in [PAD_TOKEN, UNK_TOKEN]])
 34 |         self.vocab = vocab
 35 |         self.data_path = data_path
 36 |         self.half_window_size = half_window_size
 37 |         self.nr_neg_samples = nr_neg_samples
 38 |         self.subsampling_threshold = subsampling_threshold
 39 |         self.batch_size = batch_size
 40 | 
 41 |         self.data_iterator = data_iterator
 42 |         self.data_iterator.set_data_path(data_path)
 43 | 
 44 |         BaseBatchIterator.__init__(self)
 45 | 
 46 |     def load_data_batches_to_queue(self, queue):
 47 |         """
 48 |         Loads batches sequentially to a queue.
 49 | 
 50 |         """
 51 |         # create data placeholders
 52 |         pos_context_words = []
 53 |         center_words = []
 54 |         batch_current_size = 0
 55 |         max_length = 0
 56 | 
 57 |         for sentence, in self.data_iterator:
 58 | 
 59 |                 # apply subsampling
 60 |                 if self.subsampling_threshold:
 61 |                     sentence = [token for token in sentence if allow_with_prob(self.vocab[token].count,
 62 |                                                                                self.vocab.total_count,
 63 |                                                                                subsampling_threshold=self.subsampling_threshold)]
 64 |                 # convert to word_ids
 65 |                 sentence_ids = [obj.id for obj in self.vocab[sentence]]
 66 | 
 67 |                 # pad corners
 68 |                 sentence_ids = [self.vocab[PAD_TOKEN].id] * self.half_window_size + sentence_ids + \
 69 |                                [self.vocab[PAD_TOKEN].id] * self.half_window_size
 70 | 
 71 |                 # create windows
 72 |                 for center_token, context_tokens in create_context_windows(sentence_ids, self.half_window_size):
 73 |                     # add to the data holders
 74 |                     center_words.append(center_token)
 75 |                     pos_context_words.append(context_tokens)
 76 |                     batch_current_size += 1
 77 |                     max_length = max(max_length, len(context_tokens))
 78 | 
 79 |                     # return the chunk when the container gets full
 80 |                     if batch_current_size >= self.batch_size:
 81 |                         # generate negative samples
 82 |                         neg_context_words = sample_words(batch_current_size, len(self.vocab), self.vocab.uni_distr,
 83 |                                                          nr_neg_samples=self.nr_neg_samples)
 84 |                         batch = self.__create_batch(pos_context_words=pos_context_words,
 85 |                                                     neg_context_words=neg_context_words,
 86 |                                                     center_words=center_words, max_length=max_length)
 87 |                         queue.put(batch)
 88 | 
 89 |                         # reset
 90 |                         pos_context_words = []
 91 |                         center_words = []
 92 |                         batch_current_size = 0
 93 |                         max_length = 0
 94 | 
 95 |         # return what has been collected if iteration is finished
 96 |         if batch_current_size > 0:
 97 |             # generate negative samples
 98 |             neg_context_words = sample_words(batch_current_size, len(self.vocab), self.vocab.uni_distr,
 99 |                                              nr_neg_samples=self.nr_neg_samples)
100 |             batch = self.__create_batch(pos_context_words=pos_context_words,
101 |                                         neg_context_words=neg_context_words,
102 |                                         center_words=center_words, max_length=max_length)
103 |             queue.put(batch)
104 |         queue.put(None)  # to indicate that loading is finished
105 | 
106 |     def __create_batch(self, pos_context_words, neg_context_words, center_words, max_length):
107 | 
108 |         neg_context_words = np.array(neg_context_words, dtype="int32")
109 |         pos_context_words, mask = pad_sents(pos_context_words, max_length=max_length,
110 |                                             pad_symbol=self.vocab[PAD_TOKEN].id, mask_current=True)
111 |         center_words = np.array(center_words, dtype="int32")
112 | 
113 |         # convert all to numpy arrays
114 |         # neg_context_words = np.array(neg_context_words, dtype="int32")
115 |         pos_context_words = np.array(pos_context_words, dtype="int32")
116 | 
117 |         batch = Batch(pos_context_words=pos_context_words, neg_context_words=neg_context_words,
118 |                       center_words=center_words, mask=mask)
119 |         return batch


--------------------------------------------------------------------------------
/libraries/evaluation/word_sim/data/word-sim/EN-WS-353-REL.txt:
--------------------------------------------------------------------------------
  1 | computer	keyboard	7.62
  2 | Jerusalem	Israel	8.46
  3 | planet	galaxy	8.11
  4 | canyon	landscape	7.53
  5 | OPEC	country	5.63
  6 | day	summer	3.94
  7 | day	dawn	7.53
  8 | country	citizen	7.31
  9 | planet	people	5.75
 10 | environment	ecology	8.81
 11 | Maradona	football	8.62
 12 | OPEC	oil	8.59
 13 | money	bank	8.50
 14 | computer	software	8.50
 15 | law	lawyer	8.38
 16 | weather	forecast	8.34
 17 | network	hardware	8.31
 18 | nature	environment	8.31
 19 | FBI	investigation	8.31
 20 | money	wealth	8.27
 21 | psychology	Freud	8.21
 22 | news	report	8.16
 23 | war	troops	8.13
 24 | physics	proton	8.12
 25 | bank	money	8.12
 26 | stock	market	8.08
 27 | planet	constellation	8.06
 28 | credit	card	8.06
 29 | hotel	reservation	8.03
 30 | closet	clothes	8.00
 31 | soap	opera	7.94
 32 | planet	astronomer	7.94
 33 | planet	space	7.92
 34 | movie	theater	7.92
 35 | treatment	recovery	7.91
 36 | baby	mother	7.85
 37 | money	deposit	7.73
 38 | television	film	7.72
 39 | psychology	mind	7.69
 40 | game	team	7.69
 41 | admission	ticket	7.69
 42 | Jerusalem	Palestinian	7.65
 43 | Arafat	terror	7.65
 44 | boxing	round	7.61
 45 | computer	internet	7.58
 46 | money	property	7.57
 47 | tennis	racket	7.56
 48 | telephone	communication	7.50
 49 | currency	market	7.50
 50 | psychology	cognition	7.48
 51 | seafood	sea	7.47
 52 | book	paper	7.46
 53 | book	library	7.46
 54 | psychology	depression	7.42
 55 | fighting	defeating	7.41
 56 | movie	star	7.38
 57 | hundred	percent	7.38
 58 | dollar	profit	7.38
 59 | money	possession	7.29
 60 | cup	drink	7.25
 61 | psychology	health	7.23
 62 | summer	drought	7.16
 63 | investor	earning	7.13
 64 | company	stock	7.08
 65 | stroke	hospital	7.03
 66 | liability	insurance	7.03
 67 | game	victory	7.03
 68 | psychology	anxiety	7.00
 69 | game	defeat	6.97
 70 | FBI	fingerprint	6.94
 71 | money	withdrawal	6.88
 72 | psychology	fear	6.85
 73 | drug	abuse	6.85
 74 | concert	virtuoso	6.81
 75 | computer	laboratory	6.78
 76 | love	sex	6.77
 77 | problem	challenge	6.75
 78 | movie	critic	6.73
 79 | Arafat	peace	6.73
 80 | bed	closet	6.72
 81 | lawyer	evidence	6.69
 82 | fertility	egg	6.69
 83 | precedent	law	6.65
 84 | minister	party	6.63
 85 | psychology	clinic	6.58
 86 | cup	coffee	6.58
 87 | water	seepage	6.56
 88 | government	crisis	6.56
 89 | space	world	6.53
 90 | dividend	calculation	6.48
 91 | victim	emergency	6.47
 92 | luxury	car	6.47
 93 | tool	implement	6.46
 94 | competition	price	6.44
 95 | psychology	doctor	6.42
 96 | gender	equality	6.41
 97 | listing	category	6.38
 98 | video	archive	6.34
 99 | oil	stock	6.34
100 | governor	office	6.34
101 | discovery	space	6.34
102 | record	number	6.31
103 | brother	monk	6.27
104 | production	crew	6.25
105 | nature	man	6.25
106 | family	planning	6.25
107 | disaster	area	6.25
108 | food	preparation	6.22
109 | preservation	world	6.19
110 | movie	popcorn	6.19
111 | lover	quarrel	6.19
112 | game	series	6.19
113 | dollar	loss	6.09
114 | weapon	secret	6.06
115 | shower	flood	6.03
116 | registration	arrangement	6.00
117 | arrival	hotel	6.00
118 | announcement	warning	6.00
119 | game	round	5.97
120 | baseball	season	5.97
121 | drink	mouth	5.96
122 | life	lesson	5.94
123 | grocery	money	5.94
124 | energy	crisis	5.94
125 | reason	criterion	5.91
126 | equipment	maker	5.91
127 | cup	liquid	5.90
128 | deployment	withdrawal	5.88
129 | tiger	zoo	5.87
130 | journey	car	5.85
131 | money	laundering	5.65
132 | summer	nature	5.63
133 | decoration	valor	5.63
134 | Mars	scientist	5.63
135 | alcohol	chemistry	5.54
136 | disability	death	5.47
137 | change	attitude	5.44
138 | arrangement	accommodation	5.41
139 | territory	surface	5.34
140 | size	prominence	5.31
141 | exhibit	memorabilia	5.31
142 | credit	information	5.31
143 | territory	kilometer	5.28
144 | death	row	5.25
145 | doctor	liability	5.19
146 | impartiality	interest	5.16
147 | energy	laboratory	5.09
148 | secretary	senate	5.06
149 | death	inmate	5.03
150 | monk	oracle	5.00
151 | cup	food	5.00
152 | journal	association	4.97
153 | street	children	4.94
154 | car	flight	4.94
155 | space	chemistry	4.88
156 | situation	conclusion	4.81
157 | word	similarity	4.75
158 | peace	plan	4.75
159 | consumer	energy	4.75
160 | ministry	culture	4.69
161 | smart	student	4.62
162 | investigation	effort	4.59
163 | image	surface	4.56
164 | life	term	4.50
165 | start	match	4.47
166 | computer	news	4.47
167 | board	recommendation	4.47
168 | lad	brother	4.46
169 | observation	architecture	4.38
170 | coast	hill	4.38
171 | deployment	departure	4.25
172 | benchmark	index	4.25
173 | attempt	peace	4.25
174 | consumer	confidence	4.13
175 | start	year	4.06
176 | focus	life	4.06
177 | development	issue	3.97
178 | theater	history	3.91
179 | situation	isolation	3.88
180 | profit	warning	3.88
181 | media	trading	3.88
182 | chance	credibility	3.88
183 | precedent	information	3.85
184 | architecture	century	3.78
185 | population	development	3.75
186 | stock	live	3.73
187 | peace	atmosphere	3.69
188 | morality	marriage	3.69
189 | minority	peace	3.69
190 | atmosphere	landscape	3.69
191 | report	gain	3.63
192 | music	project	3.63
193 | seven	series	3.56
194 | experience	music	3.47
195 | school	center	3.44
196 | five	month	3.38
197 | announcement	production	3.38
198 | morality	importance	3.31
199 | money	operation	3.31
200 | delay	news	3.31
201 | governor	interview	3.25
202 | practice	institution	3.19
203 | century	nation	3.16
204 | coast	forest	3.15
205 | shore	woodland	3.08
206 | drink	car	3.04
207 | president	medal	3.00
208 | prejudice	recognition	3.00
209 | viewer	serial	2.97
210 | peace	insurance	2.94
211 | Mars	water	2.94
212 | media	gain	2.88
213 | precedent	cognition	2.81
214 | announcement	effort	2.75
215 | line	insurance	2.69
216 | crane	implement	2.69
217 | drink	mother	2.65
218 | opera	industry	2.63
219 | volunteer	motto	2.56
220 | listing	proximity	2.56
221 | precedent	collection	2.50
222 | cup	article	2.40
223 | sign	recess	2.38
224 | problem	airport	2.38
225 | reason	hypertension	2.31
226 | direction	combination	2.25
227 | Wednesday	news	2.22
228 | glass	magician	2.08
229 | cemetery	woodland	2.08
230 | possibility	girl	1.94
231 | cup	substance	1.92
232 | forest	graveyard	1.85
233 | stock	egg	1.81
234 | month	hotel	1.81
235 | energy	secretary	1.81
236 | precedent	group	1.77
237 | production	hike	1.75
238 | stock	phone	1.62
239 | holy	sex	1.62
240 | stock	CD	1.31
241 | drink	ear	1.31
242 | delay	racism	1.19
243 | stock	life	0.92
244 | stock	jaguar	0.92
245 | monk	slave	0.92
246 | lad	wizard	0.92
247 | sugar	approach	0.88
248 | rooster	voyage	0.62
249 | noon	string	0.54
250 | chord	smile	0.54
251 | professor	cucumber	0.31
252 | king	cabbage	0.23
253 | 


--------------------------------------------------------------------------------
/libraries/evaluation/lexsub/jcs/evaluation/lst/preprocess_lst_test.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | '''
  3 | 
  4 | # &#8211;    -
  5 | # &#8212;    -
  6 | # &#8220;    "
  7 | # &#8221;    "
  8 | # &#8216;    '
  9 | # &#8217;    '
 10 | 
 11 | # &#8211 ;    -
 12 | # &#8212 ;    -
 13 | # &#8220 ;    "
 14 | # &#8221 ;    "
 15 | # &#8216 ;    '
 16 | # &#8217 ;    '
 17 | 
 18 | # &amp    &
 19 | 
 20 | import sys
 21 | import re
 22 | 
 23 | from nltk.stem.wordnet import WordNetLemmatizer
 24 | from nltk.corpus import wordnet
 25 | from nltk.tag import pos_tag
 26 | 
 27 | 
 28 | first_quoted_re = re.compile('.*"(.*)".*')
 29 | context_re = re.compile('.*<context>(.*)</context>.*')
 30 | head_re = re.compile('.*<head>(.*)</head>.*')
 31 | 
 32 | target_prefix = '<lexelt item='
 33 | instance_prefix = '<instance id'
 34 | context_prefix = '<context>'
 35 | 
 36 | to_wordnet_pos = {'N':wordnet.NOUN,'J':wordnet.ADJ,'V':wordnet.VERB,'R':wordnet.ADV}
 37 | from_wordnet_pos = {wordnet.NOUN:'N',wordnet.ADJ:'J',wordnet.VERB:'V',wordnet.ADV:'R'}
 38 | 
 39 | 
 40 | def html_to_text(text):
 41 |     text = text.replace('&quot;', '"')
 42 |     text = text.replace('&apos;', "'")
 43 |     
 44 |     text = text.replace('&#8211;', " - ")
 45 |     text = text.replace('&#8212;', " - ")
 46 |     text = text.replace('&#8220;', ' " ')
 47 |     text = text.replace('&#8221;', ' " ')
 48 |     text = text.replace('&#8216;', " '")
 49 |     text = text.replace('&#8217;', " '")
 50 |     text = text.replace('&#150;', " ")
 51 |     
 52 |     text = text.replace('&#8211 ;', " - ")
 53 |     text = text.replace('&#8212 ;', " - ")
 54 |     text = text.replace('&#8220 ;', ' " ')
 55 |     text = text.replace('&#8221 ;', ' " ')
 56 |     text = text.replace('&#8216 ;', " '")
 57 |     text = text.replace('&#8217 ;', " '")
 58 |     
 59 |     
 60 |     
 61 |     text = text.replace('&amp;', "&")
 62 |     
 63 |     return text
 64 |     
 65 | def to_lowercase(words):
 66 |     return [word.lower() for word in words]
 67 | 
 68 | def lemmatize(pairs):
 69 |     triples = []
 70 |     for pair in pairs:
 71 |         word = pair[0]
 72 |         pos = pair[1]
 73 |         wordnet_pos = wordnet.NOUN
 74 |         if (len(pos)>=2):
 75 |             pos_prefix = pos[:2]
 76 |             if (pos_prefix in to_wordnet_pos):
 77 |                 wordnet_pos = to_wordnet_pos[pos_prefix]
 78 |         lemma = WordNetLemmatizer().lemmatize(word, wordnet_pos).lower();
 79 |         triples.append([word, wordnet_pos, lemma])
 80 |     return triples
 81 | 
 82 | def parse_context(context):
 83 |     target = head_re.match(context).group(1)
 84 |     tokens = context.split()
 85 |     target_ind = tokens.index('<head>'+target+'</head>')
 86 |     tokens[target_ind] = target
 87 |     
 88 |     return tokens, target_ind
 89 |     
 90 | 
 91 | def add_target(targets, target_with_pos, actual_target):
 92 |     wn_pos = target_with_pos.split('.')[-1]
 93 |     pos = from_wordnet_pos[wn_pos]
 94 |     targets.add(actual_target + "." + pos)
 95 | 
 96 | def is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets):
 97 |     mwe_count = 0
 98 |     for synset in synsets:
 99 |         gloss_lemmas = set([WordNetLemmatizer().lemmatize(word) for word in synset.definition.split()])
100 |         if verb_lemma in gloss_lemmas or complement_lemma in gloss_lemmas:
101 |             return False
102 |         for syn_lemma in synset.lemmas:
103 |             if syn_lemma.name != mwe: 
104 |                 tokens = syn_lemma.name.split('_')
105 |                 for token in tokens:
106 |                     if token == verb_lemma:
107 |                         return False
108 |                 if len(tokens) == 2 and tokens[1] == complement_lemma:
109 |                     return False
110 |         else:
111 |             mwe_count += syn_lemma.count()
112 |     return True   
113 |                 
114 | 
115 | def detect_mwe(text_tokens, target_ind, wordnet_pos):
116 |     if (target_ind < len(text_tokens)-1):
117 |         verb_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind], wordnet_pos)
118 |         complement_lemma = WordNetLemmatizer().lemmatize(text_tokens[target_ind+1])
119 |         mwe = '_'.join([verb_lemma, complement_lemma])
120 |         synsets = wordnet.synsets(mwe, wordnet.VERB) 
121 |         if len(synsets) > 0:
122 |             if (target_ind+1 < len(text_tokens)-1):
123 |                 mwe_right = '_'.join([WordNetLemmatizer().lemmatize(text_tokens[target_ind+1]), WordNetLemmatizer().lemmatize(text_tokens[target_ind+2])])
124 |                 if len(wordnet.synsets(mwe_right)) > 0:
125 |                     return
126 |             if is_atomic_mwe(mwe, verb_lemma, complement_lemma, synsets) == True:
127 |                 mwe = '='.join([text_tokens[target_ind], text_tokens[target_ind+1]])
128 |                 text_tokens[target_ind] = mwe
129 |                 del text_tokens[target_ind+1]
130 |     
131 |     
132 | if __name__ == '__main__':
133 |     
134 |     if (len(sys.argv) > 1):
135 |         input = open(sys.argv[1], 'r')
136 |         output = open(sys.argv[2], 'w')
137 |         detect_mwe_flag = False
138 |         if (len(sys.argv) > 4):
139 |             if sys.argv[4] == 'mwe':
140 |                 detect_mwe_flag = True        
141 |     else:
142 |         input = sys.stdin
143 |         output = sys.stdout
144 |         
145 |     targets = set()
146 |         
147 | target = None
148 | for line in input:
149 |     
150 |     line = line.strip()
151 |     if line.startswith(target_prefix):
152 |         target = first_quoted_re.match(line).group(1)        
153 |     if line.startswith(instance_prefix):
154 |         instance_id = first_quoted_re.match(line).group(1)
155 |         continue
156 |     if line.startswith(context_prefix):
157 |         context = context_re.match(line).group(1)
158 |         context = html_to_text(context)
159 |         text_tokens, target_ind = parse_context(context)
160 |         wn_pos = target.split('.')[-1]
161 |         if wn_pos == wordnet.VERB and detect_mwe_flag:
162 |             detect_mwe(text_tokens, target_ind, wordnet.VERB)
163 |         add_target(targets, target, text_tokens[target_ind])
164 |         text = ' '.join(text_tokens)
165 |         output_line = '\t'.join([target, instance_id, str(target_ind), text])
166 |         print >> output, output_line
167 |         continue
168 |     
169 | if (len(sys.argv) > 0):
170 |         input.close()
171 |         output.close()
172 |         
173 | if (len(sys.argv) > 3):
174 |     target_file = open(sys.argv[3], 'w')
175 |     for target in targets:
176 |         target_file.write(target + "\n")
177 |     target_file.close()
178 |             


--------------------------------------------------------------------------------
/interfaces/i_base.py:
--------------------------------------------------------------------------------
  1 | from libraries.utils.paths_and_files import get_subdir_number
  2 | import sys, os
  3 | from support import load, save, metrics_to_str
  4 | from libraries.tools.log import Log
  5 | from libraries.utils.other import merge_ordered_dicts
  6 | from support import infer_attributes_to_log, format_experimental_setup
  7 | from libraries.tools.ordered_attrs import OrderedAttrs
  8 | 
  9 | # a dirty hack from:
 10 | # http://stackoverflow.com/questions/24171725/scikit-learn-multicore-attributeerror-stdin-instance-has-no-attribute-close
 11 | # to avoid iPython in pycharm crushing
 12 | if not hasattr(sys.stdin, 'close'):
 13 |     def dummy_close():
 14 |         pass
 15 |     sys.stdin.close = dummy_close
 16 | 
 17 | 
 18 | class IBase(OrderedAttrs):
 19 |     """
 20 |     Base interface class that contains methods that must be implemented and the ones that can be used directly in children classes.
 21 | 
 22 |     """
 23 |     def __init__(self, model_class, vocab, train_data_path=None, val_data_path=None, test_data_path=None, epochs=5,
 24 |                  output_dir=None):
 25 |         OrderedAttrs.__init__(self)
 26 | 
 27 |         # will be assigned later on in the child class
 28 |         self.model = None
 29 |         self.init_iterator = None
 30 | 
 31 |         self.model_class = model_class
 32 |         self.vocab = vocab
 33 |         self.train_data_path = train_data_path
 34 |         self.val_data_path = val_data_path
 35 |         self.test_data_path = test_data_path
 36 |         self.epochs = epochs
 37 | 
 38 |         output_dir = output_dir if output_dir else os.path.join(os.getcwd(), 'output')
 39 |         self.output_path = os.path.join(output_dir, str(get_subdir_number(output_dir)))
 40 |         self.log = Log(self.output_path)  # will write log to a current w. dir. if not provide
 41 | 
 42 |     def init_model(self, **kwargs):
 43 |         """
 44 |         Initializes the actual model.
 45 | 
 46 |         """
 47 |         self.model = self.model_class(**kwargs)
 48 |         self.record_experimental_setup()
 49 | 
 50 |     def train_workflow(self, evaluate=True, save_model=True):
 51 |         """
 52 |         Runs a workflow of steps such as training and evaluation. One could modify it in order to create other procedures.
 53 |         :param evaluate: if True will evaluate otherwise not.
 54 | 
 55 |         """
 56 |         assert self.train_data_path
 57 | 
 58 |         for epoch in range(1, self.epochs+1):
 59 | 
 60 |             self.log.write('epoch %d' % epoch)
 61 |             self.train(data_path=self.train_data_path)
 62 | 
 63 |             # evaluate training and validation accuracy and loss
 64 |             if evaluate:
 65 |                 # FIXME: at the moment the training evaluation is disabled, as it's too expensive to perform evaluation over the whole large dataset.
 66 |                 # metrics = self._measure_performance(data_path=self.train_data_path)
 67 |                 # if metrics:
 68 |                 #     self.log.write(metrics_to_str(metrics, "training"))
 69 | 
 70 |                 if self.val_data_path:
 71 |                     metrics = self._measure_performance(data_path=self.val_data_path)
 72 |                     self.log.write(metrics_to_str(metrics, "validation"))
 73 | 
 74 |         if evaluate and self.test_data_path:
 75 |             metrics = self._measure_performance(data_path=self.test_data_path)
 76 |             self.log.write(self.log.write(metrics_to_str(metrics, "test")))
 77 | 
 78 |         # save the actual model
 79 |         if save_model:
 80 |             self.save_model(os.path.join(self.output_path, 'model.pkl'))
 81 |             self.log.write("model is saved to: %s" % self.output_path)
 82 | 
 83 |         # run post training functions
 84 |         self._post_training_logic()
 85 | 
 86 |     def train(self, data_path):
 87 |         """
 88 |         A user accessible train function that wraps the model's train function.
 89 |         :type data_path: str
 90 | 
 91 |         """
 92 |         iterator = self.init_iterator(data_path)
 93 |         for counter, batch in enumerate(iterator, 1):
 94 |             metrics = self._train(batch=batch)
 95 |             if counter % 10 == 0:
 96 |                 self.log.write(metrics_to_str(metrics, prefix="chunk's # %d" % counter))
 97 | 
 98 |     def load_model(self, model_file_path):
 99 |         """
100 |         :param model_file_path: pre-saved pkl file with a model.
101 | 
102 |         """
103 |         self.model = load(model_file_path)
104 |         self.record_experimental_setup()
105 |         self.log.write("loaded the model from: %s" % model_file_path)
106 | 
107 |     def save_model(self, file_path):
108 |         assert self.model  # the model should be initialized
109 |         save(self.model, file_path)
110 | 
111 |     def load_params(self, params_dump_file_path=None, exclude_params=[]):
112 |         """
113 |         Loads parameters from dumps or/and embeddings from a file.
114 |         :param exclude_params: an array of parameter names that should NOT be initialized.
115 | 
116 |         """
117 |         assert self.model  # the model should be initialized
118 | 
119 |         # general parameters loading
120 |         if params_dump_file_path:
121 |             init_params = self.model.load_params(file_path=params_dump_file_path, exclude_params=exclude_params)
122 |             self.log.write("loaded parameters from: %s" % params_dump_file_path)
123 |             self.log.write("initialized the following parameters: %s" % (", ".join(init_params)))
124 | 
125 |     def record_experimental_setup(self):
126 |         """
127 |         Records the experimental setup: basic and the model specific to a log file.
128 | 
129 |         """
130 |         setup = merge_ordered_dicts(infer_attributes_to_log(self.model), infer_attributes_to_log(self))
131 |         self.log.write(format_experimental_setup(setup), include_timestamp=False)
132 | 
133 |     # the following functions will be implemented in children classes
134 |     # TODO: write a basic documentation for those functions
135 | 
136 |     def _train(self, **kwargs):
137 |         """
138 |         A specific wrapper over the model's training function.
139 | 
140 |         """
141 |         raise NotImplementedError
142 | 
143 |     def _measure_performance(self, **kwargs):
144 |         """
145 |         Computes the performance of the model and returns a dictionary with names and values.
146 | 
147 |         """
148 |         raise NotImplementedError
149 | 
150 |     def _post_training_logic(self, **kwargs):
151 |         """
152 |         Logic that is desired to be executed in the train_workflow after the model has finished training.
153 | 
154 |         """
155 |         pass
156 | 


--------------------------------------------------------------------------------
/libraries/evaluation/GloVe/evaluate.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import numpy as np
  3 | import os
  4 | 
  5 | 
  6 | def glove_evaluate(vocab_file, vectors_file, bins=None, max_count=None):
  7 |     counts = {}
  8 |     words = []
  9 |     print("---------------------------------------------------------")
 10 |     print 'Reading %s' % vectors_file
 11 |     if max_count is not None:
 12 |         print "maximum frequency is %d (more freq. words are discarded)" % max_count
 13 |     with open(vocab_file, 'r') as f:
 14 |         # words = [x.rstrip().split(' ')[0] for x in f.readlines()]
 15 |         for line in f:
 16 |             word, count = line.split(' ')
 17 |             count = int(count)
 18 |             if max_count is not None and max_count < count:
 19 |                 continue
 20 |             counts[word] = count
 21 |             words.append(word)
 22 |     vocab = {w: idx for idx, w in enumerate(words)}
 23 |     with open(vectors_file, 'r') as f:
 24 |         vectors = {}
 25 |         for line in f:
 26 |             vals = line.rstrip().split(' ')
 27 |             if vals[0] in vocab:
 28 |                 vectors[vals[0]] = [float(x) for x in vals[1:]]
 29 |     if bins is not None:
 30 |         hist, bin_edges = np.histogram(counts.values(), bins=bins)
 31 | 
 32 |     vocab_size = len(vocab)
 33 |     ivocab = {idx: w for idx, w in enumerate(words)}
 34 | 
 35 |     vector_dim = len(vectors[ivocab[0]])
 36 |     W = np.zeros((vocab_size, vector_dim))
 37 |     for word, v in vectors.items():
 38 |         W[vocab[word], :] = v
 39 | 
 40 |     # normalize each word vector to unit variance
 41 |     W_norm = np.zeros(W.shape)
 42 |     d = (np.sum(W ** 2, 1) ** (0.5))
 43 |     W_norm = (W.T / d).T
 44 | 
 45 |     # print "total vocab size is %d" % len(vocab)
 46 | 
 47 | 
 48 |     if bins is None:
 49 |         evaluate_vectors(W_norm, vocab)
 50 |     else:
 51 |         for i in range(len(bin_edges)-1):
 52 | 
 53 |             if hist[i] == 0:
 54 |                 continue
 55 |             temp_vocab = {}
 56 |             temp_vectors = np.zeros((hist[i], vector_dim))
 57 |             edge1 = bin_edges[i]
 58 |             edge2 = bin_edges[i+1]
 59 |             j = 0  # current idx
 60 |             # collecting words that are between two edges
 61 |             for word, idx in vocab.items():
 62 |                 if counts[word] >= edge1 and counts[word] <= edge2:
 63 |                     temp_vocab[word] = j
 64 |                     temp_vectors[j, :]= W_norm[idx]
 65 |                     j += 1
 66 |             # run evaluation
 67 |             print("---------------------------------------------------------")
 68 |             print "bin frequency limits are [%f, %f]" % (edge1, edge2)
 69 |             print "temp vocab's size is %d " % len(temp_vectors)
 70 |             print "temp vectors size is %d" % len(temp_vocab)
 71 |             evaluate_vectors(temp_vectors, temp_vocab, short=True)
 72 | 
 73 | 
 74 | def evaluate_vectors(W, vocab, short=False):
 75 |     """Evaluate the trained word vectors on a variety of tasks"""
 76 | 
 77 |     filenames = [
 78 |         'capital-common-countries.txt', 'capital-world.txt', 'currency.txt',
 79 |         'city-in-state.txt', 'family.txt', 'gram1-adjective-to-adverb.txt',
 80 |         'gram2-opposite.txt', 'gram3-comparative.txt', 'gram4-superlative.txt',
 81 |         'gram5-present-participle.txt', 'gram6-nationality-adjective.txt',
 82 |         'gram7-past-tense.txt', 'gram8-plural.txt', 'gram9-plural-verbs.txt',
 83 |         ]
 84 |         #prefix = './eval/question-data/'
 85 |     prefix = os.path.dirname(os.path.realpath(__file__))+'/question-data'
 86 | 
 87 |     # to avoid memory overflow, could be increased/decreased
 88 |     # depending on system and vocab size
 89 |     split_size = 100
 90 | 
 91 |     correct_sem = 0; # count correct semantic questions
 92 |     correct_syn = 0; # count correct syntactic questions
 93 |     correct_tot = 0 # count correct questions
 94 |     count_sem = 0; # count all semantic questions
 95 |     count_syn = 0; # count all syntactic questions
 96 |     count_tot = 0 # count all questions
 97 |     full_count = 0 # count all questions, including those with unknown words
 98 | 
 99 |     for i in range(len(filenames)):
100 |         with open('%s/%s' % (prefix, filenames[i]), 'r') as f:
101 |             full_data = [line.rstrip().split(' ') for line in f]
102 |             full_count += len(full_data)
103 |             data = [x for x in full_data if all(word in vocab for word in x)]
104 | 
105 |         indices = np.array([[vocab[word] for word in row] for row in data])
106 |         if len(indices)==0: continue
107 |         ind1, ind2, ind3, ind4 = indices.T
108 | 
109 |         predictions = np.zeros((len(indices),))
110 |         num_iter = int(np.ceil(len(indices) / float(split_size)))
111 |         for j in range(num_iter):
112 |             subset = np.arange(j*split_size, min((j + 1)*split_size, len(ind1)))
113 |             pred_vec = (W[ind2[subset], :] - W[ind1[subset], :]
114 |                 +  W[ind3[subset], :])
115 |             #cosine similarity if input W has been normalized
116 |             dist = np.dot(W, pred_vec.T)
117 | 
118 |             for k in range(len(subset)):
119 |                 dist[ind1[subset[k]], k] = -np.Inf
120 |                 dist[ind2[subset[k]], k] = -np.Inf
121 |                 dist[ind3[subset[k]], k] = -np.Inf
122 | 
123 |             # predicted word index
124 |             predictions[subset] = np.argmax(dist, 0).flatten()
125 | 
126 |         val = (ind4 == predictions) # correct predictions
127 |         count_tot = count_tot + len(ind1)
128 |         correct_tot = correct_tot + sum(val)
129 |         if i < 5:
130 |             count_sem = count_sem + len(ind1)
131 |             correct_sem = correct_sem + sum(val)
132 |         else:
133 |             count_syn = count_syn + len(ind1)
134 |             correct_syn = correct_syn + sum(val)
135 |         if not short:
136 |             print("%s:" % filenames[i])
137 |             print('ACCURACY TOP1: %.2f%% (%d/%d)' %
138 |                 (np.mean(val) * 100, np.sum(val), len(val)))
139 | 
140 |     print('Questions seen/total: %.2f%% (%d/%d)' %
141 |         (100 * count_tot / float(full_count), count_tot, full_count))
142 |     if count_sem != 0:
143 |         print('Semantic accuracy: %.2f%%  (%i/%i)' %
144 |             (100 * correct_sem / float(count_sem), correct_sem, count_sem))
145 |     if count_syn:
146 |         print('Syntactic accuracy: %.2f%%  (%i/%i)' %
147 |             (100 * correct_syn / float(count_syn), correct_syn, count_syn))
148 |     if count_tot != 0:
149 |         print('Total accuracy: %.2f%%  (%i/%i)' % (100 * correct_tot / float(count_tot), correct_tot, count_tot))
150 | 
151 | 


--------------------------------------------------------------------------------