├── parvecs ├── __init__.py ├── common │ ├── __init__.py │ ├── embedding_text2numpy.py │ ├── util.py │ ├── vocab.py │ ├── embedding.py │ └── context_instance.py ├── eval │ ├── __init__.py │ ├── pool_lst_candidates.py │ └── coinco2txt_converter.py ├── setup │ ├── __init__.py │ ├── cluster_subvecs_concurrently.sh │ ├── count_vocab.py │ ├── mark_corpus.py │ ├── subvec_dir.py │ ├── subvecs2pmi.py │ ├── extract_reliable_subvecs.py │ ├── extract_contexts.py │ ├── wn_pseudowords_generator.py │ └── cluster_subvecs.py └── inference │ ├── __init__.py │ ├── parvec_util.py │ ├── contexts_container.py │ ├── parvec_inferrer.py │ ├── word2parvec.py │ ├── context_similarity_measures_eval.py │ └── context_collection.py ├── .gitignore └── README.md /parvecs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /parvecs/common/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /parvecs/eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /parvecs/setup/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /parvecs/inference/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | #ignore thumbnails created by windows 3 | Thumbs.db 4 | #Ignore files build by Visual Studio 5 | *.obj 6 | *.exe 7 | *.pdb 8 | *.user 9 | *.aps 10 | *.pch 11 | *.vspscc 12 | *_i.c 13 | *_p.c 14 | *.ncb 15 | *.suo 16 | *.tlb 17 | *.tlh 18 | *.bak 19 | *.cache 20 | *.ilk 21 | *.log 22 | [Bb]in 23 | [Dd]ebug*/ 24 | *.lib 25 | *.sbr 26 | obj/ 27 | [Rr]elease*/ 28 | _ReSharper*/ 29 | [Tt]est[Rr]esult* 30 | *.pyc 31 | -------------------------------------------------------------------------------- /parvecs/setup/cluster_subvecs_concurrently.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # This script is used to apply subvec clustering to multiple subvec files in a directory 3 | 4 | echo source home $1 5 | echo process num $2 6 | echo vocab $3 7 | echo cluster_num $4 8 | echo min avg cluster size $5 9 | echo cluster prunning $6 10 | echo input dir $7 11 | echo output dir $8 12 | echo number inits $9 13 | echo max iterations ${10} 14 | 15 | cd $1 16 | FILECOUNT="$(ls $7 | wc -l)" 17 | echo filecount $FILECOUNT 18 | let FPP=FILECOUNT/$2+1 19 | echo files per process $FPP 20 | COUNTER=0 21 | while [ $COUNTER -lt $FILECOUNT ]; do 22 | let from=COUNTER 23 | let to=COUNTER+FPP 24 | echo "Running: /usr/bin/python parvecs/setup/cluster_subvecs.py $3 $4 $5 $6 $7 $8 $from $to $9 ${10} &" 25 | /usr/bin/python parvecs/setup/cluster_subvecs.py $3 $4 $5 $6 $7 $8 $from $to $9 ${10} & 26 | let COUNTER=COUNTER+FPP 27 | done 28 | -------------------------------------------------------------------------------- /parvecs/common/embedding_text2numpy.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Converts the word2vec embeddings format 
to a numpy-friendly format 3 | ''' 4 | import numpy as np 5 | import sys 6 | 7 | 8 | def readVectors(path): 9 | vectors = {} 10 | with open(path) as input_f: 11 | for line in input_f.readlines(): 12 | tokens = line.strip().split(' ') 13 | vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]]) 14 | return vectors 15 | 16 | inpath = sys.argv[1] 17 | outpath = sys.argv[2] 18 | 19 | matrix = readVectors(inpath) 20 | 21 | vocab = list(matrix.keys()) 22 | vocab.sort() 23 | with open(outpath+'.vocab', 'w') as output_f: 24 | for word in vocab: 25 | print >>output_f, word, 26 | 27 | new_matrix = np.zeros(shape=(len(vocab), len(matrix[vocab[0]])), dtype=np.float32) 28 | for i, word in enumerate(vocab): 29 | new_matrix[i, :] = matrix[word] 30 | 31 | np.save(outpath+'.npy', new_matrix) -------------------------------------------------------------------------------- /parvecs/inference/parvec_util.py: -------------------------------------------------------------------------------- 1 | ''' 2 | utilities manipulating paraphrase vectors 3 | ''' 4 | import re 5 | import heapq 6 | from nltk.stem.wordnet import WordNetLemmatizer 7 | 8 | lemmatized_word_re = re.compile('^[a-zA-Z\-]+$') 9 | 10 | def parvec_lemmatize(parvec, target_pos): 11 | ''' 12 | lemmatizes a paraphrase vector 13 | :param parvec: input parvec 14 | :param target_pos: part-of-speech used for lemmatization 15 | :returns lemmatized parvec 16 | ''' 17 | 18 | lemmas = {} 19 | if parvec is not None: 20 | for word, weight in parvec: 21 | if lemmatized_word_re.match(word) != None: # filter out non-words 22 | lemma = WordNetLemmatizer().lemmatize(word, target_pos) 23 | if lemma in lemmas: 24 | weight = max(weight, lemmas[lemma]) 25 | lemmas[lemma] = weight 26 | parlemvec = sorted(lemmas.iteritems(), key=lambda x: x[1], reverse=True) 27 | return parlemvec -------------------------------------------------------------------------------- /parvecs/setup/count_vocab.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Input: Tokenized text corpus 3 | Output: Vocabulary counts 4 | ''' 5 | 6 | import sys 7 | import string 8 | from operator import itemgetter 9 | from parvecs.common.vocab import VOCAB_TOTAL_COUNT 10 | 11 | 12 | if __name__ == '__main__': 13 | 14 | if len(sys.argv)<2: 15 | print >> sys.stderr, "Usage: %s < corpus.txt" % sys.argv[0] 16 | sys.exit(1) 17 | 18 | min_count = int(sys.argv[1]) 19 | vocab = {} 20 | i = 0 21 | for line in sys.stdin: 22 | words = line.split() 23 | for word in words: 24 | if (word not in vocab): 25 | vocab[word] = 1 26 | else: 27 | vocab[word] +=1 28 | i += 1 29 | if i % 10000000 == 0: 30 | print >> sys.stderr, 'Read ' + str(i) + ' words' 31 | vocab[VOCAB_TOTAL_COUNT] = i 32 | sorted_vocab = sorted(vocab.iteritems(), key=itemgetter(1), reverse=True) 33 | for word, count in sorted_vocab: 34 | if count < min_count: 35 | break; 36 | print '\t'.join([word, str(count)]) -------------------------------------------------------------------------------- /parvecs/setup/mark_corpus.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Input: Tokenized text corpus 3 | Output: Text corpus with special words marked 4 | ''' 5 | 6 | import sys 7 | 8 | from parvecs.common.vocab import read_vocab 9 | from parvecs.common.vocab import RARE_WORD_TOKEN 10 | from parvecs.common.vocab import NUMERIC_TOKEN 11 | from parvecs.common.util import is_numeric 12 | 13 | if __name__ == '__main__': 14 | 15 | if len(sys.argv) < 2: 16 | print >> sys.stderr, 
"Usage: %s < corpus.txt" % (sys.argv[0]) 17 | sys.exit(1) 18 | 19 | 20 | vocab = read_vocab(sys.argv[1]) 21 | print >> sys.stderr, "Read vocab of size: " + str(len(vocab)) 22 | 23 | i = 0 24 | for line in sys.stdin: 25 | in_words = line.split() 26 | out_words = [] 27 | for word in in_words: 28 | # if is_numeric(word): 29 | # outword = NUMERIC_TOKEN 30 | outword = word if word in vocab else RARE_WORD_TOKEN 31 | out_words.append(outword) 32 | if len(out_words)>0: 33 | sys.stdout.write(' '.join(out_words) + '\n') 34 | i += 1 35 | if i % 1000000 == 0: 36 | print >> sys.stderr, 'Wrote ' + str(i) + ' lines' 37 | 38 | 39 | -------------------------------------------------------------------------------- /parvecs/setup/subvec_dir.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Input: context subvecs filename 3 | Output: new directory named filename.DIR. In this directory a subvecs file per each target individually) 4 | ''' 5 | 6 | import sys 7 | import os 8 | 9 | SUBVEC_DIR_SUFFIX = ".DIR" 10 | 11 | if __name__ == '__main__': 12 | 13 | if len(sys.argv) < 2: 14 | print "Usage: %s input_subvec_file" % sys.argv[0] 15 | sys.exit(1) 16 | 17 | input_subvec_filename = sys.argv[1] 18 | 19 | subvec_dirname = input_subvec_filename + SUBVEC_DIR_SUFFIX 20 | os.mkdir(subvec_dirname) 21 | 22 | input_subvec_file = open(input_subvec_filename, 'r') 23 | 24 | output_files = {} 25 | 26 | while True: 27 | line1 = input_subvec_file.readline() 28 | line2 = input_subvec_file.readline() 29 | if not line1 or not line2: 30 | break; 31 | 32 | target = line1[:line1.find('\t')] 33 | if target not in output_files: 34 | output_files[target] = open(subvec_dirname + "/" + target, 'w') 35 | 36 | output_files[target].write(line1) 37 | output_files[target].write(line2) 38 | 39 | input_subvec_file.close() 40 | for output_file in output_files.itervalues(): 41 | output_file.close() -------------------------------------------------------------------------------- /parvecs/common/util.py: -------------------------------------------------------------------------------- 1 | import re 2 | import string 3 | import heapq 4 | 5 | def asciize(line): 6 | return filter(lambda x: x in string.printable, line) 7 | 8 | def is_printable(s): 9 | return all(c in string.printable for c in s) 10 | 11 | # very crude implementation 12 | num_re = re.compile('^[\+\/\:\-,\.\d]*\d[\+\/\:\-,\.\d]*$') 13 | def is_numeric(word_str): 14 | return num_re.match(word_str) != None 15 | 16 | def wf2ws(weight): 17 | return '{0:1.5f}'.format(weight) 18 | 19 | def vec_to_str(subvec, max_n): 20 | sub_list_sorted = heapq.nlargest(max_n, subvec, key=lambda x: x[1]) 21 | sub_strs = [' '.join([word, wf2ws(weight)]) for word, weight in sub_list_sorted] 22 | return '\t'.join(sub_strs) 23 | 24 | def count_file_lines(filename): 25 | f = open(filename, 'r') 26 | lines_num = sum(1 for line in f) 27 | f.close() 28 | return lines_num 29 | 30 | class TimeRecorder(object): 31 | 32 | def __init__(self): 33 | self.time = 0.0 34 | self.iterations = 0 35 | 36 | 37 | def iteration_time(self, seconds): 38 | self.time += seconds 39 | self.iterations += 1 40 | 41 | # processing time in msec 42 | def msec_per_iteration(self): 43 | return 1000*self.time/self.iterations if self.iterations > 0 else 0.0 44 | 45 | -------------------------------------------------------------------------------- /parvecs/common/vocab.py: -------------------------------------------------------------------------------- 1 | VOCAB_TOTAL_COUNT = "" 2 | RARE_WORD_TOKEN = "" 3 | 
NUMERIC_TOKEN = "" 4 | STOPWORD_TOP_THRESHOLD = 256 5 | 6 | import sys 7 | 8 | def read_vocab(vocab_filename): 9 | vocab = {} 10 | with open(vocab_filename,'r') as f: 11 | for line in f: 12 | tokens = line.split('\t') 13 | word = tokens[0].strip() 14 | count = int(tokens[1]) 15 | vocab[word] = count 16 | return vocab 17 | 18 | def vocab_total_size(vocab): 19 | return vocab[VOCAB_TOTAL_COUNT] 20 | 21 | def load_vocabulary_w2i(vocab_filename): 22 | with open(vocab_filename) as f: 23 | vocab = [line.split('\t')[0].strip() for line in f if len(line) > 0] 24 | return dict([(a, i) for i, a in enumerate(vocab)]), vocab 25 | 26 | def load_vocabulary_counts(path): 27 | stop_words = set() 28 | counts = {} 29 | with open(path) as f: 30 | i = 0 31 | for line in f: 32 | if len(line) > 0: 33 | tokens = line.split('\t') 34 | word = tokens[0].strip() 35 | count = int(tokens[1].strip()) 36 | counts[word] = count 37 | i += 1 38 | if (i <= STOPWORD_TOP_THRESHOLD): 39 | stop_words.add(word) 40 | total_size = counts[VOCAB_TOTAL_COUNT] 41 | return counts, total_size, stop_words -------------------------------------------------------------------------------- /parvecs/setup/subvecs2pmi.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Converts substitute weights from conditional probabilities to pmi (or sppmi) 3 | ''' 4 | import sys 5 | from operator import itemgetter 6 | 7 | from parvecs.common.vocab import read_vocab 8 | from parvecs.common.vocab import vocab_total_size 9 | from parvecs.common.context_instance import ContextInstance 10 | from parvecs.common.context_instance import read_context 11 | from parvecs.common.context_instance import get_pmi_weights 12 | 13 | 14 | def write_subvec(output, subvec): 15 | for word, weight in subvec: 16 | output.write(word + " " + '{0:1.8f}'.format(weight) + "\t") 17 | output.write("\n") 18 | 19 | 20 | if __name__ == '__main__': 21 | if len(sys.argv) < 3: 22 | sys.stderr.write("Usage: %s [normalize] output\n" % sys.argv[0]) 23 | sys.exit(1) 24 | 25 | vocab = read_vocab(sys.argv[1]) 26 | total_size = vocab_total_size(vocab) 27 | pmi_shift = float(sys.argv[2]) 28 | normalize = False 29 | if len(sys.argv) > 3 and sys.argv[3] == 'normalize': 30 | normalize = True 31 | 32 | lines = 0 33 | try: 34 | while True: 35 | context_inst, subvec = read_context(sys.stdin) 36 | subvec_pmi = get_pmi_weights(subvec, vocab, total_size, pmi_shift, 0.0, normalize) 37 | sorted_subvec_pmi = sorted(subvec_pmi, key=itemgetter(1), reverse=True) 38 | sys.stdout.write(context_inst.line+'\n') 39 | write_subvec(sys.stdout,sorted_subvec_pmi) 40 | lines += 1 41 | if lines % 10000 == 0: 42 | sys.stderr.write("Read %d subvecs\n" % (lines)) 43 | except EOFError: 44 | sys.stderr.write("Finished loading %d context lines\n" % lines) 45 | 46 | 47 | -------------------------------------------------------------------------------- /parvecs/setup/extract_reliable_subvecs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Input: context subvecs file 3 | Output: only the context subvecs for which the original target word that was observed in this context appears in the subvec 4 | ''' 5 | 6 | 7 | import sys 8 | from parvecs.common.context_instance import ContextInstance 9 | 10 | if __name__ == '__main__': 11 | 12 | if len(sys.argv)<3: 13 | print "Usage: %s " % sys.argv[0] 14 | sys.exit(1) 15 | 16 | input_sub_file = open(sys.argv[1], 'r') 17 | output_sub_file = open(sys.argv[2], 'w') 18 | output_targetfreqs_file = 
open(sys.argv[3], 'w') 19 | target_freqs = {} 20 | 21 | while True: 22 | context_line = input_sub_file.readline() 23 | subs_line = input_sub_file.readline() 24 | if not context_line or not subs_line: 25 | break 26 | 27 | context_inst = ContextInstance(context_line) 28 | 29 | if context_inst.target != context_inst.target_key: 30 | sys.stderr.write("Skipping bad context: " + context_line) 31 | continue 32 | 33 | substitute_words = subs_line.split()[::2] 34 | 35 | if context_inst.target in substitute_words: 36 | output_sub_file.write(context_line) 37 | output_sub_file.write(subs_line) 38 | if context_inst.target in target_freqs: 39 | target_freqs[context_inst.target] = target_freqs[context_inst.target]+1 40 | else: 41 | target_freqs[context_inst.target] = 1 42 | 43 | for word, freq in sorted(target_freqs.iteritems(), key=lambda x: x[1], reverse=True): 44 | output_targetfreqs_file.write("%s\t%d\n" % (word, freq)) 45 | 46 | input_sub_file.close() 47 | output_sub_file.close() 48 | output_targetfreqs_file.close() 49 | 50 | -------------------------------------------------------------------------------- /parvecs/setup/extract_contexts.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Input: one-sentence-per-line tokenized text corpus and list of target words 3 | Output: contexts of target words 4 | ''' 5 | 6 | import sys 7 | from operator import itemgetter 8 | import string 9 | 10 | MAX_WORDS_IN_LINE = 128 11 | MAX_CHARS_IN_LINE = 1024 12 | 13 | 14 | if __name__ == '__main__': 15 | 16 | if len(sys.argv) < 6: 17 | print >> sys.stderr, "Usage: %s " % (sys.argv[0]) 18 | sys.exit(1) 19 | 20 | corpus_file = sys.argv[1] 21 | targets_file = sys.argv[2] 22 | max_freq = int(sys.argv[3]) 23 | contexts_file = sys.argv[4] 24 | targets_freq_file = sys.argv[5] 25 | 26 | targets = {} 27 | with open(targets_file,'r') as tf: 28 | for line in tf: 29 | word = line.split('\t')[0].strip() 30 | targets[word] = 0 31 | print >> sys.stderr, "Read %d targets " % (len(targets)) 32 | 33 | cf = open(corpus_file,'r') 34 | mf = open(contexts_file , 'w') 35 | 36 | i = 0 37 | full_targets = 0 38 | for line in cf: 39 | if len(line) < MAX_CHARS_IN_LINE: 40 | stripped_line = line.strip() 41 | sent_words = stripped_line.split() 42 | if len(sent_words) <= MAX_WORDS_IN_LINE: 43 | for ind, word in enumerate(sent_words): 44 | if (word in targets and targets[word] < max_freq): 45 | mf.write('\t'.join([word, str(i), str(ind), stripped_line])+'\n') 46 | targets[word] += 1 47 | if targets[word] == max_freq: 48 | full_targets += 1 49 | i += 1 50 | if i % 1000000 == 0: 51 | print >> sys.stderr, 'Read ' + str(i) + ' lines' 52 | if (full_targets == len(targets)): 53 | break 54 | 55 | cf.close() 56 | mf.close() 57 | 58 | with open(targets_freq_file, 'w') as tff: 59 | for target, freq in sorted(targets.iteritems(), key=itemgetter(1), reverse=True): 60 | tff.write(target + '\t' + str(freq) + '\n') -------------------------------------------------------------------------------- /parvecs/inference/contexts_container.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A container of context collections of different target words 3 | ''' 4 | 5 | from parvecs.inference.context_collection import ContextCollection 6 | from parvecs.common.util import count_file_lines 7 | 8 | 9 | class ContextsContainer(): 10 | 11 | 12 | def __init__(self, args, w2i, i2w, w2counts, sum_word_counts, stopwords, embeddings): 13 | 14 | self.args = args 15 | self.container = {} 16 | 
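# Lazy per-target cache: maps a target word to its ContextCollection, loaded on
# first request from the per-target subvec file <args.contexts_dir>/<target>
# (see load_target_contexts below). Callers are expected to clear() the cache
# when moving on to a new target key to bound memory use, as word2parvec.run_app
# does for test files sorted by target key.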
self.w2i = w2i 17 | self.i2w = i2w 18 | self.w2counts = w2counts 19 | self.sum_word_counts = sum_word_counts 20 | self.stopwords = stopwords 21 | self.embeddings = embeddings 22 | 23 | 24 | def get_target_contexts(self, target): 25 | ''' 26 | :param target: target word 27 | :returns: context collection for target word 28 | ''' 29 | try: 30 | if target not in self.container: 31 | self.load_target_contexts(target) 32 | return self.container[target] 33 | except IOError as e: 34 | return None 35 | 36 | 37 | def load_target_contexts(self, target): 38 | ''' 39 | load into memory the contexts of target 40 | :param target: target word 41 | ''' 42 | target_filename = self.args.contexts_dir+"/"+target 43 | collection_size = count_file_lines(target_filename)/2 # subvec every two lines 44 | target_subfile = open(target_filename, 'r') 45 | self.container[target] = ContextCollection(self.args, self.i2w, self.w2i, collection_size, self.w2counts, self.sum_word_counts, self.stopwords, self.embeddings) 46 | self.container[target].load_contexts(target_subfile) 47 | if len(self.container[target].contexts) != collection_size: 48 | raise EOFError('context collection size mismatch in target %s. collection_size %d len(contexts) %d' % (target, collection_size, len(self.container[target].contexts))) 49 | self.container[target].tocsr() 50 | target_subfile.close() 51 | 52 | 53 | def clear(self): 54 | ''' 55 | clear memory of container 56 | ''' 57 | self.container = {} 58 | 59 | -------------------------------------------------------------------------------- /parvecs/eval/pool_lst_candidates.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Used to pool all substitute candidates for every target type in a lexical substitution dataset 3 | ''' 4 | import sys 5 | 6 | if __name__ == '__main__': 7 | 8 | if len(sys.argv)<3: 9 | print "Usage: %s [no-mwe]" % sys.argv[0] 10 | sys.exit(1) 11 | 12 | goldfile = open(sys.argv[1], 'r') 13 | outfile = open(sys.argv[2], 'w') 14 | 15 | ignore_mwe = False 16 | if (len(sys.argv) > 3): 17 | sys.stderr.write("ignoring multi-word-expressions\n"); 18 | ignore_mwe = True 19 | 20 | good_oneword_inst = 0 21 | target2candidates = {} 22 | # bright.a 5 :: intelligent 3;clever 2;most able 1;capable 1;promising 1;sharp 1;motivated 1; 23 | for line in goldfile: 24 | if len(line)>0: 25 | oneword_in_line = 0 # e.g. 
;most able 1; 26 | segments = line.split("::") 27 | if len(segments)>=2: 28 | target = segments[0][:segments[0].strip().rfind(' ')] 29 | target = '.'.join(target.split('.')[:2]) # remove suffix in cases of bar.n.v 30 | line_candidates = segments[1].strip().split(';') 31 | for candidate_count in line_candidates: 32 | if len(candidate_count) > 0: 33 | delimiter_ind = candidate_count.rfind(' ') 34 | candidate = candidate_count[:delimiter_ind] 35 | if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)): 36 | continue 37 | oneword_in_line += 1 38 | if target in target2candidates: 39 | candidates = target2candidates[target] 40 | else: 41 | candidates = set() 42 | target2candidates[target] = candidates 43 | candidates.add(candidate) 44 | if (oneword_in_line >= 1): 45 | good_oneword_inst += 1 46 | 47 | if ignore_mwe: 48 | sys.stderr.write("After discarding MWE, there are %d instances with at least one substitute.\n" % (good_oneword_inst)) 49 | for target, candidates in target2candidates.iteritems(): 50 | outfile.write(target + '::' + ';'.join(list(candidates)) + '\n') 51 | 52 | goldfile.close() 53 | outfile.close() 54 | -------------------------------------------------------------------------------- /parvecs/common/embedding.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Dense vector representations for words 3 | ''' 4 | 5 | import numpy as np 6 | import heapq 7 | import math 8 | import time 9 | 10 | class Embedding: 11 | 12 | def __init__(self, path): 13 | self.m = self.normalize(np.load(path + '.npy')) 14 | self.dim = self.m.shape[1] 15 | self.wi, self.iw = self.read_vocab(path + '.vocab') 16 | 17 | def zeros(self): 18 | return np.zeros(self.dim) 19 | 20 | def dimension(self): 21 | return self.dim 22 | 23 | def normalize(self, m): 24 | norm = np.sqrt(np.sum(m*m, axis=1)) 25 | norm[norm==0] = 1 26 | return m / norm[:, np.newaxis] 27 | 28 | def read_vocab(self, path): 29 | vocab = [] 30 | with open(path) as f: 31 | for line in f: 32 | vocab.extend(line.strip().split()) 33 | return dict([(w, i) for i, w in enumerate(vocab)]), vocab 34 | 35 | def __contains__(self, w): 36 | return w in self.wi 37 | 38 | def represent(self, w): 39 | return self.m[self.wi[w], :] 40 | 41 | def scores(self, vec): 42 | return np.dot(self.m, vec) 43 | 44 | def positive_scores(self, vec): 45 | return (np.dot(self.m, vec)+1)/2 46 | 47 | def top_scores(self, scores, n=10): 48 | if n <= 0: 49 | n = len(scores) 50 | return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1]) 51 | 52 | def closest(self, w, n=10): 53 | scores = np.dot(self.m, self.represent(w)) 54 | return self.top_scores(scores,n) 55 | 56 | def closest_with_time(self, w, n=10): 57 | start = time.time() 58 | scores = np.dot(self.m, self.represent(w)) 59 | end = time.time() 60 | # print "\nDeltatime: %f msec\n" % ((end-start)*1000) 61 | return self.top_scores(scores,n), end-start 62 | 63 | def closest_vec(self, wordvec, n=10): 64 | scores = np.dot(self.m, wordvec) 65 | return self.top_scores(scores,n) 66 | 67 | def closest_vec_filtered(self, wordvec, vocab, n=10): 68 | scores = np.dot(self.m, wordvec) 69 | if n <= 0: 70 | n = len(scores) 71 | scores_words = zip(self.iw, scores) 72 | for i in xrange(0,len(scores_words)): 73 | if not scores_words[i][1] in vocab: 74 | scores_words[i] = (-1, scores_words[i][0]) 75 | return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1]) 76 | 77 | def closest_prefix(self, w, prefix, n=10): 78 | scores = np.dot(self.m, self.represent(w)) 79 
| scores_words = zip(self.iw, scores) 80 | for i in xrange(0,len(scores_words)): 81 | if not scores_words[i][1].startswith(prefix): 82 | scores_words[i] = (-1, scores_words[i][0]) 83 | return heapq.nlargest(n, scores_words, key=lambda x: x[1]) 84 | 85 | def closest_filtered(self, w, vocab, n=10): 86 | scores = np.dot(self.m, self.represent(w)) 87 | scores_words = zip(self.iw, scores) 88 | for i in xrange(0,len(scores_words)): 89 | if not scores_words[i][1] in vocab: 90 | scores_words[i] = (-1, scores_words[i][0]) 91 | return heapq.nlargest(n, scores_words, key=lambda x: x[1]) 92 | 93 | def similarity(self, w1, w2): 94 | return self.represent(w1).dot(self.represent(w2)) 95 | 96 | def norm_vec(vec): 97 | length = 1.0 * math.sqrt(sum(val ** 2 for val in vec)) 98 | return [val/length for val in vec] 99 | 100 | def score2string(score): 101 | return score[1] + "\t" + '{0:1.3f}'.format(score[0]) 102 | 103 | 104 | def closest_sym_scores(targets, subs, w, n): 105 | w_target_vec = targets.represent(w) 106 | w_sub_vec = subs.represent(w) 107 | w2subs = subs.closest_vec(w_target_vec,0) 108 | w2subs2w = [] 109 | for entry in w2subs: 110 | score = (entry[0]+1)/2 111 | sub = entry[1] 112 | sub_target_vec = targets.represent(sub) 113 | rev_score = (np.dot(sub_target_vec, w_sub_vec)+1)/2 114 | w2subs2w.append((math.sqrt(score * rev_score), sub)) 115 | return heapq.nlargest(n, w2subs2w) 116 | 117 | 118 | -------------------------------------------------------------------------------- /parvecs/setup/wn_pseudowords_generator.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Generates pseudowords using Wordnet 3 | ''' 4 | import sys 5 | import random 6 | import re 7 | from nltk.corpus import wordnet as wn 8 | from nltk.stem.porter import PorterStemmer 9 | from nltk.stem.wordnet import WordNetLemmatizer 10 | 11 | from parvecs.common.vocab import load_vocabulary_counts 12 | 13 | DEBUG = False 14 | MIN_FREQ = 1000 # words representing pseudo-senses need to have this min frequency in the learning corpus 15 | 16 | word_re = re.compile('^[a-z][a-z]+$') 17 | pos_list = set([wn.NOUN, wn.VERB,wn.ADJ, wn.ADV]) 18 | 19 | 20 | def sample_target_word(vocab_counts, stop_words, min_freq): 21 | ''' 22 | Create a single pseudoword (randomly sampled from vocab) 23 | :param vocab_counts: 24 | :param stop_words: 25 | :param min_freq: minimum required corpus frequency of word 26 | ''' 27 | 28 | accum_counts = [] 29 | n = 0 30 | accum_counts.append((None, 0)) 31 | for word, count in vocab_counts.iteritems(): 32 | if count >= min_freq and word not in stop_words: 33 | n += count 34 | accum_counts.append((word, n)) 35 | max_count = n 36 | 37 | while True: 38 | rnd = random.randint(0, max_count) 39 | for k in xrange(1,len(accum_counts)): 40 | sampled_word = accum_counts[k][0] 41 | if len(wn.synsets(sampled_word))>1 and word_re.match(sampled_word) != None: 42 | if rnd < accum_counts[k][1] and rnd >= accum_counts[k-1][1]: 43 | return sampled_word 44 | 45 | print "Failed to sample target word" 46 | sys.exit(1) 47 | 48 | 49 | if __name__ == '__main__': 50 | 51 | if len(sys.argv) < 5: 52 | print "usage: %s []" % (sys.argv[0]) 53 | sys.exit(1) 54 | 55 | stemmer = PorterStemmer() 56 | 57 | vocab_file = sys.argv[1] 58 | words_num = int(sys.argv[2]) 59 | words2senses_file = open(sys.argv[3], 'w') 60 | senses_file = open(sys.argv[4], 'w') 61 | if len(sys.argv) > 5: 62 | min_freq = int(sys.argv[5]) 63 | else: 64 | min_freq = 1000 65 | 66 | vocab_counts, ignore, stop_words = 
load_vocabulary_counts(vocab_file) 67 | 68 | words = set() 69 | all_words = set() 70 | 71 | while len(words) < words_num: 72 | while True: 73 | word = sample_target_word(vocab_counts, stop_words, min_freq) 74 | if word not in words: 75 | break; 76 | word_synsets = wn.synsets(word) 77 | if DEBUG: print "Word: [%s] Number of senses: %s" % (word, str(len(word_synsets))) 78 | 79 | senses = set() 80 | for word_synset in word_synsets: 81 | if DEBUG: print "\tsynset: %s" % word_synset 82 | pos = word_synset.pos() 83 | if pos in pos_list: 84 | sense = None 85 | smallest_sense_num_found = sys.maxint 86 | for lemma in word_synset.lemmas(): 87 | if DEBUG: print "\t " + lemma.name(), len(wn.synsets(lemma.name())) 88 | if (stemmer.stem(lemma.name()) != stemmer.stem(word)) and \ 89 | WordNetLemmatizer().lemmatize(lemma.name(), pos) != WordNetLemmatizer().lemmatize(word, pos) and \ 90 | (lemma.name().islower()) and (lemma.name() in vocab_counts) and (vocab_counts[lemma.name()]>=min_freq) and \ 91 | (lemma.name() not in stop_words) and (word_re.match(lemma.name()) != None) and \ 92 | (len(wn.synsets(lemma.name())) < smallest_sense_num_found): # we look for the lemma with least number of senses, i.e. hopefully least ambiguous 93 | sense = lemma.name() 94 | smallest_sense_num_found = len(wn.synsets(lemma.name())) 95 | if sense != None: 96 | if DEBUG: print "\tChosen sense word: %s %d\n" % (sense, smallest_sense_num_found) 97 | senses.add(sense) 98 | else: 99 | if DEBUG: print "\tDidn't find any suitable sense word. Skipping.\n" 100 | if len(senses) > 1: 101 | all_words.update(senses) 102 | sys.stdout.write(word + ':\t' + ' '.join(senses)+'\n') 103 | words2senses_file.write(word + '\t' + ' '.join(senses)+'\n') 104 | words.add(word) 105 | 106 | for pword in all_words: 107 | senses_file.write(pword+"\n") 108 | 109 | words2senses_file.close() 110 | senses_file.close() 111 | -------------------------------------------------------------------------------- /parvecs/eval/coinco2txt_converter.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Used to convert the coinco (Kremer 2014) xml dataset format to a flat format. 3 | 4 | Example of coinco format: 5 | 6 | 7 | 8 | 9 | 10 | 11 | A mission to end a war 12 | 13 | 14 | AUSTIN, Texas -- Tom Karnes was dialing for destiny, but not everyone wanted to cooperate. 
15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | ''' 39 | 40 | import sys 41 | import string 42 | from xml.etree import ElementTree 43 | 44 | to_wordnet_pos = {'N':'n','J':'a','V':'v','R':'r'} 45 | 46 | def is_printable(s): 47 | return all(c in string.printable for c in s) 48 | 49 | 50 | def clean_token(token): 51 | 52 | token = token.replace('"', '"') 53 | token = token.replace(''', "'") 54 | token = token.replace(chr(int("85",16)), "...") 55 | token = token.replace(chr(int("91",16)), "'") 56 | token = token.replace(chr(int("92",16)), "'") 57 | token = token.replace(chr(int("93",16)), '"') 58 | token = token.replace(chr(int("94",16)), '"') 59 | token = token.replace(chr(int("96",16)), '-') 60 | if not is_printable(token): 61 | sys.stderr.write('TOKEN NOT PRINTABLE: '+''.join([str(c) for c in token if c in string.printable ]) + '\n') 62 | return "" 63 | else: 64 | return token 65 | 66 | def subs2text(subs_element): 67 | subs = [(int(sub.attrib.get('freq')), clean_token(sub.attrib.get('lemma')).replace(';', ',')) for sub in subs_element.iter('subst')] # sub.attrib.get('lemma').replace(';', ',') is used to fix a three cases in coinco where the lemma includes erroneously the char ';'. Since this char is used as a delimiter, we replace it with ','. 68 | sorted_subs = sorted(subs, reverse=True) 69 | return ';'.join([sub + " " + str(freq) for freq, sub in sorted_subs])+';' 70 | 71 | if __name__ == '__main__': 72 | 73 | if len(sys.argv) < 4: 74 | print "Usage: %s " % sys.argv[0] 75 | sys.exit(1) 76 | 77 | with open(sys.argv[1], 'r') as f: 78 | coinco = ElementTree.parse(f) 79 | 80 | test_file = open(sys.argv[2], 'w') 81 | gold_file = open(sys.argv[3], 'w') 82 | 83 | sent_num = 0 84 | tokens_num = 0 85 | 86 | for sent in coinco.iter('sent'): 87 | sent_num += 1 88 | tokens = sent.find('tokens') 89 | sent_text = "" 90 | for token in tokens.iter('token'): 91 | sent_text = sent_text + clean_token(token.attrib.get('wordform')) + " " 92 | sent_text = sent_text.strip() 93 | tok_position = -1 94 | for token in tokens.iter('token'): 95 | tok_position += 1 96 | if token.attrib.get('id') != 'XXX' and token.attrib.get('problematic') == 'no': 97 | tokens_num += 1 98 | try: 99 | target_key = clean_token(token.attrib.get('lemma')) + '.' + to_wordnet_pos[token.attrib.get('posMASC')[0]] 100 | test_file.write("%s\t%s\t%d\t%s\n" % (target_key, token.attrib.get('id'), tok_position, sent_text)) 101 | gold_file.write("%s %s :: %s\n" % (target_key, token.attrib.get('id'), subs2text(token.find('substitutions')))) 102 | except UnicodeEncodeError as e: 103 | sys.stderr.write("ENCODING TARGET ERROR at token_id %s. 
%s\n" % (token.attrib.get('id'),e)) 104 | sys.exit(1) 105 | 106 | test_file.close() 107 | gold_file.close() 108 | 109 | print 'Read %d sentences %d target tokens' % (sent_num, tokens_num) 110 | 111 | 112 | 113 | -------------------------------------------------------------------------------- /parvecs/common/context_instance.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Represents the given context of a target word instance 3 | ''' 4 | 5 | CONTEXT_TEXT_BEGIN_INDEX = 3 6 | 7 | import math 8 | 9 | class ContextInstance(object): 10 | 11 | def __init__(self, line): 12 | ''' 13 | Constructor 14 | 15 | Example line: 16 | bright.a 1 13 during the siege , george robertson had appointed shuja-ul-mulk , who was a bright boy 17 | ''' 18 | self.line = line 19 | tokens1 = line.split("\t") 20 | self.target_key = tokens1[0] 21 | self.target_id = tokens1[1] 22 | self.target_ind = int(tokens1[2]) 23 | self.target = tokens1[3].split()[self.target_ind] 24 | pos_delimiter_ind = self.target_key.rfind('.') 25 | if pos_delimiter_ind > 0 and pos_delimiter_ind == len(self.target_key)-2: 26 | self.partofspeech = self.target_key[pos_delimiter_ind+1:] 27 | else: 28 | self.partofspeech = None 29 | 30 | 31 | def get_context_tokens(self): 32 | ''' 33 | :returns: a list of the text tokens 34 | ''' 35 | all_tokens = self.line.split() 36 | return all_tokens[CONTEXT_TEXT_BEGIN_INDEX:] 37 | 38 | 39 | 40 | def get_neighbors(self, window_size): 41 | ''' 42 | Get the neighbors of a target word 43 | :param window_size: neighbors window size 44 | :returns: a list of neighbors 45 | ''' 46 | tokens = self.line.split()[3:] 47 | 48 | if (window_size > 0): 49 | start_pos = max(self.target_ind-window_size, 0) 50 | end_pos = min(self.target_ind+window_size+1, len(tokens)) 51 | else: 52 | start_pos = 0 53 | end_pos = len(tokens) 54 | 55 | neighbors = tokens[start_pos:self.target_ind] + tokens[self.target_ind+1:end_pos] 56 | return neighbors 57 | 58 | 59 | def decorate_context(self): 60 | ''' 61 | :returns the context text line with target word highlighted 62 | ''' 63 | tokens = self.line.split('\t') 64 | words = tokens[CONTEXT_TEXT_BEGIN_INDEX].split() 65 | words[self.target_ind] = '__'+words[self.target_ind]+'__' 66 | tokens[CONTEXT_TEXT_BEGIN_INDEX] = ' '.join(words) 67 | return '\t'.join(tokens) 68 | 69 | 70 | def read_context(subfile, maxlen=None): 71 | ''' 72 | Reads a context and substitute vector from file 73 | :param subfile: 74 | :param maxlen: 75 | :returns context instance, subvec 76 | ''' 77 | context_line = subfile.readline() 78 | subvecs_line = subfile.readline() 79 | if not context_line or not subvecs_line: 80 | raise EOFError 81 | 82 | context_inst = ContextInstance(context_line.strip()) 83 | subvecs_line = subvecs_line.strip() 84 | subvec = [__extract_word_weight(pair) for pair in subvecs_line.split("\t")[:maxlen]] if len(subvecs_line) > 0 else [] 85 | 86 | return context_inst, subvec 87 | 88 | 89 | 90 | def get_pmi_weights(subvec, w2counts, sum_counts, shift, threshold, normalize=False): 91 | ''' 92 | Converts a subvec with conditional probability weights to pmi (or sppmi) weights 93 | Also performs the functionality of remove_out_of_vocab 94 | :param subvec: 95 | :param w2counts: 96 | :param sum_counts: 97 | :param shift: 98 | :param threshold: 99 | :param normalize: 100 | :returns: subvec with pmi weights 101 | ''' 102 | subvec_pmi = [] 103 | norm = 0 104 | for word, prob in subvec: 105 | if prob != 0.0 and word in w2counts: 106 | pmi = math.log(prob * sum_counts / 
w2counts[word])-shift 107 | if pmi>threshold: 108 | subvec_pmi.append((word, pmi)) 109 | norm += pmi**2 110 | 111 | if normalize: 112 | norm = norm**0.5 113 | for i in xrange(0,len(subvec_pmi)): 114 | subvec_pmi[i] = (subvec_pmi[i][0], subvec_pmi[i][1] / norm) 115 | 116 | return subvec_pmi 117 | 118 | def remove_out_of_vocab(subvec, w2counts): 119 | ''' 120 | Removes entries from subvec that are out of the vocabulary 121 | :param subvec: 122 | :param w2counts: 123 | :returns: subvec in vocab 124 | ''' 125 | subvec_vocab = [] 126 | for word, prob in subvec: 127 | if prob != 0.0 and word in w2counts: 128 | subvec_vocab.append((word, prob)) 129 | return subvec_vocab 130 | 131 | 132 | def normalize_subvec(subvec): 133 | ''' 134 | normalizes subvec weights in L2 135 | :param subvec: 136 | :returns: normalized subvec 137 | ''' 138 | norm = 0.0 139 | for word, weight in subvec: 140 | norm += weight**2 141 | norm = norm**0.5 142 | for i in xrange(0,len(subvec)): 143 | subvec[i] = (subvec[i][0], subvec[i][1] / norm) 144 | 145 | 146 | def __extract_word_weight(pair): 147 | tokens = pair.split(' ') 148 | return tokens[0], float(tokens[1]) 149 | 150 | -------------------------------------------------------------------------------- /parvecs/inference/parvec_inferrer.py: -------------------------------------------------------------------------------- 1 | ''' 2 | ParvecInferrer generates a paraphrase vector for a target word in a given context 3 | 4 | ''' 5 | import heapq 6 | import time 7 | 8 | from parvecs.common.vocab import load_vocabulary_w2i 9 | from parvecs.common.vocab import load_vocabulary_counts 10 | from parvecs.common.context_instance import get_pmi_weights 11 | from parvecs.common.context_instance import remove_out_of_vocab 12 | from parvecs.common.util import wf2ws 13 | from parvecs.common.util import vec_to_str 14 | from parvecs.common.util import TimeRecorder 15 | from parvecs.common.embedding import Embedding 16 | from parvecs.inference.contexts_container import ContextsContainer 17 | 18 | 19 | class ParvecInferrer(): 20 | 21 | 22 | def __init__(self, args): 23 | 24 | self.args = args 25 | self.bow_interpolate = self.args.bow_interpolate 26 | self.w2i, self.i2w = load_vocabulary_w2i(args.vocabfile) 27 | self.w2counts, self.sum_word_counts, self.stopwords = load_vocabulary_counts(args.vocabfile) 28 | if args.use_stopwords == False: 29 | self.stopwords = {} 30 | print "Vocab size: " + str(len(self.w2i)) 31 | 32 | if args.embeddingpath != None: 33 | embeddings = Embedding(args.embeddingpath) 34 | print "Read embeddings from " + args.embeddingpath 35 | else: 36 | embeddings = None 37 | 38 | self.context_container = ContextsContainer(args, self.w2i, self.i2w, self.w2counts, self.sum_word_counts, self.stopwords, embeddings) 39 | self.time_recorder = TimeRecorder() 40 | 41 | 42 | def clear(self): 43 | ''' 44 | Clears the contexts cache 45 | ''' 46 | self.context_container.clear() 47 | 48 | 49 | def infer_parvec(self, subvec, context_instance, tfo): 50 | ''' 51 | generate the paraphrase vector 52 | :param orig_subvec: subvec of instance 53 | :param context_instance: context instance 54 | :param tfo: output file 55 | :returns: parvec 56 | ''' 57 | 58 | subvec = self.__preprocess_subvec(subvec, context_instance, tfo) 59 | 60 | if (self.args.debug == True): 61 | tfo.write("\nUsing weightsfactor %s\n" % ('{0:1.1f}'.format(self.args.weightsfactor))) 62 | 63 | target_contexts = self.context_container.get_target_contexts(context_instance.target) 64 | 65 | if target_contexts is not None: 66 | 67 | 
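# Core of the inference step, roughly: reference_context() scores the given
# context's subvec against every stored subvec of this target (optionally
# interpolating a bag-of-words / embedding-average similarity, controlled by
# -bowinter / -cbow), and avg_contexts() then averages the subvecs of the most
# similar contexts (-top / -toppercent), each weighted by its similarity raised
# to -weightsfactor. That weighted average is the paraphrase vector. Both
# methods are implemented in ContextCollection (context_collection.py).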
start1 = time.time() 68 | subvec_matrix = target_contexts.reference_context(subvec, context_instance, self.bow_interpolate) 69 | end1 = time.time() 70 | 71 | if (self.args.debug == True) and (self.bow_interpolate > 0): 72 | tfo.write("\nUsed BOW similarity. bow_interpolate = %f\n\n" % self.bow_interpolate) 73 | 74 | max_len = self.args.debugtop if self.args.debug == True else len(subvec) 75 | trimmed_sorted_subvec = heapq.nlargest(max_len, subvec, key=lambda t: t[1]) 76 | tfo.write("SUBVEC\t" + '\t'.join([' '.join([word, wf2ws(weight)]) for (word, weight) in trimmed_sorted_subvec])+'\n') 77 | 78 | start2 = time.time() 79 | result_vec, contexts_num = target_contexts.avg_contexts(subvec_matrix, self.args.top, self.args.top_percent, self.args.parvec_maxlen, self.args.excluderef, self.args.weightsfactor) 80 | end2 = time.time() 81 | 82 | deltatime = (end1-start1) + (end2-start2) 83 | self.time_recorder.iteration_time(deltatime) 84 | 85 | if (self.args.debug == True): 86 | tfo.write("\nDeltatime: %f msec\n" % (deltatime*1000)) 87 | tfo.write("\nTop similar contexts:\n") 88 | tfo.write("**************************\n") 89 | tfo.write(target_contexts.to_str(min(self.args.debugtop,contexts_num) , self.args.debugtop)+"\n\n") 90 | 91 | if (self.args.debug == True): 92 | if (result_vec is not None): 93 | tfo.write("Avg of top " + str(contexts_num) + " contexts: " + vec_to_str(result_vec, self.args.debugtop) + '\n') 94 | else: 95 | tfo.write("Avg of top " + str(contexts_num) + " contexts: None\n") 96 | tfo.write("*****************************************\n\n") 97 | else: 98 | if (self.args.debug == True): 99 | tfo.write("\nNo subvecs found for target [%s], using only reference subvec.\n" % context_instance.target) 100 | tfo.write("SUBVEC\t" + '\t'.join([' '.join([word, wf2ws(weight)]) for (word, weight) in subvec])+'\n') 101 | result_vec = subvec 102 | 103 | return result_vec 104 | 105 | 106 | def msec_per_word(self): 107 | ''' 108 | returns: mean net processing time per parvec generation 109 | ''' 110 | return self.time_recorder.msec_per_iteration() 111 | 112 | 113 | def __preprocess_subvec(self, subvec, context_instance, tfo): 114 | if (self.args.pmi == True): 115 | subvec = get_pmi_weights(subvec, self.w2counts, self.sum_word_counts, self.args.pmioffset, self.args.pmithreshold) 116 | else: 117 | subvec = remove_out_of_vocab(subvec, self.w2counts) 118 | return sorted(subvec, reverse=True, key=lambda x: x[1]) 119 | 120 | -------------------------------------------------------------------------------- /parvecs/setup/cluster_subvecs.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Created on 22.9.14 3 | 4 | @author: user 5 | ''' 6 | from parvecs.common.context_instance import read_context 7 | from parvecs.common.context_instance import normalize_subvec 8 | from parvecs.common.vocab import load_vocabulary_w2i 9 | 10 | import sys 11 | import os 12 | import os.path 13 | from operator import itemgetter 14 | import numpy as np 15 | from scipy.sparse.dok import dok_matrix 16 | 17 | from sklearn.cluster import KMeans 18 | 19 | 20 | def normalize_centroids(centroids): 21 | for j in xrange(0,len(centroids)): 22 | norm = (np.dot(centroids[j,:],centroids[j,:]))**0.5 23 | if norm > 0: 24 | centroids[j,:] /= norm 25 | 26 | 27 | def cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter, min_avg_cluster_size, subvec_filename, cluster_filename): 28 | ''' 29 | kmeans clustering of subvecs given in an input file 30 | :param w2i: word2index 31 | :param 
cluster_prunning: max size of a cluster centroid 32 | :param K: number of clusters 33 | :param ninit: number of repeating tries 34 | :param maxiter: number of clustering iterations 35 | :param min_avg_cluster_size: min size of clusters (on average) 36 | :param subvec_filename: input filename 37 | :param cluster_filename: output filename 38 | :returns: None 39 | ''' 40 | 41 | if os.path.exists(cluster_filename): 42 | print "NOTICE: cluster file %s already exists. skipping." % cluster_filename 43 | return 44 | 45 | subvec_file = open(subvec_filename, 'r') 46 | subvec_num = sum(1 for line in subvec_file)/2 #subvec is on every second line 47 | subvec_file.seek(0) 48 | 49 | minK = min(subvec_num/min_avg_cluster_size, K) 50 | minK = max(1, minK) 51 | 52 | cluster_file = open(cluster_filename, 'w') 53 | print "Clustering subvecs in file %s. Using K=%d\n" % (cluster_filename, minK) 54 | 55 | target = subvec_filename[subvec_filename.rfind('/')+1:] 56 | subs_matrix = dok_matrix((subvec_num, len(w2i)), dtype=np.float32) 57 | 58 | line = 0 59 | try: 60 | while True: 61 | context_inst, subvec = read_context(subvec_file) 62 | normalize_subvec(subvec) 63 | for word, weight in subvec: 64 | if (weight != 0): 65 | subs_matrix[line, w2i[word]] = weight 66 | line += 1 67 | if line % 10000 == 0: 68 | sys.stderr.write("Read %d subvecs\n" % (line)) 69 | except EOFError: 70 | sys.stderr.write("Finished loading %d context lines\n" % line) 71 | 72 | subs_matrix = subs_matrix.tocsr() 73 | 74 | best_centroids = None 75 | best_inertia = None 76 | 77 | for init_iter in xrange(0, ninit): 78 | 79 | kmeans = KMeans(init='k-means++', n_clusters=minK, n_init=1, max_iter=1) 80 | kmeans.fit(subs_matrix) 81 | centroids = kmeans.cluster_centers_ 82 | normalize_centroids(centroids) 83 | for iter in xrange(1,maxiter): 84 | kmeans = KMeans(init=centroids, n_clusters=minK, n_init=1, max_iter=1) 85 | kmeans.fit(subs_matrix) 86 | centroids = kmeans.cluster_centers_ 87 | normalize_centroids(centroids) 88 | inertia = kmeans.inertia_ 89 | 90 | if best_centroids is None or inertia < best_inertia: 91 | best_inertia = inertia 92 | best_centroids = centroids 93 | 94 | for j in xrange(0,len(best_centroids)): 95 | cluster_vec = [(i2w[i], weight) for (i, weight) in enumerate(best_centroids[j,:]) if weight != 0] 96 | cluster_vec = sorted(cluster_vec, key=itemgetter(1), reverse=True)[:cluster_prunning] 97 | norm = sum([weight**2 for word, weight in cluster_vec])**0.5 98 | cluster_vec = [(word, weight/norm) for word, weight in cluster_vec] 99 | norm = sum([weight**2 for word, weight in cluster_vec])**0.5 100 | cluster_file.write(target + "\t" + str(j) + "\t0\t" + target + "\tCLUSTER\t norm verified = " + '{0:1.8f}'.format(norm) + "\tpruning factor = " + str(cluster_prunning) +"\n") 101 | for (word, weight) in cluster_vec: 102 | cluster_file.write(' '.join([word, '{0:1.8f}'.format(weight)])+'\t') 103 | cluster_file.write('\n') 104 | 105 | subvec_file.close() 106 | cluster_file.close() 107 | 108 | 109 | if __name__ == '__main__': 110 | 111 | if len(sys.argv) < 9: 112 | sys.stderr.write("Usage: %s [n_init] [max_iter]\n" % sys.argv[0]) 113 | sys.exit(1) 114 | 115 | vocab_filename = sys.argv[1] 116 | K = int(sys.argv[2]) 117 | min_avg_cluster_size = int(sys.argv[3]) 118 | cluster_prunning = int(sys.argv[4]) 119 | input_dirname = sys.argv[5] 120 | output_dirname = sys.argv[6] 121 | from_file = int(sys.argv[7]) 122 | to_file = int(sys.argv[8]) 123 | 124 | if from_file == 0: 125 | from_file = None 126 | if to_file == 0: 127 | to_file = None 128 | w2i, 
i2w = load_vocabulary_w2i(vocab_filename) 129 | 130 | ninit=1 131 | maxiter=30 132 | if len(sys.argv) > 9: 133 | ninit = int(sys.argv[9]) 134 | if len(sys.argv) > 10: 135 | maxiter = int(sys.argv[10]) 136 | sys.stderr.write("K=%d, n_init=%d, max_iter=%d\n" % (K, ninit, maxiter)) 137 | 138 | if not os.path.exists(output_dirname): 139 | os.makedirs(output_dirname) 140 | 141 | filenames = sorted(os.listdir(input_dirname))[from_file:to_file] 142 | 143 | for filename in filenames: 144 | input_filepath = '/'.join([input_dirname, filename]) 145 | output_filepath = '/'.join([output_dirname, filename]) 146 | cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter, min_avg_cluster_size, input_filepath, output_filepath) 147 | 148 | 149 | -------------------------------------------------------------------------------- /parvecs/inference/word2parvec.py: -------------------------------------------------------------------------------- 1 | ''' 2 | word2parvec application 3 | converts words in contexts to paraphrase vectors representations 4 | ''' 5 | import sys 6 | import time 7 | import argparse 8 | import numpy 9 | 10 | from parvecs.inference.parvec_inferrer import ParvecInferrer 11 | from parvecs.inference.parvec_util import parvec_lemmatize 12 | from parvecs.common.util import vec_to_str 13 | from parvecs.common.context_instance import read_context 14 | 15 | 16 | def run_app(args, inferrer): 17 | ''' 18 | Runs the application 19 | :param args: all app arguments 20 | :param inferrer: the parvec inferrer that is to be used 21 | :returns: None 22 | ''' 23 | 24 | testfile = open(args.testfile, 'r') 25 | resultsfile = open(args.resultsfile, 'w') 26 | 27 | lines = 0 28 | last_target_key = None 29 | while True: 30 | 31 | try: 32 | context_instance, subvec = read_context(testfile, args.subvec_maxlen) 33 | except EOFError: 34 | break 35 | 36 | lines += 1 37 | if (args.debug == True): 38 | resultsfile.write("\nTest context:\n") 39 | resultsfile.write("=====================\n") 40 | 41 | resultsfile.write("INSTANCE\t" + context_instance.decorate_context()+'\n') 42 | 43 | # Assuming testfile is sorted according to target key - clear container memory every time we move to a new key target word 44 | if context_instance.target_key != last_target_key: 45 | inferrer.clear() 46 | last_target_key = context_instance.target_key 47 | 48 | result_vec = inferrer.infer_parvec(subvec, context_instance, resultsfile) 49 | 50 | max_vec_len = args.debugtop if args.debug == True else args.parvec_maxlen 51 | if (args.debug == True): 52 | resultsfile.write("Paraphrase vector\n") 53 | resultsfile.write("***************\n") 54 | resultsfile.write("PARVEC\t" + vec_to_str(result_vec, max_vec_len)+"\n") 55 | 56 | if (args.lemmatize == True): 57 | result_vec_lemmatized = parvec_lemmatize(result_vec, context_instance.partofspeech) 58 | if (args.debug == True): 59 | resultsfile.write("Lemmatized paraphrase vector\n") 60 | resultsfile.write("***************\n") 61 | resultsfile.write("PARLEMVEC\t" + vec_to_str(result_vec_lemmatized, max_vec_len)+"\n") 62 | 63 | if lines % 100 == 0: 64 | print "Read %d lines" % lines 65 | 66 | print "Read %d word instances in total" % lines 67 | print "Net processing time for computing the paraphrase vectors per each word instance: %f msec" % inferrer.msec_per_word() 68 | testfile.close() 69 | resultsfile.close() 70 | 71 | 72 | def run(args): 73 | ''' 74 | Initialize inferrer and run app 75 | :param args: 76 | ''' 77 | 78 | print "Initializing" 79 | print time.asctime(time.localtime(time.time())) 80 | 81 
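# Illustrative invocation only -- the file names below are placeholders and the
# numeric settings are arbitrary; see the argparse definitions at the bottom of
# this file for the full option list:
#
#   python parvecs/inference/word2parvec.py \
#       -vocabfile corpus.vocab \
#       -contexts_dir subvecs.pmi.DIR \
#       -testfile test_instances.subvecs \
#       -resultsfile test_instances.parvecs \
#       --pmi -pmioffset 2.0 -top 20 --lemmatize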
| inferrer = ParvecInferrer(args) 82 | print "Running" 83 | print time.asctime(time.localtime(time.time())) 84 | 85 | run_app(args, inferrer) 86 | print "Finished" 87 | print time.asctime(time.localtime(time.time())) 88 | 89 | 90 | 91 | if __name__ == '__main__': 92 | 93 | parser = argparse.ArgumentParser(description='Parvec App') 94 | 95 | parser.add_argument('--debug',action='store_true',dest='debug') 96 | parser.add_argument('-debugtop', action="store", dest="debugtop", type=int, default=10, help="Top number of vector entries to print in debug mode.") 97 | 98 | parser.add_argument('-contexts_dir', action="store", dest="contexts_dir", default=None) 99 | parser.add_argument('-vocabfile', action="store", dest="vocabfile", default=None) 100 | parser.add_argument('-testfile', action="store", dest="testfile", default=None) 101 | parser.add_argument('-resultsfile', action="store", dest="resultsfile", default=None) 102 | 103 | parser.add_argument('--lemmatize', action="store_true", dest="lemmatize", default=False, help="Lemmatize output paraphrase vectors.") 104 | parser.add_argument('-parvec_maxlen', action="store", dest="parvec_maxlen", type=int, default=100, help="Max num of paraphrases in each output parvec.") 105 | parser.add_argument('-subvec_maxlen', action="store", dest="subvec_maxlen", type=int, default=None, help="Max num of substitutes read per subvec.") 106 | parser.add_argument('-top', action="store", dest="top", type=int, default=0, help="Num of top most similar contexts to consider for each given context. 0 means all context.") 107 | parser.add_argument('-toppercent', action="store", dest="top_percent", type=float, default=0.0, help="Percent of top contexts to consider. Param 'top' is considered as min number to consider in any case. 0 means all contexts.") 108 | parser.add_argument('-weightsfactor',action='store',dest='weightsfactor', type=float, default=1.0, help="Context similarity weights power factor.") 109 | parser.add_argument('--excluderef',action='store_true',dest='excluderef', default=False, help="Exclude reference (given) context from context averaging.") 110 | 111 | parser.add_argument('--pmi',action='store_true',dest='pmi', default=False, help="Convert conditional probability substitute weights in input files to pmi (or spmmi) weights).") 112 | parser.add_argument('-pmioffset',action='store',dest='pmioffset', type=float, default=0.0, help='pmi=pmi-offset') 113 | parser.add_argument('-pmithreshold',action='store',dest='pmithreshold', type=float, default=0.0, help='pmi=0 if pmi<=threshold') 114 | 115 | parser.add_argument('-bow',action='store',dest='bow_size', default=-1, type=int, help="Context bag-of-words window size used for computing context sim. -1 means bow is not used, 0 means entire sentence.") 116 | parser.add_argument('-bowinter',action='store',dest='bow_interpolate', default=0.0, type=float, help="Interpolation factor between bow and subvec context sims. 
0 means only consider subvec similarity.") 117 | parser.add_argument('-cbow',action='store',dest='embeddingpath', default=None, help="Use continuous bow (embeddings avg) instead of bow") 118 | 119 | parser.add_argument('--tfidf',action='store_true',dest='tfidf', default=False, help="Use tfidf weighting in bow.") 120 | parser.add_argument('-tfidfoffset',action='store',dest='tfidf_offset', type=float, default=0.0, help='tfidf=tfidf-offset') 121 | parser.add_argument('-tfidfthreshold',action='store',dest='tfidf_threshold', type=float, default=0.0, help='tfidf=0 if tfidf<=threshold') 122 | parser.add_argument('--nostopwords',action='store_false',dest='use_stopwords', default=True) 123 | 124 | 125 | 126 | if len(sys.argv)==1: 127 | print parser.print_help(sys.stdout) 128 | else: 129 | args = parser.parse_args(sys.argv[1:]) 130 | config_file_name = args.resultsfile + ".CONFIG" 131 | cf = open(config_file_name, 'w') 132 | cf.write(' '.join(sys.argv)+'\n') 133 | cf.close() 134 | numpy.seterr(all='raise', divide='raise', over='raise', under='raise', invalid='raise') 135 | run(args) 136 | 137 | -------------------------------------------------------------------------------- /parvecs/inference/context_similarity_measures_eval.py: -------------------------------------------------------------------------------- 1 | ''' 2 | This application is used to evaluate context similarity measures using pseudowords. 3 | ''' 4 | import sys 5 | import argparse 6 | from random import Random 7 | import numpy 8 | 9 | from parvecs.common.vocab import load_vocabulary_w2i 10 | from parvecs.common.vocab import load_vocabulary_counts 11 | from parvecs.inference.context_collection import ContextCollection 12 | from parvecs.common.embedding import Embedding 13 | from parvecs.common.util import count_file_lines 14 | 15 | 16 | def read_pseudo_words(pseudos_filename): 17 | ''' 18 | Reads pseudo words from file 19 | :param pseudos_filename: 20 | :returns: mapping of each pseudoword to a list of pseudosense 21 | ''' 22 | words = [] 23 | f = open(pseudos_filename, 'r') 24 | for line in f: 25 | word = line[:line.find('\t')] 26 | pseudos = line.split()[1:] 27 | words.append((word, pseudos)) 28 | f.close() 29 | 30 | return words 31 | 32 | def evaluate_word(word, collection, seeded_random, results_file): 33 | ''' 34 | Evaluate context similarity measures on a given pseudoword experiment 35 | :param word: the pseudoword used to perform the experiment 36 | :param collection: the contexts of the pseudoword 37 | :param seeded_random: 38 | :param results_file: 39 | :returns: evaluation results 40 | ''' 41 | 42 | m_precision_at_1 = 0.0 # mean precision@1 43 | m_top_precision = 0.0 # mean precision@top 44 | m_avg_precision = 0.0 # mean average precision 45 | for i in xrange(0,args.sample_num): 46 | precision_at_1, top_precision, avg_precision, debug_str = collection.evaluate_context_similarity(seeded_random, args.random_sim) 47 | m_precision_at_1 += precision_at_1 48 | m_top_precision += top_precision 49 | m_avg_precision += avg_precision 50 | if args.debug: 51 | results_file.write("\n" + debug_str + "\n") 52 | results_file.write("%d: p@1: %f \t p@top: %f \t avg_p: %f\n" % (i, precision_at_1, top_precision, avg_precision)) 53 | 54 | m_precision_at_1 /= args.sample_num 55 | m_top_precision /= args.sample_num 56 | m_avg_precision /= args.sample_num 57 | 58 | print "Mean over all samples for word [%s]: m_p@1: %f \t m_p@top :%f \t m_avg_p: %f\n" % (word, m_precision_at_1, m_top_precision, m_avg_precision) 59 | if args.debug: 60 | 
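# Interpretation of the three scores (inferred from the variable names and the
# pseudoword setup; the exact definitions live in
# ContextCollection.evaluate_context_similarity in context_collection.py):
# a reference context of the pseudoword is sampled, the remaining contexts are
# ranked by the chosen context similarity measure, contexts drawn from the same
# underlying pseudo-sense are treated as the correct ones, and precision@1,
# precision@top and average precision are computed for that ranking.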
results_file.write("\nMean over all samples for word [%s]: m_p@1: %f \t m_p@top: %f \t m_avg_p: %f\n\n" % (word, m_precision_at_1, m_top_precision, m_avg_precision)) 61 | else: 62 | results_file.write("%s\t%f\t%f\t%f\n" % (word, m_precision_at_1, m_top_precision, m_avg_precision)) 63 | results_file.flush() 64 | return m_precision_at_1, m_top_precision, m_avg_precision 65 | 66 | 67 | def add_pseudo_word_to_vocab(i2w, w2i, w2counts, word, pseudo_senses): 68 | ''' 69 | Add pseudo word to vocabulary 70 | :param i2w: 71 | :param w2i: 72 | :param w2counts: 73 | :param word: 74 | :param pseudo_senses: 75 | :returns: the label of the pseudoword in the vocabulary 76 | ''' 77 | label = word+'='+'+'.join(pseudo_senses) 78 | i2w.append(label) 79 | w2i[label] = len(i2w)-1 80 | count = 0 81 | for word in pseudo_senses: 82 | count += w2counts[word] 83 | w2counts[label] = count 84 | return label 85 | 86 | def run(args): 87 | ''' 88 | Run application 89 | :param args: 90 | ''' 91 | 92 | w2i, i2w = load_vocabulary_w2i(args.vocabfile) 93 | w2counts, sum_word_counts, stopwords = load_vocabulary_counts(args.vocabfile) 94 | print "Vocab size: " + str(len(w2i)) 95 | 96 | if args.embeddingpath != None: 97 | embeddings = Embedding(args.embeddingpath) 98 | print "Read embeddings from " + args.embeddingpath 99 | else: 100 | embeddings = None 101 | 102 | words = read_pseudo_words(args.pseudos_file) 103 | 104 | results_file = open(args.resultsfile,'w') 105 | 106 | mm_precision_at_1 = 0.0 # mean mean precision@1 107 | mm_top_precision = 0.0 # mean mean precision@top 108 | mm_avg_precision = 0.0 # mean mean average precision 109 | seeded_random = Random() 110 | for word in words: 111 | 112 | word_name = word[0] 113 | 114 | # word_seed = word_name+' '+' '.join(word[1]) 115 | # the 'star' is used for backward compatibility with previous experiments 116 | word_name_star = word_name+'.*' 117 | word_star = [pseudo+'.*' for pseudo in word[1]] 118 | word_seed = word_name_star+' '+' '.join(word_star) 119 | 120 | seeded_random.seed(word_seed) # we want the same random numbers when repeating experiments with different params etc. 
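# Build one merged collection for this pseudoword: the context/subvec files of
# all its pseudo-senses (one file per sense under args.contexts_dir) are loaded
# into a single ContextCollection, registered in the vocabulary under the merged
# label produced by add_pseudo_word_to_vocab (e.g. "word=sense1+sense2", whose
# count is the sum of the sense counts). The context similarity measures are
# then evaluated on samples drawn from this collection.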
121 | senses = word[1] 122 | 123 | collection_size = 0 124 | for target in senses: 125 | target_filename = args.contexts_dir+"/"+target 126 | collection_size += count_file_lines(target_filename)/2 # subvec every two lines 127 | 128 | pseudos_label = add_pseudo_word_to_vocab(i2w, w2i, w2counts, word_name, senses) 129 | collection = ContextCollection(args, i2w, w2i, collection_size, w2counts, sum_word_counts, stopwords, embeddings) 130 | 131 | if args.debug: 132 | results_file.write("Reading word for word_name [%s]\n" % word_name) 133 | for target in senses: 134 | target_filename = args.contexts_dir+"/"+target 135 | target_subfile = open(target_filename, 'r') 136 | lines_num = collection.load_contexts(target_subfile, set(senses), pseudos_label, tocsr_flag=False) 137 | if args.debug: 138 | results_file.write("Read %d contexts for pseudo [%s]\n" % (lines_num, target)) 139 | target_subfile.close() 140 | collection.tocsr() 141 | m_precision_at_1, m_top_precision, m_avg_precision = evaluate_word(word_name, collection, seeded_random, results_file) 142 | 143 | mm_precision_at_1 += m_precision_at_1 144 | mm_top_precision += m_top_precision 145 | mm_avg_precision += m_avg_precision 146 | 147 | mm_precision_at_1 /= len(words) 148 | mm_top_precision /= len(words) 149 | mm_avg_precision /= len(words) 150 | 151 | results_file.write("TOTAL\t%f\t%f\t%f\n" % (mm_precision_at_1, mm_top_precision, mm_avg_precision)) 152 | 153 | if args.debug: 154 | results_file.write("#WORDS\t%d\n" % len(words)) 155 | results_file.write("MM_P1\t%f\n" % (mm_precision_at_1)) 156 | results_file.write("MM_PTOP\t%f\n" % (mm_top_precision)) 157 | results_file.write("MM_AVG\t%f\n" % (mm_avg_precision)) 158 | 159 | results_file.close() 160 | 161 | 162 | 163 | 164 | 165 | if __name__ == '__main__': 166 | 167 | parser = argparse.ArgumentParser(description='Context similarity measures app') 168 | 169 | parser.add_argument('--debug',action='store_true',dest='debug', default=False) 170 | 171 | parser.add_argument('-samplenum', action="store", dest="sample_num", type=int, default=None, help="number of samples from each pseudowords collection") 172 | parser.add_argument('-pseudosfile', action="store", dest="pseudos_file", default=None) 173 | parser.add_argument('-contexts_dir', action="store", dest="contexts_dir", default=None) 174 | parser.add_argument('-vocabfile', action="store", dest="vocabfile") 175 | parser.add_argument('-resultsfile', action="store", dest="resultsfile") 176 | parser.add_argument('-embeddingpath', action="store", dest="embeddingpath", default=None, help="prefix to files containing word embeddings") 177 | 178 | 179 | parser.add_argument('-top', action="store", dest="top", type=int, help="num of top contexts to consider") 180 | parser.add_argument('-toppercent', action="store", dest="top_percent", type=float, default=0.0, help="percent of top contexts to consider. 
When using this, top num is considered as min number to consider") 181 | parser.add_argument('-subvec_maxlen', action="store", dest="subvec_maxlen", type=int, default=None, help="max num of substitutes read per subvec") 182 | 183 | parser.add_argument('--randomsim',action='store_true',dest='random_sim', default=False, help='similarity measure returns zero for all context pairs') 184 | parser.add_argument('--pmi',action='store_true',dest='pmi', default=False) 185 | parser.add_argument('-pmioffset',action='store',dest='pmioffset', type=float, default=0.0, help='pmi=pmi-offset') 186 | parser.add_argument('-pmithreshold',action='store',dest='pmithreshold', type=float, default=0.0, help='pmi=0 if pmi<=threshold') 187 | 188 | parser.add_argument('--tfidf',action='store_true',dest='tfidf', default=False) 189 | parser.add_argument('-tfidfoffset',action='store',dest='tfidf_offset', type=float, default=0.0, help='tfidf=tfidf-offset') 190 | parser.add_argument('-tfidfthreshold',action='store',dest='tfidf_threshold', type=float, default=0.0, help='tfidf=0 if tfidf<=threshold') 191 | 192 | 193 | parser.add_argument('-weightsfactor',action='store',dest='weightsfactor', type=float, default=1.0, help="context similarity measure power factor") 194 | parser.add_argument('-bow',action='store',dest='bow_size', default=-1, type=int, help="context bag-of-words window size for context cosine sim. -1 means bow not used, 0 means entire sentence") 195 | parser.add_argument('-bowinter',action='store',dest='bow_interpolate', default=0, type=float, help="interpolation factor between bow and subvec sims. 0 means no bow, -1 means doing backoff instead of interpolation.") 196 | parser.add_argument('-cbow',action='store',dest='embeddingpath', default=None, help="continuous bow (embeddings avg)") 197 | 198 | if len(sys.argv)==1: 199 | print parser.print_help(sys.stdout) 200 | else: 201 | 202 | args = parser.parse_args(sys.argv[1:]) 203 | 204 | config_file_name = args.resultsfile + ".CONFIG" 205 | cf = open(config_file_name, 'w') 206 | cf.write(' '.join(sys.argv)+'\n') 207 | cf.close() 208 | 209 | numpy.seterr(all='raise', divide='raise', over='raise', under='raise', invalid='raise') 210 | 211 | run(args) 212 | 213 | 214 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | WORD2PARVEC TOOLKIT Oren Melamud, 2015 2 | ------------------------------------------- 3 | 4 | NOTE: The word2parvec toolkit is provided 'AS IS' with no warranty whatsoever. 5 | 6 | word2parvec is a toolkit that learns paraphrase vector (parvec) representations for word meanings in context. 7 | The model is described in the following paper (please cite if using this toolkit): 8 | 9 | Oren Melamud, Ido Dagan, Jacob Goldberger. Modeling Word Meaning in Context with Substitute Vectors. NAACL, 2015. 10 | 11 | This readme file explains how to use the toolkit. 12 | The procedure includes the following steps: 13 | 14 | SETUP 15 | 16 | 1. Preprocessing the learning corpus 17 | 2. Learning an n-gram language model from the corpus 18 | 3. Extracting sentential contexts from the corpus for all target words 19 | a. Choosing target words 20 | b. Sampling target words contexts 21 | c. Generating substitute vector (subvec) representations for contexts 22 | INFERENCE 23 | 24 | 4. Generating parvecs for target words in sentential context 25 | 26 | The toolkit also includes: 27 | 28 | 5. A simple Wordnet-based pseudoword generator 29 | 6. 
An application that evaluates subvec/bow/cbow context similarity measures using pseudowords 30 | 31 | 32 | 1. Preprocessing the learning corpus 33 | ------------------------------------- 34 | This is a common procedure in many NLP tasks. Use your favorite tools to perform the following steps: 35 | 1.1 Sentence split - one sentence per line 36 | 1.2 Tokenize - space-delimited tokens in each line 37 | Optional: 38 | 1.3 Convert all words to lowercase 39 | 1.4 Convert rare words to a special rare-word token 40 | 1.5 Convert numbers to a special numeric token (see parvecs/setup/mark_corpus.py for steps 1.4 and 1.5) 41 | 1.6 Shuffle the lines of the corpus to avoid unintentional bias due to corpus structure 42 | 43 | We denote the preprocessed learning corpus file as CORPUS. 44 | Finally, use the following script to generate a vocabulary file, denoted VOCAB, for the corpus: 45 | 46 | cat CORPUS | python count_vocab.py 0 > VOCAB 47 | 48 | 49 | 2. Learning an n-gram language model from the corpus 50 | ---------------------------------------------------- 51 | There are several n-gram language model toolkits. 52 | You can use any toolkit that can export the learned language model into the standard ARPA format. 53 | We denote the language model ARPA file as LM.arpa 54 | 55 | KenLM is one good option: 56 | You can download this toolkit from https://kheafield.com/code/kenlm/ and follow the instructions. 57 | An example command line for learning a 5-gram Kneser-Ney language model is: 58 | bin/lmplz -o 5 -S 48G -T ~/tmp --text CORPUS --prune 0 2 2 2 2 > LM.arpa 59 | 60 | 61 | 3. Extracting sentential contexts from the corpus for all target words 62 | --------------------------------------------------------------- 63 | 64 | 3.a. Choosing target words 65 | ------------------------- 66 | Create a file with one word per line comprising all of the target words that you will need in your application. 67 | We denote the target file as TARGETS 68 | 69 | Note that you will need to allocate sufficient disk space for storing the contexts that will be collected from the corpus for each of the targets (~20MB per target word type). 70 | 71 | 72 | 3.b. Sampling target words contexts 73 | ---------------------------------- 74 | Sample sentential contexts for all of your target words using the script below. 75 | 76 | python extract_contexts.py CORPUS TARGETS TARGETS_CONTEXTS TARGETS_FREQS MAX_CONTEXTS 77 | 78 | TARGETS_CONTEXTS denotes a file containing the corpus contexts sampled for the targets (this can be a very big file) 79 | TARGETS_FREQS denotes a file containing the number of sampled contexts per target word type 80 | MAX_CONTEXTS is the maximum number of contexts sampled per target (e.g. 20000) 81 | 82 | 83 | 3.c. Generating substitute vector (subvec) representations for contexts 84 | ---------------------------------------------------------------------- 85 | 86 | (i) Generating fastsubs subvecs 87 | 88 | To compute subvecs for the target word contexts, use the FASTSUBS toolkit. 89 | Download FASTSUBS from https://github.com/ai-ku/fastsubs and use it as follows: 90 | 91 | cat TARGETS_CONTEXTS | ./fastsubs-omp -n -m -t -z LM.arpa > TARGETS_SUBVECS 92 | 93 | One parameter is the maximum number of entries in each subvec (suggested value: 100) 94 | Another parameter is the maximum number of threads that fastsubs-omp will use on your machine (see the FASTSUBS documentation for the exact flags) 95 | TARGETS_SUBVECS is the targets contexts file augmented with subvec representations (this would be an even bigger file...)
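If you want to sanity-check the fastsubs output before moving on, you can peek at the first record of TARGETS_SUBVECS. The sketch below is only an illustration (it is not part of the toolkit); it assumes the two-line-per-instance layout described in section 4, i.e. a context line followed by a whitespace-separated substitute/weight line, and 'TARGETS_SUBVECS' stands for your actual file name.

    # Peek at the first target instance in the fastsubs output (file name is a placeholder).
    # The first line should describe the context and the second line should hold
    # "sub1 weight1 sub2 weight2 ..." pairs.
    with open('TARGETS_SUBVECS') as subvecs_file:
        context_line = subvecs_file.readline().rstrip()
        subvec_line = subvecs_file.readline().rstrip()
    print(context_line)
    print(subvec_line.split()[:10])   # the first few substitute/weight tokens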
96 | 97 | (ii) Optional (recommended) context cleanup: 98 | 99 | The following script extracts only the contexts where the original target word that was observed with the context also appears in the subvec: 100 | 101 | python extract_reliable_subvecs.py TARGETS_SUBVECS TARGETS_SUBVECS.RELIABLE TARGETS_FREQS.RELIABLE 102 | 103 | (iii) Converting subvec weights from conditional probabilities to SPPMI: 104 | 105 | cat TARGETS_SUBVECS.RELIABLE | python subvecs2pmi.py VOCAB SPPMI_SHIFT > TARGETS_SUBVECS.RELIABLE.PMI 106 | 107 | SPPMI_SHIFT is the sppmi shift parameter (recommended value: 2.0) 108 | 109 | (iv) Converting the large contexts file to a directory of files: 110 | 111 | This script converts the big contexts subvec file into a more application-friendly file-per-target directory. 112 | It will create the directory TARGETS_SUBVECS.RELIABLE.PMI.DIR with a file named w for every target word type w in TARGETS_SUBVECS. 113 | 114 | python subvec_dir.py TARGETS_SUBVECS.RELIABLE.PMI 115 | 116 | (v) Clustering subvecs - Optional 117 | 118 | The following script clusters contexts together in order to reduce the size of the target subvecs directory. 119 | 120 | cluster_subvecs_concurrently.sh SOURCE_HOME PROC_NUM VOCAB CLUSTER_NUM 1 MAX_CLUSTER_ENTRIES TARGETS_SUBVECS.RELIABLE.PMI.DIR TARGETS_SUBVECS.RELIABLE.PMI.CLUSTER.DIR [NUM_INITS] [MAX_ITERATIONS] 121 | 122 | SOURCE_HOME is the directory under which the parvecs python source code is installed 123 | PROC_NUM is the number of processes spawned concurrently 124 | CLUSTER_NUM is the number of context clusters per word type 125 | MAX_CLUSTER_ENTRIES is the max number of entries in the cluster vectors 126 | NUM_INITS is the number of different random starting points for the clustering process (default 1) 127 | MAX_ITERATIONS is the max number of iterations performed in the clustering process (default 30) 128 | 129 | Note: the output cluster subvecs are L2-normalized 130 | 131 | 132 | 4. Generating parvecs for target words in sentential context 133 | --------------------------------------------------------------- 134 | To compute parvecs, your words-in-contexts file, denoted TEST, should be formatted in the same way as the file TARGETS_CONTEXTS from section 3.b. 135 | Then follow the instructions in 3.c.(i) and 3.c.(iii) to generate the substitute vectors for your test file, denoted TEST.SUBVECS.PMI. 136 | In this file there should be two lines for every target word instance: 137 | target_name target_id target_index text_line 138 | sub1 weight1 sub2 weight2 ... 139 | 140 | The substitutes in the second line are for the target at text_line[target_index] (i.e. the word in the target_index position in text_line). 141 | 142 | Note: 143 | - To speed up parvec generation considerably, sort the contexts in TEST according to their target_name (i.e. contexts of the same target word should be grouped together). 144 | - It is generally recommended to use the same subvec weighting scheme (e.g. PMI with a shift of 2.0) for both TARGETS_SUBVECS.RELIABLE.PMI and TEST.SUBVECS.PMI. 145 | 146 | To generate parvecs for words in context run: 147 | 148 | python word2parvec.py -contexts_dir TARGETS_SUBVECS.RELIABLE.PMI.DIR -vocabfile VOCAB -testfile TEST.SUBVECS.PMI -resultsfile TEST.PARVECS 149 | or 150 | python word2parvec.py -contexts_dir TARGETS_SUBVECS.RELIABLE.PMI.CLUSTER.DIR --excluderef -vocabfile VOCAB -testfile TEST.SUBVECS.PMI -resultsfile TEST.PARVECS 151 | 152 | TEST.PARVECS is the output file that will be created, with the following 3 lines for every target word instance: 153 | INSTANCE target_name target_id target_index text_line 154 | SUBVEC sub1 weight1 sub2 weight2 ... 155 | PARVEC par1 weight1 par2 weight2 ...
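If you want to post-process the parvecs programmatically, the TEST.PARVECS records can be read back by grouping output lines on their INSTANCE/SUBVEC/PARVEC prefixes. The sketch below is a minimal illustration, not part of the toolkit; it assumes the whitespace-separated 3-line record format described above, and the helper name read_parvecs is made up for this example.

    def read_parvecs(path):
        # Yield (instance_description, parvec) pairs from a TEST.PARVECS-style file,
        # where parvec is a list of (paraphrase, weight) tuples in file order.
        with open(path) as results_file:
            instance = None
            for line in results_file:
                tokens = line.split()
                if not tokens:
                    continue
                tag, rest = tokens[0], tokens[1:]
                if tag == 'INSTANCE':
                    instance = ' '.join(rest)
                elif tag == 'PARVEC' and instance is not None:
                    parvec = [(rest[i], float(rest[i + 1])) for i in range(0, len(rest) - 1, 2)]
                    yield instance, parvec

For example, list(read_parvecs('TEST.PARVECS')) loads all records into memory; extra lines such as SUBVEC (and PARLEMVEC, see --lemmatize below) are simply skipped.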
156 | 157 | You can use the following runtime arguments: 158 | 159 | --excluderef excludes the given context from the target contexts average. This is recommended when using clustered subvecs (3.c.(v)). 160 | 161 | --lemmatize can be used to convert the parvec to lemmatized form. This is useful, for instance, when evaluating against a lemmatized gold standard, such as the SemEval 2007 Lexical Substitution Task. 162 | When using this option the target_name in TEST should be in the form lemma.POS, where POS is a wordnet part-of-speech identifier (ADJ, ADV, NOUN, VERB = 'a', 'r', 'n', 'v'). 163 | A 4th line will be included in the output: 164 | PARLEMVEC parlem1 weight1 parlem2 weight2 ... 165 | 166 | -top and -toppercent can be used to inject a stronger bias in the parvec towards the given context by averaging only over the top target contexts that are most similar to the given context. 167 | 168 | -weightsfactor sets a float value f. The context similarity function is implemented as sim(c1,c2) = cos(c1,c2)^f, where the default value of f is 1.0. 169 | 170 | -parvec_maxlen can be used to limit the number of entries in the generated parvecs. 171 | 172 | --debug turns debug logs on 173 | --debugtop limits the number of entries printed per vector 174 | 175 | To generate parvecs for words out-of-context use: -weightsfactor 0.0 --excluderef 176 | 177 | 178 | 5. Pseudoword generator 179 | ------------------------ 180 | To randomly generate pseudowords run: 181 | 182 | python wn_pseudowords_generator.py VOCAB NUM_PSEUDOWORDS WORDS2SENSES SENSES [MIN_SENSE_FREQ] 183 | 184 | NUM_PSEUDOWORDS is the number of pseudowords to be generated 185 | 186 | WORDS2SENSES is an output file containing a single line for every pseudoword in the following format: 187 | pseudoword_name sense_word1 sense_word2 ... 188 | 189 | SENSES is an output file with all of the senses from all pseudowords (one sense per line) 190 | MIN_SENSE_FREQ is the minimum corpus frequency for a sense to be acceptable (default value is 1000). 191 | 192 | 193 | 6. Context similarity measures evaluation 194 | ------------------------------------------- 195 | Perform steps 3.b, 3.c.(i) and 3.c.(iv) using SENSES as TARGETS (MAX_CONTEXTS can be set to ~1000) to collect contexts for the pseudo-sense words into PSEUDO_TARGETS_SUBVECS.DIR. 196 | 197 | Run the following script to evaluate subvec (SUB) similarity with conditional probability weights: 198 | 199 | python context_similarity_measures_eval.py -samplenum 100 -toppercent 0.01 -pseudosfile WORDS2SENSES -contexts_dir PSEUDO_TARGETS_SUBVECS.DIR -vocabfile VOCAB -resultsfile RESULTS 200 | 201 | To evaluate SUB with sppmi weights add the following params: 202 | --pmi 203 | -pmioffset SPPMI_SHIFT 204 | 205 | To evaluate with bag-of-words (BOW) context similarities add the following params: 206 | -bowinter 1.0 207 | -bow WINDOW_SIZE 208 | (window size zero means the entire sentence) 209 | 210 | To evaluate with continuous bow (CBOW): 211 | 212 | Use word2vec (https://code.google.com/p/word2vec/) to learn word embeddings.
213 | An example command line: 214 | ./word2vec -train CORPUS -output EMBEDDINGS -cbow 0 -size 600 -window 4 -negative 15 -threads 12 -binary 0 -min-count 100 215 | 216 | The following script converts the embeddings to a numpy-friendly format (it creates EMBEDDINGS.npy and EMBEDDINGS.vocab): 217 | python embedding_text2numpy.py EMBEDDINGS EMBEDDINGS 218 | 219 | Add the following param to context_similarity_measures_eval.py: 220 | -cbow EMBEDDINGS 221 | 222 | To weigh the words in BOW/CBOW with tfidf weighting: 223 | --tfidf 224 | 225 | To evaluate the combined SUB*CBOW measure (interpolation between the SUB and CBOW measures), include both the SUB and BOW/CBOW config params and use -bowinter 0.5 226 | --debug turns debug logs on 227 | 228 | 229 | 230 | -------------------------------------------------------------------------------- /parvecs/inference/context_collection.py: -------------------------------------------------------------------------------- 1 | ''' 2 | A collection of contexts represented as subvecs and/or bags-of-words 3 | Used to: 4 | - sort contexts according to their similarity to a reference context 5 | - perform a (weighted) average of contexts representations 6 | 7 | ''' 8 | from parvecs.common.context_instance import read_context 9 | from parvecs.common.util import wf2ws 10 | from parvecs.common.context_instance import get_pmi_weights 11 | from parvecs.common.context_instance import remove_out_of_vocab 12 | from scipy.sparse.dok import dok_matrix 13 | from scipy.sparse.csr import csr_matrix 14 | from scipy.sparse import SparseEfficiencyWarning 15 | import numpy as np 16 | import heapq 17 | import math 18 | 19 | import warnings 20 | warnings.simplefilter('error',SparseEfficiencyWarning) 21 | 22 | 23 | 24 | class ContextCollection(): 25 | 26 | def __init__(self, args, i2w, w2i, subvecs_num, w2counts, sum_word_counts, stopwords, embeddings): 27 | 28 | self.args = args 29 | self.w2i = w2i 30 | self.i2w = i2w 31 | self.w2counts = w2counts 32 | self.sum_word_counts = sum_word_counts 33 | self.stopwords = stopwords 34 | 35 | self.contexts = [] 36 | self.sim_scores = None # points either to self.subvecs_sim_scores or to self.bow_sim_scores 37 | 38 | initial_sim_score = 1.0 if subvecs_num==0 else 1.0/subvecs_num 39 | 40 | self.embeddings = embeddings # when this is not None the bow representation is dense (todo: refactor this code) 41 | self.bow_size = args.bow_size 42 | if (self.bow_size >= 0): 43 | if (self.embeddings != None): 44 | bow_dimensionality = self.embeddings.dimension() 45 | self.bow_matrix = np.zeros((subvecs_num, bow_dimensionality), dtype=np.float32) # estimate sim of contexts based on their BOW rep 46 | self.bow_L2_norms = None # we always keep them normalized 47 | self.bow_sim_scores = dok_matrix([[initial_sim_score]*subvecs_num]).tocsr().transpose() 48 | else: 49 | bow_dimensionality = len(w2i) 50 | self.bow_matrix = dok_matrix((subvecs_num, bow_dimensionality), dtype=np.float32) # estimate sim of contexts based on their BOW rep 51 | self.bow_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32) 52 | self.bow_sim_scores = dok_matrix([[initial_sim_score]*subvecs_num]).tocsr().transpose() 53 | 54 | self.subs_matrix = dok_matrix((subvecs_num, len(w2i)), dtype=np.float32) #used for sim weights calculation, also for sub average only if no dual matrix 55 | self.subvecs_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32) 56 | self.subvecs_sim_scores = dok_matrix([[initial_sim_score]*subvecs_num]).tocsr().transpose() 57 | 58 | self.target_counts = {} 59 | 60 | 61 | def load_contexts(self, contexts_file, pseudos=None,
pseudos_label=None, tocsr_flag=True): 62 | ''' 63 | loads contexts for this collection 64 | :param contexts_file: 65 | :param pseudos: set of pseudo-sense words (used only for pseudo-word experiments) 66 | :param pseudos_label: pseudo-word label (used only for pseudo-word experiments) 67 | :param tocsr_flag: should be False if intending to load more contexts into this collection 68 | :returns: number of contexts read 69 | ''' 70 | 71 | print "Loading contexts for file %s" % (contexts_file) 72 | lines = 0 73 | try: 74 | while True: 75 | context_instance, subvec = read_context(contexts_file, self.args.subvec_maxlen) 76 | if pseudos != None and pseudos_label != None: 77 | subvec = self.__update_pseudos(subvec, pseudos, pseudos_label) 78 | 79 | if self.args.pmi == True: 80 | subvec = get_pmi_weights(subvec, self.w2counts, self.sum_word_counts, self.args.pmioffset, self.args.pmithreshold) 81 | else: 82 | subvec = remove_out_of_vocab(subvec, self.w2counts) 83 | self.__append_subvec(subvec, context_instance) 84 | 85 | lines += 1 86 | if lines % 10000 == 0: 87 | print "Read %d context lines" % (lines) 88 | except EOFError: 89 | print "Finished loading %d context lines from file %s" % (lines, contexts_file) 90 | if tocsr_flag == True: 91 | self.tocsr() 92 | return lines 93 | 94 | 95 | def tocsr(self): 96 | ''' 97 | Converts collection to an arithmetically-efficient format 98 | :returns: None 99 | ''' 100 | self.subs_matrix = self.subs_matrix.tocsr() 101 | self.subvecs_L2_norms = self.subvecs_L2_norms.tocsr() 102 | if self.bow_size>=0: 103 | if isinstance(self.bow_matrix, dok_matrix): 104 | self.bow_matrix = self.bow_matrix.tocsr() 105 | self.bow_L2_norms = self.bow_L2_norms.tocsr() 106 | 107 | def reference_context(self, subvec, context, bow_interpolate): 108 | ''' 109 | Weighs contexts in this collection according to similarity to the given reference context 110 | :param subvec: subvec representation of given context 111 | :param context: given context 112 | :param bow_interpolate: interpolation factor (between bow and subvec simiarity) 113 | :returns: subvec as a numpy matrix 114 | ''' 115 | subvec_matrix = dok_matrix((len(self.w2i),1), dtype=np.float32) 116 | for word, weight in subvec: 117 | subvec_matrix[self.w2i[word],0] = weight 118 | subvec_matrix = subvec_matrix.tocsr() 119 | 120 | return self.__reference_context_imp(subvec_matrix, context, bow_interpolate) 121 | 122 | 123 | def avg_contexts(self, ref_subvec, top, top_percent, top_inferences_number, exclude_ref, weights_factor): 124 | ''' 125 | Performs a weighted average of 126 | :param ref_subvec: given subvec as a numpy matrix 127 | :param top: 128 | :param top_percent: 129 | :param top_inferences_number: 130 | :param exclude_ref: 131 | :param weights_factor: 132 | :returns: parvec, number of contexts averaged 133 | ''' 134 | 135 | if len(self.contexts) == 0: 136 | return None, 0 137 | 138 | ref_weight = 1 if exclude_ref == False else 0 139 | 140 | if (top > len(self.contexts) + ref_weight): 141 | top = len(self.contexts) + ref_weight 142 | 143 | if (top > 0 or top_percent > 0): 144 | top_contexts_weights = self.sim_scores.todok() 145 | final_top = top-ref_weight # -1 to leave 1 for the ref_subvec 146 | num_top_percent = int(math.ceil(top_percent * (len(self.contexts)+ref_weight)))-ref_weight 147 | final_top = max(final_top, num_top_percent) 148 | 149 | cw_sorted = heapq.nlargest(final_top, top_contexts_weights.iteritems(), key=lambda x: x[1]) 150 | top_contexts_weights = dok_matrix((len(self.contexts),1), dtype=np.float32) 151 | 152 | 
for (k,j), weight in cw_sorted: 153 | top_contexts_weights[k,j] = weight**weights_factor 154 | 155 | top_contexts_weights = top_contexts_weights.tocsr() 156 | contexts_num = len(cw_sorted) 157 | 158 | else: 159 | contexts_num = len(self.contexts) 160 | if weights_factor == 0.0: 161 | top_contexts_weights = dok_matrix([[1.0]*contexts_num]).tocsr().transpose() 162 | else: 163 | top_contexts_weights = self.sim_scores.copy() 164 | top_contexts_weights.data **= weights_factor 165 | 166 | sum_weights = top_contexts_weights.sum() + ref_weight #weight +1 reserved for ref_subvec 167 | top_contexts_weights.data /= sum_weights 168 | 169 | 170 | weighted_subs_matrix = self.subs_matrix.multiply(top_contexts_weights) #NOT SUPPORTED IN SCIPY 0.7 171 | avg_subvec = weighted_subs_matrix.sum(axis=0) 172 | 173 | if (exclude_ref == False) and (ref_subvec != None): 174 | ref_subvec.data *= 1.0/sum_weights 175 | avg_subvec = avg_subvec + ref_weight * ref_subvec.transpose() 176 | 177 | result_vec = self.__vec_to_sorted_list(avg_subvec, top_inferences_number) 178 | return result_vec, contexts_num 179 | 180 | 181 | def evaluate_context_similarity(self, seeded_random, random_similarity): 182 | ''' 183 | Performs a context similarity measure evaluation on a single 'query' context instance 184 | todo: move this functionality out of this class 185 | :param seeded_random: 186 | :param debug_top_inferences_per_context: 187 | :param random_similarity: 188 | :returns: precision results 189 | ''' 190 | 191 | random_context_ind = seeded_random.randint(0, len(self.contexts)-1) 192 | sample_context = self.contexts[random_context_ind] 193 | sample_target = self.contexts[random_context_ind].target 194 | sample_subvec = self.subs_matrix[random_context_ind,:].transpose() 195 | 196 | all_size = len(self.contexts)-1 # -1 because we used one context as query 197 | all_real_pos = self.target_counts[sample_target]-1 198 | 199 | if (self.args.top > 0 or self.args.top_percent > 0): 200 | top_contexts = self.args.top 201 | num_top_percent = int(math.ceil(self.args.top_percent * all_size)) 202 | top_contexts = max(top_contexts, num_top_percent) 203 | else: 204 | top_contexts = all_size 205 | 206 | bow_interpolate = self.args.bow_interpolate 207 | 208 | if random_similarity: 209 | self.sim_scores = csr_matrix([]) 210 | else: 211 | self.__reference_context_imp(sample_subvec, sample_context, bow_interpolate) 212 | 213 | contexts_weights_sorted = sorted(self.sim_scores.todok().iteritems(), key=lambda x: x[1], reverse=True) 214 | output_items = [] 215 | true_p = 0 216 | all_p = 0 217 | precision_at_1 = None 218 | top_precision = None 219 | avg_precision = 0.0 220 | 221 | # going over all the contexts that got a non-zero score 222 | for i in xrange(0,len(contexts_weights_sorted)): 223 | (j,k), context_weight = contexts_weights_sorted[i] 224 | retrieved_target = self.contexts[j].target 225 | 226 | if j != random_context_ind: # skipping the sampled context in calculation 227 | all_p += 1 228 | if retrieved_target == sample_target: # true positive 229 | true_p += 1 230 | avg_precision += float(true_p) / all_p 231 | if all_p == 1: 232 | precision_at_1 = float(true_p) / all_p 233 | if all_p == top_contexts: 234 | top_precision = float(true_p) / all_p 235 | 236 | if self.args.debug: 237 | subvec = self.subs_matrix_for_sim_weights[j, :].todok() 238 | sub_list_sorted = heapq.nlargest(self.args.debugtop, subvec.iteritems(), key=lambda x: x[1]) 239 | sub_strs = [' '.join([self.i2w[ii], wf2ws(weight)]) for (kk,ii), weight in sub_list_sorted] 240 | prefix 
= "QRY" if j == random_context_ind else "RET" 241 | output_items.append((prefix, context_weight, self.contexts[j].decorate_context() +'\n' +'\t' + '\t'.join(sub_strs))) 242 | 243 | # for all the contexts that got zero score (were not retrieved at all) we assume that the real positives were retrieved uniformly (like random) 244 | false_n = all_real_pos - true_p 245 | if (false_n > 0): 246 | all_n = all_size - all_p 247 | real_negs_per_one_real_pos = (float(all_n)/false_n)-1 248 | 249 | all_p += real_negs_per_one_real_pos/2 250 | 251 | while all_p < all_size: 252 | if (top_precision == None) and (all_p >= top_contexts): 253 | top_precision = float(true_p) / top_contexts 254 | all_p += 1 255 | true_p += 1 256 | avg_precision += float(true_p) / all_p 257 | all_p += real_negs_per_one_real_pos 258 | if self.args.debug: 259 | output_items.append(("UNF", 0.0, "dummy positive")) 260 | 261 | if (top_precision == None): 262 | top_precision = float(true_p) / top_contexts 263 | 264 | if (precision_at_1 == None): 265 | precision_at_1 = float(all_real_pos) / all_size 266 | 267 | assert(true_p == all_real_pos) 268 | 269 | avg_precision /= max(1,all_real_pos) 270 | 271 | output_lines = ['\t'.join([prefix, wf2ws(context_weight), text]) for prefix, context_weight, text in output_items] 272 | return precision_at_1, top_precision, avg_precision,'\n'.join(output_lines) 273 | 274 | 275 | def __append_subvec(self, subvec, context_instance): 276 | 277 | j = len(self.contexts) 278 | self.contexts.append(context_instance) 279 | 280 | if context_instance.target in self.target_counts: 281 | self.target_counts[context_instance.target] += 1 282 | else: 283 | self.target_counts[context_instance.target] = 1 284 | 285 | if len(subvec) > 0: 286 | L2 = 0.0 287 | for word, weight in subvec: 288 | L2 += weight**2 289 | if L2 == 0: 290 | L2 = 1 291 | self.subvecs_L2_norms[j,0] = 1.0/(L2**0.5) 292 | 293 | for word, weight in subvec: 294 | if (weight != 0): 295 | self.subs_matrix[j, self.w2i[word]] = weight 296 | else: 297 | self.subvecs_L2_norms[j,0] = 1.0 # dummy NORM 298 | 299 | 300 | if self.bow_size >= 0: # using the bow_matrix for sim between contexts 301 | 302 | text_matrix, found_word = self.__context_text_to_vec(context_instance) 303 | 304 | if (self.embeddings == None): 305 | text_matrix = text_matrix.transpose() 306 | 307 | for (zero, word_ind), value in text_matrix.iteritems(): 308 | self.bow_matrix[j, word_ind] = value 309 | 310 | if found_word == True: 311 | L2 = 0 312 | for val in text_matrix.itervalues(): 313 | L2 += val**2 314 | self.bow_L2_norms[j,0] = 1.0 / (L2**0.5) 315 | else: 316 | self.bow_L2_norms[j,0] = 1.0 # dummy NORM 317 | else: 318 | self.bow_matrix[j, :] = text_matrix 319 | 320 | 321 | def __reference_context_imp(self, subvec_matrix, context, bow_interpolate): 322 | 323 | if bow_interpolate == 1: 324 | self.bow_sim_scores = self.__reference_context_bow(context) 325 | self.sim_scores = self.bow_sim_scores 326 | elif bow_interpolate == 0: 327 | self.subvecs_sim_scores = self.__reference_context_subvec(subvec_matrix) 328 | self.sim_scores = self.subvecs_sim_scores 329 | else: 330 | try: 331 | self.bow_sim_scores = self.__reference_context_bow(context) 332 | self.bow_sim_scores.data = self.bow_sim_scores.data**bow_interpolate 333 | except Exception as e: 334 | print e 335 | print context 336 | raise e 337 | self.subvecs_sim_scores = self.__reference_context_subvec(subvec_matrix) 338 | self.subvecs_sim_scores.data = self.subvecs_sim_scores.data**(1-bow_interpolate) 339 | self.sim_scores = 
self.subvecs_sim_scores.multiply(self.bow_sim_scores) 340 | 341 | return subvec_matrix 342 | 343 | 344 | def __reference_context_bow(self, context): 345 | 346 | refvec_matrix, found_word = self.__context_text_to_vec(context) 347 | sims = self.__compute_sim_scores(refvec_matrix, self.bow_matrix, self.bow_L2_norms, self.embeddings != None) 348 | return sims 349 | 350 | 351 | def __reference_context_subvec(self, refvec_matrix): 352 | sims = self.__compute_sim_scores(refvec_matrix, self.subs_matrix, self.subvecs_L2_norms, False) 353 | return sims 354 | 355 | 356 | 357 | def __compute_sim_scores(self, refvec_matrix, allvecs_matrix, L2_norms, is_embeddings): 358 | contexts_sims = allvecs_matrix.dot(refvec_matrix) 359 | 360 | if is_embeddings: 361 | contexts_sims = (contexts_sims + 1) / 2 # map cosine to [0,1] 362 | contexts_sims = np.reshape(contexts_sims, (len(contexts_sims), 1)) 363 | contexts_sims = csr_matrix(contexts_sims.tolist()) 364 | if L2_norms != None: 365 | contexts_sims = contexts_sims.multiply(L2_norms) 366 | refvec_dp = refvec_matrix.transpose().dot(refvec_matrix) 367 | refvec_L2_norm = refvec_dp.data.max()**0.5 if len(refvec_dp.data) > 0 else 1.0 368 | contexts_sims.data /= refvec_L2_norm # weights -1 <= cosine <= 1, but in practice greater than zero because all weights >= 0 369 | 370 | return contexts_sims 371 | 372 | def __context_text_to_vec(self, context_instance): 373 | found_word = False 374 | 375 | if self.embeddings != None: 376 | dimensionality = self.embeddings.dimension() 377 | weight_dtype = np.float32 378 | w2ind = self.w2i 379 | text_matrix = np.zeros((dimensionality,), dtype=weight_dtype) 380 | else: 381 | dimensionality = len(self.w2i) 382 | weight_dtype = np.float32 if self.args.tfidf else np.int8 383 | w2ind = self.w2i 384 | text_matrix = dok_matrix((dimensionality,1), dtype=weight_dtype) 385 | 386 | context_text_tokens = context_instance.get_context_tokens() 387 | target_pos = context_instance.target_ind 388 | 389 | if (self.bow_size > 0): 390 | start_pos = max(target_pos-self.bow_size, 0) 391 | end_pos = min(target_pos+self.bow_size+1, len(context_text_tokens)) 392 | context_text_tokens = context_text_tokens[start_pos:end_pos] 393 | target_pos = target_pos-start_pos 394 | 395 | stopwords = self.stopwords 396 | context_text_inds_left = [w2ind[word] for word in context_text_tokens[:target_pos] if word not in stopwords and word in w2ind] 397 | context_text_inds_right = [w2ind[word] for word in context_text_tokens[target_pos+1:] if word not in stopwords and word in w2ind] if (target_pos+1) < len(context_text_tokens) else [] 398 | 399 | all_words_inds = context_text_inds_left+context_text_inds_right 400 | total_weights = 0.0 401 | for word_ind in all_words_inds: 402 | w = self.i2w[word_ind] 403 | if self.args.tfidf: 404 | wcount = self.w2counts[w] 405 | log_idf = math.log(float(self.sum_word_counts)/wcount) 406 | log_idf -= self.args.tfidf_offset 407 | if (log_idf <= self.args.tfidf_threshold): 408 | log_idf = 0.0 409 | weight = log_idf 410 | else: 411 | weight = 1 412 | 413 | if weight !=0: 414 | found_word = True 415 | if (self.embeddings != None): 416 | if w in self.embeddings: 417 | wordvec = self.embeddings.represent(w).transpose() 418 | text_matrix = text_matrix + (wordvec * weight) 419 | else: 420 | weight = 0.0 421 | else: 422 | text_matrix[word_ind,0] += weight 423 | total_weights += weight 424 | 425 | # embeddings representations are always normalized 426 | if (self.embeddings != None): 427 | if total_weights != 0: 428 | text_matrix /= total_weights 429 | 
norm = np.sqrt(np.sum(text_matrix*text_matrix)) 430 | if norm != 0: 431 | text_matrix /= norm 432 | 433 | return text_matrix, found_word 434 | 435 | 436 | def __vec_to_sorted_list(self, subvec, max_n): 437 | sub_list = np.array(subvec)[0].tolist() 438 | n = min(max_n, subvec.nonzero()[0].shape[1]) if max_n > 0 else subvec.nonzero()[0].shape[1] 439 | sub_list_sorted = heapq.nlargest(n, enumerate(sub_list), key=lambda x: x[1]) 440 | sub_list = [(self.i2w[i], weight) for i, weight in sub_list_sorted] 441 | return sub_list 442 | 443 | 444 | def to_str(self, top_contexts, top_inferences_per_context): 445 | 446 | contexts_weights_sorted = heapq.nlargest(top_contexts, self.sim_scores.todok().iteritems(), key=lambda x: x[1]) 447 | output_items = [] 448 | for (j,k), context_weight in contexts_weights_sorted: 449 | subvec = self.subs_matrix[j, :].todok() 450 | sub_list_sorted = heapq.nlargest(top_inferences_per_context, subvec.iteritems(), key=lambda x: x[1]) 451 | sub_strs = [' '.join([self.i2w[i], wf2ws(weight)]) for (k,i), weight in sub_list_sorted] 452 | output_items.append((context_weight, self.contexts[j].decorate_context() +'\n' + '\t'.join(sub_strs))) 453 | 454 | output_lines = ['\t'.join([wf2ws(context_weight), text]) for context_weight, text in output_items] 455 | return '\n'.join(output_lines) 456 | 457 | 458 | def __update_pseudos(self, subvec, pseudos, pseudos_label): 459 | updated_subvec = [] 460 | pseudos_weight = 0.0 461 | for word, weight in subvec: 462 | if word in pseudos: 463 | pseudos_weight += weight 464 | else: 465 | updated_subvec.append((word, weight)) 466 | if pseudos_weight > 0.0: 467 | updated_subvec.append((pseudos_label, pseudos_weight)) 468 | 469 | return sorted(updated_subvec, key=lambda x: x[1], reverse=True) 470 | 471 | 472 | 473 | --------------------------------------------------------------------------------