├── parvecs
    ├── __init__.py
    ├── common
    │   ├── __init__.py
    │   ├── embedding_text2numpy.py
    │   ├── util.py
    │   ├── vocab.py
    │   ├── embedding.py
    │   └── context_instance.py
    ├── eval
    │   ├── __init__.py
    │   ├── pool_lst_candidates.py
    │   └── coinco2txt_converter.py
    ├── setup
    │   ├── __init__.py
    │   ├── cluster_subvecs_concurrently.sh
    │   ├── count_vocab.py
    │   ├── mark_corpus.py
    │   ├── subvec_dir.py
    │   ├── subvecs2pmi.py
    │   ├── extract_reliable_subvecs.py
    │   ├── extract_contexts.py
    │   ├── wn_pseudowords_generator.py
    │   └── cluster_subvecs.py
    └── inference
    │   ├── __init__.py
    │   ├── parvec_util.py
    │   ├── contexts_container.py
    │   ├── parvec_inferrer.py
    │   ├── word2parvec.py
    │   ├── context_similarity_measures_eval.py
    │   └── context_collection.py
├── .gitignore
└── README.md


/parvecs/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/parvecs/common/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/parvecs/eval/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/parvecs/setup/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/parvecs/inference/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | 
 2 | #ignore thumbnails created by windows
 3 | Thumbs.db
 4 | #Ignore files build by Visual Studio
 5 | *.obj
 6 | *.exe
 7 | *.pdb
 8 | *.user
 9 | *.aps
10 | *.pch
11 | *.vspscc
12 | *_i.c
13 | *_p.c
14 | *.ncb
15 | *.suo
16 | *.tlb
17 | *.tlh
18 | *.bak
19 | *.cache
20 | *.ilk
21 | *.log
22 | [Bb]in
23 | [Dd]ebug*/
24 | *.lib
25 | *.sbr
26 | obj/
27 | [Rr]elease*/
28 | _ReSharper*/
29 | [Tt]est[Rr]esult*
30 | *.pyc
31 | 


--------------------------------------------------------------------------------
/parvecs/setup/cluster_subvecs_concurrently.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | # This script is used to apply subvec clustering to multiple subvec files in a directory
 3 | 
 4 | echo source home $1 
 5 | echo process num $2
 6 | echo vocab $3
 7 | echo cluster_num $4
 8 | echo min avg cluster size $5
 9 | echo cluster prunning $6
10 | echo input dir $7
11 | echo output dir $8
12 | echo number inits $9
13 | echo max iterations ${10}
14 | 
15 | cd $1
16 | FILECOUNT="$(ls $7 | wc -l)"
17 | echo filecount $FILECOUNT
18 | let FPP=FILECOUNT/$2+1
19 | echo files per process $FPP
20 | COUNTER=0
21 | while [  $COUNTER -lt $FILECOUNT ]; do
22 |     let from=COUNTER
23 |     let to=COUNTER+FPP
24 |     echo "Running: /usr/bin/python parvecs/setup/cluster_subvecs.py $3 $4 $5 $6 $7 $8 $from $to $9 ${10} &"
25 |     /usr/bin/python parvecs/setup/cluster_subvecs.py $3 $4 $5 $6 $7 $8 $from $to $9 ${10} &
26 |     let COUNTER=COUNTER+FPP 
27 | done
28 | 


--------------------------------------------------------------------------------
/parvecs/common/embedding_text2numpy.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Converts the word2vec embeddings format to a numpy-friendly format
 3 | '''
 4 | import numpy as np
 5 | import sys
 6 | 
 7 | 
 8 | def readVectors(path):
 9 |     vectors = {}
10 |     with open(path) as input_f:
11 |         for line in input_f.readlines():
12 |             tokens = line.strip().split(' ')
13 |             vectors[tokens[0]] = np.asarray([float(x) for x in tokens[1:]])
14 |     return vectors
15 | 
16 | inpath = sys.argv[1]
17 | outpath = sys.argv[2]
18 | 
19 | matrix = readVectors(inpath)
20 | 
21 | vocab = list(matrix.keys())
22 | vocab.sort()
23 | with open(outpath+'.vocab', 'w') as output_f:
24 |     for word in vocab:
25 |         print >>output_f, word,
26 | 
27 | new_matrix = np.zeros(shape=(len(vocab), len(matrix[vocab[0]])), dtype=np.float32)
28 | for i, word in enumerate(vocab):
29 |     new_matrix[i, :] = matrix[word]
30 | 
31 | np.save(outpath+'.npy', new_matrix)


--------------------------------------------------------------------------------
/parvecs/inference/parvec_util.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | utilities manipulating paraphrase vectors
 3 | '''
 4 | import re
 5 | import heapq
 6 | from nltk.stem.wordnet import WordNetLemmatizer
 7 | 
 8 | lemmatized_word_re = re.compile('^[a-zA-Z\-]+$')
 9 | 
10 | def parvec_lemmatize(parvec, target_pos):    
11 |     '''
12 |     lemmatizes a paraphrase vector
13 |     :param parvec: input parvec
14 |     :param target_pos: part-of-speech used for lemmatization
15 |     :returns lemmatized parvec
16 |     '''
17 |     
18 |     lemmas = {}
19 |     if parvec is not None:
20 |         for word, weight in parvec:
21 |             if lemmatized_word_re.match(word) != None: # filter out non-words
22 |                 lemma = WordNetLemmatizer().lemmatize(word, target_pos)
23 |                 if lemma in lemmas:
24 |                     weight = max(weight, lemmas[lemma])
25 |                 lemmas[lemma] = weight
26 |     parlemvec = sorted(lemmas.iteritems(), key=lambda x: x[1], reverse=True) 
27 |     return parlemvec


--------------------------------------------------------------------------------
/parvecs/setup/count_vocab.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Input: Tokenized text corpus
 3 | Output: Vocabulary counts
 4 | '''
 5 | 
 6 | import sys
 7 | import string
 8 | from operator import itemgetter
 9 | from parvecs.common.vocab import VOCAB_TOTAL_COUNT
10 | 
11 | 
12 | if __name__ == '__main__':
13 |     
14 |     if len(sys.argv)<2:
15 |         print >> sys.stderr, "Usage: %s <min-count> < corpus.txt" % sys.argv[0]
16 |         sys.exit(1)
17 |         
18 |     min_count = int(sys.argv[1])
19 |     vocab = {}
20 |     i = 0
21 |     for line in sys.stdin:
22 |         words = line.split()
23 |         for word in words:
24 |             if (word not in vocab):
25 |                 vocab[word] = 1
26 |             else:
27 |                 vocab[word] +=1
28 |             i += 1
29 |             if i % 10000000 == 0:
30 |                 print >> sys.stderr, 'Read ' + str(i) + ' words'          
31 |     vocab[VOCAB_TOTAL_COUNT] = i    
32 |     sorted_vocab = sorted(vocab.iteritems(), key=itemgetter(1), reverse=True)   
33 |     for word, count in sorted_vocab:
34 |         if count < min_count:
35 |             break;
36 |         print '\t'.join([word, str(count)])    


--------------------------------------------------------------------------------
/parvecs/setup/mark_corpus.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Input: Tokenized text corpus
 3 | Output: Text corpus with special words marked
 4 | '''
 5 | 
 6 | import sys
 7 | 
 8 | from parvecs.common.vocab import read_vocab
 9 | from parvecs.common.vocab import RARE_WORD_TOKEN
10 | from parvecs.common.vocab import NUMERIC_TOKEN
11 | from parvecs.common.util import is_numeric
12 | 
13 | if __name__ == '__main__':
14 | 
15 |     if len(sys.argv) < 2:
16 |         print >> sys.stderr, "Usage: %s <input-vocab> < corpus.txt"  % (sys.argv[0])
17 |         sys.exit(1)
18 | 
19 | 
20 |     vocab = read_vocab(sys.argv[1])    
21 |     print >> sys.stderr, "Read vocab of size: " + str(len(vocab))
22 |     
23 |     i = 0
24 |     for line in sys.stdin:
25 |         in_words = line.split()
26 |         out_words = []
27 |         for word in in_words:
28 | #            if is_numeric(word):
29 | #                outword = NUMERIC_TOKEN
30 |             outword = word if word in vocab else RARE_WORD_TOKEN
31 |             out_words.append(outword)
32 |         if len(out_words)>0:
33 |             sys.stdout.write(' '.join(out_words) + '\n')
34 |         i += 1
35 |         if i % 1000000 == 0:
36 |             print >> sys.stderr, 'Wrote ' + str(i) + ' lines'       
37 | 
38 |     
39 |  


--------------------------------------------------------------------------------
/parvecs/setup/subvec_dir.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Input: context subvecs filename
 3 | Output: new directory named filename.DIR. In this directory a subvecs file per each target individually)
 4 | '''
 5 | 
 6 | import sys
 7 | import os
 8 | 
 9 | SUBVEC_DIR_SUFFIX = ".DIR"
10 | 
11 | if __name__ == '__main__':
12 |     
13 |     if len(sys.argv) < 2:
14 |         print "Usage: %s input_subvec_file" % sys.argv[0]
15 |         sys.exit(1)
16 |         
17 |     input_subvec_filename = sys.argv[1]
18 |     
19 |     subvec_dirname = input_subvec_filename + SUBVEC_DIR_SUFFIX
20 |     os.mkdir(subvec_dirname)
21 |     
22 |     input_subvec_file = open(input_subvec_filename, 'r')
23 |     
24 |     output_files = {}
25 |     
26 |     while True:
27 |         line1 = input_subvec_file.readline()
28 |         line2 = input_subvec_file.readline()        
29 |         if not line1 or not line2:
30 |             break;
31 |         
32 |         target = line1[:line1.find('\t')]
33 |         if target not in output_files:
34 |             output_files[target] = open(subvec_dirname + "/" + target, 'w') 
35 |         
36 |         output_files[target].write(line1)
37 |         output_files[target].write(line2)
38 |     
39 |     input_subvec_file.close()
40 |     for output_file in output_files.itervalues():
41 |         output_file.close()


--------------------------------------------------------------------------------
/parvecs/common/util.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | import string
 3 | import heapq
 4 | 
 5 | def asciize(line):
 6 |     return filter(lambda x: x in string.printable, line)
 7 | 
 8 | def is_printable(s):
 9 |     return all(c in string.printable for c in s)
10 | 
11 | # very crude implementation
12 | num_re = re.compile('^[\+\/\:\-,\.\d]*\d[\+\/\:\-,\.\d]*$')
13 | def is_numeric(word_str):
14 |     return num_re.match(word_str) != None
15 | 
16 | def wf2ws(weight):
17 |         return '{0:1.5f}'.format(weight)
18 |     
19 | def vec_to_str(subvec, max_n):
20 |     sub_list_sorted = heapq.nlargest(max_n, subvec, key=lambda x: x[1])
21 |     sub_strs = [' '.join([word, wf2ws(weight)]) for word, weight in sub_list_sorted]
22 |     return '\t'.join(sub_strs)    
23 |     
24 | def count_file_lines(filename):
25 |     f = open(filename, 'r')
26 |     lines_num = sum(1 for line in f)
27 |     f.close()
28 |     return lines_num
29 | 
30 | class TimeRecorder(object):
31 |     
32 |     def __init__(self):
33 |         self.time = 0.0
34 |         self.iterations = 0
35 |      
36 |      
37 |     def iteration_time(self, seconds):
38 |         self.time += seconds
39 |         self.iterations += 1
40 |                    
41 |      # processing time in msec
42 |     def msec_per_iteration(self):
43 |         return 1000*self.time/self.iterations if self.iterations > 0 else 0.0
44 | 
45 | 


--------------------------------------------------------------------------------
/parvecs/common/vocab.py:
--------------------------------------------------------------------------------
 1 | VOCAB_TOTAL_COUNT = "<VOCAB_TOTAL_COUNT>"
 2 | RARE_WORD_TOKEN = "<RW>"
 3 | NUMERIC_TOKEN = "<NUM>"
 4 | STOPWORD_TOP_THRESHOLD = 256
 5 | 
 6 | import sys
 7 | 
 8 | def read_vocab(vocab_filename):
 9 |     vocab = {}
10 |     with open(vocab_filename,'r') as f:
11 |         for line in f:
12 |             tokens = line.split('\t')
13 |             word = tokens[0].strip()
14 |             count = int(tokens[1])
15 |             vocab[word] = count
16 |     return vocab
17 | 
18 | def vocab_total_size(vocab):
19 |     return vocab[VOCAB_TOTAL_COUNT]
20 | 
21 | def load_vocabulary_w2i(vocab_filename):
22 |     with open(vocab_filename) as f:
23 |         vocab = [line.split('\t')[0].strip() for line in f if len(line) > 0]
24 |     return dict([(a, i) for i, a in enumerate(vocab)]), vocab
25 | 
26 | def load_vocabulary_counts(path):
27 |     stop_words = set()
28 |     counts = {}
29 |     with open(path) as f:
30 |         i = 0
31 |         for line in f:
32 |             if len(line) > 0:
33 |                 tokens = line.split('\t') 
34 |                 word = tokens[0].strip() 
35 |                 count = int(tokens[1].strip())
36 |                 counts[word] = count
37 |                 i += 1 
38 |                 if (i <= STOPWORD_TOP_THRESHOLD):
39 |                     stop_words.add(word)
40 |     total_size = counts[VOCAB_TOTAL_COUNT]                
41 |     return counts, total_size, stop_words


--------------------------------------------------------------------------------
/parvecs/setup/subvecs2pmi.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Converts substitute weights from conditional probabilities to pmi (or sppmi)
 3 | '''
 4 | import sys
 5 | from operator import itemgetter
 6 | 
 7 | from parvecs.common.vocab import read_vocab
 8 | from parvecs.common.vocab import vocab_total_size
 9 | from parvecs.common.context_instance import ContextInstance
10 | from parvecs.common.context_instance import read_context
11 | from parvecs.common.context_instance import get_pmi_weights
12 | 
13 | 
14 | def write_subvec(output, subvec):
15 |     for word, weight in subvec:
16 |         output.write(word + " " + '{0:1.8f}'.format(weight) + "\t")
17 |     output.write("\n")
18 |         
19 |         
20 | if __name__ == '__main__':
21 |     if len(sys.argv) < 3:
22 |         sys.stderr.write("Usage: %s <vocab-file> <pmi-shift> [normalize]  <input >output\n" % sys.argv[0])
23 |         sys.exit(1)
24 |         
25 |     vocab = read_vocab(sys.argv[1])
26 |     total_size = vocab_total_size(vocab)
27 |     pmi_shift = float(sys.argv[2])
28 |     normalize = False
29 |     if len(sys.argv) > 3 and sys.argv[3] == 'normalize':
30 |         normalize = True
31 |  
32 |     lines = 0    
33 |     try:
34 |         while True: 
35 |             context_inst, subvec = read_context(sys.stdin)
36 |             subvec_pmi = get_pmi_weights(subvec, vocab, total_size, pmi_shift, 0.0, normalize)
37 |             sorted_subvec_pmi = sorted(subvec_pmi, key=itemgetter(1), reverse=True)
38 |             sys.stdout.write(context_inst.line+'\n')
39 |             write_subvec(sys.stdout,sorted_subvec_pmi)                                         
40 |             lines += 1
41 |             if lines % 10000 == 0:
42 |                 sys.stderr.write("Read %d subvecs\n" % (lines))
43 |     except EOFError:            
44 |         sys.stderr.write("Finished loading %d context lines\n" % lines)
45 |     
46 |         
47 |     


--------------------------------------------------------------------------------
/parvecs/setup/extract_reliable_subvecs.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Input: context subvecs file
 3 | Output: only the context subvecs for which the original target word that was observed in this context appears in the subvec
 4 | '''
 5 | 
 6 | 
 7 | import sys
 8 | from parvecs.common.context_instance import ContextInstance
 9 | 
10 | if __name__ == '__main__':
11 |     
12 |     if len(sys.argv)<3:
13 |         print "Usage: %s <input-subvec-file> <output-subvec-file> <output-targetfreqs-file>" % sys.argv[0]
14 |         sys.exit(1)
15 |     
16 |     input_sub_file = open(sys.argv[1], 'r')
17 |     output_sub_file = open(sys.argv[2], 'w')
18 |     output_targetfreqs_file = open(sys.argv[3], 'w')
19 |     target_freqs = {}
20 |     
21 |     while True:
22 |         context_line = input_sub_file.readline()
23 |         subs_line = input_sub_file.readline()
24 |         if not context_line or not subs_line:
25 |             break
26 |         
27 |         context_inst = ContextInstance(context_line)
28 |         
29 |         if context_inst.target != context_inst.target_key:
30 |             sys.stderr.write("Skipping bad context: " + context_line)
31 |             continue
32 |         
33 |         substitute_words = subs_line.split()[::2] 
34 |         
35 |         if context_inst.target in substitute_words:
36 |             output_sub_file.write(context_line)
37 |             output_sub_file.write(subs_line)
38 |             if context_inst.target in target_freqs:
39 |                 target_freqs[context_inst.target] = target_freqs[context_inst.target]+1
40 |             else:
41 |                 target_freqs[context_inst.target] = 1  
42 | 
43 |     for word, freq in sorted(target_freqs.iteritems(), key=lambda x: x[1], reverse=True):
44 |         output_targetfreqs_file.write("%s\t%d\n" % (word, freq))
45 |         
46 |     input_sub_file.close()
47 |     output_sub_file.close()
48 |     output_targetfreqs_file.close()
49 | 
50 |  


--------------------------------------------------------------------------------
/parvecs/setup/extract_contexts.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Input: one-sentence-per-line tokenized text corpus and list of target words
 3 | Output: contexts of target words
 4 | '''
 5 | 
 6 | import sys
 7 | from operator import itemgetter
 8 | import string
 9 | 
10 | MAX_WORDS_IN_LINE = 128
11 | MAX_CHARS_IN_LINE = 1024
12 | 
13 | 
14 | if __name__ == '__main__':
15 |     
16 |     if len(sys.argv) < 6:
17 |         print >> sys.stderr, "Usage: %s <corpus-file> <targets-file> <max-freq> <contexts-file> <targets-freqs-file>"  % (sys.argv[0])
18 |         sys.exit(1)
19 |      
20 |     corpus_file = sys.argv[1]   
21 |     targets_file = sys.argv[2]
22 |     max_freq = int(sys.argv[3])
23 |     contexts_file = sys.argv[4]
24 |     targets_freq_file = sys.argv[5]
25 |     
26 |     targets = {}
27 |     with open(targets_file,'r') as tf:
28 |         for line in tf:
29 |             word = line.split('\t')[0].strip()
30 |             targets[word] = 0                
31 |     print >> sys.stderr, "Read %d targets " % (len(targets))
32 |     
33 |     cf = open(corpus_file,'r')
34 |     mf = open(contexts_file , 'w')
35 |     
36 |     i = 0
37 |     full_targets = 0
38 |     for line in cf:    
39 |         if len(line) < MAX_CHARS_IN_LINE:
40 |             stripped_line = line.strip()
41 |             sent_words = stripped_line.split()
42 |             if len(sent_words) <= MAX_WORDS_IN_LINE:                        
43 |                 for ind, word in enumerate(sent_words):   
44 |                     if (word in targets and targets[word] < max_freq):
45 |                         mf.write('\t'.join([word, str(i), str(ind), stripped_line])+'\n')
46 |                         targets[word] += 1
47 |                         if targets[word] == max_freq:
48 |                             full_targets += 1
49 |                 i += 1 
50 |                 if i % 1000000 == 0:
51 |                     print >> sys.stderr, 'Read ' + str(i) + ' lines'
52 |                 if (full_targets == len(targets)):
53 |                     break
54 |                   
55 |     cf.close()
56 |     mf.close()
57 |     
58 |     with open(targets_freq_file, 'w') as tff:
59 |         for target, freq in sorted(targets.iteritems(), key=itemgetter(1), reverse=True):
60 |             tff.write(target + '\t' + str(freq) + '\n')


--------------------------------------------------------------------------------
/parvecs/inference/contexts_container.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | A container of context collections of different target words
 3 | '''
 4 | 
 5 | from parvecs.inference.context_collection import ContextCollection
 6 | from parvecs.common.util import count_file_lines
 7 | 
 8 | 
 9 | class ContextsContainer():
10 | 
11 | 
12 |     def __init__(self, args, w2i, i2w, w2counts, sum_word_counts, stopwords, embeddings):
13 |         
14 |         self.args = args
15 |         self.container = {}
16 |         self.w2i = w2i
17 |         self.i2w = i2w
18 |         self.w2counts = w2counts
19 |         self.sum_word_counts = sum_word_counts
20 |         self.stopwords = stopwords
21 |         self.embeddings = embeddings
22 |         
23 |         
24 |     def get_target_contexts(self, target):        
25 |         '''
26 |         :param target: target word
27 |         :returns: context collection for target word
28 |         '''
29 |         try:
30 |             if target not in self.container:
31 |                 self.load_target_contexts(target)
32 |             return self.container[target]
33 |         except IOError as e:
34 |             return None
35 |                             
36 |             
37 |     def load_target_contexts(self, target):
38 |         '''
39 |         load into memory the contexts of target
40 |         :param target: target word
41 |         '''
42 |         target_filename = self.args.contexts_dir+"/"+target        
43 |         collection_size = count_file_lines(target_filename)/2 # subvec every two lines
44 |         target_subfile = open(target_filename, 'r')
45 |         self.container[target] = ContextCollection(self.args, self.i2w, self.w2i, collection_size, self.w2counts, self.sum_word_counts, self.stopwords, self.embeddings)
46 |         self.container[target].load_contexts(target_subfile)
47 |         if len(self.container[target].contexts) != collection_size:
48 |             raise EOFError('context collection size mismatch in target %s. collection_size %d len(contexts) %d' % (target, collection_size, len(self.container[target].contexts)))
49 |         self.container[target].tocsr()
50 |         target_subfile.close()
51 |         
52 |     
53 |     def clear(self):
54 |         '''
55 |         clear memory of container
56 |         '''
57 |         self.container = {}
58 |         
59 |         


--------------------------------------------------------------------------------
/parvecs/eval/pool_lst_candidates.py:
--------------------------------------------------------------------------------
 1 | '''
 2 | Used to pool all substitute candidates for every target type in a lexical substitution dataset
 3 | '''
 4 | import sys
 5 | 
 6 | if __name__ == '__main__':
 7 |     
 8 |     if len(sys.argv)<3:
 9 |         print "Usage: %s <lst-gold-file> <candidates-file> [no-mwe]" % sys.argv[0]
10 |         sys.exit(1)
11 |         
12 |     goldfile = open(sys.argv[1], 'r')
13 |     outfile = open(sys.argv[2], 'w')
14 |     
15 |     ignore_mwe = False
16 |     if (len(sys.argv) > 3):
17 |         sys.stderr.write("ignoring multi-word-expressions\n");
18 |         ignore_mwe = True        
19 |     
20 |     good_oneword_inst = 0
21 |     target2candidates = {}
22 |     # bright.a 5 :: intelligent 3;clever 2;most able 1;capable 1;promising 1;sharp 1;motivated 1;
23 |     for line in goldfile:
24 |         if len(line)>0:
25 |             oneword_in_line = 0 # e.g. ;most able 1;
26 |             segments = line.split("::")
27 |             if len(segments)>=2:
28 |                 target = segments[0][:segments[0].strip().rfind(' ')]
29 |                 target = '.'.join(target.split('.')[:2]) # remove suffix in cases of bar.n.v
30 |                 line_candidates = segments[1].strip().split(';')
31 |                 for candidate_count in line_candidates:
32 |                     if len(candidate_count) > 0:
33 |                         delimiter_ind = candidate_count.rfind(' ')
34 |                         candidate = candidate_count[:delimiter_ind]
35 |                         if ignore_mwe and ((len(candidate.split(' '))>1) or (len(candidate.split('-'))>1)):
36 |                             continue
37 |                         oneword_in_line += 1                       
38 |                         if target in target2candidates:
39 |                             candidates = target2candidates[target]
40 |                         else:
41 |                             candidates = set()
42 |                             target2candidates[target] = candidates
43 |                         candidates.add(candidate)
44 |             if (oneword_in_line >= 1):
45 |                 good_oneword_inst += 1
46 |     
47 |     if ignore_mwe:
48 |         sys.stderr.write("After discarding MWE, there are %d instances with at least one substitute.\n"  % (good_oneword_inst))        
49 |     for target, candidates in target2candidates.iteritems():
50 |         outfile.write(target + '::' + ';'.join(list(candidates)) + '\n')
51 |     
52 |     goldfile.close()
53 |     outfile.close()
54 |         


--------------------------------------------------------------------------------
/parvecs/common/embedding.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Dense vector representations for words
  3 | '''
  4 | 
  5 | import numpy as np
  6 | import heapq
  7 | import math
  8 | import time
  9 | 
 10 | class Embedding:
 11 |     
 12 |     def __init__(self, path):
 13 |         self.m = self.normalize(np.load(path + '.npy'))
 14 |         self.dim = self.m.shape[1]
 15 |         self.wi, self.iw = self.read_vocab(path + '.vocab')        
 16 |     
 17 |     def zeros(self):
 18 |         return np.zeros(self.dim)
 19 |     
 20 |     def dimension(self):
 21 |         return self.dim
 22 |     
 23 |     def normalize(self, m):
 24 |         norm = np.sqrt(np.sum(m*m, axis=1))
 25 |         norm[norm==0] = 1
 26 |         return m / norm[:, np.newaxis]
 27 |     
 28 |     def read_vocab(self, path):
 29 |         vocab = []
 30 |         with open(path) as f:
 31 |             for line in f:
 32 |                 vocab.extend(line.strip().split())
 33 |         return dict([(w, i) for i, w in enumerate(vocab)]), vocab
 34 |     
 35 |     def __contains__(self, w):
 36 |         return w in self.wi
 37 |         
 38 |     def represent(self, w):
 39 |         return self.m[self.wi[w], :]
 40 |     
 41 |     def scores(self, vec):
 42 |         return np.dot(self.m, vec)
 43 |     
 44 |     def positive_scores(self, vec):
 45 |         return (np.dot(self.m, vec)+1)/2
 46 | 
 47 |     def top_scores(self, scores, n=10):
 48 |         if n <= 0:
 49 |             n = len(scores)
 50 |         return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1])
 51 |     
 52 |     def closest(self, w, n=10):                        
 53 |         scores = np.dot(self.m, self.represent(w))               
 54 |         return self.top_scores(scores,n)
 55 | 
 56 |     def closest_with_time(self, w, n=10):        
 57 |         start = time.time()                
 58 |         scores = np.dot(self.m, self.represent(w))
 59 |         end = time.time()        
 60 | #        print "\nDeltatime: %f msec\n" % ((end-start)*1000)
 61 |         return self.top_scores(scores,n), end-start
 62 | 
 63 |     def closest_vec(self, wordvec, n=10):
 64 |         scores = np.dot(self.m, wordvec)
 65 |         return self.top_scores(scores,n)
 66 |     
 67 |     def closest_vec_filtered(self, wordvec, vocab, n=10):
 68 |         scores = np.dot(self.m, wordvec)
 69 |         if n <= 0:
 70 |             n = len(scores)
 71 |         scores_words = zip(self.iw, scores)
 72 |         for i in xrange(0,len(scores_words)):
 73 |             if not scores_words[i][1] in vocab: 
 74 |                 scores_words[i] = (-1, scores_words[i][0])
 75 |         return heapq.nlargest(n, zip(self.iw, scores), key=lambda x: x[1])
 76 |       
 77 |     def closest_prefix(self, w, prefix, n=10):
 78 |         scores = np.dot(self.m, self.represent(w))
 79 |         scores_words = zip(self.iw, scores)
 80 |         for i in xrange(0,len(scores_words)):
 81 |             if not scores_words[i][1].startswith(prefix): 
 82 |                 scores_words[i] = (-1, scores_words[i][0])
 83 |         return heapq.nlargest(n, scores_words, key=lambda x: x[1])
 84 |     
 85 |     def closest_filtered(self, w, vocab, n=10):
 86 |         scores = np.dot(self.m, self.represent(w))
 87 |         scores_words = zip(self.iw, scores)
 88 |         for i in xrange(0,len(scores_words)):
 89 |             if not scores_words[i][1] in vocab: 
 90 |                 scores_words[i] = (-1, scores_words[i][0])
 91 |         return heapq.nlargest(n, scores_words, key=lambda x: x[1])
 92 |  
 93 |     def similarity(self, w1, w2):
 94 |         return self.represent(w1).dot(self.represent(w2))    
 95 | 
 96 | def norm_vec(vec):
 97 |     length = 1.0 * math.sqrt(sum(val ** 2 for val in vec))
 98 |     return [val/length for val in vec]
 99 | 
100 | def score2string(score):
101 |     return score[1] + "\t" + '{0:1.3f}'.format(score[0])
102 | 
103 | 
104 | def closest_sym_scores(targets, subs, w, n):
105 |     w_target_vec = targets.represent(w)
106 |     w_sub_vec = subs.represent(w)
107 |     w2subs = subs.closest_vec(w_target_vec,0)
108 |     w2subs2w = []
109 |     for entry in w2subs:
110 |         score = (entry[0]+1)/2
111 |         sub = entry[1]
112 |         sub_target_vec = targets.represent(sub)       
113 |         rev_score = (np.dot(sub_target_vec, w_sub_vec)+1)/2
114 |         w2subs2w.append((math.sqrt(score * rev_score), sub))
115 |     return heapq.nlargest(n, w2subs2w)
116 |     
117 |     
118 | 


--------------------------------------------------------------------------------
/parvecs/setup/wn_pseudowords_generator.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Generates pseudowords using Wordnet
  3 | '''
  4 | import sys
  5 | import random
  6 | import re
  7 | from nltk.corpus import wordnet as wn
  8 | from nltk.stem.porter import PorterStemmer
  9 | from nltk.stem.wordnet import WordNetLemmatizer
 10 | 
 11 | from parvecs.common.vocab import load_vocabulary_counts
 12 | 
 13 | DEBUG = False
 14 | MIN_FREQ = 1000 # words representing pseudo-senses need to have this min frequency in the learning corpus
 15 | 
 16 | word_re = re.compile('^[a-z][a-z]+$')
 17 | pos_list = set([wn.NOUN, wn.VERB,wn.ADJ, wn.ADV])
 18 | 
 19 | 
 20 | def sample_target_word(vocab_counts, stop_words, min_freq):
 21 |     '''
 22 |     Create a single pseudoword (randomly sampled from vocab)
 23 |     :param vocab_counts:
 24 |     :param stop_words:
 25 |     :param min_freq: minimum required corpus frequency of word
 26 |     '''
 27 |     
 28 |     accum_counts = []
 29 |     n = 0
 30 |     accum_counts.append((None, 0))
 31 |     for word, count in vocab_counts.iteritems():
 32 |         if count >= min_freq and word not in stop_words:
 33 |             n += count
 34 |             accum_counts.append((word, n))
 35 |     max_count = n
 36 |     
 37 |     while True:
 38 |         rnd = random.randint(0, max_count) 
 39 |         for k in xrange(1,len(accum_counts)):
 40 |             sampled_word = accum_counts[k][0]
 41 |             if len(wn.synsets(sampled_word))>1 and word_re.match(sampled_word) != None:
 42 |                 if rnd < accum_counts[k][1] and rnd >= accum_counts[k-1][1]:
 43 |                     return sampled_word
 44 |                 
 45 |     print "Failed to sample target word"
 46 |     sys.exit(1)
 47 |     
 48 | 
 49 | if __name__ == '__main__':
 50 |     
 51 |     if len(sys.argv) < 5:
 52 |         print "usage: %s <vocab-file> <words-num> <words2senses-file> <senses-file> [<min-freq>]"  % (sys.argv[0])
 53 |         sys.exit(1)
 54 |         
 55 |     stemmer = PorterStemmer()
 56 |     
 57 |     vocab_file = sys.argv[1]
 58 |     words_num = int(sys.argv[2])
 59 |     words2senses_file = open(sys.argv[3], 'w')
 60 |     senses_file = open(sys.argv[4], 'w')
 61 |     if len(sys.argv) > 5:
 62 |         min_freq = int(sys.argv[5])
 63 |     else:
 64 |         min_freq = 1000
 65 |     
 66 |     vocab_counts, ignore, stop_words = load_vocabulary_counts(vocab_file)
 67 |     
 68 |     words = set()
 69 |     all_words = set()
 70 |     
 71 |     while len(words) < words_num:
 72 |         while True:
 73 |             word = sample_target_word(vocab_counts, stop_words, min_freq)
 74 |             if word not in words:
 75 |                 break; 
 76 |         word_synsets = wn.synsets(word)
 77 |         if DEBUG: print "Word: [%s] Number of senses: %s" % (word, str(len(word_synsets)))
 78 |         
 79 |         senses = set()
 80 |         for word_synset in word_synsets:
 81 |             if DEBUG: print "\tsynset: %s" % word_synset
 82 |             pos = word_synset.pos()
 83 |             if pos in pos_list:
 84 |                 sense = None
 85 |                 smallest_sense_num_found = sys.maxint
 86 |                 for lemma in word_synset.lemmas():           
 87 |                     if DEBUG: print  "\t " + lemma.name(), len(wn.synsets(lemma.name()))                   
 88 |                     if (stemmer.stem(lemma.name()) != stemmer.stem(word)) and \
 89 |                     WordNetLemmatizer().lemmatize(lemma.name(), pos) != WordNetLemmatizer().lemmatize(word, pos) and \
 90 |                     (lemma.name().islower()) and (lemma.name() in vocab_counts) and (vocab_counts[lemma.name()]>=min_freq) and \
 91 |                     (lemma.name() not in stop_words) and (word_re.match(lemma.name()) != None) and \
 92 |                     (len(wn.synsets(lemma.name())) < smallest_sense_num_found): # we look for the lemma with least number of senses, i.e. hopefully least ambiguous                         
 93 |                         sense = lemma.name()
 94 |                         smallest_sense_num_found = len(wn.synsets(lemma.name()))                
 95 |                 if sense != None:
 96 |                     if DEBUG: print "\tChosen sense word: %s %d\n" % (sense, smallest_sense_num_found)                
 97 |                     senses.add(sense)
 98 |                 else:
 99 |                     if DEBUG: print "\tDidn't find any suitable sense word. Skipping.\n"
100 |         if len(senses) > 1:
101 |                 all_words.update(senses)            
102 |                 sys.stdout.write(word + ':\t' + ' '.join(senses)+'\n')
103 |                 words2senses_file.write(word + '\t' + ' '.join(senses)+'\n')
104 |                 words.add(word)    
105 |     
106 |     for pword in all_words:
107 |         senses_file.write(pword+"\n")
108 |             
109 |     words2senses_file.close()
110 |     senses_file.close()
111 |         


--------------------------------------------------------------------------------
/parvecs/eval/coinco2txt_converter.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Used to convert the coinco (Kremer 2014) xml dataset format to a flat format.
  3 | 
  4 | Example of coinco format:
  5 | <document>
  6 |   <sent MASCfile="NYTnewswire9.txt" MASCsentID="s-r0" >
  7 |     <precontext>
  8 |     
  9 |     </precontext>
 10 |     <targetsentence>
 11 |     A mission to end a war
 12 |     </targetsentence>
 13 |     <postcontext>
 14 |     AUSTIN, Texas -- Tom Karnes was dialing for destiny, but not everyone wanted to cooperate.
 15 |     </postcontext>
 16 |     <tokens>
 17 |       <token id="XXX" wordform="A" lemma="a" posMASC="XXX" posTT="DT" />
 18 |       <token id="4" wordform="mission" lemma="mission" posMASC="NN" posTT="NN" problematic="no" >
 19 |         <substitutions>
 20 |           <subst lemma="calling" pos="NN" freq="1" />
 21 |           <subst lemma="campaign" pos="NN" freq="1" />
 22 |           <subst lemma="dedication" pos="NN" freq="1" />
 23 |           <subst lemma="devotion" pos="NN" freq="1" />
 24 |           <subst lemma="duty" pos="NN" freq="1" />
 25 |           <subst lemma="effort" pos="NN" freq="1" />
 26 |           <subst lemma="goal" pos="NN" freq="2" />
 27 |           <subst lemma="initiative" pos="NN" freq="1" />
 28 |           <subst lemma="intention" pos="NN" freq="1" />
 29 |           <subst lemma="movement" pos="NN" freq="1" />
 30 |           <subst lemma="plan" pos="NN" freq="2" />
 31 |           <subst lemma="pursuit" pos="NN" freq="1" />
 32 |           <subst lemma="quest" pos="NN" freq="1" />
 33 |           <subst lemma="step" pos="NN" freq="1" />
 34 |           <subst lemma="task" pos="NN" freq="2" />
 35 |         </substitutions>
 36 |       </token>
 37 | 
 38 | '''
 39 | 
 40 | import sys
 41 | import string
 42 | from xml.etree import ElementTree
 43 | 
 44 | to_wordnet_pos = {'N':'n','J':'a','V':'v','R':'r'}
 45 | 
 46 | def is_printable(s):
 47 |     return all(c in string.printable for c in s)
 48 | 
 49 | 
 50 | def clean_token(token):
 51 |     
 52 |     token = token.replace('&quot;', '"')
 53 |     token = token.replace('&apos;', "'")
 54 |     token = token.replace(chr(int("85",16)), "...")
 55 |     token = token.replace(chr(int("91",16)), "'")
 56 |     token = token.replace(chr(int("92",16)), "'")
 57 |     token = token.replace(chr(int("93",16)), '"')
 58 |     token = token.replace(chr(int("94",16)), '"')
 59 |     token = token.replace(chr(int("96",16)), '-')         
 60 |     if not is_printable(token):
 61 |         sys.stderr.write('TOKEN NOT PRINTABLE: '+''.join([str(c) for c in token if c in string.printable ]) + '\n')
 62 |         return "<UNK>"
 63 |     else:
 64 |         return token    
 65 | 
 66 | def subs2text(subs_element):
 67 |     subs = [(int(sub.attrib.get('freq')), clean_token(sub.attrib.get('lemma')).replace(';', ',')) for sub in subs_element.iter('subst')]  # sub.attrib.get('lemma').replace(';', ',') is used to fix a three cases in coinco where the lemma includes erroneously the char ';'. Since this char is used as a delimiter, we replace it with ','. 
 68 |     sorted_subs = sorted(subs, reverse=True)
 69 |     return ';'.join([sub + " " + str(freq) for freq, sub in sorted_subs])+';'
 70 | 
 71 | if __name__ == '__main__':
 72 |     
 73 |     if len(sys.argv) < 4:
 74 |         print "Usage: %s <input-coinco-filename> <output-test-filename> <output-gold-filename>" % sys.argv[0]
 75 |         sys.exit(1)
 76 |         
 77 |     with open(sys.argv[1], 'r') as f:
 78 |         coinco = ElementTree.parse(f)
 79 |     
 80 |     test_file = open(sys.argv[2], 'w') 
 81 |     gold_file = open(sys.argv[3], 'w') 
 82 |      
 83 |     sent_num = 0
 84 |     tokens_num = 0
 85 |         
 86 |     for sent in coinco.iter('sent'):
 87 |         sent_num += 1
 88 |         tokens = sent.find('tokens')
 89 |         sent_text = ""
 90 |         for token in tokens.iter('token'):
 91 |             sent_text = sent_text + clean_token(token.attrib.get('wordform')) + " "                
 92 |         sent_text = sent_text.strip()
 93 |         tok_position = -1
 94 |         for token in tokens.iter('token'):
 95 |             tok_position += 1
 96 |             if token.attrib.get('id') != 'XXX' and token.attrib.get('problematic') == 'no':
 97 |                 tokens_num += 1
 98 |                 try:
 99 |                     target_key = clean_token(token.attrib.get('lemma')) + '.' + to_wordnet_pos[token.attrib.get('posMASC')[0]]
100 |                     test_file.write("%s\t%s\t%d\t%s\n" % (target_key, token.attrib.get('id'), tok_position, sent_text))
101 |                     gold_file.write("%s %s :: %s\n" % (target_key, token.attrib.get('id'), subs2text(token.find('substitutions'))))
102 |                 except UnicodeEncodeError as e:
103 |                     sys.stderr.write("ENCODING TARGET ERROR at token_id %s. %s\n" % (token.attrib.get('id'),e))
104 |                     sys.exit(1)
105 |                 
106 |     test_file.close()
107 |     gold_file.close()
108 |                            
109 |     print 'Read %d sentences %d target tokens' % (sent_num, tokens_num)   
110 |     
111 |     
112 |         
113 |     


--------------------------------------------------------------------------------
/parvecs/common/context_instance.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Represents the given context of a target word instance 
  3 | '''
  4 | 
  5 | CONTEXT_TEXT_BEGIN_INDEX = 3
  6 | 
  7 | import math
  8 | 
  9 | class ContextInstance(object):
 10 |  
 11 |     def __init__(self, line):
 12 |         '''
 13 |         Constructor
 14 |         
 15 |         Example line:
 16 |         bright.a        1       13      during the siege , george robertson had appointed shuja-ul-mulk , who was a bright boy
 17 |         '''
 18 |         self.line = line
 19 |         tokens1 = line.split("\t")
 20 |         self.target_key = tokens1[0]
 21 |         self.target_id = tokens1[1]
 22 |         self.target_ind = int(tokens1[2])
 23 |         self.target = tokens1[3].split()[self.target_ind]
 24 |         pos_delimiter_ind = self.target_key.rfind('.')
 25 |         if pos_delimiter_ind > 0 and pos_delimiter_ind == len(self.target_key)-2:
 26 |             self.partofspeech = self.target_key[pos_delimiter_ind+1:]
 27 |         else:
 28 |             self.partofspeech = None
 29 |    
 30 |    
 31 |     def get_context_tokens(self):
 32 |         '''
 33 |         :returns: a list of the text tokens
 34 |         '''
 35 |         all_tokens = self.line.split()
 36 |         return all_tokens[CONTEXT_TEXT_BEGIN_INDEX:]
 37 | 
 38 |     
 39 |     
 40 |     def get_neighbors(self, window_size):
 41 |         '''
 42 |         Get the neighbors of a target word
 43 |         :param window_size: neighbors window size
 44 |         :returns: a list of neighbors
 45 |         '''
 46 |         tokens = self.line.split()[3:]
 47 |         
 48 |         if (window_size > 0):                                    
 49 |             start_pos = max(self.target_ind-window_size, 0)
 50 |             end_pos = min(self.target_ind+window_size+1, len(tokens))
 51 |         else:
 52 |             start_pos = 0
 53 |             end_pos = len(tokens)
 54 |             
 55 |         neighbors = tokens[start_pos:self.target_ind] + tokens[self.target_ind+1:end_pos]
 56 |         return neighbors 
 57 |    
 58 |     
 59 |     def decorate_context(self):
 60 |         '''
 61 |         :returns the context text line with target word highlighted
 62 |         '''
 63 |         tokens = self.line.split('\t')
 64 |         words = tokens[CONTEXT_TEXT_BEGIN_INDEX].split()
 65 |         words[self.target_ind] = '__'+words[self.target_ind]+'__'
 66 |         tokens[CONTEXT_TEXT_BEGIN_INDEX] = ' '.join(words)
 67 |         return '\t'.join(tokens)
 68 |     
 69 | 
 70 | def read_context(subfile, maxlen=None):
 71 |     '''
 72 |     Reads a context and substitute vector from file
 73 |     :param subfile:
 74 |     :param maxlen:
 75 |     :returns context instance, subvec
 76 |     '''
 77 |     context_line = subfile.readline()
 78 |     subvecs_line = subfile.readline()
 79 |     if not context_line or not subvecs_line:
 80 |         raise EOFError
 81 | 
 82 |     context_inst = ContextInstance(context_line.strip())
 83 |     subvecs_line = subvecs_line.strip()
 84 |     subvec = [__extract_word_weight(pair) for pair in subvecs_line.split("\t")[:maxlen]] if len(subvecs_line) > 0 else []
 85 |     
 86 |     return context_inst, subvec
 87 | 
 88 | 
 89 | 
 90 | def get_pmi_weights(subvec, w2counts, sum_counts, shift, threshold, normalize=False):
 91 |     '''
 92 |     Converts a subvec with conditional probability weights to pmi (or sppmi) weights
 93 |     Also performs the functionality of remove_out_of_vocab
 94 |     :param subvec:
 95 |     :param w2counts:
 96 |     :param sum_counts:
 97 |     :param shift:
 98 |     :param threshold:
 99 |     :param normalize:
100 |     :returns: subvec with pmi weights
101 |     '''
102 |     subvec_pmi = []
103 |     norm = 0
104 |     for word, prob in subvec:
105 |         if prob != 0.0 and word in w2counts:
106 |             pmi = math.log(prob * sum_counts / w2counts[word])-shift
107 |             if pmi>threshold:
108 |                 subvec_pmi.append((word, pmi))
109 |                 norm += pmi**2
110 |             
111 |     if normalize:
112 |         norm = norm**0.5
113 |         for i in xrange(0,len(subvec_pmi)):
114 |             subvec_pmi[i] = (subvec_pmi[i][0], subvec_pmi[i][1] / norm)       
115 |             
116 |     return subvec_pmi
117 | 
118 | def remove_out_of_vocab(subvec, w2counts):
119 |     '''
120 |     Removes entries from subvec that are out of the vocabulary
121 |     :param subvec:
122 |     :param w2counts:
123 |     :returns: subvec in vocab
124 |     '''
125 |     subvec_vocab = []
126 |     for word, prob in subvec:
127 |         if prob != 0.0 and word in w2counts:
128 |             subvec_vocab.append((word, prob))
129 |     return subvec_vocab
130 | 
131 | 
132 | def normalize_subvec(subvec):
133 |     '''
134 |     normalizes subvec weights in L2
135 |     :param subvec:
136 |     :returns: normalized subvec
137 |     '''
138 |     norm = 0.0
139 |     for word, weight in subvec:
140 |         norm += weight**2
141 |     norm = norm**0.5
142 |     for i in xrange(0,len(subvec)):
143 |         subvec[i] = (subvec[i][0], subvec[i][1] / norm)  
144 |         
145 | 
146 | def __extract_word_weight(pair):
147 |     tokens = pair.split(' ')
148 |     return tokens[0], float(tokens[1])   
149 | 
150 |   


--------------------------------------------------------------------------------
/parvecs/inference/parvec_inferrer.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | ParvecInferrer generates a paraphrase vector for a target word in a given context
  3 | 
  4 | '''
  5 | import heapq
  6 | import time
  7 | 
  8 | from parvecs.common.vocab import load_vocabulary_w2i
  9 | from parvecs.common.vocab import load_vocabulary_counts
 10 | from parvecs.common.context_instance import get_pmi_weights
 11 | from parvecs.common.context_instance import remove_out_of_vocab
 12 | from parvecs.common.util import wf2ws
 13 | from parvecs.common.util import vec_to_str
 14 | from parvecs.common.util import TimeRecorder
 15 | from parvecs.common.embedding import Embedding
 16 | from parvecs.inference.contexts_container import ContextsContainer
 17 | 
 18 | 
 19 | class ParvecInferrer():
 20 | 
 21 | 
 22 |     def __init__(self, args):
 23 |         
 24 |         self.args = args
 25 |         self.bow_interpolate = self.args.bow_interpolate
 26 |         self.w2i, self.i2w = load_vocabulary_w2i(args.vocabfile)    
 27 |         self.w2counts, self.sum_word_counts, self.stopwords = load_vocabulary_counts(args.vocabfile)
 28 |         if args.use_stopwords == False:
 29 |             self.stopwords = {}
 30 |         print "Vocab size: " + str(len(self.w2i))
 31 |              
 32 |         if args.embeddingpath != None:
 33 |             embeddings = Embedding(args.embeddingpath)
 34 |             print "Read embeddings from " + args.embeddingpath
 35 |         else:
 36 |             embeddings = None
 37 |             
 38 |         self.context_container = ContextsContainer(args, self.w2i, self.i2w, self.w2counts, self.sum_word_counts, self.stopwords, embeddings)
 39 |         self.time_recorder = TimeRecorder()
 40 |         
 41 |                     
 42 |     def clear(self):
 43 |         '''
 44 |         Clears the contexts cache
 45 |         '''
 46 |         self.context_container.clear()
 47 |         
 48 |         
 49 |     def infer_parvec(self, subvec, context_instance, tfo):
 50 |         '''
 51 |         generate the paraphrase vector
 52 |         :param orig_subvec: subvec of instance
 53 |         :param context_instance: context instance
 54 |         :param tfo: output file
 55 |         :returns: parvec
 56 |         '''
 57 |     
 58 |         subvec = self.__preprocess_subvec(subvec, context_instance, tfo)
 59 |         
 60 |         if (self.args.debug == True):
 61 |             tfo.write("\nUsing weightsfactor %s\n" % ('{0:1.1f}'.format(self.args.weightsfactor)))
 62 |         
 63 |         target_contexts = self.context_container.get_target_contexts(context_instance.target)
 64 |         
 65 |         if target_contexts is not None:
 66 |             
 67 |             start1 = time.time()
 68 |             subvec_matrix = target_contexts.reference_context(subvec, context_instance, self.bow_interpolate)
 69 |             end1 = time.time()
 70 |                     
 71 |             if (self.args.debug == True) and (self.bow_interpolate > 0):
 72 |                 tfo.write("\nUsed BOW similarity. bow_interpolate = %f\n\n" % self.bow_interpolate)  
 73 |             
 74 |             max_len = self.args.debugtop if self.args.debug == True else len(subvec) 
 75 |             trimmed_sorted_subvec = heapq.nlargest(max_len, subvec, key=lambda t: t[1])  
 76 |             tfo.write("SUBVEC\t" + '\t'.join([' '.join([word, wf2ws(weight)]) for (word, weight) in trimmed_sorted_subvec])+'\n')        
 77 |             
 78 |             start2 = time.time()
 79 |             result_vec, contexts_num = target_contexts.avg_contexts(subvec_matrix, self.args.top, self.args.top_percent, self.args.parvec_maxlen, self.args.excluderef, self.args.weightsfactor)
 80 |             end2 = time.time()
 81 |             
 82 |             deltatime = (end1-start1) + (end2-start2)
 83 |             self.time_recorder.iteration_time(deltatime)
 84 |             
 85 |             if (self.args.debug == True):
 86 |                 tfo.write("\nDeltatime: %f msec\n" % (deltatime*1000))
 87 |                 tfo.write("\nTop similar contexts:\n")
 88 |                 tfo.write("**************************\n")
 89 |                 tfo.write(target_contexts.to_str(min(self.args.debugtop,contexts_num) , self.args.debugtop)+"\n\n")
 90 |     
 91 |             if (self.args.debug == True):
 92 |                 if (result_vec is not None):
 93 |                     tfo.write("Avg of top " + str(contexts_num) + " contexts: " + vec_to_str(result_vec, self.args.debugtop) + '\n')
 94 |                 else:
 95 |                     tfo.write("Avg of top " + str(contexts_num) + " contexts: None\n")                     
 96 |                 tfo.write("*****************************************\n\n") 
 97 |         else:
 98 |             if (self.args.debug == True):
 99 |                 tfo.write("\nNo subvecs found for target [%s], using only reference subvec.\n" % context_instance.target)
100 |             tfo.write("SUBVEC\t" + '\t'.join([' '.join([word, wf2ws(weight)]) for (word, weight) in subvec])+'\n')
101 |             result_vec = subvec                  
102 |                 
103 |         return result_vec
104 |             
105 | 
106 |     def msec_per_word(self):
107 |         '''
108 |         returns: mean net processing time per parvec generation
109 |         '''
110 |         return self.time_recorder.msec_per_iteration()
111 |     
112 | 
113 |     def __preprocess_subvec(self, subvec, context_instance, tfo):        
114 |         if (self.args.pmi == True):
115 |             subvec = get_pmi_weights(subvec, self.w2counts, self.sum_word_counts, self.args.pmioffset, self.args.pmithreshold)
116 |         else:
117 |             subvec = remove_out_of_vocab(subvec, self.w2counts)                   
118 |         return sorted(subvec, reverse=True, key=lambda x: x[1])
119 |     
120 |  


--------------------------------------------------------------------------------
/parvecs/setup/cluster_subvecs.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | Created on 22.9.14
  3 | 
  4 | @author: user
  5 | '''
  6 | from parvecs.common.context_instance import read_context
  7 | from parvecs.common.context_instance import normalize_subvec
  8 | from parvecs.common.vocab import load_vocabulary_w2i
  9 | 
 10 | import sys
 11 | import os
 12 | import os.path
 13 | from operator import itemgetter
 14 | import numpy as np
 15 | from scipy.sparse.dok import dok_matrix
 16 | 
 17 | from sklearn.cluster import KMeans
 18 | 
 19 | 
 20 | def normalize_centroids(centroids):
 21 |     for j in xrange(0,len(centroids)):
 22 |         norm = (np.dot(centroids[j,:],centroids[j,:]))**0.5
 23 |         if norm > 0:
 24 |             centroids[j,:] /= norm
 25 | 
 26 | 
 27 | def cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter, min_avg_cluster_size, subvec_filename, cluster_filename):
 28 |     '''
 29 |     kmeans clustering of subvecs given in an input file
 30 |     :param w2i: word2index
 31 |     :param cluster_prunning: max size of a cluster centroid
 32 |     :param K: number of clusters
 33 |     :param ninit: number of repeating tries
 34 |     :param maxiter: number of clustering iterations
 35 |     :param min_avg_cluster_size: min size of clusters (on average)
 36 |     :param subvec_filename: input filename
 37 |     :param cluster_filename: output filename
 38 |     :returns: None
 39 |     '''
 40 |     
 41 |     if os.path.exists(cluster_filename):
 42 |         print "NOTICE: cluster file %s already exists. skipping." % cluster_filename 
 43 |         return   
 44 |         
 45 |     subvec_file = open(subvec_filename, 'r')
 46 |     subvec_num = sum(1 for line in subvec_file)/2 #subvec is on every second line
 47 |     subvec_file.seek(0)
 48 |     
 49 |     minK = min(subvec_num/min_avg_cluster_size, K)
 50 |     minK = max(1, minK)
 51 |       
 52 |     cluster_file = open(cluster_filename, 'w')    
 53 |     print "Clustering subvecs in file %s. Using K=%d\n" % (cluster_filename, minK)       
 54 |         
 55 |     target = subvec_filename[subvec_filename.rfind('/')+1:]
 56 |     subs_matrix = dok_matrix((subvec_num, len(w2i)), dtype=np.float32)
 57 |     
 58 |     line = 0    
 59 |     try:
 60 |         while True: 
 61 |             context_inst, subvec = read_context(subvec_file)
 62 |             normalize_subvec(subvec)
 63 |             for word, weight in subvec:
 64 |                 if (weight != 0):
 65 |                     subs_matrix[line, w2i[word]] = weight 
 66 |             line += 1
 67 |             if line % 10000 == 0:
 68 |                 sys.stderr.write("Read %d subvecs\n" % (line))
 69 |     except EOFError:            
 70 |         sys.stderr.write("Finished loading %d context lines\n" % line)
 71 |         
 72 |     subs_matrix = subs_matrix.tocsr()
 73 |         
 74 |     best_centroids = None
 75 |     best_inertia = None
 76 |     
 77 |     for init_iter in xrange(0, ninit): 
 78 |  
 79 |         kmeans = KMeans(init='k-means++', n_clusters=minK, n_init=1, max_iter=1)
 80 |         kmeans.fit(subs_matrix)
 81 |         centroids = kmeans.cluster_centers_
 82 |         normalize_centroids(centroids)
 83 |         for iter in xrange(1,maxiter):        
 84 |             kmeans = KMeans(init=centroids, n_clusters=minK, n_init=1, max_iter=1)                 
 85 |             kmeans.fit(subs_matrix)
 86 |             centroids = kmeans.cluster_centers_
 87 |             normalize_centroids(centroids)            
 88 |         inertia = kmeans.inertia_
 89 |         
 90 |         if best_centroids is None or inertia < best_inertia:
 91 |             best_inertia = inertia
 92 |             best_centroids = centroids
 93 |         
 94 |     for j in xrange(0,len(best_centroids)):        
 95 |         cluster_vec = [(i2w[i], weight) for (i, weight) in enumerate(best_centroids[j,:]) if weight != 0]
 96 |         cluster_vec = sorted(cluster_vec, key=itemgetter(1), reverse=True)[:cluster_prunning]
 97 |         norm = sum([weight**2 for word, weight in cluster_vec])**0.5
 98 |         cluster_vec = [(word, weight/norm) for word, weight in cluster_vec]
 99 |         norm = sum([weight**2 for word, weight in cluster_vec])**0.5
100 |         cluster_file.write(target + "\t" + str(j) + "\t0\t" + target + "\tCLUSTER\t norm verified = " + '{0:1.8f}'.format(norm) + "\tpruning factor = " + str(cluster_prunning) +"\n")
101 |         for (word, weight) in cluster_vec:
102 |             cluster_file.write(' '.join([word, '{0:1.8f}'.format(weight)])+'\t')
103 |         cluster_file.write('\n') 
104 |     
105 |     subvec_file.close()
106 |     cluster_file.close()
107 | 
108 | 
109 | if __name__ == '__main__':
110 |     
111 |     if len(sys.argv) < 9:
112 |         sys.stderr.write("Usage: %s <vocab-file> <K> <min-avg-cluster-size> <cluster-prunning> <input-dir> <output-dir> <from> <to> [n_init] [max_iter]\n" % sys.argv[0])
113 |         sys.exit(1)
114 |         
115 |     vocab_filename =  sys.argv[1]
116 |     K = int(sys.argv[2])
117 |     min_avg_cluster_size = int(sys.argv[3])
118 |     cluster_prunning = int(sys.argv[4])
119 |     input_dirname = sys.argv[5]
120 |     output_dirname = sys.argv[6]
121 |     from_file = int(sys.argv[7])
122 |     to_file = int(sys.argv[8])
123 |     
124 |     if from_file == 0:
125 |         from_file = None
126 |     if to_file == 0:
127 |         to_file = None
128 |     w2i, i2w = load_vocabulary_w2i(vocab_filename)
129 | 
130 |     ninit=1
131 |     maxiter=30
132 |     if len(sys.argv) > 9:
133 |         ninit = int(sys.argv[9])
134 |     if len(sys.argv) > 10:
135 |         maxiter = int(sys.argv[10])
136 |     sys.stderr.write("K=%d, n_init=%d, max_iter=%d\n" % (K, ninit, maxiter))
137 |     
138 |     if not os.path.exists(output_dirname):
139 |         os.makedirs(output_dirname)
140 |     
141 |     filenames = sorted(os.listdir(input_dirname))[from_file:to_file]     
142 |    
143 |     for filename in filenames:
144 |         input_filepath = '/'.join([input_dirname, filename])  
145 |         output_filepath = '/'.join([output_dirname, filename])
146 |         cluster_subvec_file(w2i, cluster_prunning, K, ninit, maxiter, min_avg_cluster_size, input_filepath, output_filepath)
147 | 
148 |     
149 |     


--------------------------------------------------------------------------------
/parvecs/inference/word2parvec.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | word2parvec application
  3 | converts words in contexts to paraphrase vectors representations
  4 | '''
  5 | import sys
  6 | import time
  7 | import argparse
  8 | import numpy
  9 | 
 10 | from parvecs.inference.parvec_inferrer import ParvecInferrer
 11 | from parvecs.inference.parvec_util import parvec_lemmatize
 12 | from parvecs.common.util import vec_to_str
 13 | from parvecs.common.context_instance import read_context
 14 | 
 15 | 
 16 | def run_app(args, inferrer):
 17 |     '''
 18 |     Runs the application
 19 |     :param args: all app arguments
 20 |     :param inferrer: the parvec inferrer that is to be used
 21 |     :returns: None
 22 |     '''
 23 |     
 24 |     testfile = open(args.testfile, 'r')
 25 |     resultsfile = open(args.resultsfile, 'w')
 26 |     
 27 |     lines = 0
 28 |     last_target_key = None
 29 |     while True:
 30 |         
 31 |         try:
 32 |             context_instance, subvec = read_context(testfile, args.subvec_maxlen)
 33 |         except EOFError:
 34 |             break
 35 |         
 36 |         lines += 1
 37 |         if (args.debug == True):
 38 |             resultsfile.write("\nTest context:\n")
 39 |             resultsfile.write("=====================\n")
 40 |             
 41 |         resultsfile.write("INSTANCE\t" + context_instance.decorate_context()+'\n')
 42 |                                  
 43 |         # Assuming testfile is sorted according to target key - clear container memory every time we move to a new key target word
 44 |         if context_instance.target_key != last_target_key:
 45 |             inferrer.clear()
 46 |             last_target_key = context_instance.target_key
 47 |                 
 48 |         result_vec = inferrer.infer_parvec(subvec, context_instance, resultsfile)
 49 |         
 50 |         max_vec_len = args.debugtop if args.debug == True else args.parvec_maxlen
 51 |         if (args.debug == True):
 52 |             resultsfile.write("Paraphrase vector\n")
 53 |             resultsfile.write("***************\n")
 54 |         resultsfile.write("PARVEC\t" + vec_to_str(result_vec, max_vec_len)+"\n")
 55 |         
 56 |         if (args.lemmatize == True):
 57 |             result_vec_lemmatized = parvec_lemmatize(result_vec, context_instance.partofspeech)
 58 |             if (args.debug == True):
 59 |                 resultsfile.write("Lemmatized paraphrase vector\n")
 60 |                 resultsfile.write("***************\n")
 61 |             resultsfile.write("PARLEMVEC\t" + vec_to_str(result_vec_lemmatized, max_vec_len)+"\n")
 62 |         
 63 |         if lines % 100 == 0:
 64 |             print "Read %d lines" % lines                      
 65 |         
 66 |     print "Read %d word instances in total" % lines 
 67 |     print "Net processing time for computing the paraphrase vectors per each word instance: %f msec" % inferrer.msec_per_word()          
 68 |     testfile.close()
 69 |     resultsfile.close()
 70 |     
 71 |     
 72 | def run(args):
 73 |     '''
 74 |     Initialize inferrer and run app
 75 |     :param args:
 76 |     '''
 77 |     
 78 |     print "Initializing"
 79 |     print time.asctime(time.localtime(time.time()))
 80 |     
 81 |     inferrer = ParvecInferrer(args)
 82 |     print "Running"
 83 |     print time.asctime(time.localtime(time.time()))
 84 |     
 85 |     run_app(args, inferrer)
 86 |     print "Finished"
 87 |     print time.asctime(time.localtime(time.time()))
 88 | 
 89 | 
 90 |     
 91 | if __name__ == '__main__':
 92 |   
 93 |     parser = argparse.ArgumentParser(description='Parvec App')
 94 | 
 95 |     parser.add_argument('--debug',action='store_true',dest='debug')
 96 |     parser.add_argument('-debugtop', action="store", dest="debugtop", type=int, default=10, help="Top number of vector entries to print in debug mode.")
 97 |     
 98 |     parser.add_argument('-contexts_dir', action="store", dest="contexts_dir", default=None)
 99 |     parser.add_argument('-vocabfile', action="store", dest="vocabfile", default=None)
100 |     parser.add_argument('-testfile', action="store", dest="testfile", default=None)
101 |     parser.add_argument('-resultsfile', action="store", dest="resultsfile", default=None)
102 |     
103 |     parser.add_argument('--lemmatize', action="store_true", dest="lemmatize", default=False, help="Lemmatize output paraphrase vectors.")    
104 |     parser.add_argument('-parvec_maxlen', action="store", dest="parvec_maxlen", type=int, default=100, help="Max num of paraphrases in each output parvec.")
105 |     parser.add_argument('-subvec_maxlen', action="store", dest="subvec_maxlen", type=int, default=None, help="Max num of substitutes read per subvec.")    
106 |     parser.add_argument('-top', action="store", dest="top", type=int, default=0, help="Num of top most similar contexts to consider for each given context. 0 means all context.")
107 |     parser.add_argument('-toppercent', action="store", dest="top_percent", type=float, default=0.0, help="Percent of top contexts to consider. Param 'top' is considered as min number to consider in any case. 0 means all contexts.")
108 |     parser.add_argument('-weightsfactor',action='store',dest='weightsfactor', type=float, default=1.0, help="Context similarity weights power factor.")
109 |     parser.add_argument('--excluderef',action='store_true',dest='excluderef', default=False, help="Exclude reference (given) context from context averaging.")
110 | 
111 |     parser.add_argument('--pmi',action='store_true',dest='pmi', default=False, help="Convert conditional probability substitute weights in input files to pmi (or spmmi) weights).")    
112 |     parser.add_argument('-pmioffset',action='store',dest='pmioffset', type=float, default=0.0, help='pmi=pmi-offset')
113 |     parser.add_argument('-pmithreshold',action='store',dest='pmithreshold', type=float, default=0.0, help='pmi=0 if pmi<=threshold')
114 | 
115 |     parser.add_argument('-bow',action='store',dest='bow_size', default=-1, type=int, help="Context bag-of-words window size used for computing context sim. -1 means bow is not used, 0 means entire sentence.")
116 |     parser.add_argument('-bowinter',action='store',dest='bow_interpolate', default=0.0, type=float, help="Interpolation factor between bow and subvec context sims. 0 means only consider subvec similarity.")
117 |     parser.add_argument('-cbow',action='store',dest='embeddingpath', default=None, help="Use continuous bow (embeddings avg) instead of bow")
118 |     
119 |     parser.add_argument('--tfidf',action='store_true',dest='tfidf', default=False, help="Use tfidf weighting in bow.")    
120 |     parser.add_argument('-tfidfoffset',action='store',dest='tfidf_offset', type=float, default=0.0, help='tfidf=tfidf-offset')
121 |     parser.add_argument('-tfidfthreshold',action='store',dest='tfidf_threshold', type=float, default=0.0, help='tfidf=0 if tfidf<=threshold')
122 |     parser.add_argument('--nostopwords',action='store_false',dest='use_stopwords', default=True)
123 |     
124 |     
125 |     
126 |     if len(sys.argv)==1:
127 |         print parser.print_help(sys.stdout)
128 |     else:
129 |         args = parser.parse_args(sys.argv[1:])
130 |         config_file_name = args.resultsfile + ".CONFIG"
131 |         cf = open(config_file_name, 'w')
132 |         cf.write(' '.join(sys.argv)+'\n')
133 |         cf.close()       
134 |         numpy.seterr(all='raise', divide='raise', over='raise', under='raise', invalid='raise')        
135 |         run(args)
136 |     
137 | 


--------------------------------------------------------------------------------
/parvecs/inference/context_similarity_measures_eval.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | This application is used to evaluate context similarity measures using pseudowords.
  3 | '''
  4 | import sys
  5 | import argparse
  6 | from random import Random
  7 | import numpy
  8 | 
  9 | from parvecs.common.vocab import load_vocabulary_w2i
 10 | from parvecs.common.vocab import load_vocabulary_counts
 11 | from parvecs.inference.context_collection import ContextCollection
 12 | from parvecs.common.embedding import Embedding
 13 | from parvecs.common.util import count_file_lines
 14 | 
 15 | 
 16 | def read_pseudo_words(pseudos_filename):
 17 |     '''
 18 |     Reads pseudo words from file
 19 |     :param pseudos_filename:
 20 |     :returns: mapping of each pseudoword to a list of pseudosense
 21 |     '''   
 22 |     words = []
 23 |     f = open(pseudos_filename, 'r')
 24 |     for line in f:
 25 |         word = line[:line.find('\t')]
 26 |         pseudos = line.split()[1:]        
 27 |         words.append((word, pseudos))   
 28 |     f.close()
 29 |     
 30 |     return words
 31 | 
 32 | def evaluate_word(word, collection, seeded_random, results_file):
 33 |     '''
 34 |     Evaluate context similarity measures on a given pseudoword experiment
 35 |     :param word: the pseudoword used to perform the experiment
 36 |     :param collection: the contexts of the pseudoword
 37 |     :param seeded_random:
 38 |     :param results_file:
 39 |     :returns: evaluation results
 40 |     '''
 41 |     
 42 |     m_precision_at_1 = 0.0 # mean precision@1
 43 |     m_top_precision = 0.0 # mean precision@top
 44 |     m_avg_precision = 0.0 # mean average precision
 45 |     for i in xrange(0,args.sample_num):
 46 |         precision_at_1, top_precision, avg_precision, debug_str = collection.evaluate_context_similarity(seeded_random, args.random_sim)
 47 |         m_precision_at_1 += precision_at_1
 48 |         m_top_precision += top_precision
 49 |         m_avg_precision += avg_precision
 50 |         if args.debug:
 51 |             results_file.write("\n" + debug_str + "\n")
 52 |             results_file.write("%d: p@1: %f \t p@top: %f \t avg_p: %f\n" % (i, precision_at_1, top_precision, avg_precision))
 53 |             
 54 |     m_precision_at_1  /= args.sample_num
 55 |     m_top_precision  /= args.sample_num
 56 |     m_avg_precision  /= args.sample_num
 57 |             
 58 |     print "Mean over all samples for word [%s]: m_p@1: %f \t m_p@top :%f \t m_avg_p: %f\n" % (word, m_precision_at_1, m_top_precision, m_avg_precision)
 59 |     if args.debug:
 60 |         results_file.write("\nMean over all samples for word [%s]: m_p@1: %f \t m_p@top: %f \t m_avg_p: %f\n\n" % (word, m_precision_at_1, m_top_precision, m_avg_precision))
 61 |     else:
 62 |         results_file.write("%s\t%f\t%f\t%f\n" % (word, m_precision_at_1, m_top_precision, m_avg_precision))
 63 |     results_file.flush()
 64 |     return m_precision_at_1, m_top_precision, m_avg_precision
 65 | 
 66 | 
 67 | def add_pseudo_word_to_vocab(i2w, w2i, w2counts, word, pseudo_senses):
 68 |     '''
 69 |     Add pseudo word to vocabulary
 70 |     :param i2w:
 71 |     :param w2i:
 72 |     :param w2counts:
 73 |     :param word:
 74 |     :param pseudo_senses:
 75 |     :returns: the label of the pseudoword in the vocabulary
 76 |     '''
 77 |     label = word+'='+'+'.join(pseudo_senses)
 78 |     i2w.append(label)
 79 |     w2i[label] = len(i2w)-1
 80 |     count = 0
 81 |     for word in pseudo_senses:
 82 |         count += w2counts[word]
 83 |     w2counts[label] = count 
 84 |     return label   
 85 | 
 86 | def run(args):
 87 |     '''
 88 |     Run application
 89 |     :param args:
 90 |     '''
 91 |     
 92 |     w2i, i2w = load_vocabulary_w2i(args.vocabfile)    
 93 |     w2counts, sum_word_counts, stopwords = load_vocabulary_counts(args.vocabfile)
 94 |     print "Vocab size: " + str(len(w2i))
 95 |     
 96 |     if args.embeddingpath != None:
 97 |         embeddings = Embedding(args.embeddingpath)
 98 |         print "Read embeddings from " + args.embeddingpath
 99 |     else:
100 |         embeddings = None
101 | 
102 |     words = read_pseudo_words(args.pseudos_file)
103 |     
104 |     results_file = open(args.resultsfile,'w')
105 |     
106 |     mm_precision_at_1 = 0.0 # mean mean precision@1
107 |     mm_top_precision = 0.0 # mean mean precision@top
108 |     mm_avg_precision = 0.0 # mean mean average precision
109 |     seeded_random = Random()
110 |     for word in words:           
111 |         
112 |         word_name = word[0]
113 |         
114 | #        word_seed = word_name+' '+' '.join(word[1])
115 | #        the 'star' is used for backward compatibility with previous experiments
116 |         word_name_star = word_name+'.*'
117 |         word_star = [pseudo+'.*' for pseudo in word[1]]
118 |         word_seed = word_name_star+' '+' '.join(word_star)        
119 |         
120 |         seeded_random.seed(word_seed) # we want the same random numbers when repeating experiments with different params etc.    
121 |         senses = word[1]
122 |         
123 |         collection_size = 0
124 |         for target in senses:
125 |             target_filename = args.contexts_dir+"/"+target        
126 |             collection_size += count_file_lines(target_filename)/2 # subvec every two lines
127 |         
128 |         pseudos_label = add_pseudo_word_to_vocab(i2w, w2i, w2counts, word_name, senses)    
129 |         collection = ContextCollection(args, i2w, w2i, collection_size, w2counts, sum_word_counts, stopwords, embeddings)
130 |          
131 |         if args.debug:
132 |                 results_file.write("Reading word for word_name [%s]\n" % word_name)                  
133 |         for target in senses:
134 |             target_filename = args.contexts_dir+"/"+target                
135 |             target_subfile = open(target_filename, 'r')
136 |             lines_num = collection.load_contexts(target_subfile, set(senses), pseudos_label, tocsr_flag=False)
137 |             if args.debug:
138 |                 results_file.write("Read %d contexts for pseudo [%s]\n" % (lines_num, target))        
139 |             target_subfile.close()
140 |         collection.tocsr()    
141 |         m_precision_at_1, m_top_precision, m_avg_precision = evaluate_word(word_name, collection, seeded_random, results_file) 
142 |             
143 |         mm_precision_at_1 += m_precision_at_1
144 |         mm_top_precision += m_top_precision
145 |         mm_avg_precision += m_avg_precision
146 |             
147 |     mm_precision_at_1 /= len(words)
148 |     mm_top_precision /= len(words)
149 |     mm_avg_precision /= len(words)
150 |     
151 |     results_file.write("TOTAL\t%f\t%f\t%f\n" % (mm_precision_at_1, mm_top_precision, mm_avg_precision))
152 |     
153 |     if args.debug:
154 |         results_file.write("#WORDS\t%d\n" % len(words))
155 |         results_file.write("MM_P1\t%f\n" % (mm_precision_at_1))
156 |         results_file.write("MM_PTOP\t%f\n" % (mm_top_precision))
157 |         results_file.write("MM_AVG\t%f\n" % (mm_avg_precision))
158 |         
159 |     results_file.close()
160 | 
161 | 
162 | 
163 | 
164 | 
165 | if __name__ == '__main__':
166 |     
167 |     parser = argparse.ArgumentParser(description='Context similarity measures app')
168 |     
169 |     parser.add_argument('--debug',action='store_true',dest='debug', default=False)
170 | 
171 |     parser.add_argument('-samplenum', action="store", dest="sample_num", type=int, default=None, help="number of samples from each pseudowords collection")
172 |     parser.add_argument('-pseudosfile', action="store", dest="pseudos_file", default=None)
173 |     parser.add_argument('-contexts_dir', action="store", dest="contexts_dir", default=None)
174 |     parser.add_argument('-vocabfile', action="store", dest="vocabfile")
175 |     parser.add_argument('-resultsfile', action="store", dest="resultsfile")
176 |     parser.add_argument('-embeddingpath', action="store", dest="embeddingpath", default=None, help="prefix to files containing word embeddings")
177 |     
178 | 
179 |     parser.add_argument('-top', action="store", dest="top", type=int, help="num of top contexts to consider")
180 |     parser.add_argument('-toppercent', action="store", dest="top_percent", type=float, default=0.0, help="percent of top contexts to consider. When using this, top num is considered as min number to consider")
181 |     parser.add_argument('-subvec_maxlen', action="store", dest="subvec_maxlen", type=int, default=None, help="max num of substitutes read per subvec")
182 |     
183 |     parser.add_argument('--randomsim',action='store_true',dest='random_sim', default=False, help='similarity measure returns zero for all context pairs')
184 |     parser.add_argument('--pmi',action='store_true',dest='pmi', default=False)    
185 |     parser.add_argument('-pmioffset',action='store',dest='pmioffset', type=float, default=0.0, help='pmi=pmi-offset')
186 |     parser.add_argument('-pmithreshold',action='store',dest='pmithreshold', type=float, default=0.0, help='pmi=0 if pmi<=threshold')
187 |     
188 |     parser.add_argument('--tfidf',action='store_true',dest='tfidf', default=False)    
189 |     parser.add_argument('-tfidfoffset',action='store',dest='tfidf_offset', type=float, default=0.0, help='tfidf=tfidf-offset')
190 |     parser.add_argument('-tfidfthreshold',action='store',dest='tfidf_threshold', type=float, default=0.0, help='tfidf=0 if tfidf<=threshold')
191 |     
192 |     
193 |     parser.add_argument('-weightsfactor',action='store',dest='weightsfactor', type=float, default=1.0, help="context similarity measure power factor")
194 |     parser.add_argument('-bow',action='store',dest='bow_size', default=-1, type=int, help="context bag-of-words window size for context cosine sim. -1 means bow not used, 0 means entire sentence")
195 |     parser.add_argument('-bowinter',action='store',dest='bow_interpolate', default=0, type=float, help="interpolation factor between bow and subvec sims. 0 means no bow, -1 means doing backoff instead of interpolation.")
196 |     parser.add_argument('-cbow',action='store',dest='embeddingpath', default=None, help="continuous bow (embeddings avg)")
197 |     
198 |     if len(sys.argv)==1:
199 |         print parser.print_help(sys.stdout)
200 |     else:
201 |     
202 |         args = parser.parse_args(sys.argv[1:])
203 |         
204 |         config_file_name = args.resultsfile + ".CONFIG"
205 |         cf = open(config_file_name, 'w')
206 |         cf.write(' '.join(sys.argv)+'\n')
207 |         cf.close()
208 |         
209 |         numpy.seterr(all='raise', divide='raise', over='raise', under='raise', invalid='raise')
210 |         
211 |         run(args)
212 |         
213 |     
214 |         


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | WORD2PARVEC TOOLKIT		Oren Melamud, 2015
  2 | -------------------------------------------
  3 | 
  4 | NOTE: The word2parvec toolkit is provided 'AS IS' with no warranty whatsoever.
  5 | 
  6 | word2parvec is a toolkit that learns paraphrase vector (parvec) representations for word meanings in context.
  7 | The model is described in the following paper (please cite if using this toolkit):
  8 | 
  9 | Oren Melamud, Ido Dagan, Jacob Goldberger. Modeling Word Meaning in Context with Substitute Vectors. NAACL, 2015.
 10 | 
 11 | This readme file explains how to use the toolkit.
 12 | The procedure includes the following steps:
 13 | 
 14 | SETUP
 15 | 
 16 | 	1. Preprocessing the learning corpus
 17 | 	2. Learning an n-gram language model from the corpus
 18 | 	3. Extracting sentential contexts from the corpus for all target words
 19 | 		a. Choosing target words
 20 | 		b. Sampling target words contexts
 21 | 		c. Generating substitute vector (subvec) representations for contexts
 22 | INFERENCE
 23 | 
 24 | 	4. Generating parvecs for target words in sentential context
 25 | 
 26 | The toolkit also includes:
 27 | 
 28 | 	5. A simple Wordnet-based pseudoword generator
 29 | 	6. An application that evaluates subvec/bow/cbow context similarity measures using pseudowords
 30 | 	
 31 | 
 32 | 1. Preprocessing the learning corpus
 33 | -------------------------------------
 34 | This is a common procedure in many NLP tasks. Use your favorite tools to perform the following steps:
 35 | 	1.1 Sentence split - one sentence per line
 36 | 	1.2 Tokenize - space-delimited tokens in each line
 37 | 	Optional:
 38 | 	1.3 Convert all words to lowercase
 39 | 	1.4 Convert rare words to special token (e.g. <RW>)
 40 | 	1.5 Convert numbers to special token (e.g. <NUM>)
 41 | 	1.6 Shuffle the lines of the corpus to avoid unintentional bias to corpus structure
 42 | 
 43 | We denote the preprocessed learning corpus file as CORPUS.
 44 | Finally, use the following script to generate a vocabulary file, denoted VOCAB, for the corpus:
 45 | 
 46 | 	cat CORPUS | python count_vocab.py 0 > VOCAB
 47 | 
 48 | 
 49 | 2. Learning an n-gram language model from the corpus
 50 | ----------------------------------------------------
 51 | There are several n-gram language model toolkits.
 52 | You can use any toolkit that can export the leanred language model into a standard ARPA format.
 53 | We denote the language model ARPA file as LM.arpa
 54 | 
 55 | KenLM is one good option:
 56 | 	You can download this toolkit from https://kheafield.com/code/kenlm/ and follow the instructions.
 57 | 	An example command line for learning a 5-gram Kneser Ney language model is:
 58 | 	bin/lmplz -o 5 -S 48G -T ~/tmp --text CORPUS --prune 0 2 2 2 2 > LM.arpa
 59 | 
 60 | 
 61 | 3. Extracting sentential contexts from corpus for all target words
 62 | ---------------------------------------------------------------
 63 | 
 64 | 3.a. Choosing target words
 65 | -------------------------
 66 | Create a file with one word per line comprising all of the target words that you will need in your application.
 67 | We denote the target file as TARGETS
 68 | 
 69 | Note that you will need to allocate sufficient disk space for storing the contexts that will be collected from the corpus for each of the targets (~20M per each target word type).
 70 | 
 71 | 
 72 | 3.b. Sampling target words contexts
 73 | ----------------------------------
 74 | Sample sentential contexts for all of your target words using the script below.
 75 | 	
 76 | 	python extract_contexts.py CORPUS TARGETS <contexts-num> TARGETS_CONTEXTS TARGETS_FREQS
 77 | 
 78 | 	TARGETS_CONTEXTS denotes a file containing the corpus contexts sampled for the targets (this can be a very big file)
 79 | 	TARGETS_FREQS denotes a file containing the number of sampled contexts per each target word type 
 80 | 	<contexts-num> is the maximum number of contexts sampled per each target (e.g. 20000)
 81 | 
 82 | 
 83 | 3.c. Generating substitute vector (subvec) representations for contexts
 84 | ----------------------------------------------------------------------
 85 | 
 86 | (i) Generating fastsub subvecs
 87 | 
 88 | 	To compute subvecs for the target words contexts use the FASTSUBS toolkit.
 89 | 	Download FASTSUBS from: https://github.com/ai-ku/fastsubs and use as follows:
 90 | 
 91 | 	cat TARGETS_CONTEXTS | ./fastsubs-omp -n <pruning-factor> -m <thread-num> -t -z LM.arpa  > TARGETS_SUBVECS
 92 | 
 93 | 	<pruning-factor> is the maximum number of entries in each subvec (suggested value 100)
 94 | 	<thread-num> is the maximum number of threads that fastsubs-omp will use on your machine
 95 | 	TARGETS_SUBVECS is the targets context file with subvec representations (this would be an even bigger file...)
 96 | 
 97 | (ii) Optional (recommended) context cleanup:
 98 | 
 99 | 	The following script extracts only contexts where the original target that was observed with this context appears in the subvec:
100 | 
101 | 	python extract_reliable_subvecs.py TARGETS_SUBVECS TARGETS_SUBVECS.RELIABLE TARGETS_FREQS.RELIABLE
102 | 
103 | (iii) Converting subvecs weights from conditional probability to SPPMI:
104 | 
105 | 	cat TARGETS_SUBVECS.RELIABLE | python subvecs2pmi.py VOCAB  <shift> > TARGETS_SUBVECS.RELIABLE.PMI
106 | 
107 | 	<shift> is the sppmi shift parameter (recommended value: 2.0)
108 | 
109 | (iv) Converting the large contexts file to a directory of files:
110 | 
111 | 	This script converts the big contexts subvec file to a more application-friendly file-per-target directory.
112 | 	This will create the directory TARGETS_SUBVECS.RELIABLE.PMI.DIR with a file named w for every target word type w in TARGETS_SUBVECS.
113 | 
114 | 	python subvec_dir.py  TARGETS_SUBVECS.RELIABLE.PMI
115 | 
116 | (v) Clustering subvecs - Optional
117 | 
118 | 	The following script clusters contexts together in order to reduce the size of the target subvecs directory.
119 | 
120 | 	cluster_subvecs_concurrently.sh <source-home> <process-num> VOCAB <cluster-num> 1 <cluster-prunning> TARGETS_SUBVECS.RELIABLE.PMI.DIR TARGETS_SUBVECS.RELIABLE.PMI.CLUSTER.DIR [<n_init>] [<max_iter>]
121 | 
122 | 	<source-home> is the directory under which the python source code parvecs is installed
123 | 	<process-num> number of processes spawned concurrently
124 | 	<cluster-num> is the number of context cluster per each word type
125 | 	<cluster-prunning> is the max number of entries in cluster vectors
126 | 	<n_init> is the number of different random starting points for the clustering process (default 1)
127 | 	<max_iter> is the max number of iterations performed in the clustering process (default 30)
128 | 
129 | 	Note: the output cluster subvecs are L2-normalized
130 | 
131 | 
132 | 4. Generating parvecs for target words in sentential context
133 | ---------------------------------------------------------------
134 | To compute parvecs, your words-in-contexts file, denoted TEST, should be formatted in the same way as in the file TARGETS_CONTEXTS from section 3.b.
135 | The follow the instructions in 3.c.(i) and 3.c.(iii) to generate substitute vectors for your test file TEST.SUBVECS.PMI.
136 | In this file there should be for every target word instance two lines:
137 | target_name <tab> target_id <tab> target_index <tab> text_line
138 | sub1 <space> weight1 <tab> sub2 <space> weight2 <tab> ...
139 | 
140 | The substitutes in the second line are for the target at text_line[target_index] (i.e. the word in the target_index position in text_line).
141 | 
142 | Note:
143 | - To speed up parvec generation considerably, sort the contexts in TEST according to their target_name (i.e. contexts of the same target word should be grouped together).
144 | - It is generally recommended to use same subvec weighting schemes (e.g. PMI with a shift of 2.0) for both TARGETS_SUBVECS.RELIABLE.PMI and TEST.SUBVECS.PMI. 
145 | 
146 | To generate parvecs for words in context run:
147 | 
148 | 	python word2parvec.py -contexts_dir TARGETS_SUBVECS.RELIABLE.PMI.DIR -vocabfile VOCAB -testfile TEST.SUBVECS.PMI -resultsfile TEST.PARVECS
149 | 	or
150 | 	python word2parvec.py -contexts_dir TARGETS_SUBVECS.RELIABLE.PMI.CLUSTER.DIR --excluderef -vocabfile VOCAB -testfile TEST.SUBVECS.PMI -resultsfile TEST.PARVECS
151 | 
152 | 	TEST.PARVECS is the output file that will be created with the following 3 lines for every target word instance:
153 | 	INSTANCE <tab> target_name <tab> target_id <tab> target_index <tab> text_line
154 | 	SUBVEC <tab> sub1 <space> weight1 <tab> sub2 <space> weight2 <tab> ...
155 | 	PARVEC <tab> par1 <space> weight1 <tab> par2 <space> weight2 <tab> ...
156 | 
157 | You can use the following runtime arguments:
158 | 
159 | 	--excluderef excludes the given context from the target contexts average. This is recommended when using clustered subvecs (3.c.(v)).
160 | 
161 | 	--lemmatize can be used to convert the parvec to lemmatized form. This is useful, for instance, when evaluating against a lemmatized gold standard, such as the SemEval 2007 Lexical Substitution Task. 
162 | 	When using this option the target_name in TEST should be in the form of <string>.POS where POS is a wordnet part-of-speech identifier (ADJ, ADV, NOUN, VERB = 'a', 'r', 'n', 'v').
163 | 	A 4th line will be included in the output:
164 | 	PARLEMVEC <tab> parlem1 <space> weight1 <tab> parlem2 <space> weight2 <tab> ...
165 | 
166 | 	-top <int> and -toppercent <percent> can be used to inject a stronger bias in the parvec towards the given context by averaging only on the top target contexts that are most similar to the given context.
167 | 
168 | 	-weightsfactor <float> sets a float value f. The context similarity function is implemented as sim(c1,c2) = cos(c1,c2)^f, where the default value of f is 1.0
169 | 
170 | 	-parvec_maxlen <int> can be used to limit the number of entries in the generated parvecs
171 | 
172 | 	--debug turns debug logs on
173 | 	--debugtop <int> limits the number of entries printed per vector
174 | 
175 | To generate parvecs for words out-of-context use: -weightsfactor 0.0 --excluderef
176 | 
177 | 
178 | 5. Pseudoword generator
179 | ------------------------
180 | To randomly generate pseudowords run:
181 | 
182 | 	wn_pseudoword_generator VOCAB <words-num> <words2senses-file> <senses-file> [<min-freq>]
183 | 
184 | 	<words-num> is the number of pseudowords to be generated
185 | 
186 | 	<words2senses-file> is an output file, denoted WORDS2SENSES, containing a single line for every pseudoword in the following format:
187 | 	pseudoword_name <tab> sense_word1 <space> sense_word2 <space> ...
188 | 
189 | 	<senses-file>, denoted SENSES, is an output file with all of the senses from all pseudowords (one sense per line)
190 | 	<min-freq> is the minimum corpus frequency for a sense to be acceptable (default value is 1000).
191 | 
192 | 
193 | 6. Context similarity measures evaluation
194 | -------------------------------------------
195 | Performs steps 3.b and 3.c.(i) and 3.c.(iv) using SENSES as TARGETS (<contexts-num> can be set to ~1000) to collect contexts for the pseudo-sense words into PSEUDO_TARGETS_SUBVECS.DIR.
196 | 
197 | Run the following script to evaluate subvec (SUB) similarity with conditional probabilities weights:
198 | 
199 | 	python context_similarity_measures_eval.py -samplenum 100 -toppercent 0.01 -pseudosfile WORDS2SENSES -contexts_dir PSEUDO_TARGETS_SUBVECS.DIR -vocabfile VOCAB -resultsfile <results>
200 | 
201 | To evaluate SUB sppmi weights add the following params:
202 | 	--pmi 
203 | 	-pmioffset <shift>
204 | 
205 | To evaluate with bag-of-words (BOW) context similarities add the following params:
206 | 	-bowinter 1.0
207 | 	-bow <window-size>
208 | 	(window size zero means the entire sentence)
209 | 
210 | To evaluate with continuous bow (CBOW):
211 | 	
212 | 	Use word2vec (https://code.google.com/p/word2vec/) to learn word embeddings.
213 | 	An example command line:
214 | 	./word2vec -train CORPUS -output EMBEDDINGS -cbow 0 -size 600 -window 4 -negative 15 -threads 12 -binary 0 -min-count 100
215 | 	
216 | 	The following script will convert the format of the embeddings
217 | 	python embedding_text2numpy.py EMBEDDINGS
218 | 	
219 | 	Add the following param to context_similarity_measures_app.py:
220 | 	-cbow EMBEDDINGS
221 | 	
222 | To weigh the words in BOW/CBOW with tfidf weighting:
223 | 	--tfidf
224 | 	
225 | To evaluate the combined SUB*CBOW measure (interpolation between SUB and CBOW measures), include both SUB and BOW/CBOW config params and use -bowinter 0.5	
226 | 	--debug turns debug logs on
227 | 
228 | 	
229 | 	
230 | 


--------------------------------------------------------------------------------
/parvecs/inference/context_collection.py:
--------------------------------------------------------------------------------
  1 | '''
  2 | A collection of contexts represented as subvecs and/or bags-of-words
  3 | Used to:
  4 | - sort contexts according to their similarity to a reference context
  5 | - perform a (weighted) average of contexts representations
  6 | 
  7 | '''
  8 | from parvecs.common.context_instance import read_context
  9 | from parvecs.common.util import wf2ws
 10 | from parvecs.common.context_instance import get_pmi_weights
 11 | from parvecs.common.context_instance import remove_out_of_vocab
 12 | from scipy.sparse.dok import dok_matrix
 13 | from scipy.sparse.csr import csr_matrix
 14 | from scipy.sparse import SparseEfficiencyWarning
 15 | import numpy as np
 16 | import heapq
 17 | import math
 18 | 
 19 | import warnings
 20 | warnings.simplefilter('error',SparseEfficiencyWarning)
 21 | 
 22 | 
 23 | 
 24 | class ContextCollection():
 25 | 
 26 |     def __init__(self, args, i2w, w2i, subvecs_num, w2counts, sum_word_counts, stopwords, embeddings):
 27 |         
 28 |         self.args = args        
 29 |         self.w2i = w2i
 30 |         self.i2w = i2w
 31 |         self.w2counts = w2counts
 32 |         self.sum_word_counts = sum_word_counts
 33 |         self.stopwords = stopwords
 34 |         
 35 |         self.contexts = []
 36 |         self.sim_scores = None # points either to self.subvecs_sim_scores or to self.bow_sim_scores  
 37 |         
 38 |         initial_sim_score = 1.0 if subvecs_num==0 else 1.0/subvecs_num 
 39 |         
 40 |         self.embeddings = embeddings # when this is not None the bow representation is dense (todo: refactor this code)
 41 |         self.bow_size = args.bow_size
 42 |         if (self.bow_size >= 0):
 43 |             if (self.embeddings != None):
 44 |                 bow_dimensionality = self.embeddings.dimension()
 45 |                 self.bow_matrix = np.zeros((subvecs_num, bow_dimensionality), dtype=np.float32) # estimate sim of contexts based on their BOW rep
 46 |                 self.bow_L2_norms = None # we always keep them normalized
 47 |                 self.bow_sim_scores = dok_matrix([[initial_sim_score]*subvecs_num]).tocsr().transpose()
 48 |             else:
 49 |                 bow_dimensionality = len(w2i)
 50 |                 self.bow_matrix = dok_matrix((subvecs_num, bow_dimensionality), dtype=np.float32) # estimate sim of contexts based on their BOW rep
 51 |                 self.bow_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
 52 |                 self.bow_sim_scores = dok_matrix([[initial_sim_score]*subvecs_num]).tocsr().transpose()
 53 |                 
 54 |         self.subs_matrix = dok_matrix((subvecs_num, len(w2i)), dtype=np.float32) #used for sim weights calculation, also for sub average only if no dual matrix
 55 |         self.subvecs_L2_norms = dok_matrix((subvecs_num, 1), dtype=np.float32)
 56 |         self.subvecs_sim_scores = dok_matrix([[initial_sim_score]*subvecs_num]).tocsr().transpose()
 57 |         
 58 |         self.target_counts = {} 
 59 |         
 60 |         
 61 |     def load_contexts(self, contexts_file, pseudos=None, pseudos_label=None, tocsr_flag=True):
 62 |         '''
 63 |         loads contexts for this collection
 64 |         :param contexts_file:
 65 |         :param pseudos: set of pseudo-sense words (used only for pseudo-word experiments)
 66 |         :param pseudos_label: pseudo-word label (used only for pseudo-word experiments)
 67 |         :param tocsr_flag: should be False if intending to load more contexts into this collection
 68 |         :returns: number of contexts read
 69 |         '''
 70 |         
 71 |         print "Loading contexts for file %s" % (contexts_file)
 72 |         lines = 0
 73 |         try:
 74 |             while True: 
 75 |                 context_instance, subvec = read_context(contexts_file, self.args.subvec_maxlen)
 76 |                 if pseudos != None and pseudos_label != None:
 77 |                     subvec = self.__update_pseudos(subvec, pseudos, pseudos_label)
 78 |                 
 79 |                 if self.args.pmi == True:
 80 |                     subvec = get_pmi_weights(subvec, self.w2counts, self.sum_word_counts, self.args.pmioffset, self.args.pmithreshold)
 81 |                 else:
 82 |                     subvec = remove_out_of_vocab(subvec, self.w2counts)                  
 83 |                 self.__append_subvec(subvec, context_instance)
 84 |                 
 85 |                 lines += 1
 86 |                 if lines % 10000 == 0:
 87 |                     print "Read %d context lines" % (lines)
 88 |         except EOFError:
 89 |             print "Finished loading %d context lines from file %s" % (lines, contexts_file)
 90 |         if tocsr_flag == True:
 91 |             self.tocsr()     
 92 |         return lines
 93 |     
 94 | 
 95 |     def tocsr(self):
 96 |         '''
 97 |         Converts collection to an arithmetically-efficient format
 98 |         :returns: None
 99 |         '''        
100 |         self.subs_matrix = self.subs_matrix.tocsr()
101 |         self.subvecs_L2_norms = self.subvecs_L2_norms.tocsr()       
102 |         if self.bow_size>=0:
103 |             if isinstance(self.bow_matrix, dok_matrix):
104 |                 self.bow_matrix = self.bow_matrix.tocsr()
105 |                 self.bow_L2_norms = self.bow_L2_norms.tocsr()
106 |         
107 |     def reference_context(self, subvec, context, bow_interpolate):
108 |         '''
109 |         Weighs contexts in this collection according to similarity to the given reference context
110 |         :param subvec: subvec representation of given context
111 |         :param context: given context
112 |         :param bow_interpolate: interpolation factor (between bow and subvec simiarity)
113 |         :returns: subvec as a numpy matrix
114 |         '''        
115 |         subvec_matrix = dok_matrix((len(self.w2i),1), dtype=np.float32)       
116 |         for word, weight in subvec:
117 |             subvec_matrix[self.w2i[word],0] = weight    
118 |         subvec_matrix = subvec_matrix.tocsr()       
119 |         
120 |         return self.__reference_context_imp(subvec_matrix, context, bow_interpolate)
121 |     
122 | 
123 |     def avg_contexts(self, ref_subvec, top, top_percent, top_inferences_number, exclude_ref, weights_factor):
124 |         '''
125 |         Performs a weighted average of
126 |         :param ref_subvec: given subvec as a numpy matrix
127 |         :param top:
128 |         :param top_percent:
129 |         :param top_inferences_number:
130 |         :param exclude_ref:
131 |         :param weights_factor:
132 |         :returns: parvec, number of contexts averaged
133 |         '''
134 |         
135 |         if len(self.contexts) == 0:
136 |             return None, 0
137 |         
138 |         ref_weight = 1 if exclude_ref == False else 0
139 |                
140 |         if (top > len(self.contexts) + ref_weight):
141 |             top = len(self.contexts) + ref_weight
142 |             
143 |         if (top > 0 or top_percent > 0):
144 |             top_contexts_weights = self.sim_scores.todok()
145 |             final_top = top-ref_weight # -1 to leave 1 for the ref_subvec
146 |             num_top_percent = int(math.ceil(top_percent * (len(self.contexts)+ref_weight)))-ref_weight
147 |             final_top = max(final_top, num_top_percent) 
148 |             
149 |             cw_sorted  = heapq.nlargest(final_top, top_contexts_weights.iteritems(), key=lambda x: x[1])
150 |             top_contexts_weights = dok_matrix((len(self.contexts),1), dtype=np.float32)
151 |             
152 |             for (k,j), weight in cw_sorted:
153 |                 top_contexts_weights[k,j] = weight**weights_factor
154 | 
155 |             top_contexts_weights = top_contexts_weights.tocsr()
156 |             contexts_num = len(cw_sorted)
157 |                 
158 |         else:            
159 |             contexts_num = len(self.contexts)
160 |             if weights_factor == 0.0:
161 |                 top_contexts_weights = dok_matrix([[1.0]*contexts_num]).tocsr().transpose()
162 |             else:
163 |                 top_contexts_weights = self.sim_scores.copy()
164 |                 top_contexts_weights.data **= weights_factor
165 |             
166 |         sum_weights = top_contexts_weights.sum() + ref_weight #weight +1 reserved for ref_subvec
167 |         top_contexts_weights.data /= sum_weights
168 | 
169 | 
170 |         weighted_subs_matrix = self.subs_matrix.multiply(top_contexts_weights)  #NOT SUPPORTED IN SCIPY 0.7        
171 |         avg_subvec = weighted_subs_matrix.sum(axis=0)
172 |         
173 |         if (exclude_ref == False) and (ref_subvec != None):
174 |             ref_subvec.data *= 1.0/sum_weights
175 |             avg_subvec = avg_subvec + ref_weight * ref_subvec.transpose()
176 |         
177 |         result_vec = self.__vec_to_sorted_list(avg_subvec, top_inferences_number)  
178 |         return result_vec, contexts_num        
179 | 
180 | 
181 |     def evaluate_context_similarity(self, seeded_random, random_similarity):
182 |         '''
183 |         Performs a context similarity measure evaluation on a single 'query' context instance
184 |         todo: move this functionality out of this class
185 |         :param seeded_random:
186 |         :param debug_top_inferences_per_context:
187 |         :param random_similarity:
188 |         :returns: precision results
189 |         '''
190 |         
191 |         random_context_ind = seeded_random.randint(0, len(self.contexts)-1)
192 |         sample_context = self.contexts[random_context_ind]
193 |         sample_target = self.contexts[random_context_ind].target
194 |         sample_subvec = self.subs_matrix[random_context_ind,:].transpose()
195 |         
196 |         all_size = len(self.contexts)-1  # -1 because we used one context as query
197 |         all_real_pos = self.target_counts[sample_target]-1
198 |         
199 |         if (self.args.top > 0 or self.args.top_percent > 0):
200 |             top_contexts = self.args.top
201 |             num_top_percent = int(math.ceil(self.args.top_percent * all_size))
202 |             top_contexts = max(top_contexts, num_top_percent) 
203 |         else:
204 |             top_contexts = all_size
205 |         
206 |         bow_interpolate = self.args.bow_interpolate
207 | 
208 |         if random_similarity:
209 |             self.sim_scores = csr_matrix([]) 
210 |         else:
211 |             self.__reference_context_imp(sample_subvec, sample_context, bow_interpolate)
212 |         
213 |         contexts_weights_sorted = sorted(self.sim_scores.todok().iteritems(), key=lambda x: x[1], reverse=True)
214 |         output_items = []
215 |         true_p = 0
216 |         all_p = 0
217 |         precision_at_1 = None
218 |         top_precision = None
219 |         avg_precision = 0.0
220 |         
221 |         # going over all the contexts that got a non-zero score
222 |         for i in xrange(0,len(contexts_weights_sorted)):
223 |             (j,k), context_weight = contexts_weights_sorted[i]
224 |             retrieved_target = self.contexts[j].target
225 |             
226 |             if j != random_context_ind: # skipping the sampled context in calculation
227 |                 all_p += 1
228 |                 if retrieved_target == sample_target: # true positive 
229 |                     true_p += 1
230 |                     avg_precision += float(true_p) / all_p  
231 |                 if all_p == 1:
232 |                     precision_at_1 = float(true_p) / all_p
233 |                 if all_p == top_contexts:
234 |                     top_precision = float(true_p) / all_p
235 |                                                    
236 |             if self.args.debug:
237 |                 subvec = self.subs_matrix_for_sim_weights[j, :].todok()
238 |                 sub_list_sorted  = heapq.nlargest(self.args.debugtop, subvec.iteritems(), key=lambda x: x[1])
239 |                 sub_strs = [' '.join([self.i2w[ii], wf2ws(weight)]) for (kk,ii), weight in sub_list_sorted]
240 |                 prefix = "QRY" if j == random_context_ind else "RET"
241 |                 output_items.append((prefix, context_weight, self.contexts[j].decorate_context() +'\n' +'\t' + '\t'.join(sub_strs)))                
242 |             
243 |         # for all the contexts that got zero score (were not retrieved at all) we assume that the real positives were retrieved uniformly (like random)
244 |         false_n = all_real_pos - true_p
245 |         if (false_n > 0):
246 |             all_n = all_size - all_p
247 |             real_negs_per_one_real_pos = (float(all_n)/false_n)-1
248 |             
249 |             all_p += real_negs_per_one_real_pos/2
250 |             
251 |             while all_p < all_size:                        
252 |                 if (top_precision == None)  and (all_p >= top_contexts):                
253 |                     top_precision = float(true_p) / top_contexts
254 |                 all_p += 1
255 |                 true_p += 1
256 |                 avg_precision += float(true_p) / all_p            
257 |                 all_p += real_negs_per_one_real_pos
258 |                 if self.args.debug:
259 |                     output_items.append(("UNF", 0.0, "dummy positive"))
260 |         
261 |         if (top_precision == None):                
262 |                 top_precision = float(true_p) / top_contexts
263 |                 
264 |         if (precision_at_1 == None):                
265 |                 precision_at_1 = float(all_real_pos) / all_size
266 |             
267 |         assert(true_p == all_real_pos)
268 |         
269 |         avg_precision /= max(1,all_real_pos)
270 |         
271 |         output_lines = ['\t'.join([prefix, wf2ws(context_weight), text]) for prefix, context_weight, text in output_items]  
272 |         return precision_at_1, top_precision, avg_precision,'\n'.join(output_lines)
273 | 
274 | 
275 |     def __append_subvec(self, subvec, context_instance):
276 |        
277 |         j = len(self.contexts)
278 |         self.contexts.append(context_instance)        
279 |         
280 |         if context_instance.target in self.target_counts:
281 |             self.target_counts[context_instance.target] += 1
282 |         else:
283 |             self.target_counts[context_instance.target] = 1
284 |         
285 |         if len(subvec) > 0:
286 |             L2 = 0.0
287 |             for word, weight in subvec:
288 |                 L2 += weight**2
289 |             if L2 == 0:
290 |                 L2 = 1
291 |             self.subvecs_L2_norms[j,0] = 1.0/(L2**0.5)
292 |              
293 |             for word, weight in subvec:
294 |                 if (weight != 0):
295 |                     self.subs_matrix[j, self.w2i[word]] = weight 
296 |         else:
297 |             self.subvecs_L2_norms[j,0] = 1.0 # dummy NORM
298 |             
299 |             
300 |         if self.bow_size >= 0: # using the bow_matrix for sim between contexts
301 |             
302 |             text_matrix, found_word = self.__context_text_to_vec(context_instance)
303 |             
304 |             if (self.embeddings == None):
305 |                 text_matrix = text_matrix.transpose()
306 |                 
307 |                 for (zero, word_ind), value in text_matrix.iteritems():
308 |                     self.bow_matrix[j, word_ind] = value
309 |                 
310 |                 if found_word == True:
311 |                     L2 = 0
312 |                     for val in text_matrix.itervalues():
313 |                         L2 += val**2       
314 |                     self.bow_L2_norms[j,0] = 1.0 / (L2**0.5)
315 |                 else:             
316 |                     self.bow_L2_norms[j,0] = 1.0 # dummy NORM
317 |             else:
318 |                 self.bow_matrix[j, :] = text_matrix
319 | 
320 |         
321 |     def __reference_context_imp(self, subvec_matrix, context, bow_interpolate):
322 |                       
323 |         if bow_interpolate == 1:
324 |             self.bow_sim_scores = self.__reference_context_bow(context)
325 |             self.sim_scores = self.bow_sim_scores            
326 |         elif bow_interpolate == 0:
327 |             self.subvecs_sim_scores = self.__reference_context_subvec(subvec_matrix)
328 |             self.sim_scores = self.subvecs_sim_scores
329 |         else:
330 |             try: 
331 |                 self.bow_sim_scores = self.__reference_context_bow(context)
332 |                 self.bow_sim_scores.data = self.bow_sim_scores.data**bow_interpolate
333 |             except Exception as e:
334 |                 print e
335 |                 print context
336 |                 raise e 
337 |             self.subvecs_sim_scores = self.__reference_context_subvec(subvec_matrix)
338 |             self.subvecs_sim_scores.data = self.subvecs_sim_scores.data**(1-bow_interpolate)
339 |             self.sim_scores = self.subvecs_sim_scores.multiply(self.bow_sim_scores)
340 |                                
341 |         return subvec_matrix
342 | 
343 |    
344 |     def __reference_context_bow(self, context):
345 |         
346 |         refvec_matrix, found_word = self.__context_text_to_vec(context)
347 |         sims = self.__compute_sim_scores(refvec_matrix, self.bow_matrix, self.bow_L2_norms, self.embeddings != None)       
348 |         return sims
349 |         
350 |     
351 |     def __reference_context_subvec(self, refvec_matrix):        
352 |         sims = self.__compute_sim_scores(refvec_matrix, self.subs_matrix, self.subvecs_L2_norms, False)        
353 |         return sims
354 |     
355 |     
356 |     
357 |     def __compute_sim_scores(self, refvec_matrix, allvecs_matrix, L2_norms, is_embeddings):
358 |         contexts_sims = allvecs_matrix.dot(refvec_matrix)        
359 |                 
360 |         if is_embeddings:
361 |             contexts_sims = (contexts_sims + 1) / 2 # map cosine to [0,1]
362 |             contexts_sims = np.reshape(contexts_sims, (len(contexts_sims), 1))
363 |             contexts_sims = csr_matrix(contexts_sims.tolist())     
364 |         if L2_norms != None:
365 |             contexts_sims = contexts_sims.multiply(L2_norms)           
366 |             refvec_dp = refvec_matrix.transpose().dot(refvec_matrix)
367 |             refvec_L2_norm = refvec_dp.data.max()**0.5 if len(refvec_dp.data) > 0 else 1.0
368 |             contexts_sims.data /= refvec_L2_norm # weights -1 <= cosine <= 1, but in practice greater than zero because all weights >= 0
369 | 
370 |         return contexts_sims
371 |     
372 |     def __context_text_to_vec(self, context_instance):
373 |         found_word = False        
374 |         
375 |         if self.embeddings != None:
376 |             dimensionality = self.embeddings.dimension()
377 |             weight_dtype = np.float32
378 |             w2ind = self.w2i
379 |             text_matrix = np.zeros((dimensionality,), dtype=weight_dtype)           
380 |         else:
381 |             dimensionality = len(self.w2i)
382 |             weight_dtype = np.float32 if self.args.tfidf else np.int8
383 |             w2ind = self.w2i
384 |             text_matrix = dok_matrix((dimensionality,1), dtype=weight_dtype)
385 |         
386 |         context_text_tokens = context_instance.get_context_tokens()
387 |         target_pos = context_instance.target_ind
388 |         
389 |         if (self.bow_size > 0):                                    
390 |             start_pos = max(target_pos-self.bow_size, 0)
391 |             end_pos = min(target_pos+self.bow_size+1, len(context_text_tokens))
392 |             context_text_tokens = context_text_tokens[start_pos:end_pos]
393 |             target_pos = target_pos-start_pos
394 |                        
395 |         stopwords = self.stopwords
396 |         context_text_inds_left = [w2ind[word] for word in context_text_tokens[:target_pos] if word not in stopwords and word in w2ind]    
397 |         context_text_inds_right = [w2ind[word] for word in context_text_tokens[target_pos+1:] if word not in stopwords and word in w2ind] if (target_pos+1) < len(context_text_tokens) else []
398 |                              
399 |         all_words_inds = context_text_inds_left+context_text_inds_right
400 |         total_weights = 0.0
401 |         for word_ind in all_words_inds:
402 |             w = self.i2w[word_ind]
403 |             if self.args.tfidf:                
404 |                 wcount = self.w2counts[w]
405 |                 log_idf = math.log(float(self.sum_word_counts)/wcount)
406 |                 log_idf -= self.args.tfidf_offset
407 |                 if (log_idf <= self.args.tfidf_threshold):
408 |                     log_idf = 0.0
409 |                 weight = log_idf
410 |             else:
411 |                 weight = 1
412 |             
413 |             if weight !=0:
414 |                 found_word = True
415 |                 if (self.embeddings != None):
416 |                     if w in self.embeddings:                    
417 |                         wordvec = self.embeddings.represent(w).transpose()
418 |                         text_matrix = text_matrix + (wordvec * weight)
419 |                     else:
420 |                         weight = 0.0
421 |                 else:
422 |                     text_matrix[word_ind,0] += weight
423 |                 total_weights += weight
424 |           
425 |         # embeddings representations are always normalized
426 |         if (self.embeddings != None):
427 |             if total_weights != 0:
428 |                 text_matrix /= total_weights
429 |             norm = np.sqrt(np.sum(text_matrix*text_matrix))
430 |             if norm != 0:
431 |                 text_matrix /= norm
432 | 
433 |         return text_matrix, found_word
434 |     
435 |     
436 |     def __vec_to_sorted_list(self, subvec, max_n):
437 |         sub_list = np.array(subvec)[0].tolist()
438 |         n = min(max_n, subvec.nonzero()[0].shape[1]) if max_n > 0 else subvec.nonzero()[0].shape[1] 
439 |         sub_list_sorted  = heapq.nlargest(n, enumerate(sub_list), key=lambda x: x[1])
440 |         sub_list = [(self.i2w[i], weight) for i, weight in sub_list_sorted]
441 |         return sub_list
442 |     
443 |                
444 |     def to_str(self, top_contexts, top_inferences_per_context):
445 |         
446 |         contexts_weights_sorted = heapq.nlargest(top_contexts, self.sim_scores.todok().iteritems(), key=lambda x: x[1])
447 |         output_items = []
448 |         for (j,k), context_weight in contexts_weights_sorted:
449 |             subvec = self.subs_matrix[j, :].todok()
450 |             sub_list_sorted  = heapq.nlargest(top_inferences_per_context, subvec.iteritems(), key=lambda x: x[1])
451 |             sub_strs = [' '.join([self.i2w[i], wf2ws(weight)]) for (k,i), weight in sub_list_sorted]
452 |             output_items.append((context_weight, self.contexts[j].decorate_context() +'\n' + '\t'.join(sub_strs)))
453 |          
454 |         output_lines = ['\t'.join([wf2ws(context_weight), text]) for context_weight, text in output_items]  
455 |         return '\n'.join(output_lines)
456 |         
457 |     
458 |     def __update_pseudos(self, subvec, pseudos, pseudos_label):
459 |         updated_subvec = []
460 |         pseudos_weight = 0.0
461 |         for word, weight in subvec:
462 |             if word in pseudos:
463 |                 pseudos_weight += weight
464 |             else:
465 |                 updated_subvec.append((word, weight))
466 |         if pseudos_weight > 0.0:
467 |             updated_subvec.append((pseudos_label, pseudos_weight))
468 |         
469 |         return sorted(updated_subvec, key=lambda x: x[1], reverse=True)
470 |         
471 |     
472 |        
473 | 


--------------------------------------------------------------------------------