├── .gitignore ├── 20news.bat ├── README.md ├── anatest.py ├── classEval.py ├── corpusLoader.py ├── csv2topic.py ├── file2topic.py ├── psdvec ├── PSDVec.pdf ├── README.md ├── addheader.py ├── analogy.py ├── bench.sh ├── benchspeed.py ├── catbench.py ├── cleancorpus.py ├── competitors │ ├── GloVe-1.2.zip │ ├── glove │ │ ├── demo.sh │ │ ├── rcv1.sh │ │ ├── vocab-rcv1.txt │ │ ├── vocab-wiki.txt │ │ └── wiki.sh │ ├── hyperwords.zip │ ├── hyperwords │ │ ├── pmi2.sh │ │ ├── pmi5-rcv1.sh │ │ ├── pmi5.sh │ │ ├── svd-rcv1.sh │ │ ├── svd.sh │ │ ├── train-rcv1.sh │ │ └── train-wiki.sh │ ├── singular.zip │ └── sparse.zip ├── corpus2liblinear.py ├── eval-logs │ └── bench.log ├── evaluate-toefl.py ├── evaluate.py ├── extractwiki.py ├── fact-rcv1.bat ├── fact-rcv1.sh ├── fact-wiki.bat ├── fact-wiki.sh ├── factorize.py ├── genSentDict.bat ├── gencatdata.py ├── gram-rcv1.bat ├── gram.bat ├── gramcount.pl ├── papers │ └── emnlp2015.pdf ├── patch to gensim.py ├── perlxs.h ├── removeDoubleNewline.pl ├── sent-bench.bat ├── sent-gen.conf ├── sentbench.py ├── tab2list.py ├── testsets │ ├── analogy │ │ ├── EN-TOM-ICLR13-SEM.txt │ │ ├── EN-TOM-ICLR13-SYN.txt │ │ ├── google.txt │ │ └── msr.txt │ └── ws │ │ ├── EN-RG-65.txt │ │ ├── EN-TOEFL-80.txt │ │ ├── bruni_men.txt │ │ ├── luong_rare.txt │ │ ├── radinsky_mturk.txt │ │ ├── simlex_999a.txt │ │ ├── ws353.txt │ │ ├── ws353_relatedness.txt │ │ └── ws353_similarity.txt ├── topwordsInList.py ├── utils.py ├── vecnorms.py └── xml2corpus.pl ├── reuters.bat ├── snippet2topic.py ├── test-docs ├── Drug Goes From 13.50 a Tablet to 750, Overnight.txt ├── VR-mitrv.txt ├── batman-v-superman.txt ├── batman-v-superman.txt-em100.topic.vec ├── beijing-haze-news.txt ├── brain-scar.txt ├── britain-EU.txt ├── drugstory.log ├── hillary-speech.txt ├── hillary-speech2.txt ├── nips-wiki.txt ├── sanders-speeches.txt ├── spacex-news.txt └── trump-speech.txt ├── topic-competitors ├── LDA │ ├── LDAClassify.zip │ ├── Readme.txt │ ├── classEval.py │ ├── corpusLoader.py │ └── ldaExp.py ├── doc2vec.py ├── kmeans.py ├── labelEval.py ├── lftm2svm.py ├── liu-doc2vec.py ├── rajarshd-Gaussian_LDA.zip ├── sHDP.zip └── slda │ ├── 20news-test-7532.slda-bow.txt │ ├── 20news-test-7532.slda-label.txt │ ├── 20news-train-11314.slda-bow.txt │ ├── 20news-train-11314.slda-label.txt │ ├── Makefile │ ├── corpus.cpp │ ├── corpus.h │ ├── cygblas-0.dll │ ├── images.tgz │ ├── main.cpp │ ├── opt.cpp │ ├── opt.h │ ├── readme.txt │ ├── reuters-test-2255.slda-bow.txt │ ├── reuters-test-2255.slda-label.txt │ ├── reuters-train-5770.slda-bow.txt │ ├── reuters-train-5770.slda-label.txt │ ├── settings.h │ ├── settings.txt │ ├── slda │ ├── slda.cpp │ ├── slda.exe │ ├── slda.h │ ├── test-data.dat │ ├── test-label.dat │ ├── train-data.dat │ ├── train-label.dat │ ├── utils.cpp │ └── utils.h ├── topic-cosine.py ├── topicExp.py ├── topicvec-ext.pdf ├── topicvecDir.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | commit.bat 2 | *.pyc 3 | -------------------------------------------------------------------------------- /20news.bat: -------------------------------------------------------------------------------- 1 | python topicExp.py -s 20news train 2 | python topicExp.py -i 20news-train-11314-sep281-em150-best.topic.vec 20news train,test 3 | python classEval.py 20news topicprop 4 | python classEval.py 20news topic-wvavg 5 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
 1 | # TopicVec
 2 | TopicVec is the source code for "Generative Topic Embedding: a Continuous Representation of Documents" (ACL 2016).
 3 | 
 4 | PSDVec (in folder 'psdvec') is the source code for "A Generative Word Embedding Model and its Low Rank Positive Semidefinite Solution" (EMNLP 2015).
 5 | 
 6 | #### Update v0.7:
 7 | The topic inference is now 6 times faster.
 8 | 
 9 | #### Update v0.6:
10 | ##### Algorithm update:
11 | topicvecDir.py: uses exact inference instead of a second-order approximation in the M-step.
12 | 
13 | #### Update v0.5:
14 | ##### Main algorithm:
15 | topicvecDir.py: uses a Dirichlet prior for topic mixing proportions.
16 | 
17 | #### Required files on Dropbox:
18 | https://www.dropbox.com/sh/lqbk3iioobegbp8/AACc8Kfr1KZIkKl9bGaIrOjfa?dl=0
19 | 
20 | 1. Pretrained 180000 embeddings (25000 core words) in 3 archives. For faster loading into Python, 25000-180000-500-BLK-8.0.vec.npy can be used;
21 | 2. Unigram files top1grams-wiki.txt & top1grams-reuters.txt;
22 | 3. RCV1 cleansed corpus ( before downloading, please apply for permission from NIST according to: http://trec.nist.gov/data/reuters/reuters.html ).
23 | 
24 | If you are in China, you can also download the above files from Baidu Netdisk, without the hassle of "climbing over the wall":
25 | https://pan.baidu.com/s/1gVmRhK1HA2XwVWZbZHHLZQ#list/path=%2F
26 | 
--------------------------------------------------------------------------------
/anatest.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from utils import *
 3 | 
 4 | embedding_arrays = np.load("25000-180000-500-BLK-8.0.vec.npy")
 5 | V, vocab, word2ID, skippedWords_whatever = embedding_arrays
 6 | model = VecModel(V, vocab, word2ID, vecNormalize=True)
 7 | w1, w2 = predict_ana(model, "fish", "water", "plant", "soil")
 8 | print w1, w2
 9 | w1, w2 = predict_ana(model, "player", "team", "student", "classroom")
10 | print w1, w2
11 | 
--------------------------------------------------------------------------------
/classEval.py:
--------------------------------------------------------------------------------
 1 | from sklearn import svm, metrics
 2 | from sklearn.datasets import load_svmlight_file
 3 | import sys
 4 | 
 5 | def getScores( true_classes, pred_classes, average):
 6 |     precision = metrics.precision_score( true_classes, pred_classes, average=average )
 7 |     recall = metrics.recall_score( true_classes, pred_classes, average=average )
 8 |     f1 = metrics.f1_score( true_classes, pred_classes, average=average )
 9 |     accuracy = metrics.accuracy_score( true_classes, pred_classes )
10 |     return precision, recall, f1, accuracy
11 | 
12 | """
13 | TopicProp_ITER = int(sys.argv[1])
14 | topicNum = int(sys.argv[2])
15 | train_file = "20news-train-11314-sep%d-em40-i%d.topic.prop" %(topicNum, TopicProp_ITER)
16 | test_file = "20news-test-7532-sep%d-em40-i%d.topic.prop" %(topicNum, TopicProp_ITER)
17 | 
18 | train_features, train_docs_cat_name = load_matrix_from_text( train_file, "training proportion", "\t" )
19 | test_features, test_docs_cat_name = load_matrix_from_text( test_file, "test proportion", "\t" )
20 | 
21 | true_train_classes = []
22 | true_test_classes = []
23 | 
24 | for train_cat_name in train_docs_cat_name[0]:
25 |     true_train_classes.append( int(train_cat_name) )
26 | for test_cat_name in test_docs_cat_name[0]:
27 |     true_test_classes.append( int(test_cat_name) )
28 | """
29 | 
30 | corpus = sys.argv[1]
31 | filetype = sys.argv[2]
32 | # selected
feature dimensions can be specified in the last argument as: 33 | # 1-400 (starting from 1) 34 | if len(sys.argv) > 3: 35 | dims = sys.argv[3].split("-") 36 | dims[0] = int(dims[0]) - 1 37 | dims[1] = int(dims[1]) 38 | else: 39 | dims = None 40 | 41 | if corpus == '20news': 42 | train_file = "20news-train-11314.svm-%s.txt" %filetype 43 | test_file = "20news-test-7532.svm-%s.txt" %filetype 44 | else: 45 | train_file = "reuters-train-5770.svm-%s.txt" %filetype 46 | test_file = "reuters-test-2255.svm-%s.txt" %filetype 47 | 48 | train_features_sparse, true_train_classes = load_svmlight_file(train_file) 49 | test_features_sparse, true_test_classes = load_svmlight_file(test_file) 50 | #nonzeroColIDs = np.union1d( train_features_sparse.nonzero()[1], test_features_sparse.nonzero()[1] ) 51 | #train_features = train_features_sparse[:, nonzeroColIDs].toarray() 52 | #test_features = test_features_sparse[:, nonzeroColIDs].toarray() 53 | 54 | #pdb.set_trace() 55 | #print "%dx%d sparse feature matrices reduced to %dx%d" %( tuple(train_features_sparse.shape) + 56 | # tuple(train_features.shape) ) 57 | 58 | train_features = train_features_sparse.toarray() 59 | test_features = test_features_sparse.toarray() 60 | 61 | print "Train: %dx%d. Test: %dx%d" %( tuple( train_features.shape + test_features.shape ) ) 62 | 63 | if dims: 64 | train_features = train_features[ :, dims[0]:dims[1] ] 65 | test_features = test_features[ :, dims[0]:dims[1] ] 66 | print "Choose only features %d-%d" %( dims[0]+1, dims[1] ) 67 | else: 68 | train_features = train_features[ :, : ] 69 | test_features = test_features[ :, : ] 70 | 71 | model = svm.LinearSVC(penalty='l1', dual=False) 72 | 73 | print "Training...", 74 | model.fit( train_features, true_train_classes ) 75 | print "Done." 76 | 77 | pred_train_classes = model.predict( train_features ) 78 | pred_test_classes = model.predict( test_features ) 79 | 80 | print metrics.classification_report(true_train_classes, pred_train_classes, digits=3) 81 | print metrics.classification_report(true_test_classes, pred_test_classes, digits=3) 82 | 83 | for average in ['micro', 'macro']: 84 | train_precision, train_recall, train_f1, train_acc = getScores( true_train_classes, pred_train_classes, average ) 85 | print "Train Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average, 86 | train_precision, train_recall, train_f1, train_acc ) 87 | 88 | test_precision, test_recall, test_f1, test_acc = getScores( true_test_classes, pred_test_classes, average ) 89 | print "Test Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average, 90 | test_precision, test_recall, test_f1, test_acc ) 91 | -------------------------------------------------------------------------------- /corpusLoader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/corpusLoader.py -------------------------------------------------------------------------------- /csv2topic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | import pdb 5 | import os 6 | import csv 7 | from topicvecDir import topicvecDir 8 | from utils import * 9 | 10 | customStopwords = "based via using approach learning multi algorithm algorithms" 11 | 12 | config = dict( csv_filenames = None, 13 | short_name = None, 14 | unigramFilename = "top1grams-wiki.txt", 15 | word_vec_file = "25000-180000-500-BLK-8.0.vec", 16 | K = 20, 
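                # (Documentation added for clarity; these defaults mirror the command-line
                # flags parsed in getOptions() below.)
                #   K               : number of topic embeddings (topics) to extract (-k)
                #   unigramFilename : unigram file used to obtain unigram probabilities (-u)
                #   word_vec_file   : pretrained embedding file of all words
                #   MAX_EM_ITERS    : number of iterations of the EM procedure (-i)
                #   max_l           : magnitude cap of the topic embeddings (-l)
                #   alpha1          : hyperparameter alpha (-a)
                #   seed            : random number generator seed, used to repeat experiments (-s)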
17 | N0 = 500, 18 | max_l = 5, 19 | init_l = 1, 20 | max_grad_norm = 0, 21 | # cap the sum of Em when updating topic embeddings 22 | # to avoid too big gradients 23 | grad_scale_Em_base = 2500, 24 | topW = 30, 25 | topTopicMassFracPrintThres = 0.1, 26 | alpha0 = 0.1, 27 | alpha1 = 0.1, 28 | iniDelta = 0.1, 29 | MAX_EM_ITERS = 100, 30 | topicDiff_tolerance = 2e-3, 31 | printTopics_iterNum = 10, 32 | zero_topic0 = True, 33 | useDrdtApprox = False, 34 | customStopwords = customStopwords, 35 | remove_stop = True, 36 | normalize_vecs = False, 37 | # shift all embeddings in a document, so that their average is 0 38 | rebase_vecs = True, 39 | rebase_norm_thres = 0.2, 40 | evalKmeans = False, 41 | verbose = 1, 42 | seed = 0 43 | ) 44 | 45 | def usage(): 46 | print """topicvecDir.py [ -v vec_file -a alpha ... ] csv_file 47 | Options: 48 | -k: Number of topic embeddings to extract. Default: 20 49 | -v: Existing embedding file of all words. 50 | -r: Existing residual file of core words. 51 | -a: Hyperparameter alpha. Default: 0.1. 52 | -i: Number of iterations of the EM procedure. Default: 100 53 | -u: Unigram file, to obtain unigram probs. 54 | -l: Magnitude of topic embeddings. 55 | -A: Append to the old log file. 56 | -s: Seed the random number generator to x. Used to repeat experiments 57 | -n: Nickname (short name) for the csv_file 58 | """ 59 | 60 | def getOptions(): 61 | global config 62 | 63 | try: 64 | opts, args = getopt.getopt(sys.argv[1:],"k:v:i:u:l:s:n:Ah") 65 | if len(args) < 1: 66 | raise getopt.GetoptError("") 67 | config['csv_filenames'] = args 68 | 69 | for opt, arg in opts: 70 | if opt == '-k': 71 | config['K'] = int(arg) 72 | if opt == '-v': 73 | config['vec_file'] = arg 74 | if opt == '-a': 75 | config['alpha1'] = float(opt) 76 | if opt == '-i': 77 | config['MAX_EM_ITERS'] = int(arg) 78 | if opt == '-u': 79 | config['unigramFilename'] = arg 80 | if opt == '-l': 81 | config['max_l'] = int(arg) 82 | if opt == '-s': 83 | config['seed'] = int(arg) 84 | if opt == '-A': 85 | config['appendLogfile'] = True 86 | if opt == '-n': 87 | config['short_name'] = arg 88 | if opt == '-r': 89 | config['useDrdtApprox'] = True 90 | if opt == '-h': 91 | usage() 92 | sys.exit(0) 93 | 94 | basename = os.path.basename(args[0]) 95 | if config['short_name']: 96 | config['logfilename'] = config['short_name'] 97 | elif len(args) > 1: 98 | config['logfilename'] = "(%d)%s" %( len(args), basename ) 99 | else: 100 | config['logfilename'] = basename 101 | 102 | except getopt.GetoptError: 103 | usage() 104 | sys.exit(2) 105 | 106 | return config 107 | 108 | def main(): 109 | config = getOptions() 110 | 111 | docwords = [] 112 | csvfiles_filecount = 0 113 | csvfiles_wc = 0 114 | csvfiles_rowcount = 0 115 | file_rownames = [] 116 | for csv_filename in config['csv_filenames']: 117 | csvfile_wc = 0 118 | csvfile_rowcount = 0 119 | with open(csv_filename) as DOC: 120 | docreader = csv.reader(DOC) 121 | for row in docreader: 122 | doc = row[0] 123 | wordsInSentences, wc = extractSentenceWords(doc, min_length=2) 124 | csvfile_wc += wc 125 | csvfile_rowcount += 1 126 | docwords.append(wordsInSentences) 127 | file_rownames.append( "%s-row%d" %(csv_filename, csvfile_rowcount) ) 128 | csvfile_avgwc = csvfile_wc * 1.0 / csvfile_rowcount 129 | print "%d words extracted from %d rows in '%s'. 
Avg %.1f words each row" %( csvfile_wc, 130 | csvfile_rowcount, csv_filename, csvfile_avgwc ) 131 | 132 | csvfiles_wc += csvfile_wc 133 | csvfiles_rowcount += csvfile_rowcount 134 | csvfiles_filecount += 1 135 | csvfiles_avgwc = csvfiles_wc * 1.0 / csvfiles_rowcount 136 | if csvfiles_filecount > 1: 137 | print "%d words extracted from %d rows in %d csv files. Avg %.1f words each row" %(csvfiles_wc, 138 | csvfiles_rowcount, csvfiles_filecount, csvfiles_avgwc) 139 | 140 | topicvec = topicvecDir(**config) 141 | topicvec.setDocs( docwords, file_rownames ) 142 | 143 | if 'evalKmeans' in config and config['evalKmeans']: 144 | topicvec.kmeans() 145 | topicvec.printTopWordsInTopic(None, True) 146 | exit(0) 147 | 148 | best_last_Ts, Em, docs_Em, Pi = topicvec.inference() 149 | 150 | basename = os.path.basename(config['logfilename']) 151 | basetrunk = os.path.splitext(basename)[0] 152 | 153 | best_it, best_T, best_loglike = best_last_Ts[0] 154 | save_matrix_as_text( basetrunk + "-em%d-best.topic.vec" %best_it, "topic", best_T ) 155 | 156 | if best_last_Ts[1]: 157 | last_it, last_T, last_loglike = best_last_Ts[1] 158 | save_matrix_as_text( basetrunk + "-em%d-last.topic.vec" %last_it, "topic", last_T ) 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /file2topic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | import pdb 5 | import os 6 | from topicvecDir import topicvecDir 7 | from utils import * 8 | 9 | customStopwords = "based via using approach learning multi algorithm algorithms" 10 | 11 | config = dict( doc_filenames = None, 12 | short_name = None, 13 | unigramFilename = "top1grams-wiki.txt", 14 | word_vec_file = "25000-180000-500-BLK-8.0.vec", 15 | K = 20, 16 | N0 = 500, 17 | max_l = 5, 18 | init_l = 1, 19 | max_grad_norm = 0, 20 | # cap the sum of Em when updating topic embeddings 21 | # to avoid too big gradients 22 | grad_scale_Em_base = 2500, 23 | topW = 30, 24 | topTopicMassFracPrintThres = 0.1, 25 | alpha0 = 0.1, 26 | alpha1 = 0.1, 27 | iniDelta = 0.1, 28 | MAX_EM_ITERS = 100, 29 | topicDiff_tolerance = 2e-3, 30 | printTopics_iterNum = 10, 31 | zero_topic0 = True, 32 | useDrdtApprox = False, 33 | customStopwords = customStopwords, 34 | remove_stop = True, 35 | normalize_vecs = False, 36 | # shift all embeddings in a document, so that their average is 0 37 | rebase_vecs = True, 38 | rebase_norm_thres = 0.2, 39 | evalKmeans = False, 40 | verbose = 1, 41 | seed = 0 42 | ) 43 | 44 | def usage(): 45 | print """topicvecDir.py [ -v vec_file -a alpha ... ] doc_file 46 | Options: 47 | -k: Number of topic embeddings to extract. Default: 20 48 | -v: Existing embedding file of all words. 49 | -r: Existing residual file of core words. 50 | -a: Hyperparameter alpha. Default: 0.1. 51 | -i: Number of iterations of the EM procedure. Default: 100 52 | -u: Unigram file, to obtain unigram probs. 53 | -l: Magnitude of topic embeddings. 54 | -A: Append to the old log file. 55 | -s: Seed the random number generator to x. 
Used to repeat experiments 56 | -n: Nickname (short name) for the doc_file 57 | """ 58 | 59 | def getOptions(): 60 | global config 61 | 62 | try: 63 | opts, args = getopt.getopt(sys.argv[1:],"k:v:i:u:l:s:n:Ah") 64 | if len(args) < 1: 65 | raise getopt.GetoptError("") 66 | config['doc_filenames'] = args 67 | 68 | for opt, arg in opts: 69 | if opt == '-k': 70 | config['K'] = int(arg) 71 | if opt == '-v': 72 | config['vec_file'] = arg 73 | if opt == '-a': 74 | config['alpha1'] = float(opt) 75 | if opt == '-i': 76 | config['MAX_EM_ITERS'] = int(arg) 77 | if opt == '-u': 78 | config['unigramFilename'] = arg 79 | if opt == '-l': 80 | config['max_l'] = int(arg) 81 | if opt == '-s': 82 | config['seed'] = int(arg) 83 | if opt == '-A': 84 | config['appendLogfile'] = True 85 | if opt == '-n': 86 | config['short_name'] = arg 87 | if opt == '-r': 88 | config['useDrdtApprox'] = True 89 | if opt == '-h': 90 | usage() 91 | sys.exit(0) 92 | 93 | if config['short_name']: 94 | config['logfilename'] = config['short_name'] 95 | elif len(args) > 1: 96 | config['logfilename'] = "(%d)%s" %( len(args), args[0] ) 97 | else: 98 | config['logfilename'] = args[0] 99 | 100 | except getopt.GetoptError: 101 | usage() 102 | sys.exit(2) 103 | 104 | return config 105 | 106 | def main(): 107 | config = getOptions() 108 | 109 | docwords = [] 110 | for doc_filename in config['doc_filenames']: 111 | with open(doc_filename) as DOC: 112 | doc = DOC.readlines() 113 | doc = "".join(doc) 114 | 115 | wordsInSentences, wc = extractSentenceWords(doc, 2) 116 | print "%d words extracted from '%s'" %(wc, doc_filename) 117 | docwords.append(wordsInSentences) 118 | 119 | topicvec = topicvecDir(**config) 120 | topicvec.setDocs( docwords, config['doc_filenames'] ) 121 | 122 | if 'evalKmeans' in config and config['evalKmeans']: 123 | topicvec.kmeans() 124 | topicvec.printTopWordsInTopic(None, True) 125 | exit(0) 126 | 127 | best_last_Ts, Em, docs_Em, Pi = topicvec.inference() 128 | 129 | basename = os.path.basename(config['logfilename']) 130 | basetrunk = os.path.splitext(basename)[0] 131 | 132 | best_it, best_T, best_loglike = best_last_Ts[0] 133 | save_matrix_as_text( basetrunk + "-em%d-best.topic.vec" %best_it, "topic", best_T ) 134 | 135 | if best_last_Ts[1]: 136 | last_it, last_T, last_loglike = best_last_Ts[1] 137 | save_matrix_as_text( basetrunk + "-em%d-last.topic.vec" %last_it, "topic", last_T ) 138 | 139 | if __name__ == '__main__': 140 | main() 141 | -------------------------------------------------------------------------------- /psdvec/PSDVec.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/PSDVec.pdf -------------------------------------------------------------------------------- /psdvec/README.md: -------------------------------------------------------------------------------- 1 | # PSDVec 2 | PSDVec is the source code for "A Generative Word Embedding Model and its Low Rank Positive Semidefinite Solution" (EMNLP 2015). 3 | 4 | See "PSDVec.pdf" for a manual (```PSDVec: a Toolbox for Incremental and Scalable Word Embedding```, accepted by Neurocomputing, 2016). 5 | 6 | #### Update v0.42: Tikhonov Regularization (=Spherical Gaussian Prior) to embeddings in block-wise factorization: 7 | 1. Obtain 25000 core embeddings using Weighted PSD Approximation, into _25000-500-EM.vec_: 8 | * ```python factorize.py -w 25000 top2grams-wiki.txt``` 9 | 2. 
Obtain 55000 noncore embeddings using Weighted Least Squares, totaling 80000 (25000 cores + 55000 noncores), into _25000-80000-500-BLK-2.0.vec_:
10 | * ```python factorize.py -v 25000-500-EM.vec -o 55000 -t2 top2grams-wiki.txt```
11 | 3. Incrementally learn another 50000 noncore embeddings (based on the 25000 cores), into _25000-130000-500-BLK-4.0.vec_:
12 | * ```python factorize.py -v 25000-80000-500-BLK-2.0.vec -b 25000 -o 50000 -t4 top2grams-wiki.txt```
13 | 4. Repeat step 3, with Tikhonov coeff = 8, to get more embeddings of rarer words, into _25000-180000-500-BLK-8.0.vec_:
14 | * ```python factorize.py -v 25000-130000-500-BLK-4.0.vec -b 25000 -o 50000 -t8 top2grams-wiki.txt```
15 | 
16 | Pretrained 180,000 embeddings and evaluation results are uploaded. The performance is now systematically better than that of the other methods.
17 | 
18 | #### Update v0.41: Gradient Descent (GD) solution:
19 | * ```python factorize.py -G 500 -w 120000 top2grams-wiki.txt```
20 | * GD is fast and scalable, but the performance is much worse (~10% lower on the testsets). It is not recommended, unless initialized with unweighted eigendecomposition (which is still not scalable).
21 | 
22 | #### Update v0.4: Online Block-wise Factorization
23 | 
24 | Testsets are by courtesy of Omer Levy (https://bitbucket.org/omerlevy/hyperwords/src).
25 | 
26 | The Gradient Descent algorithm was based on a suggestion of Peilin Zhao (not included as part of the papers).
27 | 
--------------------------------------------------------------------------------
/psdvec/addheader.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os
 3 | import sys
 4 | import re
 5 | 
 6 | oldVecFilename = sys.argv[1]
 7 | newVecFilename = sys.argv[2]
 8 | 
 9 | stream = os.popen( "wc %s" %oldVecFilename )
10 | output = stream.read()
11 | output = output.strip()
12 | linecount, wordcount, charcount, filename = re.split(" +", output)
13 | linecount = int(linecount)
14 | wordcount = int(wordcount)
15 | 
16 | if wordcount % linecount != 0:
17 |     print "Error: line count %d does not divide word count %d" %(linecount, wordcount)
18 |     sys.exit(1)
19 | 
20 | veclen = wordcount / linecount - 1
21 | print "%d %d" %(linecount, veclen)
22 | VEC = open(newVecFilename, "w")
23 | VEC.write( "%d %d\n" %(linecount, veclen) )
24 | VEC.close()
25 | os.popen( "cat %s >> %s" %(oldVecFilename, newVecFilename) )
26 | 
27 | stream = os.popen( "ls -l %s" %oldVecFilename )
28 | print stream.read().strip()
29 | stream = os.popen( "ls -l %s" %newVecFilename )
30 | print stream.read().strip()
31 | 
--------------------------------------------------------------------------------
/psdvec/analogy.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | from utils import *
 4 | 
 5 | def pred_ana( model, a, a2, b, maxcands = 10 ):
 6 |     questWordIndices = [ model.word2id[x] for x in (a,a2,b) ]
 7 |     # b2 is effectively iterating through the vocab.
The row is all the cosine values 8 | b2a2 = model.sim_row(a2) 9 | b2a = model.sim_row(a) 10 | b2b = model.sim_row(b) 11 | 12 | mulsims = ( b2a2 + 1 ) * ( b2b + 1 ) / ( b2a + 1.001 ) 13 | mulsims[questWordIndices] = -10000 14 | b2s = [] 15 | for i in xrange(maxcands): 16 | imul = np.nanargmax(mulsims) 17 | b2mul = model.vocab[imul] 18 | b2s.append( [ b2mul, mulsims[imul] ] ) 19 | mulsims[imul] = -10000 20 | 21 | return b2s 22 | 23 | embedding_npyfile = "25000-180000-500-BLK-8.0.vec.npy" 24 | embedding_arrays = np.load(embedding_npyfile) 25 | V, vocab, word2ID, skippedWords_whatever = embedding_arrays 26 | print "%d words loaded from '%s'" %(len(vocab), embedding_npyfile) 27 | model = VecModel(V, vocab, word2ID, vecNormalize=True) 28 | print "Model initialized. Ready for input:" 29 | 30 | while True: 31 | line = raw_input() 32 | line = line.strip() 33 | words = re.split("\s+", line) 34 | if len(words) != 3: 35 | print "Only 3 words are allowed" 36 | continue 37 | 38 | oov = 0 39 | for w in words: 40 | if w not in model: 41 | print "'%s' not in vocab" %w 42 | oov += 1 43 | if oov > 0: 44 | continue 45 | 46 | a, a2, b = words 47 | b2s = pred_ana( model, a, a2, b ) 48 | for word, sim in b2s: 49 | print word, sim 50 | print 51 | 52 | -------------------------------------------------------------------------------- /psdvec/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export ROOT=/home/shaohua/D 3 | #export CORPUS=$ROOT/corpus/cleanwiki.txt 4 | #export DIM=500 5 | #export MINCOUNT=100 6 | #export SUFFIX=wiki 7 | export CORPUS=$ROOT/corpus/rcv1clean.txt 8 | export DIM=50 9 | export MINCOUNT=50 10 | export SUFFIX=rcv1 11 | 12 | cd $ROOT/corpus/ 13 | echo PSD: 14 | ./fact-$SUFFIX.sh 15 | cd $ROOT/word2vec 16 | echo word2vec: 17 | time ./word2vec -train $CORPUS -output $ROOT/corpus/word2vec-$SUFFIX.vec -size $DIM -window 5 -sample 1e-4 -negative 15 -min-count $MINCOUNT 18 | cd $ROOT/corpus/glove/ 19 | echo glove: 20 | time ./$SUFFIX.sh 21 | cd $ROOT/corpus/singular/ 22 | echo singular: 23 | time ./singular --corpus $CORPUS --output ./$SUFFIX --rare $MINCOUNT --window 3 --dim $DIM 24 | echo PPM and SVD: 25 | cd $ROOT/corpus/hyperwords 26 | ./train-$SUFFIX.sh 27 | echo Sparse: 28 | tail -n+2 $ROOT/corpus/word2vec-$SUFFIX.vec > $ROOT/corpus/word2vec-$SUFFIX-headless.vec 29 | cd $ROOT/corpus/sparse/ 30 | time ./sparse ../word2vec-$SUFFIX-headless.vec 5 0.5 1e-5 4 sparse-$SUFFIX.vec 31 | -------------------------------------------------------------------------------- /psdvec/benchspeed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | class Timer(object): 5 | def __init__(self, name=None): 6 | self.name = name 7 | self.tstart = time.time() 8 | self.tlast = self.tstart 9 | self.firstCall = True 10 | 11 | def getElapseTime(self, isStr=True): 12 | totalElapsed = time.time() - self.tstart 13 | # elapsed time since last call 14 | interElapsed = time.time() - self.tlast 15 | self.tlast = time.time() 16 | 17 | firstCall = self.firstCall 18 | self.firstCall = False 19 | 20 | if isStr: 21 | if self.name: 22 | if firstCall: 23 | return '%s elapsed: %.2f' % ( self.name, totalElapsed ) 24 | return '%s elapsed: %.2f/%.2f' % ( self.name, totalElapsed, interElapsed ) 25 | else: 26 | if firstCall: 27 | return 'Elapsed: %.2f' % ( totalElapsed ) 28 | return 'Elapsed: %.2f/%.2f' % ( totalElapsed, interElapsed ) 29 | else: 30 | return totalElapsed, interElapsed 31 | 32 | def 
printElapseTime(self): 33 | print self.getElapseTime() 34 | 35 | def timeToStr(timeNum, fmt="%H:%M:%S"): 36 | timeStr = time.strftime(fmt, time.localtime(timeNum)) 37 | return timeStr 38 | 39 | def block_factorize( core_size, noncore_size, N0, tikhonovCoeff ): 40 | # new WGsum: noncore_size * core_size 41 | WGsum = np.random.random((noncore_size,core_size)) 42 | Wsum = np.random.random((noncore_size,core_size)) 43 | Wsum[ np.isclose(Wsum,0) ] = 0.001 44 | Gwmean = WGsum 45 | 46 | V1 = np.random.random((core_size,N0)) 47 | # embeddings of noncore words 48 | # new V2: noncore_size * N0 49 | V2 = np.zeros( ( noncore_size, N0 ), dtype=np.float32 ) 50 | Tikhonov = np.identity(N0) * tikhonovCoeff 51 | 52 | timer = Timer() 53 | 54 | print "Begin finding embeddings of non-core words" 55 | 56 | # Find each noncore word's embedding 57 | for i in xrange(noncore_size): 58 | # core_size 59 | wi = Wsum[i] 60 | # new VW: N0 * core_size 61 | VW = V1.T * wi 62 | # new VWV: N0 * N0 63 | VWV = VW.dot(V1) 64 | if False: 65 | VWV_Tik = VWV + Tikhonov 66 | V2[i] = np.linalg.inv(VWV_Tik).dot( VW.dot(Gwmean[i]) ) 67 | if i >= 0 and i % 100 == 99: 68 | print "\r%d / %d." %(i+1,noncore_size), 69 | print timer.getElapseTime(), "\r", 70 | 71 | print 72 | 73 | block_factorize(15000, 1000, 500, 2) 74 | #block_factorize(15000, 10000, 50, 2) 75 | -------------------------------------------------------------------------------- /psdvec/catbench.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | testsetNames = [ "ap", "battig", "esslli" ] 4 | testsetCatNums = [ 13, 10, 6 ] 5 | algNames = [ "PSDVec", "word2vec", "CCA" ] 6 | CLmethods = [ "rbr", "direct", "graph" ] 7 | vclusterPath = "D:\\cluto-2.1.2\\MSWIN-x86_64-openmp\\vcluster.exe" 8 | testsetDir = "./concept categorization" 9 | 10 | for CLmethod in CLmethods: 11 | for i, testsetName in enumerate(testsetNames): 12 | for algName in algNames: 13 | vecFilename = testsetDir + "/" + testsetName + "-" + algName + ".vec" 14 | labelFilename = testsetDir + "/" + testsetName + "-" + algName + ".label" 15 | catNum = testsetCatNums[i] 16 | print "%s on %s using %s:" %( algName, testsetName, CLmethod ) 17 | stream = os.popen( '%s -rclassfile="%s" -clmethod=%s "%s" %d' %( vclusterPath, 18 | labelFilename, CLmethod, vecFilename, catNum ) ) 19 | output = stream.read() 20 | lines = output.split("\n") 21 | for line in lines: 22 | if line.find("way clustering") >= 0: 23 | print line 24 | print 25 | -------------------------------------------------------------------------------- /psdvec/cleancorpus.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import gensim.corpora.wikicorpus 4 | 5 | # check and process input arguments 6 | if len(sys.argv) < 3: 7 | print "Usage: cleancorpus.py infile_name outfile_name" 8 | sys.exit(1) 9 | 10 | infilename, outfilename = sys.argv[1:3] 11 | 12 | if os.path.isfile(outfilename): 13 | print "Output file %s exists. Change the file name and try again." 
%outfilename 14 | sys.exit(1) 15 | 16 | linecount = 0 17 | bytecount = 0 18 | wordcount = 0 19 | 20 | output = open(outfilename, 'w') 21 | IN = open(infilename) 22 | for line in IN: 23 | tokens = gensim.corpora.wikicorpus.tokenize(line) 24 | output.write( "%s\n" %(" ".join(tokens)) ) 25 | linecount += 1 26 | bytecount += len(line) 27 | wordcount += len(tokens) 28 | if linecount % 500 == 0: 29 | print "\r%d %d %d \r" %(linecount, bytecount/1024/1024, wordcount), 30 | -------------------------------------------------------------------------------- /psdvec/competitors/GloVe-1.2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/GloVe-1.2.zip -------------------------------------------------------------------------------- /psdvec/competitors/glove/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 4 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 5 | 6 | CORPUS=../rcv1clean.txt 7 | VOCAB_FILE=vocab.txt 8 | COOCCURRENCE_FILE=cooccurrence.bin 9 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 10 | BUILDDIR=build 11 | SAVE_FILE=glove-rcv1.vec 12 | VERBOSE=2 13 | MEMORY=16.0 14 | VOCAB_MIN_COUNT=50 15 | VECTOR_SIZE=50 16 | MAX_ITER=15 17 | WINDOW_SIZE=3 18 | BINARY=0 19 | NUM_THREADS=8 20 | X_MAX=10 21 | 22 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 23 | if [[ $? -eq 0 ]] 24 | then 25 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 26 | if [[ $? -eq 0 ]] 27 | then 28 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 29 | if [[ $? -eq 0 ]] 30 | then 31 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 32 | if [[ $? -eq 0 ]] 33 | then 34 | if [ "$1" = 'matlab' ]; then 35 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 36 | elif [ "$1" = 'octave' ]; then 37 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2 38 | else 39 | python eval/python/evaluate.py 40 | fi 41 | fi 42 | fi 43 | fi 44 | fi 45 | -------------------------------------------------------------------------------- /psdvec/competitors/glove/rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 4 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 5 | 6 | CORPUS=../rcv1clean.txt 7 | VOCAB_FILE=vocab-rcv1.txt 8 | COOCCURRENCE_FILE=cooccurrence-rcv1.bin 9 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf-rcv1.bin 10 | BUILDDIR=build 11 | SAVE_FILE=glove-rcv1.vec 12 | VERBOSE=2 13 | MEMORY=16.0 14 | VOCAB_MIN_COUNT=50 15 | VECTOR_SIZE=50 16 | MAX_ITER=15 17 | WINDOW_SIZE=3 18 | BINARY=0 19 | NUM_THREADS=8 20 | X_MAX=10 21 | 22 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 23 | if [[ $? 
-eq 0 ]] 24 | then 25 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 26 | if [[ $? -eq 0 ]] 27 | then 28 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 29 | if [[ $? -eq 0 ]] 30 | then 31 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 32 | if [[ $? -eq 0 ]] 33 | then 34 | if [ "$1" = 'matlab' ]; then 35 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 36 | elif [ "$1" = 'octave' ]; then 37 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2 38 | else 39 | python eval/python/evaluate.py 40 | fi 41 | fi 42 | fi 43 | fi 44 | fi 45 | -------------------------------------------------------------------------------- /psdvec/competitors/glove/wiki.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 4 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 5 | 6 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 7 | VOCAB_FILE=vocab-wiki.txt 8 | COOCCURRENCE_FILE=cooccurrence-wiki.bin 9 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf-wiki.bin 10 | BUILDDIR=build 11 | SAVE_FILE=glove-wiki.vec 12 | VERBOSE=2 13 | MEMORY=16.0 14 | VOCAB_MIN_COUNT=100 15 | VECTOR_SIZE=500 16 | MAX_ITER=15 17 | WINDOW_SIZE=3 18 | BINARY=0 19 | NUM_THREADS=8 20 | X_MAX=10 21 | 22 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 23 | if [[ $? -eq 0 ]] 24 | then 25 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 26 | if [[ $? -eq 0 ]] 27 | then 28 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 29 | if [[ $? 
-eq 0 ]] 30 | then 31 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 32 | fi 33 | fi 34 | fi 35 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/hyperwords.zip -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/pmi2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # A) Window size 2 with " subsampling 3 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 4 | 5 | mkdir w2.sub 6 | python hyperwords/corpus2pairs.py --win 2 --sub 1e-5 ${CORPUS} > w2.sub/pairs 7 | scripts/pairs2counts.sh w2.sub/pairs > w2.sub/counts 8 | python hyperwords/counts2vocab.py w2.sub/counts 9 | # Calculate PMI matrices for each collection of pairs 10 | python hyperwords/counts2pmi.py --cds 0.75 w2.sub/counts w2.sub/pmi 11 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/pmi5-rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # B) Window size 5 with dynamic contexts and "dirty" subsampling 4 | 5 | CORPUS=/home/shaohua/D/corpus/rcv1clean.txt 6 | DIR=w5.rcv1.dyn.sub.del 7 | mkdir $DIR 8 | python hyperwords/corpus2pairs.py --win 5 --dyn --sub 1e-5 --del ${CORPUS} > $DIR/pairs 9 | scripts/pairs2counts.sh $DIR/pairs > $DIR/counts 10 | python hyperwords/counts2vocab.py $DIR/counts 11 | 12 | # Calculate PMI matrices for each collection of pairs 13 | python hyperwords/counts2pmi.py --cds 0.75 $DIR/counts $DIR/pmi 14 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/pmi5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # B) Window size 5 with dynamic contexts and "dirty" subsampling 4 | 5 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 6 | 7 | mkdir w5.dyn.sub.del 8 | python hyperwords/corpus2pairs.py --win 5 --dyn --sub 1e-5 --del ${CORPUS} > w5.dyn.sub.del/pairs 9 | scripts/pairs2counts.sh w5.dyn.sub.del/pairs > w5.dyn.sub.del/counts 10 | python hyperwords/counts2vocab.py w5.dyn.sub.del/counts 11 | 12 | # Calculate PMI matrices for each collection of pairs 13 | python hyperwords/counts2pmi.py --cds 0.75 w5.dyn.sub.del/counts w5.dyn.sub.del/pmi 14 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/svd-rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create embeddings with SVD 4 | DIR=w5.rcv1.dyn.sub.del 5 | python hyperwords/pmi2svd.py --dim 50 --neg 5 $DIR/pmi $DIR/svd 6 | cp $DIR/pmi.words.vocab $DIR/svd.words.vocab 7 | cp $DIR/pmi.contexts.vocab $DIR/svd.contexts.vocab 8 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/svd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create embeddings with SVD 4 | 5 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 6 | 7 | python hyperwords/pmi2svd.py --dim 500 
--neg 5 w2.sub/pmi w2.sub/svd
 8 | cp w2.sub/pmi.words.vocab w2.sub/svd.words.vocab
 9 | cp w2.sub/pmi.contexts.vocab w2.sub/svd.contexts.vocab
10 | python hyperwords/pmi2svd.py --dim 500 --neg 5 w5.dyn.sub.del/pmi w5.dyn.sub.del/svd
11 | cp w5.dyn.sub.del/pmi.words.vocab w5.dyn.sub.del/svd.words.vocab
12 | cp w5.dyn.sub.del/pmi.contexts.vocab w5.dyn.sub.del/svd.contexts.vocab
13 | 
--------------------------------------------------------------------------------
/psdvec/competitors/hyperwords/train-rcv1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | rm -rf w5.rcv1.dyn.sub.del
 3 | rm -rf w5.rcv1.dyn.sub.del
 4 | time ./pmi5-rcv1.sh
 5 | time ./svd-rcv1.sh
 6 | 
 7 | 
 8 | 
 9 | 
--------------------------------------------------------------------------------
/psdvec/competitors/hyperwords/train-wiki.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | rm -rf w2.sub w5.dyn.sub.del
 3 | rm -rf w2.sub w5.dyn.sub.del
 4 | time ./pmi2.sh
 5 | time ./pmi5.sh
 6 | time ./svd.sh
 7 | 
 8 | 
 9 | 
10 | 
--------------------------------------------------------------------------------
/psdvec/competitors/singular.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/singular.zip
--------------------------------------------------------------------------------
/psdvec/competitors/sparse.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/sparse.zip
--------------------------------------------------------------------------------
/psdvec/corpus2liblinear.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import getopt
 3 | import sys
 4 | from utils import *
 5 | import pdb
 6 | import time
 7 | import os
 8 | import json
 9 | import copy
10 | 
11 | def usage():
12 |     print """Usage:\n  corpus2liblinear.py -d doc_dir -o output_file -v vec_file [ -s sent_file ] label
13 |   corpus2liblinear.py -c config_file -n alg_name -v vec_file [ -s sent_file ]
14 | Options:
15 |   doc_dir: Directory of the documents to convert.
16 |   output_file: File to save the extracted vectors.
17 |   label: Label of documents. Must be 1/+1/-1.
18 |   config_file: File specifying multiple directories, labels and output files.
19 |   vec_file: File containing embedding vectors.
20 |   alg_name: Name of the embedding algorithm that generates vec_file.
21 |     Needed if only a partial file name is specified in config_file.
22 |   sent_file: File containing a list of sentiment words.
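  Example (directory and output names below are illustrative, not files shipped with the repo):
    corpus2liblinear.py -d pos_docs -o train-features.txt -v 25000-180000-500-BLK-8.0.vec +1
    corpus2liblinear.py -a -1 -d neg_docs -o train-features.txt -v 25000-180000-500-BLK-8.0.vec
  The second call appends (-a) the negative-class (-1) vectors to the same output file.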
23 | """ 24 | 25 | def parseConfigFile(configFilename): 26 | CONF = open(configFilename) 27 | dir_configs = [] 28 | for line in CONF: 29 | line = line.strip() 30 | dir_config = json.loads(line) 31 | dir_configs.append(dir_config) 32 | return dir_configs 33 | 34 | def getFileFeatures(filename, V, word2id, sentword2id, remove_stop=False): 35 | DOC = open(filename) 36 | doc = DOC.read() 37 | wordsInSentences, wc = extractSentenceWords(doc, 1) 38 | 39 | countedWC = 0 40 | outvocWC = 0 41 | stopwordWC = 0 42 | sentWC = 0 43 | 44 | wids = [] 45 | wid2freq = {} 46 | BOWFeatureNum = len(sentword2id) 47 | BOWFreqs = np.zeros(BOWFeatureNum) 48 | 49 | for sentence in wordsInSentences: 50 | for w in sentence: 51 | w = w.lower() 52 | if remove_stop and w in stopwordDict: 53 | stopwordWC += 1 54 | continue 55 | 56 | if w in word2id: 57 | wid = word2id[w] 58 | wids.append( wid ) 59 | 60 | if wid not in wid2freq: 61 | wid2freq[wid] = 1 62 | else: 63 | wid2freq[wid] += 1 64 | countedWC += 1 65 | else: 66 | outvocWC += 1 67 | 68 | if w in sentword2id: 69 | id = sentword2id[w] 70 | BOWFreqs[id] += 1 71 | sentWC += 1 72 | 73 | N0 = V.shape[1] 74 | avgv = np.zeros(N0) 75 | 76 | # avgv is the average embedding vector. Used in Tobias Schnabel et al. (2015) as the only features 77 | for wid, freq in wid2freq.items(): 78 | avgv += np.log( freq + 1 ) * V[wid] 79 | 80 | #for wid in wids: 81 | # avgv += V[wid] 82 | 83 | avgv = normalizeF(avgv) 84 | return avgv, BOWFreqs 85 | 86 | def processDir( outFilename, docDir, label, appendToOutput, V, word2ID, sentword2id ): 87 | print "Process '%s' %s" %( docDir, label ) 88 | 89 | if appendToOutput: 90 | OUT = open(outFilename, "a") 91 | else: 92 | OUT = open(outFilename, "w") 93 | 94 | filecount = 0 95 | 96 | for filename in os.listdir(docDir): 97 | OUT.write(label) 98 | fullFilename = docDir + "/" + filename 99 | avgv, BOWFreqs = getFileFeatures( fullFilename, V, word2ID, sentword2id ) 100 | for i,x in enumerate(avgv): 101 | OUT.write( " %d:%.4f" %( i+1, x ) ) 102 | # i == N0 - 1 here, dimensionality of the embedding vector 103 | i += 1 104 | if BOWFreqs.shape[0] > 0: 105 | for freq in BOWFreqs: 106 | if freq > 0: 107 | OUT.write( " %d:%d" %( i+1, freq ) ) 108 | i += 1 109 | 110 | OUT.write("\n") 111 | filecount += 1 112 | if filecount % 500 == 0: 113 | print "\r%d\r" %filecount, 114 | 115 | if appendToOutput: 116 | writeMode = "appended to" 117 | else: 118 | writeMode = "written into" 119 | print "%d files processed and %s '%s'" %( filecount, writeMode, outFilename ) 120 | 121 | OUT.close() 122 | 123 | def main(): 124 | vecFilename = "25000-180000-500-BLK-8.0.vec" 125 | algname = None 126 | topword_cutoff = -1 127 | topSentWord_cutoff = -1 128 | 129 | configFilename = "" 130 | label = None 131 | appendToOutput = False 132 | sentimentWordFile = None 133 | 134 | try: 135 | opts, args = getopt.getopt(sys.argv[1:],"d:o:v:c:n:s:1ah") 136 | if( len(args) == 1 ): 137 | if args[0] != "1" and args[0] != "+1": 138 | raise getopt.GetoptError( "Unknown free argument '%s'" %args[0] ) 139 | label = "+1" 140 | elif( len(args) > 1 ): 141 | raise getopt.GetoptError( "Too many free arguments '%s'" %args ) 142 | 143 | for opt, arg in opts: 144 | if opt == '-1': 145 | label = "-1" 146 | 147 | if opt == '-c': 148 | configFilename = arg 149 | if opt == '-s': 150 | sentimentWordFile = arg 151 | 152 | if opt == '-n': 153 | algname = arg 154 | if opt == '-d': 155 | docDir = arg 156 | if opt == '-d': 157 | docDir = arg 158 | if opt == '-o': 159 | outFilename = arg 160 | if opt == '-v': 161 | 
vecFilename = arg 162 | if opt == '-a': 163 | appendToOutput = True 164 | if opt == '-h': 165 | usage() 166 | sys.exit(0) 167 | 168 | except getopt.GetoptError, e: 169 | if len(e.args) == 1: 170 | print "Option error: %s" %e.args[0] 171 | usage() 172 | sys.exit(2) 173 | 174 | sentword2id = {} 175 | bowSize = 0 176 | if sentimentWordFile: 177 | SENT = open(sentimentWordFile) 178 | id = 0 179 | for line in SENT: 180 | word, freq = line.split("\t") 181 | sentword2id[word] = id 182 | id += 1 183 | # if topSentWord_cutoff == -1, this equality is never satisfied, so no cut off 184 | if id == topSentWord_cutoff: 185 | break 186 | bowSize = len(sentword2id) 187 | print "%d sentiment words loaded" %(bowSize) 188 | 189 | if configFilename: 190 | dir_configs = parseConfigFile(configFilename) 191 | for conf in dir_configs: 192 | if 'outFilenameTrunk' in conf: 193 | if not algname: 194 | print "-n alg_name is needed to generate full output file name" 195 | usage() 196 | sys.exit(2) 197 | 198 | if sentimentWordFile: 199 | conf['outFilename'] = "%s-%s-bow%d.txt" %( conf['outFilenameTrunk'], algname, bowSize ) 200 | else: 201 | conf['outFilename'] = "%s-%s.txt" %( conf['outFilenameTrunk'], algname ) 202 | 203 | elif not label: 204 | print "No config file nor label is specified" 205 | usage() 206 | sys.exit(0) 207 | else: 208 | dir_config = { 'dir': docDir, 'outFilename': outFilename, 209 | 'label': label, 'isAppend': appendToOutput } 210 | dir_configs = [ dir_config ] 211 | 212 | V, vocab, word2ID, skippedWords_whatever = load_embeddings( vecFilename, topword_cutoff ) 213 | 214 | for conf in dir_configs: 215 | processDir( conf['outFilename'], conf['docDir'], conf['label'], 216 | conf['appendToOutput'], V, word2ID, sentword2id ) 217 | 218 | if __name__ == '__main__': 219 | main() 220 | -------------------------------------------------------------------------------- /psdvec/eval-logs/bench.log: -------------------------------------------------------------------------------- 1 | shaohua@shaohua:/media/shaohua/Outerspace/corpus$ ./bench.sh 2 | PSD: 3 | Read sim testset ./testsets/ws/ws353_similarity.txt 4 | Read sim testset ./testsets/ws/ws353_relatedness.txt 5 | Read sim testset ./testsets/ws/bruni_men.txt 6 | Read sim testset ./testsets/ws/radinsky_mturk.txt 7 | Read sim testset ./testsets/ws/luong_rare.txt 8 | Read sim testset ./testsets/ws/simlex_999a.txt 9 | Read analogy testset ./testsets/analogy/google.txt 10 | Read analogy testset ./testsets/analogy/msr.txt 11 | 12 | Loading bigram file 'top2grams-wiki.txt': 13 | Totally 277025 words 14 | 277025 words seen, top 25000 & 0 extra to keep. 25000 kept 15 | Read bigrams: 16 | 25000 17 | Cut point 4010: 4/0.000% 18 | Cut point 2005: 23/0.000% 19 | Cut point 1002: 123/0.000% 20 | Cut point 501: 840/0.000% 21 | Cut point 251: 5383/0.001% 22 | Cut point 125: 28276/0.005% 23 | Cut point 63: 124146/0.020% 24 | Cut point 31: 493469/0.079% 25 | Cut point 16: 1779998/0.285% 26 | 493469 (0.079%) elements in Weight-1 cut off at 31.33 27 | 28 | 4 iterations of EM 29 | Begin EM of weighted factorization by bigram freqs 30 | 31 | EM Iter 1: 32 | Begin unweighted factorization 33 | 12450 positive eigenvalues, sum: 1016404.875 34 | Eigenvalues cut at the 503-th, 186.957 ~ 186.925 35 | All eigen sum: 1961478.500, Kept eigen sum: 178549.484 36 | nowe_factorize() elapsed: 1936.64 37 | L1 Weighted: Gi: 7769817.151, VV: 7350854.952, Gsym-VV: 10842411.309, G-VV: 9977720.915 38 | Precompute cosine matrix, will need 2.5GB RAM... Done. 
39 | ws353_similarity: 203 test pairs, 195 valid , 0.79632 40 | ws353_relatedness: 252 test pairs, 241 valid , 0.68286 41 | bruni_men: 3000 test pairs, 2639 valid , 0.77234 42 | radinsky_mturk: 287 test pairs, 279 valid , 0.68298 43 | luong_rare: 2034 test pairs, 396 valid , 0.54562 44 | simlex_999a: 999 test pairs, 945 valid , 0.39870 45 | 19500/12552/19544: Add 0.66651, Mul 0.68634 46 | google: 19544 analogies, 12586 valid . Add Score: 0.66590, Mul Score: 0.68584 47 | 8000/5030/8000: Add 0.54732, Mul 0.59761 48 | msr: 8000 analogies, 5030 valid . Add Score: 0.54732, Mul Score: 0.59761 49 | EM iter 1 elapsed: 1954.66 50 | 51 | EM Iter 2: 52 | Begin unweighted factorization 53 | 12338 positive eigenvalues, sum: 239171.609 54 | Eigenvalues cut at the 502-th, 191.346 ~ 190.984 55 | All eigen sum: 299793.500, Kept eigen sum: 180740.391 56 | nowe_factorize() elapsed: 1972.69 57 | L1 Weighted: Gi: 7878353.598, VV: 7242558.205, Gsym-VV: 10491300.183, G-VV: 9662005.122 58 | Precompute cosine matrix, will need 2.5GB RAM... Done. 59 | ws353_similarity: 203 test pairs, 195 valid , 0.79887 60 | ws353_relatedness: 252 test pairs, 241 valid , 0.68380 61 | bruni_men: 3000 test pairs, 2639 valid , 0.77014 62 | radinsky_mturk: 287 test pairs, 279 valid , 0.68362 63 | luong_rare: 2034 test pairs, 396 valid , 0.54914 64 | simlex_999a: 999 test pairs, 945 valid , 0.39755 65 | 19500/12552/19544: Add 0.67049, Mul 0.69025 66 | google: 19544 analogies, 12586 valid . Add Score: 0.66995, Mul Score: 0.68981 67 | 8000/5030/8000: Add 0.54274, Mul 0.59543 68 | msr: 8000 analogies, 5030 valid . Add Score: 0.54274, Mul Score: 0.59543 69 | EM iter 2 elapsed: 1990.84 70 | 71 | EM Iter 3: 72 | Begin unweighted factorization 73 | 12334 positive eigenvalues, sum: 240251.656 74 | Eigenvalues cut at the 501-th, 195.408 ~ 194.607 75 | All eigen sum: 299763.188, Kept eigen sum: 183383.266 76 | nowe_factorize() elapsed: 1932.79 77 | L1 Weighted: Gi: 7861670.004, VV: 7228707.970, Gsym-VV: 10272011.447, G-VV: 9453111.101 78 | Precompute cosine matrix, will need 2.5GB RAM... Done. 79 | ws353_similarity: 203 test pairs, 195 valid , 0.80074 80 | ws353_relatedness: 252 test pairs, 241 valid , 0.68146 81 | bruni_men: 3000 test pairs, 2639 valid , 0.76744 82 | radinsky_mturk: 287 test pairs, 279 valid , 0.68036 83 | luong_rare: 2034 test pairs, 396 valid , 0.55140 84 | simlex_999a: 999 test pairs, 945 valid , 0.39524 85 | 19500/12552/19544: Add 0.67081, Mul 0.69383 86 | google: 19544 analogies, 12586 valid . Add Score: 0.67027, Mul Score: 0.69355 87 | 8000/5030/8000: Add 0.53917, Mul 0.58847 88 | msr: 8000 analogies, 5030 valid . Add Score: 0.53917, Mul Score: 0.58847 89 | EM iter 3 elapsed: 1950.92 90 | 91 | EM Iter 4: 92 | Begin unweighted factorization 93 | 12339 positive eigenvalues, sum: 241826.562 94 | Eigenvalues cut at the 500-th, 199.922 ~ 198.961 95 | All eigen sum: 300269.469, Kept eigen sum: 186190.469 96 | nowe_factorize() elapsed: 2069.77 97 | L1 Weighted: Gi: 7879650.000, VV: 7250952.037, Gsym-VV: 10122656.270, G-VV: 9301310.810 98 | Precompute cosine matrix, will need 2.5GB RAM... Done. 99 | ws353_similarity: 203 test pairs, 195 valid , 0.80089 100 | ws353_relatedness: 252 test pairs, 241 valid , 0.67612 101 | bruni_men: 3000 test pairs, 2639 valid , 0.76526 102 | radinsky_mturk: 287 test pairs, 279 valid , 0.67757 103 | luong_rare: 2034 test pairs, 396 valid , 0.55358 104 | simlex_999a: 999 test pairs, 945 valid , 0.39342 105 | 19500/12552/19544: Add 0.67145, Mul 0.69487 106 | google: 19544 analogies, 12586 valid . 
Add Score: 0.67098, Mul Score: 0.69466 107 | 8000/5030/8000: Add 0.53320, Mul 0.58569 108 | msr: 8000 analogies, 5030 valid . Add Score: 0.53320, Mul Score: 0.58569 109 | EM iter 4 elapsed: 2087.96 110 | we_factorize_EM() elapsed: 7987.32 111 | 112 | Save matrix 'V' into 25000-500-EM.vec 113 | 114 | 115 | real 137m4.155s 116 | user 815m28.416s 117 | sys 97m3.316s 118 | Using Tikhonov regularization with coeff: 2.0 119 | Read sim testset ./testsets/ws/ws353_similarity.txt 120 | Read sim testset ./testsets/ws/ws353_relatedness.txt 121 | Read sim testset ./testsets/ws/bruni_men.txt 122 | Read sim testset ./testsets/ws/radinsky_mturk.txt 123 | Read sim testset ./testsets/ws/luong_rare.txt 124 | Read sim testset ./testsets/ws/simlex_999a.txt 125 | Read analogy testset ./testsets/analogy/google.txt 126 | Read analogy testset ./testsets/analogy/msr.txt 127 | 128 | Embeddings of all words in '25000-500-EM.vec' will be loaded as core 129 | Load embedding text file '25000-500-EM.vec' 130 | Will load embeddings of 25000 words 131 | 25000 25000 0 132 | 25000 embeddings read, 25000 kept 133 | 2 blocks of 25000 core words and 55000 noncore words will be loaded. Skip 0 words 134 | Loading bigram file 'top2grams-wiki.txt' into 2 blocks. Will skip 0 words 135 | Totally 277025 words 136 | 277025 words in file, top 80000 to read into vocab (25000 core, 55000 noncore), 0 skipped 137 | Read bigrams: 138 | 25000 (25000 core, 0 noncore) 139 | 25000 core words are all read. 140 | 80000 (25000 core, 55000 noncore) 141 | Cut point 35: 2419/0.000% 142 | Cut point 18: 48813/0.004% 143 | 2414 (0.000%) elements in Weight-1 cut off at 35.48 144 | 1328 (0.000%) elements in Weight-2 cut off at 35.48 145 | 146 | del G1, G21 147 | Begin finding embeddings of non-core words 148 | 55000 / 55000. Elapsed: 5851.77/10.41 149 | del F21, WGsum, VW 150 | Save matrix 'V' into 25000-80000-500-BLK-2.0.vec 151 | Test embeddings derived from block factorization 152 | 153 | Precompute cosine matrix, will need 25.6GB RAM... Done. 154 | ws353_similarity: 203 test pairs, 203 valid , 0.79212 155 | ws353_relatedness: 252 test pairs, 252 valid , 0.67948 156 | bruni_men: 3000 test pairs, 3000 valid , 0.76389 157 | radinsky_mturk: 287 test pairs, 285 valid , 0.67397 158 | luong_rare: 2034 test pairs, 835 valid , 0.48215 159 | simlex_999a: 999 test pairs, 995 valid , 0.39890 160 | 19500/18401/19544: Add 0.61893, Mul 0.63926 161 | google: 19544 analogies, 18443 valid . Add Score: 0.61866, Mul Score: 0.63921 162 | 8000/6172/8000: Add 0.49579, Mul 0.54277 163 | msr: 8000 analogies, 6172 valid . Add Score: 0.49579, Mul Score: 0.54277 164 | 165 | real 109m26.625s 166 | user 896m3.444s 167 | sys 278m1.332s 168 | Using Tikhonov regularization with coeff: 4.0 169 | Read sim testset ./testsets/ws/ws353_similarity.txt 170 | Read sim testset ./testsets/ws/ws353_relatedness.txt 171 | Read sim testset ./testsets/ws/bruni_men.txt 172 | Read sim testset ./testsets/ws/radinsky_mturk.txt 173 | Read sim testset ./testsets/ws/luong_rare.txt 174 | Read sim testset ./testsets/ws/simlex_999a.txt 175 | Read analogy testset ./testsets/analogy/google.txt 176 | Read analogy testset ./testsets/analogy/msr.txt 177 | 178 | Embeddings of top 25000 words in '25000-80000-500-BLK-2.0.vec' will be loaded as core 179 | Load embedding text file '25000-80000-500-BLK-2.0.vec' 180 | Will load embeddings of 80000 words 181 | 80000 80000 0 182 | 80000 embeddings read, 80000 kept 183 | 2 blocks of 25000 core words and 50000 noncore words will be loaded. 
Skip 55000 words 184 | Loading bigram file 'top2grams-wiki.txt' into 2 blocks. Will skip 55000 words 185 | Totally 277025 words 186 | 277025 words in file, top 75000 to read into vocab (25000 core, 50000 noncore), 55000 skipped 187 | Read bigrams: 188 | 25000 (25000 core, 0 noncore) 189 | 25000 core words are all read. 190 | 130000 (25000 core, 50000 noncore) 191 | 124073 (0.010%) elements in Weight-1 cut off at 6.63 192 | 111102 (0.009%) elements in Weight-2 cut off at 6.63 193 | 194 | del G1, G21 195 | Begin finding embeddings of non-core words 196 | 50000 / 50000. Elapsed: 5291.71/10.41 197 | del F21, WGsum, VW 198 | Save matrix 'V' into 25000-130000-500-BLK-4.0.vec 199 | Test embeddings derived from block factorization 200 | 201 | ws353_similarity: 203 test pairs, 203 valid , 0.79212 202 | ws353_relatedness: 252 test pairs, 252 valid , 0.67948 203 | bruni_men: 3000 test pairs, 3000 valid , 0.76389 204 | radinsky_mturk: 287 test pairs, 286 valid , 0.67566 205 | luong_rare: 2034 test pairs, 1096 valid , 0.47344 206 | simlex_999a: 999 test pairs, 996 valid , 0.39715 207 | 19500/19158/19544: Add 0.60680, Mul 0.62778 208 | google: 19544 analogies, 19202 valid . Add Score: 0.60650, Mul Score: 0.62770 209 | 8000/6578/8000: Add 0.48100, Mul 0.52676 210 | msr: 8000 analogies, 6578 valid . Add Score: 0.48100, Mul Score: 0.52676 211 | 212 | real 106m34.491s 213 | user 911m26.380s 214 | sys 262m26.192s 215 | Using Tikhonov regularization with coeff: 8.0 216 | Read sim testset ./testsets/ws/ws353_similarity.txt 217 | Read sim testset ./testsets/ws/ws353_relatedness.txt 218 | Read sim testset ./testsets/ws/bruni_men.txt 219 | Read sim testset ./testsets/ws/radinsky_mturk.txt 220 | Read sim testset ./testsets/ws/luong_rare.txt 221 | Read sim testset ./testsets/ws/simlex_999a.txt 222 | Read analogy testset ./testsets/analogy/google.txt 223 | Read analogy testset ./testsets/analogy/msr.txt 224 | 225 | Embeddings of top 25000 words in '25000-130000-500-BLK-4.0.vec' will be loaded as core 226 | Load embedding text file '25000-130000-500-BLK-4.0.vec' 227 | Will load embeddings of 130000 words 228 | 130000 130000 0 229 | 130000 embeddings read, 130000 kept 230 | 2 blocks of 25000 core words and 50000 noncore words will be loaded. Skip 105000 words 231 | Loading bigram file 'top2grams-wiki.txt' into 2 blocks. Will skip 105000 words 232 | Totally 277025 words 233 | 277025 words in file, top 75000 to read into vocab (25000 core, 50000 noncore), 105000 skipped 234 | Read bigrams: 235 | 25000 (25000 core, 0 noncore) 236 | 25000 core words are all read. 237 | 180000 (25000 core, 50000 noncore) 238 | 191415 (0.015%) elements in Weight-1 cut off at 4.12 239 | 183822 (0.015%) elements in Weight-2 cut off at 4.12 240 | 241 | del G1, G21 242 | Begin finding embeddings of non-core words 243 | 50000 / 50000. Elapsed: 5277.66/10.72 244 | del F21, WGsum, VW 245 | Save matrix 'V' into 25000-180000-500-BLK-8.0.vec 246 | Test embeddings derived from block factorization 247 | 248 | ws353_similarity: 203 test pairs, 203 valid , 0.79212 249 | ws353_relatedness: 252 test pairs, 252 valid , 0.67948 250 | bruni_men: 3000 test pairs, 3000 valid , 0.76389 251 | radinsky_mturk: 287 test pairs, 286 valid , 0.67566 252 | luong_rare: 2034 test pairs, 1260 valid , 0.45688 253 | simlex_999a: 999 test pairs, 998 valid , 0.39788 254 | 19500/19320/19544: Add 0.60041, Mul 0.62158 255 | google: 19544 analogies, 19364 valid . 
Add Score: 0.60013, Mul Score: 0.62151 256 | 8000/7054/8000: Add 0.46187, Mul 0.50383 257 | msr: 8000 analogies, 7054 valid . Add Score: 0.46187, Mul Score: 0.50383 258 | 259 | real 111m53.430s 260 | user 964m38.856s 261 | sys 271m9.988s 262 | word2vec: 263 | Starting training using file /home/shaohua/D/corpus/cleanwiki.txt 264 | Vocab size: 289625 265 | Words in train file: 2000719401 266 | Alpha: 0.000053 Progress: 99.89% Words/thread/sec: 65.89k 267 | real 249m8.382s 268 | user 2530m25.988s 269 | sys 4m4.680s 270 | glove: 271 | BUILDING VOCABULARY 272 | Processed 2042546400 tokens. 273 | Counted 8527820 unique words. 274 | Truncating vocabulary at min count 100. 275 | Using vocabulary of size 289624. 276 | 277 | COUNTING COOCCURRENCES 278 | window size: 3 279 | context: symmetric 280 | max product: 50983620 281 | overflow length: 152113425 282 | Reading vocab from file "vocab-wiki.txt"...loaded 289624 words. 283 | Building lookup table...table contains 428261749 elements. 284 | Processed 2042546400 tokens. 285 | Writing cooccurrences to disk..........8 files in total. 286 | Merging cooccurrence files: processed 652989173 lines. 287 | 288 | SHUFFLING COOCCURRENCES 289 | array size: 1020054732 290 | Shuffling by chunks: processed 652989173 lines. 291 | Wrote 1 temporary file(s). 292 | Merging temp files: processed 652989173 lines. 293 | 294 | TRAINING MODEL 295 | Read 652989173 lines. 296 | Initializing parameters...done. 297 | vector size: 500 298 | vocab size: 289624 299 | x_max: 10.000000 300 | alpha: 0.750000 301 | iter: 001, cost: 0.131547 302 | iter: 002, cost: 0.105234 303 | iter: 003, cost: 0.090557 304 | iter: 004, cost: 0.080886 305 | iter: 005, cost: 0.075276 306 | iter: 006, cost: 0.071524 307 | iter: 007, cost: 0.068758 308 | iter: 008, cost: 0.066837 309 | iter: 009, cost: 0.065186 310 | iter: 010, cost: 0.063919 311 | iter: 011, cost: 0.062813 312 | iter: 012, cost: 0.061871 313 | iter: 013, cost: 0.061140 314 | iter: 014, cost: 0.060471 315 | iter: 015, cost: 0.059661 316 | 317 | real 229m37.019s 318 | user 1503m26.736s 319 | sys 9m14.064s 320 | singular: 321 | Counting words in file 1/1 .......... 6058672 types 322 | Sliding window in file 1/1 .......... 
323 | Writing counts 324 | Loading counts 325 | Calculating SVD 326 | Clustering 327 | 328 | real 183m26.164s 329 | user 87m43.676s 330 | sys 29m32.096s 331 | 332 | PMI2: 333 | 15239.57user 3723.37system 4:37:35elapsed 113%CPU (0avgtext+0avgdata 9739144maxresident)k 334 | 99330104inputs+7182248outputs (88major+399600177minor)pagefaults 0swaps 335 | 59223.84user 82148.26system 36:36:14elapsed 107%CPU (0avgtext+0avgdata 31071304maxresident)k 336 | 485837760inputs+23424872outputs (109major+7607047364minor)pagefaults 0swaps 337 | 10169.24user 122.85system 2:53:15elapsed 99%CPU (0avgtext+0avgdata 24847900maxresident)k 338 | 11854888inputs+4056outputs (47major+6317122minor)pagefaults 0swaps 339 | -------------------------------------------------------------------------------- /psdvec/evaluate-toefl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import getopt 3 | import glob 4 | import sys 5 | import os.path 6 | from utils import * 7 | import numpy as np 8 | import copy 9 | import pdb 10 | import sys 11 | 12 | def loadToeflTestset(toeflTestsetFilename): 13 | TOEFL = open(toeflTestsetFilename) 14 | toeflTestset = [] 15 | for line in TOEFL: 16 | line = line.strip() 17 | words = line.split(" | ") 18 | toeflTestset.append(words) 19 | 20 | print "%d toefl test questions are loaded" %len(toeflTestset) 21 | return toeflTestset 22 | 23 | embeddingDir = "./embeddings/" 24 | modelFiles = [ "25000-180000-500-BLK-8.0.vec", "sparse.vec", "singular.vec", 25 | "25000-180000-500-BLK-0.0.vec", "word2vec.vec", "glove.vec" ] 26 | 27 | toeflTestsetFilename = "./testsets/ws/EN-TOEFL-80.txt" 28 | isHyperwordsEmbed = False 29 | hyperwordsType = None 30 | 31 | def usage(): 32 | print """Usage: evaluate-toefl.py [ -H -m model_file ] 33 | Options: 34 | -m: Path to the model file, a ".vec" or a Hyperwords embedding directory (with -H). 
35 | -H: Hyperwords embeddings type: PPMI or SVD.""" 36 | 37 | try: 38 | opts, args = getopt.getopt(sys.argv[1:],"m:H:") 39 | if len(args) != 0: 40 | raise getopt.GetoptError("") 41 | for opt, arg in opts: 42 | if opt == '-m': 43 | modelFiles = [ arg ] 44 | embeddingDir = "" 45 | if opt == '-H': 46 | isHyperwordsEmbed = True 47 | hyperwordsType = arg 48 | if opt == '-h': 49 | usage() 50 | sys.exit(0) 51 | 52 | except getopt.GetoptError: 53 | usage() 54 | sys.exit(2) 55 | 56 | vecNormalize = True 57 | loadwordCutPoint = 180000 58 | 59 | if loadwordCutPoint > 0: 60 | print "Load top %d words" %(loadwordCutPoint) 61 | 62 | toeflTestset = loadToeflTestset(toeflTestsetFilename) 63 | 64 | for m,modelFile in enumerate(modelFiles): 65 | modelFile = embeddingDir + modelFile 66 | if not isHyperwordsEmbed: 67 | V, vocab2, word2dim, skippedWords = load_embeddings( modelFile, loadwordCutPoint ) 68 | model = VecModel(V, vocab2, word2dim, vecNormalize=vecNormalize) 69 | else: 70 | model = load_embeddings_hyper(modelFile, hyperwordsType) 71 | 72 | questionNum = 0 73 | correctNum = 0 74 | for toeflQuestion in toeflTestset: 75 | questionWord = toeflQuestion[0] 76 | maxID = -1 77 | maxsim = -100 78 | for i,w in enumerate( toeflQuestion[1:] ): 79 | sim = model.similarity( questionWord, w ) 80 | if sim > maxsim: 81 | maxsim = sim 82 | maxID = i 83 | 84 | if maxID == 0: 85 | correctNum += 1 86 | else: 87 | question = copy.copy(toeflQuestion) 88 | question[maxID+1] = '(' + question[maxID+1] + ')' 89 | #if m == 0: 90 | # pdb.set_trace() 91 | print "%s: %s, %s, %s, %s" %tuple(question) 92 | questionNum += 1 93 | print "%s: %d/%d=%.1f%%" %( modelFile, correctNum, questionNum, correctNum*100.0/questionNum ) 94 | 95 | 96 | -------------------------------------------------------------------------------- /psdvec/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import getopt 4 | import glob 5 | import sys 6 | import os.path 7 | from utils import * 8 | import numpy as np 9 | #import pdb 10 | 11 | getAbsentWords = False 12 | modelFiles = [ "./GoogleNews-vectors-negative300.bin", "./29291-500-EM.vec", "./100000-500-BLKEM.vec", 13 | "./wordvecs/vec_520_forest", "./wiki-glove.vec2.txt" ] 14 | 15 | isModelsBinary = [ True, False, False, False, False ] 16 | modelID = -1 17 | 18 | # default is current directory 19 | simTestsetDir = "./testsets/ws/" 20 | # if set to [], run all testsets 21 | simTestsetNames = [ "ws353_similarity", "ws353_relatedness", "bruni_men", "radinsky_mturk", "luong_rare", 22 | "simlex_999a", "EN-RG-65" ] 23 | anaTestsetDir = "./testsets/analogy/" 24 | # if set to [], run all testsets 25 | anaTestsetNames = [ "google", "msr" ] 26 | 27 | unigramFilename = "top1grams-wiki.txt" 28 | vecNormalize = True 29 | loadwordCutPoint = -1 30 | testwordCutPoint = -1 31 | absentFilename = "" 32 | extraWordFilename = "" 33 | # default is in text format 34 | isModelBinary = False 35 | modelFile = None 36 | # precompute the cosine similarity matrix of all pairs of words 37 | # need W*W*4 bytes of RAM 38 | precomputeGramian = False 39 | skipPossessive = False 40 | evalVecExpectation = False 41 | doAnaTest = True 42 | isHyperwordsEmbed = False 43 | hyperEmbedType = None 44 | 45 | def usage(): 46 | print """Usage: evaluate.py [ -m model_file -i builtin_model_id -e extra_word_file -a absent_file -u unigram_file ... 
] 47 | Options: 48 | -m: Path to the model file, a ".vec" or ".bin" file for word2vec 49 | -b: Model file is in binary format (default: text) 50 | -d: A directory containing the test files 51 | -f: A list of test files in the specified directory 52 | -i: Builtin model ID for the benchmark. Range: 1 (word2vec), 53 | 2 (PSD 29291 words), 3 (block PSD 100000 words), 4 (forest), 5(glove) 54 | -P: Do not precompute cosine matrix. When the vocab is huge, 55 | it's necessary to disable computing this matrix. 56 | -u: Unigram file, for missing word check. 57 | Its presence will enable checking of what words are missing 58 | from the vocabulary and the model 59 | -c: Loaded Model vocabulary cut point. Load top x words from the model file 60 | -t: Vocabulary cut point for the test sets. All words in the test sets 61 | whose IDs are below it will be picked out 62 | -e: Extra word file. Words in this list will be loaded anyway 63 | -a: Absent file. Words below the cut point will be saved there 64 | -p: Skip possessive analogy pairs 65 | -E: Compute the expectation of all word embeddings 66 | -H: Hyperwords embeddings type: PPMI or SVD.""" 67 | 68 | try: 69 | opts, args = getopt.getopt(sys.argv[1:],"m:bd:f:i:Pu:c:t:e:a:shEAH:") 70 | if len(args) != 0: 71 | raise getopt.GetoptError("") 72 | for opt, arg in opts: 73 | if opt == '-m': 74 | modelID = -1 75 | modelFile = arg 76 | if opt == '-b': 77 | isModelBinary = bool(arg) 78 | if opt == '-d': 79 | testsetDir = arg 80 | if opt == '-f': 81 | testsetNames = filter( lambda x: x, arg.split(",") ) 82 | if opt == '-i': 83 | modelID = int(arg) 84 | if opt == '-P': 85 | precomputeGramian = False 86 | if opt == '-u': 87 | # unigram file is used to get a full list of words, 88 | # and also to sort the absent words by their frequencies 89 | unigramFilename = arg 90 | if opt == '-c': 91 | loadwordCutPoint = int(arg) 92 | if opt == '-t': 93 | testwordCutPoint = int(arg) 94 | if opt == '-e': 95 | extraWordFilename = arg 96 | if opt == '-a': 97 | getAbsentWords = True 98 | absentFilename = arg 99 | if opt == '-A': 100 | doAnaTest = False 101 | if opt == '-s': 102 | skipPossessive = True 103 | if opt == '-E': 104 | evalVecExpectation = True 105 | if opt == '-H': 106 | isHyperwordsEmbed = True 107 | hyperEmbedType = arg 108 | if opt == '-h': 109 | usage() 110 | sys.exit(0) 111 | 112 | if getAbsentWords and not unigramFilename: 113 | print "ERR: -u (Unigram file) has to be specified to get absent words" 114 | sys.exit(2) 115 | # "-" means output to console instead of a file 116 | if absentFilename == "-": 117 | absentFilename = "" 118 | 119 | except getopt.GetoptError: 120 | usage() 121 | sys.exit(2) 122 | 123 | if modelID > 0: 124 | modelFile = modelFiles[ modelID - 1 ] 125 | isModelBinary = isModelsBinary[ modelID - 1 ] 126 | 127 | if modelFile is None: 128 | usage() 129 | sys.exit(2) 130 | 131 | vocab = {} 132 | if unigramFilename: 133 | vocab = loadUnigramFile(unigramFilename) 134 | 135 | if extraWordFilename: 136 | extraWords = loadExtraWordFile(extraWordFilename) 137 | else: 138 | extraWords = {} 139 | 140 | if loadwordCutPoint > 0: 141 | print "Load top %d words" %(loadwordCutPoint) 142 | 143 | if isModelBinary: 144 | V, vocab2, word2dim, skippedWords = load_embeddings_bin( modelFile, loadwordCutPoint, extraWords ) 145 | elif not isHyperwordsEmbed: 146 | V, vocab2, word2dim, skippedWords = load_embeddings( modelFile, loadwordCutPoint, extraWords ) 147 | else: 148 | model = load_embeddings_hyper( modelFile, hyperEmbedType ) 149 | # the interface of hyperwords 
embedding class is incompatible with analogy tasks 150 | # only compatible with similarity tasks 151 | doAnaTest = False 152 | 153 | # if evalVecExpectation = True, compute the expectation of all embeddings 154 | if evalVecExpectation: 155 | if unigramFilename: 156 | expVec = np.zeros( len(V[0]) ) 157 | expVecNorm1 = 0 158 | expVecNorm2 = 0 159 | totalWords = 0 160 | expWords = 0 161 | accumProb = 0.0 162 | for w in vocab2: 163 | totalWords += 1 164 | if w in vocab and vocab[w][0] < 180000: 165 | expVec += V[ word2dim[w] ] * vocab[w][2] 166 | expVecNorm1 += norm1( V[ word2dim[w] ] ) * vocab[w][2] 167 | expVecNorm2 += normF( V[ word2dim[w] ] ) * vocab[w][2] 168 | expWords += 1 169 | accumProb += vocab[w][2] 170 | 171 | expVec /= accumProb 172 | expVecNorm1 /= accumProb 173 | expVecNorm2 /= accumProb 174 | print "totally %d words, %d words in E[v]. Accumu prob: %.2f%%." %( totalWords, expWords, accumProb * 100 ) 175 | print "|E[v]|: %.2f/%.2f, E[|v|]: %.2f/%.2f" %( norm1(expVec), normF(expVec), expVecNorm1, expVecNorm2 ) 176 | 177 | expMagnitude = norm1(expVec) 178 | accumProb = 0 179 | variance = 0 180 | for w in vocab2: 181 | if w in vocab and vocab[w][0] < 180000: 182 | variance += ( norm1( V[ word2dim[w] ] ) - expMagnitude )**2 * vocab[w][2] 183 | accumProb += vocab[w][2] 184 | 185 | variance /= accumProb 186 | 187 | # variance & standard deviation 188 | print "var(|v|): %.2f. SD: %.2f. CV: %.2f" %( variance, np.sqrt(variance), np.sqrt(variance) / expVecNorm1 ) 189 | 190 | sys.exit(0) 191 | 192 | else: 193 | print "ERR: -u (Unigram file) has to be specified to calc expectation of embeddings" 194 | sys.exit(2) 195 | 196 | if not isHyperwordsEmbed: 197 | model = VecModel(V, vocab2, word2dim, vecNormalize=vecNormalize) 198 | 199 | if precomputeGramian: 200 | isEnoughGramian, installedMemGB, requiredMemGB = isMemEnoughGramian( len(V) ) 201 | 202 | if isEnoughGramian <= 1: 203 | print "WARN: %.1fGB mem detected, %.1fGB mem required to precompute the cosine matrix" %( installedMemGB, requiredMemGB ) 204 | if isEnoughGramian == 0: 205 | print "Precomputation of the cosine matrix is disabled automatically." 
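# Concretely, with the 80000-word vocabulary used in eval-logs/bench.log above, the
# precomputed cosine matrix needs 80000 * 80000 * 4 bytes, i.e. about 25.6 GB, which is
# the "Precompute cosine matrix, will need 25.6GB RAM" figure reported in that log.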
206 | else: 207 | print "In case of memory shortage, you can specify -P to disable" 208 | 209 | if isEnoughGramian > 0: 210 | model.precomputeGramian() 211 | 212 | print 213 | 214 | simTestsets = loadTestsets(loadSimTestset, simTestsetDir, simTestsetNames) 215 | 216 | if skipPossessive: 217 | anaTestsets = loadTestsets( loadAnaTestset, anaTestsetDir, anaTestsetNames, { 'skipPossessive': 1 } ) 218 | else: 219 | anaTestsets = loadTestsets( loadAnaTestset, anaTestsetDir, anaTestsetNames ) 220 | 221 | print 222 | 223 | spearmanCoeff, absentModelID2Word1, absentVocabWords1, cutVocabWords1 = \ 224 | evaluate_sim( model, simTestsets, simTestsetNames, getAbsentWords, vocab, testwordCutPoint ) 225 | 226 | print 227 | 228 | if doAnaTest: 229 | anaScores, absentModelID2Word2, absentVocabWords2, cutVocabWords2 = \ 230 | evaluate_ana( model, anaTestsets, anaTestsetNames, getAbsentWords, vocab, testwordCutPoint ) 231 | 232 | if getAbsentWords: 233 | # merge the two sets of absent words 234 | absentModelID2Word1.update(absentModelID2Word2) 235 | absentModelWordIDs = sorted( absentModelID2Word1.keys() ) 236 | absentModelWords = [ absentModelID2Word1[i] for i in absentModelWordIDs ] 237 | 238 | absentVocabWords1.update(absentVocabWords2) 239 | absentVocabWords = sorted( absentVocabWords1.keys() ) 240 | 241 | cutVocabWords1.update(cutVocabWords2) 242 | # sort by ID in ascending, so that most frequent words (smaller IDs) first 243 | cutVocabWords = sorted( cutVocabWords1.keys(), key=lambda w: vocab[w][0] ) 244 | 245 | print "\n%d words absent from the model:" %len(absentModelWordIDs) 246 | print "ID:" 247 | print ",".join( map( lambda i: str(i), absentModelWordIDs) ) 248 | print "\nWords:" 249 | print ",".join(absentModelWords) 250 | 251 | if len(absentVocabWords) > 0: 252 | print "\n%d words absent from the vocab:" %len(absentVocabWords) 253 | print "\n".join(absentVocabWords) 254 | 255 | print 256 | 257 | if absentFilename and len(cutVocabWords): 258 | ABS = open(absentFilename, "w") 259 | for w in cutVocabWords: 260 | ABS.write( "%s\t%d\n" %( w, vocab[w][0] ) ) 261 | ABS.close() 262 | print "%d words saved to %s" %( len(cutVocabWords), absentFilename ) 263 | -------------------------------------------------------------------------------- /psdvec/extractwiki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # code based on http://textminingonline.com/training-word2vec-model-on-english-wikipedia-by-gensim 4 | 5 | import logging 6 | import os.path 7 | import sys 8 | 9 | from gensim.corpora import WikiCorpus 10 | 11 | if __name__ == '__main__': 12 | program = os.path.basename(sys.argv[0]) 13 | logger = logging.getLogger(program) 14 | 15 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 16 | logging.root.setLevel(level=logging.INFO) 17 | logger.info("running %s" % ' '.join(sys.argv)) 18 | 19 | # check and process input arguments 20 | if len(sys.argv) < 3: 21 | print "Usage: extractwiki.py infile_name outfile_name" 22 | sys.exit(1) 23 | 24 | infilename, outfilename = sys.argv[1:3] 25 | 26 | if os.path.isfile(outfilename): 27 | logger.error("Output file %s exists. Change the file name and try again." 
%outfilename) 28 | sys.exit(1) 29 | 30 | i = 0 31 | output = open(outfilename, 'w') 32 | wiki = WikiCorpus(infilename, lemmatize=False, dictionary={}) 33 | for text in wiki.get_texts(): 34 | output.write( " ".join(text) + "\n") 35 | i = i + 1 36 | if (i % 10000 == 0): 37 | logger.info("Saved " + str(i) + " articles") 38 | 39 | output.close() 40 | logger.info("Finished Saved " + str(i) + " articles") 41 | -------------------------------------------------------------------------------- /psdvec/fact-rcv1.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set N0=50 3 | rem Old way of exact factorization: 4 | rem python factorize.py -n 50 -t 28000 -e absentwords.txt top2grams-rcv1.txt 5 | rem New online fashion: 6 | rem 1. Obtain 23000 core embeddings, into 25000-50-EM.vec: 7 | rem python factorize.py -w 23000 -n %N0% top2grams-rcv1.txt 8 | rem 2. Obtain 23409 noncore embeddings, totaling 46409 (23000 core + 23409 noncore), into 25000-46409-50-BLK-2.0.vec: 9 | python factorize.py -v 23000-%N0%-EM.vec -n %N0% -t2 top2grams-rcv1.txt 10 | -------------------------------------------------------------------------------- /psdvec/fact-rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N0=50 3 | time python factorize.py -w 15000 -n $N0 -E5 top2grams-rcv1.txt 4 | time python factorize.py -v 15000-$N0-EM.vec -n $N0 -t2 top2grams-rcv1.txt 5 | -------------------------------------------------------------------------------- /psdvec/fact-wiki.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set N0=500 3 | rem Old way of exact factorization: 4 | rem python factorize.py -n 500 -t 28000 -e absentwords.txt top2grams-wiki.txt 5 | rem New online fashion: 6 | rem 1. Obtain 25000 core embeddings, into 25000-500-EM.vec: 7 | python factorize.py -w 25000 -n %N0% top2grams-wiki.txt 8 | rem 2. Obtain 55000 noncore embeddings, totaling 80000 (25000 core + 55000 noncore), into 25000-80000-500-BLK-2.0.vec: 9 | python factorize.py -v 25000-%N0%-EM.vec -n %N0% -o 55000 -t2 top2grams-wiki.txt 10 | rem 3. Incrementally learn other 50000 noncore embeddings (based on 25000 core), into 25000-130000-500-BLK-4.0.vec: 11 | python factorize.py -v 25000-80000-%N0%-BLK-2.0.vec -n %N0% -b 25000 -o 50000 -t4 top2grams-wiki.txt 12 | rem 4. Repeat 3 again to get more embeddings of rarer words. 13 | python factorize.py -v 25000-130000-%N0%-BLK-4.0.vec -n %N0% -b 25000 -o 50000 -t8 top2grams-wiki.txt 14 | -------------------------------------------------------------------------------- /psdvec/fact-wiki.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N0=500 3 | # Old way of exact factorization: 4 | # python factorize.py -n 500 -t 28000 -e absentwords.txt top2grams-wiki.txt 5 | # New online fashion: 6 | # 1. Obtain 25000 core embeddings, into 25000-500-EM.vec: 7 | time python factorize.py -w 25000 -n $N0 top2grams-wiki.txt 8 | # 2. Obtain 55000 noncore embeddings, totaling 80000 (25000 core + 55000 noncore), into 25000-80000-500-BLK-2.0.vec: 9 | time python factorize.py -v 25000-$N0-EM.vec -n $N0 -o 55000 -t2 top2grams-wiki.txt 10 | # 3. Incrementally learn other 50000 noncore embeddings (based on 25000 core), into 25000-130000-500-BLK-4.0.vec: 11 | time python factorize.py -v 25000-80000-$N0-BLK-2.0.vec -n $N0 -b 25000 -o 50000 -t4 top2grams-wiki.txt 12 | # 4. Repeat 3 again to get more embeddings of rarer words. 
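# Note: steps 3 and 4 keep the same 25000 core embeddings (-b 25000) and each adds
# 50000 noncore embeddings (-o 50000); the -t value doubles at each stage (2 -> 4 -> 8)
# and matches both the "Using Tikhonov regularization with coeff" lines in
# eval-logs/bench.log and the BLK-2.0 / BLK-4.0 / BLK-8.0 suffixes of the saved .vec files.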
13 | time python factorize.py -v 25000-130000-$N0-BLK-4.0.vec -n $N0 -b 25000 -o 50000 -t8 top2grams-wiki.txt 14 | -------------------------------------------------------------------------------- /psdvec/genSentDict.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set N=200 3 | python topwordsInList.py -c sent-gen-config.txt -l d:\Dropbox\sentiment\positive-words.txt,d:\Dropbox\sentiment\negative-words.txt -n %N% -o topSentWords%N%.txt 4 | -------------------------------------------------------------------------------- /psdvec/gencatdata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.linalg 3 | from scipy.special import * 4 | import getopt 5 | import sys 6 | from utils import * 7 | import pdb 8 | import time 9 | 10 | embed_algs = { "PSDVec": "d:/corpus/embeddings/25000-180000-500-BLK-8.0.vec", 11 | "word2vec": "d:/corpus/embeddings/word2vec2.vec", 12 | "CCA": "d:/corpus/embeddings/182800-500-CCA.vec" 13 | } 14 | # "50000-180000-500-BLK-8.0.vec" } 15 | testsetDir = "./concept categorization" 16 | testsetNames = [ "ap", "battig", "esslli" ] 17 | maxID = -1 18 | 19 | for algname, vecFilename in embed_algs.iteritems(): 20 | print "Alg %s" %algname 21 | if vecFilename[-4:] == ".bin": 22 | V, vocab, word2ID, skippedWords_whatever = load_embeddings_bin(vecFilename, 400000) 23 | else: 24 | V, vocab, word2ID, skippedWords_whatever = load_embeddings(vecFilename, 400000) 25 | 26 | for testsetName in testsetNames: 27 | truthFilename = testsetDir + "/" + testsetName + ".txt" 28 | vecFilename = testsetDir + "/" + testsetName + "-" + algname + ".vec" 29 | labelFilename = testsetDir + "/" + testsetName + "-" + algname + ".label" 30 | 31 | FVEC = open(vecFilename, "w") 32 | ids = [] 33 | 34 | FLABEL = open(labelFilename, "w") 35 | 36 | with open(truthFilename) as FT: 37 | # skip header 38 | FT.readline() 39 | for line in FT: 40 | line = line.strip() 41 | fields = line.split("\t") 42 | word, cat = fields[:2] 43 | 44 | if word not in word2ID: 45 | print "%s not in vocab" %word 46 | continue 47 | else: 48 | id = word2ID[word] 49 | #print "%s: %d" %(word, id) 50 | if id > maxID: 51 | maxID = id 52 | ids.append(id) 53 | 54 | FLABEL.write("%s\n" %cat) 55 | 56 | FVEC.write( "%d %d\n" %( len(ids), V.shape[1] ) ) 57 | for id in ids: 58 | v = V[id] 59 | FVEC.write("%.3f" %v[0]) 60 | for d in v[1:]: 61 | FVEC.write(" %.3f" %d) 62 | FVEC.write("\n") 63 | 64 | FLABEL.close() 65 | FVEC.close() 66 | -------------------------------------------------------------------------------- /psdvec/gram-rcv1.bat: -------------------------------------------------------------------------------- 1 | set CORPUS=rcv1clean.txt 2 | set SUFFIX=rcv1 3 | perl gramcount.pl -i %CORPUS% -m1 --f1 top1grams-%SUFFIX%.txt -c --nofilter --thres1 50,0 4 | perl gramcount.pl -i %CORPUS% -m2 --f1 top1grams-%SUFFIX%.txt --nofilter -c --f2 top2grams-%SUFFIX%.txt -w 3 --thres1 50,0 5 | -------------------------------------------------------------------------------- /psdvec/gram.bat: -------------------------------------------------------------------------------- 1 | set CORPUS=reuters-train-5770.orig.txt 2 | set SUFFIX=reuters 3 | perl gramcount.pl -i %CORPUS% -m1 --f1 top1grams-%SUFFIX%.txt -c --nofilter --thres1 5,0 4 | perl gramcount.pl -i %CORPUS% -m2 --f1 top1grams-%SUFFIX%.txt --nofilter -c --f2 top2grams-%SUFFIX%.txt -w 3 --thres1 5,0 5 | -------------------------------------------------------------------------------- 
/psdvec/gramcount.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/gramcount.pl -------------------------------------------------------------------------------- /psdvec/papers/emnlp2015.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/papers/emnlp2015.pdf -------------------------------------------------------------------------------- /psdvec/patch to gensim.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/patch to gensim.py -------------------------------------------------------------------------------- /psdvec/perlxs.h: -------------------------------------------------------------------------------- 1 | // http://ftp.ledas.ac.uk/software/lheasoft/lheasoft6.3.1/source/heacore/pil/perl/Av_CharPtrPtr.c 2 | 3 | /* Used by the INPUT typemap for char**. 4 | * Will convert a Perl AV* (containing strings) to a C char**. 5 | */ 6 | char** XS_unpack_charPtrPtr( SV* rv ) 7 | { 8 | AV *av; 9 | SV **ssv; 10 | char **s; 11 | int avlen; 12 | int x; 13 | 14 | if( SvROK( rv ) && (SvTYPE(SvRV(rv)) == SVt_PVAV) ) 15 | av = (AV*)SvRV(rv); 16 | else { 17 | warn("XS_unpack_charPtrPtr: rv was not an AV ref"); 18 | return( (char**)NULL ); 19 | } 20 | 21 | /* is it empty? */ 22 | avlen = av_len(av); 23 | if( avlen < 0 ){ 24 | warn("XS_unpack_charPtrPtr: array was empty"); 25 | return( (char**)NULL ); 26 | } 27 | 28 | /* av_len+2 == number of strings, plus 1 for an end-of-array sentinel. 29 | */ 30 | s = (char **)safemalloc( sizeof(char*) * (avlen + 2) ); 31 | if( s == NULL ){ 32 | warn("XS_unpack_charPtrPtr: unable to malloc char**"); 33 | return( (char**)NULL ); 34 | } 35 | for( x = 0; x <= avlen; ++x ){ 36 | ssv = av_fetch( av, x, 0 ); 37 | if( ssv != NULL ){ 38 | if( SvPOK( *ssv ) ){ 39 | s[x] = (char *)safemalloc( SvCUR(*ssv) + 1 ); 40 | if( s[x] == NULL ) 41 | warn("XS_unpack_charPtrPtr: unable to malloc char*"); 42 | else 43 | strcpy( s[x], SvPV( *ssv, PL_na ) ); 44 | } 45 | else 46 | warn("XS_unpack_charPtrPtr: array elem %d was not a string.", x ); 47 | } 48 | else 49 | s[x] = (char*)NULL; 50 | } 51 | s[x] = (char*)NULL; /* sentinel */ 52 | return( s ); 53 | } 54 | 55 | /* Used by the OUTPUT typemap for char**. 56 | * Will convert a C char** to a Perl AV*. 
57 | */ 58 | void XS_pack_charPtrPtr( SV * st, char ** s ) 59 | { 60 | AV *av = newAV(); 61 | SV *sv; 62 | char **c; 63 | 64 | for( c = s; *c != NULL; ++c ){ 65 | sv = newSVpv( *c, 0 ); 66 | av_push( av, sv ); 67 | } 68 | free ( s ); 69 | sv = newSVrv( st, NULL ); /* upgrade stack SV to an RV */ 70 | SvREFCNT_dec( sv ); /* discard */ 71 | SvRV( st ) = (SV*)av; /* make stack RV point at our AV */ 72 | } 73 | 74 | 75 | /* cleanup the temporary char** from XS_unpack_charPtrPtr */ 76 | void XS_release_charPtrPtr(char** s) 77 | { 78 | char **c; 79 | for( c = s; *c != NULL; ++c ) 80 | Safefree( *c ); 81 | Safefree( s ); 82 | } 83 | -------------------------------------------------------------------------------- /psdvec/removeDoubleNewline.pl: -------------------------------------------------------------------------------- 1 | $wc = 0; 2 | $doubleNewlineCount = 0; 3 | while(<>){ 4 | $wc++; 5 | if( /^\r?\n$/ ){ 6 | if( $lastIsNewline ){ 7 | $lastIsNewline = 0; 8 | $doubleNewlineCount++; 9 | next; 10 | } 11 | else{ 12 | print; 13 | $lastIsNewline = 1; 14 | } 15 | } 16 | else{ 17 | print; 18 | } 19 | if( $wc % 1000 == 0 ){ 20 | print STDERR "\r$wc $doubleNewlineCount\r"; 21 | } 22 | } 23 | print STDERR "$wc $doubleNewlineCount\n"; 24 | -------------------------------------------------------------------------------- /psdvec/sent-bench.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | rem cd d:\corpus 3 | rem python corpus2liblinear.py -d aclImdb\test\pos -o sent-test.txt 1 4 | rem python corpus2liblinear.py -d aclImdb\test\neg -o sent-test.txt -1 -a 5 | rem python corpus2liblinear.py -d aclImdb\train\pos -o sent-train.txt 1 6 | rem python corpus2liblinear.py -d aclImdb\train\neg -o sent-train.txt -1 -a 7 | pushd d:\liblinear-2.1\windows 8 | train -s7 -v10 \corpus\sent-train-PSD-reg.txt PSD-reg.model 9 | predict \corpus\sent-test-PSD-reg.txt PSD-reg.model pred-output.txt 10 | popd 11 | 12 | -------------------------------------------------------------------------------- /psdvec/sent-gen.conf: -------------------------------------------------------------------------------- 1 | { "outFilenameTrunk": "sent-train", "docDir": "D:/corpus/aclImdb/train/neg", "label": "-1", "appendToOutput": false } 2 | { "outFilenameTrunk": "sent-train", "docDir": "D:/corpus/aclImdb/train/pos", "label": "+1", "appendToOutput": true } 3 | { "outFilenameTrunk": "sent-test", "docDir": "D:/corpus/aclImdb/test/neg", "label": "-1", "appendToOutput": false } 4 | { "outFilenameTrunk": "sent-test", "docDir": "D:/corpus/aclImdb/test/pos", "label": "+1", "appendToOutput": true } 5 | -------------------------------------------------------------------------------- /psdvec/sentbench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | 5 | alg2vec = { "PSD-reg": "25000-180000-500-BLK-8.0.vec", 6 | #"PSD-unreg": "25000-180000-500-BLK-0.0.vec", 7 | #"word2vec": "word2vec2.vec", 8 | #"CCA": "182800-500-CCA.vec", 9 | "sparse": "120000-sparse.vec" 10 | } 11 | 12 | vecDir = "d:/corpus/embeddings" 13 | liblinearDir = "D:/liblinear-2.1/windows" 14 | trainExePath = liblinearDir + "/" + "train.exe" 15 | predictExePath = liblinearDir + "/" + "predict.exe" 16 | dataDir = "d:/corpus" 17 | trainFiletrunk = "sent-train" 18 | testFiletrunk = "sent-test" 19 | dataGenScript = dataDir + "/corpus2liblinear.py" 20 | dataGenConfig = dataDir + "/sent-gen.conf" 21 | sentimentWordFile = dataDir + "/topSentWords500.txt" 22 | 23 
| # code below is just to count the words in sentimentWordFile. 24 | # the count is used in file names 25 | sentword2id = {} 26 | bowSize = 0 27 | if sentimentWordFile: 28 | SENT = open(sentimentWordFile) 29 | id = 0 30 | for line in SENT: 31 | word, freq = line.split("\t") 32 | sentword2id[word] = id 33 | id += 1 34 | bowSize = len(sentword2id) 35 | SENT.close() 36 | 37 | # L1 or L2 regularization for the logistic regression solver 38 | # Experiments show this option has little impact on the results 39 | solverReg = 2 40 | if solverReg == 1: 41 | solverType = "-s6" 42 | elif solverReg == 2: 43 | solverType = "-s7" 44 | 45 | for algName, vecFilename in alg2vec.items(): 46 | print "%s:" %algName 47 | 48 | vecFullfilename = vecDir + "/" + vecFilename 49 | 50 | if sentimentWordFile: 51 | trainFilename = "%s/%s-%s-bow%d.txt" %( dataDir, trainFiletrunk, algName, bowSize ) 52 | testFilename = "%s/%s-%s-bow%d.txt" %( dataDir, testFiletrunk, algName, bowSize ) 53 | else: 54 | trainFilename = "%s/%s-%s.txt" %( dataDir, trainFiletrunk, algName ) 55 | testFilename = "%s/%s-%s.txt" %( dataDir, testFiletrunk, algName ) 56 | 57 | if not ( os.path.isfile(trainFilename) and os.path.isfile(testFilename) ): 58 | options = [ "python", dataGenScript, "-c", dataGenConfig, "-n", algName, \ 59 | "-v", vecFullfilename ] 60 | if sentimentWordFile: 61 | options.append("-s") 62 | options.append(sentimentWordFile) 63 | 64 | subprocess.call(options) 65 | 66 | if sentimentWordFile: 67 | modelFilename = "%s-bow%d.model" %( algName, bowSize ) 68 | outputFilename = "%s-bow%d.output" %( algName, bowSize ) 69 | else: 70 | modelFilename = "%s.model" %algName 71 | outputFilename = "%s.output" %algName 72 | 73 | print "Training using %s" %trainFilename 74 | subprocess.call( [ trainExePath, solverType, "-v10", trainFilename, modelFilename ] ) 75 | print "Testing using %s" %testFilename 76 | subprocess.call( [ predictExePath, testFilename, modelFilename, outputFilename ] ) 77 | print 78 | -------------------------------------------------------------------------------- /psdvec/tab2list.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | CAT = 1 4 | WORDS = 2 5 | 6 | FTAB = open(sys.argv[1]) 7 | FLIST = open(sys.argv[2], "w") 8 | FLIST.write("WORD\tTRUECLASS\n") 9 | 10 | state = CAT 11 | catnum = 0 12 | wordnum = 0 13 | 14 | for line in FTAB: 15 | line = line.strip() 16 | 17 | if not line and state == CAT: 18 | continue 19 | 20 | if state == CAT: 21 | cat = line.replace(" ", "-") 22 | state = WORDS 23 | catnum += 1 24 | continue 25 | if state == WORDS: 26 | if line: 27 | words = line.split(", ") 28 | for word in words: 29 | word = word.replace(",", "") 30 | FLIST.write( "%s\t%s\n" %(word, cat) ) 31 | wordnum += 1 32 | continue 33 | else: 34 | state = CAT 35 | continue 36 | 37 | print "%d words in %d categories written into %s" %(wordnum, catnum, sys.argv[2]) 38 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/EN-RG-65.txt: -------------------------------------------------------------------------------- 1 | gem jewel 3.94 2 | midday noon 3.94 3 | automobile car 3.92 4 | cemetery graveyard 3.88 5 | cushion pillow 3.84 6 | boy lad 3.82 7 | cock rooster 3.68 8 | implement tool 3.66 9 | forest woodland 3.65 10 | coast shore 3.60 11 | autograph signature 3.59 12 | journey voyage 3.58 13 | serf slave 3.46 14 | grin smile 3.46 15 | glass tumbler 3.45 16 | cord string 3.41 17 | hill mound 3.29 18 | magician wizard 3.21 19 | furnace 
stove 3.11 20 | asylum madhouse 3.04 21 | brother monk 2.74 22 | food fruit 2.69 23 | bird cock 2.63 24 | bird crane 2.63 25 | oracle sage 2.61 26 | sage wizard 2.46 27 | brother lad 2.41 28 | crane implement 2.37 29 | magician oracle 1.82 30 | glass jewel 1.78 31 | cemetery mound 1.69 32 | car journey 1.55 33 | hill woodland 1.48 34 | crane rooster 1.41 35 | furnace implement 1.37 36 | coast hill 1.26 37 | bird woodland 1.24 38 | shore voyage 1.22 39 | cemetery woodland 1.18 40 | food rooster 1.09 41 | forest graveyard 1.00 42 | lad wizard 0.99 43 | mound shore 0.97 44 | automobile cushion 0.97 45 | boy sage 0.96 46 | monk oracle 0.91 47 | shore woodland 0.90 48 | grin lad 0.88 49 | coast forest 0.85 50 | asylum cemetery 0.79 51 | monk slave 0.57 52 | cushion jewel 0.45 53 | boy rooster 0.44 54 | glass magician 0.44 55 | graveyard madhouse 0.44 56 | asylum monk 0.39 57 | asylum fruit 0.19 58 | grin implement 0.18 59 | mound stove 0.14 60 | automobile wizard 0.11 61 | autograph shore 0.06 62 | fruit furnace 0.05 63 | noon string 0.04 64 | rooster voyage 0.04 65 | cord smile 0.02 -------------------------------------------------------------------------------- /psdvec/testsets/ws/EN-TOEFL-80.txt: -------------------------------------------------------------------------------- 1 | enormously | tremendously | appropriately | uniquely | decidedly 2 | provisions | stipulations | jurisdictions | interrelations | interpretations 3 | haphazardly | randomly | linearly | dangerously | densely 4 | prominent | conspicuous | ancient | mysterious | battered 5 | zenith | pinnacle | completion | decline | outset 6 | flawed | imperfect | tiny | crude | lustrous 7 | urgently | desperately | conceivably | typically | tentatively 8 | consumed | eaten | bred | caught | supplied 9 | advent | coming | stability | financing | arrest 10 | concisely | succinctly | freely | positively | powerfully 11 | salutes | greetings | privileges | ceremonies | information 12 | solitary | alone | fearless | alert | restless 13 | hasten | accelerate | accompany | determine | permit 14 | perseverance | endurance | skill | generosity | disturbance 15 | fanciful | imaginative | logical | familiar | apparent 16 | showed | demonstrated | published | repeated | postponed 17 | constantly | continually | accidentally | rapidly | instantly 18 | issues | subjects | training | benefits | salaries 19 | furnish | supply | protect | advise | impress 20 | costly | expensive | beautiful | popular | complicated 21 | recognized | acknowledged | welcomed | depicted | successful 22 | spot | location | climate | latitude | sea 23 | make | earn | print | trade | borrow 24 | often | frequently | definitely | chemically | hardly 25 | easygoing | relaxed | boring | frontier | farming 26 | debate | argument | competition | war | election 27 | narrow | thin | poisonous | freezing | clear 28 | arranged | planned | discarded | studied | explained 29 | infinite | limitless | structural | unusual | relative 30 | showy | striking | prickly | incidental | entertaining 31 | levied | imposed | believed | correlated | requested 32 | deftly | skillfully | occasionally | prudently | humorously 33 | distribute | circulate | commercialize | acknowledge | research 34 | discrepancies | differences | weights | wavelengths | deposits 35 | prolific | productive | capable | serious | promising 36 | unmatched | unequaled | emulated | alienated | unrecognized 37 | peculiarly | uniquely | suspiciously | patriotically | partly 38 | hue | color | contrast | scent | glare 39 | 
hind | rear | curved | muscular | hairy 40 | highlight | accentuate | alter | restore | imitate 41 | hastily | hurriedly | habitually | shrewdly | chronologically 42 | temperate | mild | short | windy | cold 43 | grin | smile | exercise | rest | joke 44 | verbally | orally | verbosely | overtly | fittingly 45 | physician | doctor | chemist | nurse | pharmacist 46 | essentially | basically | eagerly | ordinarily | possibly 47 | keen | sharp | useful | simple | famous 48 | situated | positioned | rotating | emptying | isolated 49 | principal | major | exceptional | numerous | most 50 | slowly | gradually | effectively | continuously | rarely 51 | built | constructed | proposed | organized | financed 52 | tasks | jobs | customers | shops | materials 53 | unlikely | improbable | disagreeable | different | unpopular 54 | halfheartedly | apathetically | unconventionally | bipartisanly | customarily 55 | annals | chronicles | homes | trails | songs 56 | wildly | furiously | mysteriously | abruptly | distinctively 57 | hailed | acclaimed | judged | remembered | addressed 58 | command | mastery | observation | love | awareness 59 | concocted | devised | supervised | requested | cleaned 60 | prospective | potential | prominent | particular | prudent 61 | generally | broadly | controversially | accurately | descriptively 62 | sustained | prolonged | analyzed | refined | lowered 63 | perilous | dangerous | offensive | binding | exciting 64 | tranquillity | peacefulness | weariness | harshness | happiness 65 | dissipate | disperse | isolate | photograph | disguise 66 | primarily | chiefly | consistently | occasionally | cautiously 67 | colloquial | conversational | incorrect | recorded | misunderstood 68 | resolved | settled | examined | forgotten | publicized 69 | feasible | possible | evident | permitted | equitable 70 | expeditiously | rapidly | frequently | repeatedly | actually 71 | percentage | proportion | sample | profit | volume 72 | terminated | ended | posed | evaluated | postponed 73 | uniform | alike | sharp | hard | complex 74 | figure | solve | list | express | divide 75 | sufficient | enough | valuable | physiological | recent 76 | fashion | manner | fathom | craze | ration 77 | marketed | sold | sweetened | diluted | frozen 78 | bigger | larger | steadier | closer | better 79 | roots | origins | function | rituals | cure 80 | normally | ordinarily | periodically | haltingly | permanently 81 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/radinsky_mturk.txt: -------------------------------------------------------------------------------- 1 | episcopal russia 2.75 2 | water shortage 2.714285714 3 | horse wedding 2.266666667 4 | plays losses 3.2 5 | classics advertiser 2.25 6 | latin credit 2.0625 7 | ship ballots 2.3125 8 | mistake error 4.352941176 9 | disease plague 4.117647059 10 | sake shade 2.529411765 11 | saints observatory 1.9375 12 | treaty wheat 1.8125 13 | texas death 1.533333333 14 | republicans challenge 2.3125 15 | body peaceful 2.058823529 16 | admiralty intensity 2.647058824 17 | body improving 2.117647059 18 | heroin marijuana 3.375 19 | scottish commuters 2.6875 20 | apollo myth 2.6 21 | film cautious 2.125 22 | exhibition art 4.117647059 23 | chocolate candy 3.764705882 24 | republic candidate 2.8125 25 | gospel church 4.0625 26 | momentum desirable 2.4 27 | singapore sanctions 2.117647059 28 | english french 3.823529412 29 | exile church 2.941176471 30 | navy coordinator 2.235294118 31 | adventure flood 2.4375 32 
| radar plane 3.235294118 33 | pacific ocean 4.266666667 34 | scotch liquor 4.571428571 35 | kennedy gun 3 36 | garfield cat 2.866666667 37 | scale budget 3.5 38 | rhythm blues 3.071428571 39 | rich privileges 3.2 40 | navy withdrawn 1.571428571 41 | marble marching 2.615384615 42 | polo charged 2.125 43 | mark missing 2.333333333 44 | battleship army 4.235294118 45 | medium organization 2.5625 46 | pennsylvania writer 1.466666667 47 | hamlet poet 3.882352941 48 | battle prisoners 3.705882353 49 | guild smith 2.75 50 | mud soil 4.235294118 51 | crime assaulted 3.941176471 52 | mussolini stability 2.133333333 53 | lincoln division 2.4375 54 | slaves insured 2.2 55 | summer winter 4.375 56 | integration dignity 3.058823529 57 | money quota 2.5 58 | honolulu vacation 3.6875 59 | libya forged 2.461538462 60 | cheers musician 2.823529412 61 | session surprises 1.8125 62 | billion campaigning 2.571428571 63 | perjury soybean 2.0625 64 | forswearing perjury 3.3125 65 | costume halloween 3.4375 66 | bulgarian nurses 1.941176471 67 | costume ultimate 2.5 68 | faith judging 2.235294118 69 | france bridges 2.235294118 70 | citizenship casey 2.2 71 | recreation dish 1.4 72 | intelligence troubles 1.625 73 | germany worst 1.4375 74 | chaos death 2.75 75 | sydney hancock 2.857142857 76 | sabbath stevenson 2.214285714 77 | espionage passport 2.3125 78 | political today 1.6875 79 | pipe convertible 2 80 | scouting demonstrate 2.5625 81 | salute patterns 2.235294118 82 | reichstag germany 2.285714286 83 | radiation costumes 1.5625 84 | horace grief 1.764705882 85 | sale rental 3.470588235 86 | open close 4.058823529 87 | photography proving 2.375 88 | propaganda germany 1.705882353 89 | assassination forbes 2.071428571 90 | mirror duel 1.928571429 91 | probability hanging 2.058823529 92 | africa theater 1.5 93 | hell heaven 4.117647059 94 | mussolini italy 3 95 | composer beethoven 3.647058824 96 | minister forthcoming 1.764705882 97 | brussels sweden 3.176470588 98 | neutral parish 1.6 99 | emotion taxation 1.733333333 100 | louisiana simple 2 101 | quarantine disease 3 102 | cannon imprisoned 2.625 103 | bronze suspicion 2 104 | pearl interim 2.352941176 105 | artist paint 4.117647059 106 | relay family 2.0625 107 | art mortality 2.294117647 108 | food investment 2.25 109 | alt tenor 2.692307692 110 | catholics protestant 3.5625 111 | militia landlord 3.0625 112 | battle warships 4.176470588 113 | alcohol fleeing 2.5625 114 | coil ashes 3.117647059 115 | poland russia 4 116 | explosive builders 2.4375 117 | aeronautics plane 4.277777778 118 | charge sentence 3.133333333 119 | pet retiring 2 120 | drink alcohol 4.352941176 121 | stability species 2.375 122 | colonies depression 2 123 | easter preference 2.0625 124 | genius intellect 4.090909091 125 | diamond killed 1.555555556 126 | slavery african 2.8 127 | jurisdiction law 4.454545455 128 | saints repeal 1.555555556 129 | conspiracy campaign 2.166666667 130 | operator extracts 2.214285714 131 | physician action 2.153846154 132 | electronics guess 1.916666667 133 | slavery diamond 2.285714286 134 | quarterback sport 3.142857143 135 | assassination killed 4.285714286 136 | slavery klan 2.230769231 137 | heroin shoot 2.692307692 138 | birds disturbances 1.692307692 139 | palestinians turks 2.5 140 | citizenship court 2.5 141 | immunity violation 2.076923077 142 | alternative contend 2.461538462 143 | chile plates 2.692307692 144 | abraham stranger 1.846153846 145 | kansas city 3.769230769 146 | month year 3.857142857 147 | month day 3.857142857 148 | 
amateur actor 2.333333333 149 | afghanistan war 3.384615385 150 | transmission maxwell 2.25 151 | manchester ambitious 1.923076923 152 | program battered 1.928571429 153 | drawing music 2.583333333 154 | exile pledges 2.307692308 155 | adventure sixteen 1.538461538 156 | exile threats 2.166666667 157 | concrete wings 1.428571429 158 | seizure bishops 2 159 | submarine sea 3.857142857 160 | villa mayor 2.25 161 | trade farley 2.375 162 | nature forest 3.636363636 163 | chronicle young 1.9 164 | radical bishops 1.818181818 165 | pakistan radical 2.875 166 | fire water 4.266666667 167 | gossip nuisance 3.0625 168 | con examiner 2.266666667 169 | satellite space 3.75 170 | essay boston 2 171 | miniature statue 3.6 172 | spill pollution 3.5 173 | minister council 3.5625 174 | landscape mountain 3.5625 175 | religion remedy 2.5625 176 | ship storm 3.5 177 | college scientist 2.8125 178 | crystal oldest 2.5625 179 | afghanistan wise 2.066666667 180 | trinity religion 3.133333333 181 | homer odyssey 2.857142857 182 | parish clue 2.4375 183 | actress actor 4.0625 184 | patent professionals 2.375 185 | chaos horrible 3.066666667 186 | acre earthquake 2.125 187 | government immunity 2 188 | football justice 1.8 189 | gambling money 3.75 190 | corruption nervous 1.875 191 | cardinals villages 2.375 192 | life death 4.103448276 193 | artillery sanctions 2.428571429 194 | jerusalem murdered 2.357142857 195 | cell brick 3.285714286 196 | knowledge promoter 2.642857143 197 | adventure rails 2.571428571 198 | houston crash 2.357142857 199 | oxford subcommittee 2.642857143 200 | militia weapon 3.785714286 201 | manufacturer meat 1.857142857 202 | damages reaction 3.071428571 203 | sea fishing 4.357142857 204 | atomic clash 2.785714286 205 | broadcasting athletics 3 206 | mystery expedition 2.538461538 207 | kremlin soviets 3.166666667 208 | pig blaze 1.75 209 | riverside vietnamese 2.25 210 | bitter protective 1.923076923 211 | disaster announced 2.384615385 212 | pork blaze 2.230769231 213 | feet international 1.916666667 214 | radical uniform 2.5 215 | gossip condemned 2.692307692 216 | mozart wagner 3.166666667 217 | soccer boxing 3.4 218 | radical roles 2.75 219 | rescued slaying 3 220 | researchers tested 3.538461538 221 | sales season 2.307692308 222 | homeless refugees 3.615384615 223 | pakistan repair 1.75 224 | athens painting 2.294117647 225 | tiger woods 3.375 226 | aircraft plane 4.473684211 227 | solar carbon 2.842105263 228 | enterprise bankruptcy 2.5 229 | homer springfield 2.833333333 230 | coin awards 2.166666667 231 | rhodes native 2.25 232 | soccer curator 2.125 233 | gasoline stock 2.888888889 234 | guilt extended 2.105263158 235 | rapid singapore 1.764705882 236 | coin banker 3.631578947 237 | london correspondence 1.944444444 238 | pop sex 2.6 239 | medicine bread 2.176470588 240 | asia animal 1.555555556 241 | pop clubhouse 3.210526316 242 | nazi defensive 2.055555556 243 | earth poles 3.421052632 244 | thailand crowded 2.166666667 245 | day independence 3.473684211 246 | controversy pitch 2.375 247 | stock gasoline 3.166666667 248 | composers mozart 3.833333333 249 | tone piano 3.722222222 250 | paris chef 2.111111111 251 | profession responsible 2.722222222 252 | bankruptcy chronicle 2 253 | lebanon war 2.722222222 254 | israel terror 3.055555556 255 | angola military 2.941176471 256 | chemistry patients 2.357142857 257 | munich constitution 3.071428571 258 | piano theater 3.266666667 259 | poetry artist 3.8 260 | acre burned 1.769230769 261 | religion abortion 2.076923077 262 | 
jazz music 4.533333333 263 | government transportation 3 264 | color wine 2.533333333 265 | jackson quota 1.692307692 266 | shariff deputy 3.642857143 267 | boat negroes 2 268 | shooting sentenced 2.933333333 269 | republicans friedman 2.416666667 270 | politics brokerage 2.5 271 | russian stalin 3.357142857 272 | love philip 2.5 273 | nuclear plant 3.733333333 274 | jamaica queens 3.076923077 275 | dollar asylum 1.846153846 276 | bridge rowing 2.785714286 277 | berlin germany 4 278 | funeral death 4.714285714 279 | albert einstein 4.266666667 280 | gulf shore 3.857142857 281 | ecuador argentina 3.266666667 282 | britain france 3.714285714 283 | sports score 3.866666667 284 | socialism capitalism 3.785714286 285 | treaty peace 4.166666667 286 | exchange market 4.266666667 287 | marriage anniversary 4.333333333 288 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/ws353.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10.00 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.50 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7.00 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | stock market 8.08 22 | stock phone 1.62 23 | stock CD 1.31 24 | stock jaguar 0.92 25 | stock egg 1.81 26 | fertility egg 6.69 27 | stock live 3.73 28 | stock life 0.92 29 | book library 7.46 30 | bank money 8.12 31 | wood forest 7.73 32 | money cash 9.15 33 | professor cucumber 0.31 34 | king cabbage 0.23 35 | king queen 8.58 36 | king rook 5.92 37 | bishop rabbi 6.69 38 | Jerusalem Israel 8.46 39 | Jerusalem Palestinian 7.65 40 | holy sex 1.62 41 | fuck sex 9.44 42 | Maradona football 8.62 43 | football soccer 9.03 44 | football basketball 6.81 45 | football tennis 6.63 46 | tennis racket 7.56 47 | Arafat peace 6.73 48 | Arafat terror 7.65 49 | Arafat Jackson 2.50 50 | law lawyer 8.38 51 | movie star 7.38 52 | movie popcorn 6.19 53 | movie critic 6.73 54 | movie theater 7.92 55 | physics proton 8.12 56 | physics chemistry 7.35 57 | space chemistry 4.88 58 | alcohol chemistry 5.54 59 | vodka gin 8.46 60 | vodka brandy 8.13 61 | drink car 3.04 62 | drink ear 1.31 63 | drink mouth 5.96 64 | drink eat 6.87 65 | baby mother 7.85 66 | drink mother 2.65 67 | car automobile 8.94 68 | gem jewel 8.96 69 | journey voyage 9.29 70 | boy lad 8.83 71 | coast shore 9.10 72 | asylum madhouse 8.87 73 | magician wizard 9.02 74 | midday noon 9.29 75 | furnace stove 8.79 76 | food fruit 7.52 77 | bird cock 7.10 78 | bird crane 7.38 79 | tool implement 6.46 80 | brother monk 6.27 81 | crane implement 2.69 82 | lad brother 4.46 83 | journey car 5.85 84 | monk oracle 5.00 85 | cemetery woodland 2.08 86 | food rooster 4.42 87 | coast hill 4.38 88 | forest graveyard 1.85 89 | shore woodland 3.08 90 | monk slave 0.92 91 | coast forest 3.15 92 | lad wizard 0.92 93 | chord smile 0.54 94 | glass magician 2.08 95 | noon string 0.54 96 | rooster voyage 0.62 97 | money dollar 8.42 98 | money cash 9.08 99 | money currency 9.04 100 | money wealth 8.27 101 | money property 7.57 102 | money possession 7.29 103 | money bank 8.50 104 | money deposit 7.73 105 | money withdrawal 6.88 106 | money laundering 5.65 107 | money operation 3.31 108 | tiger jaguar 
8.00 109 | tiger feline 8.00 110 | tiger carnivore 7.08 111 | tiger mammal 6.85 112 | tiger animal 7.00 113 | tiger organism 4.77 114 | tiger fauna 5.62 115 | tiger zoo 5.87 116 | psychology psychiatry 8.08 117 | psychology anxiety 7.00 118 | psychology fear 6.85 119 | psychology depression 7.42 120 | psychology clinic 6.58 121 | psychology doctor 6.42 122 | psychology Freud 8.21 123 | psychology mind 7.69 124 | psychology health 7.23 125 | psychology science 6.71 126 | psychology discipline 5.58 127 | psychology cognition 7.48 128 | planet star 8.45 129 | planet constellation 8.06 130 | planet moon 8.08 131 | planet sun 8.02 132 | planet galaxy 8.11 133 | planet space 7.92 134 | planet astronomer 7.94 135 | precedent example 5.85 136 | precedent information 3.85 137 | precedent cognition 2.81 138 | precedent law 6.65 139 | precedent collection 2.50 140 | precedent group 1.77 141 | precedent antecedent 6.04 142 | cup coffee 6.58 143 | cup tableware 6.85 144 | cup article 2.40 145 | cup artifact 2.92 146 | cup object 3.69 147 | cup entity 2.15 148 | cup drink 7.25 149 | cup food 5.00 150 | cup substance 1.92 151 | cup liquid 5.90 152 | jaguar cat 7.42 153 | jaguar car 7.27 154 | energy secretary 1.81 155 | secretary senate 5.06 156 | energy laboratory 5.09 157 | computer laboratory 6.78 158 | weapon secret 6.06 159 | FBI fingerprint 6.94 160 | FBI investigation 8.31 161 | investigation effort 4.59 162 | Mars water 2.94 163 | Mars scientist 5.63 164 | news report 8.16 165 | canyon landscape 7.53 166 | image surface 4.56 167 | discovery space 6.34 168 | water seepage 6.56 169 | sign recess 2.38 170 | Wednesday news 2.22 171 | mile kilometer 8.66 172 | computer news 4.47 173 | territory surface 5.34 174 | atmosphere landscape 3.69 175 | president medal 3.00 176 | war troops 8.13 177 | record number 6.31 178 | skin eye 6.22 179 | Japanese American 6.50 180 | theater history 3.91 181 | volunteer motto 2.56 182 | prejudice recognition 3.00 183 | decoration valor 5.63 184 | century year 7.59 185 | century nation 3.16 186 | delay racism 1.19 187 | delay news 3.31 188 | minister party 6.63 189 | peace plan 4.75 190 | minority peace 3.69 191 | attempt peace 4.25 192 | government crisis 6.56 193 | deployment departure 4.25 194 | deployment withdrawal 5.88 195 | energy crisis 5.94 196 | announcement news 7.56 197 | announcement effort 2.75 198 | stroke hospital 7.03 199 | disability death 5.47 200 | victim emergency 6.47 201 | treatment recovery 7.91 202 | journal association 4.97 203 | doctor personnel 5.00 204 | doctor liability 5.19 205 | liability insurance 7.03 206 | school center 3.44 207 | reason hypertension 2.31 208 | reason criterion 5.91 209 | hundred percent 7.38 210 | Harvard Yale 8.13 211 | hospital infrastructure 4.63 212 | death row 5.25 213 | death inmate 5.03 214 | lawyer evidence 6.69 215 | life death 7.88 216 | life term 4.50 217 | word similarity 4.75 218 | board recommendation 4.47 219 | governor interview 3.25 220 | OPEC country 5.63 221 | peace atmosphere 3.69 222 | peace insurance 2.94 223 | territory kilometer 5.28 224 | travel activity 5.00 225 | competition price 6.44 226 | consumer confidence 4.13 227 | consumer energy 4.75 228 | problem airport 2.38 229 | car flight 4.94 230 | credit card 8.06 231 | credit information 5.31 232 | hotel reservation 8.03 233 | grocery money 5.94 234 | registration arrangement 6.00 235 | arrangement accommodation 5.41 236 | month hotel 1.81 237 | type kind 8.97 238 | arrival hotel 6.00 239 | bed closet 6.72 240 | closet clothes 8.00 241 | 
situation conclusion 4.81 242 | situation isolation 3.88 243 | impartiality interest 5.16 244 | direction combination 2.25 245 | street place 6.44 246 | street avenue 8.88 247 | street block 6.88 248 | street children 4.94 249 | listing proximity 2.56 250 | listing category 6.38 251 | cell phone 7.81 252 | production hike 1.75 253 | benchmark index 4.25 254 | media trading 3.88 255 | media gain 2.88 256 | dividend payment 7.63 257 | dividend calculation 6.48 258 | calculation computation 8.44 259 | currency market 7.50 260 | OPEC oil 8.59 261 | oil stock 6.34 262 | announcement production 3.38 263 | announcement warning 6.00 264 | profit warning 3.88 265 | profit loss 7.63 266 | dollar yen 7.78 267 | dollar buck 9.22 268 | dollar profit 7.38 269 | dollar loss 6.09 270 | computer software 8.50 271 | network hardware 8.31 272 | phone equipment 7.13 273 | equipment maker 5.91 274 | luxury car 6.47 275 | five month 3.38 276 | report gain 3.63 277 | investor earning 7.13 278 | liquid water 7.89 279 | baseball season 5.97 280 | game victory 7.03 281 | game team 7.69 282 | marathon sprint 7.47 283 | game series 6.19 284 | game defeat 6.97 285 | seven series 3.56 286 | seafood sea 7.47 287 | seafood food 8.34 288 | seafood lobster 8.70 289 | lobster food 7.81 290 | lobster wine 5.70 291 | food preparation 6.22 292 | video archive 6.34 293 | start year 4.06 294 | start match 4.47 295 | game round 5.97 296 | boxing round 7.61 297 | championship tournament 8.36 298 | fighting defeating 7.41 299 | line insurance 2.69 300 | day summer 3.94 301 | summer drought 7.16 302 | summer nature 5.63 303 | day dawn 7.53 304 | nature environment 8.31 305 | environment ecology 8.81 306 | nature man 6.25 307 | man woman 8.30 308 | man governor 5.25 309 | murder manslaughter 8.53 310 | soap opera 7.94 311 | opera performance 6.88 312 | life lesson 5.94 313 | focus life 4.06 314 | production crew 6.25 315 | television film 7.72 316 | lover quarrel 6.19 317 | viewer serial 2.97 318 | possibility girl 1.94 319 | population development 3.75 320 | morality importance 3.31 321 | morality marriage 3.69 322 | Mexico Brazil 7.44 323 | gender equality 6.41 324 | change attitude 5.44 325 | family planning 6.25 326 | opera industry 2.63 327 | sugar approach 0.88 328 | practice institution 3.19 329 | ministry culture 4.69 330 | problem challenge 6.75 331 | size prominence 5.31 332 | country citizen 7.31 333 | planet people 5.75 334 | development issue 3.97 335 | experience music 3.47 336 | music project 3.63 337 | glass metal 5.56 338 | aluminum metal 7.83 339 | chance credibility 3.88 340 | exhibit memorabilia 5.31 341 | concert virtuoso 6.81 342 | rock jazz 7.59 343 | museum theater 7.19 344 | observation architecture 4.38 345 | space world 6.53 346 | preservation world 6.19 347 | admission ticket 7.69 348 | shower thunderstorm 6.31 349 | shower flood 6.03 350 | weather forecast 8.34 351 | disaster area 6.25 352 | governor office 6.34 353 | architecture century 3.78 354 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/ws353_relatedness.txt: -------------------------------------------------------------------------------- 1 | computer keyboard 7.62 2 | Jerusalem Israel 8.46 3 | planet galaxy 8.11 4 | canyon landscape 7.53 5 | OPEC country 5.63 6 | day summer 3.94 7 | day dawn 7.53 8 | country citizen 7.31 9 | planet people 5.75 10 | environment ecology 8.81 11 | Maradona football 8.62 12 | OPEC oil 8.59 13 | money bank 8.50 14 | computer software 8.50 15 | law lawyer 8.38 16 | 
weather forecast 8.34 17 | network hardware 8.31 18 | nature environment 8.31 19 | FBI investigation 8.31 20 | money wealth 8.27 21 | psychology Freud 8.21 22 | news report 8.16 23 | war troops 8.13 24 | physics proton 8.12 25 | bank money 8.12 26 | stock market 8.08 27 | planet constellation 8.06 28 | credit card 8.06 29 | hotel reservation 8.03 30 | closet clothes 8.00 31 | soap opera 7.94 32 | planet astronomer 7.94 33 | planet space 7.92 34 | movie theater 7.92 35 | treatment recovery 7.91 36 | baby mother 7.85 37 | money deposit 7.73 38 | television film 7.72 39 | psychology mind 7.69 40 | game team 7.69 41 | admission ticket 7.69 42 | Jerusalem Palestinian 7.65 43 | Arafat terror 7.65 44 | boxing round 7.61 45 | computer internet 7.58 46 | money property 7.57 47 | tennis racket 7.56 48 | telephone communication 7.50 49 | currency market 7.50 50 | psychology cognition 7.48 51 | seafood sea 7.47 52 | book paper 7.46 53 | book library 7.46 54 | psychology depression 7.42 55 | fighting defeating 7.41 56 | movie star 7.38 57 | hundred percent 7.38 58 | dollar profit 7.38 59 | money possession 7.29 60 | cup drink 7.25 61 | psychology health 7.23 62 | summer drought 7.16 63 | investor earning 7.13 64 | company stock 7.08 65 | stroke hospital 7.03 66 | liability insurance 7.03 67 | game victory 7.03 68 | psychology anxiety 7.00 69 | game defeat 6.97 70 | FBI fingerprint 6.94 71 | money withdrawal 6.88 72 | psychology fear 6.85 73 | drug abuse 6.85 74 | concert virtuoso 6.81 75 | computer laboratory 6.78 76 | love sex 6.77 77 | problem challenge 6.75 78 | movie critic 6.73 79 | Arafat peace 6.73 80 | bed closet 6.72 81 | lawyer evidence 6.69 82 | fertility egg 6.69 83 | precedent law 6.65 84 | minister party 6.63 85 | psychology clinic 6.58 86 | cup coffee 6.58 87 | water seepage 6.56 88 | government crisis 6.56 89 | space world 6.53 90 | dividend calculation 6.48 91 | victim emergency 6.47 92 | luxury car 6.47 93 | tool implement 6.46 94 | competition price 6.44 95 | psychology doctor 6.42 96 | gender equality 6.41 97 | listing category 6.38 98 | video archive 6.34 99 | oil stock 6.34 100 | governor office 6.34 101 | discovery space 6.34 102 | record number 6.31 103 | brother monk 6.27 104 | production crew 6.25 105 | nature man 6.25 106 | family planning 6.25 107 | disaster area 6.25 108 | food preparation 6.22 109 | preservation world 6.19 110 | movie popcorn 6.19 111 | lover quarrel 6.19 112 | game series 6.19 113 | dollar loss 6.09 114 | weapon secret 6.06 115 | shower flood 6.03 116 | registration arrangement 6.00 117 | arrival hotel 6.00 118 | announcement warning 6.00 119 | game round 5.97 120 | baseball season 5.97 121 | drink mouth 5.96 122 | life lesson 5.94 123 | grocery money 5.94 124 | energy crisis 5.94 125 | reason criterion 5.91 126 | equipment maker 5.91 127 | cup liquid 5.90 128 | deployment withdrawal 5.88 129 | tiger zoo 5.87 130 | journey car 5.85 131 | money laundering 5.65 132 | summer nature 5.63 133 | decoration valor 5.63 134 | Mars scientist 5.63 135 | alcohol chemistry 5.54 136 | disability death 5.47 137 | change attitude 5.44 138 | arrangement accommodation 5.41 139 | territory surface 5.34 140 | size prominence 5.31 141 | exhibit memorabilia 5.31 142 | credit information 5.31 143 | territory kilometer 5.28 144 | death row 5.25 145 | doctor liability 5.19 146 | impartiality interest 5.16 147 | energy laboratory 5.09 148 | secretary senate 5.06 149 | death inmate 5.03 150 | monk oracle 5.00 151 | cup food 5.00 152 | journal association 4.97 153 | street children 
4.94 154 | car flight 4.94 155 | space chemistry 4.88 156 | situation conclusion 4.81 157 | word similarity 4.75 158 | peace plan 4.75 159 | consumer energy 4.75 160 | ministry culture 4.69 161 | smart student 4.62 162 | investigation effort 4.59 163 | image surface 4.56 164 | life term 4.50 165 | start match 4.47 166 | computer news 4.47 167 | board recommendation 4.47 168 | lad brother 4.46 169 | observation architecture 4.38 170 | coast hill 4.38 171 | deployment departure 4.25 172 | benchmark index 4.25 173 | attempt peace 4.25 174 | consumer confidence 4.13 175 | start year 4.06 176 | focus life 4.06 177 | development issue 3.97 178 | theater history 3.91 179 | situation isolation 3.88 180 | profit warning 3.88 181 | media trading 3.88 182 | chance credibility 3.88 183 | precedent information 3.85 184 | architecture century 3.78 185 | population development 3.75 186 | stock live 3.73 187 | peace atmosphere 3.69 188 | morality marriage 3.69 189 | minority peace 3.69 190 | atmosphere landscape 3.69 191 | report gain 3.63 192 | music project 3.63 193 | seven series 3.56 194 | experience music 3.47 195 | school center 3.44 196 | five month 3.38 197 | announcement production 3.38 198 | morality importance 3.31 199 | money operation 3.31 200 | delay news 3.31 201 | governor interview 3.25 202 | practice institution 3.19 203 | century nation 3.16 204 | coast forest 3.15 205 | shore woodland 3.08 206 | drink car 3.04 207 | president medal 3.00 208 | prejudice recognition 3.00 209 | viewer serial 2.97 210 | peace insurance 2.94 211 | Mars water 2.94 212 | media gain 2.88 213 | precedent cognition 2.81 214 | announcement effort 2.75 215 | line insurance 2.69 216 | crane implement 2.69 217 | drink mother 2.65 218 | opera industry 2.63 219 | volunteer motto 2.56 220 | listing proximity 2.56 221 | precedent collection 2.50 222 | cup article 2.40 223 | sign recess 2.38 224 | problem airport 2.38 225 | reason hypertension 2.31 226 | direction combination 2.25 227 | Wednesday news 2.22 228 | glass magician 2.08 229 | cemetery woodland 2.08 230 | possibility girl 1.94 231 | cup substance 1.92 232 | forest graveyard 1.85 233 | stock egg 1.81 234 | month hotel 1.81 235 | energy secretary 1.81 236 | precedent group 1.77 237 | production hike 1.75 238 | stock phone 1.62 239 | holy sex 1.62 240 | stock CD 1.31 241 | drink ear 1.31 242 | delay racism 1.19 243 | stock life 0.92 244 | stock jaguar 0.92 245 | monk slave 0.92 246 | lad wizard 0.92 247 | sugar approach 0.88 248 | rooster voyage 0.62 249 | noon string 0.54 250 | chord smile 0.54 251 | professor cucumber 0.31 252 | king cabbage 0.23 253 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/ws353_similarity.txt: -------------------------------------------------------------------------------- 1 | tiger cat 7.35 2 | tiger tiger 10.00 3 | plane car 5.77 4 | train car 6.31 5 | television radio 6.77 6 | media radio 7.42 7 | bread butter 6.19 8 | cucumber potato 5.92 9 | doctor nurse 7.00 10 | professor doctor 6.62 11 | student professor 6.81 12 | smart stupid 5.81 13 | wood forest 7.73 14 | money cash 9.15 15 | king queen 8.58 16 | king rook 5.92 17 | bishop rabbi 6.69 18 | fuck sex 9.44 19 | football soccer 9.03 20 | football basketball 6.81 21 | football tennis 6.63 22 | Arafat Jackson 2.50 23 | physics chemistry 7.35 24 | vodka gin 8.46 25 | vodka brandy 8.13 26 | drink eat 6.87 27 | car automobile 8.94 28 | gem jewel 8.96 29 | journey voyage 9.29 30 | boy lad 8.83 31 | coast shore 9.10 32 | asylum 
madhouse 8.87 33 | magician wizard 9.02 34 | midday noon 9.29 35 | furnace stove 8.79 36 | food fruit 7.52 37 | bird cock 7.10 38 | bird crane 7.38 39 | food rooster 4.42 40 | money dollar 8.42 41 | money currency 9.04 42 | tiger jaguar 8.00 43 | tiger feline 8.00 44 | tiger carnivore 7.08 45 | tiger mammal 6.85 46 | tiger animal 7.00 47 | tiger organism 4.77 48 | tiger fauna 5.62 49 | psychology psychiatry 8.08 50 | psychology science 6.71 51 | psychology discipline 5.58 52 | planet star 8.45 53 | planet moon 8.08 54 | planet sun 8.02 55 | precedent example 5.85 56 | precedent antecedent 6.04 57 | cup tableware 6.85 58 | cup artifact 2.92 59 | cup object 3.69 60 | cup entity 2.15 61 | jaguar cat 7.42 62 | jaguar car 7.27 63 | mile kilometer 8.66 64 | skin eye 6.22 65 | Japanese American 6.50 66 | century year 7.59 67 | announcement news 7.56 68 | doctor personnel 5.00 69 | Harvard Yale 8.13 70 | hospital infrastructure 4.63 71 | life death 7.88 72 | travel activity 5.00 73 | type kind 8.97 74 | street place 6.44 75 | street avenue 8.88 76 | street block 6.88 77 | cell phone 7.81 78 | dividend payment 7.63 79 | calculation computation 8.44 80 | profit loss 7.63 81 | dollar yen 7.78 82 | dollar buck 9.22 83 | phone equipment 7.13 84 | liquid water 7.89 85 | marathon sprint 7.47 86 | seafood food 8.34 87 | seafood lobster 8.70 88 | lobster food 7.81 89 | lobster wine 5.70 90 | championship tournament 8.36 91 | man woman 8.30 92 | man governor 5.25 93 | murder manslaughter 8.53 94 | opera performance 6.88 95 | Mexico Brazil 7.44 96 | glass metal 5.56 97 | aluminum metal 7.83 98 | rock jazz 7.59 99 | museum theater 7.19 100 | shower thunderstorm 6.31 101 | monk oracle 5.00 102 | cup food 5.00 103 | journal association 4.97 104 | street children 4.94 105 | car flight 4.94 106 | space chemistry 4.88 107 | situation conclusion 4.81 108 | word similarity 4.75 109 | peace plan 4.75 110 | consumer energy 4.75 111 | ministry culture 4.69 112 | smart student 4.62 113 | investigation effort 4.59 114 | image surface 4.56 115 | life term 4.50 116 | start match 4.47 117 | computer news 4.47 118 | board recommendation 4.47 119 | lad brother 4.46 120 | observation architecture 4.38 121 | coast hill 4.38 122 | deployment departure 4.25 123 | benchmark index 4.25 124 | attempt peace 4.25 125 | consumer confidence 4.13 126 | start year 4.06 127 | focus life 4.06 128 | development issue 3.97 129 | theater history 3.91 130 | situation isolation 3.88 131 | profit warning 3.88 132 | media trading 3.88 133 | chance credibility 3.88 134 | precedent information 3.85 135 | architecture century 3.78 136 | population development 3.75 137 | stock live 3.73 138 | peace atmosphere 3.69 139 | morality marriage 3.69 140 | minority peace 3.69 141 | atmosphere landscape 3.69 142 | report gain 3.63 143 | music project 3.63 144 | seven series 3.56 145 | experience music 3.47 146 | school center 3.44 147 | five month 3.38 148 | announcement production 3.38 149 | morality importance 3.31 150 | money operation 3.31 151 | delay news 3.31 152 | governor interview 3.25 153 | practice institution 3.19 154 | century nation 3.16 155 | coast forest 3.15 156 | shore woodland 3.08 157 | drink car 3.04 158 | president medal 3.00 159 | prejudice recognition 3.00 160 | viewer serial 2.97 161 | peace insurance 2.94 162 | Mars water 2.94 163 | media gain 2.88 164 | precedent cognition 2.81 165 | announcement effort 2.75 166 | line insurance 2.69 167 | crane implement 2.69 168 | drink mother 2.65 169 | opera industry 2.63 170 | volunteer motto 
2.56 171 | listing proximity 2.56 172 | precedent collection 2.50 173 | cup article 2.40 174 | sign recess 2.38 175 | problem airport 2.38 176 | reason hypertension 2.31 177 | direction combination 2.25 178 | Wednesday news 2.22 179 | glass magician 2.08 180 | cemetery woodland 2.08 181 | possibility girl 1.94 182 | cup substance 1.92 183 | forest graveyard 1.85 184 | stock egg 1.81 185 | month hotel 1.81 186 | energy secretary 1.81 187 | precedent group 1.77 188 | production hike 1.75 189 | stock phone 1.62 190 | holy sex 1.62 191 | stock CD 1.31 192 | drink ear 1.31 193 | delay racism 1.19 194 | stock life 0.92 195 | stock jaguar 0.92 196 | monk slave 0.92 197 | lad wizard 0.92 198 | sugar approach 0.88 199 | rooster voyage 0.62 200 | noon string 0.54 201 | chord smile 0.54 202 | professor cucumber 0.31 203 | king cabbage 0.23 204 | -------------------------------------------------------------------------------- /psdvec/topwordsInList.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | from utils import * 5 | import pdb 6 | import time 7 | import os 8 | import json 9 | 10 | def usage(): 11 | print """Usage:\n topsentwords.py -c config_file -l f1,f2... -o out_file -n count 12 | Options: 13 | config_file: Same config file used by corpus2liblinear.py, 14 | which specifying multiple document directories. 15 | f1,f2: Files containg lists of interesting words. 16 | out_file: Output file to save top interesting words. 17 | Default: 'topwords.txt' 18 | count: Top k words that will be counted. Default: 1000. 19 | """ 20 | 21 | def parseConfigFile(configFilename): 22 | CONF = open(configFilename) 23 | dir_configs = [] 24 | for line in CONF: 25 | line = line.strip() 26 | dir_config = json.loads(line) 27 | dir_configs.append(dir_config) 28 | return dir_configs 29 | 30 | def getListWordCount( docPath, word2freq ): 31 | DOC = open(docPath) 32 | doc = DOC.read() 33 | wordsInSentences, wc = extractSentenceWords(doc, 1) 34 | 35 | interestingWc = 0 36 | for sentence in wordsInSentences: 37 | for w in sentence: 38 | w = w.lower() 39 | if w in word2freq: 40 | word2freq[w] += 1 41 | interestingWc += 1 42 | 43 | return wc, interestingWc 44 | 45 | def processDir( docDir, word2freq ): 46 | print "Processing '%s'" %( docDir ) 47 | 48 | filecount = 0 49 | totalwc = 0 50 | totalInterestingWc = 0 51 | 52 | for filename in os.listdir(docDir): 53 | docPath = docDir + "/" + filename 54 | wc, interestingWc = getListWordCount( docPath, word2freq ) 55 | 56 | totalwc += wc 57 | totalInterestingWc += interestingWc 58 | filecount += 1 59 | 60 | if filecount % 500 == 0: 61 | print "\r%d\r" %filecount, 62 | 63 | print "%d files scanned, totally %d words, %d are interesting" %( filecount, totalwc, totalInterestingWc ) 64 | 65 | def main(): 66 | topword_cutoff = 1000 67 | 68 | configFilename = None 69 | listFilenames = None 70 | outFilename = "topwords.txt" 71 | 72 | try: 73 | opts, args = getopt.getopt(sys.argv[1:],"c:l:o:n:h") 74 | 75 | for opt, arg in opts: 76 | if opt == '-c': 77 | configFilename = arg 78 | if opt == '-o': 79 | outFilename = arg 80 | if opt == '-n': 81 | topword_cutoff = int(arg) 82 | if opt == '-l': 83 | listFilenames = arg.split(",") 84 | if opt == '-h': 85 | usage() 86 | sys.exit(0) 87 | 88 | except getopt.GetoptError, e: 89 | if len(e.args) == 1: 90 | print "Option error: %s" %e.args[0] 91 | usage() 92 | sys.exit(2) 93 | 94 | if not configFilename or not listFilenames: 95 | usage() 96 | sys.exit(2) 97 | 98 | 
dir_configs = parseConfigFile(configFilename) 99 | 100 | word2freq = {} 101 | 102 | totalwc = 0 103 | for listFilename in listFilenames: 104 | filewc = 0 105 | LIST = open(listFilename) 106 | for line in LIST: 107 | if line[0] == ';': 108 | continue 109 | line = line.strip() 110 | if not line: 111 | continue 112 | word2freq[line] = 0 113 | filewc += 1 114 | totalwc += 1 115 | print "%d words loaded from '%s'" %( filewc, listFilename ) 116 | 117 | print "%d words loaded from %d files" %( totalwc, len(listFilenames) ) 118 | 119 | for conf in dir_configs: 120 | processDir( conf['docDir'], word2freq ) 121 | 122 | words = sorted( word2freq.keys(), key=lambda w: word2freq[w], reverse=True ) 123 | topwords = words[:topword_cutoff] 124 | OUT = open(outFilename, "w") 125 | for w in topwords: 126 | OUT.write( "%s\t%d\n" %( w, word2freq[w] ) ) 127 | print "%d words written into '%s'" %( len(topwords), outFilename ) 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /psdvec/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/utils.py -------------------------------------------------------------------------------- /psdvec/vecnorms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # this simple script is to find patterns of the norms (L1) of the learned embeddings 4 | from utils import * 5 | import sys 6 | import operator 7 | import os 8 | import getopt 9 | import math 10 | import pdb 11 | 12 | def usage(): 13 | print "Usage: vecnorms.py [-s -1 first_block_count -2 second_block_count ] embedding_filename" 14 | 15 | def expectation(value_probs): 16 | accuProb = 0 17 | accuExp = 0 18 | for v, p in value_probs: 19 | accuExp += v * p 20 | accuProb += p 21 | 22 | return accuExp / accuProb 23 | 24 | def var_div(value_probs): 25 | expect = expectation(value_probs) 26 | accuVar = 0 27 | accuProb = 0 28 | for v, p in value_probs: 29 | accuVar += (v - expect)**2 * p 30 | accuProb += p 31 | var = accuVar / accuProb 32 | div = math.sqrt(var) 33 | return var, div 34 | 35 | if len(sys.argv) == 1: 36 | usage() 37 | sys.exit(1) 38 | 39 | doSort = False 40 | first_block_count = -1 41 | second_block_count = -1 42 | unigramFilename = 'top1grams-wiki.txt' 43 | 44 | try: 45 | opts, args = getopt.getopt(sys.argv[1:],"s1:2:") 46 | if len(args) != 1: 47 | raise getopt.GetoptError("") 48 | embeddingFilename = args[0] 49 | for opt, arg in opts: 50 | if opt == '-s': 51 | doSort = True 52 | if opt == '-1': 53 | first_block_count = int(arg) 54 | print 'First block: 1-%d' %first_block_count 55 | if opt == '-2': 56 | second_block_count = int(arg) 57 | print 'Second block: %d-%d' %(first_block_count, second_block_count) 58 | if opt == '-u': 59 | # unigram file is used to get a full list of words, 60 | # and also to sort the absent words by their frequencies 61 | unigramFilename = arg 62 | 63 | except getopt.GetoptError: 64 | usage() 65 | sys.exit(2) 66 | 67 | vocab_prob = loadUnigramFile(unigramFilename) 68 | V, vocab, word2id, skippedWords = load_embeddings( embeddingFilename, second_block_count ) 69 | warning("\nCompute norms...") 70 | 71 | word2norm = {} 72 | wordnorms = [] 73 | word_probs1 = [] 74 | word_probs2 = [] 75 | 76 | for i in xrange( len(V) ): 77 | w = vocab[i] 78 | if w not in vocab_prob: 79 | warning( "%s not in vocab, skip\n" %w ) 
80 | continue 81 | 82 | mag = norm1( V[i] ) 83 | word2norm[w] = mag 84 | prob = vocab_prob[w][2] 85 | wordnorms.append( [ w, mag ] ) 86 | if i < first_block_count: 87 | word_probs1.append( [ mag, prob ] ) 88 | elif i < second_block_count: 89 | word_probs2.append( [ mag, prob ] ) 90 | 91 | warning("Done\n") 92 | 93 | if len(word_probs1) > 0: 94 | var1, div1 = var_div(word_probs1) 95 | expect = expectation(word_probs1) 96 | print "First block: %d words, exp: %.2f, var: %.2f, div: %.2f" %( len(word_probs1), expect, var1, div1 ) 97 | if len(word_probs2) > 0: 98 | var2, div2 = var_div(word_probs2) 99 | expect = expectation(word_probs2) 100 | print "Second block: %d words, exp: %.2f, var: %.2f, div: %.2f" %( len(word_probs2), expect, var2, div2 ) 101 | 102 | 103 | if doSort: 104 | warning("Done\nSorting words ascendingly by norm...") 105 | # sort ascendingly by the norm length 106 | sorted_wordnorms = sorted( wordnorms, key=operator.itemgetter(1) ) 107 | wordnorms = sorted_wordnorms 108 | 109 | embeddingFilename = os.path.basename(embeddingFilename) 110 | embeddingFilename = os.path.splitext(embeddingFilename)[0] 111 | 112 | normFilename = "norms_" + embeddingFilename + "-%d.txt" %( len(V) ) 113 | 114 | warning( "Save norms into %s\n" %normFilename ) 115 | NORM = open(normFilename, "w") 116 | 117 | wc = 0 118 | for word_norm in wordnorms: 119 | word, norm = word_norm 120 | NORM.write( "%i %s: %.2f\n" %( word2id[word], word, norm ) ) 121 | wc += 1 122 | 123 | warning( "%d words saved\n" %wc ) 124 | -------------------------------------------------------------------------------- /psdvec/xml2corpus.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use XML::LibXML; 4 | use File::Find; 5 | 6 | my $rootdir = "D:/corpus/rcv1/"; 7 | my $fc = 0; 8 | my $totalbytes = 0; 9 | 10 | find({ wanted => \&process_file, no_chdir => 1 }, $rootdir); 11 | my $totalMB = int( $totalbytes / 1024 / 1024 ); 12 | print STDERR "$fc files processed, totally $totalMB MB\n"; 13 | 14 | sub process_file { 15 | if ( /\.xml$/ ) { 16 | my $doc = XML::LibXML->load_xml(location => $_); 17 | for my $textnode ( $doc->findnodes('/newsitem/text') ){ 18 | print $textnode->textContent(); 19 | $totalbytes += length( $textnode->textContent() ); 20 | $totalMB = int( $totalbytes / 1024 / 1024 ); 21 | } 22 | $fc++; 23 | if( $fc % 500 == 0 ){ 24 | print STDERR "\r$fc $totalMB\r"; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /reuters.bat: -------------------------------------------------------------------------------- 1 | python topicExp.py -s reuters train 2 | python topicExp.py -i reuters-train-5770-sep91-em150-best.topic.vec reuters train,test 3 | python classEval.py reuters topicprop 4 | python classEval.py reuters topic-wvavg 5 | -------------------------------------------------------------------------------- /snippet2topic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | import pdb 5 | import os 6 | from topicvecDir import topicvecDir 7 | from utils import * 8 | 9 | customStopwords = "based via using approach learning multi algorithm algorithms" 10 | 11 | config = dict( snip_filenames = None, 12 | short_name = None, 13 | unigramFilename = "top1grams-wiki.txt", 14 | word_vec_file = "25000-180000-500-BLK-8.0.vec", 15 | K = 20, 16 | N0 = 500, 17 | max_l = 5, 18 | init_l = 1, 19 | max_grad_norm = 0, 20 | # cap the sum 
of Em when updating topic embeddings 21 | # to avoid too big gradients 22 | grad_scale_Em_base = 2500, 23 | topW = 30, 24 | topTopicMassFracPrintThres = 0.1, 25 | alpha0 = 0.1, 26 | alpha1 = 0.1, 27 | iniDelta = 0.1, 28 | MAX_EM_ITERS = 100, 29 | topicDiff_tolerance = 2e-3, 30 | printTopics_iterNum = 10, 31 | zero_topic0 = True, 32 | useDrdtApprox = False, 33 | customStopwords = customStopwords, 34 | remove_stop = True, 35 | normalize_vecs = False, 36 | # shift all embeddings in a document, so that their average is 0 37 | rebase_vecs = True, 38 | rebase_norm_thres = 0.2, 39 | evalKmeans = False, 40 | verbose = 1, 41 | seed = 0 42 | ) 43 | 44 | def usage(): 45 | print """snippet2topic.py [ -v vec_file -a alpha ... ] snip_file 46 | Options: 47 | -k: Number of topic embeddings to extract. Default: 20 48 | -v: Existing embedding file of all words. 49 | -r: Existing residual file of core words. 50 | -a: Hyperparameter alpha. Default: 0.1. 51 | -i: Number of iterations of the EM procedure. Default: 100 52 | -u: Unigram file, to obtain unigram probs. 53 | -l: Magnitude of topic embeddings. 54 | -A: Append to the old log file. 55 | -s: Seed the random number generator with x. Used to repeat experiments 56 | -n: Nickname (short name) for the snip_file 57 | """ 58 | 59 | def getOptions(): 60 | global config 61 | 62 | try: 63 | opts, args = getopt.getopt(sys.argv[1:],"k:v:a:i:u:l:s:n:r:Ah") 64 | if len(args) != 1: 65 | raise getopt.GetoptError("") 66 | config['snip_filename'] = args[0] 67 | 68 | for opt, arg in opts: 69 | if opt == '-k': 70 | config['K'] = int(arg) 71 | if opt == '-v': 72 | config['word_vec_file'] = arg 73 | if opt == '-a': 74 | config['alpha1'] = float(arg) 75 | if opt == '-i': 76 | config['MAX_EM_ITERS'] = int(arg) 77 | if opt == '-u': 78 | config['unigramFilename'] = arg 79 | if opt == '-l': 80 | config['max_l'] = int(arg) 81 | if opt == '-s': 82 | config['seed'] = int(arg) 83 | if opt == '-A': 84 | config['appendLogfile'] = True 85 | if opt == '-n': 86 | config['short_name'] = arg 87 | if opt == '-r': 88 | config['useDrdtApprox'] = True 89 | if opt == '-h': 90 | usage() 91 | sys.exit(0) 92 | 93 | basename = os.path.basename(args[0]) 94 | if config['short_name']: 95 | config['logfilename'] = config['short_name'] 96 | elif len(args) > 1: 97 | config['logfilename'] = "(%d)%s" %( len(args), basename ) 98 | else: 99 | config['logfilename'] = basename 100 | 101 | except getopt.GetoptError: 102 | usage() 103 | sys.exit(2) 104 | 105 | return config 106 | 107 | def main(): 108 | 109 | config = getOptions() 110 | snip_filename = config['snip_filename'] 111 | snips_words = [] 112 | snips_name = [] 113 | 114 | with open(snip_filename) as DOC: 115 | snip_lines = [] 116 | snipcount = 0 117 | snips_wc = 0 118 | for line in DOC: 119 | line = line.strip() 120 | if line: 121 | snip_lines.append(line) 122 | else: 123 | sniptext = " ".join(snip_lines) 124 | wordsInSentences, wc = extractSentenceWords(sniptext, remove_punc="iso-8859-1") 125 | snips_wc += wc 126 | snipcount += 1 127 | snips_words.append(wordsInSentences) 128 | snips_name.append( "%s-row%d" %(snip_filename, snipcount) ) 129 | 130 | snipfile_avgwc = snips_wc * 1.0 / snipcount 131 | print "%d words extracted from %d snippets in '%s'.
Avg %.1f words each row" %( snips_wc, 132 | snipcount, snip_filename, snipfile_avgwc ) 133 | 134 | topicvec = topicvecDir(**config) 135 | topicvec.setDocs( snips_words, snips_name ) 136 | 137 | best_last_Ts, Em, docs_Em, Pi = topicvec.inference() 138 | 139 | basename = os.path.basename(config['logfilename']) 140 | basetrunk = os.path.splitext(basename)[0] 141 | 142 | best_it, best_T, best_loglike = best_last_Ts[0] 143 | save_matrix_as_text( basetrunk + "-em%d-best.topic.vec" %best_it, "topic", best_T ) 144 | 145 | if best_last_Ts[1]: 146 | last_it, last_T, last_loglike = best_last_Ts[1] 147 | save_matrix_as_text( basetrunk + "-em%d-last.topic.vec" %last_it, "topic", last_T ) 148 | 149 | if __name__ == '__main__': 150 | main() 151 | -------------------------------------------------------------------------------- /test-docs/Drug Goes From 13.50 a Tablet to 750, Overnight.txt: -------------------------------------------------------------------------------- 1 | Specialists in infectious disease are protesting a gigantic overnight increase in the price of a 62-year-old drug that is the standard of care for treating a life-threatening parasitic infection. 2 | 3 | The drug, called Daraprim, was acquired in August by Turing Pharmaceuticals, a start-up run by a former hedge fund manager. Turing immediately raised the price to $750 a tablet from $13.50, bringing the annual cost of treatment for some patients to hundreds of thousands of dollars. 4 | 5 | “What is it that they are doing differently that has led to this dramatic increase?” said Dr. Judith Aberg, the chief of the division of infectious diseases at the Icahn School of Medicine at Mount Sinai. She said the price increase could force hospitals to use “alternative therapies that may not have the same efficacy.” 6 | 7 | Turing’s price increase is not an isolated example. While most of the attention on pharmaceutical prices has been on new drugs for diseases like cancer, hepatitis C and high cholesterol, there is also growing concern about huge price increases on older drugs, some of them generic, that have long been mainstays of treatment. 8 | 9 | Although some price increases have been caused by shortages, others have resulted from a business strategy of buying old neglected drugs and turning them into high-priced “specialty drugs.” 10 | 11 | Cycloserine, a drug used to treat dangerous multidrug-resistant tuberculosis, was just increased in price to $10,800 for 30 pills from $500 after its acquisition by Rodelis Therapeutics. Scott Spencer, general manager of Rodelis, said the company needed to invest to make sure the supply of the drug remained reliable. He said the company provided the drug free to certain needy patients. 12 | 13 | In August, two members of Congress investigating generic drug price increases wrote to Valeant Pharmaceuticals after that company acquired two heart drugs, Isuprel and Nitropress, from Marathon Pharmaceuticals and promptly raised their prices by 525 percent and 212 percent respectively. Marathon had acquired the drugs from another company in 2013 and had quintupled their prices, according to the lawmakers, Senator Bernie Sanders, the Vermont independent who is seeking the Democratic nomination for president, and Representative Elijah E. Cummings, Democrat of Maryland. 14 | 15 | Doxycycline, an antibiotic, went from $20 a bottle in October 2013 to $1,849 by April 2014, according to the two lawmakers. 
16 | 17 | The Infectious Diseases Society of America and the HIV Medicine Association sent a joint letter to Turing earlier this month calling the price increase for Daraprim “unjustifiable for the medically vulnerable patient population” and “unsustainable for the health care system.” An organization representing the directors of state AIDS programs has also been looking into the price increase, according to doctors and patient advocates. 18 | 19 | Daraprim, known generically as pyrimethamine, is used mainly to treat toxoplasmosis, a parasite infection that can cause serious or even life-threatening problems for babies born to women who become infected during pregnancy, and also for people with compromised immune systems, like AIDS patients and certain cancer patients. 20 | 21 | Martin Shkreli, the founder and chief executive of Turing, said that the drug is so rarely used that the impact on the health system would be minuscule and that Turing would use the money it earns to develop better treatments for toxoplasmosis, with fewer side effects. 22 | 23 | “This isn’t the greedy drug company trying to gouge patients, it is us trying to stay in business,” Mr. Shkreli said. He said that many patients use the drug for far less than a year and that the price was now more in line with those of other drugs for rare diseases. 24 | 25 | “This is still one of the smallest pharmaceutical products in the world,” he said. “It really doesn’t make sense to get any criticism for this.” 26 | 27 | This is not the first time the 32-year-old Mr. Shkreli, who has a reputation for both brilliance and brashness, has been the center of controversy. He started MSMB Capital, a hedge fund company, in his 20s and drew attention for urging the Food and Drug Administration not to approve certain drugs made by companies whose stock he was shorting. 28 | 29 | In 2011, Mr. Shkreli started Retrophin, which also acquired old neglected drugs and sharply raised their prices. Retrophin’s board fired Mr. Shkreli a year ago. Last month, it filed a complaint in Federal District Court in Manhattan, accusing him of using Retrophin as a personal piggy bank to pay back angry investors in his hedge fund. 30 | 31 | Mr. Shkreli has denied the accusations. He has filed for arbitration against his old company, which he says owes him at least $25 million in severance. “They are sort of concocting this wild and crazy and unlikely story to swindle me out of the money,” he said. 32 | 33 | Daraprim, which is also used to treat malaria, was approved by the F.D.A. in 1953 and has long been made by GlaxoSmithKline. Glaxo sold United States marketing rights to CorePharma in 2010. Last year, Impax Laboratories agreed to buy Core and affiliated companies for $700 million. In August, Impax sold Daraprim to Turing for $55 million, a deal announced the same day Turing said it had raised $90 million from Mr. Shkreli and other investors in its first round of financing. 34 | 35 | Daraprim cost only about $1 a tablet several years ago, but the drug’s price rose sharply after CorePharma acquired it. According to IMS Health, which tracks prescriptions, sales of the drug jumped to $6.3 million in 2011 from $667,000 in 2010, even as prescriptions held steady at about 12,700. In 2014, after further price increases, sales were $9.9 million, as the number of prescriptions shrank to 8,821. The figures do not include inpatient use in hospitals. 
36 | 37 | Turing’s price increase could bring sales to tens or even hundreds of millions of dollars a year if use remains constant. Medicaid and certain hospitals will be able to get the drug inexpensively under federal rules for discounts and rebates. But private insurers, Medicare and hospitalized patients would have to pay an amount closer to the list price. 38 | 39 | 40 | Some doctors questioned Turing’s claim that there was a need for better drugs, saying the side effects, while potentially serious, could be managed. 41 | 42 | “I certainly don’t think this is one of those diseases where we have been clamoring for better therapies,” said Dr. Wendy Armstrong, professor of infectious diseases at Emory University in Atlanta. 43 | 44 | With the price now high, other companies could conceivably make generic copies, since patents have long expired. One factor that could discourage that option is that Daraprim’s distribution is now tightly controlled, making it harder for generic companies to get the samples they need for the required testing. 45 | 46 | The switch from drugstores to controlled distribution was made in June by Impax, not by Turing. Still, controlled distribution was a strategy Mr. Shkreli talked about at his previous company as a way to thwart generics. 47 | 48 | Some hospitals say they now have trouble getting the drug. “We’ve not had access to the drug for a few months,” said Dr. Armstrong, who also works at Grady Memorial Hospital, a huge public treatment center in Atlanta that serves many low-income patients. 49 | 50 | But Dr. Rima McLeod, medical director of the toxoplasmosis center at the University of Chicago, said that Turing had been good about delivering drugs quickly to patients, sometimes without charge. 51 | 52 | “They have jumped every time I’ve called,” she said. The situation, she added, “seems workable” despite the price increase. 53 | 54 | Daraprim is the standard first treatment for toxoplasmosis, in combination with an antibiotic called sulfadiazine. There are alternative treatments, but there is less data supporting their efficacy. 55 | 56 | Dr. Aberg of Mount Sinai said some hospitals will now find Daraprim too expensive to keep in stock, possibly resulting in treatment delays. She said that Mount Sinai was continuing to use the drug, but each use now required a special review. 57 | 58 | “This seems to be all profit-driven for somebody,” Dr. 
Aberg said, “and I just think it’s a very dangerous process.” 59 | -------------------------------------------------------------------------------- /test-docs/VR-mitrv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/VR-mitrv.txt -------------------------------------------------------------------------------- /test-docs/batman-v-superman.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/batman-v-superman.txt -------------------------------------------------------------------------------- /test-docs/beijing-haze-news.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/beijing-haze-news.txt -------------------------------------------------------------------------------- /test-docs/brain-scar.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/brain-scar.txt -------------------------------------------------------------------------------- /test-docs/britain-EU.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/britain-EU.txt -------------------------------------------------------------------------------- /test-docs/hillary-speech.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/hillary-speech.txt -------------------------------------------------------------------------------- /test-docs/hillary-speech2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/hillary-speech2.txt -------------------------------------------------------------------------------- /test-docs/nips-wiki.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/nips-wiki.txt -------------------------------------------------------------------------------- /test-docs/sanders-speeches.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/sanders-speeches.txt -------------------------------------------------------------------------------- /test-docs/spacex-news.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/spacex-news.txt -------------------------------------------------------------------------------- /test-docs/trump-speech.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/trump-speech.txt 
-------------------------------------------------------------------------------- /topic-competitors/LDA/LDAClassify.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/LDAClassify.zip -------------------------------------------------------------------------------- /topic-competitors/LDA/Readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/Readme.txt -------------------------------------------------------------------------------- /topic-competitors/LDA/classEval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/classEval.py -------------------------------------------------------------------------------- /topic-competitors/LDA/corpusLoader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/corpusLoader.py -------------------------------------------------------------------------------- /topic-competitors/LDA/ldaExp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/ldaExp.py -------------------------------------------------------------------------------- /topic-competitors/doc2vec.py: -------------------------------------------------------------------------------- 1 | import gensim.models.doc2vec as doc2vec 2 | import sys 3 | import pdb 4 | 5 | corpus = sys.argv[1] 6 | 7 | if corpus == '20news': 8 | all_words_file = "20news-all-18791.gibbslda-bow.txt" 9 | train_label_file = "20news-train-11314.slda-label.txt" 10 | train_docvec_file = "20news-train-11314.svm-doc2vec.txt" 11 | test_label_file = "20news-test-7532.slda-label.txt" 12 | test_docvec_file = "20news-test-7532.svm-doc2vec.txt" 13 | all_count = 18791 14 | train_count = 11285 15 | test_count = 7506 16 | else: 17 | all_words_file = "reuters-all-8025.gibbslda-bow.txt" 18 | train_label_file = "reuters-train-5770.slda-label.txt" 19 | train_docvec_file = "reuters-train-5770.svm-doc2vec.txt" 20 | test_label_file = "reuters-test-2255.slda-label.txt" 21 | test_docvec_file = "reuters-test-2255.svm-doc2vec.txt" 22 | all_count = 8025 23 | train_count = 5770 24 | test_count = 2255 25 | 26 | dim = 400 27 | corpus = doc2vec.TaggedLineDocument(all_words_file) 28 | model = doc2vec.Doc2Vec(corpus,size=dim, window=8, min_count=5, workers=4) 29 | TRAIN_DOC2VEC = open(train_docvec_file, "w") 30 | TRAIN_LABEL = open(train_label_file) 31 | 32 | #pdb.set_trace() 33 | 34 | for d in xrange(1, train_count + 1): 35 | doc_vec = model.docvecs[d] 36 | label_line = TRAIN_LABEL.readline().strip() 37 | label = int(label_line) 38 | 39 | TRAIN_DOC2VEC.write( "%d" %(label+1) ) 40 | 41 | for k in xrange(dim): 42 | TRAIN_DOC2VEC.write( " %d:%.3f" %( k + 1, doc_vec[k] ) ) 43 | 44 | TRAIN_DOC2VEC.write("\n") 45 | 46 | TRAIN_DOC2VEC.close() 47 | 48 | print "%d doc vecs written in svm format into '%s'" %( train_count, train_docvec_file ) 49 | 50 | TEST_DOC2VEC = open(test_docvec_file, "w") 51 | TEST_LABEL = open(test_label_file) 52 | for d 
in xrange(train_count + 1, all_count + 1): 53 | doc_vec = model.docvecs[d] 54 | label_line = TEST_LABEL.readline().strip() 55 | label = int(label_line) 56 | 57 | TEST_DOC2VEC.write( "%d" %(label+1) ) 58 | 59 | for k in xrange(dim): 60 | TEST_DOC2VEC.write( " %d:%.3f" %( k + 1, doc_vec[k] ) ) 61 | 62 | TEST_DOC2VEC.write("\n") 63 | 64 | TEST_DOC2VEC.close() 65 | 66 | print "%d doc vecs written in svm format into '%s'" %( test_count, test_docvec_file ) 67 | -------------------------------------------------------------------------------- /topic-competitors/kmeans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # kmeans.py using any of the 20-odd metrics in scipy.spatial.distance 3 | # kmeanssample 2 pass, first sample sqrt(N) 4 | 5 | from __future__ import division 6 | import random 7 | import numpy as np 8 | from scipy.spatial.distance import cdist # $scipy/spatial/distance.py 9 | # http://docs.scipy.org/doc/scipy/reference/spatial.html 10 | from scipy.sparse import issparse # $scipy/sparse/csr.py 11 | 12 | __date__ = "2011-11-17 Nov denis" 13 | # X sparse, any cdist metric: real app ? 14 | # centres get dense rapidly, metrics in high dim hit distance whiteout 15 | # vs unsupervised / semi-supervised svm 16 | 17 | #............................................................................... 18 | def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ): 19 | """ centres, Xtocentre, distances = kmeans( X, initial centres ... ) 20 | in: 21 | X N x dim may be sparse 22 | centres k x dim: initial centres, e.g. random.sample( X, k ) 23 | delta: relative error, iterate until the average distance to centres 24 | is within delta of the previous average distance 25 | maxiter 26 | metric: any of the 20-odd in scipy.spatial.distance 27 | "chebyshev" = max, "cityblock" = L1, "minkowski" with p= 28 | or a function( Xvec, centrevec ), e.g. Lqmetric below 29 | p: for minkowski metric -- local mod cdist for 0 < p < 1 too 30 | verbose: 0 silent, 2 prints running distances 31 | out: 32 | centres, k x dim 33 | Xtocentre: each X -> its nearest centre, ints N -> k 34 | distances, N 35 | see also: kmeanssample below, class Kmeans below. 36 | """ 37 | 38 | if verbose: 39 | print "kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s" % ( 40 | X.shape, centres.shape, delta, maxiter, metric) 41 | allx = np.arange(N) 42 | prevdist = 0 43 | for jiter in range( 1, maxiter+1 ): 44 | D = cdist( X, centres, metric=metric, p=p ) # |X| x |centres| 45 | xtoc = D.argmin(axis=1) # X -> nearest centre 46 | distances = D[allx,xtoc] 47 | avdist = distances.mean() # median ? 48 | if verbose >= 2: 49 | print "kmeans: av |X - nearest centre| = %.4g" % avdist 50 | if (1 - delta) * prevdist <= avdist <= prevdist \ 51 | or jiter == maxiter: 52 | break 53 | prevdist = avdist 54 | for jc in range(k): # (1 pass in C) 55 | c = np.where( xtoc == jc )[0] 56 | if len(c) > 0: 57 | centres[jc] = X[c].mean( axis=0 ) 58 | if verbose: 59 | print "kmeans: %d iterations cluster sizes:" % jiter, np.bincount(xtoc) 60 | if verbose >= 2: 61 | r50 = np.zeros(k) 62 | r90 = np.zeros(k) 63 | for j in range(k): 64 | dist = distances[ xtoc == j ] 65 | if len(dist) > 0: 66 | r50[j], r90[j] = np.percentile( dist, (50, 90) ) 67 | print "kmeans: cluster 50 % radius", r50.astype(int) 68 | print "kmeans: cluster 90 % radius", r90.astype(int) 69 | # scale L1 / dim, L2 / sqrt(dim) ? 
70 | return centres, xtoc, distances 71 | 72 | def randomsample( X, n ): 73 | """ random.sample of the rows of X 74 | X may be sparse -- best csr 75 | """ 76 | sampleix = random.sample( xrange( X.shape[0] ), int(n) ) 77 | return X[sampleix] 78 | 79 | if __name__ == "__main__": 80 | import random 81 | import sys 82 | from time import time 83 | 84 | N = 10000 85 | dim = 10 86 | ncluster = 10 87 | kmdelta = .001 88 | kmiter = 10 89 | metric = "cosine" 90 | seed = 1 91 | 92 | np.set_printoptions( 1, threshold=200, edgeitems=5, suppress=True ) 93 | np.random.seed(seed) 94 | random.seed(seed) 95 | 96 | unigramFilename = "top1grams-wiki.txt" 97 | word_vec_file = "25000-180000-500-BLK-8.0.vec" 98 | 99 | vocab_dict = loadUnigramFile(unigramFilename) 100 | V, vocab, word2ID, skippedWords_whatever = load_embeddings(word_vec_file) 101 | # map of word -> id of all words with embeddings 102 | vocab_dict2 = {} 103 | 104 | if normalize_vecs: 105 | Vnorm = np.array( [ normF(x) for x in V ] ) 106 | for i,w in enumerate(vocab): 107 | if Vnorm[i] == 0: 108 | print "WARN: %s norm is 0" %w 109 | # set to 1 to avoid "divided by 0 exception" 110 | Vnorm[i] = 1 111 | 112 | V /= Vnorm[:, None] 113 | 114 | # dimensionality of topic/word embeddings 115 | N0 = V.shape[1] 116 | 117 | customStopwordList = re.split( "\s+", self.customStopwords ) 118 | for stop_w in customStopwordList: 119 | stopwordDict[stop_w] = 1 120 | print "Custom stopwords: %s" %( ", ".join(customStopwordList) ) 121 | 122 | print "N %d dim %d ncluster %d metric %s" % (N, dim, ncluster, metric) 123 | t0 = time() 124 | 125 | randomcentres = randomsample( X, ncluster ) 126 | centres, xtoc, dist = kmeans( X, randomcentres, 127 | delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 ) 128 | print "%.0f msec" % ((time() - t0) * 1000) 129 | -------------------------------------------------------------------------------- /topic-competitors/labelEval.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import sys 3 | 4 | def getScores( true_classes, pred_classes, average): 5 | precision = metrics.precision_score( true_classes, pred_classes, average=average ) 6 | recall = metrics.recall_score( true_classes, pred_classes, average=average ) 7 | f1 = metrics.f1_score( true_classes, pred_classes, average=average ) 8 | accuracy = metrics.accuracy_score( true_classes, pred_classes ) 9 | return precision, recall, f1, accuracy 10 | 11 | true_labelfile = sys.argv[1] 12 | pred_labelfile = sys.argv[2] 13 | 14 | TRUE = open(true_labelfile) 15 | PRED = open(pred_labelfile) 16 | 17 | true_classes = [] 18 | pred_classes = [] 19 | 20 | for line in TRUE: 21 | line = line.strip() 22 | label = int(line) 23 | true_classes.append(label) 24 | 25 | for line in PRED: 26 | line = line.strip() 27 | label = int(line) 28 | pred_classes.append(label) 29 | 30 | print metrics.classification_report(true_classes, pred_classes, digits=3) 31 | 32 | for average in ['micro', 'macro']: 33 | precision, recall, f1, acc = getScores( true_classes, pred_classes, average ) 34 | print "Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average, 35 | precision, recall, f1, acc ) 36 | -------------------------------------------------------------------------------- /topic-competitors/lftm2svm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | lftm_file = sys.argv[1] 4 | 5 | train_words_file = "reuters-train-5770.gibbslda-words.txt" 6 | train_label_file = 
"reuters-train-5770.slda-label.txt" 7 | test_words_file = "reuters-test-2255.gibbslda-words.txt" 8 | test_label_file = "reuters-test-2255.slda-label.txt" 9 | 10 | LFTM_TOPIC = open(lftm_file) 11 | TRAIN_WORDS = open(train_words_file) 12 | TEST_WORDS = open(test_words_file) 13 | TRAIN_LABELS = open(train_label_file) 14 | TEST_LABELS = open(test_label_file) 15 | 16 | for i in xrange(2): 17 | WORDS = [ TRAIN_WORDS, TEST_WORDS ][i] 18 | LABELS = [TRAIN_LABELS, TEST_LABELS][i] 19 | if i == 0: 20 | output_file = "reuters-train-5770.svm-lftm.txt" 21 | else: 22 | output_file = "reuters-test-2255.svm-lftm.txt" 23 | 24 | OUTPUT = open(output_file, "w") 25 | 26 | setName = ["train", "test"][i] 27 | 28 | lineno = 0 29 | validDocNum = 0 30 | for line in WORDS: 31 | lineno += 1 32 | line = line.strip() 33 | label_line = LABELS.readline().strip() 34 | if not line: 35 | print "Empty doc %s-%d skipped" %(setName, lineno) 36 | continue 37 | label = int(label_line) 38 | OUTPUT.write( "%d" %(label+1) ) 39 | lftm_topic_line = LFTM_TOPIC.readline().strip() 40 | lftm_topicprops = lftm_topic_line.split(" ") 41 | for k in xrange(50): 42 | topicprop = float(lftm_topicprops[k]) 43 | OUTPUT.write( " %d:%.3f" %(k+1, topicprop) ) 44 | OUTPUT.write("\n") 45 | validDocNum += 1 46 | print "%d %s docs, %d written into '%s'" %(lineno, setName, validDocNum, output_file) 47 | OUTPUT.close() 48 | 49 | lineno = 0 50 | for line in LFTM_TOPIC: 51 | lineno += 1 52 | 53 | if lineno > 0: 54 | print "Warn: %d lines left in '%s'" %(lineno, lftm_file) 55 | -------------------------------------------------------------------------------- /topic-competitors/liu-doc2vec.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import pdb 3 | 4 | def genDocEmbedding( setName, words_file, topics_file, label_file, V, word2ID, T ): 5 | WORDS = open(words_file) 6 | TOPICS = open(topics_file) 7 | LABEL = open(label_file) 8 | 9 | filename_trunk = words_file.split('.')[0] 10 | docvec_file = ".".join( [ filename_trunk, "svm-liu", "txt" ] ) 11 | docvecbow_file = ".".join( [ filename_trunk, "svm-liubow", "txt" ] ) 12 | 13 | DOCVEC = open( docvec_file, "w" ) 14 | DOCVECBOW = open( docvecbow_file, "w" ) 15 | 16 | dim = V.shape[1] + T.shape[1] 17 | 18 | lineno = 0 19 | emptyDocIds = [] 20 | 21 | for word_line in WORDS: 22 | lineno += 1 23 | word_line = word_line.strip() 24 | topic_line = TOPICS.readline().strip() 25 | label_line = LABEL.readline().strip() 26 | # encounter an empty doc 27 | if not word_line: 28 | words = [] 29 | topics = [] 30 | else: 31 | words = word_line.split(" ") 32 | topics = topic_line.split(" ") 33 | assert len(words) == len(topics), \ 34 | "Words number %d != topic number %d in line %d" %( len(words), len(topics), lineno ) 35 | label = int(label_line) 36 | 37 | sum_vec = np.zeros(dim) 38 | doc_vec = np.zeros(dim) 39 | validWordNum = 0 40 | 41 | wid2freq = {} 42 | 43 | for i in xrange(len(words)): 44 | word = words[i] 45 | topic = int(topics[i]) 46 | 47 | if word not in word2ID: 48 | continue 49 | validWordNum += 1 50 | wid = word2ID[word] 51 | sum_vec += np.concatenate( [ V[wid], T[topic] ] ) 52 | 53 | if wid in wid2freq: 54 | wid2freq[wid] += 1 55 | else: 56 | wid2freq[wid] = 1 57 | 58 | if validWordNum > 0: 59 | doc_vec = sum_vec / validWordNum 60 | else: 61 | emptyDocIds.append(lineno) 62 | 63 | sorted_wids = sorted( wid2freq.keys() ) 64 | 65 | DOCVEC.write( "%d" %(label+1) ) 66 | DOCVECBOW.write( "%d" %(label+1) ) 67 | 68 | for k in xrange(dim): 69 | DOCVEC.write( " 
%d:%.3f" %( k + 1, doc_vec[k] ) ) 70 | DOCVECBOW.write( " %d:%.3f" %( k + 1, doc_vec[k] ) ) 71 | 72 | for wid in sorted_wids: 73 | # first dim indices are reserved for topic features, so add dim here 74 | # add 1 to make wid start from 1 75 | DOCVECBOW.write( " %d:%d" %( wid + dim + 1, wid2freq[wid] ) ) 76 | 77 | DOCVEC.write("\n") 78 | DOCVECBOW.write("\n") 79 | 80 | print "%d %s docs converted to Liu et al's docvec in svm format." %( lineno, setName ) 81 | if len(emptyDocIds) > 0: 82 | print "Empty docs: %s" %emptyDocIds 83 | 84 | DOCVEC.close() 85 | DOCVECBOW.close() 86 | WORDS.close() 87 | TOPICS.close() 88 | LABEL.close() 89 | 90 | corpus = sys.argv[1] 91 | 92 | if corpus == '20news': 93 | train_words_file = "20news-train-11314.gibbslda-words.txt" 94 | train_topics_file = "20news-train-11314.gibbslda-topics.txt" 95 | train_wordvec_file = "20news-train-11314.liu-wordvec2.txt" 96 | train_topicvec_file = "20news-train-11314.liu-topicvec2.txt" 97 | train_label_file = "20news-train-11314.slda-label.txt" 98 | test_words_file = "20news-test-7532.gibbslda-words.txt" 99 | test_topics_file = "20news-test-7532.gibbslda-topics.txt" 100 | test_label_file = "20news-test-7532.slda-label.txt" 101 | else: 102 | train_words_file = "reuters-train-5770.gibbslda-words.txt" 103 | train_topics_file = "reuters-train-5770.gibbslda-topics.txt" 104 | train_wordvec_file = "reuters-train-5770.liu-wordvec2.txt" 105 | train_topicvec_file = "reuters-train-5770.liu-topicvec2.txt" 106 | train_label_file = "reuters-train-5770.slda-label.txt" 107 | test_words_file = "reuters-test-2255.gibbslda-words.txt" 108 | test_topics_file = "reuters-test-2255.gibbslda-topics.txt" 109 | test_label_file = "reuters-test-2255.slda-label.txt" 110 | 111 | V, vocab, word2ID, skippedWords_whatever = load_embeddings(train_wordvec_file) 112 | T = load_matrix_from_text( train_topicvec_file, "topic embedding" ) 113 | genDocEmbedding( "train", train_words_file, train_topics_file, train_label_file, V, word2ID, T ) 114 | genDocEmbedding( "test", test_words_file, test_topics_file, test_label_file, V, word2ID, T ) 115 | -------------------------------------------------------------------------------- /topic-competitors/rajarshd-Gaussian_LDA.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/rajarshd-Gaussian_LDA.zip -------------------------------------------------------------------------------- /topic-competitors/sHDP.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/sHDP.zip -------------------------------------------------------------------------------- /topic-competitors/slda/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | LDFLAGS = -lgsl -lm -lgslcblas 3 | 4 | 5 | LSOURCE = main.cpp corpus.cpp slda.cpp utils.cpp opt.cpp 6 | LHEADER = corpus.h slda.h utils.h opt.h settings.h 7 | 8 | slda: $(LSOURCE) $(HEADER) 9 | $(CC) $(LSOURCE) -o $@ $(LDFLAGS) 10 | 11 | clean: 12 | -rm -f *.o slda 13 | -------------------------------------------------------------------------------- /topic-competitors/slda/corpus.cpp: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, 
chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #include "corpus.h" 23 | #include 24 | #include 25 | 26 | corpus::corpus() 27 | { 28 | num_docs = 0; 29 | size_vocab = 0; 30 | num_classes = 0; 31 | num_total_words = 0; 32 | } 33 | 34 | corpus::~corpus() 35 | { 36 | for (int i = 0; i < num_docs; i ++) 37 | { 38 | document * doc = docs[i]; 39 | delete doc; 40 | } 41 | docs.clear(); 42 | 43 | num_docs = 0; 44 | size_vocab = 0; 45 | num_classes = 0; 46 | num_total_words = 0; 47 | } 48 | 49 | void corpus::read_data(const char * data_filename, 50 | const char * label_filename) 51 | { 52 | int OFFSET = 0; 53 | int length = 0, count = 0, word = 0, 54 | n = 0, nd = 0, nw = 0, label = -1; 55 | 56 | FILE * fileptr; 57 | fileptr = fopen(data_filename, "r"); 58 | printf("\nreading data from %s\n", data_filename); 59 | nd = 0; 60 | nw = 0; 61 | 62 | while ((fscanf(fileptr, "%10d", &length) != EOF)) 63 | { 64 | document * doc = new document(length); 65 | for (n = 0; n < length; n++) 66 | { 67 | fscanf(fileptr, "%10d:%10d", &word, &count); 68 | word = word - OFFSET; 69 | doc->words[n] = word; 70 | doc->counts[n] = count; 71 | doc->total += count; 72 | if (word >= nw) 73 | { 74 | nw = word + 1; 75 | } 76 | } 77 | num_total_words += doc->total; 78 | docs.push_back(doc); 79 | nd++; 80 | } 81 | fclose(fileptr); 82 | num_docs = nd; 83 | size_vocab = nw; 84 | printf("number of docs : %d\n", nd); 85 | printf("number of terms : %d\n", nw); 86 | printf("number of total words : %d\n", num_total_words); 87 | 88 | fileptr = fopen(label_filename, "r"); 89 | printf("\nreading labels from %s\n", label_filename); 90 | nd = 0; 91 | while ((fscanf(fileptr, "%10d", &label) != EOF)) 92 | { 93 | document * doc = docs[nd]; 94 | doc->label = label; 95 | if (label >= num_classes) 96 | { 97 | num_classes = label + 1; 98 | } 99 | nd ++; 100 | } 101 | assert(nd == int(docs.size())); 102 | printf("number of classes : %d\n\n", num_classes); 103 | } 104 | 105 | int corpus::max_corpus_length() { 106 | int max_length = 0; 107 | 108 | for (int d = 0; d < num_docs; d++) { 109 | if (docs[d]->length > max_length) 110 | max_length = docs[d]->length; 111 | } 112 | return max_length; 113 | } 114 | -------------------------------------------------------------------------------- /topic-competitors/slda/corpus.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 
6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #ifndef CORPUS_H 23 | #define CORPUS_H 24 | 25 | #include 26 | #include 27 | using namespace std; 28 | 29 | class document 30 | { 31 | public: 32 | int * words; 33 | int * counts; 34 | int length; 35 | int total; 36 | int label; 37 | public: 38 | document() 39 | { 40 | words = NULL; 41 | counts = NULL; 42 | length = 0; 43 | total = 0; 44 | label = -1; 45 | } 46 | document(int len) 47 | { 48 | length = len; 49 | words = new int [length]; 50 | counts = new int [length]; 51 | total = 0; 52 | label = -1; 53 | } 54 | ~document() 55 | { 56 | if (words != NULL) 57 | { 58 | delete [] words; 59 | delete [] counts; 60 | length = 0; 61 | total = 0; 62 | label = -1; 63 | } 64 | } 65 | }; 66 | 67 | class corpus 68 | { 69 | public: 70 | corpus(); 71 | ~corpus(); 72 | void read_data(const char * data_filename, const char * label_filename); 73 | int max_corpus_length(); 74 | public: 75 | int num_docs; 76 | int size_vocab; 77 | int num_classes; 78 | int num_total_words; 79 | vector docs; 80 | }; 81 | 82 | #endif // CORPUS_H 83 | -------------------------------------------------------------------------------- /topic-competitors/slda/cygblas-0.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/slda/cygblas-0.dll -------------------------------------------------------------------------------- /topic-competitors/slda/main.cpp: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 
16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #include 23 | #include 24 | #include "corpus.h" 25 | #include "utils.h" 26 | #include "slda.h" 27 | 28 | void help( void ) { 29 | printf("usage: slda [est] [data] [label] [settings] [alpha] [k] [random/seeded/model_path] [directory]\n"); 30 | printf(" slda [inf] [data] [label] [settings] [model] [directory]\n"); 31 | } 32 | 33 | int main(int argc, char* argv[]) 34 | { 35 | if (argc < 2) 36 | { 37 | help(); 38 | return 0; 39 | } 40 | if (strcmp(argv[1], "est") == 0) 41 | { 42 | corpus c; 43 | char * data_filename = argv[2]; 44 | char * label_filename = argv[3]; 45 | c.read_data(data_filename, label_filename); 46 | settings setting; 47 | char * setting_filename = argv[4]; 48 | setting.read_settings(setting_filename); 49 | 50 | double alpha = atof(argv[5]); 51 | int num_topics = atoi(argv[6]); 52 | printf("number of topics is %d\n", num_topics); 53 | char * init_method = argv[7]; 54 | char * directory = argv[8]; 55 | printf("models will be saved in %s\n", directory); 56 | make_directory(directory); 57 | 58 | slda model; 59 | model.init(alpha, num_topics, &c); 60 | model.v_em(&c, &setting, init_method, directory); 61 | } 62 | 63 | if (strcmp(argv[1], "inf") == 0) 64 | { 65 | corpus c; 66 | char * data_filename = argv[2]; 67 | char * label_filename = argv[3]; 68 | c.read_data(data_filename, label_filename); 69 | settings setting; 70 | char * setting_filename = argv[4]; 71 | setting.read_settings(setting_filename); 72 | 73 | char * model_filename = argv[5]; 74 | char * directory = argv[6]; 75 | printf("\nresults will be saved in %s\n", directory); 76 | make_directory(directory); 77 | 78 | slda model; 79 | model.load_model(model_filename); 80 | model.infer_only(&c, &setting, directory); 81 | } 82 | 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /topic-competitors/slda/opt.cpp: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | #include "opt.h" 22 | #include "slda.h" 23 | #include "utils.h" 24 | /* 25 | * Here the implementation is slightly different from the equations 26 | * in the paper, we instead use a second-order taylor expansion to approximate 27 | * the second line in eqaution (6). 
28 | */ 29 | 30 | double softmax_f(const gsl_vector * x, void * opt_param) 31 | { 32 | opt_parameter * gsl_param = (opt_parameter *)opt_param; 33 | double PENALTY = gsl_param->PENALTY; 34 | slda * model = gsl_param->model; 35 | suffstats * ss = gsl_param->ss; 36 | 37 | double f, t, a1 = 0.0, a2 = 0.0; 38 | 39 | int k, d, j, l, idx; 40 | 41 | double f_regularization = 0.0; 42 | 43 | 44 | for (l = 0; l < model->num_classes-1; l ++) 45 | { 46 | for (k = 0; k < model->num_topics; k ++) 47 | { 48 | model->eta[l][k] = gsl_vector_get(x, l*model->num_topics + k); 49 | f_regularization -= pow(model->eta[l][k], 2) * PENALTY/2.0; 50 | } 51 | } 52 | f = 0.0; //log likelihood 53 | for (d = 0; d < ss->num_docs; d ++) 54 | { 55 | for (k = 0; k < model->num_topics; k ++) 56 | { 57 | if (ss->labels[d] < model->num_classes-1) 58 | { 59 | f += model->eta[ss->labels[d]][k] * ss->z_bar[d].z_bar_m[k]; 60 | } 61 | } 62 | 63 | t = 0.0; // in log space, 1+exp()+exp()... 64 | for (l = 0; l < model->num_classes-1; l ++) 65 | { 66 | a1 = 0.0; // \eta_k^T * \bar{\phi}_d 67 | a2 = 0.0; // 1 + 0.5 * \eta_k^T * Var(z_bar)\eta_k 68 | for (k = 0; k < model->num_topics; k ++) 69 | { 70 | a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 71 | for (j = 0; j < model->num_topics; j ++) 72 | { 73 | idx = map_idx(k, j, model->num_topics); 74 | a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 75 | } 76 | } 77 | a2 = 1.0 + 0.5 * a2; 78 | t = log_sum(t, a1 + log(a2)); 79 | } 80 | f -= t; 81 | } 82 | 83 | return -(f + f_regularization); 84 | } 85 | void softmax_df(const gsl_vector * x, void * opt_param, gsl_vector * df) 86 | { 87 | 88 | opt_parameter * gsl_param = (opt_parameter *)opt_param; 89 | double PENALTY = gsl_param->PENALTY; 90 | slda * model = gsl_param->model; 91 | suffstats * ss = gsl_param->ss; 92 | gsl_vector_set_zero(df); 93 | gsl_vector * df_tmp = gsl_vector_alloc(df->size); 94 | 95 | double t, a1 = 0.0, a2 = 0.0, g; 96 | int k, d, j, l, idx; 97 | 98 | double * eta_aux = new double [model->num_topics]; 99 | 100 | for (l = 0; l < model->num_classes-1; l ++) 101 | { 102 | for (k = 0; k < model->num_topics; k ++) 103 | { 104 | idx = l*model->num_topics + k; 105 | model->eta[l][k] = gsl_vector_get(x, idx); 106 | g = -PENALTY * model->eta[l][k]; 107 | gsl_vector_set(df, idx, g); 108 | } 109 | } 110 | for (d = 0; d < ss->num_docs; d ++) 111 | { 112 | for (k = 0; k < model->num_topics; k ++) 113 | { 114 | l = ss->labels[d]; 115 | if (l < model->num_classes-1) 116 | { 117 | idx = l*model->num_topics + k; 118 | g = gsl_vector_get(df, idx) + ss->z_bar[d].z_bar_m[k]; 119 | gsl_vector_set(df, idx, g); 120 | } 121 | } 122 | 123 | t = 0.0; // in log space, 1+exp()+exp()+.... 
124 | gsl_vector_memcpy(df_tmp, df); 125 | gsl_vector_set_zero(df); 126 | for (l = 0; l < model->num_classes-1; l ++) 127 | { 128 | memset(eta_aux, 0, sizeof(double)*model->num_topics); 129 | a1 = 0.0; // \eta_k^T * \bar{\phi}_d 130 | a2 = 0.0; // 1 + 0.5*\eta_k^T * Var(z_bar)\eta_k 131 | for (k = 0; k < model->num_topics; k ++) 132 | { 133 | a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 134 | for (j = 0; j < model->num_topics; j ++) 135 | { 136 | idx = map_idx(k, j, model->num_topics); 137 | a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 138 | eta_aux[k] += ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 139 | } 140 | } 141 | a2 = 1.0 + 0.5 * a2; 142 | t = log_sum(t, a1 + log(a2)); 143 | 144 | for (k = 0; k < model->num_topics; k ++) 145 | { 146 | idx = l*model->num_topics + k; 147 | g = gsl_vector_get(df, idx) - 148 | exp(a1) * (ss->z_bar[d].z_bar_m[k] * a2 + eta_aux[k]); 149 | gsl_vector_set(df, idx, g); 150 | } 151 | } 152 | gsl_vector_scale(df, exp(-t)); 153 | gsl_vector_add(df, df_tmp); 154 | } 155 | gsl_vector_scale(df, -1.0); 156 | delete [] eta_aux; 157 | gsl_vector_free(df_tmp); 158 | } 159 | void softmax_fdf(const gsl_vector * x, void * opt_param, double * f, gsl_vector * df) 160 | { 161 | opt_parameter * gsl_param = (opt_parameter *)opt_param; 162 | double PENALTY = gsl_param->PENALTY; 163 | slda * model = gsl_param->model; 164 | suffstats * ss = gsl_param->ss; 165 | gsl_vector_set_zero(df); 166 | gsl_vector * df_tmp = gsl_vector_alloc(df->size); 167 | 168 | double t, a1 = 0.0, a2 = 0.0, g; 169 | int k, d, j, l, idx; 170 | 171 | double f_regularization = 0.0; 172 | 173 | double* eta_aux = new double [model->num_topics]; 174 | 175 | for (l = 0; l < model->num_classes-1; l ++) 176 | { 177 | for (k = 0; k < model->num_topics; k ++) 178 | { 179 | model->eta[l][k] = gsl_vector_get(x, l*model->num_topics + k); 180 | f_regularization -= pow(model->eta[l][k], 2) * PENALTY/2.0; 181 | idx = l*model->num_topics + k; 182 | g = -PENALTY * model->eta[l][k]; 183 | gsl_vector_set(df, idx, g); 184 | } 185 | } 186 | *f = 0.0; //log likelihood 187 | for (d = 0; d < ss->num_docs; d ++) 188 | { 189 | for (k = 0; k < model->num_topics; k ++) 190 | { 191 | l = ss->labels[d]; 192 | if (l < model->num_classes-1) 193 | { 194 | *f += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 195 | idx = l*model->num_topics + k; 196 | g = gsl_vector_get(df, idx) + ss->z_bar[d].z_bar_m[k]; 197 | gsl_vector_set(df, idx, g); 198 | } 199 | } 200 | t = 0.0; // in log space, base class 1+exp()+exp() 201 | gsl_vector_memcpy(df_tmp, df); 202 | gsl_vector_set_zero(df); 203 | for (l = 0; l < model->num_classes-1; l ++) 204 | { 205 | memset(eta_aux, 0, sizeof(double)*model->num_topics); 206 | a1 = 0.0; // \eta_k^T * \bar{\phi}_d 207 | a2 = 0.0; // 1 + 0.5 * \eta_k^T * Var(z_bar)\eta_k 208 | for (k = 0; k < model->num_topics; k ++) 209 | { 210 | a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 211 | for (j = 0; j < model->num_topics; j ++) 212 | { 213 | idx = map_idx(k, j, model->num_topics); 214 | a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 215 | eta_aux[k] += ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 216 | } 217 | } 218 | a2 = 1.0 + 0.5 * a2; 219 | t = log_sum(t, a1 + log(a2)); 220 | 221 | for (k = 0; k < model->num_topics; k ++) 222 | { 223 | idx = l*model->num_topics + k; 224 | g = gsl_vector_get(df, idx) - 225 | exp(a1) * (ss->z_bar[d].z_bar_m[k] * a2 + eta_aux[k]); 226 | gsl_vector_set(df, idx, g); 227 | } 228 | } 229 | gsl_vector_scale(df, exp(-t)); 230 | 
gsl_vector_add(df, df_tmp); 231 | *f -= t; 232 | } 233 | gsl_vector_scale(df, -1.0); 234 | *f = -(*f + f_regularization); 235 | delete [] eta_aux; 236 | gsl_vector_free(df_tmp); 237 | } 238 | 239 | -------------------------------------------------------------------------------- /topic-competitors/slda/opt.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | #ifndef OPT_H_INCLUDED 22 | #define OPT_H_INCLUDED 23 | #include 24 | #include "slda.h" 25 | 26 | /* 27 | * structure for the gsl optimization routine 28 | * 29 | */ 30 | 31 | struct opt_parameter 32 | { 33 | suffstats * ss; 34 | slda * model; 35 | double PENALTY; 36 | }; 37 | 38 | /* 39 | * function to compute the value of the obj function, then 40 | * return it 41 | */ 42 | 43 | double softmax_f(const gsl_vector * x, void * opt_param); 44 | 45 | /* 46 | * function to compute the derivatives of function 47 | * 48 | */ 49 | 50 | void softmax_df(const gsl_vector * x, void * opt_param, gsl_vector * df); 51 | 52 | /* 53 | * function to compute the value and derivatives of the function 54 | * 55 | */ 56 | 57 | void softmax_fdf(const gsl_vector * x, void * opt_param, double * f, gsl_vector * df); 58 | 59 | #endif // OPT_H_INCLUDED 60 | 61 | -------------------------------------------------------------------------------- /topic-competitors/slda/readme.txt: -------------------------------------------------------------------------------- 1 | ********************************************************** 2 | SUPERVISED LATENT DIRICHLET ALLOCATION FOR CLASSIFICATION 3 | ********************************************************** 4 | 5 | (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 6 | 7 | written by Chong Wang, chongw@cs.princeton.edu, part of code 8 | is from http://www.cs.princeton.edu/~blei/lda-c/index.html. 9 | 10 | This file is part of slda. 11 | 12 | slda is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | slda is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | 28 | ------------------------------------------------------------------------ 29 | 30 | This is a C++ implementation of supervised latent Dirichlet allocation (sLDA) 31 | for classification. 32 | 33 | Note that this code requires the Gnu Scientific Library, http://www.gnu.org/software/gsl/ 34 | 35 | ------------------------------------------------------------------------ 36 | 37 | 38 | TABLE OF CONTENTS 39 | 40 | 41 | A. COMPILING 42 | 43 | B. ESTIMATION 44 | 45 | C. INFERENCE 46 | 47 | 48 | ------------------------------------------------------------------------ 49 | 50 | A. COMPILING 51 | 52 | Type "make" in a shell. Make sure the GSL is installed. 53 | 54 | 55 | ------------------------------------------------------------------------ 56 | 57 | B. ESTIMATION 58 | 59 | Estimate the model by executing: 60 | 61 | slda [est] [data] [label] [settings] [alpha] [k] [seeded/random/model_path] [directory] 62 | 63 | The saved models are in two files: 64 | 65 | .model is the model saved in the binary format, which is easy and 66 | fast to use for inference. 67 | 68 | .model.txt is the model saved in the text format, which is 69 | convenient for printing topics or analysis using python. 70 | 71 | 72 | The variational posterior Dirichlets are in: 73 | 74 | .gamma 75 | 76 | 77 | Data format 78 | 79 | (1) [data] is a file where each line is of the form: 80 | 81 | [M] [term_1]:[count] [term_2]:[count] ... [term_N]:[count] 82 | 83 | where [M] is the number of unique terms in the document, and the 84 | [count] associated with each term is how many times that term appeared 85 | in the document. 86 | 87 | (2) [label] is a file where each line is the corresponding label for [data]. 88 | The labels must be 0, 1, ..., C-1, if we have C classes. 89 | 90 | 91 | ------------------------------------------------------------------------ 92 | 93 | C. INFERENCE 94 | 95 | To perform inference on a different set of data (in the same format as 96 | for estimation), execute: 97 | 98 | slda [inf] [data] [label] [settings] [model] [directory] 99 | 100 | where [model] is the binary file from the estimation. 101 | 102 | The predictive labels are in: 103 | 104 | inf-labels.dat 105 | 106 | The variational posterior Dirichlets are in: 107 | 108 | inf-gamma.dat 109 | 110 | -------------------------------------------------------------------------------- /topic-competitors/slda/settings.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 
16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | #ifndef SETTINGS_H 22 | #define SETTINGS_H 23 | #include 24 | #include 25 | 26 | struct settings 27 | { 28 | float VAR_CONVERGED; 29 | int VAR_MAX_ITER; 30 | float EM_CONVERGED; 31 | int EM_MAX_ITER; 32 | int ESTIMATE_ALPHA; 33 | float PENALTY; 34 | 35 | void read_settings(char* filename) 36 | { 37 | FILE * fileptr; 38 | char alpha_action[100]; 39 | 40 | fileptr = fopen(filename, "r"); 41 | fscanf(fileptr, "var max iter %d\n", &this->VAR_MAX_ITER); 42 | fscanf(fileptr, "var convergence %f\n", &this->VAR_CONVERGED); 43 | fscanf(fileptr, "em max iter %d\n", &this->EM_MAX_ITER); 44 | fscanf(fileptr, "em convergence %f\n", &this->EM_CONVERGED); 45 | fscanf(fileptr, "L2 penalty %f\n", &this->PENALTY); 46 | 47 | fscanf(fileptr, "alpha %s", alpha_action); 48 | if (strcmp(alpha_action, "fixed") == 0) 49 | { 50 | this->ESTIMATE_ALPHA = 0; 51 | printf("alpha is fixed ...\n"); 52 | } 53 | else 54 | { 55 | this->ESTIMATE_ALPHA = 1; 56 | printf("alpha is esimated ...\n"); 57 | } 58 | fclose(fileptr); 59 | printf("var max iter %d\n", this->VAR_MAX_ITER); 60 | printf("var convergence %.2E\n", this->VAR_CONVERGED); 61 | printf("em max iter %d\n", this->EM_MAX_ITER); 62 | printf("em convergence %.2E\n", this->EM_CONVERGED); 63 | printf("L2 penalty %.2E\n", this->PENALTY); 64 | } 65 | }; 66 | 67 | #endif // SETTINGS_H 68 | 69 | -------------------------------------------------------------------------------- /topic-competitors/slda/settings.txt: -------------------------------------------------------------------------------- 1 | var max iter 20 2 | var convergence 1e-3 3 | em max iter 50 4 | em convergence 1e-4 5 | L2 penalty 0.01 6 | alpha fixed 7 | -------------------------------------------------------------------------------- /topic-competitors/slda/slda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/slda/slda -------------------------------------------------------------------------------- /topic-competitors/slda/slda.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/slda/slda.exe -------------------------------------------------------------------------------- /topic-competitors/slda/slda.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 
16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #ifndef SLDA_H 23 | #define SLDA_H 24 | #include "settings.h" 25 | #include "corpus.h" 26 | 27 | typedef struct { 28 | double * z_bar_m; 29 | double * z_bar_var; 30 | } z_stat; 31 | 32 | typedef struct { 33 | double ** word_ss; 34 | double * word_total_ss; 35 | int num_docs; 36 | z_stat * z_bar; 37 | int * labels; 38 | int * tot_labels; 39 | } suffstats; 40 | 41 | class slda 42 | { 43 | public: 44 | slda(); 45 | ~slda(); 46 | void free_model(); 47 | void init(double alpha_, int num_topics_, const corpus * c); 48 | void v_em(corpus * c, const settings * setting, 49 | const char * start, const char * directory); 50 | 51 | void save_model(const char * filename); 52 | void save_model_text(const char * filename); 53 | void load_model(const char * model_filename); 54 | void infer_only(corpus * c, const settings * setting, 55 | const char * directory); 56 | 57 | suffstats * new_suffstats(int num_docs); 58 | void free_suffstats(suffstats * ss); 59 | void zero_initialize_ss(suffstats * ss); 60 | void random_initialize_ss(suffstats * ss, corpus * c); 61 | void corpus_initialize_ss(suffstats* ss, corpus * c); 62 | void load_model_initialize_ss(suffstats* ss, corpus * c); 63 | void mle(suffstats * ss, int eta_update, const settings * setting); 64 | 65 | double doc_e_step(document* doc, double* gamma, double** phi, suffstats * ss, int eta_update, const settings * setting); 66 | 67 | double lda_inference(document* doc, double* var_gamma, double** phi, const settings * setting); 68 | double lda_compute_likelihood(document* doc, double** phi, double* var_gamma); 69 | double slda_inference(document* doc, double* var_gamma, double** phi, const settings * setting); 70 | double slda_compute_likelihood(document* doc, double** phi, double* var_gamma); 71 | 72 | void save_gamma(char* filename, double** gamma, int num_docs); 73 | void write_word_assignment(FILE* f, document* doc, double** phi); 74 | 75 | 76 | public: 77 | double alpha; // the parameter for the dirichlet 78 | int num_topics; 79 | int num_classes; 80 | int size_vocab; 81 | 82 | double ** log_prob_w; //the log of the topic distribution 83 | double ** eta; //softmax regression, in general, there are num_classes-1 etas, we don't need a intercept here, since \sum_i \bar{z_i} = 1 84 | }; 85 | 86 | #endif // SLDA_H 87 | 88 | -------------------------------------------------------------------------------- /topic-competitors/slda/test-label.dat: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 0 5 | 0 6 | 0 7 | 0 8 | 0 9 | 0 10 | 0 11 | 0 12 | 0 13 | 0 14 | 0 15 | 0 16 | 0 17 | 0 18 | 0 19 | 0 20 | 0 21 | 0 22 | 0 23 | 0 24 | 0 25 | 0 26 | 0 27 | 0 28 | 0 29 | 0 30 | 0 31 | 0 32 | 0 33 | 0 34 | 0 35 | 0 36 | 0 37 | 0 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 0 44 | 0 45 | 0 46 | 0 47 | 0 48 | 0 49 | 0 50 | 0 51 | 0 52 | 0 53 | 0 54 | 0 55 | 0 56 | 0 57 | 0 58 | 0 59 | 0 60 | 0 61 | 0 62 | 0 63 | 0 64 | 0 65 | 0 66 | 0 67 | 0 68 | 0 69 | 0 70 | 0 71 | 0 72 | 0 73 | 0 74 | 0 75 | 0 76 | 0 77 | 0 78 | 0 79 | 0 80 | 0 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 0 87 | 0 88 | 0 89 | 0 90 | 0 91 | 0 92 | 0 93 | 0 94 | 0 95 | 0 96 | 0 97 | 0 98 | 0 99 | 0 100 | 0 101 | 1 102 | 1 103 | 1 104 | 1 105 | 1 106 | 1 107 | 1 108 | 1 109 | 1 110 | 1 111 | 1 112 | 1 113 | 1 114 | 1 
115 | 1 116 | 1 117 | 1 118 | 1 119 | 1 120 | 1 121 | 1 122 | 1 123 | 1 124 | 1 125 | 1 126 | 1 127 | 1 128 | 1 129 | 1 130 | 1 131 | 1 132 | 1 133 | 1 134 | 1 135 | 1 136 | 1 137 | 1 138 | 1 139 | 1 140 | 1 141 | 1 142 | 1 143 | 1 144 | 1 145 | 1 146 | 1 147 | 1 148 | 1 149 | 1 150 | 1 151 | 1 152 | 1 153 | 1 154 | 1 155 | 1 156 | 1 157 | 1 158 | 1 159 | 1 160 | 1 161 | 1 162 | 1 163 | 1 164 | 1 165 | 1 166 | 1 167 | 1 168 | 1 169 | 1 170 | 1 171 | 1 172 | 1 173 | 1 174 | 1 175 | 1 176 | 1 177 | 1 178 | 1 179 | 1 180 | 1 181 | 1 182 | 1 183 | 1 184 | 1 185 | 1 186 | 1 187 | 1 188 | 1 189 | 1 190 | 1 191 | 1 192 | 1 193 | 1 194 | 1 195 | 1 196 | 1 197 | 1 198 | 1 199 | 1 200 | 1 201 | 2 202 | 2 203 | 2 204 | 2 205 | 2 206 | 2 207 | 2 208 | 2 209 | 2 210 | 2 211 | 2 212 | 2 213 | 2 214 | 2 215 | 2 216 | 2 217 | 2 218 | 2 219 | 2 220 | 2 221 | 2 222 | 2 223 | 2 224 | 2 225 | 2 226 | 2 227 | 2 228 | 2 229 | 2 230 | 2 231 | 2 232 | 2 233 | 2 234 | 2 235 | 2 236 | 2 237 | 2 238 | 2 239 | 2 240 | 2 241 | 2 242 | 2 243 | 2 244 | 2 245 | 2 246 | 2 247 | 2 248 | 2 249 | 2 250 | 2 251 | 2 252 | 2 253 | 2 254 | 2 255 | 2 256 | 2 257 | 2 258 | 2 259 | 2 260 | 2 261 | 2 262 | 2 263 | 2 264 | 2 265 | 2 266 | 2 267 | 2 268 | 2 269 | 2 270 | 2 271 | 2 272 | 2 273 | 2 274 | 2 275 | 2 276 | 2 277 | 2 278 | 2 279 | 2 280 | 2 281 | 2 282 | 2 283 | 2 284 | 2 285 | 2 286 | 2 287 | 2 288 | 2 289 | 2 290 | 2 291 | 2 292 | 2 293 | 2 294 | 2 295 | 2 296 | 2 297 | 2 298 | 2 299 | 2 300 | 2 301 | 3 302 | 3 303 | 3 304 | 3 305 | 3 306 | 3 307 | 3 308 | 3 309 | 3 310 | 3 311 | 3 312 | 3 313 | 3 314 | 3 315 | 3 316 | 3 317 | 3 318 | 3 319 | 3 320 | 3 321 | 3 322 | 3 323 | 3 324 | 3 325 | 3 326 | 3 327 | 3 328 | 3 329 | 3 330 | 3 331 | 3 332 | 3 333 | 3 334 | 3 335 | 3 336 | 3 337 | 3 338 | 3 339 | 3 340 | 3 341 | 3 342 | 3 343 | 3 344 | 3 345 | 3 346 | 3 347 | 3 348 | 3 349 | 3 350 | 3 351 | 3 352 | 3 353 | 3 354 | 3 355 | 3 356 | 3 357 | 3 358 | 3 359 | 3 360 | 3 361 | 3 362 | 3 363 | 3 364 | 3 365 | 3 366 | 3 367 | 3 368 | 3 369 | 3 370 | 3 371 | 3 372 | 3 373 | 3 374 | 3 375 | 3 376 | 3 377 | 3 378 | 3 379 | 3 380 | 3 381 | 3 382 | 3 383 | 3 384 | 3 385 | 3 386 | 3 387 | 3 388 | 3 389 | 3 390 | 3 391 | 3 392 | 3 393 | 3 394 | 3 395 | 3 396 | 3 397 | 3 398 | 3 399 | 3 400 | 3 401 | 4 402 | 4 403 | 4 404 | 4 405 | 4 406 | 4 407 | 4 408 | 4 409 | 4 410 | 4 411 | 4 412 | 4 413 | 4 414 | 4 415 | 4 416 | 4 417 | 4 418 | 4 419 | 4 420 | 4 421 | 4 422 | 4 423 | 4 424 | 4 425 | 4 426 | 4 427 | 4 428 | 4 429 | 4 430 | 4 431 | 4 432 | 4 433 | 4 434 | 4 435 | 4 436 | 4 437 | 4 438 | 4 439 | 4 440 | 4 441 | 4 442 | 4 443 | 4 444 | 4 445 | 4 446 | 4 447 | 4 448 | 4 449 | 4 450 | 4 451 | 4 452 | 4 453 | 4 454 | 4 455 | 4 456 | 4 457 | 4 458 | 4 459 | 4 460 | 4 461 | 4 462 | 4 463 | 4 464 | 4 465 | 4 466 | 4 467 | 4 468 | 4 469 | 4 470 | 4 471 | 4 472 | 4 473 | 4 474 | 4 475 | 4 476 | 4 477 | 4 478 | 4 479 | 4 480 | 4 481 | 4 482 | 4 483 | 4 484 | 4 485 | 4 486 | 4 487 | 4 488 | 4 489 | 4 490 | 4 491 | 4 492 | 4 493 | 4 494 | 4 495 | 4 496 | 4 497 | 4 498 | 4 499 | 4 500 | 4 501 | 5 502 | 5 503 | 5 504 | 5 505 | 5 506 | 5 507 | 5 508 | 5 509 | 5 510 | 5 511 | 5 512 | 5 513 | 5 514 | 5 515 | 5 516 | 5 517 | 5 518 | 5 519 | 5 520 | 5 521 | 5 522 | 5 523 | 5 524 | 5 525 | 5 526 | 5 527 | 5 528 | 5 529 | 5 530 | 5 531 | 5 532 | 5 533 | 5 534 | 5 535 | 5 536 | 5 537 | 5 538 | 5 539 | 5 540 | 5 541 | 5 542 | 5 543 | 5 544 | 5 545 | 5 546 | 5 547 | 5 548 | 5 549 | 5 550 | 5 551 | 5 552 | 5 553 | 5 554 | 5 555 | 5 556 | 5 557 | 5 558 | 5 
559 | 5 560 | 5 561 | 5 562 | 5 563 | 5 564 | 5 565 | 5 566 | 5 567 | 5 568 | 5 569 | 5 570 | 5 571 | 5 572 | 5 573 | 5 574 | 5 575 | 5 576 | 5 577 | 5 578 | 5 579 | 5 580 | 5 581 | 5 582 | 5 583 | 5 584 | 5 585 | 5 586 | 5 587 | 5 588 | 5 589 | 5 590 | 5 591 | 5 592 | 5 593 | 5 594 | 5 595 | 5 596 | 5 597 | 5 598 | 5 599 | 5 600 | 5 601 | 6 602 | 6 603 | 6 604 | 6 605 | 6 606 | 6 607 | 6 608 | 6 609 | 6 610 | 6 611 | 6 612 | 6 613 | 6 614 | 6 615 | 6 616 | 6 617 | 6 618 | 6 619 | 6 620 | 6 621 | 6 622 | 6 623 | 6 624 | 6 625 | 6 626 | 6 627 | 6 628 | 6 629 | 6 630 | 6 631 | 6 632 | 6 633 | 6 634 | 6 635 | 6 636 | 6 637 | 6 638 | 6 639 | 6 640 | 6 641 | 6 642 | 6 643 | 6 644 | 6 645 | 6 646 | 6 647 | 6 648 | 6 649 | 6 650 | 6 651 | 6 652 | 6 653 | 6 654 | 6 655 | 6 656 | 6 657 | 6 658 | 6 659 | 6 660 | 6 661 | 6 662 | 6 663 | 6 664 | 6 665 | 6 666 | 6 667 | 6 668 | 6 669 | 6 670 | 6 671 | 6 672 | 6 673 | 6 674 | 6 675 | 6 676 | 6 677 | 6 678 | 6 679 | 6 680 | 6 681 | 6 682 | 6 683 | 6 684 | 6 685 | 6 686 | 6 687 | 6 688 | 6 689 | 6 690 | 6 691 | 6 692 | 6 693 | 6 694 | 6 695 | 6 696 | 6 697 | 6 698 | 6 699 | 6 700 | 6 701 | 7 702 | 7 703 | 7 704 | 7 705 | 7 706 | 7 707 | 7 708 | 7 709 | 7 710 | 7 711 | 7 712 | 7 713 | 7 714 | 7 715 | 7 716 | 7 717 | 7 718 | 7 719 | 7 720 | 7 721 | 7 722 | 7 723 | 7 724 | 7 725 | 7 726 | 7 727 | 7 728 | 7 729 | 7 730 | 7 731 | 7 732 | 7 733 | 7 734 | 7 735 | 7 736 | 7 737 | 7 738 | 7 739 | 7 740 | 7 741 | 7 742 | 7 743 | 7 744 | 7 745 | 7 746 | 7 747 | 7 748 | 7 749 | 7 750 | 7 751 | 7 752 | 7 753 | 7 754 | 7 755 | 7 756 | 7 757 | 7 758 | 7 759 | 7 760 | 7 761 | 7 762 | 7 763 | 7 764 | 7 765 | 7 766 | 7 767 | 7 768 | 7 769 | 7 770 | 7 771 | 7 772 | 7 773 | 7 774 | 7 775 | 7 776 | 7 777 | 7 778 | 7 779 | 7 780 | 7 781 | 7 782 | 7 783 | 7 784 | 7 785 | 7 786 | 7 787 | 7 788 | 7 789 | 7 790 | 7 791 | 7 792 | 7 793 | 7 794 | 7 795 | 7 796 | 7 797 | 7 798 | 7 799 | 7 800 | 7 801 | -------------------------------------------------------------------------------- /topic-competitors/slda/train-label.dat: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 0 5 | 0 6 | 0 7 | 0 8 | 0 9 | 0 10 | 0 11 | 0 12 | 0 13 | 0 14 | 0 15 | 0 16 | 0 17 | 0 18 | 0 19 | 0 20 | 0 21 | 0 22 | 0 23 | 0 24 | 0 25 | 0 26 | 0 27 | 0 28 | 0 29 | 0 30 | 0 31 | 0 32 | 0 33 | 0 34 | 0 35 | 0 36 | 0 37 | 0 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 0 44 | 0 45 | 0 46 | 0 47 | 0 48 | 0 49 | 0 50 | 0 51 | 0 52 | 0 53 | 0 54 | 0 55 | 0 56 | 0 57 | 0 58 | 0 59 | 0 60 | 0 61 | 0 62 | 0 63 | 0 64 | 0 65 | 0 66 | 0 67 | 0 68 | 0 69 | 0 70 | 0 71 | 0 72 | 0 73 | 0 74 | 0 75 | 0 76 | 0 77 | 0 78 | 0 79 | 0 80 | 0 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 0 87 | 0 88 | 0 89 | 0 90 | 0 91 | 0 92 | 0 93 | 0 94 | 0 95 | 0 96 | 0 97 | 0 98 | 0 99 | 0 100 | 0 101 | 1 102 | 1 103 | 1 104 | 1 105 | 1 106 | 1 107 | 1 108 | 1 109 | 1 110 | 1 111 | 1 112 | 1 113 | 1 114 | 1 115 | 1 116 | 1 117 | 1 118 | 1 119 | 1 120 | 1 121 | 1 122 | 1 123 | 1 124 | 1 125 | 1 126 | 1 127 | 1 128 | 1 129 | 1 130 | 1 131 | 1 132 | 1 133 | 1 134 | 1 135 | 1 136 | 1 137 | 1 138 | 1 139 | 1 140 | 1 141 | 1 142 | 1 143 | 1 144 | 1 145 | 1 146 | 1 147 | 1 148 | 1 149 | 1 150 | 1 151 | 1 152 | 1 153 | 1 154 | 1 155 | 1 156 | 1 157 | 1 158 | 1 159 | 1 160 | 1 161 | 1 162 | 1 163 | 1 164 | 1 165 | 1 166 | 1 167 | 1 168 | 1 169 | 1 170 | 1 171 | 1 172 | 1 173 | 1 174 | 1 175 | 1 176 | 1 177 | 1 178 | 1 179 | 1 180 | 1 181 | 1 182 | 1 183 | 1 184 | 1 185 | 1 186 | 1 187 | 1 188 | 1 189 | 1 190 | 
1 191 | 1 192 | 1 193 | 1 194 | 1 195 | 1 196 | 1 197 | 1 198 | 1 199 | 1 200 | 1 201 | 2 202 | 2 203 | 2 204 | 2 205 | 2 206 | 2 207 | 2 208 | 2 209 | 2 210 | 2 211 | 2 212 | 2 213 | 2 214 | 2 215 | 2 216 | 2 217 | 2 218 | 2 219 | 2 220 | 2 221 | 2 222 | 2 223 | 2 224 | 2 225 | 2 226 | 2 227 | 2 228 | 2 229 | 2 230 | 2 231 | 2 232 | 2 233 | 2 234 | 2 235 | 2 236 | 2 237 | 2 238 | 2 239 | 2 240 | 2 241 | 2 242 | 2 243 | 2 244 | 2 245 | 2 246 | 2 247 | 2 248 | 2 249 | 2 250 | 2 251 | 2 252 | 2 253 | 2 254 | 2 255 | 2 256 | 2 257 | 2 258 | 2 259 | 2 260 | 2 261 | 2 262 | 2 263 | 2 264 | 2 265 | 2 266 | 2 267 | 2 268 | 2 269 | 2 270 | 2 271 | 2 272 | 2 273 | 2 274 | 2 275 | 2 276 | 2 277 | 2 278 | 2 279 | 2 280 | 2 281 | 2 282 | 2 283 | 2 284 | 2 285 | 2 286 | 2 287 | 2 288 | 2 289 | 2 290 | 2 291 | 2 292 | 2 293 | 2 294 | 2 295 | 2 296 | 2 297 | 2 298 | 2 299 | 2 300 | 2 301 | 3 302 | 3 303 | 3 304 | 3 305 | 3 306 | 3 307 | 3 308 | 3 309 | 3 310 | 3 311 | 3 312 | 3 313 | 3 314 | 3 315 | 3 316 | 3 317 | 3 318 | 3 319 | 3 320 | 3 321 | 3 322 | 3 323 | 3 324 | 3 325 | 3 326 | 3 327 | 3 328 | 3 329 | 3 330 | 3 331 | 3 332 | 3 333 | 3 334 | 3 335 | 3 336 | 3 337 | 3 338 | 3 339 | 3 340 | 3 341 | 3 342 | 3 343 | 3 344 | 3 345 | 3 346 | 3 347 | 3 348 | 3 349 | 3 350 | 3 351 | 3 352 | 3 353 | 3 354 | 3 355 | 3 356 | 3 357 | 3 358 | 3 359 | 3 360 | 3 361 | 3 362 | 3 363 | 3 364 | 3 365 | 3 366 | 3 367 | 3 368 | 3 369 | 3 370 | 3 371 | 3 372 | 3 373 | 3 374 | 3 375 | 3 376 | 3 377 | 3 378 | 3 379 | 3 380 | 3 381 | 3 382 | 3 383 | 3 384 | 3 385 | 3 386 | 3 387 | 3 388 | 3 389 | 3 390 | 3 391 | 3 392 | 3 393 | 3 394 | 3 395 | 3 396 | 3 397 | 3 398 | 3 399 | 3 400 | 3 401 | 4 402 | 4 403 | 4 404 | 4 405 | 4 406 | 4 407 | 4 408 | 4 409 | 4 410 | 4 411 | 4 412 | 4 413 | 4 414 | 4 415 | 4 416 | 4 417 | 4 418 | 4 419 | 4 420 | 4 421 | 4 422 | 4 423 | 4 424 | 4 425 | 4 426 | 4 427 | 4 428 | 4 429 | 4 430 | 4 431 | 4 432 | 4 433 | 4 434 | 4 435 | 4 436 | 4 437 | 4 438 | 4 439 | 4 440 | 4 441 | 4 442 | 4 443 | 4 444 | 4 445 | 4 446 | 4 447 | 4 448 | 4 449 | 4 450 | 4 451 | 4 452 | 4 453 | 4 454 | 4 455 | 4 456 | 4 457 | 4 458 | 4 459 | 4 460 | 4 461 | 4 462 | 4 463 | 4 464 | 4 465 | 4 466 | 4 467 | 4 468 | 4 469 | 4 470 | 4 471 | 4 472 | 4 473 | 4 474 | 4 475 | 4 476 | 4 477 | 4 478 | 4 479 | 4 480 | 4 481 | 4 482 | 4 483 | 4 484 | 4 485 | 4 486 | 4 487 | 4 488 | 4 489 | 4 490 | 4 491 | 4 492 | 4 493 | 4 494 | 4 495 | 4 496 | 4 497 | 4 498 | 4 499 | 4 500 | 4 501 | 5 502 | 5 503 | 5 504 | 5 505 | 5 506 | 5 507 | 5 508 | 5 509 | 5 510 | 5 511 | 5 512 | 5 513 | 5 514 | 5 515 | 5 516 | 5 517 | 5 518 | 5 519 | 5 520 | 5 521 | 5 522 | 5 523 | 5 524 | 5 525 | 5 526 | 5 527 | 5 528 | 5 529 | 5 530 | 5 531 | 5 532 | 5 533 | 5 534 | 5 535 | 5 536 | 5 537 | 5 538 | 5 539 | 5 540 | 5 541 | 5 542 | 5 543 | 5 544 | 5 545 | 5 546 | 5 547 | 5 548 | 5 549 | 5 550 | 5 551 | 5 552 | 5 553 | 5 554 | 5 555 | 5 556 | 5 557 | 5 558 | 5 559 | 5 560 | 5 561 | 5 562 | 5 563 | 5 564 | 5 565 | 5 566 | 5 567 | 5 568 | 5 569 | 5 570 | 5 571 | 5 572 | 5 573 | 5 574 | 5 575 | 5 576 | 5 577 | 5 578 | 5 579 | 5 580 | 5 581 | 5 582 | 5 583 | 5 584 | 5 585 | 5 586 | 5 587 | 5 588 | 5 589 | 5 590 | 5 591 | 5 592 | 5 593 | 5 594 | 5 595 | 5 596 | 5 597 | 5 598 | 5 599 | 5 600 | 5 601 | 6 602 | 6 603 | 6 604 | 6 605 | 6 606 | 6 607 | 6 608 | 6 609 | 6 610 | 6 611 | 6 612 | 6 613 | 6 614 | 6 615 | 6 616 | 6 617 | 6 618 | 6 619 | 6 620 | 6 621 | 6 622 | 6 623 | 6 624 | 6 625 | 6 626 | 6 627 | 6 628 | 6 629 | 6 630 | 6 631 | 6 632 | 6 633 | 6 634 | 6 
635 | 6 636 | 6 637 | 6 638 | 6 639 | 6 640 | 6 641 | 6 642 | 6 643 | 6 644 | 6 645 | 6 646 | 6 647 | 6 648 | 6 649 | 6 650 | 6 651 | 6 652 | 6 653 | 6 654 | 6 655 | 6 656 | 6 657 | 6 658 | 6 659 | 6 660 | 6 661 | 6 662 | 6 663 | 6 664 | 6 665 | 6 666 | 6 667 | 6 668 | 6 669 | 6 670 | 6 671 | 6 672 | 6 673 | 6 674 | 6 675 | 6 676 | 6 677 | 6 678 | 6 679 | 6 680 | 6 681 | 6 682 | 6 683 | 6 684 | 6 685 | 6 686 | 6 687 | 6 688 | 6 689 | 6 690 | 6 691 | 6 692 | 6 693 | 6 694 | 6 695 | 6 696 | 6 697 | 6 698 | 6 699 | 6 700 | 6 701 | 7 702 | 7 703 | 7 704 | 7 705 | 7 706 | 7 707 | 7 708 | 7 709 | 7 710 | 7 711 | 7 712 | 7 713 | 7 714 | 7 715 | 7 716 | 7 717 | 7 718 | 7 719 | 7 720 | 7 721 | 7 722 | 7 723 | 7 724 | 7 725 | 7 726 | 7 727 | 7 728 | 7 729 | 7 730 | 7 731 | 7 732 | 7 733 | 7 734 | 7 735 | 7 736 | 7 737 | 7 738 | 7 739 | 7 740 | 7 741 | 7 742 | 7 743 | 7 744 | 7 745 | 7 746 | 7 747 | 7 748 | 7 749 | 7 750 | 7 751 | 7 752 | 7 753 | 7 754 | 7 755 | 7 756 | 7 757 | 7 758 | 7 759 | 7 760 | 7 761 | 7 762 | 7 763 | 7 764 | 7 765 | 7 766 | 7 767 | 7 768 | 7 769 | 7 770 | 7 771 | 7 772 | 7 773 | 7 774 | 7 775 | 7 776 | 7 777 | 7 778 | 7 779 | 7 780 | 7 781 | 7 782 | 7 783 | 7 784 | 7 785 | 7 786 | 7 787 | 7 788 | 7 789 | 7 790 | 7 791 | 7 792 | 7 793 | 7 794 | 7 795 | 7 796 | 7 797 | 7 798 | 7 799 | 7 800 | 7 801 | -------------------------------------------------------------------------------- /topic-competitors/slda/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | /* 4 | * given log(a) and log(b), return log(a + b) 5 | * 6 | */ 7 | 8 | double log_sum(double log_a, double log_b) 9 | { 10 | double v; 11 | 12 | if (log_a < log_b) 13 | v = log_b+log(1 + exp(log_a-log_b)); 14 | else 15 | v = log_a+log(1 + exp(log_b-log_a)); 16 | 17 | return v; 18 | } 19 | 20 | /** 21 | * Proc to calculate the value of the trigamma, the second 22 | * derivative of the loggamma function. Accepts positive matrices. 23 | * From Abromowitz and Stegun. Uses formulas 6.4.11 and 6.4.12 with 24 | * recurrence formula 6.4.6. Each requires workspace at least 5 25 | * times the size of X. 
26 | * 27 | **/ 28 | 29 | double trigamma(double x) 30 | { 31 | double p; 32 | int i; 33 | 34 | x = x+6; 35 | p = 1/(x*x); 36 | p = (((((0.075757575757576*p-0.033333333333333)*p+0.0238095238095238)*p-0.033333333333333)*p+0.166666666666667)*p+1)/x+0.5*p; 37 | for (i=0; i<6 ;i++) 38 | { 39 | x = x-1; 40 | p = 1/(x*x)+p; 41 | } 42 | return p; 43 | } 44 | 45 | 46 | /* 47 | * taylor approximation of first derivative of the log gamma function 48 | * 49 | */ 50 | 51 | double digamma(double x) 52 | { 53 | double p; 54 | x = x+6; 55 | p = 1/(x*x); 56 | p = (((0.004166666666667*p-0.003968253986254)*p+0.008333333333333)*p-0.083333333333333)*p; 57 | p = p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6); 58 | return p; 59 | } 60 | 61 | /* 62 | * this log gamma function has the implementation of this function 63 | * 64 | */ 65 | 66 | /* double lgamma(double x) 67 | * { 68 | * double x0,x2,xp,gl,gl0; 69 | * int n,k; 70 | * static double a[] = { 71 | * 8.333333333333333e-02, 72 | * -2.777777777777778e-03, 73 | * 7.936507936507937e-04, 74 | * -5.952380952380952e-04, 75 | * 8.417508417508418e-04, 76 | * -1.917526917526918e-03, 77 | * 6.410256410256410e-03, 78 | * -2.955065359477124e-02, 79 | * 1.796443723688307e-01, 80 | * -1.39243221690590 81 | * }; 82 | * 83 | * x0 = x; 84 | * if (x <= 0.0) return 1e308; 85 | * else if ((x == 1.0) || (x == 2.0)) return 0.0; 86 | * else if (x <= 7.0) { 87 | * n = (int)(7-x); 88 | * x0 = x+n; 89 | * } 90 | * x2 = 1.0/(x0*x0); 91 | * xp = 2.0*M_PI; 92 | * gl0 = a[9]; 93 | * for (k=8;k>=0;k--) { 94 | * gl0 = gl0*x2 + a[k]; 95 | * } 96 | * gl = gl0/x0+0.5*log(xp)+(x0-0.5)*log(x0)-x0; 97 | * if (x <= 7.0) { 98 | * for (k=1;k<=n;k++) { 99 | * gl -= log(x0-1.0); 100 | * x0 -= 1.0; 101 | * } 102 | * } 103 | * return gl; 104 | * } 105 | */ 106 | 107 | 108 | 109 | /* 110 | * make directory 111 | * 112 | */ 113 | 114 | void make_directory(char* name) 115 | { 116 | mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR); 117 | } 118 | 119 | 120 | /* 121 | * argmax 122 | * 123 | */ 124 | 125 | int argmax(double* x, int n) 126 | { 127 | int i, argmax = 0; 128 | double max = x[0]; 129 | 130 | for (i = 1; i < n; i++) 131 | { 132 | if (x[i] > max) 133 | { 134 | max = x[i]; 135 | argmax = i; 136 | } 137 | } 138 | return argmax; 139 | } 140 | 141 | /* 142 | * return the correponding index in the n(n+1)/2 given row and col 143 | * this is a upper triangle matrix, we can do this since this is 144 | * a symmetric matrix 145 | * 146 | */ 147 | 148 | int map_idx(int row, int col, int dim) 149 | { 150 | int swap, idx; 151 | if (row > col) 152 | { 153 | swap = row; 154 | row = col; 155 | col = swap; 156 | } 157 | //now row <= col 158 | idx = (2*dim - row + 1)*row/2 + col - row; 159 | return idx; 160 | } 161 | 162 | -------------------------------------------------------------------------------- /topic-competitors/slda/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | double log_sum(double log_a, double log_b); 12 | double trigamma(double x); 13 | double digamma(double x); 14 | //double lgamma(double x); 15 | void make_directory(char* name); 16 | int argmax(double* x, int n); 17 | int map_idx(int row, int col, int dim); 18 | 19 | #endif 20 | 21 | -------------------------------------------------------------------------------- /topic-cosine.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | import sys 3 | import pdb 4 | from utils import * 5 | 6 | topic_vec_file = sys.argv[1] 7 | T = load_matrix_from_text( topic_vec_file, "topic" ) 8 | K = T.shape[0] 9 | cosine_mat = [] 10 | for x in xrange(K): 11 | for y in xrange(x): 12 | if normF(T[x]) < 1e-6 or normF(T[y]) < 1e-6: 13 | continue 14 | cosine = np.dot( T[x], T[y] ) / normF(T[x]) / normF(T[y]) 15 | cosine_mat.append( [ cosine, x, y ] ) 16 | 17 | cosine_sum = 0 18 | for i in xrange( len(cosine_mat) ): 19 | cosine_sum += cosine_mat[i][0] 20 | 21 | print "Avg: %.5f" %( cosine_sum / len(cosine_mat) ) 22 | cosine_sorted = sorted( cosine_mat, key=lambda cosine_tuple: cosine_tuple[0], reverse=True ) 23 | for i in xrange(10): 24 | cosine, x, y = cosine_sorted[i] 25 | print "%d,%d: %.5f" %( x, y, cosine ) 26 | print "%d: %s" %( x, T[x][:10] ) 27 | print "%d: %s" %( y, T[y][:10] ) 28 | print 29 | -------------------------------------------------------------------------------- /topicvec-ext.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topicvec-ext.pdf -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/utils.py --------------------------------------------------------------------------------
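A usage note on the bundled sLDA baseline (topic-competitors/slda): its readme.txt above describes the [data] and [label] input formats only in prose. Below is a minimal, hedged sketch, in the repository's Python 2 style, of how one might write a toy training pair in that format and then call the slda binary. The file names (toy-train-data.dat, toy-train-label.dat, toy-model) and the three-document corpus are hypothetical illustrations, not files in this repository; the command lines simply mirror the usage strings printed by main.cpp and readme.txt.

# Sketch of the sLDA input format from topic-competitors/slda/readme.txt:
#   data : each line is "[M] [term_1]:[count] ... [term_M]:[count]", M = number of unique terms
#   label: one integer per line, in 0 .. C-1 for C classes
# All file names below are hypothetical examples.
docs = [
    ( [ (0, 2), (3, 1), (7, 4) ], 0 ),   # ( [(term_id, count), ...], class label )
    ( [ (1, 1), (3, 2) ],         1 ),
    ( [ (2, 5), (5, 1), (6, 1) ], 1 ),
]

DATA  = open( "toy-train-data.dat",  "w" )
LABEL = open( "toy-train-label.dat", "w" )
for terms, label in docs:
    # M first, then each unique term as term_id:count
    fields = [ "%d" %len(terms) ] + [ "%d:%d" %(t, c) for t, c in terms ]
    DATA.write( " ".join(fields) + "\n" )
    LABEL.write( "%d\n" %label )
DATA.close()
LABEL.close()

# Estimation and inference, following the usage strings in main.cpp
# (alpha=1.0 and k=2 topics are arbitrary toy values; the [model] argument of
# "inf" is the binary .model file that estimation writes into toy-model):
#   ./slda est toy-train-data.dat toy-train-label.dat settings.txt 1.0 2 random toy-model
#   ./slda inf toy-test-data.dat toy-test-label.dat settings.txt toy-model/<name>.model toy-output

The uppercase file-handle names simply follow the convention used elsewhere in this repository (e.g. DOCVEC and DOCVECBOW in the Liu et al. conversion script above).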