├── .gitignore ├── 20news.bat ├── README.md ├── anatest.py ├── classEval.py ├── corpusLoader.py ├── csv2topic.py ├── file2topic.py ├── psdvec ├── PSDVec.pdf ├── README.md ├── addheader.py ├── analogy.py ├── bench.sh ├── benchspeed.py ├── catbench.py ├── cleancorpus.py ├── competitors │ ├── GloVe-1.2.zip │ ├── glove │ │ ├── demo.sh │ │ ├── rcv1.sh │ │ ├── vocab-rcv1.txt │ │ ├── vocab-wiki.txt │ │ └── wiki.sh │ ├── hyperwords.zip │ ├── hyperwords │ │ ├── pmi2.sh │ │ ├── pmi5-rcv1.sh │ │ ├── pmi5.sh │ │ ├── svd-rcv1.sh │ │ ├── svd.sh │ │ ├── train-rcv1.sh │ │ └── train-wiki.sh │ ├── singular.zip │ └── sparse.zip ├── corpus2liblinear.py ├── eval-logs │ └── bench.log ├── evaluate-toefl.py ├── evaluate.py ├── extractwiki.py ├── fact-rcv1.bat ├── fact-rcv1.sh ├── fact-wiki.bat ├── fact-wiki.sh ├── factorize.py ├── genSentDict.bat ├── gencatdata.py ├── gram-rcv1.bat ├── gram.bat ├── gramcount.pl ├── papers │ └── emnlp2015.pdf ├── patch to gensim.py ├── perlxs.h ├── removeDoubleNewline.pl ├── sent-bench.bat ├── sent-gen.conf ├── sentbench.py ├── tab2list.py ├── testsets │ ├── analogy │ │ ├── EN-TOM-ICLR13-SEM.txt │ │ ├── EN-TOM-ICLR13-SYN.txt │ │ ├── google.txt │ │ └── msr.txt │ └── ws │ │ ├── EN-RG-65.txt │ │ ├── EN-TOEFL-80.txt │ │ ├── bruni_men.txt │ │ ├── luong_rare.txt │ │ ├── radinsky_mturk.txt │ │ ├── simlex_999a.txt │ │ ├── ws353.txt │ │ ├── ws353_relatedness.txt │ │ └── ws353_similarity.txt ├── topwordsInList.py ├── utils.py ├── vecnorms.py └── xml2corpus.pl ├── reuters.bat ├── snippet2topic.py ├── test-docs ├── Drug Goes From 13.50 a Tablet to 750, Overnight.txt ├── VR-mitrv.txt ├── batman-v-superman.txt ├── batman-v-superman.txt-em100.topic.vec ├── beijing-haze-news.txt ├── brain-scar.txt ├── britain-EU.txt ├── drugstory.log ├── hillary-speech.txt ├── hillary-speech2.txt ├── nips-wiki.txt ├── sanders-speeches.txt ├── spacex-news.txt └── trump-speech.txt ├── topic-competitors ├── LDA │ ├── LDAClassify.zip │ ├── Readme.txt │ ├── classEval.py │ ├── corpusLoader.py │ └── ldaExp.py ├── doc2vec.py ├── kmeans.py ├── labelEval.py ├── lftm2svm.py ├── liu-doc2vec.py ├── rajarshd-Gaussian_LDA.zip ├── sHDP.zip └── slda │ ├── 20news-test-7532.slda-bow.txt │ ├── 20news-test-7532.slda-label.txt │ ├── 20news-train-11314.slda-bow.txt │ ├── 20news-train-11314.slda-label.txt │ ├── Makefile │ ├── corpus.cpp │ ├── corpus.h │ ├── cygblas-0.dll │ ├── images.tgz │ ├── main.cpp │ ├── opt.cpp │ ├── opt.h │ ├── readme.txt │ ├── reuters-test-2255.slda-bow.txt │ ├── reuters-test-2255.slda-label.txt │ ├── reuters-train-5770.slda-bow.txt │ ├── reuters-train-5770.slda-label.txt │ ├── settings.h │ ├── settings.txt │ ├── slda │ ├── slda.cpp │ ├── slda.exe │ ├── slda.h │ ├── test-data.dat │ ├── test-label.dat │ ├── train-data.dat │ ├── train-label.dat │ ├── utils.cpp │ └── utils.h ├── topic-cosine.py ├── topicExp.py ├── topicvec-ext.pdf ├── topicvecDir.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | commit.bat 2 | *.pyc 3 | -------------------------------------------------------------------------------- /20news.bat: -------------------------------------------------------------------------------- 1 | python topicExp.py -s 20news train 2 | python topicExp.py -i 20news-train-11314-sep281-em150-best.topic.vec 20news train,test 3 | python classEval.py 20news topicprop 4 | python classEval.py 20news topic-wvavg 5 | -------------------------------------------------------------------------------- /README.md: 
--------------------------------------------------------------------------------
 1 | # TopicVec
 2 | TopicVec is the source code for "Generative Topic Embedding: a Continuous Representation of Documents" (ACL 2016).
 3 | 
 4 | PSDVec (in folder 'psdvec') is the source code for "A Generative Word Embedding Model and its Low Rank Positive Semidefinite Solution" (EMNLP 2015).
 5 | 
 6 | #### Update v0.7:
 7 | The topic inference is now 6 times faster.
 8 | 
 9 | #### Update v0.6:
10 | ##### Algorithm update:
11 | topicvecDir.py: uses exact inference instead of a second-order approximation in the M-step.
12 | 
13 | #### Update v0.5:
14 | ##### Main algorithm:
15 | topicvecDir.py: uses a Dirichlet prior for topic mixing proportions.
16 | 
17 | #### Required files on Dropbox:
18 | https://www.dropbox.com/sh/lqbk3iioobegbp8/AACc8Kfr1KZIkKl9bGaIrOjfa?dl=0
19 | 
20 | 1. Pretrained 180000 embeddings (25000 core words) in 3 archives. For faster loading into Python, 25000-180000-500-BLK-8.0.vec.npy can be used;
21 | 2. Unigram files top1grams-wiki.txt & top1grams-reuters.txt;
22 | 3. RCV1 cleansed corpus ( before downloading, please apply for permission from NIST according to: http://trec.nist.gov/data/reuters/reuters.html ).
23 | 
24 | If you are in China, you can also download the above files from Baidu Netdisk, without the hassle of "climbing over the wall":
25 | https://pan.baidu.com/s/1gVmRhK1HA2XwVWZbZHHLZQ#list/path=%2F
26 | 
--------------------------------------------------------------------------------
/anatest.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | from utils import *
 3 | 
 4 | embedding_arrays = np.load("25000-180000-500-BLK-8.0.vec.npy")
 5 | V, vocab, word2ID, skippedWords_whatever = embedding_arrays
 6 | model = VecModel(V, vocab, word2ID, vecNormalize=True)
 7 | w1, w2 = predict_ana(model, "fish", "water", "plant", "soil")
 8 | print w1, w2
 9 | w1, w2 = predict_ana(model, "player", "team", "student", "classroom")
10 | print w1, w2
11 | 
--------------------------------------------------------------------------------
/classEval.py:
--------------------------------------------------------------------------------
 1 | from sklearn import svm, metrics
 2 | from sklearn.datasets import load_svmlight_file
 3 | import sys
 4 | 
 5 | def getScores( true_classes, pred_classes, average):
 6 |     precision = metrics.precision_score( true_classes, pred_classes, average=average )
 7 |     recall = metrics.recall_score( true_classes, pred_classes, average=average )
 8 |     f1 = metrics.f1_score( true_classes, pred_classes, average=average )
 9 |     accuracy = metrics.accuracy_score( true_classes, pred_classes )
10 |     return precision, recall, f1, accuracy
11 | 
12 | """
13 | TopicProp_ITER = int(sys.argv[1])
14 | topicNum = int(sys.argv[2])
15 | train_file = "20news-train-11314-sep%d-em40-i%d.topic.prop" %(topicNum, TopicProp_ITER)
16 | test_file = "20news-test-7532-sep%d-em40-i%d.topic.prop" %(topicNum, TopicProp_ITER)
17 | 
18 | train_features, train_docs_cat_name = load_matrix_from_text( train_file, "training proportion", "\t" )
19 | test_features, test_docs_cat_name = load_matrix_from_text( test_file, "test proportion", "\t" )
20 | 
21 | true_train_classes = []
22 | true_test_classes = []
23 | 
24 | for train_cat_name in train_docs_cat_name[0]:
25 |     true_train_classes.append( int(train_cat_name) )
26 | for test_cat_name in test_docs_cat_name[0]:
27 |     true_test_classes.append( int(test_cat_name) )
28 | """
29 | 
30 | corpus = sys.argv[1]
31 | filetype = sys.argv[2]
32 | # selected
feature dimensions can be specified in the last argument as: 33 | # 1-400 (starting from 1) 34 | if len(sys.argv) > 3: 35 | dims = sys.argv[3].split("-") 36 | dims[0] = int(dims[0]) - 1 37 | dims[1] = int(dims[1]) 38 | else: 39 | dims = None 40 | 41 | if corpus == '20news': 42 | train_file = "20news-train-11314.svm-%s.txt" %filetype 43 | test_file = "20news-test-7532.svm-%s.txt" %filetype 44 | else: 45 | train_file = "reuters-train-5770.svm-%s.txt" %filetype 46 | test_file = "reuters-test-2255.svm-%s.txt" %filetype 47 | 48 | train_features_sparse, true_train_classes = load_svmlight_file(train_file) 49 | test_features_sparse, true_test_classes = load_svmlight_file(test_file) 50 | #nonzeroColIDs = np.union1d( train_features_sparse.nonzero()[1], test_features_sparse.nonzero()[1] ) 51 | #train_features = train_features_sparse[:, nonzeroColIDs].toarray() 52 | #test_features = test_features_sparse[:, nonzeroColIDs].toarray() 53 | 54 | #pdb.set_trace() 55 | #print "%dx%d sparse feature matrices reduced to %dx%d" %( tuple(train_features_sparse.shape) + 56 | # tuple(train_features.shape) ) 57 | 58 | train_features = train_features_sparse.toarray() 59 | test_features = test_features_sparse.toarray() 60 | 61 | print "Train: %dx%d. Test: %dx%d" %( tuple( train_features.shape + test_features.shape ) ) 62 | 63 | if dims: 64 | train_features = train_features[ :, dims[0]:dims[1] ] 65 | test_features = test_features[ :, dims[0]:dims[1] ] 66 | print "Choose only features %d-%d" %( dims[0]+1, dims[1] ) 67 | else: 68 | train_features = train_features[ :, : ] 69 | test_features = test_features[ :, : ] 70 | 71 | model = svm.LinearSVC(penalty='l1', dual=False) 72 | 73 | print "Training...", 74 | model.fit( train_features, true_train_classes ) 75 | print "Done." 76 | 77 | pred_train_classes = model.predict( train_features ) 78 | pred_test_classes = model.predict( test_features ) 79 | 80 | print metrics.classification_report(true_train_classes, pred_train_classes, digits=3) 81 | print metrics.classification_report(true_test_classes, pred_test_classes, digits=3) 82 | 83 | for average in ['micro', 'macro']: 84 | train_precision, train_recall, train_f1, train_acc = getScores( true_train_classes, pred_train_classes, average ) 85 | print "Train Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average, 86 | train_precision, train_recall, train_f1, train_acc ) 87 | 88 | test_precision, test_recall, test_f1, test_acc = getScores( true_test_classes, pred_test_classes, average ) 89 | print "Test Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average, 90 | test_precision, test_recall, test_f1, test_acc ) 91 | -------------------------------------------------------------------------------- /corpusLoader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/corpusLoader.py -------------------------------------------------------------------------------- /csv2topic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | import pdb 5 | import os 6 | import csv 7 | from topicvecDir import topicvecDir 8 | from utils import * 9 | 10 | customStopwords = "based via using approach learning multi algorithm algorithms" 11 | 12 | config = dict( csv_filenames = None, 13 | short_name = None, 14 | unigramFilename = "top1grams-wiki.txt", 15 | word_vec_file = "25000-180000-500-BLK-8.0.vec", 16 | K = 20, 
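                # (Documentation added for clarity; these defaults mirror the command-line
                # flags parsed in getOptions() below.)
                #   K               : number of topic embeddings (topics) to extract (-k)
                #   unigramFilename : unigram file used to obtain unigram probabilities (-u)
                #   word_vec_file   : pretrained embedding file of all words
                #   MAX_EM_ITERS    : number of iterations of the EM procedure (-i)
                #   max_l           : magnitude cap of the topic embeddings (-l)
                #   alpha1          : hyperparameter alpha (-a)
                #   seed            : random number generator seed, used to repeat experiments (-s)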
17 | N0 = 500, 18 | max_l = 5, 19 | init_l = 1, 20 | max_grad_norm = 0, 21 | # cap the sum of Em when updating topic embeddings 22 | # to avoid too big gradients 23 | grad_scale_Em_base = 2500, 24 | topW = 30, 25 | topTopicMassFracPrintThres = 0.1, 26 | alpha0 = 0.1, 27 | alpha1 = 0.1, 28 | iniDelta = 0.1, 29 | MAX_EM_ITERS = 100, 30 | topicDiff_tolerance = 2e-3, 31 | printTopics_iterNum = 10, 32 | zero_topic0 = True, 33 | useDrdtApprox = False, 34 | customStopwords = customStopwords, 35 | remove_stop = True, 36 | normalize_vecs = False, 37 | # shift all embeddings in a document, so that their average is 0 38 | rebase_vecs = True, 39 | rebase_norm_thres = 0.2, 40 | evalKmeans = False, 41 | verbose = 1, 42 | seed = 0 43 | ) 44 | 45 | def usage(): 46 | print """topicvecDir.py [ -v vec_file -a alpha ... ] csv_file 47 | Options: 48 | -k: Number of topic embeddings to extract. Default: 20 49 | -v: Existing embedding file of all words. 50 | -r: Existing residual file of core words. 51 | -a: Hyperparameter alpha. Default: 0.1. 52 | -i: Number of iterations of the EM procedure. Default: 100 53 | -u: Unigram file, to obtain unigram probs. 54 | -l: Magnitude of topic embeddings. 55 | -A: Append to the old log file. 56 | -s: Seed the random number generator to x. Used to repeat experiments 57 | -n: Nickname (short name) for the csv_file 58 | """ 59 | 60 | def getOptions(): 61 | global config 62 | 63 | try: 64 | opts, args = getopt.getopt(sys.argv[1:],"k:v:i:u:l:s:n:Ah") 65 | if len(args) < 1: 66 | raise getopt.GetoptError("") 67 | config['csv_filenames'] = args 68 | 69 | for opt, arg in opts: 70 | if opt == '-k': 71 | config['K'] = int(arg) 72 | if opt == '-v': 73 | config['vec_file'] = arg 74 | if opt == '-a': 75 | config['alpha1'] = float(opt) 76 | if opt == '-i': 77 | config['MAX_EM_ITERS'] = int(arg) 78 | if opt == '-u': 79 | config['unigramFilename'] = arg 80 | if opt == '-l': 81 | config['max_l'] = int(arg) 82 | if opt == '-s': 83 | config['seed'] = int(arg) 84 | if opt == '-A': 85 | config['appendLogfile'] = True 86 | if opt == '-n': 87 | config['short_name'] = arg 88 | if opt == '-r': 89 | config['useDrdtApprox'] = True 90 | if opt == '-h': 91 | usage() 92 | sys.exit(0) 93 | 94 | basename = os.path.basename(args[0]) 95 | if config['short_name']: 96 | config['logfilename'] = config['short_name'] 97 | elif len(args) > 1: 98 | config['logfilename'] = "(%d)%s" %( len(args), basename ) 99 | else: 100 | config['logfilename'] = basename 101 | 102 | except getopt.GetoptError: 103 | usage() 104 | sys.exit(2) 105 | 106 | return config 107 | 108 | def main(): 109 | config = getOptions() 110 | 111 | docwords = [] 112 | csvfiles_filecount = 0 113 | csvfiles_wc = 0 114 | csvfiles_rowcount = 0 115 | file_rownames = [] 116 | for csv_filename in config['csv_filenames']: 117 | csvfile_wc = 0 118 | csvfile_rowcount = 0 119 | with open(csv_filename) as DOC: 120 | docreader = csv.reader(DOC) 121 | for row in docreader: 122 | doc = row[0] 123 | wordsInSentences, wc = extractSentenceWords(doc, min_length=2) 124 | csvfile_wc += wc 125 | csvfile_rowcount += 1 126 | docwords.append(wordsInSentences) 127 | file_rownames.append( "%s-row%d" %(csv_filename, csvfile_rowcount) ) 128 | csvfile_avgwc = csvfile_wc * 1.0 / csvfile_rowcount 129 | print "%d words extracted from %d rows in '%s'. 
Avg %.1f words each row" %( csvfile_wc, 130 | csvfile_rowcount, csv_filename, csvfile_avgwc ) 131 | 132 | csvfiles_wc += csvfile_wc 133 | csvfiles_rowcount += csvfile_rowcount 134 | csvfiles_filecount += 1 135 | csvfiles_avgwc = csvfiles_wc * 1.0 / csvfiles_rowcount 136 | if csvfiles_filecount > 1: 137 | print "%d words extracted from %d rows in %d csv files. Avg %.1f words each row" %(csvfiles_wc, 138 | csvfiles_rowcount, csvfiles_filecount, csvfiles_avgwc) 139 | 140 | topicvec = topicvecDir(**config) 141 | topicvec.setDocs( docwords, file_rownames ) 142 | 143 | if 'evalKmeans' in config and config['evalKmeans']: 144 | topicvec.kmeans() 145 | topicvec.printTopWordsInTopic(None, True) 146 | exit(0) 147 | 148 | best_last_Ts, Em, docs_Em, Pi = topicvec.inference() 149 | 150 | basename = os.path.basename(config['logfilename']) 151 | basetrunk = os.path.splitext(basename)[0] 152 | 153 | best_it, best_T, best_loglike = best_last_Ts[0] 154 | save_matrix_as_text( basetrunk + "-em%d-best.topic.vec" %best_it, "topic", best_T ) 155 | 156 | if best_last_Ts[1]: 157 | last_it, last_T, last_loglike = best_last_Ts[1] 158 | save_matrix_as_text( basetrunk + "-em%d-last.topic.vec" %last_it, "topic", last_T ) 159 | 160 | if __name__ == '__main__': 161 | main() 162 | -------------------------------------------------------------------------------- /file2topic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | import pdb 5 | import os 6 | from topicvecDir import topicvecDir 7 | from utils import * 8 | 9 | customStopwords = "based via using approach learning multi algorithm algorithms" 10 | 11 | config = dict( doc_filenames = None, 12 | short_name = None, 13 | unigramFilename = "top1grams-wiki.txt", 14 | word_vec_file = "25000-180000-500-BLK-8.0.vec", 15 | K = 20, 16 | N0 = 500, 17 | max_l = 5, 18 | init_l = 1, 19 | max_grad_norm = 0, 20 | # cap the sum of Em when updating topic embeddings 21 | # to avoid too big gradients 22 | grad_scale_Em_base = 2500, 23 | topW = 30, 24 | topTopicMassFracPrintThres = 0.1, 25 | alpha0 = 0.1, 26 | alpha1 = 0.1, 27 | iniDelta = 0.1, 28 | MAX_EM_ITERS = 100, 29 | topicDiff_tolerance = 2e-3, 30 | printTopics_iterNum = 10, 31 | zero_topic0 = True, 32 | useDrdtApprox = False, 33 | customStopwords = customStopwords, 34 | remove_stop = True, 35 | normalize_vecs = False, 36 | # shift all embeddings in a document, so that their average is 0 37 | rebase_vecs = True, 38 | rebase_norm_thres = 0.2, 39 | evalKmeans = False, 40 | verbose = 1, 41 | seed = 0 42 | ) 43 | 44 | def usage(): 45 | print """topicvecDir.py [ -v vec_file -a alpha ... ] doc_file 46 | Options: 47 | -k: Number of topic embeddings to extract. Default: 20 48 | -v: Existing embedding file of all words. 49 | -r: Existing residual file of core words. 50 | -a: Hyperparameter alpha. Default: 0.1. 51 | -i: Number of iterations of the EM procedure. Default: 100 52 | -u: Unigram file, to obtain unigram probs. 53 | -l: Magnitude of topic embeddings. 54 | -A: Append to the old log file. 55 | -s: Seed the random number generator to x. 
Used to repeat experiments 56 | -n: Nickname (short name) for the doc_file 57 | """ 58 | 59 | def getOptions(): 60 | global config 61 | 62 | try: 63 | opts, args = getopt.getopt(sys.argv[1:],"k:v:i:u:l:s:n:Ah") 64 | if len(args) < 1: 65 | raise getopt.GetoptError("") 66 | config['doc_filenames'] = args 67 | 68 | for opt, arg in opts: 69 | if opt == '-k': 70 | config['K'] = int(arg) 71 | if opt == '-v': 72 | config['vec_file'] = arg 73 | if opt == '-a': 74 | config['alpha1'] = float(opt) 75 | if opt == '-i': 76 | config['MAX_EM_ITERS'] = int(arg) 77 | if opt == '-u': 78 | config['unigramFilename'] = arg 79 | if opt == '-l': 80 | config['max_l'] = int(arg) 81 | if opt == '-s': 82 | config['seed'] = int(arg) 83 | if opt == '-A': 84 | config['appendLogfile'] = True 85 | if opt == '-n': 86 | config['short_name'] = arg 87 | if opt == '-r': 88 | config['useDrdtApprox'] = True 89 | if opt == '-h': 90 | usage() 91 | sys.exit(0) 92 | 93 | if config['short_name']: 94 | config['logfilename'] = config['short_name'] 95 | elif len(args) > 1: 96 | config['logfilename'] = "(%d)%s" %( len(args), args[0] ) 97 | else: 98 | config['logfilename'] = args[0] 99 | 100 | except getopt.GetoptError: 101 | usage() 102 | sys.exit(2) 103 | 104 | return config 105 | 106 | def main(): 107 | config = getOptions() 108 | 109 | docwords = [] 110 | for doc_filename in config['doc_filenames']: 111 | with open(doc_filename) as DOC: 112 | doc = DOC.readlines() 113 | doc = "".join(doc) 114 | 115 | wordsInSentences, wc = extractSentenceWords(doc, 2) 116 | print "%d words extracted from '%s'" %(wc, doc_filename) 117 | docwords.append(wordsInSentences) 118 | 119 | topicvec = topicvecDir(**config) 120 | topicvec.setDocs( docwords, config['doc_filenames'] ) 121 | 122 | if 'evalKmeans' in config and config['evalKmeans']: 123 | topicvec.kmeans() 124 | topicvec.printTopWordsInTopic(None, True) 125 | exit(0) 126 | 127 | best_last_Ts, Em, docs_Em, Pi = topicvec.inference() 128 | 129 | basename = os.path.basename(config['logfilename']) 130 | basetrunk = os.path.splitext(basename)[0] 131 | 132 | best_it, best_T, best_loglike = best_last_Ts[0] 133 | save_matrix_as_text( basetrunk + "-em%d-best.topic.vec" %best_it, "topic", best_T ) 134 | 135 | if best_last_Ts[1]: 136 | last_it, last_T, last_loglike = best_last_Ts[1] 137 | save_matrix_as_text( basetrunk + "-em%d-last.topic.vec" %last_it, "topic", last_T ) 138 | 139 | if __name__ == '__main__': 140 | main() 141 | -------------------------------------------------------------------------------- /psdvec/PSDVec.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/PSDVec.pdf -------------------------------------------------------------------------------- /psdvec/README.md: -------------------------------------------------------------------------------- 1 | # PSDVec 2 | PSDVec is the source code for "A Generative Word Embedding Model and its Low Rank Positive Semidefinite Solution" (EMNLP 2015). 3 | 4 | See "PSDVec.pdf" for a manual (```PSDVec: a Toolbox for Incremental and Scalable Word Embedding```, accepted by Neurocomputing, 2016). 5 | 6 | #### Update v0.42: Tikhonov Regularization (=Spherical Gaussian Prior) to embeddings in block-wise factorization: 7 | 1. Obtain 25000 core embeddings using Weighted PSD Approximation, into _25000-500-EM.vec_: 8 | * ```python factorize.py -w 25000 top2grams-wiki.txt``` 9 | 2. 
Obtain 55000 noncore embeddings using Weighted Least Squares, totaling 80000 (25000 cores + 55000 noncores), into _25000-80000-500-BLK-2.0.vec_:
10 | * ```python factorize.py -v 25000-500-EM.vec -o 55000 -t2 top2grams-wiki.txt```
11 | 3. Incrementally learn another 50000 noncore embeddings (based on the 25000 cores), into _25000-130000-500-BLK-4.0.vec_:
12 | * ```python factorize.py -v 25000-80000-500-BLK-2.0.vec -b 25000 -o 50000 -t4 top2grams-wiki.txt```
13 | 4. Repeat step 3, with Tikhonov coeff = 8, to get more embeddings of rarer words, into _25000-180000-500-BLK-8.0.vec_:
14 | * ```python factorize.py -v 25000-130000-500-BLK-4.0.vec -b 25000 -o 50000 -t8 top2grams-wiki.txt```
15 | 
16 | Pretrained 180,000 embeddings and evaluation results are uploaded. The performance is now systematically better than that of the other methods.
17 | 
18 | #### Update v0.41: Gradient Descent (GD) solution:
19 | * ```python factorize.py -G 500 -w 120000 top2grams-wiki.txt```
20 | * GD is fast and scalable, but the performance is much worse (~10% lower on the testsets). It is not recommended, unless initialized with unweighted eigendecomposition (which is still not scalable).
21 | 
22 | #### Update v0.4: Online Block-wise Factorization
23 | 
24 | Testsets are by courtesy of Omer Levy (https://bitbucket.org/omerlevy/hyperwords/src).
25 | 
26 | The Gradient Descent algorithm was based on a suggestion of Peilin Zhao (not included as part of the papers).
27 | 
--------------------------------------------------------------------------------
/psdvec/addheader.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/python
 2 | import os
 3 | import sys
 4 | import re
 5 | 
 6 | oldVecFilename = sys.argv[1]
 7 | newVecFilename = sys.argv[2]
 8 | 
 9 | stream = os.popen( "wc %s" %oldVecFilename )
10 | output = stream.read()
11 | output = output.strip()
12 | linecount, wordcount, charcount, filename = re.split(" +", output)
13 | linecount = int(linecount)
14 | wordcount = int(wordcount)
15 | 
16 | if wordcount % linecount != 0:
17 |     print "Error: line count %d does not divide word count %d" %(linecount, wordcount)
18 |     sys.exit(1)
19 | 
20 | veclen = wordcount / linecount - 1
21 | print "%d %d" %(linecount, veclen)
22 | VEC = open(newVecFilename, "w")
23 | VEC.write( "%d %d\n" %(linecount, veclen) )
24 | VEC.close()
25 | os.popen( "cat %s >> %s" %(oldVecFilename, newVecFilename) )
26 | 
27 | stream = os.popen( "ls -l %s" %oldVecFilename )
28 | print stream.read().strip()
29 | stream = os.popen( "ls -l %s" %newVecFilename )
30 | print stream.read().strip()
31 | 
--------------------------------------------------------------------------------
/psdvec/analogy.py:
--------------------------------------------------------------------------------
 1 | import sys
 2 | import re
 3 | from utils import *
 4 | 
 5 | def pred_ana( model, a, a2, b, maxcands = 10 ):
 6 |     questWordIndices = [ model.word2id[x] for x in (a,a2,b) ]
 7 |     # b2 is effectively iterating through the vocab.
The row is all the cosine values 8 | b2a2 = model.sim_row(a2) 9 | b2a = model.sim_row(a) 10 | b2b = model.sim_row(b) 11 | 12 | mulsims = ( b2a2 + 1 ) * ( b2b + 1 ) / ( b2a + 1.001 ) 13 | mulsims[questWordIndices] = -10000 14 | b2s = [] 15 | for i in xrange(maxcands): 16 | imul = np.nanargmax(mulsims) 17 | b2mul = model.vocab[imul] 18 | b2s.append( [ b2mul, mulsims[imul] ] ) 19 | mulsims[imul] = -10000 20 | 21 | return b2s 22 | 23 | embedding_npyfile = "25000-180000-500-BLK-8.0.vec.npy" 24 | embedding_arrays = np.load(embedding_npyfile) 25 | V, vocab, word2ID, skippedWords_whatever = embedding_arrays 26 | print "%d words loaded from '%s'" %(len(vocab), embedding_npyfile) 27 | model = VecModel(V, vocab, word2ID, vecNormalize=True) 28 | print "Model initialized. Ready for input:" 29 | 30 | while True: 31 | line = raw_input() 32 | line = line.strip() 33 | words = re.split("\s+", line) 34 | if len(words) != 3: 35 | print "Only 3 words are allowed" 36 | continue 37 | 38 | oov = 0 39 | for w in words: 40 | if w not in model: 41 | print "'%s' not in vocab" %w 42 | oov += 1 43 | if oov > 0: 44 | continue 45 | 46 | a, a2, b = words 47 | b2s = pred_ana( model, a, a2, b ) 48 | for word, sim in b2s: 49 | print word, sim 50 | print 51 | 52 | -------------------------------------------------------------------------------- /psdvec/bench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/sh 2 | export ROOT=/home/shaohua/D 3 | #export CORPUS=$ROOT/corpus/cleanwiki.txt 4 | #export DIM=500 5 | #export MINCOUNT=100 6 | #export SUFFIX=wiki 7 | export CORPUS=$ROOT/corpus/rcv1clean.txt 8 | export DIM=50 9 | export MINCOUNT=50 10 | export SUFFIX=rcv1 11 | 12 | cd $ROOT/corpus/ 13 | echo PSD: 14 | ./fact-$SUFFIX.sh 15 | cd $ROOT/word2vec 16 | echo word2vec: 17 | time ./word2vec -train $CORPUS -output $ROOT/corpus/word2vec-$SUFFIX.vec -size $DIM -window 5 -sample 1e-4 -negative 15 -min-count $MINCOUNT 18 | cd $ROOT/corpus/glove/ 19 | echo glove: 20 | time ./$SUFFIX.sh 21 | cd $ROOT/corpus/singular/ 22 | echo singular: 23 | time ./singular --corpus $CORPUS --output ./$SUFFIX --rare $MINCOUNT --window 3 --dim $DIM 24 | echo PPM and SVD: 25 | cd $ROOT/corpus/hyperwords 26 | ./train-$SUFFIX.sh 27 | echo Sparse: 28 | tail -n+2 $ROOT/corpus/word2vec-$SUFFIX.vec > $ROOT/corpus/word2vec-$SUFFIX-headless.vec 29 | cd $ROOT/corpus/sparse/ 30 | time ./sparse ../word2vec-$SUFFIX-headless.vec 5 0.5 1e-5 4 sparse-$SUFFIX.vec 31 | -------------------------------------------------------------------------------- /psdvec/benchspeed.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import time 3 | 4 | class Timer(object): 5 | def __init__(self, name=None): 6 | self.name = name 7 | self.tstart = time.time() 8 | self.tlast = self.tstart 9 | self.firstCall = True 10 | 11 | def getElapseTime(self, isStr=True): 12 | totalElapsed = time.time() - self.tstart 13 | # elapsed time since last call 14 | interElapsed = time.time() - self.tlast 15 | self.tlast = time.time() 16 | 17 | firstCall = self.firstCall 18 | self.firstCall = False 19 | 20 | if isStr: 21 | if self.name: 22 | if firstCall: 23 | return '%s elapsed: %.2f' % ( self.name, totalElapsed ) 24 | return '%s elapsed: %.2f/%.2f' % ( self.name, totalElapsed, interElapsed ) 25 | else: 26 | if firstCall: 27 | return 'Elapsed: %.2f' % ( totalElapsed ) 28 | return 'Elapsed: %.2f/%.2f' % ( totalElapsed, interElapsed ) 29 | else: 30 | return totalElapsed, interElapsed 31 | 32 | def 
printElapseTime(self): 33 | print self.getElapseTime() 34 | 35 | def timeToStr(timeNum, fmt="%H:%M:%S"): 36 | timeStr = time.strftime(fmt, time.localtime(timeNum)) 37 | return timeStr 38 | 39 | def block_factorize( core_size, noncore_size, N0, tikhonovCoeff ): 40 | # new WGsum: noncore_size * core_size 41 | WGsum = np.random.random((noncore_size,core_size)) 42 | Wsum = np.random.random((noncore_size,core_size)) 43 | Wsum[ np.isclose(Wsum,0) ] = 0.001 44 | Gwmean = WGsum 45 | 46 | V1 = np.random.random((core_size,N0)) 47 | # embeddings of noncore words 48 | # new V2: noncore_size * N0 49 | V2 = np.zeros( ( noncore_size, N0 ), dtype=np.float32 ) 50 | Tikhonov = np.identity(N0) * tikhonovCoeff 51 | 52 | timer = Timer() 53 | 54 | print "Begin finding embeddings of non-core words" 55 | 56 | # Find each noncore word's embedding 57 | for i in xrange(noncore_size): 58 | # core_size 59 | wi = Wsum[i] 60 | # new VW: N0 * core_size 61 | VW = V1.T * wi 62 | # new VWV: N0 * N0 63 | VWV = VW.dot(V1) 64 | if False: 65 | VWV_Tik = VWV + Tikhonov 66 | V2[i] = np.linalg.inv(VWV_Tik).dot( VW.dot(Gwmean[i]) ) 67 | if i >= 0 and i % 100 == 99: 68 | print "\r%d / %d." %(i+1,noncore_size), 69 | print timer.getElapseTime(), "\r", 70 | 71 | print 72 | 73 | block_factorize(15000, 1000, 500, 2) 74 | #block_factorize(15000, 10000, 50, 2) 75 | -------------------------------------------------------------------------------- /psdvec/catbench.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | testsetNames = [ "ap", "battig", "esslli" ] 4 | testsetCatNums = [ 13, 10, 6 ] 5 | algNames = [ "PSDVec", "word2vec", "CCA" ] 6 | CLmethods = [ "rbr", "direct", "graph" ] 7 | vclusterPath = "D:\\cluto-2.1.2\\MSWIN-x86_64-openmp\\vcluster.exe" 8 | testsetDir = "./concept categorization" 9 | 10 | for CLmethod in CLmethods: 11 | for i, testsetName in enumerate(testsetNames): 12 | for algName in algNames: 13 | vecFilename = testsetDir + "/" + testsetName + "-" + algName + ".vec" 14 | labelFilename = testsetDir + "/" + testsetName + "-" + algName + ".label" 15 | catNum = testsetCatNums[i] 16 | print "%s on %s using %s:" %( algName, testsetName, CLmethod ) 17 | stream = os.popen( '%s -rclassfile="%s" -clmethod=%s "%s" %d' %( vclusterPath, 18 | labelFilename, CLmethod, vecFilename, catNum ) ) 19 | output = stream.read() 20 | lines = output.split("\n") 21 | for line in lines: 22 | if line.find("way clustering") >= 0: 23 | print line 24 | print 25 | -------------------------------------------------------------------------------- /psdvec/cleancorpus.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import os 3 | import gensim.corpora.wikicorpus 4 | 5 | # check and process input arguments 6 | if len(sys.argv) < 3: 7 | print "Usage: cleancorpus.py infile_name outfile_name" 8 | sys.exit(1) 9 | 10 | infilename, outfilename = sys.argv[1:3] 11 | 12 | if os.path.isfile(outfilename): 13 | print "Output file %s exists. Change the file name and try again." 
%outfilename 14 | sys.exit(1) 15 | 16 | linecount = 0 17 | bytecount = 0 18 | wordcount = 0 19 | 20 | output = open(outfilename, 'w') 21 | IN = open(infilename) 22 | for line in IN: 23 | tokens = gensim.corpora.wikicorpus.tokenize(line) 24 | output.write( "%s\n" %(" ".join(tokens)) ) 25 | linecount += 1 26 | bytecount += len(line) 27 | wordcount += len(tokens) 28 | if linecount % 500 == 0: 29 | print "\r%d %d %d \r" %(linecount, bytecount/1024/1024, wordcount), 30 | -------------------------------------------------------------------------------- /psdvec/competitors/GloVe-1.2.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/GloVe-1.2.zip -------------------------------------------------------------------------------- /psdvec/competitors/glove/demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 4 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 5 | 6 | CORPUS=../rcv1clean.txt 7 | VOCAB_FILE=vocab.txt 8 | COOCCURRENCE_FILE=cooccurrence.bin 9 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf.bin 10 | BUILDDIR=build 11 | SAVE_FILE=glove-rcv1.vec 12 | VERBOSE=2 13 | MEMORY=16.0 14 | VOCAB_MIN_COUNT=50 15 | VECTOR_SIZE=50 16 | MAX_ITER=15 17 | WINDOW_SIZE=3 18 | BINARY=0 19 | NUM_THREADS=8 20 | X_MAX=10 21 | 22 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 23 | if [[ $? -eq 0 ]] 24 | then 25 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 26 | if [[ $? -eq 0 ]] 27 | then 28 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 29 | if [[ $? -eq 0 ]] 30 | then 31 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 32 | if [[ $? -eq 0 ]] 33 | then 34 | if [ "$1" = 'matlab' ]; then 35 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 36 | elif [ "$1" = 'octave' ]; then 37 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2 38 | else 39 | python eval/python/evaluate.py 40 | fi 41 | fi 42 | fi 43 | fi 44 | fi 45 | -------------------------------------------------------------------------------- /psdvec/competitors/glove/rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 4 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 5 | 6 | CORPUS=../rcv1clean.txt 7 | VOCAB_FILE=vocab-rcv1.txt 8 | COOCCURRENCE_FILE=cooccurrence-rcv1.bin 9 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf-rcv1.bin 10 | BUILDDIR=build 11 | SAVE_FILE=glove-rcv1.vec 12 | VERBOSE=2 13 | MEMORY=16.0 14 | VOCAB_MIN_COUNT=50 15 | VECTOR_SIZE=50 16 | MAX_ITER=15 17 | WINDOW_SIZE=3 18 | BINARY=0 19 | NUM_THREADS=8 20 | X_MAX=10 21 | 22 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 23 | if [[ $? 
-eq 0 ]] 24 | then 25 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 26 | if [[ $? -eq 0 ]] 27 | then 28 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 29 | if [[ $? -eq 0 ]] 30 | then 31 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 32 | if [[ $? -eq 0 ]] 33 | then 34 | if [ "$1" = 'matlab' ]; then 35 | matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2 36 | elif [ "$1" = 'octave' ]; then 37 | octave < ./eval/octave/read_and_evaluate_octave.m 1>&2 38 | else 39 | python eval/python/evaluate.py 40 | fi 41 | fi 42 | fi 43 | fi 44 | fi 45 | -------------------------------------------------------------------------------- /psdvec/competitors/glove/wiki.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Makes programs, downloads sample data, trains a GloVe model, and then evaluates it. 4 | # One optional argument can specify the language used for eval script: matlab, octave or [default] python 5 | 6 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 7 | VOCAB_FILE=vocab-wiki.txt 8 | COOCCURRENCE_FILE=cooccurrence-wiki.bin 9 | COOCCURRENCE_SHUF_FILE=cooccurrence.shuf-wiki.bin 10 | BUILDDIR=build 11 | SAVE_FILE=glove-wiki.vec 12 | VERBOSE=2 13 | MEMORY=16.0 14 | VOCAB_MIN_COUNT=100 15 | VECTOR_SIZE=500 16 | MAX_ITER=15 17 | WINDOW_SIZE=3 18 | BINARY=0 19 | NUM_THREADS=8 20 | X_MAX=10 21 | 22 | $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE 23 | if [[ $? -eq 0 ]] 24 | then 25 | $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE 26 | if [[ $? -eq 0 ]] 27 | then 28 | $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE 29 | if [[ $? 
-eq 0 ]] 30 | then 31 | $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE 32 | fi 33 | fi 34 | fi 35 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/hyperwords.zip -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/pmi2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # A) Window size 2 with " subsampling 3 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 4 | 5 | mkdir w2.sub 6 | python hyperwords/corpus2pairs.py --win 2 --sub 1e-5 ${CORPUS} > w2.sub/pairs 7 | scripts/pairs2counts.sh w2.sub/pairs > w2.sub/counts 8 | python hyperwords/counts2vocab.py w2.sub/counts 9 | # Calculate PMI matrices for each collection of pairs 10 | python hyperwords/counts2pmi.py --cds 0.75 w2.sub/counts w2.sub/pmi 11 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/pmi5-rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # B) Window size 5 with dynamic contexts and "dirty" subsampling 4 | 5 | CORPUS=/home/shaohua/D/corpus/rcv1clean.txt 6 | DIR=w5.rcv1.dyn.sub.del 7 | mkdir $DIR 8 | python hyperwords/corpus2pairs.py --win 5 --dyn --sub 1e-5 --del ${CORPUS} > $DIR/pairs 9 | scripts/pairs2counts.sh $DIR/pairs > $DIR/counts 10 | python hyperwords/counts2vocab.py $DIR/counts 11 | 12 | # Calculate PMI matrices for each collection of pairs 13 | python hyperwords/counts2pmi.py --cds 0.75 $DIR/counts $DIR/pmi 14 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/pmi5.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # B) Window size 5 with dynamic contexts and "dirty" subsampling 4 | 5 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 6 | 7 | mkdir w5.dyn.sub.del 8 | python hyperwords/corpus2pairs.py --win 5 --dyn --sub 1e-5 --del ${CORPUS} > w5.dyn.sub.del/pairs 9 | scripts/pairs2counts.sh w5.dyn.sub.del/pairs > w5.dyn.sub.del/counts 10 | python hyperwords/counts2vocab.py w5.dyn.sub.del/counts 11 | 12 | # Calculate PMI matrices for each collection of pairs 13 | python hyperwords/counts2pmi.py --cds 0.75 w5.dyn.sub.del/counts w5.dyn.sub.del/pmi 14 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/svd-rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create embeddings with SVD 4 | DIR=w5.rcv1.dyn.sub.del 5 | python hyperwords/pmi2svd.py --dim 50 --neg 5 $DIR/pmi $DIR/svd 6 | cp $DIR/pmi.words.vocab $DIR/svd.words.vocab 7 | cp $DIR/pmi.contexts.vocab $DIR/svd.contexts.vocab 8 | -------------------------------------------------------------------------------- /psdvec/competitors/hyperwords/svd.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create embeddings with SVD 4 | 5 | CORPUS=/home/shaohua/D/corpus/cleanwiki.txt 6 | 7 | python hyperwords/pmi2svd.py --dim 500 
--neg 5 w2.sub/pmi w2.sub/svd
 8 | cp w2.sub/pmi.words.vocab w2.sub/svd.words.vocab
 9 | cp w2.sub/pmi.contexts.vocab w2.sub/svd.contexts.vocab
10 | python hyperwords/pmi2svd.py --dim 500 --neg 5 w5.dyn.sub.del/pmi w5.dyn.sub.del/svd
11 | cp w5.dyn.sub.del/pmi.words.vocab w5.dyn.sub.del/svd.words.vocab
12 | cp w5.dyn.sub.del/pmi.contexts.vocab w5.dyn.sub.del/svd.contexts.vocab
13 | 
--------------------------------------------------------------------------------
/psdvec/competitors/hyperwords/train-rcv1.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | rm -rf w5.rcv1.dyn.sub.del
 3 | rm -rf w5.rcv1.dyn.sub.del
 4 | time ./pmi5-rcv1.sh
 5 | time ./svd-rcv1.sh
 6 | 
 7 | 
 8 | 
 9 | 
--------------------------------------------------------------------------------
/psdvec/competitors/hyperwords/train-wiki.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/sh
 2 | rm -rf w2.sub w5.dyn.sub.del
 3 | rm -rf w2.sub w5.dyn.sub.del
 4 | time ./pmi2.sh
 5 | time ./pmi5.sh
 6 | time ./svd.sh
 7 | 
 8 | 
 9 | 
10 | 
--------------------------------------------------------------------------------
/psdvec/competitors/singular.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/singular.zip
--------------------------------------------------------------------------------
/psdvec/competitors/sparse.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/competitors/sparse.zip
--------------------------------------------------------------------------------
/psdvec/corpus2liblinear.py:
--------------------------------------------------------------------------------
 1 | import numpy as np
 2 | import getopt
 3 | import sys
 4 | from utils import *
 5 | import pdb
 6 | import time
 7 | import os
 8 | import json
 9 | import copy
10 | 
11 | def usage():
12 |     print """Usage:\n  corpus2liblinear.py -d doc_dir -o output_file -v vec_file [ -s sent_file ] label
13 |   corpus2liblinear.py -c config_file -n alg_name -v vec_file [ -s sent_file ]
14 | Options:
15 |   doc_dir: Directory of the documents to convert.
16 |   output_file: File to save the extracted vectors.
17 |   label: Label of documents. Must be 1/+1/-1.
18 |   config_file: File specifying multiple directories, labels and output files.
19 |   vec_file: File containing embedding vectors.
20 |   alg_name: Name of the embedding algorithm that generates vec_file.
21 |     Needed if only a partial file name is specified in config_file.
22 |   sent_file: File containing a list of sentiment words.
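  Example (directory and output names below are illustrative, not files shipped with the repo):
    corpus2liblinear.py -d pos_docs -o train-features.txt -v 25000-180000-500-BLK-8.0.vec +1
    corpus2liblinear.py -a -1 -d neg_docs -o train-features.txt -v 25000-180000-500-BLK-8.0.vec
  The second call appends (-a) the negative-class (-1) vectors to the same output file.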
23 | """ 24 | 25 | def parseConfigFile(configFilename): 26 | CONF = open(configFilename) 27 | dir_configs = [] 28 | for line in CONF: 29 | line = line.strip() 30 | dir_config = json.loads(line) 31 | dir_configs.append(dir_config) 32 | return dir_configs 33 | 34 | def getFileFeatures(filename, V, word2id, sentword2id, remove_stop=False): 35 | DOC = open(filename) 36 | doc = DOC.read() 37 | wordsInSentences, wc = extractSentenceWords(doc, 1) 38 | 39 | countedWC = 0 40 | outvocWC = 0 41 | stopwordWC = 0 42 | sentWC = 0 43 | 44 | wids = [] 45 | wid2freq = {} 46 | BOWFeatureNum = len(sentword2id) 47 | BOWFreqs = np.zeros(BOWFeatureNum) 48 | 49 | for sentence in wordsInSentences: 50 | for w in sentence: 51 | w = w.lower() 52 | if remove_stop and w in stopwordDict: 53 | stopwordWC += 1 54 | continue 55 | 56 | if w in word2id: 57 | wid = word2id[w] 58 | wids.append( wid ) 59 | 60 | if wid not in wid2freq: 61 | wid2freq[wid] = 1 62 | else: 63 | wid2freq[wid] += 1 64 | countedWC += 1 65 | else: 66 | outvocWC += 1 67 | 68 | if w in sentword2id: 69 | id = sentword2id[w] 70 | BOWFreqs[id] += 1 71 | sentWC += 1 72 | 73 | N0 = V.shape[1] 74 | avgv = np.zeros(N0) 75 | 76 | # avgv is the average embedding vector. Used in Tobias Schnabel et al. (2015) as the only features 77 | for wid, freq in wid2freq.items(): 78 | avgv += np.log( freq + 1 ) * V[wid] 79 | 80 | #for wid in wids: 81 | # avgv += V[wid] 82 | 83 | avgv = normalizeF(avgv) 84 | return avgv, BOWFreqs 85 | 86 | def processDir( outFilename, docDir, label, appendToOutput, V, word2ID, sentword2id ): 87 | print "Process '%s' %s" %( docDir, label ) 88 | 89 | if appendToOutput: 90 | OUT = open(outFilename, "a") 91 | else: 92 | OUT = open(outFilename, "w") 93 | 94 | filecount = 0 95 | 96 | for filename in os.listdir(docDir): 97 | OUT.write(label) 98 | fullFilename = docDir + "/" + filename 99 | avgv, BOWFreqs = getFileFeatures( fullFilename, V, word2ID, sentword2id ) 100 | for i,x in enumerate(avgv): 101 | OUT.write( " %d:%.4f" %( i+1, x ) ) 102 | # i == N0 - 1 here, dimensionality of the embedding vector 103 | i += 1 104 | if BOWFreqs.shape[0] > 0: 105 | for freq in BOWFreqs: 106 | if freq > 0: 107 | OUT.write( " %d:%d" %( i+1, freq ) ) 108 | i += 1 109 | 110 | OUT.write("\n") 111 | filecount += 1 112 | if filecount % 500 == 0: 113 | print "\r%d\r" %filecount, 114 | 115 | if appendToOutput: 116 | writeMode = "appended to" 117 | else: 118 | writeMode = "written into" 119 | print "%d files processed and %s '%s'" %( filecount, writeMode, outFilename ) 120 | 121 | OUT.close() 122 | 123 | def main(): 124 | vecFilename = "25000-180000-500-BLK-8.0.vec" 125 | algname = None 126 | topword_cutoff = -1 127 | topSentWord_cutoff = -1 128 | 129 | configFilename = "" 130 | label = None 131 | appendToOutput = False 132 | sentimentWordFile = None 133 | 134 | try: 135 | opts, args = getopt.getopt(sys.argv[1:],"d:o:v:c:n:s:1ah") 136 | if( len(args) == 1 ): 137 | if args[0] != "1" and args[0] != "+1": 138 | raise getopt.GetoptError( "Unknown free argument '%s'" %args[0] ) 139 | label = "+1" 140 | elif( len(args) > 1 ): 141 | raise getopt.GetoptError( "Too many free arguments '%s'" %args ) 142 | 143 | for opt, arg in opts: 144 | if opt == '-1': 145 | label = "-1" 146 | 147 | if opt == '-c': 148 | configFilename = arg 149 | if opt == '-s': 150 | sentimentWordFile = arg 151 | 152 | if opt == '-n': 153 | algname = arg 154 | if opt == '-d': 155 | docDir = arg 156 | if opt == '-d': 157 | docDir = arg 158 | if opt == '-o': 159 | outFilename = arg 160 | if opt == '-v': 161 | 
vecFilename = arg 162 | if opt == '-a': 163 | appendToOutput = True 164 | if opt == '-h': 165 | usage() 166 | sys.exit(0) 167 | 168 | except getopt.GetoptError, e: 169 | if len(e.args) == 1: 170 | print "Option error: %s" %e.args[0] 171 | usage() 172 | sys.exit(2) 173 | 174 | sentword2id = {} 175 | bowSize = 0 176 | if sentimentWordFile: 177 | SENT = open(sentimentWordFile) 178 | id = 0 179 | for line in SENT: 180 | word, freq = line.split("\t") 181 | sentword2id[word] = id 182 | id += 1 183 | # if topSentWord_cutoff == -1, this equality is never satisfied, so no cut off 184 | if id == topSentWord_cutoff: 185 | break 186 | bowSize = len(sentword2id) 187 | print "%d sentiment words loaded" %(bowSize) 188 | 189 | if configFilename: 190 | dir_configs = parseConfigFile(configFilename) 191 | for conf in dir_configs: 192 | if 'outFilenameTrunk' in conf: 193 | if not algname: 194 | print "-n alg_name is needed to generate full output file name" 195 | usage() 196 | sys.exit(2) 197 | 198 | if sentimentWordFile: 199 | conf['outFilename'] = "%s-%s-bow%d.txt" %( conf['outFilenameTrunk'], algname, bowSize ) 200 | else: 201 | conf['outFilename'] = "%s-%s.txt" %( conf['outFilenameTrunk'], algname ) 202 | 203 | elif not label: 204 | print "No config file nor label is specified" 205 | usage() 206 | sys.exit(0) 207 | else: 208 | dir_config = { 'dir': docDir, 'outFilename': outFilename, 209 | 'label': label, 'isAppend': appendToOutput } 210 | dir_configs = [ dir_config ] 211 | 212 | V, vocab, word2ID, skippedWords_whatever = load_embeddings( vecFilename, topword_cutoff ) 213 | 214 | for conf in dir_configs: 215 | processDir( conf['outFilename'], conf['docDir'], conf['label'], 216 | conf['appendToOutput'], V, word2ID, sentword2id ) 217 | 218 | if __name__ == '__main__': 219 | main() 220 | -------------------------------------------------------------------------------- /psdvec/eval-logs/bench.log: -------------------------------------------------------------------------------- 1 | shaohua@shaohua:/media/shaohua/Outerspace/corpus$ ./bench.sh 2 | PSD: 3 | Read sim testset ./testsets/ws/ws353_similarity.txt 4 | Read sim testset ./testsets/ws/ws353_relatedness.txt 5 | Read sim testset ./testsets/ws/bruni_men.txt 6 | Read sim testset ./testsets/ws/radinsky_mturk.txt 7 | Read sim testset ./testsets/ws/luong_rare.txt 8 | Read sim testset ./testsets/ws/simlex_999a.txt 9 | Read analogy testset ./testsets/analogy/google.txt 10 | Read analogy testset ./testsets/analogy/msr.txt 11 | 12 | Loading bigram file 'top2grams-wiki.txt': 13 | Totally 277025 words 14 | 277025 words seen, top 25000 & 0 extra to keep. 25000 kept 15 | Read bigrams: 16 | 25000 17 | Cut point 4010: 4/0.000% 18 | Cut point 2005: 23/0.000% 19 | Cut point 1002: 123/0.000% 20 | Cut point 501: 840/0.000% 21 | Cut point 251: 5383/0.001% 22 | Cut point 125: 28276/0.005% 23 | Cut point 63: 124146/0.020% 24 | Cut point 31: 493469/0.079% 25 | Cut point 16: 1779998/0.285% 26 | 493469 (0.079%) elements in Weight-1 cut off at 31.33 27 | 28 | 4 iterations of EM 29 | Begin EM of weighted factorization by bigram freqs 30 | 31 | EM Iter 1: 32 | Begin unweighted factorization 33 | 12450 positive eigenvalues, sum: 1016404.875 34 | Eigenvalues cut at the 503-th, 186.957 ~ 186.925 35 | All eigen sum: 1961478.500, Kept eigen sum: 178549.484 36 | nowe_factorize() elapsed: 1936.64 37 | L1 Weighted: Gi: 7769817.151, VV: 7350854.952, Gsym-VV: 10842411.309, G-VV: 9977720.915 38 | Precompute cosine matrix, will need 2.5GB RAM... Done. 
39 | ws353_similarity: 203 test pairs, 195 valid , 0.79632 40 | ws353_relatedness: 252 test pairs, 241 valid , 0.68286 41 | bruni_men: 3000 test pairs, 2639 valid , 0.77234 42 | radinsky_mturk: 287 test pairs, 279 valid , 0.68298 43 | luong_rare: 2034 test pairs, 396 valid , 0.54562 44 | simlex_999a: 999 test pairs, 945 valid , 0.39870 45 | 19500/12552/19544: Add 0.66651, Mul 0.68634 46 | google: 19544 analogies, 12586 valid . Add Score: 0.66590, Mul Score: 0.68584 47 | 8000/5030/8000: Add 0.54732, Mul 0.59761 48 | msr: 8000 analogies, 5030 valid . Add Score: 0.54732, Mul Score: 0.59761 49 | EM iter 1 elapsed: 1954.66 50 | 51 | EM Iter 2: 52 | Begin unweighted factorization 53 | 12338 positive eigenvalues, sum: 239171.609 54 | Eigenvalues cut at the 502-th, 191.346 ~ 190.984 55 | All eigen sum: 299793.500, Kept eigen sum: 180740.391 56 | nowe_factorize() elapsed: 1972.69 57 | L1 Weighted: Gi: 7878353.598, VV: 7242558.205, Gsym-VV: 10491300.183, G-VV: 9662005.122 58 | Precompute cosine matrix, will need 2.5GB RAM... Done. 59 | ws353_similarity: 203 test pairs, 195 valid , 0.79887 60 | ws353_relatedness: 252 test pairs, 241 valid , 0.68380 61 | bruni_men: 3000 test pairs, 2639 valid , 0.77014 62 | radinsky_mturk: 287 test pairs, 279 valid , 0.68362 63 | luong_rare: 2034 test pairs, 396 valid , 0.54914 64 | simlex_999a: 999 test pairs, 945 valid , 0.39755 65 | 19500/12552/19544: Add 0.67049, Mul 0.69025 66 | google: 19544 analogies, 12586 valid . Add Score: 0.66995, Mul Score: 0.68981 67 | 8000/5030/8000: Add 0.54274, Mul 0.59543 68 | msr: 8000 analogies, 5030 valid . Add Score: 0.54274, Mul Score: 0.59543 69 | EM iter 2 elapsed: 1990.84 70 | 71 | EM Iter 3: 72 | Begin unweighted factorization 73 | 12334 positive eigenvalues, sum: 240251.656 74 | Eigenvalues cut at the 501-th, 195.408 ~ 194.607 75 | All eigen sum: 299763.188, Kept eigen sum: 183383.266 76 | nowe_factorize() elapsed: 1932.79 77 | L1 Weighted: Gi: 7861670.004, VV: 7228707.970, Gsym-VV: 10272011.447, G-VV: 9453111.101 78 | Precompute cosine matrix, will need 2.5GB RAM... Done. 79 | ws353_similarity: 203 test pairs, 195 valid , 0.80074 80 | ws353_relatedness: 252 test pairs, 241 valid , 0.68146 81 | bruni_men: 3000 test pairs, 2639 valid , 0.76744 82 | radinsky_mturk: 287 test pairs, 279 valid , 0.68036 83 | luong_rare: 2034 test pairs, 396 valid , 0.55140 84 | simlex_999a: 999 test pairs, 945 valid , 0.39524 85 | 19500/12552/19544: Add 0.67081, Mul 0.69383 86 | google: 19544 analogies, 12586 valid . Add Score: 0.67027, Mul Score: 0.69355 87 | 8000/5030/8000: Add 0.53917, Mul 0.58847 88 | msr: 8000 analogies, 5030 valid . Add Score: 0.53917, Mul Score: 0.58847 89 | EM iter 3 elapsed: 1950.92 90 | 91 | EM Iter 4: 92 | Begin unweighted factorization 93 | 12339 positive eigenvalues, sum: 241826.562 94 | Eigenvalues cut at the 500-th, 199.922 ~ 198.961 95 | All eigen sum: 300269.469, Kept eigen sum: 186190.469 96 | nowe_factorize() elapsed: 2069.77 97 | L1 Weighted: Gi: 7879650.000, VV: 7250952.037, Gsym-VV: 10122656.270, G-VV: 9301310.810 98 | Precompute cosine matrix, will need 2.5GB RAM... Done. 99 | ws353_similarity: 203 test pairs, 195 valid , 0.80089 100 | ws353_relatedness: 252 test pairs, 241 valid , 0.67612 101 | bruni_men: 3000 test pairs, 2639 valid , 0.76526 102 | radinsky_mturk: 287 test pairs, 279 valid , 0.67757 103 | luong_rare: 2034 test pairs, 396 valid , 0.55358 104 | simlex_999a: 999 test pairs, 945 valid , 0.39342 105 | 19500/12552/19544: Add 0.67145, Mul 0.69487 106 | google: 19544 analogies, 12586 valid . 
Add Score: 0.67098, Mul Score: 0.69466 107 | 8000/5030/8000: Add 0.53320, Mul 0.58569 108 | msr: 8000 analogies, 5030 valid . Add Score: 0.53320, Mul Score: 0.58569 109 | EM iter 4 elapsed: 2087.96 110 | we_factorize_EM() elapsed: 7987.32 111 | 112 | Save matrix 'V' into 25000-500-EM.vec 113 | 114 | 115 | real 137m4.155s 116 | user 815m28.416s 117 | sys 97m3.316s 118 | Using Tikhonov regularization with coeff: 2.0 119 | Read sim testset ./testsets/ws/ws353_similarity.txt 120 | Read sim testset ./testsets/ws/ws353_relatedness.txt 121 | Read sim testset ./testsets/ws/bruni_men.txt 122 | Read sim testset ./testsets/ws/radinsky_mturk.txt 123 | Read sim testset ./testsets/ws/luong_rare.txt 124 | Read sim testset ./testsets/ws/simlex_999a.txt 125 | Read analogy testset ./testsets/analogy/google.txt 126 | Read analogy testset ./testsets/analogy/msr.txt 127 | 128 | Embeddings of all words in '25000-500-EM.vec' will be loaded as core 129 | Load embedding text file '25000-500-EM.vec' 130 | Will load embeddings of 25000 words 131 | 25000 25000 0 132 | 25000 embeddings read, 25000 kept 133 | 2 blocks of 25000 core words and 55000 noncore words will be loaded. Skip 0 words 134 | Loading bigram file 'top2grams-wiki.txt' into 2 blocks. Will skip 0 words 135 | Totally 277025 words 136 | 277025 words in file, top 80000 to read into vocab (25000 core, 55000 noncore), 0 skipped 137 | Read bigrams: 138 | 25000 (25000 core, 0 noncore) 139 | 25000 core words are all read. 140 | 80000 (25000 core, 55000 noncore) 141 | Cut point 35: 2419/0.000% 142 | Cut point 18: 48813/0.004% 143 | 2414 (0.000%) elements in Weight-1 cut off at 35.48 144 | 1328 (0.000%) elements in Weight-2 cut off at 35.48 145 | 146 | del G1, G21 147 | Begin finding embeddings of non-core words 148 | 55000 / 55000. Elapsed: 5851.77/10.41 149 | del F21, WGsum, VW 150 | Save matrix 'V' into 25000-80000-500-BLK-2.0.vec 151 | Test embeddings derived from block factorization 152 | 153 | Precompute cosine matrix, will need 25.6GB RAM... Done. 154 | ws353_similarity: 203 test pairs, 203 valid , 0.79212 155 | ws353_relatedness: 252 test pairs, 252 valid , 0.67948 156 | bruni_men: 3000 test pairs, 3000 valid , 0.76389 157 | radinsky_mturk: 287 test pairs, 285 valid , 0.67397 158 | luong_rare: 2034 test pairs, 835 valid , 0.48215 159 | simlex_999a: 999 test pairs, 995 valid , 0.39890 160 | 19500/18401/19544: Add 0.61893, Mul 0.63926 161 | google: 19544 analogies, 18443 valid . Add Score: 0.61866, Mul Score: 0.63921 162 | 8000/6172/8000: Add 0.49579, Mul 0.54277 163 | msr: 8000 analogies, 6172 valid . Add Score: 0.49579, Mul Score: 0.54277 164 | 165 | real 109m26.625s 166 | user 896m3.444s 167 | sys 278m1.332s 168 | Using Tikhonov regularization with coeff: 4.0 169 | Read sim testset ./testsets/ws/ws353_similarity.txt 170 | Read sim testset ./testsets/ws/ws353_relatedness.txt 171 | Read sim testset ./testsets/ws/bruni_men.txt 172 | Read sim testset ./testsets/ws/radinsky_mturk.txt 173 | Read sim testset ./testsets/ws/luong_rare.txt 174 | Read sim testset ./testsets/ws/simlex_999a.txt 175 | Read analogy testset ./testsets/analogy/google.txt 176 | Read analogy testset ./testsets/analogy/msr.txt 177 | 178 | Embeddings of top 25000 words in '25000-80000-500-BLK-2.0.vec' will be loaded as core 179 | Load embedding text file '25000-80000-500-BLK-2.0.vec' 180 | Will load embeddings of 80000 words 181 | 80000 80000 0 182 | 80000 embeddings read, 80000 kept 183 | 2 blocks of 25000 core words and 50000 noncore words will be loaded. 
Skip 55000 words 184 | Loading bigram file 'top2grams-wiki.txt' into 2 blocks. Will skip 55000 words 185 | Totally 277025 words 186 | 277025 words in file, top 75000 to read into vocab (25000 core, 50000 noncore), 55000 skipped 187 | Read bigrams: 188 | 25000 (25000 core, 0 noncore) 189 | 25000 core words are all read. 190 | 130000 (25000 core, 50000 noncore) 191 | 124073 (0.010%) elements in Weight-1 cut off at 6.63 192 | 111102 (0.009%) elements in Weight-2 cut off at 6.63 193 | 194 | del G1, G21 195 | Begin finding embeddings of non-core words 196 | 50000 / 50000. Elapsed: 5291.71/10.41 197 | del F21, WGsum, VW 198 | Save matrix 'V' into 25000-130000-500-BLK-4.0.vec 199 | Test embeddings derived from block factorization 200 | 201 | ws353_similarity: 203 test pairs, 203 valid , 0.79212 202 | ws353_relatedness: 252 test pairs, 252 valid , 0.67948 203 | bruni_men: 3000 test pairs, 3000 valid , 0.76389 204 | radinsky_mturk: 287 test pairs, 286 valid , 0.67566 205 | luong_rare: 2034 test pairs, 1096 valid , 0.47344 206 | simlex_999a: 999 test pairs, 996 valid , 0.39715 207 | 19500/19158/19544: Add 0.60680, Mul 0.62778 208 | google: 19544 analogies, 19202 valid . Add Score: 0.60650, Mul Score: 0.62770 209 | 8000/6578/8000: Add 0.48100, Mul 0.52676 210 | msr: 8000 analogies, 6578 valid . Add Score: 0.48100, Mul Score: 0.52676 211 | 212 | real 106m34.491s 213 | user 911m26.380s 214 | sys 262m26.192s 215 | Using Tikhonov regularization with coeff: 8.0 216 | Read sim testset ./testsets/ws/ws353_similarity.txt 217 | Read sim testset ./testsets/ws/ws353_relatedness.txt 218 | Read sim testset ./testsets/ws/bruni_men.txt 219 | Read sim testset ./testsets/ws/radinsky_mturk.txt 220 | Read sim testset ./testsets/ws/luong_rare.txt 221 | Read sim testset ./testsets/ws/simlex_999a.txt 222 | Read analogy testset ./testsets/analogy/google.txt 223 | Read analogy testset ./testsets/analogy/msr.txt 224 | 225 | Embeddings of top 25000 words in '25000-130000-500-BLK-4.0.vec' will be loaded as core 226 | Load embedding text file '25000-130000-500-BLK-4.0.vec' 227 | Will load embeddings of 130000 words 228 | 130000 130000 0 229 | 130000 embeddings read, 130000 kept 230 | 2 blocks of 25000 core words and 50000 noncore words will be loaded. Skip 105000 words 231 | Loading bigram file 'top2grams-wiki.txt' into 2 blocks. Will skip 105000 words 232 | Totally 277025 words 233 | 277025 words in file, top 75000 to read into vocab (25000 core, 50000 noncore), 105000 skipped 234 | Read bigrams: 235 | 25000 (25000 core, 0 noncore) 236 | 25000 core words are all read. 237 | 180000 (25000 core, 50000 noncore) 238 | 191415 (0.015%) elements in Weight-1 cut off at 4.12 239 | 183822 (0.015%) elements in Weight-2 cut off at 4.12 240 | 241 | del G1, G21 242 | Begin finding embeddings of non-core words 243 | 50000 / 50000. Elapsed: 5277.66/10.72 244 | del F21, WGsum, VW 245 | Save matrix 'V' into 25000-180000-500-BLK-8.0.vec 246 | Test embeddings derived from block factorization 247 | 248 | ws353_similarity: 203 test pairs, 203 valid , 0.79212 249 | ws353_relatedness: 252 test pairs, 252 valid , 0.67948 250 | bruni_men: 3000 test pairs, 3000 valid , 0.76389 251 | radinsky_mturk: 287 test pairs, 286 valid , 0.67566 252 | luong_rare: 2034 test pairs, 1260 valid , 0.45688 253 | simlex_999a: 999 test pairs, 998 valid , 0.39788 254 | 19500/19320/19544: Add 0.60041, Mul 0.62158 255 | google: 19544 analogies, 19364 valid . 
Add Score: 0.60013, Mul Score: 0.62151 256 | 8000/7054/8000: Add 0.46187, Mul 0.50383 257 | msr: 8000 analogies, 7054 valid . Add Score: 0.46187, Mul Score: 0.50383 258 | 259 | real 111m53.430s 260 | user 964m38.856s 261 | sys 271m9.988s 262 | word2vec: 263 | Starting training using file /home/shaohua/D/corpus/cleanwiki.txt 264 | Vocab size: 289625 265 | Words in train file: 2000719401 266 | Alpha: 0.000053 Progress: 99.89% Words/thread/sec: 65.89k 267 | real 249m8.382s 268 | user 2530m25.988s 269 | sys 4m4.680s 270 | glove: 271 | BUILDING VOCABULARY 272 | Processed 2042546400 tokens. 273 | Counted 8527820 unique words. 274 | Truncating vocabulary at min count 100. 275 | Using vocabulary of size 289624. 276 | 277 | COUNTING COOCCURRENCES 278 | window size: 3 279 | context: symmetric 280 | max product: 50983620 281 | overflow length: 152113425 282 | Reading vocab from file "vocab-wiki.txt"...loaded 289624 words. 283 | Building lookup table...table contains 428261749 elements. 284 | Processed 2042546400 tokens. 285 | Writing cooccurrences to disk..........8 files in total. 286 | Merging cooccurrence files: processed 652989173 lines. 287 | 288 | SHUFFLING COOCCURRENCES 289 | array size: 1020054732 290 | Shuffling by chunks: processed 652989173 lines. 291 | Wrote 1 temporary file(s). 292 | Merging temp files: processed 652989173 lines. 293 | 294 | TRAINING MODEL 295 | Read 652989173 lines. 296 | Initializing parameters...done. 297 | vector size: 500 298 | vocab size: 289624 299 | x_max: 10.000000 300 | alpha: 0.750000 301 | iter: 001, cost: 0.131547 302 | iter: 002, cost: 0.105234 303 | iter: 003, cost: 0.090557 304 | iter: 004, cost: 0.080886 305 | iter: 005, cost: 0.075276 306 | iter: 006, cost: 0.071524 307 | iter: 007, cost: 0.068758 308 | iter: 008, cost: 0.066837 309 | iter: 009, cost: 0.065186 310 | iter: 010, cost: 0.063919 311 | iter: 011, cost: 0.062813 312 | iter: 012, cost: 0.061871 313 | iter: 013, cost: 0.061140 314 | iter: 014, cost: 0.060471 315 | iter: 015, cost: 0.059661 316 | 317 | real 229m37.019s 318 | user 1503m26.736s 319 | sys 9m14.064s 320 | singular: 321 | Counting words in file 1/1 .......... 6058672 types 322 | Sliding window in file 1/1 .......... 
323 | Writing counts 324 | Loading counts 325 | Calculating SVD 326 | Clustering 327 | 328 | real 183m26.164s 329 | user 87m43.676s 330 | sys 29m32.096s 331 | 332 | PMI2: 333 | 15239.57user 3723.37system 4:37:35elapsed 113%CPU (0avgtext+0avgdata 9739144maxresident)k 334 | 99330104inputs+7182248outputs (88major+399600177minor)pagefaults 0swaps 335 | 59223.84user 82148.26system 36:36:14elapsed 107%CPU (0avgtext+0avgdata 31071304maxresident)k 336 | 485837760inputs+23424872outputs (109major+7607047364minor)pagefaults 0swaps 337 | 10169.24user 122.85system 2:53:15elapsed 99%CPU (0avgtext+0avgdata 24847900maxresident)k 338 | 11854888inputs+4056outputs (47major+6317122minor)pagefaults 0swaps 339 | -------------------------------------------------------------------------------- /psdvec/evaluate-toefl.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | import getopt 3 | import glob 4 | import sys 5 | import os.path 6 | from utils import * 7 | import numpy as np 8 | import copy 9 | import pdb 10 | import sys 11 | 12 | def loadToeflTestset(toeflTestsetFilename): 13 | TOEFL = open(toeflTestsetFilename) 14 | toeflTestset = [] 15 | for line in TOEFL: 16 | line = line.strip() 17 | words = line.split(" | ") 18 | toeflTestset.append(words) 19 | 20 | print "%d toefl test questions are loaded" %len(toeflTestset) 21 | return toeflTestset 22 | 23 | embeddingDir = "./embeddings/" 24 | modelFiles = [ "25000-180000-500-BLK-8.0.vec", "sparse.vec", "singular.vec", 25 | "25000-180000-500-BLK-0.0.vec", "word2vec.vec", "glove.vec" ] 26 | 27 | toeflTestsetFilename = "./testsets/ws/EN-TOEFL-80.txt" 28 | isHyperwordsEmbed = False 29 | hyperwordsType = None 30 | 31 | def usage(): 32 | print """Usage: evaluate-toefl.py [ -H -m model_file ] 33 | Options: 34 | -m: Path to the model file, a ".vec" or a Hyperwords embedding directory (with -H). 
35 | -H: Hyperwords embeddings type: PPMI or SVD.""" 36 | 37 | try: 38 | opts, args = getopt.getopt(sys.argv[1:],"m:H:") 39 | if len(args) != 0: 40 | raise getopt.GetoptError("") 41 | for opt, arg in opts: 42 | if opt == '-m': 43 | modelFiles = [ arg ] 44 | embeddingDir = "" 45 | if opt == '-H': 46 | isHyperwordsEmbed = True 47 | hyperwordsType = arg 48 | if opt == '-h': 49 | usage() 50 | sys.exit(0) 51 | 52 | except getopt.GetoptError: 53 | usage() 54 | sys.exit(2) 55 | 56 | vecNormalize = True 57 | loadwordCutPoint = 180000 58 | 59 | if loadwordCutPoint > 0: 60 | print "Load top %d words" %(loadwordCutPoint) 61 | 62 | toeflTestset = loadToeflTestset(toeflTestsetFilename) 63 | 64 | for m,modelFile in enumerate(modelFiles): 65 | modelFile = embeddingDir + modelFile 66 | if not isHyperwordsEmbed: 67 | V, vocab2, word2dim, skippedWords = load_embeddings( modelFile, loadwordCutPoint ) 68 | model = VecModel(V, vocab2, word2dim, vecNormalize=vecNormalize) 69 | else: 70 | model = load_embeddings_hyper(modelFile, hyperwordsType) 71 | 72 | questionNum = 0 73 | correctNum = 0 74 | for toeflQuestion in toeflTestset: 75 | questionWord = toeflQuestion[0] 76 | maxID = -1 77 | maxsim = -100 78 | for i,w in enumerate( toeflQuestion[1:] ): 79 | sim = model.similarity( questionWord, w ) 80 | if sim > maxsim: 81 | maxsim = sim 82 | maxID = i 83 | 84 | if maxID == 0: 85 | correctNum += 1 86 | else: 87 | question = copy.copy(toeflQuestion) 88 | question[maxID+1] = '(' + question[maxID+1] + ')' 89 | #if m == 0: 90 | # pdb.set_trace() 91 | print "%s: %s, %s, %s, %s" %tuple(question) 92 | questionNum += 1 93 | print "%s: %d/%d=%.1f%%" %( modelFile, correctNum, questionNum, correctNum*100.0/questionNum ) 94 | 95 | 96 | -------------------------------------------------------------------------------- /psdvec/evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | import getopt 4 | import glob 5 | import sys 6 | import os.path 7 | from utils import * 8 | import numpy as np 9 | #import pdb 10 | 11 | getAbsentWords = False 12 | modelFiles = [ "./GoogleNews-vectors-negative300.bin", "./29291-500-EM.vec", "./100000-500-BLKEM.vec", 13 | "./wordvecs/vec_520_forest", "./wiki-glove.vec2.txt" ] 14 | 15 | isModelsBinary = [ True, False, False, False, False ] 16 | modelID = -1 17 | 18 | # default is current directory 19 | simTestsetDir = "./testsets/ws/" 20 | # if set to [], run all testsets 21 | simTestsetNames = [ "ws353_similarity", "ws353_relatedness", "bruni_men", "radinsky_mturk", "luong_rare", 22 | "simlex_999a", "EN-RG-65" ] 23 | anaTestsetDir = "./testsets/analogy/" 24 | # if set to [], run all testsets 25 | anaTestsetNames = [ "google", "msr" ] 26 | 27 | unigramFilename = "top1grams-wiki.txt" 28 | vecNormalize = True 29 | loadwordCutPoint = -1 30 | testwordCutPoint = -1 31 | absentFilename = "" 32 | extraWordFilename = "" 33 | # default is in text format 34 | isModelBinary = False 35 | modelFile = None 36 | # precompute the cosine similarity matrix of all pairs of words 37 | # need W*W*4 bytes of RAM 38 | precomputeGramian = False 39 | skipPossessive = False 40 | evalVecExpectation = False 41 | doAnaTest = True 42 | isHyperwordsEmbed = False 43 | hyperEmbedType = None 44 | 45 | def usage(): 46 | print """Usage: evaluate.py [ -m model_file -i builtin_model_id -e extra_word_file -a absent_file -u unigram_file ... 
] 47 | Options: 48 | -m: Path to the model file, a ".vec" or ".bin" file for word2vec 49 | -b: Model file is in binary format (default: text) 50 | -d: A directory containing the test files 51 | -f: A list of test files in the specified directory 52 | -i: Builtin model ID for the benchmark. Range: 1 (word2vec), 53 | 2 (PSD 29291 words), 3 (block PSD 100000 words), 4 (forest), 5(glove) 54 | -P: Do not precompute cosine matrix. When the vocab is huge, 55 | it's necessary to disable computing this matrix. 56 | -u: Unigram file, for missing word check. 57 | Its presence will enable checking of what words are missing 58 | from the vocabulary and the model 59 | -c: Loaded Model vocabulary cut point. Load top x words from the model file 60 | -t: Vocabulary cut point for the test sets. All words in the test sets 61 | whose IDs are below it will be picked out 62 | -e: Extra word file. Words in this list will be loaded anyway 63 | -a: Absent file. Words below the cut point will be saved there 64 | -p: Skip possessive analogy pairs 65 | -E: Compute the expectation of all word embeddings 66 | -H: Hyperwords embeddings type: PPMI or SVD.""" 67 | 68 | try: 69 | opts, args = getopt.getopt(sys.argv[1:],"m:bd:f:i:Pu:c:t:e:a:shEAH:") 70 | if len(args) != 0: 71 | raise getopt.GetoptError("") 72 | for opt, arg in opts: 73 | if opt == '-m': 74 | modelID = -1 75 | modelFile = arg 76 | if opt == '-b': 77 | isModelBinary = bool(arg) 78 | if opt == '-d': 79 | testsetDir = arg 80 | if opt == '-f': 81 | testsetNames = filter( lambda x: x, arg.split(",") ) 82 | if opt == '-i': 83 | modelID = int(arg) 84 | if opt == '-P': 85 | precomputeGramian = False 86 | if opt == '-u': 87 | # unigram file is used to get a full list of words, 88 | # and also to sort the absent words by their frequencies 89 | unigramFilename = arg 90 | if opt == '-c': 91 | loadwordCutPoint = int(arg) 92 | if opt == '-t': 93 | testwordCutPoint = int(arg) 94 | if opt == '-e': 95 | extraWordFilename = arg 96 | if opt == '-a': 97 | getAbsentWords = True 98 | absentFilename = arg 99 | if opt == '-A': 100 | doAnaTest = False 101 | if opt == '-s': 102 | skipPossessive = True 103 | if opt == '-E': 104 | evalVecExpectation = True 105 | if opt == '-H': 106 | isHyperwordsEmbed = True 107 | hyperEmbedType = arg 108 | if opt == '-h': 109 | usage() 110 | sys.exit(0) 111 | 112 | if getAbsentWords and not unigramFilename: 113 | print "ERR: -u (Unigram file) has to be specified to get absent words" 114 | sys.exit(2) 115 | # "-" means output to console instead of a file 116 | if absentFilename == "-": 117 | absentFilename = "" 118 | 119 | except getopt.GetoptError: 120 | usage() 121 | sys.exit(2) 122 | 123 | if modelID > 0: 124 | modelFile = modelFiles[ modelID - 1 ] 125 | isModelBinary = isModelsBinary[ modelID - 1 ] 126 | 127 | if modelFile is None: 128 | usage() 129 | sys.exit(2) 130 | 131 | vocab = {} 132 | if unigramFilename: 133 | vocab = loadUnigramFile(unigramFilename) 134 | 135 | if extraWordFilename: 136 | extraWords = loadExtraWordFile(extraWordFilename) 137 | else: 138 | extraWords = {} 139 | 140 | if loadwordCutPoint > 0: 141 | print "Load top %d words" %(loadwordCutPoint) 142 | 143 | if isModelBinary: 144 | V, vocab2, word2dim, skippedWords = load_embeddings_bin( modelFile, loadwordCutPoint, extraWords ) 145 | elif not isHyperwordsEmbed: 146 | V, vocab2, word2dim, skippedWords = load_embeddings( modelFile, loadwordCutPoint, extraWords ) 147 | else: 148 | model = load_embeddings_hyper( modelFile, hyperEmbedType ) 149 | # the interface of hyperwords 
embedding class is incompatible with analogy tasks 150 | # only compatible with similarity tasks 151 | doAnaTest = False 152 | 153 | # if evalVecExpectation = True, compute the expectation of all embeddings 154 | if evalVecExpectation: 155 | if unigramFilename: 156 | expVec = np.zeros( len(V[0]) ) 157 | expVecNorm1 = 0 158 | expVecNorm2 = 0 159 | totalWords = 0 160 | expWords = 0 161 | accumProb = 0.0 162 | for w in vocab2: 163 | totalWords += 1 164 | if w in vocab and vocab[w][0] < 180000: 165 | expVec += V[ word2dim[w] ] * vocab[w][2] 166 | expVecNorm1 += norm1( V[ word2dim[w] ] ) * vocab[w][2] 167 | expVecNorm2 += normF( V[ word2dim[w] ] ) * vocab[w][2] 168 | expWords += 1 169 | accumProb += vocab[w][2] 170 | 171 | expVec /= accumProb 172 | expVecNorm1 /= accumProb 173 | expVecNorm2 /= accumProb 174 | print "totally %d words, %d words in E[v]. Accumu prob: %.2f%%." %( totalWords, expWords, accumProb * 100 ) 175 | print "|E[v]|: %.2f/%.2f, E[|v|]: %.2f/%.2f" %( norm1(expVec), normF(expVec), expVecNorm1, expVecNorm2 ) 176 | 177 | expMagnitude = norm1(expVec) 178 | accumProb = 0 179 | variance = 0 180 | for w in vocab2: 181 | if w in vocab and vocab[w][0] < 180000: 182 | variance += ( norm1( V[ word2dim[w] ] ) - expMagnitude )**2 * vocab[w][2] 183 | accumProb += vocab[w][2] 184 | 185 | variance /= accumProb 186 | 187 | # variance & standard deviation 188 | print "var(|v|): %.2f. SD: %.2f. CV: %.2f" %( variance, np.sqrt(variance), np.sqrt(variance) / expVecNorm1 ) 189 | 190 | sys.exit(0) 191 | 192 | else: 193 | print "ERR: -u (Unigram file) has to be specified to calc expectation of embeddings" 194 | sys.exit(2) 195 | 196 | if not isHyperwordsEmbed: 197 | model = VecModel(V, vocab2, word2dim, vecNormalize=vecNormalize) 198 | 199 | if precomputeGramian: 200 | isEnoughGramian, installedMemGB, requiredMemGB = isMemEnoughGramian( len(V) ) 201 | 202 | if isEnoughGramian <= 1: 203 | print "WARN: %.1fGB mem detected, %.1fGB mem required to precompute the cosine matrix" %( installedMemGB, requiredMemGB ) 204 | if isEnoughGramian == 0: 205 | print "Precomputation of the cosine matrix is disabled automatically." 
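# Concretely, with the 80000-word vocabulary used in eval-logs/bench.log above, the
# precomputed cosine matrix needs 80000 * 80000 * 4 bytes, i.e. about 25.6 GB, which is
# the "Precompute cosine matrix, will need 25.6GB RAM" figure reported in that log.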
206 | else: 207 | print "In case of memory shortage, you can specify -P to disable" 208 | 209 | if isEnoughGramian > 0: 210 | model.precomputeGramian() 211 | 212 | print 213 | 214 | simTestsets = loadTestsets(loadSimTestset, simTestsetDir, simTestsetNames) 215 | 216 | if skipPossessive: 217 | anaTestsets = loadTestsets( loadAnaTestset, anaTestsetDir, anaTestsetNames, { 'skipPossessive': 1 } ) 218 | else: 219 | anaTestsets = loadTestsets( loadAnaTestset, anaTestsetDir, anaTestsetNames ) 220 | 221 | print 222 | 223 | spearmanCoeff, absentModelID2Word1, absentVocabWords1, cutVocabWords1 = \ 224 | evaluate_sim( model, simTestsets, simTestsetNames, getAbsentWords, vocab, testwordCutPoint ) 225 | 226 | print 227 | 228 | if doAnaTest: 229 | anaScores, absentModelID2Word2, absentVocabWords2, cutVocabWords2 = \ 230 | evaluate_ana( model, anaTestsets, anaTestsetNames, getAbsentWords, vocab, testwordCutPoint ) 231 | 232 | if getAbsentWords: 233 | # merge the two sets of absent words 234 | absentModelID2Word1.update(absentModelID2Word2) 235 | absentModelWordIDs = sorted( absentModelID2Word1.keys() ) 236 | absentModelWords = [ absentModelID2Word1[i] for i in absentModelWordIDs ] 237 | 238 | absentVocabWords1.update(absentVocabWords2) 239 | absentVocabWords = sorted( absentVocabWords1.keys() ) 240 | 241 | cutVocabWords1.update(cutVocabWords2) 242 | # sort by ID in ascending, so that most frequent words (smaller IDs) first 243 | cutVocabWords = sorted( cutVocabWords1.keys(), key=lambda w: vocab[w][0] ) 244 | 245 | print "\n%d words absent from the model:" %len(absentModelWordIDs) 246 | print "ID:" 247 | print ",".join( map( lambda i: str(i), absentModelWordIDs) ) 248 | print "\nWords:" 249 | print ",".join(absentModelWords) 250 | 251 | if len(absentVocabWords) > 0: 252 | print "\n%d words absent from the vocab:" %len(absentVocabWords) 253 | print "\n".join(absentVocabWords) 254 | 255 | print 256 | 257 | if absentFilename and len(cutVocabWords): 258 | ABS = open(absentFilename, "w") 259 | for w in cutVocabWords: 260 | ABS.write( "%s\t%d\n" %( w, vocab[w][0] ) ) 261 | ABS.close() 262 | print "%d words saved to %s" %( len(cutVocabWords), absentFilename ) 263 | -------------------------------------------------------------------------------- /psdvec/extractwiki.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # code based on http://textminingonline.com/training-word2vec-model-on-english-wikipedia-by-gensim 4 | 5 | import logging 6 | import os.path 7 | import sys 8 | 9 | from gensim.corpora import WikiCorpus 10 | 11 | if __name__ == '__main__': 12 | program = os.path.basename(sys.argv[0]) 13 | logger = logging.getLogger(program) 14 | 15 | logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s') 16 | logging.root.setLevel(level=logging.INFO) 17 | logger.info("running %s" % ' '.join(sys.argv)) 18 | 19 | # check and process input arguments 20 | if len(sys.argv) < 3: 21 | print "Usage: extractwiki.py infile_name outfile_name" 22 | sys.exit(1) 23 | 24 | infilename, outfilename = sys.argv[1:3] 25 | 26 | if os.path.isfile(outfilename): 27 | logger.error("Output file %s exists. Change the file name and try again." 
%outfilename) 28 | sys.exit(1) 29 | 30 | i = 0 31 | output = open(outfilename, 'w') 32 | wiki = WikiCorpus(infilename, lemmatize=False, dictionary={}) 33 | for text in wiki.get_texts(): 34 | output.write( " ".join(text) + "\n") 35 | i = i + 1 36 | if (i % 10000 == 0): 37 | logger.info("Saved " + str(i) + " articles") 38 | 39 | output.close() 40 | logger.info("Finished Saved " + str(i) + " articles") 41 | -------------------------------------------------------------------------------- /psdvec/fact-rcv1.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set N0=50 3 | rem Old way of exact factorization: 4 | rem python factorize.py -n 50 -t 28000 -e absentwords.txt top2grams-rcv1.txt 5 | rem New online fashion: 6 | rem 1. Obtain 23000 core embeddings, into 25000-50-EM.vec: 7 | rem python factorize.py -w 23000 -n %N0% top2grams-rcv1.txt 8 | rem 2. Obtain 23409 noncore embeddings, totaling 46409 (23000 core + 23409 noncore), into 25000-46409-50-BLK-2.0.vec: 9 | python factorize.py -v 23000-%N0%-EM.vec -n %N0% -t2 top2grams-rcv1.txt 10 | -------------------------------------------------------------------------------- /psdvec/fact-rcv1.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N0=50 3 | time python factorize.py -w 15000 -n $N0 -E5 top2grams-rcv1.txt 4 | time python factorize.py -v 15000-$N0-EM.vec -n $N0 -t2 top2grams-rcv1.txt 5 | -------------------------------------------------------------------------------- /psdvec/fact-wiki.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set N0=500 3 | rem Old way of exact factorization: 4 | rem python factorize.py -n 500 -t 28000 -e absentwords.txt top2grams-wiki.txt 5 | rem New online fashion: 6 | rem 1. Obtain 25000 core embeddings, into 25000-500-EM.vec: 7 | python factorize.py -w 25000 -n %N0% top2grams-wiki.txt 8 | rem 2. Obtain 55000 noncore embeddings, totaling 80000 (25000 core + 55000 noncore), into 25000-80000-500-BLK-2.0.vec: 9 | python factorize.py -v 25000-%N0%-EM.vec -n %N0% -o 55000 -t2 top2grams-wiki.txt 10 | rem 3. Incrementally learn other 50000 noncore embeddings (based on 25000 core), into 25000-130000-500-BLK-4.0.vec: 11 | python factorize.py -v 25000-80000-%N0%-BLK-2.0.vec -n %N0% -b 25000 -o 50000 -t4 top2grams-wiki.txt 12 | rem 4. Repeat 3 again to get more embeddings of rarer words. 13 | python factorize.py -v 25000-130000-%N0%-BLK-4.0.vec -n %N0% -b 25000 -o 50000 -t8 top2grams-wiki.txt 14 | -------------------------------------------------------------------------------- /psdvec/fact-wiki.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | N0=500 3 | # Old way of exact factorization: 4 | # python factorize.py -n 500 -t 28000 -e absentwords.txt top2grams-wiki.txt 5 | # New online fashion: 6 | # 1. Obtain 25000 core embeddings, into 25000-500-EM.vec: 7 | time python factorize.py -w 25000 -n $N0 top2grams-wiki.txt 8 | # 2. Obtain 55000 noncore embeddings, totaling 80000 (25000 core + 55000 noncore), into 25000-80000-500-BLK-2.0.vec: 9 | time python factorize.py -v 25000-$N0-EM.vec -n $N0 -o 55000 -t2 top2grams-wiki.txt 10 | # 3. Incrementally learn other 50000 noncore embeddings (based on 25000 core), into 25000-130000-500-BLK-4.0.vec: 11 | time python factorize.py -v 25000-80000-$N0-BLK-2.0.vec -n $N0 -b 25000 -o 50000 -t4 top2grams-wiki.txt 12 | # 4. Repeat 3 again to get more embeddings of rarer words. 
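# Note: steps 3 and 4 keep the same 25000 core embeddings (-b 25000) and each adds
# 50000 noncore embeddings (-o 50000); the -t value doubles at each stage (2 -> 4 -> 8)
# and matches both the "Using Tikhonov regularization with coeff" lines in
# eval-logs/bench.log and the BLK-2.0 / BLK-4.0 / BLK-8.0 suffixes of the saved .vec files.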
13 | time python factorize.py -v 25000-130000-$N0-BLK-4.0.vec -n $N0 -b 25000 -o 50000 -t8 top2grams-wiki.txt 14 | -------------------------------------------------------------------------------- /psdvec/genSentDict.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | set N=200 3 | python topwordsInList.py -c sent-gen-config.txt -l d:\Dropbox\sentiment\positive-words.txt,d:\Dropbox\sentiment\negative-words.txt -n %N% -o topSentWords%N%.txt 4 | -------------------------------------------------------------------------------- /psdvec/gencatdata.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import scipy.linalg 3 | from scipy.special import * 4 | import getopt 5 | import sys 6 | from utils import * 7 | import pdb 8 | import time 9 | 10 | embed_algs = { "PSDVec": "d:/corpus/embeddings/25000-180000-500-BLK-8.0.vec", 11 | "word2vec": "d:/corpus/embeddings/word2vec2.vec", 12 | "CCA": "d:/corpus/embeddings/182800-500-CCA.vec" 13 | } 14 | # "50000-180000-500-BLK-8.0.vec" } 15 | testsetDir = "./concept categorization" 16 | testsetNames = [ "ap", "battig", "esslli" ] 17 | maxID = -1 18 | 19 | for algname, vecFilename in embed_algs.iteritems(): 20 | print "Alg %s" %algname 21 | if vecFilename[-4:] == ".bin": 22 | V, vocab, word2ID, skippedWords_whatever = load_embeddings_bin(vecFilename, 400000) 23 | else: 24 | V, vocab, word2ID, skippedWords_whatever = load_embeddings(vecFilename, 400000) 25 | 26 | for testsetName in testsetNames: 27 | truthFilename = testsetDir + "/" + testsetName + ".txt" 28 | vecFilename = testsetDir + "/" + testsetName + "-" + algname + ".vec" 29 | labelFilename = testsetDir + "/" + testsetName + "-" + algname + ".label" 30 | 31 | FVEC = open(vecFilename, "w") 32 | ids = [] 33 | 34 | FLABEL = open(labelFilename, "w") 35 | 36 | with open(truthFilename) as FT: 37 | # skip header 38 | FT.readline() 39 | for line in FT: 40 | line = line.strip() 41 | fields = line.split("\t") 42 | word, cat = fields[:2] 43 | 44 | if word not in word2ID: 45 | print "%s not in vocab" %word 46 | continue 47 | else: 48 | id = word2ID[word] 49 | #print "%s: %d" %(word, id) 50 | if id > maxID: 51 | maxID = id 52 | ids.append(id) 53 | 54 | FLABEL.write("%s\n" %cat) 55 | 56 | FVEC.write( "%d %d\n" %( len(ids), V.shape[1] ) ) 57 | for id in ids: 58 | v = V[id] 59 | FVEC.write("%.3f" %v[0]) 60 | for d in v[1:]: 61 | FVEC.write(" %.3f" %d) 62 | FVEC.write("\n") 63 | 64 | FLABEL.close() 65 | FVEC.close() 66 | -------------------------------------------------------------------------------- /psdvec/gram-rcv1.bat: -------------------------------------------------------------------------------- 1 | set CORPUS=rcv1clean.txt 2 | set SUFFIX=rcv1 3 | perl gramcount.pl -i %CORPUS% -m1 --f1 top1grams-%SUFFIX%.txt -c --nofilter --thres1 50,0 4 | perl gramcount.pl -i %CORPUS% -m2 --f1 top1grams-%SUFFIX%.txt --nofilter -c --f2 top2grams-%SUFFIX%.txt -w 3 --thres1 50,0 5 | -------------------------------------------------------------------------------- /psdvec/gram.bat: -------------------------------------------------------------------------------- 1 | set CORPUS=reuters-train-5770.orig.txt 2 | set SUFFIX=reuters 3 | perl gramcount.pl -i %CORPUS% -m1 --f1 top1grams-%SUFFIX%.txt -c --nofilter --thres1 5,0 4 | perl gramcount.pl -i %CORPUS% -m2 --f1 top1grams-%SUFFIX%.txt --nofilter -c --f2 top2grams-%SUFFIX%.txt -w 3 --thres1 5,0 5 | -------------------------------------------------------------------------------- 
/psdvec/gramcount.pl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/gramcount.pl -------------------------------------------------------------------------------- /psdvec/papers/emnlp2015.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/papers/emnlp2015.pdf -------------------------------------------------------------------------------- /psdvec/patch to gensim.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/patch to gensim.py -------------------------------------------------------------------------------- /psdvec/perlxs.h: -------------------------------------------------------------------------------- 1 | // http://ftp.ledas.ac.uk/software/lheasoft/lheasoft6.3.1/source/heacore/pil/perl/Av_CharPtrPtr.c 2 | 3 | /* Used by the INPUT typemap for char**. 4 | * Will convert a Perl AV* (containing strings) to a C char**. 5 | */ 6 | char** XS_unpack_charPtrPtr( SV* rv ) 7 | { 8 | AV *av; 9 | SV **ssv; 10 | char **s; 11 | int avlen; 12 | int x; 13 | 14 | if( SvROK( rv ) && (SvTYPE(SvRV(rv)) == SVt_PVAV) ) 15 | av = (AV*)SvRV(rv); 16 | else { 17 | warn("XS_unpack_charPtrPtr: rv was not an AV ref"); 18 | return( (char**)NULL ); 19 | } 20 | 21 | /* is it empty? */ 22 | avlen = av_len(av); 23 | if( avlen < 0 ){ 24 | warn("XS_unpack_charPtrPtr: array was empty"); 25 | return( (char**)NULL ); 26 | } 27 | 28 | /* av_len+2 == number of strings, plus 1 for an end-of-array sentinel. 29 | */ 30 | s = (char **)safemalloc( sizeof(char*) * (avlen + 2) ); 31 | if( s == NULL ){ 32 | warn("XS_unpack_charPtrPtr: unable to malloc char**"); 33 | return( (char**)NULL ); 34 | } 35 | for( x = 0; x <= avlen; ++x ){ 36 | ssv = av_fetch( av, x, 0 ); 37 | if( ssv != NULL ){ 38 | if( SvPOK( *ssv ) ){ 39 | s[x] = (char *)safemalloc( SvCUR(*ssv) + 1 ); 40 | if( s[x] == NULL ) 41 | warn("XS_unpack_charPtrPtr: unable to malloc char*"); 42 | else 43 | strcpy( s[x], SvPV( *ssv, PL_na ) ); 44 | } 45 | else 46 | warn("XS_unpack_charPtrPtr: array elem %d was not a string.", x ); 47 | } 48 | else 49 | s[x] = (char*)NULL; 50 | } 51 | s[x] = (char*)NULL; /* sentinel */ 52 | return( s ); 53 | } 54 | 55 | /* Used by the OUTPUT typemap for char**. 56 | * Will convert a C char** to a Perl AV*. 
57 | */ 58 | void XS_pack_charPtrPtr( SV * st, char ** s ) 59 | { 60 | AV *av = newAV(); 61 | SV *sv; 62 | char **c; 63 | 64 | for( c = s; *c != NULL; ++c ){ 65 | sv = newSVpv( *c, 0 ); 66 | av_push( av, sv ); 67 | } 68 | free ( s ); 69 | sv = newSVrv( st, NULL ); /* upgrade stack SV to an RV */ 70 | SvREFCNT_dec( sv ); /* discard */ 71 | SvRV( st ) = (SV*)av; /* make stack RV point at our AV */ 72 | } 73 | 74 | 75 | /* cleanup the temporary char** from XS_unpack_charPtrPtr */ 76 | void XS_release_charPtrPtr(char** s) 77 | { 78 | char **c; 79 | for( c = s; *c != NULL; ++c ) 80 | Safefree( *c ); 81 | Safefree( s ); 82 | } 83 | -------------------------------------------------------------------------------- /psdvec/removeDoubleNewline.pl: -------------------------------------------------------------------------------- 1 | $wc = 0; 2 | $doubleNewlineCount = 0; 3 | while(<>){ 4 | $wc++; 5 | if( /^\r?\n$/ ){ 6 | if( $lastIsNewline ){ 7 | $lastIsNewline = 0; 8 | $doubleNewlineCount++; 9 | next; 10 | } 11 | else{ 12 | print; 13 | $lastIsNewline = 1; 14 | } 15 | } 16 | else{ 17 | print; 18 | } 19 | if( $wc % 1000 == 0 ){ 20 | print STDERR "\r$wc $doubleNewlineCount\r"; 21 | } 22 | } 23 | print STDERR "$wc $doubleNewlineCount\n"; 24 | -------------------------------------------------------------------------------- /psdvec/sent-bench.bat: -------------------------------------------------------------------------------- 1 | @echo off 2 | rem cd d:\corpus 3 | rem python corpus2liblinear.py -d aclImdb\test\pos -o sent-test.txt 1 4 | rem python corpus2liblinear.py -d aclImdb\test\neg -o sent-test.txt -1 -a 5 | rem python corpus2liblinear.py -d aclImdb\train\pos -o sent-train.txt 1 6 | rem python corpus2liblinear.py -d aclImdb\train\neg -o sent-train.txt -1 -a 7 | pushd d:\liblinear-2.1\windows 8 | train -s7 -v10 \corpus\sent-train-PSD-reg.txt PSD-reg.model 9 | predict \corpus\sent-test-PSD-reg.txt PSD-reg.model pred-output.txt 10 | popd 11 | 12 | -------------------------------------------------------------------------------- /psdvec/sent-gen.conf: -------------------------------------------------------------------------------- 1 | { "outFilenameTrunk": "sent-train", "docDir": "D:/corpus/aclImdb/train/neg", "label": "-1", "appendToOutput": false } 2 | { "outFilenameTrunk": "sent-train", "docDir": "D:/corpus/aclImdb/train/pos", "label": "+1", "appendToOutput": true } 3 | { "outFilenameTrunk": "sent-test", "docDir": "D:/corpus/aclImdb/test/neg", "label": "-1", "appendToOutput": false } 4 | { "outFilenameTrunk": "sent-test", "docDir": "D:/corpus/aclImdb/test/pos", "label": "+1", "appendToOutput": true } 5 | -------------------------------------------------------------------------------- /psdvec/sentbench.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import subprocess 4 | 5 | alg2vec = { "PSD-reg": "25000-180000-500-BLK-8.0.vec", 6 | #"PSD-unreg": "25000-180000-500-BLK-0.0.vec", 7 | #"word2vec": "word2vec2.vec", 8 | #"CCA": "182800-500-CCA.vec", 9 | "sparse": "120000-sparse.vec" 10 | } 11 | 12 | vecDir = "d:/corpus/embeddings" 13 | liblinearDir = "D:/liblinear-2.1/windows" 14 | trainExePath = liblinearDir + "/" + "train.exe" 15 | predictExePath = liblinearDir + "/" + "predict.exe" 16 | dataDir = "d:/corpus" 17 | trainFiletrunk = "sent-train" 18 | testFiletrunk = "sent-test" 19 | dataGenScript = dataDir + "/corpus2liblinear.py" 20 | dataGenConfig = dataDir + "/sent-gen.conf" 21 | sentimentWordFile = dataDir + "/topSentWords500.txt" 22 | 23 
| # code below is just to count the words in sentimentWordFile. 24 | # the count is used in file names 25 | sentword2id = {} 26 | bowSize = 0 27 | if sentimentWordFile: 28 | SENT = open(sentimentWordFile) 29 | id = 0 30 | for line in SENT: 31 | word, freq = line.split("\t") 32 | sentword2id[word] = id 33 | id += 1 34 | bowSize = len(sentword2id) 35 | SENT.close() 36 | 37 | # L1 or L2 regularization for the logistic regression solver 38 | # Experiments show this option has little impact on the results 39 | solverReg = 2 40 | if solverReg == 1: 41 | solverType = "-s6" 42 | elif solverReg == 2: 43 | solverType = "-s7" 44 | 45 | for algName, vecFilename in alg2vec.items(): 46 | print "%s:" %algName 47 | 48 | vecFullfilename = vecDir + "/" + vecFilename 49 | 50 | if sentimentWordFile: 51 | trainFilename = "%s/%s-%s-bow%d.txt" %( dataDir, trainFiletrunk, algName, bowSize ) 52 | testFilename = "%s/%s-%s-bow%d.txt" %( dataDir, testFiletrunk, algName, bowSize ) 53 | else: 54 | trainFilename = "%s/%s-%s.txt" %( dataDir, trainFiletrunk, algName ) 55 | testFilename = "%s/%s-%s.txt" %( dataDir, testFiletrunk, algName ) 56 | 57 | if not ( os.path.isfile(trainFilename) and os.path.isfile(testFilename) ): 58 | options = [ "python", dataGenScript, "-c", dataGenConfig, "-n", algName, \ 59 | "-v", vecFullfilename ] 60 | if sentimentWordFile: 61 | options.append("-s") 62 | options.append(sentimentWordFile) 63 | 64 | subprocess.call(options) 65 | 66 | if sentimentWordFile: 67 | modelFilename = "%s-bow%d.model" %( algName, bowSize ) 68 | outputFilename = "%s-bow%d.output" %( algName, bowSize ) 69 | else: 70 | modelFilename = "%s.model" %algName 71 | outputFilename = "%s.output" %algName 72 | 73 | print "Training using %s" %trainFilename 74 | subprocess.call( [ trainExePath, solverType, "-v10", trainFilename, modelFilename ] ) 75 | print "Testing using %s" %testFilename 76 | subprocess.call( [ predictExePath, testFilename, modelFilename, outputFilename ] ) 77 | print 78 | -------------------------------------------------------------------------------- /psdvec/tab2list.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | CAT = 1 4 | WORDS = 2 5 | 6 | FTAB = open(sys.argv[1]) 7 | FLIST = open(sys.argv[2], "w") 8 | FLIST.write("WORD\tTRUECLASS\n") 9 | 10 | state = CAT 11 | catnum = 0 12 | wordnum = 0 13 | 14 | for line in FTAB: 15 | line = line.strip() 16 | 17 | if not line and state == CAT: 18 | continue 19 | 20 | if state == CAT: 21 | cat = line.replace(" ", "-") 22 | state = WORDS 23 | catnum += 1 24 | continue 25 | if state == WORDS: 26 | if line: 27 | words = line.split(", ") 28 | for word in words: 29 | word = word.replace(",", "") 30 | FLIST.write( "%s\t%s\n" %(word, cat) ) 31 | wordnum += 1 32 | continue 33 | else: 34 | state = CAT 35 | continue 36 | 37 | print "%d words in %d categories written into %s" %(wordnum, catnum, sys.argv[2]) 38 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/EN-RG-65.txt: -------------------------------------------------------------------------------- 1 | gem jewel 3.94 2 | midday noon 3.94 3 | automobile car 3.92 4 | cemetery graveyard 3.88 5 | cushion pillow 3.84 6 | boy lad 3.82 7 | cock rooster 3.68 8 | implement tool 3.66 9 | forest woodland 3.65 10 | coast shore 3.60 11 | autograph signature 3.59 12 | journey voyage 3.58 13 | serf slave 3.46 14 | grin smile 3.46 15 | glass tumbler 3.45 16 | cord string 3.41 17 | hill mound 3.29 18 | magician wizard 3.21 19 | furnace 
stove 3.11 20 | asylum madhouse 3.04 21 | brother monk 2.74 22 | food fruit 2.69 23 | bird cock 2.63 24 | bird crane 2.63 25 | oracle sage 2.61 26 | sage wizard 2.46 27 | brother lad 2.41 28 | crane implement 2.37 29 | magician oracle 1.82 30 | glass jewel 1.78 31 | cemetery mound 1.69 32 | car journey 1.55 33 | hill woodland 1.48 34 | crane rooster 1.41 35 | furnace implement 1.37 36 | coast hill 1.26 37 | bird woodland 1.24 38 | shore voyage 1.22 39 | cemetery woodland 1.18 40 | food rooster 1.09 41 | forest graveyard 1.00 42 | lad wizard 0.99 43 | mound shore 0.97 44 | automobile cushion 0.97 45 | boy sage 0.96 46 | monk oracle 0.91 47 | shore woodland 0.90 48 | grin lad 0.88 49 | coast forest 0.85 50 | asylum cemetery 0.79 51 | monk slave 0.57 52 | cushion jewel 0.45 53 | boy rooster 0.44 54 | glass magician 0.44 55 | graveyard madhouse 0.44 56 | asylum monk 0.39 57 | asylum fruit 0.19 58 | grin implement 0.18 59 | mound stove 0.14 60 | automobile wizard 0.11 61 | autograph shore 0.06 62 | fruit furnace 0.05 63 | noon string 0.04 64 | rooster voyage 0.04 65 | cord smile 0.02 -------------------------------------------------------------------------------- /psdvec/testsets/ws/EN-TOEFL-80.txt: -------------------------------------------------------------------------------- 1 | enormously | tremendously | appropriately | uniquely | decidedly 2 | provisions | stipulations | jurisdictions | interrelations | interpretations 3 | haphazardly | randomly | linearly | dangerously | densely 4 | prominent | conspicuous | ancient | mysterious | battered 5 | zenith | pinnacle | completion | decline | outset 6 | flawed | imperfect | tiny | crude | lustrous 7 | urgently | desperately | conceivably | typically | tentatively 8 | consumed | eaten | bred | caught | supplied 9 | advent | coming | stability | financing | arrest 10 | concisely | succinctly | freely | positively | powerfully 11 | salutes | greetings | privileges | ceremonies | information 12 | solitary | alone | fearless | alert | restless 13 | hasten | accelerate | accompany | determine | permit 14 | perseverance | endurance | skill | generosity | disturbance 15 | fanciful | imaginative | logical | familiar | apparent 16 | showed | demonstrated | published | repeated | postponed 17 | constantly | continually | accidentally | rapidly | instantly 18 | issues | subjects | training | benefits | salaries 19 | furnish | supply | protect | advise | impress 20 | costly | expensive | beautiful | popular | complicated 21 | recognized | acknowledged | welcomed | depicted | successful 22 | spot | location | climate | latitude | sea 23 | make | earn | print | trade | borrow 24 | often | frequently | definitely | chemically | hardly 25 | easygoing | relaxed | boring | frontier | farming 26 | debate | argument | competition | war | election 27 | narrow | thin | poisonous | freezing | clear 28 | arranged | planned | discarded | studied | explained 29 | infinite | limitless | structural | unusual | relative 30 | showy | striking | prickly | incidental | entertaining 31 | levied | imposed | believed | correlated | requested 32 | deftly | skillfully | occasionally | prudently | humorously 33 | distribute | circulate | commercialize | acknowledge | research 34 | discrepancies | differences | weights | wavelengths | deposits 35 | prolific | productive | capable | serious | promising 36 | unmatched | unequaled | emulated | alienated | unrecognized 37 | peculiarly | uniquely | suspiciously | patriotically | partly 38 | hue | color | contrast | scent | glare 39 | 
hind | rear | curved | muscular | hairy 40 | highlight | accentuate | alter | restore | imitate 41 | hastily | hurriedly | habitually | shrewdly | chronologically 42 | temperate | mild | short | windy | cold 43 | grin | smile | exercise | rest | joke 44 | verbally | orally | verbosely | overtly | fittingly 45 | physician | doctor | chemist | nurse | pharmacist 46 | essentially | basically | eagerly | ordinarily | possibly 47 | keen | sharp | useful | simple | famous 48 | situated | positioned | rotating | emptying | isolated 49 | principal | major | exceptional | numerous | most 50 | slowly | gradually | effectively | continuously | rarely 51 | built | constructed | proposed | organized | financed 52 | tasks | jobs | customers | shops | materials 53 | unlikely | improbable | disagreeable | different | unpopular 54 | halfheartedly | apathetically | unconventionally | bipartisanly | customarily 55 | annals | chronicles | homes | trails | songs 56 | wildly | furiously | mysteriously | abruptly | distinctively 57 | hailed | acclaimed | judged | remembered | addressed 58 | command | mastery | observation | love | awareness 59 | concocted | devised | supervised | requested | cleaned 60 | prospective | potential | prominent | particular | prudent 61 | generally | broadly | controversially | accurately | descriptively 62 | sustained | prolonged | analyzed | refined | lowered 63 | perilous | dangerous | offensive | binding | exciting 64 | tranquillity | peacefulness | weariness | harshness | happiness 65 | dissipate | disperse | isolate | photograph | disguise 66 | primarily | chiefly | consistently | occasionally | cautiously 67 | colloquial | conversational | incorrect | recorded | misunderstood 68 | resolved | settled | examined | forgotten | publicized 69 | feasible | possible | evident | permitted | equitable 70 | expeditiously | rapidly | frequently | repeatedly | actually 71 | percentage | proportion | sample | profit | volume 72 | terminated | ended | posed | evaluated | postponed 73 | uniform | alike | sharp | hard | complex 74 | figure | solve | list | express | divide 75 | sufficient | enough | valuable | physiological | recent 76 | fashion | manner | fathom | craze | ration 77 | marketed | sold | sweetened | diluted | frozen 78 | bigger | larger | steadier | closer | better 79 | roots | origins | function | rituals | cure 80 | normally | ordinarily | periodically | haltingly | permanently 81 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/radinsky_mturk.txt: -------------------------------------------------------------------------------- 1 | episcopal russia 2.75 2 | water shortage 2.714285714 3 | horse wedding 2.266666667 4 | plays losses 3.2 5 | classics advertiser 2.25 6 | latin credit 2.0625 7 | ship ballots 2.3125 8 | mistake error 4.352941176 9 | disease plague 4.117647059 10 | sake shade 2.529411765 11 | saints observatory 1.9375 12 | treaty wheat 1.8125 13 | texas death 1.533333333 14 | republicans challenge 2.3125 15 | body peaceful 2.058823529 16 | admiralty intensity 2.647058824 17 | body improving 2.117647059 18 | heroin marijuana 3.375 19 | scottish commuters 2.6875 20 | apollo myth 2.6 21 | film cautious 2.125 22 | exhibition art 4.117647059 23 | chocolate candy 3.764705882 24 | republic candidate 2.8125 25 | gospel church 4.0625 26 | momentum desirable 2.4 27 | singapore sanctions 2.117647059 28 | english french 3.823529412 29 | exile church 2.941176471 30 | navy coordinator 2.235294118 31 | adventure flood 2.4375 32 
| radar plane 3.235294118 33 | pacific ocean 4.266666667 34 | scotch liquor 4.571428571 35 | kennedy gun 3 36 | garfield cat 2.866666667 37 | scale budget 3.5 38 | rhythm blues 3.071428571 39 | rich privileges 3.2 40 | navy withdrawn 1.571428571 41 | marble marching 2.615384615 42 | polo charged 2.125 43 | mark missing 2.333333333 44 | battleship army 4.235294118 45 | medium organization 2.5625 46 | pennsylvania writer 1.466666667 47 | hamlet poet 3.882352941 48 | battle prisoners 3.705882353 49 | guild smith 2.75 50 | mud soil 4.235294118 51 | crime assaulted 3.941176471 52 | mussolini stability 2.133333333 53 | lincoln division 2.4375 54 | slaves insured 2.2 55 | summer winter 4.375 56 | integration dignity 3.058823529 57 | money quota 2.5 58 | honolulu vacation 3.6875 59 | libya forged 2.461538462 60 | cheers musician 2.823529412 61 | session surprises 1.8125 62 | billion campaigning 2.571428571 63 | perjury soybean 2.0625 64 | forswearing perjury 3.3125 65 | costume halloween 3.4375 66 | bulgarian nurses 1.941176471 67 | costume ultimate 2.5 68 | faith judging 2.235294118 69 | france bridges 2.235294118 70 | citizenship casey 2.2 71 | recreation dish 1.4 72 | intelligence troubles 1.625 73 | germany worst 1.4375 74 | chaos death 2.75 75 | sydney hancock 2.857142857 76 | sabbath stevenson 2.214285714 77 | espionage passport 2.3125 78 | political today 1.6875 79 | pipe convertible 2 80 | scouting demonstrate 2.5625 81 | salute patterns 2.235294118 82 | reichstag germany 2.285714286 83 | radiation costumes 1.5625 84 | horace grief 1.764705882 85 | sale rental 3.470588235 86 | open close 4.058823529 87 | photography proving 2.375 88 | propaganda germany 1.705882353 89 | assassination forbes 2.071428571 90 | mirror duel 1.928571429 91 | probability hanging 2.058823529 92 | africa theater 1.5 93 | hell heaven 4.117647059 94 | mussolini italy 3 95 | composer beethoven 3.647058824 96 | minister forthcoming 1.764705882 97 | brussels sweden 3.176470588 98 | neutral parish 1.6 99 | emotion taxation 1.733333333 100 | louisiana simple 2 101 | quarantine disease 3 102 | cannon imprisoned 2.625 103 | bronze suspicion 2 104 | pearl interim 2.352941176 105 | artist paint 4.117647059 106 | relay family 2.0625 107 | art mortality 2.294117647 108 | food investment 2.25 109 | alt tenor 2.692307692 110 | catholics protestant 3.5625 111 | militia landlord 3.0625 112 | battle warships 4.176470588 113 | alcohol fleeing 2.5625 114 | coil ashes 3.117647059 115 | poland russia 4 116 | explosive builders 2.4375 117 | aeronautics plane 4.277777778 118 | charge sentence 3.133333333 119 | pet retiring 2 120 | drink alcohol 4.352941176 121 | stability species 2.375 122 | colonies depression 2 123 | easter preference 2.0625 124 | genius intellect 4.090909091 125 | diamond killed 1.555555556 126 | slavery african 2.8 127 | jurisdiction law 4.454545455 128 | saints repeal 1.555555556 129 | conspiracy campaign 2.166666667 130 | operator extracts 2.214285714 131 | physician action 2.153846154 132 | electronics guess 1.916666667 133 | slavery diamond 2.285714286 134 | quarterback sport 3.142857143 135 | assassination killed 4.285714286 136 | slavery klan 2.230769231 137 | heroin shoot 2.692307692 138 | birds disturbances 1.692307692 139 | palestinians turks 2.5 140 | citizenship court 2.5 141 | immunity violation 2.076923077 142 | alternative contend 2.461538462 143 | chile plates 2.692307692 144 | abraham stranger 1.846153846 145 | kansas city 3.769230769 146 | month year 3.857142857 147 | month day 3.857142857 148 | 
amateur actor 2.333333333 149 | afghanistan war 3.384615385 150 | transmission maxwell 2.25 151 | manchester ambitious 1.923076923 152 | program battered 1.928571429 153 | drawing music 2.583333333 154 | exile pledges 2.307692308 155 | adventure sixteen 1.538461538 156 | exile threats 2.166666667 157 | concrete wings 1.428571429 158 | seizure bishops 2 159 | submarine sea 3.857142857 160 | villa mayor 2.25 161 | trade farley 2.375 162 | nature forest 3.636363636 163 | chronicle young 1.9 164 | radical bishops 1.818181818 165 | pakistan radical 2.875 166 | fire water 4.266666667 167 | gossip nuisance 3.0625 168 | con examiner 2.266666667 169 | satellite space 3.75 170 | essay boston 2 171 | miniature statue 3.6 172 | spill pollution 3.5 173 | minister council 3.5625 174 | landscape mountain 3.5625 175 | religion remedy 2.5625 176 | ship storm 3.5 177 | college scientist 2.8125 178 | crystal oldest 2.5625 179 | afghanistan wise 2.066666667 180 | trinity religion 3.133333333 181 | homer odyssey 2.857142857 182 | parish clue 2.4375 183 | actress actor 4.0625 184 | patent professionals 2.375 185 | chaos horrible 3.066666667 186 | acre earthquake 2.125 187 | government immunity 2 188 | football justice 1.8 189 | gambling money 3.75 190 | corruption nervous 1.875 191 | cardinals villages 2.375 192 | life death 4.103448276 193 | artillery sanctions 2.428571429 194 | jerusalem murdered 2.357142857 195 | cell brick 3.285714286 196 | knowledge promoter 2.642857143 197 | adventure rails 2.571428571 198 | houston crash 2.357142857 199 | oxford subcommittee 2.642857143 200 | militia weapon 3.785714286 201 | manufacturer meat 1.857142857 202 | damages reaction 3.071428571 203 | sea fishing 4.357142857 204 | atomic clash 2.785714286 205 | broadcasting athletics 3 206 | mystery expedition 2.538461538 207 | kremlin soviets 3.166666667 208 | pig blaze 1.75 209 | riverside vietnamese 2.25 210 | bitter protective 1.923076923 211 | disaster announced 2.384615385 212 | pork blaze 2.230769231 213 | feet international 1.916666667 214 | radical uniform 2.5 215 | gossip condemned 2.692307692 216 | mozart wagner 3.166666667 217 | soccer boxing 3.4 218 | radical roles 2.75 219 | rescued slaying 3 220 | researchers tested 3.538461538 221 | sales season 2.307692308 222 | homeless refugees 3.615384615 223 | pakistan repair 1.75 224 | athens painting 2.294117647 225 | tiger woods 3.375 226 | aircraft plane 4.473684211 227 | solar carbon 2.842105263 228 | enterprise bankruptcy 2.5 229 | homer springfield 2.833333333 230 | coin awards 2.166666667 231 | rhodes native 2.25 232 | soccer curator 2.125 233 | gasoline stock 2.888888889 234 | guilt extended 2.105263158 235 | rapid singapore 1.764705882 236 | coin banker 3.631578947 237 | london correspondence 1.944444444 238 | pop sex 2.6 239 | medicine bread 2.176470588 240 | asia animal 1.555555556 241 | pop clubhouse 3.210526316 242 | nazi defensive 2.055555556 243 | earth poles 3.421052632 244 | thailand crowded 2.166666667 245 | day independence 3.473684211 246 | controversy pitch 2.375 247 | stock gasoline 3.166666667 248 | composers mozart 3.833333333 249 | tone piano 3.722222222 250 | paris chef 2.111111111 251 | profession responsible 2.722222222 252 | bankruptcy chronicle 2 253 | lebanon war 2.722222222 254 | israel terror 3.055555556 255 | angola military 2.941176471 256 | chemistry patients 2.357142857 257 | munich constitution 3.071428571 258 | piano theater 3.266666667 259 | poetry artist 3.8 260 | acre burned 1.769230769 261 | religion abortion 2.076923077 262 | 
jazz music 4.533333333 263 | government transportation 3 264 | color wine 2.533333333 265 | jackson quota 1.692307692 266 | shariff deputy 3.642857143 267 | boat negroes 2 268 | shooting sentenced 2.933333333 269 | republicans friedman 2.416666667 270 | politics brokerage 2.5 271 | russian stalin 3.357142857 272 | love philip 2.5 273 | nuclear plant 3.733333333 274 | jamaica queens 3.076923077 275 | dollar asylum 1.846153846 276 | bridge rowing 2.785714286 277 | berlin germany 4 278 | funeral death 4.714285714 279 | albert einstein 4.266666667 280 | gulf shore 3.857142857 281 | ecuador argentina 3.266666667 282 | britain france 3.714285714 283 | sports score 3.866666667 284 | socialism capitalism 3.785714286 285 | treaty peace 4.166666667 286 | exchange market 4.266666667 287 | marriage anniversary 4.333333333 288 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/ws353.txt: -------------------------------------------------------------------------------- 1 | love sex 6.77 2 | tiger cat 7.35 3 | tiger tiger 10.00 4 | book paper 7.46 5 | computer keyboard 7.62 6 | computer internet 7.58 7 | plane car 5.77 8 | train car 6.31 9 | telephone communication 7.50 10 | television radio 6.77 11 | media radio 7.42 12 | drug abuse 6.85 13 | bread butter 6.19 14 | cucumber potato 5.92 15 | doctor nurse 7.00 16 | professor doctor 6.62 17 | student professor 6.81 18 | smart student 4.62 19 | smart stupid 5.81 20 | company stock 7.08 21 | stock market 8.08 22 | stock phone 1.62 23 | stock CD 1.31 24 | stock jaguar 0.92 25 | stock egg 1.81 26 | fertility egg 6.69 27 | stock live 3.73 28 | stock life 0.92 29 | book library 7.46 30 | bank money 8.12 31 | wood forest 7.73 32 | money cash 9.15 33 | professor cucumber 0.31 34 | king cabbage 0.23 35 | king queen 8.58 36 | king rook 5.92 37 | bishop rabbi 6.69 38 | Jerusalem Israel 8.46 39 | Jerusalem Palestinian 7.65 40 | holy sex 1.62 41 | fuck sex 9.44 42 | Maradona football 8.62 43 | football soccer 9.03 44 | football basketball 6.81 45 | football tennis 6.63 46 | tennis racket 7.56 47 | Arafat peace 6.73 48 | Arafat terror 7.65 49 | Arafat Jackson 2.50 50 | law lawyer 8.38 51 | movie star 7.38 52 | movie popcorn 6.19 53 | movie critic 6.73 54 | movie theater 7.92 55 | physics proton 8.12 56 | physics chemistry 7.35 57 | space chemistry 4.88 58 | alcohol chemistry 5.54 59 | vodka gin 8.46 60 | vodka brandy 8.13 61 | drink car 3.04 62 | drink ear 1.31 63 | drink mouth 5.96 64 | drink eat 6.87 65 | baby mother 7.85 66 | drink mother 2.65 67 | car automobile 8.94 68 | gem jewel 8.96 69 | journey voyage 9.29 70 | boy lad 8.83 71 | coast shore 9.10 72 | asylum madhouse 8.87 73 | magician wizard 9.02 74 | midday noon 9.29 75 | furnace stove 8.79 76 | food fruit 7.52 77 | bird cock 7.10 78 | bird crane 7.38 79 | tool implement 6.46 80 | brother monk 6.27 81 | crane implement 2.69 82 | lad brother 4.46 83 | journey car 5.85 84 | monk oracle 5.00 85 | cemetery woodland 2.08 86 | food rooster 4.42 87 | coast hill 4.38 88 | forest graveyard 1.85 89 | shore woodland 3.08 90 | monk slave 0.92 91 | coast forest 3.15 92 | lad wizard 0.92 93 | chord smile 0.54 94 | glass magician 2.08 95 | noon string 0.54 96 | rooster voyage 0.62 97 | money dollar 8.42 98 | money cash 9.08 99 | money currency 9.04 100 | money wealth 8.27 101 | money property 7.57 102 | money possession 7.29 103 | money bank 8.50 104 | money deposit 7.73 105 | money withdrawal 6.88 106 | money laundering 5.65 107 | money operation 3.31 108 | tiger jaguar 
8.00 109 | tiger feline 8.00 110 | tiger carnivore 7.08 111 | tiger mammal 6.85 112 | tiger animal 7.00 113 | tiger organism 4.77 114 | tiger fauna 5.62 115 | tiger zoo 5.87 116 | psychology psychiatry 8.08 117 | psychology anxiety 7.00 118 | psychology fear 6.85 119 | psychology depression 7.42 120 | psychology clinic 6.58 121 | psychology doctor 6.42 122 | psychology Freud 8.21 123 | psychology mind 7.69 124 | psychology health 7.23 125 | psychology science 6.71 126 | psychology discipline 5.58 127 | psychology cognition 7.48 128 | planet star 8.45 129 | planet constellation 8.06 130 | planet moon 8.08 131 | planet sun 8.02 132 | planet galaxy 8.11 133 | planet space 7.92 134 | planet astronomer 7.94 135 | precedent example 5.85 136 | precedent information 3.85 137 | precedent cognition 2.81 138 | precedent law 6.65 139 | precedent collection 2.50 140 | precedent group 1.77 141 | precedent antecedent 6.04 142 | cup coffee 6.58 143 | cup tableware 6.85 144 | cup article 2.40 145 | cup artifact 2.92 146 | cup object 3.69 147 | cup entity 2.15 148 | cup drink 7.25 149 | cup food 5.00 150 | cup substance 1.92 151 | cup liquid 5.90 152 | jaguar cat 7.42 153 | jaguar car 7.27 154 | energy secretary 1.81 155 | secretary senate 5.06 156 | energy laboratory 5.09 157 | computer laboratory 6.78 158 | weapon secret 6.06 159 | FBI fingerprint 6.94 160 | FBI investigation 8.31 161 | investigation effort 4.59 162 | Mars water 2.94 163 | Mars scientist 5.63 164 | news report 8.16 165 | canyon landscape 7.53 166 | image surface 4.56 167 | discovery space 6.34 168 | water seepage 6.56 169 | sign recess 2.38 170 | Wednesday news 2.22 171 | mile kilometer 8.66 172 | computer news 4.47 173 | territory surface 5.34 174 | atmosphere landscape 3.69 175 | president medal 3.00 176 | war troops 8.13 177 | record number 6.31 178 | skin eye 6.22 179 | Japanese American 6.50 180 | theater history 3.91 181 | volunteer motto 2.56 182 | prejudice recognition 3.00 183 | decoration valor 5.63 184 | century year 7.59 185 | century nation 3.16 186 | delay racism 1.19 187 | delay news 3.31 188 | minister party 6.63 189 | peace plan 4.75 190 | minority peace 3.69 191 | attempt peace 4.25 192 | government crisis 6.56 193 | deployment departure 4.25 194 | deployment withdrawal 5.88 195 | energy crisis 5.94 196 | announcement news 7.56 197 | announcement effort 2.75 198 | stroke hospital 7.03 199 | disability death 5.47 200 | victim emergency 6.47 201 | treatment recovery 7.91 202 | journal association 4.97 203 | doctor personnel 5.00 204 | doctor liability 5.19 205 | liability insurance 7.03 206 | school center 3.44 207 | reason hypertension 2.31 208 | reason criterion 5.91 209 | hundred percent 7.38 210 | Harvard Yale 8.13 211 | hospital infrastructure 4.63 212 | death row 5.25 213 | death inmate 5.03 214 | lawyer evidence 6.69 215 | life death 7.88 216 | life term 4.50 217 | word similarity 4.75 218 | board recommendation 4.47 219 | governor interview 3.25 220 | OPEC country 5.63 221 | peace atmosphere 3.69 222 | peace insurance 2.94 223 | territory kilometer 5.28 224 | travel activity 5.00 225 | competition price 6.44 226 | consumer confidence 4.13 227 | consumer energy 4.75 228 | problem airport 2.38 229 | car flight 4.94 230 | credit card 8.06 231 | credit information 5.31 232 | hotel reservation 8.03 233 | grocery money 5.94 234 | registration arrangement 6.00 235 | arrangement accommodation 5.41 236 | month hotel 1.81 237 | type kind 8.97 238 | arrival hotel 6.00 239 | bed closet 6.72 240 | closet clothes 8.00 241 | 
situation conclusion 4.81 242 | situation isolation 3.88 243 | impartiality interest 5.16 244 | direction combination 2.25 245 | street place 6.44 246 | street avenue 8.88 247 | street block 6.88 248 | street children 4.94 249 | listing proximity 2.56 250 | listing category 6.38 251 | cell phone 7.81 252 | production hike 1.75 253 | benchmark index 4.25 254 | media trading 3.88 255 | media gain 2.88 256 | dividend payment 7.63 257 | dividend calculation 6.48 258 | calculation computation 8.44 259 | currency market 7.50 260 | OPEC oil 8.59 261 | oil stock 6.34 262 | announcement production 3.38 263 | announcement warning 6.00 264 | profit warning 3.88 265 | profit loss 7.63 266 | dollar yen 7.78 267 | dollar buck 9.22 268 | dollar profit 7.38 269 | dollar loss 6.09 270 | computer software 8.50 271 | network hardware 8.31 272 | phone equipment 7.13 273 | equipment maker 5.91 274 | luxury car 6.47 275 | five month 3.38 276 | report gain 3.63 277 | investor earning 7.13 278 | liquid water 7.89 279 | baseball season 5.97 280 | game victory 7.03 281 | game team 7.69 282 | marathon sprint 7.47 283 | game series 6.19 284 | game defeat 6.97 285 | seven series 3.56 286 | seafood sea 7.47 287 | seafood food 8.34 288 | seafood lobster 8.70 289 | lobster food 7.81 290 | lobster wine 5.70 291 | food preparation 6.22 292 | video archive 6.34 293 | start year 4.06 294 | start match 4.47 295 | game round 5.97 296 | boxing round 7.61 297 | championship tournament 8.36 298 | fighting defeating 7.41 299 | line insurance 2.69 300 | day summer 3.94 301 | summer drought 7.16 302 | summer nature 5.63 303 | day dawn 7.53 304 | nature environment 8.31 305 | environment ecology 8.81 306 | nature man 6.25 307 | man woman 8.30 308 | man governor 5.25 309 | murder manslaughter 8.53 310 | soap opera 7.94 311 | opera performance 6.88 312 | life lesson 5.94 313 | focus life 4.06 314 | production crew 6.25 315 | television film 7.72 316 | lover quarrel 6.19 317 | viewer serial 2.97 318 | possibility girl 1.94 319 | population development 3.75 320 | morality importance 3.31 321 | morality marriage 3.69 322 | Mexico Brazil 7.44 323 | gender equality 6.41 324 | change attitude 5.44 325 | family planning 6.25 326 | opera industry 2.63 327 | sugar approach 0.88 328 | practice institution 3.19 329 | ministry culture 4.69 330 | problem challenge 6.75 331 | size prominence 5.31 332 | country citizen 7.31 333 | planet people 5.75 334 | development issue 3.97 335 | experience music 3.47 336 | music project 3.63 337 | glass metal 5.56 338 | aluminum metal 7.83 339 | chance credibility 3.88 340 | exhibit memorabilia 5.31 341 | concert virtuoso 6.81 342 | rock jazz 7.59 343 | museum theater 7.19 344 | observation architecture 4.38 345 | space world 6.53 346 | preservation world 6.19 347 | admission ticket 7.69 348 | shower thunderstorm 6.31 349 | shower flood 6.03 350 | weather forecast 8.34 351 | disaster area 6.25 352 | governor office 6.34 353 | architecture century 3.78 354 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/ws353_relatedness.txt: -------------------------------------------------------------------------------- 1 | computer keyboard 7.62 2 | Jerusalem Israel 8.46 3 | planet galaxy 8.11 4 | canyon landscape 7.53 5 | OPEC country 5.63 6 | day summer 3.94 7 | day dawn 7.53 8 | country citizen 7.31 9 | planet people 5.75 10 | environment ecology 8.81 11 | Maradona football 8.62 12 | OPEC oil 8.59 13 | money bank 8.50 14 | computer software 8.50 15 | law lawyer 8.38 16 | 
weather forecast 8.34 17 | network hardware 8.31 18 | nature environment 8.31 19 | FBI investigation 8.31 20 | money wealth 8.27 21 | psychology Freud 8.21 22 | news report 8.16 23 | war troops 8.13 24 | physics proton 8.12 25 | bank money 8.12 26 | stock market 8.08 27 | planet constellation 8.06 28 | credit card 8.06 29 | hotel reservation 8.03 30 | closet clothes 8.00 31 | soap opera 7.94 32 | planet astronomer 7.94 33 | planet space 7.92 34 | movie theater 7.92 35 | treatment recovery 7.91 36 | baby mother 7.85 37 | money deposit 7.73 38 | television film 7.72 39 | psychology mind 7.69 40 | game team 7.69 41 | admission ticket 7.69 42 | Jerusalem Palestinian 7.65 43 | Arafat terror 7.65 44 | boxing round 7.61 45 | computer internet 7.58 46 | money property 7.57 47 | tennis racket 7.56 48 | telephone communication 7.50 49 | currency market 7.50 50 | psychology cognition 7.48 51 | seafood sea 7.47 52 | book paper 7.46 53 | book library 7.46 54 | psychology depression 7.42 55 | fighting defeating 7.41 56 | movie star 7.38 57 | hundred percent 7.38 58 | dollar profit 7.38 59 | money possession 7.29 60 | cup drink 7.25 61 | psychology health 7.23 62 | summer drought 7.16 63 | investor earning 7.13 64 | company stock 7.08 65 | stroke hospital 7.03 66 | liability insurance 7.03 67 | game victory 7.03 68 | psychology anxiety 7.00 69 | game defeat 6.97 70 | FBI fingerprint 6.94 71 | money withdrawal 6.88 72 | psychology fear 6.85 73 | drug abuse 6.85 74 | concert virtuoso 6.81 75 | computer laboratory 6.78 76 | love sex 6.77 77 | problem challenge 6.75 78 | movie critic 6.73 79 | Arafat peace 6.73 80 | bed closet 6.72 81 | lawyer evidence 6.69 82 | fertility egg 6.69 83 | precedent law 6.65 84 | minister party 6.63 85 | psychology clinic 6.58 86 | cup coffee 6.58 87 | water seepage 6.56 88 | government crisis 6.56 89 | space world 6.53 90 | dividend calculation 6.48 91 | victim emergency 6.47 92 | luxury car 6.47 93 | tool implement 6.46 94 | competition price 6.44 95 | psychology doctor 6.42 96 | gender equality 6.41 97 | listing category 6.38 98 | video archive 6.34 99 | oil stock 6.34 100 | governor office 6.34 101 | discovery space 6.34 102 | record number 6.31 103 | brother monk 6.27 104 | production crew 6.25 105 | nature man 6.25 106 | family planning 6.25 107 | disaster area 6.25 108 | food preparation 6.22 109 | preservation world 6.19 110 | movie popcorn 6.19 111 | lover quarrel 6.19 112 | game series 6.19 113 | dollar loss 6.09 114 | weapon secret 6.06 115 | shower flood 6.03 116 | registration arrangement 6.00 117 | arrival hotel 6.00 118 | announcement warning 6.00 119 | game round 5.97 120 | baseball season 5.97 121 | drink mouth 5.96 122 | life lesson 5.94 123 | grocery money 5.94 124 | energy crisis 5.94 125 | reason criterion 5.91 126 | equipment maker 5.91 127 | cup liquid 5.90 128 | deployment withdrawal 5.88 129 | tiger zoo 5.87 130 | journey car 5.85 131 | money laundering 5.65 132 | summer nature 5.63 133 | decoration valor 5.63 134 | Mars scientist 5.63 135 | alcohol chemistry 5.54 136 | disability death 5.47 137 | change attitude 5.44 138 | arrangement accommodation 5.41 139 | territory surface 5.34 140 | size prominence 5.31 141 | exhibit memorabilia 5.31 142 | credit information 5.31 143 | territory kilometer 5.28 144 | death row 5.25 145 | doctor liability 5.19 146 | impartiality interest 5.16 147 | energy laboratory 5.09 148 | secretary senate 5.06 149 | death inmate 5.03 150 | monk oracle 5.00 151 | cup food 5.00 152 | journal association 4.97 153 | street children 
4.94 154 | car flight 4.94 155 | space chemistry 4.88 156 | situation conclusion 4.81 157 | word similarity 4.75 158 | peace plan 4.75 159 | consumer energy 4.75 160 | ministry culture 4.69 161 | smart student 4.62 162 | investigation effort 4.59 163 | image surface 4.56 164 | life term 4.50 165 | start match 4.47 166 | computer news 4.47 167 | board recommendation 4.47 168 | lad brother 4.46 169 | observation architecture 4.38 170 | coast hill 4.38 171 | deployment departure 4.25 172 | benchmark index 4.25 173 | attempt peace 4.25 174 | consumer confidence 4.13 175 | start year 4.06 176 | focus life 4.06 177 | development issue 3.97 178 | theater history 3.91 179 | situation isolation 3.88 180 | profit warning 3.88 181 | media trading 3.88 182 | chance credibility 3.88 183 | precedent information 3.85 184 | architecture century 3.78 185 | population development 3.75 186 | stock live 3.73 187 | peace atmosphere 3.69 188 | morality marriage 3.69 189 | minority peace 3.69 190 | atmosphere landscape 3.69 191 | report gain 3.63 192 | music project 3.63 193 | seven series 3.56 194 | experience music 3.47 195 | school center 3.44 196 | five month 3.38 197 | announcement production 3.38 198 | morality importance 3.31 199 | money operation 3.31 200 | delay news 3.31 201 | governor interview 3.25 202 | practice institution 3.19 203 | century nation 3.16 204 | coast forest 3.15 205 | shore woodland 3.08 206 | drink car 3.04 207 | president medal 3.00 208 | prejudice recognition 3.00 209 | viewer serial 2.97 210 | peace insurance 2.94 211 | Mars water 2.94 212 | media gain 2.88 213 | precedent cognition 2.81 214 | announcement effort 2.75 215 | line insurance 2.69 216 | crane implement 2.69 217 | drink mother 2.65 218 | opera industry 2.63 219 | volunteer motto 2.56 220 | listing proximity 2.56 221 | precedent collection 2.50 222 | cup article 2.40 223 | sign recess 2.38 224 | problem airport 2.38 225 | reason hypertension 2.31 226 | direction combination 2.25 227 | Wednesday news 2.22 228 | glass magician 2.08 229 | cemetery woodland 2.08 230 | possibility girl 1.94 231 | cup substance 1.92 232 | forest graveyard 1.85 233 | stock egg 1.81 234 | month hotel 1.81 235 | energy secretary 1.81 236 | precedent group 1.77 237 | production hike 1.75 238 | stock phone 1.62 239 | holy sex 1.62 240 | stock CD 1.31 241 | drink ear 1.31 242 | delay racism 1.19 243 | stock life 0.92 244 | stock jaguar 0.92 245 | monk slave 0.92 246 | lad wizard 0.92 247 | sugar approach 0.88 248 | rooster voyage 0.62 249 | noon string 0.54 250 | chord smile 0.54 251 | professor cucumber 0.31 252 | king cabbage 0.23 253 | -------------------------------------------------------------------------------- /psdvec/testsets/ws/ws353_similarity.txt: -------------------------------------------------------------------------------- 1 | tiger cat 7.35 2 | tiger tiger 10.00 3 | plane car 5.77 4 | train car 6.31 5 | television radio 6.77 6 | media radio 7.42 7 | bread butter 6.19 8 | cucumber potato 5.92 9 | doctor nurse 7.00 10 | professor doctor 6.62 11 | student professor 6.81 12 | smart stupid 5.81 13 | wood forest 7.73 14 | money cash 9.15 15 | king queen 8.58 16 | king rook 5.92 17 | bishop rabbi 6.69 18 | fuck sex 9.44 19 | football soccer 9.03 20 | football basketball 6.81 21 | football tennis 6.63 22 | Arafat Jackson 2.50 23 | physics chemistry 7.35 24 | vodka gin 8.46 25 | vodka brandy 8.13 26 | drink eat 6.87 27 | car automobile 8.94 28 | gem jewel 8.96 29 | journey voyage 9.29 30 | boy lad 8.83 31 | coast shore 9.10 32 | asylum 
madhouse 8.87 33 | magician wizard 9.02 34 | midday noon 9.29 35 | furnace stove 8.79 36 | food fruit 7.52 37 | bird cock 7.10 38 | bird crane 7.38 39 | food rooster 4.42 40 | money dollar 8.42 41 | money currency 9.04 42 | tiger jaguar 8.00 43 | tiger feline 8.00 44 | tiger carnivore 7.08 45 | tiger mammal 6.85 46 | tiger animal 7.00 47 | tiger organism 4.77 48 | tiger fauna 5.62 49 | psychology psychiatry 8.08 50 | psychology science 6.71 51 | psychology discipline 5.58 52 | planet star 8.45 53 | planet moon 8.08 54 | planet sun 8.02 55 | precedent example 5.85 56 | precedent antecedent 6.04 57 | cup tableware 6.85 58 | cup artifact 2.92 59 | cup object 3.69 60 | cup entity 2.15 61 | jaguar cat 7.42 62 | jaguar car 7.27 63 | mile kilometer 8.66 64 | skin eye 6.22 65 | Japanese American 6.50 66 | century year 7.59 67 | announcement news 7.56 68 | doctor personnel 5.00 69 | Harvard Yale 8.13 70 | hospital infrastructure 4.63 71 | life death 7.88 72 | travel activity 5.00 73 | type kind 8.97 74 | street place 6.44 75 | street avenue 8.88 76 | street block 6.88 77 | cell phone 7.81 78 | dividend payment 7.63 79 | calculation computation 8.44 80 | profit loss 7.63 81 | dollar yen 7.78 82 | dollar buck 9.22 83 | phone equipment 7.13 84 | liquid water 7.89 85 | marathon sprint 7.47 86 | seafood food 8.34 87 | seafood lobster 8.70 88 | lobster food 7.81 89 | lobster wine 5.70 90 | championship tournament 8.36 91 | man woman 8.30 92 | man governor 5.25 93 | murder manslaughter 8.53 94 | opera performance 6.88 95 | Mexico Brazil 7.44 96 | glass metal 5.56 97 | aluminum metal 7.83 98 | rock jazz 7.59 99 | museum theater 7.19 100 | shower thunderstorm 6.31 101 | monk oracle 5.00 102 | cup food 5.00 103 | journal association 4.97 104 | street children 4.94 105 | car flight 4.94 106 | space chemistry 4.88 107 | situation conclusion 4.81 108 | word similarity 4.75 109 | peace plan 4.75 110 | consumer energy 4.75 111 | ministry culture 4.69 112 | smart student 4.62 113 | investigation effort 4.59 114 | image surface 4.56 115 | life term 4.50 116 | start match 4.47 117 | computer news 4.47 118 | board recommendation 4.47 119 | lad brother 4.46 120 | observation architecture 4.38 121 | coast hill 4.38 122 | deployment departure 4.25 123 | benchmark index 4.25 124 | attempt peace 4.25 125 | consumer confidence 4.13 126 | start year 4.06 127 | focus life 4.06 128 | development issue 3.97 129 | theater history 3.91 130 | situation isolation 3.88 131 | profit warning 3.88 132 | media trading 3.88 133 | chance credibility 3.88 134 | precedent information 3.85 135 | architecture century 3.78 136 | population development 3.75 137 | stock live 3.73 138 | peace atmosphere 3.69 139 | morality marriage 3.69 140 | minority peace 3.69 141 | atmosphere landscape 3.69 142 | report gain 3.63 143 | music project 3.63 144 | seven series 3.56 145 | experience music 3.47 146 | school center 3.44 147 | five month 3.38 148 | announcement production 3.38 149 | morality importance 3.31 150 | money operation 3.31 151 | delay news 3.31 152 | governor interview 3.25 153 | practice institution 3.19 154 | century nation 3.16 155 | coast forest 3.15 156 | shore woodland 3.08 157 | drink car 3.04 158 | president medal 3.00 159 | prejudice recognition 3.00 160 | viewer serial 2.97 161 | peace insurance 2.94 162 | Mars water 2.94 163 | media gain 2.88 164 | precedent cognition 2.81 165 | announcement effort 2.75 166 | line insurance 2.69 167 | crane implement 2.69 168 | drink mother 2.65 169 | opera industry 2.63 170 | volunteer motto 
2.56 171 | listing proximity 2.56 172 | precedent collection 2.50 173 | cup article 2.40 174 | sign recess 2.38 175 | problem airport 2.38 176 | reason hypertension 2.31 177 | direction combination 2.25 178 | Wednesday news 2.22 179 | glass magician 2.08 180 | cemetery woodland 2.08 181 | possibility girl 1.94 182 | cup substance 1.92 183 | forest graveyard 1.85 184 | stock egg 1.81 185 | month hotel 1.81 186 | energy secretary 1.81 187 | precedent group 1.77 188 | production hike 1.75 189 | stock phone 1.62 190 | holy sex 1.62 191 | stock CD 1.31 192 | drink ear 1.31 193 | delay racism 1.19 194 | stock life 0.92 195 | stock jaguar 0.92 196 | monk slave 0.92 197 | lad wizard 0.92 198 | sugar approach 0.88 199 | rooster voyage 0.62 200 | noon string 0.54 201 | chord smile 0.54 202 | professor cucumber 0.31 203 | king cabbage 0.23 204 | -------------------------------------------------------------------------------- /psdvec/topwordsInList.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | from utils import * 5 | import pdb 6 | import time 7 | import os 8 | import json 9 | 10 | def usage(): 11 | print """Usage:\n topsentwords.py -c config_file -l f1,f2... -o out_file -n count 12 | Options: 13 | config_file: Same config file used by corpus2liblinear.py, 14 | which specifying multiple document directories. 15 | f1,f2: Files containg lists of interesting words. 16 | out_file: Output file to save top interesting words. 17 | Default: 'topwords.txt' 18 | count: Top k words that will be counted. Default: 1000. 19 | """ 20 | 21 | def parseConfigFile(configFilename): 22 | CONF = open(configFilename) 23 | dir_configs = [] 24 | for line in CONF: 25 | line = line.strip() 26 | dir_config = json.loads(line) 27 | dir_configs.append(dir_config) 28 | return dir_configs 29 | 30 | def getListWordCount( docPath, word2freq ): 31 | DOC = open(docPath) 32 | doc = DOC.read() 33 | wordsInSentences, wc = extractSentenceWords(doc, 1) 34 | 35 | interestingWc = 0 36 | for sentence in wordsInSentences: 37 | for w in sentence: 38 | w = w.lower() 39 | if w in word2freq: 40 | word2freq[w] += 1 41 | interestingWc += 1 42 | 43 | return wc, interestingWc 44 | 45 | def processDir( docDir, word2freq ): 46 | print "Processing '%s'" %( docDir ) 47 | 48 | filecount = 0 49 | totalwc = 0 50 | totalInterestingWc = 0 51 | 52 | for filename in os.listdir(docDir): 53 | docPath = docDir + "/" + filename 54 | wc, interestingWc = getListWordCount( docPath, word2freq ) 55 | 56 | totalwc += wc 57 | totalInterestingWc += interestingWc 58 | filecount += 1 59 | 60 | if filecount % 500 == 0: 61 | print "\r%d\r" %filecount, 62 | 63 | print "%d files scanned, totally %d words, %d are interesting" %( filecount, totalwc, totalInterestingWc ) 64 | 65 | def main(): 66 | topword_cutoff = 1000 67 | 68 | configFilename = None 69 | listFilenames = None 70 | outFilename = "topwords.txt" 71 | 72 | try: 73 | opts, args = getopt.getopt(sys.argv[1:],"c:l:o:n:h") 74 | 75 | for opt, arg in opts: 76 | if opt == '-c': 77 | configFilename = arg 78 | if opt == '-o': 79 | outFilename = arg 80 | if opt == '-n': 81 | topword_cutoff = int(arg) 82 | if opt == '-l': 83 | listFilenames = arg.split(",") 84 | if opt == '-h': 85 | usage() 86 | sys.exit(0) 87 | 88 | except getopt.GetoptError, e: 89 | if len(e.args) == 1: 90 | print "Option error: %s" %e.args[0] 91 | usage() 92 | sys.exit(2) 93 | 94 | if not configFilename or not listFilenames: 95 | usage() 96 | sys.exit(2) 97 | 98 | 
dir_configs = parseConfigFile(configFilename) 99 | 100 | word2freq = {} 101 | 102 | totalwc = 0 103 | for listFilename in listFilenames: 104 | filewc = 0 105 | LIST = open(listFilename) 106 | for line in LIST: 107 | if line[0] == ';': 108 | continue 109 | line = line.strip() 110 | if not line: 111 | continue 112 | word2freq[line] = 0 113 | filewc += 1 114 | totalwc += 1 115 | print "%d words loaded from '%s'" %( filewc, listFilename ) 116 | 117 | print "%d words loaded from %d files" %( totalwc, len(listFilenames) ) 118 | 119 | for conf in dir_configs: 120 | processDir( conf['docDir'], word2freq ) 121 | 122 | words = sorted( word2freq.keys(), key=lambda w: word2freq[w], reverse=True ) 123 | topwords = words[:topword_cutoff] 124 | OUT = open(outFilename, "w") 125 | for w in topwords: 126 | OUT.write( "%s\t%d\n" %( w, word2freq[w] ) ) 127 | print "%d words written into '%s'" %( len(topwords), outFilename ) 128 | 129 | if __name__ == '__main__': 130 | main() 131 | -------------------------------------------------------------------------------- /psdvec/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/psdvec/utils.py -------------------------------------------------------------------------------- /psdvec/vecnorms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # this simple script is to find patterns of the norms (L1) of the learned embeddings 4 | from utils import * 5 | import sys 6 | import operator 7 | import os 8 | import getopt 9 | import math 10 | import pdb 11 | 12 | def usage(): 13 | print "Usage: vecnorms.py [-s -1 first_block_count -2 second_block_count ] embedding_filename" 14 | 15 | def expectation(value_probs): 16 | accuProb = 0 17 | accuExp = 0 18 | for v, p in value_probs: 19 | accuExp += v * p 20 | accuProb += p 21 | 22 | return accuExp / accuProb 23 | 24 | def var_div(value_probs): 25 | expect = expectation(value_probs) 26 | accuVar = 0 27 | accuProb = 0 28 | for v, p in value_probs: 29 | accuVar += (v - expect)**2 * p 30 | accuProb += p 31 | var = accuVar / accuProb 32 | div = math.sqrt(var) 33 | return var, div 34 | 35 | if len(sys.argv) == 1: 36 | usage() 37 | sys.exit(1) 38 | 39 | doSort = False 40 | first_block_count = -1 41 | second_block_count = -1 42 | unigramFilename = 'top1grams-wiki.txt' 43 | 44 | try: 45 | opts, args = getopt.getopt(sys.argv[1:],"s1:2:") 46 | if len(args) != 1: 47 | raise getopt.GetoptError("") 48 | embeddingFilename = args[0] 49 | for opt, arg in opts: 50 | if opt == '-s': 51 | doSort = True 52 | if opt == '-1': 53 | first_block_count = int(arg) 54 | print 'First block: 1-%d' %first_block_count 55 | if opt == '-2': 56 | second_block_count = int(arg) 57 | print 'Second block: %d-%d' %(first_block_count, second_block_count) 58 | if opt == '-u': 59 | # unigram file is used to get a full list of words, 60 | # and also to sort the absent words by their frequencies 61 | unigramFilename = arg 62 | 63 | except getopt.GetoptError: 64 | usage() 65 | sys.exit(2) 66 | 67 | vocab_prob = loadUnigramFile(unigramFilename) 68 | V, vocab, word2id, skippedWords = load_embeddings( embeddingFilename, second_block_count ) 69 | warning("\nCompute norms...") 70 | 71 | word2norm = {} 72 | wordnorms = [] 73 | word_probs1 = [] 74 | word_probs2 = [] 75 | 76 | for i in xrange( len(V) ): 77 | w = vocab[i] 78 | if w not in vocab_prob: 79 | warning( "%s not in vocab, skip\n" %w ) 
80 | continue 81 | 82 | mag = norm1( V[i] ) 83 | word2norm[w] = mag 84 | prob = vocab_prob[w][2] 85 | wordnorms.append( [ w, mag ] ) 86 | if i < first_block_count: 87 | word_probs1.append( [ mag, prob ] ) 88 | elif i < second_block_count: 89 | word_probs2.append( [ mag, prob ] ) 90 | 91 | warning("Done\n") 92 | 93 | if len(word_probs1) > 0: 94 | var1, div1 = var_div(word_probs1) 95 | expect = expectation(word_probs1) 96 | print "First block: %d words, exp: %.2f, var: %.2f, div: %.2f" %( len(word_probs1), expect, var1, div1 ) 97 | if len(word_probs2) > 0: 98 | var2, div2 = var_div(word_probs2) 99 | expect = expectation(word_probs2) 100 | print "Second block: %d words, exp: %.2f, var: %.2f, div: %.2f" %( len(word_probs2), expect, var2, div2 ) 101 | 102 | 103 | if doSort: 104 | warning("Done\nSorting words ascendingly by norm...") 105 | # sort ascendingly by the norm length 106 | sorted_wordnorms = sorted( wordnorms, key=operator.itemgetter(1) ) 107 | wordnorms = sorted_wordnorms 108 | 109 | embeddingFilename = os.path.basename(embeddingFilename) 110 | embeddingFilename = os.path.splitext(embeddingFilename)[0] 111 | 112 | normFilename = "norms_" + embeddingFilename + "-%d.txt" %( len(V) ) 113 | 114 | warning( "Save norms into %s\n" %normFilename ) 115 | NORM = open(normFilename, "w") 116 | 117 | wc = 0 118 | for word_norm in wordnorms: 119 | word, norm = word_norm 120 | NORM.write( "%i %s: %.2f\n" %( word2id[word], word, norm ) ) 121 | wc += 1 122 | 123 | warning( "%d words saved\n" %wc ) 124 | -------------------------------------------------------------------------------- /psdvec/xml2corpus.pl: -------------------------------------------------------------------------------- 1 | use strict; 2 | use warnings; 3 | use XML::LibXML; 4 | use File::Find; 5 | 6 | my $rootdir = "D:/corpus/rcv1/"; 7 | my $fc = 0; 8 | my $totalbytes = 0; 9 | 10 | find({ wanted => \&process_file, no_chdir => 1 }, $rootdir); 11 | my $totalMB = int( $totalbytes / 1024 / 1024 ); 12 | print STDERR "$fc files processed, totally $totalMB MB\n"; 13 | 14 | sub process_file { 15 | if ( /\.xml$/ ) { 16 | my $doc = XML::LibXML->load_xml(location => $_); 17 | for my $textnode ( $doc->findnodes('/newsitem/text') ){ 18 | print $textnode->textContent(); 19 | $totalbytes += length( $textnode->textContent() ); 20 | $totalMB = int( $totalbytes / 1024 / 1024 ); 21 | } 22 | $fc++; 23 | if( $fc % 500 == 0 ){ 24 | print STDERR "\r$fc $totalMB\r"; 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /reuters.bat: -------------------------------------------------------------------------------- 1 | python topicExp.py -s reuters train 2 | python topicExp.py -i reuters-train-5770-sep91-em150-best.topic.vec reuters train,test 3 | python classEval.py reuters topicprop 4 | python classEval.py reuters topic-wvavg 5 | -------------------------------------------------------------------------------- /snippet2topic.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import getopt 3 | import sys 4 | import pdb 5 | import os 6 | from topicvecDir import topicvecDir 7 | from utils import * 8 | 9 | customStopwords = "based via using approach learning multi algorithm algorithms" 10 | 11 | config = dict( snip_filenames = None, 12 | short_name = None, 13 | unigramFilename = "top1grams-wiki.txt", 14 | word_vec_file = "25000-180000-500-BLK-8.0.vec", 15 | K = 20, 16 | N0 = 500, 17 | max_l = 5, 18 | init_l = 1, 19 | max_grad_norm = 0, 20 | # cap the sum 
of Em when updating topic embeddings 21 | # to avoid too big gradients 22 | grad_scale_Em_base = 2500, 23 | topW = 30, 24 | topTopicMassFracPrintThres = 0.1, 25 | alpha0 = 0.1, 26 | alpha1 = 0.1, 27 | iniDelta = 0.1, 28 | MAX_EM_ITERS = 100, 29 | topicDiff_tolerance = 2e-3, 30 | printTopics_iterNum = 10, 31 | zero_topic0 = True, 32 | useDrdtApprox = False, 33 | customStopwords = customStopwords, 34 | remove_stop = True, 35 | normalize_vecs = False, 36 | # shift all embeddings in a document, so that their average is 0 37 | rebase_vecs = True, 38 | rebase_norm_thres = 0.2, 39 | evalKmeans = False, 40 | verbose = 1, 41 | seed = 0 42 | ) 43 | 44 | def usage(): 45 | print """snippet2topic.py [ -v vec_file -a alpha ... ] snip_file 46 | Options: 47 | -k: Number of topic embeddings to extract. Default: 20 48 | -v: Existing embedding file of all words. 49 | -r: Existing residual file of core words. 50 | -a: Hyperparameter alpha. Default: 0.1. 51 | -i: Number of iterations of the EM procedure. Default: 100 52 | -u: Unigram file, to obtain unigram probs. 53 | -l: Magnitude of topic embeddings. 54 | -A: Append to the old log file. 55 | -s: Seed the random number generator with x. Used to repeat experiments 56 | -n: Nickname (short name) for the snip_file 57 | """ 58 | 59 | def getOptions(): 60 | global config 61 | 62 | try: 63 | opts, args = getopt.getopt(sys.argv[1:],"k:v:a:i:u:l:s:n:r:Ah") 64 | if len(args) != 1: 65 | raise getopt.GetoptError("") 66 | config['snip_filename'] = args[0] 67 | 68 | for opt, arg in opts: 69 | if opt == '-k': 70 | config['K'] = int(arg) 71 | if opt == '-v': 72 | config['word_vec_file'] = arg 73 | if opt == '-a': 74 | config['alpha1'] = float(arg) 75 | if opt == '-i': 76 | config['MAX_EM_ITERS'] = int(arg) 77 | if opt == '-u': 78 | config['unigramFilename'] = arg 79 | if opt == '-l': 80 | config['max_l'] = int(arg) 81 | if opt == '-s': 82 | config['seed'] = int(arg) 83 | if opt == '-A': 84 | config['appendLogfile'] = True 85 | if opt == '-n': 86 | config['short_name'] = arg 87 | if opt == '-r': 88 | config['useDrdtApprox'] = True 89 | if opt == '-h': 90 | usage() 91 | sys.exit(0) 92 | 93 | basename = os.path.basename(args[0]) 94 | if config['short_name']: 95 | config['logfilename'] = config['short_name'] 96 | elif len(args) > 1: 97 | config['logfilename'] = "(%d)%s" %( len(args), basename ) 98 | else: 99 | config['logfilename'] = basename 100 | 101 | except getopt.GetoptError: 102 | usage() 103 | sys.exit(2) 104 | 105 | return config 106 | 107 | def main(): 108 | 109 | config = getOptions() 110 | snip_filename = config['snip_filename'] 111 | snips_words = [] 112 | snips_name = [] 113 | 114 | with open(snip_filename) as DOC: 115 | snip_lines = [] 116 | snipcount = 0 117 | snips_wc = 0 118 | for line in DOC: 119 | line = line.strip() 120 | if line: 121 | snip_lines.append(line) 122 | else: 123 | sniptext = " ".join(snip_lines) 124 | wordsInSentences, wc = extractSentenceWords(sniptext, remove_punc="iso-8859-1") 125 | snips_wc += wc 126 | snipcount += 1 127 | snips_words.append(wordsInSentences) 128 | snips_name.append( "%s-row%d" %(snip_filename, snipcount) ) 129 | 130 | snipfile_avgwc = snips_wc * 1.0 / snipcount 131 | print "%d words extracted from %d snippets in '%s'.
Avg %.1f words each row" %( snips_wc, 132 | snipcount, snip_filename, snipfile_avgwc ) 133 | 134 | topicvec = topicvecDir(**config) 135 | topicvec.setDocs( snips_words, snips_name ) 136 | 137 | best_last_Ts, Em, docs_Em, Pi = topicvec.inference() 138 | 139 | basename = os.path.basename(config['logfilename']) 140 | basetrunk = os.path.splitext(basename)[0] 141 | 142 | best_it, best_T, best_loglike = best_last_Ts[0] 143 | save_matrix_as_text( basetrunk + "-em%d-best.topic.vec" %best_it, "topic", best_T ) 144 | 145 | if best_last_Ts[1]: 146 | last_it, last_T, last_loglike = best_last_Ts[1] 147 | save_matrix_as_text( basetrunk + "-em%d-last.topic.vec" %last_it, "topic", last_T ) 148 | 149 | if __name__ == '__main__': 150 | main() 151 | -------------------------------------------------------------------------------- /test-docs/Drug Goes From 13.50 a Tablet to 750, Overnight.txt: -------------------------------------------------------------------------------- 1 | Specialists in infectious disease are protesting a gigantic overnight increase in the price of a 62-year-old drug that is the standard of care for treating a life-threatening parasitic infection. 2 | 3 | The drug, called Daraprim, was acquired in August by Turing Pharmaceuticals, a start-up run by a former hedge fund manager. Turing immediately raised the price to $750 a tablet from $13.50, bringing the annual cost of treatment for some patients to hundreds of thousands of dollars. 4 | 5 | “What is it that they are doing differently that has led to this dramatic increase?” said Dr. Judith Aberg, the chief of the division of infectious diseases at the Icahn School of Medicine at Mount Sinai. She said the price increase could force hospitals to use “alternative therapies that may not have the same efficacy.” 6 | 7 | Turing’s price increase is not an isolated example. While most of the attention on pharmaceutical prices has been on new drugs for diseases like cancer, hepatitis C and high cholesterol, there is also growing concern about huge price increases on older drugs, some of them generic, that have long been mainstays of treatment. 8 | 9 | Although some price increases have been caused by shortages, others have resulted from a business strategy of buying old neglected drugs and turning them into high-priced “specialty drugs.” 10 | 11 | Cycloserine, a drug used to treat dangerous multidrug-resistant tuberculosis, was just increased in price to $10,800 for 30 pills from $500 after its acquisition by Rodelis Therapeutics. Scott Spencer, general manager of Rodelis, said the company needed to invest to make sure the supply of the drug remained reliable. He said the company provided the drug free to certain needy patients. 12 | 13 | In August, two members of Congress investigating generic drug price increases wrote to Valeant Pharmaceuticals after that company acquired two heart drugs, Isuprel and Nitropress, from Marathon Pharmaceuticals and promptly raised their prices by 525 percent and 212 percent respectively. Marathon had acquired the drugs from another company in 2013 and had quintupled their prices, according to the lawmakers, Senator Bernie Sanders, the Vermont independent who is seeking the Democratic nomination for president, and Representative Elijah E. Cummings, Democrat of Maryland. 14 | 15 | Doxycycline, an antibiotic, went from $20 a bottle in October 2013 to $1,849 by April 2014, according to the two lawmakers. 
16 | 17 | The Infectious Diseases Society of America and the HIV Medicine Association sent a joint letter to Turing earlier this month calling the price increase for Daraprim “unjustifiable for the medically vulnerable patient population” and “unsustainable for the health care system.” An organization representing the directors of state AIDS programs has also been looking into the price increase, according to doctors and patient advocates. 18 | 19 | Daraprim, known generically as pyrimethamine, is used mainly to treat toxoplasmosis, a parasite infection that can cause serious or even life-threatening problems for babies born to women who become infected during pregnancy, and also for people with compromised immune systems, like AIDS patients and certain cancer patients. 20 | 21 | Martin Shkreli, the founder and chief executive of Turing, said that the drug is so rarely used that the impact on the health system would be minuscule and that Turing would use the money it earns to develop better treatments for toxoplasmosis, with fewer side effects. 22 | 23 | “This isn’t the greedy drug company trying to gouge patients, it is us trying to stay in business,” Mr. Shkreli said. He said that many patients use the drug for far less than a year and that the price was now more in line with those of other drugs for rare diseases. 24 | 25 | “This is still one of the smallest pharmaceutical products in the world,” he said. “It really doesn’t make sense to get any criticism for this.” 26 | 27 | This is not the first time the 32-year-old Mr. Shkreli, who has a reputation for both brilliance and brashness, has been the center of controversy. He started MSMB Capital, a hedge fund company, in his 20s and drew attention for urging the Food and Drug Administration not to approve certain drugs made by companies whose stock he was shorting. 28 | 29 | In 2011, Mr. Shkreli started Retrophin, which also acquired old neglected drugs and sharply raised their prices. Retrophin’s board fired Mr. Shkreli a year ago. Last month, it filed a complaint in Federal District Court in Manhattan, accusing him of using Retrophin as a personal piggy bank to pay back angry investors in his hedge fund. 30 | 31 | Mr. Shkreli has denied the accusations. He has filed for arbitration against his old company, which he says owes him at least $25 million in severance. “They are sort of concocting this wild and crazy and unlikely story to swindle me out of the money,” he said. 32 | 33 | Daraprim, which is also used to treat malaria, was approved by the F.D.A. in 1953 and has long been made by GlaxoSmithKline. Glaxo sold United States marketing rights to CorePharma in 2010. Last year, Impax Laboratories agreed to buy Core and affiliated companies for $700 million. In August, Impax sold Daraprim to Turing for $55 million, a deal announced the same day Turing said it had raised $90 million from Mr. Shkreli and other investors in its first round of financing. 34 | 35 | Daraprim cost only about $1 a tablet several years ago, but the drug’s price rose sharply after CorePharma acquired it. According to IMS Health, which tracks prescriptions, sales of the drug jumped to $6.3 million in 2011 from $667,000 in 2010, even as prescriptions held steady at about 12,700. In 2014, after further price increases, sales were $9.9 million, as the number of prescriptions shrank to 8,821. The figures do not include inpatient use in hospitals. 
36 | 37 | Turing’s price increase could bring sales to tens or even hundreds of millions of dollars a year if use remains constant. Medicaid and certain hospitals will be able to get the drug inexpensively under federal rules for discounts and rebates. But private insurers, Medicare and hospitalized patients would have to pay an amount closer to the list price. 38 | 39 | 40 | Some doctors questioned Turing’s claim that there was a need for better drugs, saying the side effects, while potentially serious, could be managed. 41 | 42 | “I certainly don’t think this is one of those diseases where we have been clamoring for better therapies,” said Dr. Wendy Armstrong, professor of infectious diseases at Emory University in Atlanta. 43 | 44 | With the price now high, other companies could conceivably make generic copies, since patents have long expired. One factor that could discourage that option is that Daraprim’s distribution is now tightly controlled, making it harder for generic companies to get the samples they need for the required testing. 45 | 46 | The switch from drugstores to controlled distribution was made in June by Impax, not by Turing. Still, controlled distribution was a strategy Mr. Shkreli talked about at his previous company as a way to thwart generics. 47 | 48 | Some hospitals say they now have trouble getting the drug. “We’ve not had access to the drug for a few months,” said Dr. Armstrong, who also works at Grady Memorial Hospital, a huge public treatment center in Atlanta that serves many low-income patients. 49 | 50 | But Dr. Rima McLeod, medical director of the toxoplasmosis center at the University of Chicago, said that Turing had been good about delivering drugs quickly to patients, sometimes without charge. 51 | 52 | “They have jumped every time I’ve called,” she said. The situation, she added, “seems workable” despite the price increase. 53 | 54 | Daraprim is the standard first treatment for toxoplasmosis, in combination with an antibiotic called sulfadiazine. There are alternative treatments, but there is less data supporting their efficacy. 55 | 56 | Dr. Aberg of Mount Sinai said some hospitals will now find Daraprim too expensive to keep in stock, possibly resulting in treatment delays. She said that Mount Sinai was continuing to use the drug, but each use now required a special review. 57 | 58 | “This seems to be all profit-driven for somebody,” Dr. 
Aberg said, “and I just think it’s a very dangerous process.” 59 | -------------------------------------------------------------------------------- /test-docs/VR-mitrv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/VR-mitrv.txt -------------------------------------------------------------------------------- /test-docs/batman-v-superman.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/batman-v-superman.txt -------------------------------------------------------------------------------- /test-docs/beijing-haze-news.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/beijing-haze-news.txt -------------------------------------------------------------------------------- /test-docs/brain-scar.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/brain-scar.txt -------------------------------------------------------------------------------- /test-docs/britain-EU.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/britain-EU.txt -------------------------------------------------------------------------------- /test-docs/hillary-speech.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/hillary-speech.txt -------------------------------------------------------------------------------- /test-docs/hillary-speech2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/hillary-speech2.txt -------------------------------------------------------------------------------- /test-docs/nips-wiki.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/nips-wiki.txt -------------------------------------------------------------------------------- /test-docs/sanders-speeches.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/sanders-speeches.txt -------------------------------------------------------------------------------- /test-docs/spacex-news.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/spacex-news.txt -------------------------------------------------------------------------------- /test-docs/trump-speech.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/test-docs/trump-speech.txt 
-------------------------------------------------------------------------------- /topic-competitors/LDA/LDAClassify.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/LDAClassify.zip -------------------------------------------------------------------------------- /topic-competitors/LDA/Readme.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/Readme.txt -------------------------------------------------------------------------------- /topic-competitors/LDA/classEval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/classEval.py -------------------------------------------------------------------------------- /topic-competitors/LDA/corpusLoader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/corpusLoader.py -------------------------------------------------------------------------------- /topic-competitors/LDA/ldaExp.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/LDA/ldaExp.py -------------------------------------------------------------------------------- /topic-competitors/doc2vec.py: -------------------------------------------------------------------------------- 1 | import gensim.models.doc2vec as doc2vec 2 | import sys 3 | import pdb 4 | 5 | corpus = sys.argv[1] 6 | 7 | if corpus == '20news': 8 | all_words_file = "20news-all-18791.gibbslda-bow.txt" 9 | train_label_file = "20news-train-11314.slda-label.txt" 10 | train_docvec_file = "20news-train-11314.svm-doc2vec.txt" 11 | test_label_file = "20news-test-7532.slda-label.txt" 12 | test_docvec_file = "20news-test-7532.svm-doc2vec.txt" 13 | all_count = 18791 14 | train_count = 11285 15 | test_count = 7506 16 | else: 17 | all_words_file = "reuters-all-8025.gibbslda-bow.txt" 18 | train_label_file = "reuters-train-5770.slda-label.txt" 19 | train_docvec_file = "reuters-train-5770.svm-doc2vec.txt" 20 | test_label_file = "reuters-test-2255.slda-label.txt" 21 | test_docvec_file = "reuters-test-2255.svm-doc2vec.txt" 22 | all_count = 8025 23 | train_count = 5770 24 | test_count = 2255 25 | 26 | dim = 400 27 | corpus = doc2vec.TaggedLineDocument(all_words_file) 28 | model = doc2vec.Doc2Vec(corpus,size=dim, window=8, min_count=5, workers=4) 29 | TRAIN_DOC2VEC = open(train_docvec_file, "w") 30 | TRAIN_LABEL = open(train_label_file) 31 | 32 | #pdb.set_trace() 33 | 34 | for d in xrange(1, train_count + 1): 35 | doc_vec = model.docvecs[d] 36 | label_line = TRAIN_LABEL.readline().strip() 37 | label = int(label_line) 38 | 39 | TRAIN_DOC2VEC.write( "%d" %(label+1) ) 40 | 41 | for k in xrange(dim): 42 | TRAIN_DOC2VEC.write( " %d:%.3f" %( k + 1, doc_vec[k] ) ) 43 | 44 | TRAIN_DOC2VEC.write("\n") 45 | 46 | TRAIN_DOC2VEC.close() 47 | 48 | print "%d doc vecs written in svm format into '%s'" %( train_count, train_docvec_file ) 49 | 50 | TEST_DOC2VEC = open(test_docvec_file, "w") 51 | TEST_LABEL = open(test_label_file) 52 | for d 
in xrange(train_count + 1, all_count + 1): 53 | doc_vec = model.docvecs[d] 54 | label_line = TEST_LABEL.readline().strip() 55 | label = int(label_line) 56 | 57 | TEST_DOC2VEC.write( "%d" %(label+1) ) 58 | 59 | for k in xrange(dim): 60 | TEST_DOC2VEC.write( " %d:%.3f" %( k + 1, doc_vec[k] ) ) 61 | 62 | TEST_DOC2VEC.write("\n") 63 | 64 | TEST_DOC2VEC.close() 65 | 66 | print "%d doc vecs written in svm format into '%s'" %( test_count, test_docvec_file ) 67 | -------------------------------------------------------------------------------- /topic-competitors/kmeans.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # kmeans.py using any of the 20-odd metrics in scipy.spatial.distance 3 | # kmeanssample 2 pass, first sample sqrt(N) 4 | 5 | from __future__ import division 6 | import random 7 | import numpy as np 8 | from scipy.spatial.distance import cdist # $scipy/spatial/distance.py 9 | # http://docs.scipy.org/doc/scipy/reference/spatial.html 10 | from scipy.sparse import issparse # $scipy/sparse/csr.py 11 | 12 | __date__ = "2011-11-17 Nov denis" 13 | # X sparse, any cdist metric: real app ? 14 | # centres get dense rapidly, metrics in high dim hit distance whiteout 15 | # vs unsupervised / semi-supervised svm 16 | 17 | #............................................................................... 18 | def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ): 19 | """ centres, Xtocentre, distances = kmeans( X, initial centres ... ) 20 | in: 21 | X N x dim may be sparse 22 | centres k x dim: initial centres, e.g. random.sample( X, k ) 23 | delta: relative error, iterate until the average distance to centres 24 | is within delta of the previous average distance 25 | maxiter 26 | metric: any of the 20-odd in scipy.spatial.distance 27 | "chebyshev" = max, "cityblock" = L1, "minkowski" with p= 28 | or a function( Xvec, centrevec ), e.g. Lqmetric below 29 | p: for minkowski metric -- local mod cdist for 0 < p < 1 too 30 | verbose: 0 silent, 2 prints running distances 31 | out: 32 | centres, k x dim 33 | Xtocentre: each X -> its nearest centre, ints N -> k 34 | distances, N 35 | see also: kmeanssample below, class Kmeans below. 36 | """ 37 | 38 | if verbose: 39 | print "kmeans: X %s centres %s delta=%.2g maxiter=%d metric=%s" % ( 40 | X.shape, centres.shape, delta, maxiter, metric) 41 | allx = np.arange(N) 42 | prevdist = 0 43 | for jiter in range( 1, maxiter+1 ): 44 | D = cdist( X, centres, metric=metric, p=p ) # |X| x |centres| 45 | xtoc = D.argmin(axis=1) # X -> nearest centre 46 | distances = D[allx,xtoc] 47 | avdist = distances.mean() # median ? 48 | if verbose >= 2: 49 | print "kmeans: av |X - nearest centre| = %.4g" % avdist 50 | if (1 - delta) * prevdist <= avdist <= prevdist \ 51 | or jiter == maxiter: 52 | break 53 | prevdist = avdist 54 | for jc in range(k): # (1 pass in C) 55 | c = np.where( xtoc == jc )[0] 56 | if len(c) > 0: 57 | centres[jc] = X[c].mean( axis=0 ) 58 | if verbose: 59 | print "kmeans: %d iterations cluster sizes:" % jiter, np.bincount(xtoc) 60 | if verbose >= 2: 61 | r50 = np.zeros(k) 62 | r90 = np.zeros(k) 63 | for j in range(k): 64 | dist = distances[ xtoc == j ] 65 | if len(dist) > 0: 66 | r50[j], r90[j] = np.percentile( dist, (50, 90) ) 67 | print "kmeans: cluster 50 % radius", r50.astype(int) 68 | print "kmeans: cluster 90 % radius", r90.astype(int) 69 | # scale L1 / dim, L2 / sqrt(dim) ? 
70 | return centres, xtoc, distances 71 | 72 | def randomsample( X, n ): 73 | """ random.sample of the rows of X 74 | X may be sparse -- best csr 75 | """ 76 | sampleix = random.sample( xrange( X.shape[0] ), int(n) ) 77 | return X[sampleix] 78 | 79 | if __name__ == "__main__": 80 | import random 81 | import sys 82 | from time import time 83 | 84 | N = 10000 85 | dim = 10 86 | ncluster = 10 87 | kmdelta = .001 88 | kmiter = 10 89 | metric = "cosine" 90 | seed = 1 91 | 92 | np.set_printoptions( 1, threshold=200, edgeitems=5, suppress=True ) 93 | np.random.seed(seed) 94 | random.seed(seed) 95 | 96 | unigramFilename = "top1grams-wiki.txt" 97 | word_vec_file = "25000-180000-500-BLK-8.0.vec" 98 | 99 | vocab_dict = loadUnigramFile(unigramFilename) 100 | V, vocab, word2ID, skippedWords_whatever = load_embeddings(word_vec_file) 101 | # map of word -> id of all words with embeddings 102 | vocab_dict2 = {} 103 | 104 | if normalize_vecs: 105 | Vnorm = np.array( [ normF(x) for x in V ] ) 106 | for i,w in enumerate(vocab): 107 | if Vnorm[i] == 0: 108 | print "WARN: %s norm is 0" %w 109 | # set to 1 to avoid "divided by 0 exception" 110 | Vnorm[i] = 1 111 | 112 | V /= Vnorm[:, None] 113 | 114 | # dimensionality of topic/word embeddings 115 | N0 = V.shape[1] 116 | 117 | customStopwordList = re.split( "\s+", self.customStopwords ) 118 | for stop_w in customStopwordList: 119 | stopwordDict[stop_w] = 1 120 | print "Custom stopwords: %s" %( ", ".join(customStopwordList) ) 121 | 122 | print "N %d dim %d ncluster %d metric %s" % (N, dim, ncluster, metric) 123 | t0 = time() 124 | 125 | randomcentres = randomsample( X, ncluster ) 126 | centres, xtoc, dist = kmeans( X, randomcentres, 127 | delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 ) 128 | print "%.0f msec" % ((time() - t0) * 1000) 129 | -------------------------------------------------------------------------------- /topic-competitors/labelEval.py: -------------------------------------------------------------------------------- 1 | from sklearn import metrics 2 | import sys 3 | 4 | def getScores( true_classes, pred_classes, average): 5 | precision = metrics.precision_score( true_classes, pred_classes, average=average ) 6 | recall = metrics.recall_score( true_classes, pred_classes, average=average ) 7 | f1 = metrics.f1_score( true_classes, pred_classes, average=average ) 8 | accuracy = metrics.accuracy_score( true_classes, pred_classes ) 9 | return precision, recall, f1, accuracy 10 | 11 | true_labelfile = sys.argv[1] 12 | pred_labelfile = sys.argv[2] 13 | 14 | TRUE = open(true_labelfile) 15 | PRED = open(pred_labelfile) 16 | 17 | true_classes = [] 18 | pred_classes = [] 19 | 20 | for line in TRUE: 21 | line = line.strip() 22 | label = int(line) 23 | true_classes.append(label) 24 | 25 | for line in PRED: 26 | line = line.strip() 27 | label = int(line) 28 | pred_classes.append(label) 29 | 30 | print metrics.classification_report(true_classes, pred_classes, digits=3) 31 | 32 | for average in ['micro', 'macro']: 33 | precision, recall, f1, acc = getScores( true_classes, pred_classes, average ) 34 | print "Prec (%s average): %.3f, recall: %.3f, F1: %.3f, Acc: %.3f" %( average, 35 | precision, recall, f1, acc ) 36 | -------------------------------------------------------------------------------- /topic-competitors/lftm2svm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | lftm_file = sys.argv[1] 4 | 5 | train_words_file = "reuters-train-5770.gibbslda-words.txt" 6 | train_label_file = 
"reuters-train-5770.slda-label.txt" 7 | test_words_file = "reuters-test-2255.gibbslda-words.txt" 8 | test_label_file = "reuters-test-2255.slda-label.txt" 9 | 10 | LFTM_TOPIC = open(lftm_file) 11 | TRAIN_WORDS = open(train_words_file) 12 | TEST_WORDS = open(test_words_file) 13 | TRAIN_LABELS = open(train_label_file) 14 | TEST_LABELS = open(test_label_file) 15 | 16 | for i in xrange(2): 17 | WORDS = [ TRAIN_WORDS, TEST_WORDS ][i] 18 | LABELS = [TRAIN_LABELS, TEST_LABELS][i] 19 | if i == 0: 20 | output_file = "reuters-train-5770.svm-lftm.txt" 21 | else: 22 | output_file = "reuters-test-2255.svm-lftm.txt" 23 | 24 | OUTPUT = open(output_file, "w") 25 | 26 | setName = ["train", "test"][i] 27 | 28 | lineno = 0 29 | validDocNum = 0 30 | for line in WORDS: 31 | lineno += 1 32 | line = line.strip() 33 | label_line = LABELS.readline().strip() 34 | if not line: 35 | print "Empty doc %s-%d skipped" %(setName, lineno) 36 | continue 37 | label = int(label_line) 38 | OUTPUT.write( "%d" %(label+1) ) 39 | lftm_topic_line = LFTM_TOPIC.readline().strip() 40 | lftm_topicprops = lftm_topic_line.split(" ") 41 | for k in xrange(50): 42 | topicprop = float(lftm_topicprops[k]) 43 | OUTPUT.write( " %d:%.3f" %(k+1, topicprop) ) 44 | OUTPUT.write("\n") 45 | validDocNum += 1 46 | print "%d %s docs, %d written into '%s'" %(lineno, setName, validDocNum, output_file) 47 | OUTPUT.close() 48 | 49 | lineno = 0 50 | for line in LFTM_TOPIC: 51 | lineno += 1 52 | 53 | if lineno > 0: 54 | print "Warn: %d lines left in '%s'" %(lineno, lftm_file) 55 | -------------------------------------------------------------------------------- /topic-competitors/liu-doc2vec.py: -------------------------------------------------------------------------------- 1 | from utils import * 2 | import pdb 3 | 4 | def genDocEmbedding( setName, words_file, topics_file, label_file, V, word2ID, T ): 5 | WORDS = open(words_file) 6 | TOPICS = open(topics_file) 7 | LABEL = open(label_file) 8 | 9 | filename_trunk = words_file.split('.')[0] 10 | docvec_file = ".".join( [ filename_trunk, "svm-liu", "txt" ] ) 11 | docvecbow_file = ".".join( [ filename_trunk, "svm-liubow", "txt" ] ) 12 | 13 | DOCVEC = open( docvec_file, "w" ) 14 | DOCVECBOW = open( docvecbow_file, "w" ) 15 | 16 | dim = V.shape[1] + T.shape[1] 17 | 18 | lineno = 0 19 | emptyDocIds = [] 20 | 21 | for word_line in WORDS: 22 | lineno += 1 23 | word_line = word_line.strip() 24 | topic_line = TOPICS.readline().strip() 25 | label_line = LABEL.readline().strip() 26 | # encounter an empty doc 27 | if not word_line: 28 | words = [] 29 | topics = [] 30 | else: 31 | words = word_line.split(" ") 32 | topics = topic_line.split(" ") 33 | assert len(words) == len(topics), \ 34 | "Words number %d != topic number %d in line %d" %( len(words), len(topics), lineno ) 35 | label = int(label_line) 36 | 37 | sum_vec = np.zeros(dim) 38 | doc_vec = np.zeros(dim) 39 | validWordNum = 0 40 | 41 | wid2freq = {} 42 | 43 | for i in xrange(len(words)): 44 | word = words[i] 45 | topic = int(topics[i]) 46 | 47 | if word not in word2ID: 48 | continue 49 | validWordNum += 1 50 | wid = word2ID[word] 51 | sum_vec += np.concatenate( [ V[wid], T[topic] ] ) 52 | 53 | if wid in wid2freq: 54 | wid2freq[wid] += 1 55 | else: 56 | wid2freq[wid] = 1 57 | 58 | if validWordNum > 0: 59 | doc_vec = sum_vec / validWordNum 60 | else: 61 | emptyDocIds.append(lineno) 62 | 63 | sorted_wids = sorted( wid2freq.keys() ) 64 | 65 | DOCVEC.write( "%d" %(label+1) ) 66 | DOCVECBOW.write( "%d" %(label+1) ) 67 | 68 | for k in xrange(dim): 69 | DOCVEC.write( " 
%d:%.3f" %( k + 1, doc_vec[k] ) ) 70 | DOCVECBOW.write( " %d:%.3f" %( k + 1, doc_vec[k] ) ) 71 | 72 | for wid in sorted_wids: 73 | # first dim indices are reserved for topic features, so add dim here 74 | # add 1 to make wid start from 1 75 | DOCVECBOW.write( " %d:%d" %( wid + dim + 1, wid2freq[wid] ) ) 76 | 77 | DOCVEC.write("\n") 78 | DOCVECBOW.write("\n") 79 | 80 | print "%d %s docs converted to Liu et al's docvec in svm format." %( lineno, setName ) 81 | if len(emptyDocIds) > 0: 82 | print "Empty docs: %s" %emptyDocIds 83 | 84 | DOCVEC.close() 85 | DOCVECBOW.close() 86 | WORDS.close() 87 | TOPICS.close() 88 | LABEL.close() 89 | 90 | corpus = sys.argv[1] 91 | 92 | if corpus == '20news': 93 | train_words_file = "20news-train-11314.gibbslda-words.txt" 94 | train_topics_file = "20news-train-11314.gibbslda-topics.txt" 95 | train_wordvec_file = "20news-train-11314.liu-wordvec2.txt" 96 | train_topicvec_file = "20news-train-11314.liu-topicvec2.txt" 97 | train_label_file = "20news-train-11314.slda-label.txt" 98 | test_words_file = "20news-test-7532.gibbslda-words.txt" 99 | test_topics_file = "20news-test-7532.gibbslda-topics.txt" 100 | test_label_file = "20news-test-7532.slda-label.txt" 101 | else: 102 | train_words_file = "reuters-train-5770.gibbslda-words.txt" 103 | train_topics_file = "reuters-train-5770.gibbslda-topics.txt" 104 | train_wordvec_file = "reuters-train-5770.liu-wordvec2.txt" 105 | train_topicvec_file = "reuters-train-5770.liu-topicvec2.txt" 106 | train_label_file = "reuters-train-5770.slda-label.txt" 107 | test_words_file = "reuters-test-2255.gibbslda-words.txt" 108 | test_topics_file = "reuters-test-2255.gibbslda-topics.txt" 109 | test_label_file = "reuters-test-2255.slda-label.txt" 110 | 111 | V, vocab, word2ID, skippedWords_whatever = load_embeddings(train_wordvec_file) 112 | T = load_matrix_from_text( train_topicvec_file, "topic embedding" ) 113 | genDocEmbedding( "train", train_words_file, train_topics_file, train_label_file, V, word2ID, T ) 114 | genDocEmbedding( "test", test_words_file, test_topics_file, test_label_file, V, word2ID, T ) 115 | -------------------------------------------------------------------------------- /topic-competitors/rajarshd-Gaussian_LDA.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/rajarshd-Gaussian_LDA.zip -------------------------------------------------------------------------------- /topic-competitors/sHDP.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/sHDP.zip -------------------------------------------------------------------------------- /topic-competitors/slda/Makefile: -------------------------------------------------------------------------------- 1 | CC = g++ 2 | LDFLAGS = -lgsl -lm -lgslcblas 3 | 4 | 5 | LSOURCE = main.cpp corpus.cpp slda.cpp utils.cpp opt.cpp 6 | LHEADER = corpus.h slda.h utils.h opt.h settings.h 7 | 8 | slda: $(LSOURCE) $(HEADER) 9 | $(CC) $(LSOURCE) -o $@ $(LDFLAGS) 10 | 11 | clean: 12 | -rm -f *.o slda 13 | -------------------------------------------------------------------------------- /topic-competitors/slda/corpus.cpp: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, 
chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #include "corpus.h" 23 | #include 24 | #include 25 | 26 | corpus::corpus() 27 | { 28 | num_docs = 0; 29 | size_vocab = 0; 30 | num_classes = 0; 31 | num_total_words = 0; 32 | } 33 | 34 | corpus::~corpus() 35 | { 36 | for (int i = 0; i < num_docs; i ++) 37 | { 38 | document * doc = docs[i]; 39 | delete doc; 40 | } 41 | docs.clear(); 42 | 43 | num_docs = 0; 44 | size_vocab = 0; 45 | num_classes = 0; 46 | num_total_words = 0; 47 | } 48 | 49 | void corpus::read_data(const char * data_filename, 50 | const char * label_filename) 51 | { 52 | int OFFSET = 0; 53 | int length = 0, count = 0, word = 0, 54 | n = 0, nd = 0, nw = 0, label = -1; 55 | 56 | FILE * fileptr; 57 | fileptr = fopen(data_filename, "r"); 58 | printf("\nreading data from %s\n", data_filename); 59 | nd = 0; 60 | nw = 0; 61 | 62 | while ((fscanf(fileptr, "%10d", &length) != EOF)) 63 | { 64 | document * doc = new document(length); 65 | for (n = 0; n < length; n++) 66 | { 67 | fscanf(fileptr, "%10d:%10d", &word, &count); 68 | word = word - OFFSET; 69 | doc->words[n] = word; 70 | doc->counts[n] = count; 71 | doc->total += count; 72 | if (word >= nw) 73 | { 74 | nw = word + 1; 75 | } 76 | } 77 | num_total_words += doc->total; 78 | docs.push_back(doc); 79 | nd++; 80 | } 81 | fclose(fileptr); 82 | num_docs = nd; 83 | size_vocab = nw; 84 | printf("number of docs : %d\n", nd); 85 | printf("number of terms : %d\n", nw); 86 | printf("number of total words : %d\n", num_total_words); 87 | 88 | fileptr = fopen(label_filename, "r"); 89 | printf("\nreading labels from %s\n", label_filename); 90 | nd = 0; 91 | while ((fscanf(fileptr, "%10d", &label) != EOF)) 92 | { 93 | document * doc = docs[nd]; 94 | doc->label = label; 95 | if (label >= num_classes) 96 | { 97 | num_classes = label + 1; 98 | } 99 | nd ++; 100 | } 101 | assert(nd == int(docs.size())); 102 | printf("number of classes : %d\n\n", num_classes); 103 | } 104 | 105 | int corpus::max_corpus_length() { 106 | int max_length = 0; 107 | 108 | for (int d = 0; d < num_docs; d++) { 109 | if (docs[d]->length > max_length) 110 | max_length = docs[d]->length; 111 | } 112 | return max_length; 113 | } 114 | -------------------------------------------------------------------------------- /topic-competitors/slda/corpus.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 
6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #ifndef CORPUS_H 23 | #define CORPUS_H 24 | 25 | #include 26 | #include 27 | using namespace std; 28 | 29 | class document 30 | { 31 | public: 32 | int * words; 33 | int * counts; 34 | int length; 35 | int total; 36 | int label; 37 | public: 38 | document() 39 | { 40 | words = NULL; 41 | counts = NULL; 42 | length = 0; 43 | total = 0; 44 | label = -1; 45 | } 46 | document(int len) 47 | { 48 | length = len; 49 | words = new int [length]; 50 | counts = new int [length]; 51 | total = 0; 52 | label = -1; 53 | } 54 | ~document() 55 | { 56 | if (words != NULL) 57 | { 58 | delete [] words; 59 | delete [] counts; 60 | length = 0; 61 | total = 0; 62 | label = -1; 63 | } 64 | } 65 | }; 66 | 67 | class corpus 68 | { 69 | public: 70 | corpus(); 71 | ~corpus(); 72 | void read_data(const char * data_filename, const char * label_filename); 73 | int max_corpus_length(); 74 | public: 75 | int num_docs; 76 | int size_vocab; 77 | int num_classes; 78 | int num_total_words; 79 | vector docs; 80 | }; 81 | 82 | #endif // CORPUS_H 83 | -------------------------------------------------------------------------------- /topic-competitors/slda/cygblas-0.dll: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/slda/cygblas-0.dll -------------------------------------------------------------------------------- /topic-competitors/slda/main.cpp: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 
16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #include 23 | #include 24 | #include "corpus.h" 25 | #include "utils.h" 26 | #include "slda.h" 27 | 28 | void help( void ) { 29 | printf("usage: slda [est] [data] [label] [settings] [alpha] [k] [random/seeded/model_path] [directory]\n"); 30 | printf(" slda [inf] [data] [label] [settings] [model] [directory]\n"); 31 | } 32 | 33 | int main(int argc, char* argv[]) 34 | { 35 | if (argc < 2) 36 | { 37 | help(); 38 | return 0; 39 | } 40 | if (strcmp(argv[1], "est") == 0) 41 | { 42 | corpus c; 43 | char * data_filename = argv[2]; 44 | char * label_filename = argv[3]; 45 | c.read_data(data_filename, label_filename); 46 | settings setting; 47 | char * setting_filename = argv[4]; 48 | setting.read_settings(setting_filename); 49 | 50 | double alpha = atof(argv[5]); 51 | int num_topics = atoi(argv[6]); 52 | printf("number of topics is %d\n", num_topics); 53 | char * init_method = argv[7]; 54 | char * directory = argv[8]; 55 | printf("models will be saved in %s\n", directory); 56 | make_directory(directory); 57 | 58 | slda model; 59 | model.init(alpha, num_topics, &c); 60 | model.v_em(&c, &setting, init_method, directory); 61 | } 62 | 63 | if (strcmp(argv[1], "inf") == 0) 64 | { 65 | corpus c; 66 | char * data_filename = argv[2]; 67 | char * label_filename = argv[3]; 68 | c.read_data(data_filename, label_filename); 69 | settings setting; 70 | char * setting_filename = argv[4]; 71 | setting.read_settings(setting_filename); 72 | 73 | char * model_filename = argv[5]; 74 | char * directory = argv[6]; 75 | printf("\nresults will be saved in %s\n", directory); 76 | make_directory(directory); 77 | 78 | slda model; 79 | model.load_model(model_filename); 80 | model.infer_only(&c, &setting, directory); 81 | } 82 | 83 | return 0; 84 | } 85 | -------------------------------------------------------------------------------- /topic-competitors/slda/opt.cpp: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | #include "opt.h" 22 | #include "slda.h" 23 | #include "utils.h" 24 | /* 25 | * Here the implementation is slightly different from the equations 26 | * in the paper, we instead use a second-order taylor expansion to approximate 27 | * the second line in eqaution (6). 
28 | */ 29 | 30 | double softmax_f(const gsl_vector * x, void * opt_param) 31 | { 32 | opt_parameter * gsl_param = (opt_parameter *)opt_param; 33 | double PENALTY = gsl_param->PENALTY; 34 | slda * model = gsl_param->model; 35 | suffstats * ss = gsl_param->ss; 36 | 37 | double f, t, a1 = 0.0, a2 = 0.0; 38 | 39 | int k, d, j, l, idx; 40 | 41 | double f_regularization = 0.0; 42 | 43 | 44 | for (l = 0; l < model->num_classes-1; l ++) 45 | { 46 | for (k = 0; k < model->num_topics; k ++) 47 | { 48 | model->eta[l][k] = gsl_vector_get(x, l*model->num_topics + k); 49 | f_regularization -= pow(model->eta[l][k], 2) * PENALTY/2.0; 50 | } 51 | } 52 | f = 0.0; //log likelihood 53 | for (d = 0; d < ss->num_docs; d ++) 54 | { 55 | for (k = 0; k < model->num_topics; k ++) 56 | { 57 | if (ss->labels[d] < model->num_classes-1) 58 | { 59 | f += model->eta[ss->labels[d]][k] * ss->z_bar[d].z_bar_m[k]; 60 | } 61 | } 62 | 63 | t = 0.0; // in log space, 1+exp()+exp()... 64 | for (l = 0; l < model->num_classes-1; l ++) 65 | { 66 | a1 = 0.0; // \eta_k^T * \bar{\phi}_d 67 | a2 = 0.0; // 1 + 0.5 * \eta_k^T * Var(z_bar)\eta_k 68 | for (k = 0; k < model->num_topics; k ++) 69 | { 70 | a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 71 | for (j = 0; j < model->num_topics; j ++) 72 | { 73 | idx = map_idx(k, j, model->num_topics); 74 | a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 75 | } 76 | } 77 | a2 = 1.0 + 0.5 * a2; 78 | t = log_sum(t, a1 + log(a2)); 79 | } 80 | f -= t; 81 | } 82 | 83 | return -(f + f_regularization); 84 | } 85 | void softmax_df(const gsl_vector * x, void * opt_param, gsl_vector * df) 86 | { 87 | 88 | opt_parameter * gsl_param = (opt_parameter *)opt_param; 89 | double PENALTY = gsl_param->PENALTY; 90 | slda * model = gsl_param->model; 91 | suffstats * ss = gsl_param->ss; 92 | gsl_vector_set_zero(df); 93 | gsl_vector * df_tmp = gsl_vector_alloc(df->size); 94 | 95 | double t, a1 = 0.0, a2 = 0.0, g; 96 | int k, d, j, l, idx; 97 | 98 | double * eta_aux = new double [model->num_topics]; 99 | 100 | for (l = 0; l < model->num_classes-1; l ++) 101 | { 102 | for (k = 0; k < model->num_topics; k ++) 103 | { 104 | idx = l*model->num_topics + k; 105 | model->eta[l][k] = gsl_vector_get(x, idx); 106 | g = -PENALTY * model->eta[l][k]; 107 | gsl_vector_set(df, idx, g); 108 | } 109 | } 110 | for (d = 0; d < ss->num_docs; d ++) 111 | { 112 | for (k = 0; k < model->num_topics; k ++) 113 | { 114 | l = ss->labels[d]; 115 | if (l < model->num_classes-1) 116 | { 117 | idx = l*model->num_topics + k; 118 | g = gsl_vector_get(df, idx) + ss->z_bar[d].z_bar_m[k]; 119 | gsl_vector_set(df, idx, g); 120 | } 121 | } 122 | 123 | t = 0.0; // in log space, 1+exp()+exp()+.... 
124 | gsl_vector_memcpy(df_tmp, df); 125 | gsl_vector_set_zero(df); 126 | for (l = 0; l < model->num_classes-1; l ++) 127 | { 128 | memset(eta_aux, 0, sizeof(double)*model->num_topics); 129 | a1 = 0.0; // \eta_k^T * \bar{\phi}_d 130 | a2 = 0.0; // 1 + 0.5*\eta_k^T * Var(z_bar)\eta_k 131 | for (k = 0; k < model->num_topics; k ++) 132 | { 133 | a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 134 | for (j = 0; j < model->num_topics; j ++) 135 | { 136 | idx = map_idx(k, j, model->num_topics); 137 | a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 138 | eta_aux[k] += ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 139 | } 140 | } 141 | a2 = 1.0 + 0.5 * a2; 142 | t = log_sum(t, a1 + log(a2)); 143 | 144 | for (k = 0; k < model->num_topics; k ++) 145 | { 146 | idx = l*model->num_topics + k; 147 | g = gsl_vector_get(df, idx) - 148 | exp(a1) * (ss->z_bar[d].z_bar_m[k] * a2 + eta_aux[k]); 149 | gsl_vector_set(df, idx, g); 150 | } 151 | } 152 | gsl_vector_scale(df, exp(-t)); 153 | gsl_vector_add(df, df_tmp); 154 | } 155 | gsl_vector_scale(df, -1.0); 156 | delete [] eta_aux; 157 | gsl_vector_free(df_tmp); 158 | } 159 | void softmax_fdf(const gsl_vector * x, void * opt_param, double * f, gsl_vector * df) 160 | { 161 | opt_parameter * gsl_param = (opt_parameter *)opt_param; 162 | double PENALTY = gsl_param->PENALTY; 163 | slda * model = gsl_param->model; 164 | suffstats * ss = gsl_param->ss; 165 | gsl_vector_set_zero(df); 166 | gsl_vector * df_tmp = gsl_vector_alloc(df->size); 167 | 168 | double t, a1 = 0.0, a2 = 0.0, g; 169 | int k, d, j, l, idx; 170 | 171 | double f_regularization = 0.0; 172 | 173 | double* eta_aux = new double [model->num_topics]; 174 | 175 | for (l = 0; l < model->num_classes-1; l ++) 176 | { 177 | for (k = 0; k < model->num_topics; k ++) 178 | { 179 | model->eta[l][k] = gsl_vector_get(x, l*model->num_topics + k); 180 | f_regularization -= pow(model->eta[l][k], 2) * PENALTY/2.0; 181 | idx = l*model->num_topics + k; 182 | g = -PENALTY * model->eta[l][k]; 183 | gsl_vector_set(df, idx, g); 184 | } 185 | } 186 | *f = 0.0; //log likelihood 187 | for (d = 0; d < ss->num_docs; d ++) 188 | { 189 | for (k = 0; k < model->num_topics; k ++) 190 | { 191 | l = ss->labels[d]; 192 | if (l < model->num_classes-1) 193 | { 194 | *f += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 195 | idx = l*model->num_topics + k; 196 | g = gsl_vector_get(df, idx) + ss->z_bar[d].z_bar_m[k]; 197 | gsl_vector_set(df, idx, g); 198 | } 199 | } 200 | t = 0.0; // in log space, base class 1+exp()+exp() 201 | gsl_vector_memcpy(df_tmp, df); 202 | gsl_vector_set_zero(df); 203 | for (l = 0; l < model->num_classes-1; l ++) 204 | { 205 | memset(eta_aux, 0, sizeof(double)*model->num_topics); 206 | a1 = 0.0; // \eta_k^T * \bar{\phi}_d 207 | a2 = 0.0; // 1 + 0.5 * \eta_k^T * Var(z_bar)\eta_k 208 | for (k = 0; k < model->num_topics; k ++) 209 | { 210 | a1 += model->eta[l][k] * ss->z_bar[d].z_bar_m[k]; 211 | for (j = 0; j < model->num_topics; j ++) 212 | { 213 | idx = map_idx(k, j, model->num_topics); 214 | a2 += model->eta[l][k] * ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 215 | eta_aux[k] += ss->z_bar[d].z_bar_var[idx] * model->eta[l][j]; 216 | } 217 | } 218 | a2 = 1.0 + 0.5 * a2; 219 | t = log_sum(t, a1 + log(a2)); 220 | 221 | for (k = 0; k < model->num_topics; k ++) 222 | { 223 | idx = l*model->num_topics + k; 224 | g = gsl_vector_get(df, idx) - 225 | exp(a1) * (ss->z_bar[d].z_bar_m[k] * a2 + eta_aux[k]); 226 | gsl_vector_set(df, idx, g); 227 | } 228 | } 229 | gsl_vector_scale(df, exp(-t)); 230 | 
gsl_vector_add(df, df_tmp); 231 | *f -= t; 232 | } 233 | gsl_vector_scale(df, -1.0); 234 | *f = -(*f + f_regularization); 235 | delete [] eta_aux; 236 | gsl_vector_free(df_tmp); 237 | } 238 | 239 | -------------------------------------------------------------------------------- /topic-competitors/slda/opt.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | #ifndef OPT_H_INCLUDED 22 | #define OPT_H_INCLUDED 23 | #include 24 | #include "slda.h" 25 | 26 | /* 27 | * structure for the gsl optimization routine 28 | * 29 | */ 30 | 31 | struct opt_parameter 32 | { 33 | suffstats * ss; 34 | slda * model; 35 | double PENALTY; 36 | }; 37 | 38 | /* 39 | * function to compute the value of the obj function, then 40 | * return it 41 | */ 42 | 43 | double softmax_f(const gsl_vector * x, void * opt_param); 44 | 45 | /* 46 | * function to compute the derivatives of function 47 | * 48 | */ 49 | 50 | void softmax_df(const gsl_vector * x, void * opt_param, gsl_vector * df); 51 | 52 | /* 53 | * function to compute the value and derivatives of the function 54 | * 55 | */ 56 | 57 | void softmax_fdf(const gsl_vector * x, void * opt_param, double * f, gsl_vector * df); 58 | 59 | #endif // OPT_H_INCLUDED 60 | 61 | -------------------------------------------------------------------------------- /topic-competitors/slda/readme.txt: -------------------------------------------------------------------------------- 1 | ********************************************************** 2 | SUPERVISED LATENT DIRICHLET ALLOCATION FOR CLASSIFICATION 3 | ********************************************************** 4 | 5 | (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 6 | 7 | written by Chong Wang, chongw@cs.princeton.edu, part of code 8 | is from http://www.cs.princeton.edu/~blei/lda-c/index.html. 9 | 10 | This file is part of slda. 11 | 12 | slda is free software; you can redistribute it and/or modify it under 13 | the terms of the GNU General Public License as published by the Free 14 | Software Foundation; either version 2 of the License, or (at your 15 | option) any later version. 16 | 17 | slda is distributed in the hope that it will be useful, but WITHOUT 18 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 19 | FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 20 | for more details. 
21 | 22 | You should have received a copy of the GNU General Public License 23 | along with this program; if not, write to the Free Software 24 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 25 | USA 26 | 27 | 28 | ------------------------------------------------------------------------ 29 | 30 | This is a C++ implementation of supervised latent Dirichlet allocation (sLDA) 31 | for classification. 32 | 33 | Note that this code requires the Gnu Scientific Library, http://www.gnu.org/software/gsl/ 34 | 35 | ------------------------------------------------------------------------ 36 | 37 | 38 | TABLE OF CONTENTS 39 | 40 | 41 | A. COMPILING 42 | 43 | B. ESTIMATION 44 | 45 | C. INFERENCE 46 | 47 | 48 | ------------------------------------------------------------------------ 49 | 50 | A. COMPILING 51 | 52 | Type "make" in a shell. Make sure the GSL is installed. 53 | 54 | 55 | ------------------------------------------------------------------------ 56 | 57 | B. ESTIMATION 58 | 59 | Estimate the model by executing: 60 | 61 | slda [est] [data] [label] [settings] [alpha] [k] [seeded/random/model_path] [directory] 62 | 63 | The saved models are in two files: 64 | 65 | .model is the model saved in the binary format, which is easy and 66 | fast to use for inference. 67 | 68 | .model.txt is the model saved in the text format, which is 69 | convenient for printing topics or analysis using python. 70 | 71 | 72 | The variational posterior Dirichlets are in: 73 | 74 | .gamma 75 | 76 | 77 | Data format 78 | 79 | (1) [data] is a file where each line is of the form: 80 | 81 | [M] [term_1]:[count] [term_2]:[count] ... [term_N]:[count] 82 | 83 | where [M] is the number of unique terms in the document, and the 84 | [count] associated with each term is how many times that term appeared 85 | in the document. 86 | 87 | (2) [label] is a file where each line is the corresponding label for [data]. 88 | The labels must be 0, 1, ..., C-1, if we have C classes. 89 | 90 | 91 | ------------------------------------------------------------------------ 92 | 93 | C. INFERENCE 94 | 95 | To perform inference on a different set of data (in the same format as 96 | for estimation), execute: 97 | 98 | slda [inf] [data] [label] [settings] [model] [directory] 99 | 100 | where [model] is the binary file from the estimation. 101 | 102 | The predictive labels are in: 103 | 104 | inf-labels.dat 105 | 106 | The variational posterior Dirichlets are in: 107 | 108 | inf-gamma.dat 109 | 110 | -------------------------------------------------------------------------------- /topic-competitors/slda/settings.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 
16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | #ifndef SETTINGS_H 22 | #define SETTINGS_H 23 | #include 24 | #include 25 | 26 | struct settings 27 | { 28 | float VAR_CONVERGED; 29 | int VAR_MAX_ITER; 30 | float EM_CONVERGED; 31 | int EM_MAX_ITER; 32 | int ESTIMATE_ALPHA; 33 | float PENALTY; 34 | 35 | void read_settings(char* filename) 36 | { 37 | FILE * fileptr; 38 | char alpha_action[100]; 39 | 40 | fileptr = fopen(filename, "r"); 41 | fscanf(fileptr, "var max iter %d\n", &this->VAR_MAX_ITER); 42 | fscanf(fileptr, "var convergence %f\n", &this->VAR_CONVERGED); 43 | fscanf(fileptr, "em max iter %d\n", &this->EM_MAX_ITER); 44 | fscanf(fileptr, "em convergence %f\n", &this->EM_CONVERGED); 45 | fscanf(fileptr, "L2 penalty %f\n", &this->PENALTY); 46 | 47 | fscanf(fileptr, "alpha %s", alpha_action); 48 | if (strcmp(alpha_action, "fixed") == 0) 49 | { 50 | this->ESTIMATE_ALPHA = 0; 51 | printf("alpha is fixed ...\n"); 52 | } 53 | else 54 | { 55 | this->ESTIMATE_ALPHA = 1; 56 | printf("alpha is esimated ...\n"); 57 | } 58 | fclose(fileptr); 59 | printf("var max iter %d\n", this->VAR_MAX_ITER); 60 | printf("var convergence %.2E\n", this->VAR_CONVERGED); 61 | printf("em max iter %d\n", this->EM_MAX_ITER); 62 | printf("em convergence %.2E\n", this->EM_CONVERGED); 63 | printf("L2 penalty %.2E\n", this->PENALTY); 64 | } 65 | }; 66 | 67 | #endif // SETTINGS_H 68 | 69 | -------------------------------------------------------------------------------- /topic-competitors/slda/settings.txt: -------------------------------------------------------------------------------- 1 | var max iter 20 2 | var convergence 1e-3 3 | em max iter 50 4 | em convergence 1e-4 5 | L2 penalty 0.01 6 | alpha fixed 7 | -------------------------------------------------------------------------------- /topic-competitors/slda/slda: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/slda/slda -------------------------------------------------------------------------------- /topic-competitors/slda/slda.exe: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topic-competitors/slda/slda.exe -------------------------------------------------------------------------------- /topic-competitors/slda/slda.h: -------------------------------------------------------------------------------- 1 | // (C) Copyright 2009, Chong Wang, David Blei and Li Fei-Fei 2 | 3 | // written by Chong Wang, chongw@cs.princeton.edu 4 | 5 | // This file is part of slda. 6 | 7 | // slda is free software; you can redistribute it and/or modify it under 8 | // the terms of the GNU General Public License as published by the Free 9 | // Software Foundation; either version 2 of the License, or (at your 10 | // option) any later version. 11 | 12 | // slda is distributed in the hope that it will be useful, but WITHOUT 13 | // ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 | // FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 | // for more details. 
16 | 17 | // You should have received a copy of the GNU General Public License 18 | // along with this program; if not, write to the Free Software 19 | // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 20 | // USA 21 | 22 | #ifndef SLDA_H 23 | #define SLDA_H 24 | #include "settings.h" 25 | #include "corpus.h" 26 | 27 | typedef struct { 28 | double * z_bar_m; 29 | double * z_bar_var; 30 | } z_stat; 31 | 32 | typedef struct { 33 | double ** word_ss; 34 | double * word_total_ss; 35 | int num_docs; 36 | z_stat * z_bar; 37 | int * labels; 38 | int * tot_labels; 39 | } suffstats; 40 | 41 | class slda 42 | { 43 | public: 44 | slda(); 45 | ~slda(); 46 | void free_model(); 47 | void init(double alpha_, int num_topics_, const corpus * c); 48 | void v_em(corpus * c, const settings * setting, 49 | const char * start, const char * directory); 50 | 51 | void save_model(const char * filename); 52 | void save_model_text(const char * filename); 53 | void load_model(const char * model_filename); 54 | void infer_only(corpus * c, const settings * setting, 55 | const char * directory); 56 | 57 | suffstats * new_suffstats(int num_docs); 58 | void free_suffstats(suffstats * ss); 59 | void zero_initialize_ss(suffstats * ss); 60 | void random_initialize_ss(suffstats * ss, corpus * c); 61 | void corpus_initialize_ss(suffstats* ss, corpus * c); 62 | void load_model_initialize_ss(suffstats* ss, corpus * c); 63 | void mle(suffstats * ss, int eta_update, const settings * setting); 64 | 65 | double doc_e_step(document* doc, double* gamma, double** phi, suffstats * ss, int eta_update, const settings * setting); 66 | 67 | double lda_inference(document* doc, double* var_gamma, double** phi, const settings * setting); 68 | double lda_compute_likelihood(document* doc, double** phi, double* var_gamma); 69 | double slda_inference(document* doc, double* var_gamma, double** phi, const settings * setting); 70 | double slda_compute_likelihood(document* doc, double** phi, double* var_gamma); 71 | 72 | void save_gamma(char* filename, double** gamma, int num_docs); 73 | void write_word_assignment(FILE* f, document* doc, double** phi); 74 | 75 | 76 | public: 77 | double alpha; // the parameter for the dirichlet 78 | int num_topics; 79 | int num_classes; 80 | int size_vocab; 81 | 82 | double ** log_prob_w; //the log of the topic distribution 83 | double ** eta; //softmax regression, in general, there are num_classes-1 etas, we don't need a intercept here, since \sum_i \bar{z_i} = 1 84 | }; 85 | 86 | #endif // SLDA_H 87 | 88 | -------------------------------------------------------------------------------- /topic-competitors/slda/test-label.dat: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 0 5 | 0 6 | 0 7 | 0 8 | 0 9 | 0 10 | 0 11 | 0 12 | 0 13 | 0 14 | 0 15 | 0 16 | 0 17 | 0 18 | 0 19 | 0 20 | 0 21 | 0 22 | 0 23 | 0 24 | 0 25 | 0 26 | 0 27 | 0 28 | 0 29 | 0 30 | 0 31 | 0 32 | 0 33 | 0 34 | 0 35 | 0 36 | 0 37 | 0 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 0 44 | 0 45 | 0 46 | 0 47 | 0 48 | 0 49 | 0 50 | 0 51 | 0 52 | 0 53 | 0 54 | 0 55 | 0 56 | 0 57 | 0 58 | 0 59 | 0 60 | 0 61 | 0 62 | 0 63 | 0 64 | 0 65 | 0 66 | 0 67 | 0 68 | 0 69 | 0 70 | 0 71 | 0 72 | 0 73 | 0 74 | 0 75 | 0 76 | 0 77 | 0 78 | 0 79 | 0 80 | 0 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 0 87 | 0 88 | 0 89 | 0 90 | 0 91 | 0 92 | 0 93 | 0 94 | 0 95 | 0 96 | 0 97 | 0 98 | 0 99 | 0 100 | 0 101 | 1 102 | 1 103 | 1 104 | 1 105 | 1 106 | 1 107 | 1 108 | 1 109 | 1 110 | 1 111 | 1 112 | 1 113 | 1 114 | 1 
115 | 1 116 | 1 117 | 1 118 | 1 119 | 1 120 | 1 121 | 1 122 | 1 123 | 1 124 | 1 125 | 1 126 | 1 127 | 1 128 | 1 129 | 1 130 | 1 131 | 1 132 | 1 133 | 1 134 | 1 135 | 1 136 | 1 137 | 1 138 | 1 139 | 1 140 | 1 141 | 1 142 | 1 143 | 1 144 | 1 145 | 1 146 | 1 147 | 1 148 | 1 149 | 1 150 | 1 151 | 1 152 | 1 153 | 1 154 | 1 155 | 1 156 | 1 157 | 1 158 | 1 159 | 1 160 | 1 161 | 1 162 | 1 163 | 1 164 | 1 165 | 1 166 | 1 167 | 1 168 | 1 169 | 1 170 | 1 171 | 1 172 | 1 173 | 1 174 | 1 175 | 1 176 | 1 177 | 1 178 | 1 179 | 1 180 | 1 181 | 1 182 | 1 183 | 1 184 | 1 185 | 1 186 | 1 187 | 1 188 | 1 189 | 1 190 | 1 191 | 1 192 | 1 193 | 1 194 | 1 195 | 1 196 | 1 197 | 1 198 | 1 199 | 1 200 | 1 201 | 2 202 | 2 203 | 2 204 | 2 205 | 2 206 | 2 207 | 2 208 | 2 209 | 2 210 | 2 211 | 2 212 | 2 213 | 2 214 | 2 215 | 2 216 | 2 217 | 2 218 | 2 219 | 2 220 | 2 221 | 2 222 | 2 223 | 2 224 | 2 225 | 2 226 | 2 227 | 2 228 | 2 229 | 2 230 | 2 231 | 2 232 | 2 233 | 2 234 | 2 235 | 2 236 | 2 237 | 2 238 | 2 239 | 2 240 | 2 241 | 2 242 | 2 243 | 2 244 | 2 245 | 2 246 | 2 247 | 2 248 | 2 249 | 2 250 | 2 251 | 2 252 | 2 253 | 2 254 | 2 255 | 2 256 | 2 257 | 2 258 | 2 259 | 2 260 | 2 261 | 2 262 | 2 263 | 2 264 | 2 265 | 2 266 | 2 267 | 2 268 | 2 269 | 2 270 | 2 271 | 2 272 | 2 273 | 2 274 | 2 275 | 2 276 | 2 277 | 2 278 | 2 279 | 2 280 | 2 281 | 2 282 | 2 283 | 2 284 | 2 285 | 2 286 | 2 287 | 2 288 | 2 289 | 2 290 | 2 291 | 2 292 | 2 293 | 2 294 | 2 295 | 2 296 | 2 297 | 2 298 | 2 299 | 2 300 | 2 301 | 3 302 | 3 303 | 3 304 | 3 305 | 3 306 | 3 307 | 3 308 | 3 309 | 3 310 | 3 311 | 3 312 | 3 313 | 3 314 | 3 315 | 3 316 | 3 317 | 3 318 | 3 319 | 3 320 | 3 321 | 3 322 | 3 323 | 3 324 | 3 325 | 3 326 | 3 327 | 3 328 | 3 329 | 3 330 | 3 331 | 3 332 | 3 333 | 3 334 | 3 335 | 3 336 | 3 337 | 3 338 | 3 339 | 3 340 | 3 341 | 3 342 | 3 343 | 3 344 | 3 345 | 3 346 | 3 347 | 3 348 | 3 349 | 3 350 | 3 351 | 3 352 | 3 353 | 3 354 | 3 355 | 3 356 | 3 357 | 3 358 | 3 359 | 3 360 | 3 361 | 3 362 | 3 363 | 3 364 | 3 365 | 3 366 | 3 367 | 3 368 | 3 369 | 3 370 | 3 371 | 3 372 | 3 373 | 3 374 | 3 375 | 3 376 | 3 377 | 3 378 | 3 379 | 3 380 | 3 381 | 3 382 | 3 383 | 3 384 | 3 385 | 3 386 | 3 387 | 3 388 | 3 389 | 3 390 | 3 391 | 3 392 | 3 393 | 3 394 | 3 395 | 3 396 | 3 397 | 3 398 | 3 399 | 3 400 | 3 401 | 4 402 | 4 403 | 4 404 | 4 405 | 4 406 | 4 407 | 4 408 | 4 409 | 4 410 | 4 411 | 4 412 | 4 413 | 4 414 | 4 415 | 4 416 | 4 417 | 4 418 | 4 419 | 4 420 | 4 421 | 4 422 | 4 423 | 4 424 | 4 425 | 4 426 | 4 427 | 4 428 | 4 429 | 4 430 | 4 431 | 4 432 | 4 433 | 4 434 | 4 435 | 4 436 | 4 437 | 4 438 | 4 439 | 4 440 | 4 441 | 4 442 | 4 443 | 4 444 | 4 445 | 4 446 | 4 447 | 4 448 | 4 449 | 4 450 | 4 451 | 4 452 | 4 453 | 4 454 | 4 455 | 4 456 | 4 457 | 4 458 | 4 459 | 4 460 | 4 461 | 4 462 | 4 463 | 4 464 | 4 465 | 4 466 | 4 467 | 4 468 | 4 469 | 4 470 | 4 471 | 4 472 | 4 473 | 4 474 | 4 475 | 4 476 | 4 477 | 4 478 | 4 479 | 4 480 | 4 481 | 4 482 | 4 483 | 4 484 | 4 485 | 4 486 | 4 487 | 4 488 | 4 489 | 4 490 | 4 491 | 4 492 | 4 493 | 4 494 | 4 495 | 4 496 | 4 497 | 4 498 | 4 499 | 4 500 | 4 501 | 5 502 | 5 503 | 5 504 | 5 505 | 5 506 | 5 507 | 5 508 | 5 509 | 5 510 | 5 511 | 5 512 | 5 513 | 5 514 | 5 515 | 5 516 | 5 517 | 5 518 | 5 519 | 5 520 | 5 521 | 5 522 | 5 523 | 5 524 | 5 525 | 5 526 | 5 527 | 5 528 | 5 529 | 5 530 | 5 531 | 5 532 | 5 533 | 5 534 | 5 535 | 5 536 | 5 537 | 5 538 | 5 539 | 5 540 | 5 541 | 5 542 | 5 543 | 5 544 | 5 545 | 5 546 | 5 547 | 5 548 | 5 549 | 5 550 | 5 551 | 5 552 | 5 553 | 5 554 | 5 555 | 5 556 | 5 557 | 5 558 | 5 
559 | 5 560 | 5 561 | 5 562 | 5 563 | 5 564 | 5 565 | 5 566 | 5 567 | 5 568 | 5 569 | 5 570 | 5 571 | 5 572 | 5 573 | 5 574 | 5 575 | 5 576 | 5 577 | 5 578 | 5 579 | 5 580 | 5 581 | 5 582 | 5 583 | 5 584 | 5 585 | 5 586 | 5 587 | 5 588 | 5 589 | 5 590 | 5 591 | 5 592 | 5 593 | 5 594 | 5 595 | 5 596 | 5 597 | 5 598 | 5 599 | 5 600 | 5 601 | 6 602 | 6 603 | 6 604 | 6 605 | 6 606 | 6 607 | 6 608 | 6 609 | 6 610 | 6 611 | 6 612 | 6 613 | 6 614 | 6 615 | 6 616 | 6 617 | 6 618 | 6 619 | 6 620 | 6 621 | 6 622 | 6 623 | 6 624 | 6 625 | 6 626 | 6 627 | 6 628 | 6 629 | 6 630 | 6 631 | 6 632 | 6 633 | 6 634 | 6 635 | 6 636 | 6 637 | 6 638 | 6 639 | 6 640 | 6 641 | 6 642 | 6 643 | 6 644 | 6 645 | 6 646 | 6 647 | 6 648 | 6 649 | 6 650 | 6 651 | 6 652 | 6 653 | 6 654 | 6 655 | 6 656 | 6 657 | 6 658 | 6 659 | 6 660 | 6 661 | 6 662 | 6 663 | 6 664 | 6 665 | 6 666 | 6 667 | 6 668 | 6 669 | 6 670 | 6 671 | 6 672 | 6 673 | 6 674 | 6 675 | 6 676 | 6 677 | 6 678 | 6 679 | 6 680 | 6 681 | 6 682 | 6 683 | 6 684 | 6 685 | 6 686 | 6 687 | 6 688 | 6 689 | 6 690 | 6 691 | 6 692 | 6 693 | 6 694 | 6 695 | 6 696 | 6 697 | 6 698 | 6 699 | 6 700 | 6 701 | 7 702 | 7 703 | 7 704 | 7 705 | 7 706 | 7 707 | 7 708 | 7 709 | 7 710 | 7 711 | 7 712 | 7 713 | 7 714 | 7 715 | 7 716 | 7 717 | 7 718 | 7 719 | 7 720 | 7 721 | 7 722 | 7 723 | 7 724 | 7 725 | 7 726 | 7 727 | 7 728 | 7 729 | 7 730 | 7 731 | 7 732 | 7 733 | 7 734 | 7 735 | 7 736 | 7 737 | 7 738 | 7 739 | 7 740 | 7 741 | 7 742 | 7 743 | 7 744 | 7 745 | 7 746 | 7 747 | 7 748 | 7 749 | 7 750 | 7 751 | 7 752 | 7 753 | 7 754 | 7 755 | 7 756 | 7 757 | 7 758 | 7 759 | 7 760 | 7 761 | 7 762 | 7 763 | 7 764 | 7 765 | 7 766 | 7 767 | 7 768 | 7 769 | 7 770 | 7 771 | 7 772 | 7 773 | 7 774 | 7 775 | 7 776 | 7 777 | 7 778 | 7 779 | 7 780 | 7 781 | 7 782 | 7 783 | 7 784 | 7 785 | 7 786 | 7 787 | 7 788 | 7 789 | 7 790 | 7 791 | 7 792 | 7 793 | 7 794 | 7 795 | 7 796 | 7 797 | 7 798 | 7 799 | 7 800 | 7 801 | -------------------------------------------------------------------------------- /topic-competitors/slda/train-label.dat: -------------------------------------------------------------------------------- 1 | 0 2 | 0 3 | 0 4 | 0 5 | 0 6 | 0 7 | 0 8 | 0 9 | 0 10 | 0 11 | 0 12 | 0 13 | 0 14 | 0 15 | 0 16 | 0 17 | 0 18 | 0 19 | 0 20 | 0 21 | 0 22 | 0 23 | 0 24 | 0 25 | 0 26 | 0 27 | 0 28 | 0 29 | 0 30 | 0 31 | 0 32 | 0 33 | 0 34 | 0 35 | 0 36 | 0 37 | 0 38 | 0 39 | 0 40 | 0 41 | 0 42 | 0 43 | 0 44 | 0 45 | 0 46 | 0 47 | 0 48 | 0 49 | 0 50 | 0 51 | 0 52 | 0 53 | 0 54 | 0 55 | 0 56 | 0 57 | 0 58 | 0 59 | 0 60 | 0 61 | 0 62 | 0 63 | 0 64 | 0 65 | 0 66 | 0 67 | 0 68 | 0 69 | 0 70 | 0 71 | 0 72 | 0 73 | 0 74 | 0 75 | 0 76 | 0 77 | 0 78 | 0 79 | 0 80 | 0 81 | 0 82 | 0 83 | 0 84 | 0 85 | 0 86 | 0 87 | 0 88 | 0 89 | 0 90 | 0 91 | 0 92 | 0 93 | 0 94 | 0 95 | 0 96 | 0 97 | 0 98 | 0 99 | 0 100 | 0 101 | 1 102 | 1 103 | 1 104 | 1 105 | 1 106 | 1 107 | 1 108 | 1 109 | 1 110 | 1 111 | 1 112 | 1 113 | 1 114 | 1 115 | 1 116 | 1 117 | 1 118 | 1 119 | 1 120 | 1 121 | 1 122 | 1 123 | 1 124 | 1 125 | 1 126 | 1 127 | 1 128 | 1 129 | 1 130 | 1 131 | 1 132 | 1 133 | 1 134 | 1 135 | 1 136 | 1 137 | 1 138 | 1 139 | 1 140 | 1 141 | 1 142 | 1 143 | 1 144 | 1 145 | 1 146 | 1 147 | 1 148 | 1 149 | 1 150 | 1 151 | 1 152 | 1 153 | 1 154 | 1 155 | 1 156 | 1 157 | 1 158 | 1 159 | 1 160 | 1 161 | 1 162 | 1 163 | 1 164 | 1 165 | 1 166 | 1 167 | 1 168 | 1 169 | 1 170 | 1 171 | 1 172 | 1 173 | 1 174 | 1 175 | 1 176 | 1 177 | 1 178 | 1 179 | 1 180 | 1 181 | 1 182 | 1 183 | 1 184 | 1 185 | 1 186 | 1 187 | 1 188 | 1 189 | 1 190 | 
1 191 | 1 192 | 1 193 | 1 194 | 1 195 | 1 196 | 1 197 | 1 198 | 1 199 | 1 200 | 1 201 | 2 202 | 2 203 | 2 204 | 2 205 | 2 206 | 2 207 | 2 208 | 2 209 | 2 210 | 2 211 | 2 212 | 2 213 | 2 214 | 2 215 | 2 216 | 2 217 | 2 218 | 2 219 | 2 220 | 2 221 | 2 222 | 2 223 | 2 224 | 2 225 | 2 226 | 2 227 | 2 228 | 2 229 | 2 230 | 2 231 | 2 232 | 2 233 | 2 234 | 2 235 | 2 236 | 2 237 | 2 238 | 2 239 | 2 240 | 2 241 | 2 242 | 2 243 | 2 244 | 2 245 | 2 246 | 2 247 | 2 248 | 2 249 | 2 250 | 2 251 | 2 252 | 2 253 | 2 254 | 2 255 | 2 256 | 2 257 | 2 258 | 2 259 | 2 260 | 2 261 | 2 262 | 2 263 | 2 264 | 2 265 | 2 266 | 2 267 | 2 268 | 2 269 | 2 270 | 2 271 | 2 272 | 2 273 | 2 274 | 2 275 | 2 276 | 2 277 | 2 278 | 2 279 | 2 280 | 2 281 | 2 282 | 2 283 | 2 284 | 2 285 | 2 286 | 2 287 | 2 288 | 2 289 | 2 290 | 2 291 | 2 292 | 2 293 | 2 294 | 2 295 | 2 296 | 2 297 | 2 298 | 2 299 | 2 300 | 2 301 | 3 302 | 3 303 | 3 304 | 3 305 | 3 306 | 3 307 | 3 308 | 3 309 | 3 310 | 3 311 | 3 312 | 3 313 | 3 314 | 3 315 | 3 316 | 3 317 | 3 318 | 3 319 | 3 320 | 3 321 | 3 322 | 3 323 | 3 324 | 3 325 | 3 326 | 3 327 | 3 328 | 3 329 | 3 330 | 3 331 | 3 332 | 3 333 | 3 334 | 3 335 | 3 336 | 3 337 | 3 338 | 3 339 | 3 340 | 3 341 | 3 342 | 3 343 | 3 344 | 3 345 | 3 346 | 3 347 | 3 348 | 3 349 | 3 350 | 3 351 | 3 352 | 3 353 | 3 354 | 3 355 | 3 356 | 3 357 | 3 358 | 3 359 | 3 360 | 3 361 | 3 362 | 3 363 | 3 364 | 3 365 | 3 366 | 3 367 | 3 368 | 3 369 | 3 370 | 3 371 | 3 372 | 3 373 | 3 374 | 3 375 | 3 376 | 3 377 | 3 378 | 3 379 | 3 380 | 3 381 | 3 382 | 3 383 | 3 384 | 3 385 | 3 386 | 3 387 | 3 388 | 3 389 | 3 390 | 3 391 | 3 392 | 3 393 | 3 394 | 3 395 | 3 396 | 3 397 | 3 398 | 3 399 | 3 400 | 3 401 | 4 402 | 4 403 | 4 404 | 4 405 | 4 406 | 4 407 | 4 408 | 4 409 | 4 410 | 4 411 | 4 412 | 4 413 | 4 414 | 4 415 | 4 416 | 4 417 | 4 418 | 4 419 | 4 420 | 4 421 | 4 422 | 4 423 | 4 424 | 4 425 | 4 426 | 4 427 | 4 428 | 4 429 | 4 430 | 4 431 | 4 432 | 4 433 | 4 434 | 4 435 | 4 436 | 4 437 | 4 438 | 4 439 | 4 440 | 4 441 | 4 442 | 4 443 | 4 444 | 4 445 | 4 446 | 4 447 | 4 448 | 4 449 | 4 450 | 4 451 | 4 452 | 4 453 | 4 454 | 4 455 | 4 456 | 4 457 | 4 458 | 4 459 | 4 460 | 4 461 | 4 462 | 4 463 | 4 464 | 4 465 | 4 466 | 4 467 | 4 468 | 4 469 | 4 470 | 4 471 | 4 472 | 4 473 | 4 474 | 4 475 | 4 476 | 4 477 | 4 478 | 4 479 | 4 480 | 4 481 | 4 482 | 4 483 | 4 484 | 4 485 | 4 486 | 4 487 | 4 488 | 4 489 | 4 490 | 4 491 | 4 492 | 4 493 | 4 494 | 4 495 | 4 496 | 4 497 | 4 498 | 4 499 | 4 500 | 4 501 | 5 502 | 5 503 | 5 504 | 5 505 | 5 506 | 5 507 | 5 508 | 5 509 | 5 510 | 5 511 | 5 512 | 5 513 | 5 514 | 5 515 | 5 516 | 5 517 | 5 518 | 5 519 | 5 520 | 5 521 | 5 522 | 5 523 | 5 524 | 5 525 | 5 526 | 5 527 | 5 528 | 5 529 | 5 530 | 5 531 | 5 532 | 5 533 | 5 534 | 5 535 | 5 536 | 5 537 | 5 538 | 5 539 | 5 540 | 5 541 | 5 542 | 5 543 | 5 544 | 5 545 | 5 546 | 5 547 | 5 548 | 5 549 | 5 550 | 5 551 | 5 552 | 5 553 | 5 554 | 5 555 | 5 556 | 5 557 | 5 558 | 5 559 | 5 560 | 5 561 | 5 562 | 5 563 | 5 564 | 5 565 | 5 566 | 5 567 | 5 568 | 5 569 | 5 570 | 5 571 | 5 572 | 5 573 | 5 574 | 5 575 | 5 576 | 5 577 | 5 578 | 5 579 | 5 580 | 5 581 | 5 582 | 5 583 | 5 584 | 5 585 | 5 586 | 5 587 | 5 588 | 5 589 | 5 590 | 5 591 | 5 592 | 5 593 | 5 594 | 5 595 | 5 596 | 5 597 | 5 598 | 5 599 | 5 600 | 5 601 | 6 602 | 6 603 | 6 604 | 6 605 | 6 606 | 6 607 | 6 608 | 6 609 | 6 610 | 6 611 | 6 612 | 6 613 | 6 614 | 6 615 | 6 616 | 6 617 | 6 618 | 6 619 | 6 620 | 6 621 | 6 622 | 6 623 | 6 624 | 6 625 | 6 626 | 6 627 | 6 628 | 6 629 | 6 630 | 6 631 | 6 632 | 6 633 | 6 634 | 6 
635 | 6 636 | 6 637 | 6 638 | 6 639 | 6 640 | 6 641 | 6 642 | 6 643 | 6 644 | 6 645 | 6 646 | 6 647 | 6 648 | 6 649 | 6 650 | 6 651 | 6 652 | 6 653 | 6 654 | 6 655 | 6 656 | 6 657 | 6 658 | 6 659 | 6 660 | 6 661 | 6 662 | 6 663 | 6 664 | 6 665 | 6 666 | 6 667 | 6 668 | 6 669 | 6 670 | 6 671 | 6 672 | 6 673 | 6 674 | 6 675 | 6 676 | 6 677 | 6 678 | 6 679 | 6 680 | 6 681 | 6 682 | 6 683 | 6 684 | 6 685 | 6 686 | 6 687 | 6 688 | 6 689 | 6 690 | 6 691 | 6 692 | 6 693 | 6 694 | 6 695 | 6 696 | 6 697 | 6 698 | 6 699 | 6 700 | 6 701 | 7 702 | 7 703 | 7 704 | 7 705 | 7 706 | 7 707 | 7 708 | 7 709 | 7 710 | 7 711 | 7 712 | 7 713 | 7 714 | 7 715 | 7 716 | 7 717 | 7 718 | 7 719 | 7 720 | 7 721 | 7 722 | 7 723 | 7 724 | 7 725 | 7 726 | 7 727 | 7 728 | 7 729 | 7 730 | 7 731 | 7 732 | 7 733 | 7 734 | 7 735 | 7 736 | 7 737 | 7 738 | 7 739 | 7 740 | 7 741 | 7 742 | 7 743 | 7 744 | 7 745 | 7 746 | 7 747 | 7 748 | 7 749 | 7 750 | 7 751 | 7 752 | 7 753 | 7 754 | 7 755 | 7 756 | 7 757 | 7 758 | 7 759 | 7 760 | 7 761 | 7 762 | 7 763 | 7 764 | 7 765 | 7 766 | 7 767 | 7 768 | 7 769 | 7 770 | 7 771 | 7 772 | 7 773 | 7 774 | 7 775 | 7 776 | 7 777 | 7 778 | 7 779 | 7 780 | 7 781 | 7 782 | 7 783 | 7 784 | 7 785 | 7 786 | 7 787 | 7 788 | 7 789 | 7 790 | 7 791 | 7 792 | 7 793 | 7 794 | 7 795 | 7 796 | 7 797 | 7 798 | 7 799 | 7 800 | 7 801 | -------------------------------------------------------------------------------- /topic-competitors/slda/utils.cpp: -------------------------------------------------------------------------------- 1 | #include "utils.h" 2 | 3 | /* 4 | * given log(a) and log(b), return log(a + b) 5 | * 6 | */ 7 | 8 | double log_sum(double log_a, double log_b) 9 | { 10 | double v; 11 | 12 | if (log_a < log_b) 13 | v = log_b+log(1 + exp(log_a-log_b)); 14 | else 15 | v = log_a+log(1 + exp(log_b-log_a)); 16 | 17 | return v; 18 | } 19 | 20 | /** 21 | * Proc to calculate the value of the trigamma, the second 22 | * derivative of the loggamma function. Accepts positive matrices. 23 | * From Abromowitz and Stegun. Uses formulas 6.4.11 and 6.4.12 with 24 | * recurrence formula 6.4.6. Each requires workspace at least 5 25 | * times the size of X. 
26 | * 27 | **/ 28 | 29 | double trigamma(double x) 30 | { 31 | double p; 32 | int i; 33 | 34 | x = x+6; 35 | p = 1/(x*x); 36 | p = (((((0.075757575757576*p-0.033333333333333)*p+0.0238095238095238)*p-0.033333333333333)*p+0.166666666666667)*p+1)/x+0.5*p; 37 | for (i=0; i<6 ;i++) 38 | { 39 | x = x-1; 40 | p = 1/(x*x)+p; 41 | } 42 | return p; 43 | } 44 | 45 | 46 | /* 47 | * taylor approximation of first derivative of the log gamma function 48 | * 49 | */ 50 | 51 | double digamma(double x) 52 | { 53 | double p; 54 | x = x+6; 55 | p = 1/(x*x); 56 | p = (((0.004166666666667*p-0.003968253986254)*p+0.008333333333333)*p-0.083333333333333)*p; 57 | p = p+log(x)-0.5/x-1/(x-1)-1/(x-2)-1/(x-3)-1/(x-4)-1/(x-5)-1/(x-6); 58 | return p; 59 | } 60 | 61 | /* 62 | * this log gamma function has the implementation of this function 63 | * 64 | */ 65 | 66 | /* double lgamma(double x) 67 | * { 68 | * double x0,x2,xp,gl,gl0; 69 | * int n,k; 70 | * static double a[] = { 71 | * 8.333333333333333e-02, 72 | * -2.777777777777778e-03, 73 | * 7.936507936507937e-04, 74 | * -5.952380952380952e-04, 75 | * 8.417508417508418e-04, 76 | * -1.917526917526918e-03, 77 | * 6.410256410256410e-03, 78 | * -2.955065359477124e-02, 79 | * 1.796443723688307e-01, 80 | * -1.39243221690590 81 | * }; 82 | * 83 | * x0 = x; 84 | * if (x <= 0.0) return 1e308; 85 | * else if ((x == 1.0) || (x == 2.0)) return 0.0; 86 | * else if (x <= 7.0) { 87 | * n = (int)(7-x); 88 | * x0 = x+n; 89 | * } 90 | * x2 = 1.0/(x0*x0); 91 | * xp = 2.0*M_PI; 92 | * gl0 = a[9]; 93 | * for (k=8;k>=0;k--) { 94 | * gl0 = gl0*x2 + a[k]; 95 | * } 96 | * gl = gl0/x0+0.5*log(xp)+(x0-0.5)*log(x0)-x0; 97 | * if (x <= 7.0) { 98 | * for (k=1;k<=n;k++) { 99 | * gl -= log(x0-1.0); 100 | * x0 -= 1.0; 101 | * } 102 | * } 103 | * return gl; 104 | * } 105 | */ 106 | 107 | 108 | 109 | /* 110 | * make directory 111 | * 112 | */ 113 | 114 | void make_directory(char* name) 115 | { 116 | mkdir(name, S_IRUSR|S_IWUSR|S_IXUSR); 117 | } 118 | 119 | 120 | /* 121 | * argmax 122 | * 123 | */ 124 | 125 | int argmax(double* x, int n) 126 | { 127 | int i, argmax = 0; 128 | double max = x[0]; 129 | 130 | for (i = 1; i < n; i++) 131 | { 132 | if (x[i] > max) 133 | { 134 | max = x[i]; 135 | argmax = i; 136 | } 137 | } 138 | return argmax; 139 | } 140 | 141 | /* 142 | * return the correponding index in the n(n+1)/2 given row and col 143 | * this is a upper triangle matrix, we can do this since this is 144 | * a symmetric matrix 145 | * 146 | */ 147 | 148 | int map_idx(int row, int col, int dim) 149 | { 150 | int swap, idx; 151 | if (row > col) 152 | { 153 | swap = row; 154 | row = col; 155 | col = swap; 156 | } 157 | //now row <= col 158 | idx = (2*dim - row + 1)*row/2 + col - row; 159 | return idx; 160 | } 161 | 162 | -------------------------------------------------------------------------------- /topic-competitors/slda/utils.h: -------------------------------------------------------------------------------- 1 | #ifndef UTILS_H 2 | #define UTILS_H 3 | 4 | #include 5 | #include 6 | #include 7 | #include 8 | #include 9 | #include 10 | 11 | double log_sum(double log_a, double log_b); 12 | double trigamma(double x); 13 | double digamma(double x); 14 | //double lgamma(double x); 15 | void make_directory(char* name); 16 | int argmax(double* x, int n); 17 | int map_idx(int row, int col, int dim); 18 | 19 | #endif 20 | 21 | -------------------------------------------------------------------------------- /topic-cosine.py: -------------------------------------------------------------------------------- 1 | import 
numpy as np 2 | import sys 3 | import pdb 4 | from utils import * 5 | 6 | topic_vec_file = sys.argv[1] 7 | T = load_matrix_from_text( topic_vec_file, "topic" ) 8 | K = T.shape[0] 9 | cosine_mat = [] 10 | for x in xrange(K): 11 | for y in xrange(x): 12 | if normF(T[x]) < 1e-6 or normF(T[y]) < 1e-6: 13 | continue 14 | cosine = np.dot( T[x], T[y] ) / normF(T[x]) / normF(T[y]) 15 | cosine_mat.append( [ cosine, x, y ] ) 16 | 17 | cosine_sum = 0 18 | for i in xrange( len(cosine_mat) ): 19 | cosine_sum += cosine_mat[i][0] 20 | 21 | print "Avg: %.5f" %( cosine_sum / len(cosine_mat) ) 22 | cosine_sorted = sorted( cosine_mat, key=lambda cosine_tuple: cosine_tuple[0], reverse=True ) 23 | for i in xrange(10): 24 | cosine, x, y = cosine_sorted[i] 25 | print "%d,%d: %.5f" %( x, y, cosine ) 26 | print "%d: %s" %( x, T[x][:10] ) 27 | print "%d: %s" %( y, T[y][:10] ) 28 | print 29 | -------------------------------------------------------------------------------- /topicvec-ext.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/topicvec-ext.pdf -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/askerlee/topicvec/1c5d2c5ec8a0e61c53a9aa416d493ebea324a6a8/utils.py --------------------------------------------------------------------------------
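A usage note on the bundled sLDA baseline (topic-competitors/slda): its readme.txt above describes the [data] and [label] input formats only in prose. Below is a minimal, hedged sketch, in the repository's Python 2 style, of how one might write a toy training pair in that format and then call the slda binary. The file names (toy-train-data.dat, toy-train-label.dat, toy-model) and the three-document corpus are hypothetical illustrations, not files in this repository; the command lines simply mirror the usage strings printed by main.cpp and readme.txt.

# Sketch of the sLDA input format from topic-competitors/slda/readme.txt:
#   data : each line is "[M] [term_1]:[count] ... [term_M]:[count]", M = number of unique terms
#   label: one integer per line, in 0 .. C-1 for C classes
# All file names below are hypothetical examples.
docs = [
    ( [ (0, 2), (3, 1), (7, 4) ], 0 ),   # ( [(term_id, count), ...], class label )
    ( [ (1, 1), (3, 2) ],         1 ),
    ( [ (2, 5), (5, 1), (6, 1) ], 1 ),
]

DATA  = open( "toy-train-data.dat",  "w" )
LABEL = open( "toy-train-label.dat", "w" )
for terms, label in docs:
    # M first, then each unique term as term_id:count
    fields = [ "%d" %len(terms) ] + [ "%d:%d" %(t, c) for t, c in terms ]
    DATA.write( " ".join(fields) + "\n" )
    LABEL.write( "%d\n" %label )
DATA.close()
LABEL.close()

# Estimation and inference, following the usage strings in main.cpp
# (alpha=1.0 and k=2 topics are arbitrary toy values; the [model] argument of
# "inf" is the binary .model file that estimation writes into toy-model):
#   ./slda est toy-train-data.dat toy-train-label.dat settings.txt 1.0 2 random toy-model
#   ./slda inf toy-test-data.dat toy-test-label.dat settings.txt toy-model/<name>.model toy-output

The uppercase file-handle names simply follow the convention used elsewhere in this repository (e.g. DOCVEC and DOCVECBOW in the Liu et al. conversion script above).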