├── activelearn ├── mmms.py ├── mmpm.py ├── oracle.py ├── qbc4.py ├── qbc_dist.py ├── uncert_dist.py ├── uncertain.py ├── uncertain2.py ├── uncertain3.py └── uncertain4.py ├── clustering ├── dpm.py └── irm.py ├── data ├── 4million.corpus ├── gen_cluto.rb ├── gen_corpus.rb ├── gen_libsvm.rb ├── henry-poe └── ohenry.corpus ├── difficulty └── wordbook.txt ├── dnn ├── README.md ├── cdcgan-svhn.ini ├── cdcgan-svhn.py ├── cgan-mnist.py ├── dcgan-svhn.py ├── e2emn.py └── gan-mnist.py ├── extractcontent ├── test.py ├── train.py └── webextract.py ├── hac ├── fselect.rb ├── hac.rb └── naive_hac.rb ├── irt └── irt.rb ├── langdetect ├── common.rb ├── crawler.rb ├── detect.rb ├── filetest.rb ├── model.json ├── test.rb └── train.rb ├── lda ├── hdp_online.py ├── hdplda.py ├── hdplda2.py ├── itm.py ├── lda.py ├── lda.r ├── lda_cvb0.py ├── lda_test.py ├── lda_test2.py ├── ldacvb0_cpp │ ├── README.md │ ├── ldacvb0.sln │ ├── ldacvb0 │ │ ├── ldacvb0.cpp │ │ ├── ldacvb0.hpp │ │ └── ldacvb0.vcxproj │ └── ldacvb0_test │ │ ├── ldacvb0_test.vcxproj │ │ └── test.cpp ├── llda.py ├── llda_nltk.py ├── test_hdplda2.py ├── twentygroups.py └── vocabulary.py ├── lib ├── extract_gutenberg.rb ├── infinitive.rb ├── inflist.txt └── wordbook.txt ├── lr └── lr.r ├── misc ├── linear_regression.r ├── linear_regression.xlsx └── zipf.rb ├── neural ├── adult.rb ├── classification.rb ├── classification.txt ├── curve_fitting.rb ├── iris.rb ├── mnist.rb ├── mnist2.rb ├── neural.rb └── xor.rb ├── ngram ├── knlm.py ├── knsmooth.py ├── ngram.rb ├── rnnlm.py └── wordcount.py ├── pca ├── bayes.r ├── ema.r └── ppca.r ├── perceptron ├── avg_percep_test.rb ├── percep_test.rb ├── test.rb └── train.rb ├── privacy └── randomized-response │ ├── rr-gibbs.py │ ├── rr-mle.py │ └── rr-vb.py ├── sampling └── hmc.r ├── semisupervised └── ssnb.py ├── sequence ├── crf.py ├── hmm.py ├── pg.py └── testcrf.py ├── trie ├── da.py ├── test_da.py └── trie.py └── unsupervised ├── bs.rb ├── ema.r ├── plsi.rb ├── vb.r └── vb_result.csv /activelearn/mmms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning for 20 newsgroups : MCMI[min] with margin sampling 5 | # MCMI[min] refers to (Guo+ IJCAI-07) 6 | # Yuhong Guo and Russ Greiner, Optimistic Active Learning using Mutual Information, IJCAI-07 7 | 8 | # This code is available under the MIT License. 9 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
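#
# Summary of the code below: after an initial fit on one random seed document per
# class, each round the current classifier scores the unlabeled pool and keeps the
# 30 candidates with the smallest margin; for each candidate x and every possible
# label y, a classifier is retrained on the labeled set plus (x, y), and the pair
# minimizing total predictive entropy over the pool is queried (the optimistic
# MCMI[min] criterion). Margin sampling is only used to pre-filter the candidates.
#
# Usage sketch -- a hypothetical invocation, not from the original repository;
# the options are the ones defined in main() below:
#   python mmms.py --lr2 1.0 -K 4 -n 100 -N 10 --seed 0
# i.e. L2-regularized logistic regression (C=1.0), the first 4 newsgroups, grow
# the training set to 100 examples, 10 random restarts, fixed random seed.
# Per-trial accuracies are appended to output_mmms_<K>_<n>.txt.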
10 | 11 | import optparse 12 | import numpy 13 | import scipy.sparse 14 | import sklearn.datasets 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.naive_bayes import MultinomialNB 17 | 18 | def activelearn(data, test, train, pool, classifier_factory, max_train, seed): 19 | numpy.random.seed(seed) 20 | 21 | # copy initial indexes of training and pool 22 | train = list(train) 23 | pool = list(pool) 24 | 25 | accuracies = [] 26 | Z = len(test.target) 27 | K = data.target.max() + 1 28 | while len(train) < max_train: 29 | if len(accuracies) > 0: 30 | predict = classifier.predict_proba(data.data[pool,:]) 31 | predict.sort(axis=1) 32 | margin = predict[:,-1] - predict[:,-2] 33 | candidate = margin.argsort()[:30] 34 | 35 | i_star = y_i_star = None 36 | f_i_star = 1e300 37 | print "i\ty_i\t(actual)\tf_i\tmargin" 38 | for i in candidate: 39 | x = pool[i] 40 | L_x_i = data.data[train + [x], :] 41 | L_y = data.target[train] 42 | entropies = numpy.zeros(K) 43 | for y in xrange(K): 44 | l = list(L_y) 45 | l.append(y) 46 | phi_i = classifier_factory().fit(L_x_i, l) 47 | 48 | p = phi_i.predict_proba(data.data[pool]) 49 | entropies[y] = -(numpy.nan_to_num(numpy.log(p)) * p).sum() 50 | y_i = entropies.argmin() 51 | f_i = entropies[y_i] 52 | print "%d\t%d\t%d\t%f\t%f" % (x, y_i, data.target[x], f_i, margin[i]) 53 | if f_i < f_i_star: 54 | i_star = i 55 | y_i_star = y_i 56 | f_i_star = f_i 57 | 58 | x = pool[i_star] 59 | print "select : %d (MM=%f, predict=%d, actual=%d)" % (x, f_i_star, y_i_star, data.target[x]) 60 | train.append(x) 61 | del pool[i_star] 62 | 63 | classifier = classifier_factory().fit(data.data[train,:], data.target[train]) 64 | accuracy = classifier.score(test.data, test.target) 65 | print "%d : %f" % (len(train), accuracy) 66 | accuracies.append((len(train), accuracy)) 67 | 68 | return accuracies 69 | 70 | def main(): 71 | parser = optparse.OptionParser() 72 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 73 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 74 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 75 | 76 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=4) 77 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=100) 78 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 79 | parser.add_option("-N", dest="trying", type="int", help="number of trying", default=100) 80 | 81 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 82 | (opt, args) = parser.parse_args() 83 | 84 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 85 | print "(train size, voca size) : (%d, %d)" % data.data.shape 86 | 87 | if opt.class_size: 88 | index = data.target < opt.class_size 89 | a = data.data.toarray()[index, :] 90 | data.data = scipy.sparse.csr_matrix(a) 91 | data.target = data.target[index] 92 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 93 | 94 | 95 | N_CLASS = data.target.max() + 1 96 | if opt.training: 97 | train = [int(x) for x in opt.training.split(",")] 98 | else: 99 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 100 | print "indexes of training set : ", ",".join("%d" % x for x in train) 101 | 102 | pool = range(data.data.shape[0]) 103 | 
for x in train: pool.remove(x) 104 | 105 | classifier_factory = None 106 | if opt.logistic_l1: 107 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 108 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 109 | elif opt.logistic_l2: 110 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 111 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 112 | elif opt.naive_bayes: 113 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 114 | classifier_factory = lambda: MultinomialNB(alpha=opt.naive_bayes) 115 | 116 | if classifier_factory: 117 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 118 | print "(test size, voca size) : (%d, %d)" % test.data.shape 119 | if opt.class_size: 120 | index = test.target < opt.class_size 121 | a = test.data.toarray()[index, :] 122 | test.data = scipy.sparse.csr_matrix(a) 123 | test.target = test.target[index] 124 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 125 | 126 | print "score for all data: %f" % classifier_factory().fit(data.data, data.target).score(test.data, test.target) 127 | 128 | for n in xrange(opt.trying): 129 | print "trying.. %d" % n 130 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 131 | pool = range(data.data.shape[0]) 132 | for x in train: pool.remove(x) 133 | results = activelearn(data, test, train, pool, classifier_factory, opt.max_train, opt.seed) 134 | 135 | with open("output_mmms_%d_%d.txt" % (opt.class_size, opt.max_train), "ab") as f: 136 | f.write("\t".join("%f" % x[1] for x in results)) 137 | f.write("\n") 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /activelearn/mmpm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning for 20 newsgroups with MM+M (Guo+ IJCAI-07) 5 | # Yuhong Guo and Russ Greiner, Optimistic Active Learning using Mutual Information, IJCAI-07 6 | 7 | # This code is available under the MIT License. 8 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
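#
# Summary of the code below: unlike mmms.py, every pool point is scored each round
# (no margin pre-filter). For each candidate x the optimistic label y* minimizing
# pool entropy is found; if the oracle's actual label disagrees with y*, a second
# point -- the one with maximum predictive entropy -- is queried as well, which is
# the "+M" step of MM+M.
#
# Usage sketch (hypothetical invocation; the options are defined in main() below --
# note that only --lr1/--lr2 install a classifier factory in this script, while
# the --nb option is parsed but not used):
#   python mmpm.py --lr2 1.0 -K 4 -n 30 --seed 0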
9 | 10 | import optparse 11 | import numpy 12 | import scipy.sparse 13 | import sklearn.datasets 14 | from sklearn.linear_model import LogisticRegression 15 | 16 | def activelearn(data, test, train, pool, classifier_factory, max_train, seed): 17 | numpy.random.seed(seed) 18 | 19 | # copy initial indexes of training and pool 20 | train = list(train) 21 | pool = list(pool) 22 | 23 | accuracies = [] 24 | Z = len(test.target) 25 | K = data.target.max() + 1 26 | while len(train) < max_train: 27 | if len(accuracies) > 0: 28 | predict = classifier.predict_proba(data.data[pool,:]) 29 | entbase = -numpy.nan_to_num(predict * numpy.log(predict)).sum(axis=1) 30 | predict.sort(axis=1) 31 | margin = predict[:,-1] - predict[:,-2] 32 | uncertain = predict[:,-1] 33 | 34 | i_star = y_i_star = None 35 | f_i_star = 1e300 36 | print "i\ty_i\tf_i\tuncertain\tmargin\tent" 37 | for i, x in enumerate(pool): 38 | L_x_i = data.data[train + [x], :] 39 | L_y = data.target[train] 40 | entropies = numpy.zeros(K) 41 | for y in xrange(K): 42 | l = list(L_y) 43 | l.append(y) 44 | phi_i = classifier_factory().fit(L_x_i, l) 45 | 46 | p = phi_i.predict_proba(data.data[pool]) 47 | entropies[y] = -(numpy.nan_to_num(numpy.log(p)) * p).sum() 48 | y_i = entropies.argmin() 49 | f_i = entropies[y_i] 50 | print "%d\t%d\t%f\t%f\t%f\t%f" % (x, y_i, f_i, uncertain[i], margin[i], entbase[i]) 51 | if f_i < f_i_star: 52 | i_star = i 53 | y_i_star = y_i 54 | f_i_star = f_i 55 | 56 | x = pool[i_star] 57 | print "select : %d (MM=%f, predict=%d, actual=%d)" % (x, f_i_star, y_i_star, data.target[x]) 58 | train.append(x) 59 | del pool[i_star] 60 | 61 | if data.target[x] != y_i_star: 62 | phi = classifier_factory().fit(data.data[train, :], data.target[train]) 63 | p = phi_i.predict_proba(data.data[pool]) 64 | i_star = (numpy.nan_to_num(numpy.log(p)) * p).sum(axis=1).argmin() 65 | 66 | x = pool[i_star] 67 | print "select : %d (actual=%d)" % (x, data.target[x]) 68 | train.append(x) 69 | del pool[i_star] 70 | 71 | classifier = classifier_factory().fit(data.data[train,:], data.target[train]) 72 | accuracy = classifier.score(test.data, test.target) 73 | print "%d : %f" % (len(train), accuracy) 74 | accuracies.append((len(train), accuracy)) 75 | 76 | return accuracies 77 | 78 | def main(): 79 | parser = optparse.OptionParser() 80 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 81 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 82 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 83 | 84 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 85 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=30) 86 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 87 | 88 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 89 | (opt, args) = parser.parse_args() 90 | 91 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 92 | print "(train size, voca size) : (%d, %d)" % data.data.shape 93 | 94 | if opt.class_size: 95 | index = data.target < opt.class_size 96 | a = data.data.toarray()[index, :] 97 | data.data = scipy.sparse.csr_matrix(a) 98 | data.target = data.target[index] 99 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 100 | 101 | 102 | N_CLASS = data.target.max() + 1 103 
| if opt.training: 104 | train = [int(x) for x in opt.training.split(",")] 105 | else: 106 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 107 | print "indexes of training set : ", ",".join("%d" % x for x in train) 108 | 109 | pool = range(data.data.shape[0]) 110 | for x in train: pool.remove(x) 111 | 112 | classifier_factory = None 113 | if opt.logistic_l1: 114 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 115 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 116 | elif opt.logistic_l2: 117 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 118 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 119 | else: 120 | pass 121 | 122 | if classifier_factory: 123 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 124 | print "(test size, voca size) : (%d, %d)" % test.data.shape 125 | if opt.class_size: 126 | index = test.target < opt.class_size 127 | a = test.data.toarray()[index, :] 128 | test.data = scipy.sparse.csr_matrix(a) 129 | test.target = test.target[index] 130 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 131 | 132 | print "score for all data: %f" % classifier_factory().fit(data.data, data.target).score(test.data, test.target) 133 | 134 | results = activelearn(data, test, train, pool, classifier_factory, opt.max_train, opt.seed) 135 | 136 | for x in results: 137 | print "%d\t%f" % x 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /activelearn/oracle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning for 20 newsgroups with Oracle and testset 5 | 6 | # This code is available under the MIT License. 7 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
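#
# Summary of the code below: an "oracle" upper-bound baseline. Each round it
# tentatively adds candidate pool points to the training set, retrains, and keeps
# the point that maximizes accuracy on the held-out test set itself (so it peeks
# at the test labels by design). -T restricts the search to a random subsample of
# the pool per round; the default (-1) searches the whole pool.
#
# Usage sketch (hypothetical invocation; the options are defined in main() below):
#   python oracle.py --lr2 1.0 -K 4 -n 30 -T 100 --seed 0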
8 | 9 | import optparse 10 | import numpy 11 | import scipy.sparse 12 | import sklearn.datasets 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.naive_bayes import MultinomialNB 15 | 16 | def activelearn(data, test, train, pool, classifier_factory, max_train, n_candidate, seed): 17 | numpy.random.seed(seed) 18 | 19 | # copy initial indexes of training and pool 20 | train = list(train) 21 | pool = list(pool) 22 | 23 | accuracies = [] 24 | Z = len(test.target) 25 | K = data.target.max() + 1 26 | while len(train) < max_train: 27 | if len(accuracies) > 0: 28 | i_star = None 29 | max_score = 0.0 30 | candidate = pool 31 | if 0 < n_candidate < len(pool): 32 | numpy.random.shuffle(pool) 33 | candidate = pool[:n_candidate] 34 | for i, x in enumerate(candidate): 35 | t = train + [x] 36 | s = classifier_factory().fit(data.data[t, :], data.target[t]).score(test.data, test.target) 37 | if max_score < s: 38 | print "%d\t%f" % (x, s) 39 | max_score = s 40 | i_star = i 41 | train.append(pool[i_star]) 42 | del pool[i_star] 43 | 44 | classifier = classifier_factory().fit(data.data[train,:], data.target[train]) 45 | accuracy = classifier.score(test.data, test.target) 46 | print "%d : %f" % (len(train), accuracy) 47 | accuracies.append((len(train), accuracy)) 48 | 49 | return accuracies 50 | 51 | def main(): 52 | parser = optparse.OptionParser() 53 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 54 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 55 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 56 | 57 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 58 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=30) 59 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 60 | parser.add_option("-T", dest="candidate", type="int", help="candidate size", default=-1) 61 | 62 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 63 | (opt, args) = parser.parse_args() 64 | 65 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 66 | print "(train size, voca size) : (%d, %d)" % data.data.shape 67 | 68 | if opt.class_size: 69 | index = data.target < opt.class_size 70 | a = data.data.toarray()[index, :] 71 | data.data = scipy.sparse.csr_matrix(a) 72 | data.target = data.target[index] 73 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 74 | 75 | 76 | N_CLASS = data.target.max() + 1 77 | if opt.training: 78 | train = [int(x) for x in opt.training.split(",")] 79 | else: 80 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 81 | print "indexes of training set : ", ",".join("%d" % x for x in train) 82 | 83 | pool = range(data.data.shape[0]) 84 | for x in train: pool.remove(x) 85 | 86 | classifier_factory = None 87 | if opt.logistic_l1: 88 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 89 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 90 | elif opt.logistic_l2: 91 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 92 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 93 | elif opt.naive_bayes: 94 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 95 | 
classifier_factory = lambda: MultinomialNB(alpha=opt.naive_bayes) 96 | 97 | if classifier_factory: 98 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 99 | print "(test size, voca size) : (%d, %d)" % test.data.shape 100 | if opt.class_size: 101 | index = test.target < opt.class_size 102 | a = test.data.toarray()[index, :] 103 | test.data = scipy.sparse.csr_matrix(a) 104 | test.target = test.target[index] 105 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 106 | 107 | print "score for all data: %f" % classifier_factory().fit(data.data, data.target).score(test.data, test.target) 108 | 109 | results = activelearn(data, test, train, pool, classifier_factory, opt.max_train, opt.candidate, opt.seed) 110 | 111 | for x in results: 112 | print "%d\t%f" % x 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /activelearn/qbc4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Query-By-Committee) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | import optparse 9 | import numpy 10 | import sklearn.datasets 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.naive_bayes import MultinomialNB 13 | 14 | def activelearn(results, data, test, strategy, train, pool, classifier_factories, max_train, densities): 15 | print strategy 16 | 17 | # copy initial indexes of training and pool 18 | train = list(train) 19 | pool = list(pool) 20 | 21 | accuracies = [] 22 | Z = len(test.target) 23 | while len(train) < max_train: 24 | if len(accuracies) > 0: 25 | if strategy == "random": 26 | x = numpy.random.randint(len(pool)) 27 | else: 28 | if strategy == "vote entropy": 29 | p = numpy.array([c.predict(data.data[pool,:]) for c in classifiers]) 30 | # This is equivalent to Vote Entropy when # of classifiers = 3 31 | x = ((p[:,0:2]==p[:,1:3]).sum(axis=1) + (p[:,0]==p[:,2])) 32 | elif strategy == "average KL": 33 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 34 | pc = p.mean(axis=0) # N * K 35 | x = numpy.nan_to_num(p * numpy.log(pc / p)).sum(axis=2).sum(axis=0) 36 | elif strategy == "qbc+margin sampling": 37 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 38 | pc = p.mean(axis=0) # N * K 39 | pc.sort(axis=1) 40 | x = pc[:,-1] - pc[:,-2] 41 | if densities != None: x *= densities[pool] 42 | x = x.argmin() 43 | train.append(pool[x]) 44 | del pool[x] 45 | 46 | classifiers = [f().fit(data.data[train,:], data.target[train]) for f in classifier_factories] 47 | 48 | predict = sum(c.predict_proba(test.data) for c in classifiers) 49 | correct = (predict.argmax(axis=1) == test.target).sum() 50 | accuracy = float(correct) / Z 51 | print "%s %d : %d / %d = %f" % (strategy, len(train), correct, Z, accuracy) 52 | accuracies.append(accuracy) 53 | 54 | results.append((strategy, accuracies)) 55 | 56 | def main(): 57 | parser = optparse.OptionParser() 58 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 59 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 60 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 61 | 
62 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=300) 63 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 64 | 65 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 66 | 67 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 68 | (opt, args) = parser.parse_args() 69 | numpy.random.seed(opt.seed) 70 | 71 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 72 | print "(train size, voca size) : (%d, %d)" % data.data.shape 73 | 74 | N_CLASS = data.target.max() + 1 75 | if opt.training: 76 | train = [int(x) for x in opt.training.split(",")] 77 | else: 78 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 79 | print "indexes of training set : ", ",".join("%d" % x for x in train) 80 | 81 | pool = range(data.data.shape[0]) 82 | for x in train: pool.remove(x) 83 | 84 | classifier_factories = [] 85 | if opt.logistic_l1: 86 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 87 | classifier_factories.append(lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1)) 88 | if opt.logistic_l2: 89 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 90 | classifier_factories.append(lambda: LogisticRegression(C=opt.logistic_l2)) 91 | if opt.naive_bayes: 92 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 93 | classifier_factories.append(lambda: MultinomialNB(alpha=opt.naive_bayes)) 94 | 95 | if len(classifier_factories) >= 2: 96 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 97 | print "(test size, voca size) : (%d, %d)" % test.data.shape 98 | 99 | densities = None 100 | if opt.beta > 0: 101 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 102 | 103 | methods = ["random", "vote entropy", "average KL", "qbc+margin sampling", ] 104 | results = [] 105 | for x in methods: 106 | activelearn(results, data, test, x, train, pool, classifier_factories, opt.max_train, densities) 107 | 108 | print "\t%s" % "\t".join(x[0] for x in results) 109 | d = len(train) 110 | for i in xrange(len(results[0][1])): 111 | print "%d\t%s" % (i+d, "\t".join("%f" % x[1][i] for x in results)) 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /activelearn/qbc_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Query-By-Committee) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
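#
# Summary of the code below: query-by-committee, where the committee is built from
# whichever of --lr1/--lr2/--nb are given (at least two are required). Strategies
# compared: random, vote entropy (the branch assumes a committee of exactly three
# members, as noted in activelearn), average KL divergence to the committee mean,
# and committee margin sampling. With -b > 0 the query score is weighted by an
# information-density term (each document's mean similarity to the corpus, raised
# to beta). Each of the -N trials appends one row per strategy to
# output_qbc_<max_train>.txt.
#
# Usage sketch (hypothetical invocation; the options are defined in main() below):
#   python qbc_dist.py --lr1 0.1 --lr2 1.0 --nb 0.01 -n 300 -N 10 -b 1.0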
7 | 8 | import optparse 9 | import numpy 10 | import sklearn.datasets 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.naive_bayes import MultinomialNB 13 | 14 | def activelearn(data, test, strategy, train, pool, classifier_factories, max_train, densities): 15 | # copy initial indexes of training and pool 16 | train = list(train) 17 | pool = list(pool) 18 | 19 | accuracies = [] 20 | Z = len(test.target) 21 | while len(train) < max_train: 22 | if len(accuracies) > 0: 23 | if strategy == "random": 24 | x = numpy.random.randint(len(pool)) 25 | else: 26 | if strategy == "vote entropy": 27 | p = numpy.array([c.predict(data.data[pool,:]) for c in classifiers]) 28 | # This is equivalent to Vote Entropy when # of classifiers = 3 29 | x = ((p[:,0:2]==p[:,1:3]).sum(axis=1) + (p[:,0]==p[:,2])) 30 | elif strategy == "average KL": 31 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 32 | pc = p.mean(axis=0) # N * K 33 | x = numpy.nan_to_num(p * numpy.log(pc / p)).sum(axis=2).sum(axis=0) 34 | elif strategy == "qbc+margin sampling": 35 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 36 | pc = p.mean(axis=0) # N * K 37 | pc.sort(axis=1) 38 | x = pc[:,-1] - pc[:,-2] 39 | if densities != None: x *= densities[pool] 40 | x = x.argmin() 41 | train.append(pool[x]) 42 | del pool[x] 43 | 44 | classifiers = [f().fit(data.data[train,:], data.target[train]) for f in classifier_factories] 45 | 46 | predict = sum(c.predict_proba(test.data) for c in classifiers) 47 | correct = (predict.argmax(axis=1) == test.target).sum() 48 | accuracy = float(correct) / Z 49 | print "%d : %d / %d = %f" % (len(train), correct, Z, accuracy) 50 | accuracies.append(accuracy) 51 | return accuracies 52 | 53 | def main(): 54 | parser = optparse.OptionParser() 55 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 56 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 57 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 58 | 59 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=300) 60 | parser.add_option("-N", dest="trying", type="int", help="number of trying", default=100) 61 | 62 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 63 | 64 | (opt, args) = parser.parse_args() 65 | 66 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 67 | print "(train size, voca size) : (%d, %d)" % data.data.shape 68 | 69 | N_CLASS = data.target.max() + 1 70 | 71 | classifier_factories = [] 72 | if opt.logistic_l1: 73 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 74 | classifier_factories.append(lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1)) 75 | if opt.logistic_l2: 76 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 77 | classifier_factories.append(lambda: LogisticRegression(C=opt.logistic_l2)) 78 | if opt.naive_bayes: 79 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 80 | classifier_factories.append(lambda: MultinomialNB(alpha=opt.naive_bayes)) 81 | 82 | if len(classifier_factories) >= 2: 83 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 84 | print "(test size, voca size) : (%d, %d)" % test.data.shape 85 | 86 | densities = None 87 | if opt.beta > 
0: 88 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 89 | 90 | methods = ["random", "vote entropy", "average KL", "qbc+margin sampling", ] 91 | results = [] 92 | for n in xrange(opt.trying): 93 | for method in methods: 94 | print "%s : %d" % (method, n) 95 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 96 | pool = range(data.data.shape[0]) 97 | for x in train: pool.remove(x) 98 | 99 | results = activelearn(data, test, method, train, pool, classifier_factories, opt.max_train, densities) 100 | 101 | d = len(train) 102 | with open("output_qbc_%d.txt" % opt.max_train, "ab") as f: 103 | f.write("%s\t%s\n" % (method, "\t".join("%f" % x for x in results))) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /activelearn/uncert_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling and Information Density) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | import optparse 9 | import numpy 10 | import scipy.sparse 11 | import sklearn.datasets 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.naive_bayes import MultinomialNB 14 | 15 | def activelearn(results, data, test, strategy, train, pool, classifier_factory, max_train, densities): 16 | # copy initial indexes of training and pool 17 | train = list(train) 18 | pool = list(pool) 19 | 20 | accuracies = [] 21 | while len(train) < max_train: 22 | if len(accuracies) > 0: 23 | if strategy == "random": 24 | x = numpy.random.randint(len(pool)) 25 | else: 26 | predict = cl.predict_proba(data.data[pool,:]) 27 | if strategy == "least confident": 28 | x = predict.max(axis=1)-1 29 | elif strategy == "margin sampling": 30 | predict.sort(axis=1) 31 | x = (predict[:,-1] - predict[:,-2]) 32 | elif strategy == "entropy-based": 33 | x = numpy.nan_to_num(predict * numpy.log(predict)).sum(axis=1) 34 | if densities != None: x *= densities[pool] 35 | x = x.argmin() 36 | train.append(pool[x]) 37 | del pool[x] 38 | 39 | cl = classifier_factory() 40 | cl.fit(data.data[train,:], data.target[train]) 41 | accuracy = cl.score(test.data, test.target) 42 | print "%d : %f" % (len(train), accuracy) 43 | accuracies.append(accuracy) 44 | 45 | results.append((strategy, accuracies)) 46 | 47 | 48 | def main(): 49 | parser = optparse.OptionParser() 50 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 51 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 52 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 53 | 54 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 55 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=100) 56 | parser.add_option("-N", dest="trying", type="int", help="number of trying", default=100) 57 | 58 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 59 | (opt, args) = parser.parse_args() 60 | 61 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 62 | print "(train size, voca size) : (%d, %d)" % data.data.shape 63 | 64 | if opt.class_size: 65 | 
index = data.target < opt.class_size 66 | a = data.data.toarray()[index, :] 67 | data.data = scipy.sparse.csr_matrix(a) 68 | data.target = data.target[index] 69 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 70 | 71 | classifier_factory = clz = None 72 | if opt.logistic_l1: 73 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 74 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 75 | clz = "lrl1" 76 | elif opt.logistic_l2: 77 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 78 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 79 | clz = "lrl2" 80 | elif opt.naive_bayes: 81 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 82 | classifier_factory = lambda: MultinomialNB(alpha=opt.naive_bayes) 83 | clz = "nb" 84 | 85 | if classifier_factory: 86 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 87 | print "(test size, voca size) : (%d, %d)" % test.data.shape 88 | if opt.class_size: 89 | index = test.target < opt.class_size 90 | a = test.data.toarray()[index, :] 91 | test.data = scipy.sparse.csr_matrix(a) 92 | test.target = test.target[index] 93 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 94 | 95 | densities = None 96 | if opt.beta > 0: 97 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 98 | 99 | N_CLASS = data.target.max() + 1 100 | for method in ["random", "least confident", "margin sampling", "entropy-based"]: 101 | results = [] 102 | for n in xrange(opt.trying): 103 | print "%s : %d" % (method, n) 104 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 105 | pool = range(data.data.shape[0]) 106 | for x in train: pool.remove(x) 107 | 108 | activelearn(results, data, test, method, train, pool, classifier_factory, opt.max_train, densities) 109 | 110 | d = len(train) 111 | with open("output_%s_%s.txt" % (method, clz), "wb") as f: 112 | f.write(method) 113 | f.write("\n") 114 | for i in xrange(len(results[0][1])): 115 | f.write("%d\t%s\n" % (i+d, "\t".join("%f" % x[1][i] for x in results))) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /activelearn/uncertain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | import sys, numpy 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn import cross_validation 11 | 12 | import optparse 13 | parser = optparse.OptionParser() 14 | #parser.add_option("-c", dest="corpus", help="corpus module name under nltk.corpus (e.g. 
brown, reuters)", default='brown') 15 | #parser.add_option("-r", dest="testrate", type="float", help="rate of test dataset in corpus", default=0.1) 16 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 17 | (opt, args) = parser.parse_args() 18 | numpy.random.seed(opt.seed) 19 | 20 | output = False 21 | 22 | def activelearn(data, label, strategy): 23 | #print strategy 24 | 25 | N, D = data.shape 26 | train = list(range(D)) 27 | pool = range(D,N) 28 | predict = None 29 | 30 | for i in xrange(30-D): 31 | if predict != None: 32 | if strategy == "random": 33 | x = numpy.random.randint(len(pool)) 34 | elif strategy == "least confident": 35 | x = predict.max(axis=1).argmin() 36 | elif strategy == "margin sampling": 37 | predict.sort(axis=1) 38 | x = (numpy.exp(predict[:,-1])-numpy.exp(predict[:,-2])).argmin() 39 | elif strategy == "entropy-based": 40 | x = numpy.nan_to_num(numpy.exp(predict)*predict).sum(axis=1).argmin() 41 | train.append(pool[x]) 42 | del pool[x] 43 | 44 | cl = LogisticRegression() 45 | #cl = LogisticRegression(C=0.1, penalty="l1") 46 | cl.fit(data[train,:], label[train]) 47 | predict = cl.predict_log_proba(data[pool,:]) 48 | log_likelihood = 0 49 | correct = 0 50 | for n, logprob in zip(pool,predict): 51 | c = label[n] 52 | log_likelihood += logprob[c] 53 | if c == logprob.argmax(): correct += 1 54 | 55 | Z = len(pool) 56 | precision = float(correct) / Z 57 | perplexity = numpy.exp(-log_likelihood / Z) 58 | if output: 59 | print "%d : %d / %d = %f, %f" % (len(train), correct, Z, precision, perplexity) 60 | 61 | #print data[train,:], label[train] 62 | 63 | if D==2: 64 | import matplotlib.pyplot as plt 65 | plt.plot(data[pool,0], data[pool,1], 'x', color="red") 66 | plt.plot(data[train,0], data[train,1], 'o', color="red") 67 | plt.title(strategy) 68 | plt.show() 69 | 70 | return precision, perplexity 71 | 72 | 73 | D=10 74 | N=1000 75 | presicions = [] 76 | perplexities = [] 77 | for i in xrange(100): 78 | data = numpy.random.randn(N,D) 79 | label = numpy.zeros(N, dtype=int) 80 | for n in xrange(N): 81 | c = n % D 82 | data[n, c] += 2 83 | label[n] = c 84 | 85 | result = [] 86 | result.append(activelearn(data, label, "random")) 87 | result.append(activelearn(data, label, "least confident")) 88 | result.append(activelearn(data, label, "margin sampling")) 89 | result.append(activelearn(data, label, "entropy-based")) 90 | 91 | x = numpy.array(result) 92 | presicions.append(x[:,0]) 93 | perplexities.append(x[:,1]) 94 | 95 | print numpy.mean(presicions, axis=0) 96 | 97 | -------------------------------------------------------------------------------- /activelearn/uncertain2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
7 | 8 | import re, collections, numpy 9 | from nltk.corpus import movie_reviews 10 | from nltk.stem import WordNetLemmatizer 11 | 12 | voca = dict() 13 | vocalist = [] 14 | doclist = [] 15 | labels = [] 16 | realphabet = re.compile('^[a-z]+$') 17 | wnl = WordNetLemmatizer() 18 | for id in movie_reviews.fileids(): 19 | doc = collections.defaultdict(int) 20 | for w in movie_reviews.words(id): 21 | if realphabet.match(w): 22 | w = wnl.lemmatize(w) 23 | if w not in voca: 24 | voca[w] = len(vocalist) 25 | vocalist.append(w) 26 | doc[voca[w]] += 1 27 | if len(doc) > 0: doclist.append(doc) 28 | cat = movie_reviews.categories(id)[0] 29 | labels.append(1 if cat == "pos" else 0) 30 | print len(voca) 31 | 32 | labels = numpy.array(labels) 33 | data = numpy.zeros((len(doclist), len(voca))) 34 | for j, doc in enumerate(doclist): 35 | for i, c in doc.iteritems(): 36 | data[j, i] = c 37 | 38 | 39 | from sklearn.feature_extraction.text import TfidfTransformer 40 | transformer = TfidfTransformer(norm=None) 41 | data = transformer.fit_transform(data) 42 | 43 | 44 | from sklearn import cross_validation 45 | 46 | from sklearn.linear_model import LogisticRegression 47 | cl = LogisticRegression() 48 | 49 | from sklearn.naive_bayes import MultinomialNB 50 | #cl = MultinomialNB() 51 | 52 | from sklearn.naive_bayes import BernoulliNB 53 | #cl = BernoulliNB() 54 | 55 | from sklearn.svm import SVC 56 | #cl = SVC() 57 | 58 | from sklearn.ensemble import RandomForestClassifier 59 | #cl = RandomForestClassifier() 60 | 61 | 62 | print cross_validation.cross_val_score(cl, data, labels, cv=10) 63 | 64 | 65 | 66 | """ 67 | import sys, numpy 68 | from sklearn.linear_model import LogisticRegression 69 | from sklearn import cross_validation 70 | 71 | import optparse 72 | parser = optparse.OptionParser() 73 | #parser.add_option("-c", dest="corpus", help="corpus module name under nltk.corpus (e.g. 
brown, reuters)", default='brown') 74 | #parser.add_option("-r", dest="testrate", type="float", help="rate of test dataset in corpus", default=0.1) 75 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 76 | (opt, args) = parser.parse_args() 77 | numpy.random.seed(opt.seed) 78 | 79 | output = False 80 | 81 | def activelearn(data, label, strategy): 82 | #print strategy 83 | 84 | N, D = data.shape 85 | train = list(range(D)) 86 | pool = range(D,N) 87 | predict = None 88 | 89 | for i in xrange(30-D): 90 | if predict != None: 91 | if strategy == "random": 92 | x = numpy.random.randint(len(pool)) 93 | elif strategy == "least confident": 94 | x = predict.max(axis=1).argmin() 95 | elif strategy == "margin sampling": 96 | predict.sort(axis=1) 97 | x = (numpy.exp(predict[:,-1])-numpy.exp(predict[:,-2])).argmin() 98 | elif strategy == "entropy-based": 99 | x = numpy.nan_to_num(numpy.exp(predict)*predict).sum(axis=1).argmin() 100 | train.append(pool[x]) 101 | del pool[x] 102 | 103 | cl = LogisticRegression() 104 | #cl = LogisticRegression(C=0.1, penalty="l1") 105 | cl.fit(data[train,:], label[train]) 106 | predict = cl.predict_log_proba(data[pool,:]) 107 | log_likelihood = 0 108 | correct = 0 109 | for n, logprob in zip(pool,predict): 110 | c = label[n] 111 | log_likelihood += logprob[c] 112 | if c == logprob.argmax(): correct += 1 113 | 114 | Z = len(pool) 115 | precision = float(correct) / Z 116 | perplexity = numpy.exp(-log_likelihood / Z) 117 | if output: 118 | print "%d : %d / %d = %f, %f" % (len(train), correct, Z, precision, perplexity) 119 | 120 | #print data[train,:], label[train] 121 | 122 | if D==2: 123 | import matplotlib.pyplot as plt 124 | plt.plot(data[pool,0], data[pool,1], 'x', color="red") 125 | plt.plot(data[train,0], data[train,1], 'o', color="red") 126 | plt.title(strategy) 127 | plt.show() 128 | 129 | return precision, perplexity 130 | 131 | 132 | D=10 133 | N=1000 134 | presicions = [] 135 | perplexities = [] 136 | for i in xrange(100): 137 | data = numpy.random.randn(N,D) 138 | label = numpy.zeros(N, dtype=int) 139 | for n in xrange(N): 140 | c = n % D 141 | data[n, c] += 2 142 | label[n] = c 143 | 144 | result = [] 145 | result.append(activelearn(data, label, "random")) 146 | result.append(activelearn(data, label, "least confident")) 147 | result.append(activelearn(data, label, "margin sampling")) 148 | result.append(activelearn(data, label, "entropy-based")) 149 | 150 | x = numpy.array(result) 151 | presicions.append(x[:,0]) 152 | perplexities.append(x[:,1]) 153 | 154 | print numpy.mean(presicions, axis=0) 155 | 156 | """ 157 | -------------------------------------------------------------------------------- /activelearn/uncertain3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
7 | 8 | import numpy 9 | import dataset 10 | from sklearn.linear_model import LogisticRegression 11 | 12 | categories = ['crude', 'money-fx', 'trade', 'interest', 'ship', 'wheat', 'corn'] 13 | doclist, labels, voca, vocalist = dataset.load(categories) 14 | print "document size : %d" % len(doclist) 15 | print "vocaburary size : %d" % len(voca) 16 | 17 | data = numpy.zeros((len(doclist), len(voca))) 18 | for j, doc in enumerate(doclist): 19 | for i, c in doc.iteritems(): 20 | data[j, i] = c 21 | 22 | def activelearn(data, label, strategy, train): 23 | print strategy 24 | 25 | N, D = data.shape 26 | train = list(train) # copy initial indexes of training 27 | pool = range(N) 28 | for x in train: pool.remove(x) 29 | 30 | predict = None 31 | precisions = [] 32 | while len(train) < 300: 33 | if predict != None: 34 | if strategy == "random": 35 | x = numpy.random.randint(len(pool)) 36 | elif strategy == "least confident": 37 | x = predict.max(axis=1).argmin() 38 | elif strategy == "margin sampling": 39 | predict.sort(axis=1) 40 | x = (numpy.exp(predict[:,-1])-numpy.exp(predict[:,-2])).argmin() 41 | elif strategy == "entropy-based": 42 | x = numpy.nan_to_num(numpy.exp(predict)*predict).sum(axis=1).argmin() 43 | train.append(pool[x]) 44 | del pool[x] 45 | 46 | cl = LogisticRegression() 47 | cl.fit(data[train,:], label[train]) 48 | predict = cl.predict_log_proba(data[pool,:]) 49 | log_likelihood = 0 50 | correct = 0 51 | for n, logprob in zip(pool,predict): 52 | c = label[n] 53 | log_likelihood += logprob[c] 54 | if c == logprob.argmax(): correct += 1 55 | 56 | Z = len(pool) 57 | precision = float(correct) / Z 58 | perplexity = numpy.exp(-log_likelihood / Z) 59 | print "%d : %d / %d = %f, %f" % (len(train), correct, Z, precision, perplexity) 60 | 61 | precisions.append(precision) 62 | 63 | return precisions 64 | 65 | N_CLASS = labels.max() + 1 66 | train = [numpy.random.choice((labels==k).nonzero()[0]) for k in xrange(N_CLASS)] 67 | 68 | methods = ["random", "least confident", "margin sampling", "entropy-based"] 69 | results = [] 70 | for x in methods: 71 | results.append(activelearn(data, labels, x, train)) 72 | print "\t%s" % "\t".join(methods) 73 | d = len(categories) 74 | for i in xrange(len(results[0])): 75 | print "%d\t%s" % (i+d, "\t".join("%f" % x[i] for x in results)) 76 | -------------------------------------------------------------------------------- /activelearn/uncertain4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling and Information Density) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
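#
# Summary of the code below: uncertainty sampling on the vectorized 20 newsgroups
# data with four query strategies (-r random, -l least confident, -m margin
# sampling, -e entropy-based, or -a for all of them), optionally weighted by an
# information-density term (-b beta). If neither --lr1 nor --lr2 is given, a
# multinomial naive Bayes classifier is used with alpha = --nb (default 0.01).
#
# Usage sketch (hypothetical invocation; the options are defined in main() below):
#   python uncertain4.py -a --lr2 1.0 -K 4 -n 300 -b 1.0 --seed 0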
7 | 8 | import optparse 9 | import numpy 10 | import scipy.sparse 11 | import sklearn.datasets 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.naive_bayes import MultinomialNB 14 | 15 | def activelearn(results, data, test, strategy, train, pool, classifier_factory, max_train, densities): 16 | print strategy 17 | 18 | # copy initial indexes of training and pool 19 | train = list(train) 20 | pool = list(pool) 21 | 22 | accuracies = [] 23 | while len(train) < max_train: 24 | if len(accuracies) > 0: 25 | if strategy == "random": 26 | x = numpy.random.randint(len(pool)) 27 | else: 28 | predict = cl.predict_proba(data.data[pool,:]) 29 | if strategy == "least confident": 30 | x = predict.max(axis=1)-1 31 | elif strategy == "margin sampling": 32 | predict.sort(axis=1) 33 | x = (predict[:,-1] - predict[:,-2]) 34 | elif strategy == "entropy-based": 35 | x = numpy.nan_to_num(predict * numpy.log(predict)).sum(axis=1) 36 | if densities != None: x *= densities[pool] 37 | x = x.argmin() 38 | train.append(pool[x]) 39 | del pool[x] 40 | 41 | cl = classifier_factory() 42 | cl.fit(data.data[train,:], data.target[train]) 43 | accuracy = cl.score(test.data, test.target) 44 | print "%s %d : %f" % (strategy, len(train), accuracy) 45 | accuracies.append(accuracy) 46 | 47 | results.append((strategy, accuracies)) 48 | 49 | 50 | def main(): 51 | parser = optparse.OptionParser() 52 | parser.add_option("-r", dest="method_random", action="store_true", help="use random sampling", default=False) 53 | parser.add_option("-l", dest="method_least", action="store_true", help="use least confident", default=False) 54 | parser.add_option("-m", dest="method_margin", action="store_true", help="use margin sampling", default=False) 55 | parser.add_option("-e", dest="method_entropy", action="store_true", help="use entropy-based method", default=False) 56 | parser.add_option("-a", dest="method_all", action="store_true", help="use all methods", default=False) 57 | 58 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 59 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 60 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 61 | 62 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 63 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=300) 64 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 65 | 66 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 67 | 68 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 69 | (opt, args) = parser.parse_args() 70 | numpy.random.seed(opt.seed) 71 | 72 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 73 | print "(train size, voca size) : (%d, %d)" % data.data.shape 74 | 75 | N_CLASS = data.target.max() + 1 76 | if opt.training: 77 | train = [int(x) for x in opt.training.split(",")] 78 | else: 79 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 80 | print "indexes of training set : ", ",".join("%d" % x for x in train) 81 | if opt.class_size: 82 | index = data.target < opt.class_size 83 | a = data.data.toarray()[index, :] 84 | data.data = scipy.sparse.csr_matrix(a) 85 | data.target = data.target[index] 86 | print "(shrinked train 
size, voca size) : (%d, %d)" % data.data.shape 87 | 88 | pool = range(data.data.shape[0]) 89 | for x in train: pool.remove(x) 90 | 91 | methods = [] 92 | if opt.method_all: 93 | methods = ["random", "least confident", "margin sampling", "entropy-based"] 94 | else: 95 | if opt.method_random: methods.append("random") 96 | if opt.method_least: methods.append("least confident") 97 | if opt.method_margin: methods.append("margin sampling") 98 | if opt.method_entropy: methods.append("entropy-based") 99 | 100 | if len(methods) > 0: 101 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 102 | print "(test size, voca size) : (%d, %d)" % test.data.shape 103 | if opt.class_size: 104 | index = test.target < opt.class_size 105 | a = test.data.toarray()[index, :] 106 | test.data = scipy.sparse.csr_matrix(a) 107 | test.target = test.target[index] 108 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 109 | 110 | densities = None 111 | if opt.beta > 0: 112 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 113 | 114 | if opt.logistic_l1: 115 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 116 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 117 | elif opt.logistic_l2: 118 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 119 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 120 | else: 121 | a = opt.naive_bayes or 0.01 122 | print "Naive Bayes Classifier : alpha = %f" % a 123 | classifier_factory = lambda: MultinomialNB(alpha=a) 124 | 125 | results = [] 126 | for x in methods: 127 | activelearn(results, data, test, x, train, pool, classifier_factory, opt.max_train, densities) 128 | 129 | print "\t%s" % "\t".join(x[0] for x in results) 130 | d = len(train) 131 | for i in xrange(len(results[0][1])): 132 | print "%d\t%s" % (i+d, "\t".join("%f" % x[1][i] for x in results)) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /clustering/irm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Infinite Relational Model 5 | # via 石井健一郎・上田修功 "続・わかりやすいパターン認識" Chapter 14 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 
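# (The reference above is Ken-ichiro Ishii and Naonori Ueda,
# "Zoku: Wakariyasui Pattern Ninshiki" -- roughly "Continued: Easy-to-Understand
# Pattern Recognition" -- Chapter 14.)
#
# Summary of the code below: collapsed Gibbs sampling for the IRM. Row and column
# cluster assignments (s1, s2) follow Chinese-restaurant-process priors with
# concentration alpha, so an object joins an existing cluster i with probability
# proportional to its size n_i and opens a new cluster with probability
# proportional to alpha; the relation entries use a conjugate Beta(a, b) /
# Bernoulli likelihood (betaln), while the PoissonIRM subclass swaps in a
# Gamma(a, b) / Poisson likelihood (gammaln). The __main__ demo samples a
# block-structured binary matrix, shuffles its rows and columns, and keeps the
# clustering with the highest posterior found over 200 sweeps.
#
# Worked example of the prior term (hypothetical numbers): with alpha = 1.0 and
# existing row clusters of sizes (3, 1), the prior factors of the Gibbs weights
# are proportional to 3, 1, and 1 (new cluster) before the likelihood terms are
# multiplied in.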
9 | 10 | import numpy 11 | from scipy.special import betaln, gammaln 12 | 13 | def log_ps(a, n, N): 14 | c = len(n) 15 | return c * numpy.log(a) + gammaln(n).sum() - gammaln(a + N) + gammaln(a) - gammaln(c+1) 16 | 17 | class IRM(object): 18 | def __init__(self, data, alpha, a, b): 19 | self.R = data 20 | self.K, self.L = data.shape 21 | self.alpha = alpha 22 | self.a = a 23 | self.b = b 24 | self.s1 = numpy.zeros(self.K, dtype=int) - 1 25 | self.s2 = numpy.zeros(self.L, dtype=int) - 1 26 | self.n1 = [] 27 | self.n2 = [] 28 | 29 | def update(self): 30 | for k in range(self.K): 31 | p = self.update_cluster(k, self.s1, self.s2, self.n1, self.n2, self.R) 32 | for l in range(self.L): 33 | p = self.update_cluster(l, self.s2, self.s1, self.n2, self.n1, self.R.T) 34 | 35 | def update_cluster(self, k, s1, s2, n1, n2, R): 36 | now_i = s1[k] 37 | s1[k] = -1 38 | if now_i >= 0: 39 | n1[now_i] -= 1 40 | if n1[now_i] == 0: 41 | n1.pop(now_i) 42 | s1[s1>now_i] -= 1 43 | 44 | c1 = len(n1) 45 | c2 = len(n2) 46 | m1, m0, m1k, m0k = self.count_nij(R, s1, s2, c1, c2) 47 | 48 | logps = numpy.zeros(c1+1) 49 | for i in range(c1): 50 | p = numpy.log(n1[i]) 51 | p += self.logZ(self.a+m1[i]+m1k, self.b+m0[i]+m0k).sum() 52 | p -= self.logZ(self.a+m1[i], self.b+m0[i]).sum() 53 | logps[i] = p 54 | p = numpy.log(self.alpha) 55 | p += self.logZ(self.a+m1k, self.b+m0k).sum() 56 | p -= c2 * self.logZ(self.a, self.b) 57 | logps[c1] = p 58 | 59 | logps -= logps.max() 60 | ps = numpy.exp(logps) 61 | ps /= ps.sum() 62 | new_i = numpy.random.choice(c1+1, 1, p=ps) 63 | if new_i=0: 94 | m1i = m1[i] 95 | m0i = m0[i] 96 | else: 97 | m1i = m1k 98 | m0i = m0k 99 | for j, r in zip(s2, rk): 100 | if j<0: continue 101 | m1i[j] += r 102 | m0i[j] += 1-r 103 | return m1, m0, m1k, m0k 104 | 105 | class PoissonIRM(IRM): 106 | def logZ(self, a, b): 107 | "Log Normalization Constant of Gamma Distribution" 108 | return gammaln(a) - a * numpy.log(b) 109 | 110 | def count_nij(self, R, s1, s2, c1, c2): 111 | m1 = numpy.zeros((c1,c2), dtype=int) # C_(-k,+)[i,j] 112 | m0 = numpy.zeros((c1,c2), dtype=int) # m 113 | m1k = numpy.zeros(c2, dtype=int) # C_(k,+)[j] where s_k=ω_i 114 | m0k = numpy.zeros(c2, dtype=int) # m 115 | for i, rk in zip(s1, R): 116 | if i>=0: 117 | m1i = m1[i] 118 | m0i = m0[i] 119 | else: 120 | m1i = m1k 121 | m0i = m0k 122 | for j, r in zip(s2, rk): 123 | if j<0: continue 124 | m1i[j] += r 125 | m0i[j] += 1 126 | return m1, m0, m1k, m0k 127 | 128 | if __name__ == "__main__": 129 | from numpy.random import binomial 130 | from numpy import concatenate as concat 131 | 132 | numpy.random.seed(123) 133 | d = 5 134 | phi = [[0.1, 0.7, 0.2], [0.1, 0.3, 0.9], [0.8, 0.1, 0.2]] 135 | orgR = concat([concat([binomial(1, p, size=(d,d)) for p in pp], axis=1) for pp in phi]) 136 | i = numpy.arange(orgR.shape[0]) 137 | numpy.random.shuffle(i) 138 | R = orgR[i,:] 139 | i = numpy.arange(orgR.shape[1]) 140 | numpy.random.shuffle(i) 141 | R = R[:,i] 142 | 143 | model = IRM(R, alpha=1.0, a=1.0, b=1.0) 144 | maxv = -1e9 145 | for i in range(200): 146 | model.update() 147 | v = model.log_posterior() 148 | if v > maxv: 149 | maxv = v 150 | maxm = model.clone() 151 | print(i, v) 152 | RR = R[numpy.argsort(maxm.s1), :] 153 | RR = RR[:, numpy.argsort(maxm.s2)] 154 | print("--------") 155 | print(orgR) 156 | print(R) 157 | print(maxm.s1) 158 | print(maxm.s2) 159 | print(RR) 160 | print(maxv) 161 | -------------------------------------------------------------------------------- /data/4million.corpus: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/data/4million.corpus -------------------------------------------------------------------------------- /data/gen_cluto.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | # generate *.mat/*.clabel for CLUTO 4 | 5 | input = ARGV[0] || 'corpus' 6 | output = input.sub(/\.[^\.]+/, '') 7 | 8 | data = open(input){|f| Marshal.load(f) } 9 | docs = data[:docs] 10 | terms = data[:terms] 11 | 12 | total = 0 13 | terms.each do |term, map| 14 | total += map.size 15 | end 16 | termlist = terms.keys 17 | 18 | open(output+".mat", "w") do |f| 19 | f.puts "#{docs.size} #{terms.size} #{total}" 20 | (0..(docs.size-1)).each do |doc_id| 21 | row = [] 22 | termlist.each_with_index do |term, term_id| 23 | v = terms[term][doc_id] 24 | row << term_id+1 << v if v && v>0 25 | end 26 | f.puts row.join(" ") 27 | end 28 | end 29 | 30 | open(output+".clabel", "w") do |f| 31 | termlist.each do |term| 32 | f.puts term 33 | end 34 | end 35 | 36 | open(output+".rlabel", "w") do |f| 37 | docs.each do |doc| 38 | f.puts doc[:title] 39 | end 40 | end 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /data/gen_corpus.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | # ruby gen_corpus.rb ohenry/1444.zip ohenry/1646.zip ohenry/1725.zip ohenry/2777.zip ohenry/2776.zip 3 | 4 | require 'pstore' 5 | require "../lib/extract_gutenberg.rb" 6 | require '../lib/infinitive.rb' 7 | INF = Infinitive.new 8 | 9 | DEST_DIR = "output" 10 | DEBUG = false 11 | 12 | docs = Array.new 13 | terms = Hash.new 14 | 15 | doc_id = 0 16 | ARGV.each do |path| 17 | file = path.dup 18 | file = $1 if path =~ /\/([^\/]+)$/ 19 | text = nil 20 | if path =~ /\.zip$/i 21 | file.sub!(/\.zip$/i, ".txt") 22 | text = `unzip -cq #{path} "*.txt"` 23 | open("#{DEST_DIR}/#{file}.org", "w"){|f| f.write text} 24 | else 25 | text = open(path){|f| f.read} 26 | end 27 | text = Gutenberg.extract(text) 28 | open("#{DEST_DIR}/#{file}", "w"){|f| f.write text} 29 | 30 | list = text.split(/^[IVX]+\s*\.?$/)[1..-1] 31 | list = text.split(/^\n{4}$/) if list.size<=1 32 | list.each do |x| 33 | next unless x =~ /^(.+)$/ 34 | title = $1 35 | 36 | words = x.scan(/[A-Za-z]+(?:'t)?/) 37 | next if words.size < 1000 38 | 39 | n = 0 40 | words.each do |word| 41 | word = INF.infinitive(word) 42 | terms[word] ||= Hash.new(0) 43 | terms[word][doc_id] += 1 44 | n += 1 45 | end 46 | 47 | docs[doc_id] = {:title=>title, :n_words=>n} 48 | doc_id += 1 49 | end 50 | end 51 | 52 | db = PStore.new('corpus') 53 | db.transaction do 54 | db[:docs] = docs 55 | db[:terms] = terms 56 | end 57 | -------------------------------------------------------------------------------- /data/gen_libsvm.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | # generate libsvm-format data 3 | # gen_libsvm.rb [positive] [negative] 4 | if ARGV.size<2 5 | puts "gen_libsvm.rb [positive] [negative]" 6 | exit 1 7 | end 8 | 9 | require "../lib/extract_gutenberg.rb" 10 | require '../lib/infinitive.rb' 11 | INF = Infinitive.new 12 | 13 | REG_TITLE = /^([A-Z][A-Z\-', ]+)$/ 14 | class Analyzer 15 | def initialize 16 | @terms = Hash.new 17 | @docs = Array.new 18 | end 19 | attr_reader :docs, :terms 20 | def extract_words(path, sign) 21 | file = path.dup 22 
| file = $1 if path =~ /\/([^\/]+)$/ 23 | text = if path =~ /\.zip$/i 24 | file.sub!(/\.zip$/i, ".txt") 25 | `unzip -cq #{path} "*.txt"` 26 | else 27 | open(path){|f| f.read} 28 | end 29 | text = Gutenberg.extract(text) 30 | 31 | list = text.split(REG_TITLE) 32 | 33 | title = nil 34 | list.each do |x| 35 | if x =~ REG_TITLE 36 | title = x 37 | next 38 | end 39 | words = x.scan(/[A-Za-z]+(?:'t)?/) 40 | next if words.size < 1000 41 | 42 | while words.size >= 100 43 | subwords = words.slice!(0, 100) 44 | n = 0 45 | doc_id = @docs.size 46 | subwords.each do |word| 47 | word = INF.infinitive(word) 48 | @terms[word] ||= Hash.new(0) 49 | @terms[word][doc_id] += 1 50 | n += 1 51 | end 52 | @docs << {:title=>title, :n_words=>n, :sign=>sign} 53 | end 54 | end 55 | end 56 | end 57 | 58 | ana = Analyzer.new 59 | ana.extract_words ARGV[0], "+1" 60 | ana.extract_words ARGV[1], "-1" 61 | 62 | words = ana.terms.keys 63 | ana.docs.each_with_index do |doc, doc_id| 64 | buf = [doc[:sign]] 65 | words.each_with_index do |word, word_id| 66 | if ana.terms[word] 67 | freq = ana.terms[word][doc_id] 68 | buf << "#{word_id}:#{freq}" if freq>0 69 | end 70 | end 71 | puts buf.join(' ') 72 | end 73 | 74 | -------------------------------------------------------------------------------- /data/ohenry.corpus: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/data/ohenry.corpus -------------------------------------------------------------------------------- /dnn/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning Experiment Code 2 | ## Video 3 | 4 | * SVHN Generator with DCGAN ( [dcgan-svhn.py](/dnn/dcgan-svhn.py) ) 5 | * https://youtu.be/yXyJq35w5gk 6 | 7 | * SVHN Generator based on DCGAN+Conditional GAN ( [cdcgan-svhn.py](/dnn/cdcgan-svhn.py) ) 8 | * https://youtu.be/IXaeo9wxSoQ 9 | -------------------------------------------------------------------------------- /dnn/cdcgan-svhn.ini: -------------------------------------------------------------------------------- 1 | 2 | [DEFAULT] 3 | model filename = cdcgan-svhn.model 4 | 5 | # download train_32x32.mat in advance from http://ufldl.stanford.edu/housenumbers/ 6 | SVHN path = data/train_32x32.mat 7 | number of labels = 10 8 | 9 | noise dim = 100 10 | discriminator hidden units = 64 128 256 11 | generator hidden units = 512 256 128 12 | 13 | alpha = 2e-4 14 | mini batch size = 128 15 | epoch = 10 16 | 17 | working directory = svhn_gen 18 | 19 | [small] 20 | noise dim = 10 21 | discriminator hidden units = 16 32 64 22 | generator hidden units = 128 64 32 23 | epoch = 10 24 | working directory = svhn_gen2 25 | 26 | [large] 27 | noise dim = 100 28 | discriminator hidden units = 128 256 512 29 | generator hidden units = 1024 512 256 30 | epoch = 100 31 | working directory = svhn_gen3 32 | 33 | alpha = 5e-5 34 | 35 | -------------------------------------------------------------------------------- /dnn/cgan-mnist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # MNIST generator based on Conditional Generative Adversarial Networks with Tensorflow 5 | # (M. Mirza and S. Osindero. Conditional generative adversarial nets. CoRR, abs/1411.1784, 2014.) 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 
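# Model sketch (as built below): the generator maps noise z and a one-hot label y
# through one ReLU hidden layer to a sigmoid image; the discriminator is a maxout
# network (K pieces) over (x, y) with dropout and a sigmoid output.
# The losses implement the conditional GAN objective:
#   D minimizes  -E[log D(x, y)] - E[log(1 - D(G(z, y), y))]
#   G minimizes   E[log(1 - D(G(z, y), y))] - E[log D(G(z, y), y)]   (second term for stability)
# and each Adam optimizer updates only the variables whose names start with "D" or "G".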
9 | 10 | import numpy, math 11 | import tensorflow as tf 12 | from tensorflow.examples.tutorials.mnist import input_data 13 | import matplotlib.pyplot as plt 14 | 15 | # model parameter 16 | noise_dim = 10 # input noise size of Generator 17 | Dhidden = 256 # hidden units of Discriminator's network 18 | Ghidden = 512 # hidden units of Generator's network 19 | K = 8 # maxout units of Discriminator 20 | 21 | mini_batch_size = 50 22 | epoch = 50 23 | nsamples = 7 # drawing samples 24 | 25 | mnist = input_data.read_data_sets("data/", one_hot=True) 26 | N, num_features = mnist.train.images.shape 27 | _, num_labels = mnist.train.labels.shape 28 | period = N // mini_batch_size 29 | 30 | X = tf.placeholder(tf.float32, shape=(None, num_features)) 31 | Y = tf.placeholder(tf.float32, shape=(None, num_labels)) 32 | Z = tf.placeholder(tf.float32, shape=(None, noise_dim)) 33 | keep_prob = tf.placeholder(tf.float32) 34 | 35 | GW1z = tf.Variable(tf.random_normal([noise_dim, Ghidden], stddev=0.1), name="GW1z") 36 | GW1y = tf.Variable(tf.random_normal([num_labels, Ghidden], stddev=0.1), name="GW1y") 37 | Gb1 = tf.Variable(tf.zeros(Ghidden), name="Gb1") 38 | GW2 = tf.Variable(tf.random_normal([Ghidden, num_features], stddev=0.1), name="GW2") 39 | Gb2 = tf.Variable(tf.zeros(num_features), name="Gb2") 40 | 41 | DW1x = tf.Variable(tf.random_normal([num_features, K * Dhidden], stddev=0.01), name="DW1x") 42 | DW1y = tf.Variable(tf.random_normal([num_labels, K * Dhidden], stddev=0.01), name="DW1y") 43 | Db1 = tf.Variable(tf.zeros(K * Dhidden), name="Db1") 44 | DW2 = tf.Variable(tf.random_normal([Dhidden, 1], stddev=0.01), name="DW2") 45 | Db2 = tf.Variable(tf.zeros(1), name="Db2") 46 | 47 | def discriminator(x, y): 48 | u = tf.reshape(tf.matmul(x, DW1x) + tf.matmul(y, DW1y) + Db1, [-1, K, Dhidden]) 49 | Dh1 = tf.nn.dropout(tf.reduce_max(u, reduction_indices=[1]), keep_prob) 50 | return tf.nn.sigmoid(tf.matmul(Dh1, DW2) + Db2) 51 | 52 | Gh1 = tf.nn.relu(tf.matmul(Z, GW1z) + tf.matmul(Y, GW1y) + Gb1) 53 | G = tf.nn.sigmoid(tf.matmul(Gh1, GW2) + Gb2) 54 | DG = discriminator(G, Y) 55 | 56 | Dloss = -tf.reduce_mean(tf.log(discriminator(X, Y)) + tf.log(1 - DG)) 57 | Gloss = tf.reduce_mean(tf.log(1 - DG) - tf.log(DG + 1e-9)) # the second term for stable learning 58 | 59 | vars = tf.trainable_variables() 60 | Dvars = [v for v in vars if v.name.startswith("D")] 61 | Gvars = [v for v in vars if v.name.startswith("G")] 62 | 63 | Doptimizer = tf.train.AdamOptimizer().minimize(Dloss, var_list=Dvars) 64 | Goptimizer = tf.train.AdamOptimizer().minimize(Gloss, var_list=Gvars) 65 | 66 | sess = tf.Session() 67 | sess.run(tf.initialize_all_variables()) 68 | 69 | for e in range(epoch): 70 | dloss = gloss = 0.0 71 | for i in range(period): 72 | x, y = mnist.train.next_batch(mini_batch_size) 73 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 74 | loss, _ = sess.run([Dloss, Doptimizer], feed_dict={X:x, Y:y, Z:z, keep_prob:0.5}) 75 | dloss += loss 76 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 77 | loss, _ = sess.run([Gloss, Goptimizer], feed_dict={Y:y, Z:z, keep_prob:1.0}) 78 | gloss += loss 79 | 80 | print("%d: dloss=%.5f, gloss=%.5f" % (e+1, dloss / period, gloss / period)) 81 | if math.isnan(dloss) or math.isnan(gloss): 82 | sess.run(tf.initialize_all_variables()) # initialize & retry if NaN 83 | 84 | def save_figures(path, z): 85 | fig = plt.figure() 86 | fig.subplots_adjust(left=0,bottom=0,right=1,top=1) 87 | for i in range(num_labels): 88 | y = numpy.zeros((z.shape[0], num_labels)) 89 | 
y[:,i] = 1 90 | Gz = sess.run(G, feed_dict={Y:y, Z: z}) 91 | for j in range(nsamples): 92 | ax = fig.add_subplot(nsamples, num_labels, j * num_labels + i + 1) 93 | ax.axis("off") 94 | ax.imshow(Gz[j,:].reshape((28,28)), cmap=plt.get_cmap("gray")) 95 | fig.savefig(path) 96 | plt.close(fig) 97 | 98 | z = numpy.random.uniform(-1, 1, size=(nsamples, noise_dim)) 99 | #z[:,0] = numpy.arange(0, nsamples) / (nsamples - 1) * 2 - 1 100 | save_figures("cgan-mnist.png", z) 101 | 102 | -------------------------------------------------------------------------------- /dnn/dcgan-svhn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # SVHN generator based on DCGAN with Tensorflow 5 | # Radford, A., Metz, L., and Chintala, S. Unsupervised representation learning with deep convolutional generative adversarial networks. 2016. 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 9 | 10 | import numpy, math, time 11 | import scipy.io 12 | import tensorflow as tf 13 | import matplotlib.pyplot as plt 14 | 15 | # model parameter 16 | noise_dim = 100 # input noise size of Generator 17 | Dhidden = [64, 128, 256] # hidden units of Discriminator's network 18 | Ghidden = [512, 256, 128] # hidden units of Generator's network 19 | 20 | mini_batch_size = 128 21 | samples=(10,12) # samples drawing size 22 | nsamples = samples[0] * samples[1] 23 | assert nsamples <= mini_batch_size 24 | epoch = 100 25 | 26 | # download train_32x32.mat in advance from http://ufldl.stanford.edu/housenumbers/ 27 | svhn = scipy.io.loadmat("data/train_32x32.mat") 28 | train_data = svhn["X"] 29 | #train_data = train_data[:, :, :, :256] # small dataset 30 | fig_width, fig_height, n_channels, N = train_data.shape 31 | train_data = train_data.reshape(fig_width * fig_height * n_channels, N) 32 | train_data -= train_data.min(axis=0) 33 | train_data = (numpy.array(train_data, dtype=numpy.float32) / train_data.max(axis=0)).T.reshape(N, fig_width, fig_height, n_channels) 34 | period = N // mini_batch_size 35 | 36 | X = tf.placeholder(tf.float32, shape=(None, fig_width, fig_height, n_channels)) 37 | Z = tf.placeholder(tf.float32, shape=(None, noise_dim)) 38 | keep_prob = tf.placeholder(tf.float32) 39 | 40 | with tf.variable_scope("G"): 41 | GW0 = tf.Variable(tf.random_normal([noise_dim, Ghidden[0]*4*4], stddev=0.01)) 42 | Gb0 = tf.Variable(tf.zeros(Ghidden[0])) 43 | GW1 = tf.Variable(tf.random_normal([5, 5, Ghidden[1], Ghidden[0]], stddev=0.01)) 44 | Gb1 = tf.Variable(tf.zeros(Ghidden[1])) 45 | GW2 = tf.Variable(tf.random_normal([5, 5, Ghidden[2], Ghidden[1]], stddev=0.01)) 46 | Gb2 = tf.Variable(tf.zeros(Ghidden[2])) 47 | GW3 = tf.Variable(tf.random_normal([5, 5, n_channels, Ghidden[2]], stddev=0.01)) 48 | Gb3 = tf.Variable(tf.zeros(n_channels)) 49 | 50 | # batch normalization & relu 51 | def bn(u): 52 | mean, variance = tf.nn.moments(u, axes=[0, 1, 2]) 53 | return tf.nn.relu(tf.nn.batch_normalization(u, mean, variance, None, None, 1e-5)) 54 | 55 | Gh0 = bn(tf.nn.bias_add(tf.reshape(tf.matmul(Z, GW0), [-1, fig_width//8, fig_height//8, Ghidden[0]]), Gb0)) 56 | Gh1 = bn(tf.nn.bias_add(tf.nn.conv2d_transpose(Gh0, GW1, [mini_batch_size, fig_width//4, fig_height//4, Ghidden[1]], [1, 2, 2, 1]), Gb1)) 57 | Gh2 = bn(tf.nn.bias_add(tf.nn.conv2d_transpose(Gh1, GW2, [mini_batch_size, fig_width//2, fig_height//2, Ghidden[2]], [1, 2, 2, 1]), Gb2)) 58 | G = tf.nn.tanh(tf.nn.bias_add(tf.nn.conv2d_transpose(Gh2, GW3, 
[mini_batch_size, fig_width, fig_height, n_channels], [1, 2, 2, 1]), Gb3)) 59 | 60 | with tf.variable_scope("D"): 61 | DW0 = tf.Variable(tf.random_normal([5, 5, n_channels, Dhidden[0]], stddev=0.01)) 62 | Db0 = tf.Variable(tf.zeros(Dhidden[0])) 63 | DW1 = tf.Variable(tf.random_normal([5, 5, Dhidden[0], Dhidden[1]], stddev=0.01)) 64 | Db1 = tf.Variable(tf.zeros(Dhidden[1])) 65 | DW2 = tf.Variable(tf.random_normal([5, 5, Dhidden[1], Dhidden[2]], stddev=0.01)) 66 | Db2 = tf.Variable(tf.zeros(Dhidden[2])) 67 | DW3 = tf.Variable(tf.random_normal([(fig_width//8)*(fig_height//8)*Dhidden[2], 1], stddev=0.01)) 68 | Db3 = tf.Variable(tf.zeros(1)) 69 | 70 | # batch normalization & leaky relu 71 | def bnl(u, a=0.2): 72 | mean, variance = tf.nn.moments(u, axes=[0, 1, 2]) 73 | b = tf.nn.batch_normalization(u, mean, variance, None, None, 1e-5) 74 | return tf.maximum(a * b, b) 75 | 76 | def discriminator(xx): 77 | Dh0 = bnl(tf.nn.bias_add(tf.nn.conv2d(xx, DW0, [1, 2, 2, 1], padding='SAME'), Db0)) 78 | Dh1 = bnl(tf.nn.bias_add(tf.nn.conv2d(Dh0, DW1, [1, 2, 2, 1], padding='SAME'), Db1)) 79 | Dh2 = bnl(tf.nn.bias_add(tf.nn.conv2d(Dh1, DW2, [1, 2, 2, 1], padding='SAME'), Db2)) 80 | return tf.nn.sigmoid(tf.matmul(tf.reshape(Dh2, [-1, (fig_width//8)*(fig_height//8)*Dhidden[2]]), DW3) + Db3) 81 | 82 | DG = discriminator(G) 83 | Dloss = -tf.reduce_mean(tf.log(discriminator(X)) + tf.log(1 - DG)) 84 | Gloss = tf.reduce_mean(tf.log(1 - DG) - tf.log(DG + 1e-9)) # the second term for stable learning 85 | 86 | vars = tf.trainable_variables() 87 | Dvars = [v for v in vars if v.name.startswith("D")] 88 | Gvars = [v for v in vars if v.name.startswith("G")] 89 | 90 | Doptimizer = tf.train.AdamOptimizer(learning_rate=2e-4).minimize(Dloss, var_list=Dvars) 91 | Goptimizer = tf.train.AdamOptimizer(learning_rate=2e-4).minimize(Gloss, var_list=Gvars) 92 | 93 | sess = tf.Session() 94 | sess.run(tf.initialize_all_variables()) 95 | 96 | def save_figure(path, z): 97 | Gz = sess.run(G, feed_dict={Z: z}) 98 | #plt.ion() 99 | fig = plt.gcf() 100 | fig.subplots_adjust(left=0,bottom=0,right=1,top=1) 101 | for i in range(nsamples): 102 | ax = fig.add_subplot(samples[0], samples[1], i + 1) 103 | ax.axis("off") 104 | ax.imshow(Gz[i,:,:,:]) 105 | plt.savefig(path) 106 | plt.draw() 107 | plt.pause(0.01) 108 | 109 | t0 = time.time() 110 | drawz = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) # nsamples < mini_batch_size 111 | for e in range(epoch): 112 | index = numpy.random.permutation(N) 113 | dloss = gloss = 0.0 114 | for i in range(period): 115 | x = train_data[index[i*mini_batch_size:(i+1)*mini_batch_size], :] 116 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 117 | loss, _ = sess.run([Dloss, Doptimizer], feed_dict={X:x, Z:z, keep_prob:0.5}) 118 | dloss += loss 119 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 120 | loss, _ = sess.run([Gloss, Goptimizer], feed_dict={Z:z, keep_prob:1.0}) 121 | gloss += loss 122 | 123 | if math.isnan(dloss) or math.isnan(gloss): 124 | sess.run(tf.initialize_all_variables()) # initialize & retry if NaN 125 | print("...initialize parameters for nan...") 126 | dloss = gloss = 0.0 127 | 128 | print("%d: dloss=%.5f, gloss=%.5f, time=%.1f" % (e+1, dloss / period, gloss / period, time.time()-t0)) 129 | save_figure("png/dcgan-svhn-%03d.png" % (e+1), drawz) 130 | 131 | saver = tf.train.Saver() 132 | saver.save(sess, "dcgan-svhn.model") 133 | -------------------------------------------------------------------------------- /dnn/gan-mnist.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # MNIST generator based on Generative Adversarial Networks with Tensorflow 5 | # (I. Goodfellow, J. Pouget-Abadie, M. Mirza, B. Xu, D. Warde-Farley, S. Ozair, A. Courville, and Y. Bengio. Generative adversarial nets. In NIPS, pages 2672–2680. 2014.) 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 9 | 10 | import numpy, math 11 | import tensorflow as tf 12 | from tensorflow.examples.tutorials.mnist import input_data 13 | import matplotlib.pyplot as plt 14 | 15 | # model parameter 16 | noise_dim = 32 # input noise size of Generator 17 | Dhidden = 256 # hidden units of Discriminator's network 18 | Ghidden = 512 # hidden units of Generator's network 19 | K = 8 # maxout units of Discriminator 20 | 21 | mini_batch_size = 50 22 | epoch = 50 23 | samples=(5,6) # samples drawing size 24 | 25 | mnist = input_data.read_data_sets("data/", one_hot=True) 26 | N, num_features = mnist.train.images.shape 27 | period = N // mini_batch_size 28 | 29 | X = tf.placeholder(tf.float32, shape=(None, num_features)) 30 | Z = tf.placeholder(tf.float32, shape=(None, noise_dim)) 31 | keep_prob = tf.placeholder(tf.float32) 32 | 33 | with tf.variable_scope("G"): 34 | GW1 = tf.Variable(tf.random_normal([noise_dim, Ghidden], stddev=0.1)) 35 | Gb1 = tf.Variable(tf.zeros(Ghidden)) 36 | GW2 = tf.Variable(tf.random_normal([Ghidden, num_features], stddev=0.1)) 37 | Gb2 = tf.Variable(tf.zeros(num_features)) 38 | 39 | with tf.variable_scope("D"): 40 | DW1 = tf.Variable(tf.random_normal([num_features, K * Dhidden], stddev=0.01)) 41 | Db1 = tf.Variable(tf.zeros(K * Dhidden)) 42 | DW2 = tf.Variable(tf.random_normal([Dhidden, 1], stddev=0.01)) 43 | Db2 = tf.Variable(tf.zeros(1)) 44 | 45 | def discriminator(xx): 46 | u = tf.reshape(tf.matmul(xx, DW1) + Db1, [-1, K, Dhidden]) 47 | Dh1 = tf.nn.dropout(tf.reduce_max(u, reduction_indices=[1]), keep_prob) 48 | return tf.nn.sigmoid(tf.matmul(Dh1, DW2) + Db2) 49 | 50 | Gh1 = tf.nn.relu(tf.matmul(Z, GW1) + Gb1) 51 | G = tf.nn.sigmoid(tf.matmul(Gh1, GW2) + Gb2) 52 | DG = discriminator(G) 53 | Dloss = -tf.reduce_mean(tf.log(discriminator(X)) + tf.log(1 - DG)) 54 | Gloss = tf.reduce_mean(tf.log(1 - DG) - tf.log(DG + 1e-9)) # the second term for stable learning 55 | 56 | vars = tf.trainable_variables() 57 | Dvars = [v for v in vars if v.name.startswith("D")] 58 | Gvars = [v for v in vars if v.name.startswith("G")] 59 | 60 | Doptimizer = tf.train.AdamOptimizer().minimize(Dloss, var_list=Dvars) 61 | Goptimizer = tf.train.AdamOptimizer().minimize(Gloss, var_list=Gvars) 62 | 63 | sess = tf.Session() 64 | sess.run(tf.initialize_all_variables()) 65 | 66 | dloss = gloss = 0.0 67 | for i in range(epoch * period): 68 | x, _ = mnist.train.next_batch(mini_batch_size) 69 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 70 | loss, _ = sess.run([Dloss, Doptimizer], feed_dict={X:x, Z:z, keep_prob:0.5}) 71 | dloss += loss 72 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 73 | loss, _ = sess.run([Gloss, Goptimizer], feed_dict={Z:z, keep_prob:1.0}) 74 | gloss += loss 75 | 76 | if (i+1) % period == 0: 77 | print("%d: dloss=%.5f, gloss=%.5f" % ((i+1)//period, dloss / period, gloss / period)) 78 | if math.isnan(dloss) or math.isnan(gloss): 79 | sess.run(tf.initialize_all_variables()) # initialize & retry if NaN 80 | dloss = gloss = 0.0 81 | 82 | nsamples = samples[0] * samples[1] 
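# save_figures renders a samples[0] x samples[1] grid of generator outputs for the
# given noise batch z and writes it to `path` as 28x28 grayscale images.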
83 | def save_figures(path, z): 84 | Gz = sess.run(G, feed_dict={Z: z}) 85 | fig = plt.figure() 86 | fig.subplots_adjust(left=0,bottom=0,right=1,top=1) 87 | for i in range(nsamples): 88 | ax = fig.add_subplot(samples[0], samples[1], i + 1) 89 | ax.axis("off") 90 | ax.imshow(Gz[i,:].reshape((28,28)), cmap=plt.get_cmap("gray")) 91 | fig.savefig(path) 92 | plt.close(fig) 93 | 94 | z = numpy.random.uniform(-1, 1, size=(nsamples, noise_dim)) 95 | #z[:,0] = numpy.arange(0, nsamples) / (nsamples - 1) * 2 - 1 96 | save_figures("gan-mnist.png", z) 97 | 98 | -------------------------------------------------------------------------------- /extractcontent/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Extract Web Content - Test 5 | # (c)2010 Nakatani Shuyo, Cybozu Labs Inc. 6 | 7 | import sys, os, re 8 | from optparse import OptionParser 9 | sys.path.append("../hmm") 10 | from hmm import HMM 11 | 12 | def load_data(directory): 13 | import glob 14 | htmllist = glob.glob(os.path.join(directory, "*.html")) 15 | features = [] 16 | for filename in htmllist: 17 | taglist = [] 18 | f = open(filename, 'r') 19 | for line in f: 20 | tags = re.findall(r'<(\w+)',line) 21 | if len(tags)>0: taglist.extend([x.lower() for x in tags]) 22 | f.close() 23 | features.append(taglist) 24 | return features 25 | 26 | def main(): 27 | parser = OptionParser() 28 | parser.add_option("-t", dest="test", help="test data directory") 29 | parser.add_option("-m", dest="model", help="model data filename to save") 30 | (options, args) = parser.parse_args() 31 | if not options.model: parser.error("need model data filename(-m)") 32 | 33 | hmm = HMM() 34 | hmm.load(options.model) 35 | 36 | if options.test: 37 | tests = load_data(options.test) 38 | for x in tests: 39 | print zip(x, hmm.Viterbi(hmm.words2id(x))) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | 44 | -------------------------------------------------------------------------------- /extractcontent/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Extract Web Content with HMM 5 | # (c)2010 Nakatani Shuyo, Cybozu Labs Inc. 
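# Pipeline: load_data() reduces each *.html file in the training directory to the
# sequence of its opening tag names (lowercased); these tag sequences are the
# observations used to fit an HMM with K latent states (-k). Training runs up to
# -i iterations, stopping early once the log-likelihood decreases, and the model
# is saved to the -m file or dumped to stdout.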
6 | 7 | import sys, os, re 8 | from optparse import OptionParser 9 | sys.path.append("../hmm") 10 | from hmm import HMM 11 | #import numpy 12 | #from numpy.random import dirichlet, randn 13 | 14 | def load_data(directory): 15 | import glob 16 | htmllist = glob.glob(os.path.join(directory, "*.html")) 17 | features = [] 18 | for filename in htmllist: 19 | taglist = [] 20 | f = open(filename, 'r') 21 | for line in f: 22 | tags = re.findall(r'<(\w+)',line) 23 | if len(tags)>0: taglist.extend([x.lower() for x in tags]) 24 | f.close() 25 | features.append(taglist) 26 | return features 27 | 28 | def main(): 29 | parser = OptionParser() 30 | parser.add_option("-d", dest="training", help="training data directory") 31 | parser.add_option("-k", dest="K", type="int", help="number of latent states", default=6) 32 | parser.add_option("-a", dest="a", type="float", help="Dirichlet parameter", default=1.0) 33 | parser.add_option("-i", dest="I", type="int", help="iteration count", default=10) 34 | parser.add_option("-m", dest="model", help="model data filename to save") 35 | (options, args) = parser.parse_args() 36 | if not options.training: parser.error("need training data directory(-d)") 37 | 38 | features = load_data(options.training) 39 | 40 | hmm = HMM() 41 | hmm.set_corpus(features) 42 | hmm.init_inference(options.K, options.a) 43 | pre_L = -1e10 44 | for i in range(options.I): 45 | log_likelihood = hmm.inference() 46 | print i, ":", log_likelihood 47 | if pre_L > log_likelihood: break 48 | pre_L = log_likelihood 49 | if options.model: 50 | hmm.save(options.model) 51 | else: 52 | hmm.dump() 53 | 54 | if __name__ == "__main__": 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /hac/fselect.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | # feature selection 3 | # (cf. 
"Feature Selection and Document Clustering" http://www.csee.umbc.edu/cadip/2002Symposium/kogan.pdf ) 4 | 5 | require '../lib/infinitive.rb' 6 | INF = Infinitive.new 7 | 8 | require 'optparse' 9 | opt = {:n_words=>1000, :type=>:q0, :stopwords=>true} 10 | parser = OptionParser.new 11 | parser.on('-n [VAL]', Integer) {|v| opt[:n_words] = v } 12 | parser.on('-t [VAL]', [:q0, :q1]) {|v| opt[:type] = v } 13 | parser.on('-s', 'exclude stop words') {|v| opt[:stopwords] = false } 14 | parser.parse!(ARGV) 15 | 16 | 17 | filename = ARGV[0] || 'corpus' 18 | data = open(filename){|f| Marshal.load(f) } 19 | docs = data[:docs] 20 | terms = data[:terms] 21 | 22 | 23 | stopwords = Hash.new(true) 24 | <a[1]}[0..opt[:n_words]-1] 75 | 76 | new_terms = {} 77 | ev.each do |term, v| 78 | new_terms[term] = terms[term] 79 | end 80 | 81 | open("#{filename}.#{opt[:type]}", "w") do |f| 82 | Marshal.dump({:docs=>docs, :terms=>new_terms}, f) 83 | end 84 | 85 | puts "#{terms.size} => #{new_terms.size}" 86 | #puts ev.map{|x| x[0]}.join(' ') 87 | -------------------------------------------------------------------------------- /hac/hac.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/hac/hac.rb -------------------------------------------------------------------------------- /hac/naive_hac.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/hac/naive_hac.rb -------------------------------------------------------------------------------- /irt/irt.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require 'zlib' 3 | 4 | THRESHOLD = 1.2 5 | 6 | learnfile = ARGV[0] || 'irt.data' 7 | users = Hash.new 8 | words = Hash.new 9 | begin 10 | open(learnfile) do |f| 11 | users, words = Marshal.load(f) 12 | end 13 | rescue 14 | puts "create new learning data randomizely." 
15 | end 16 | 17 | data = [] 18 | Zlib::GzipReader.open('word_scores.txt.gz') do |f| 19 | while line = f.gets 20 | if line =~ /^([0-9]+)\s+([0-9]+)\s*([0-9\.]+)$/ 21 | user_id = $1.to_i 22 | word_id = $2.to_i 23 | point = $3.to_f 24 | #next if point > 10000 25 | 26 | t = if point < THRESHOLD then 1 else 0 end 27 | data << [user_id, word_id, t] 28 | 29 | users[user_id] = rand unless users.key?(user_id) 30 | words[word_id] = rand unless words.key?(word_id) 31 | end 32 | end 33 | end 34 | 35 | 10000.times do |k| 36 | eta = 0.01 #1.0/(k+10) 37 | e = 0 38 | error = 0 39 | data.sort_by{rand}.each do |user_id, word_id, t| 40 | z = users[user_id] - words[word_id] 41 | y = 1.0/(1.0+Math.exp(-z)) 42 | e -= if t==1 then Math.log(y) else Math.log(1-y) end 43 | error += 1 if (t==1 && y<0.5) || (t==0 && y>0.5) 44 | 45 | grad_e_eta = eta*(y - t) 46 | users[user_id] -= grad_e_eta 47 | words[word_id] += grad_e_eta 48 | end 49 | puts "#{k}: #{error}, #{e}" 50 | open(learnfile+".1", 'w'){|f| Marshal.dump([users,words], f) } if (k % 50) == 0 51 | end 52 | 53 | open(learnfile, 'w'){|f| Marshal.dump([users,words], f) } 54 | 55 | -------------------------------------------------------------------------------- /langdetect/common.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -Ku 2 | 3 | require 'mysql' 4 | require 'optparse' 5 | 6 | module LD 7 | ENTITIES = { 8 | "'"=>"'", 9 | "&"=>"&", 10 | ">"=>">", 11 | "<"=>"<", 12 | """=>'"', 13 | "»"=>"" 14 | } 15 | 16 | def self.optparser 17 | @opt = { 18 | :host=>'localhost', :user=>'root', :passwd=>'', :dbname=>'googlenews', :port=>3306, 19 | :model=>'model.json' 20 | } 21 | 22 | parser = OptionParser.new 23 | parser.on('--host=VAL', String, 'database host') {|v| @opt[:host] = v } 24 | parser.on('--user=VAL', String, 'database user') {|v| @opt[:user] = v } 25 | parser.on('--password=VAL', String, 'database password') {|v| @opt[:passwd] = v } 26 | parser.on('--dbname=VAL', String, 'database name') {|v| @opt[:dbname] = v } 27 | parser.on('--port=VAL', Integer, 'database port') {|v| @opt[:port] = v } 28 | parser.on('-f VAL', String, 'model filename') {|v| @opt[:model] = v } 29 | parser 30 | end 31 | def self.model_filename 32 | @opt[:model] 33 | end 34 | 35 | def self.db_connect 36 | db = Mysql::init 37 | db.options Mysql::SET_CHARSET_NAME, 'utf8' 38 | db.real_connect @opt[:host], @opt[:user], @opt[:passwd], @opt[:dbname], @opt[:port] 39 | db 40 | end 41 | 42 | def self.decode_entity(st) 43 | st.gsub(/&[^ &]+?;/){|m| ENTITIES[m] || m} 44 | end 45 | end 46 | 47 | -------------------------------------------------------------------------------- /langdetect/crawler.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | #require 'rubygems' 5 | require 'open-uri' 6 | require 'rss/2.0' 7 | 8 | require 'common.rb' 9 | require 'detect.rb' 10 | 11 | LD::optparser.parse!(ARGV) 12 | db = LD::db_connect 13 | 14 | # Database 15 | # create database googlenews character set utf8; 16 | # create table news (id int auto_increment, title varchar(1024), lang varchar(8), body text, primary key (id)); 17 | # create index news_title on news (title); 18 | # create index news_lang on news (lang); 19 | ps_select = db.prepare("select id from news where title=?") 20 | ps_insert = db.prepare("insert into news (title,lang,body) values (?,?,?)") 21 | 22 | # Google News RSS 23 | def rssurl(lang) 24 | if lang=="ja" 25 | 
'http://news.google.com/news?hl=ja&ned=us&ie=UTF-8&oe=UTF-8&output=rss' 26 | else 27 | "http://news.google.com/news?pz=1&cf=all&hl=#{lang}&output=rss" 28 | end 29 | end 30 | 31 | langlist = LanguageDetector::LANGLIST 32 | 33 | langlist.each do |lang| 34 | url = rssurl(lang) 35 | #puts url 36 | rss = open(url) {|f| RSS::Parser.parse(f.read, false) } 37 | 38 | rss.items.each do |item| 39 | rs = ps_select.execute(item.title) 40 | if !rs.fetch 41 | body = item.description.gsub(/.*?<\/nobr>/, '').gsub(/<[^>]*>/, ' ').gsub(/ /, ' ').gsub(/[ \t]+/, ' ') 42 | ps_insert.execute item.title, lang, body 43 | end 44 | end 45 | sleep 1 46 | end 47 | 48 | -------------------------------------------------------------------------------- /langdetect/filetest.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | require 'optparse' 5 | require 'nkf' 6 | require 'detect.rb' 7 | 8 | parser = OptionParser.new 9 | model = 'model' 10 | alpha = 1.0 11 | debug_flag = false 12 | parser.on('-f VAL', String, 'model filename') {|v| model = v } 13 | parser.on('-a VAL', Float, 'alpha (additive smoothing)') {|v| alpha = v } 14 | parser.on('-d', 'debug mode') { debug_flag = true } 15 | parser.parse!(ARGV) 16 | 17 | detector = LanguageDetector::Detector.new(model) 18 | detector.debug_on if debug_flag 19 | 20 | ARGV.each do |filename| 21 | text = open(filename){|f| NKF.nkf('-w', f.read) } 22 | text.gsub!(/https?:\/\/[0-9a-zA-Z\.\/\?=\&\-]+/, '') 23 | problist = detector.detect(text, alpha) 24 | puts "#{filename},#{problist.inspect},#{text[0..100].gsub(/\s+/, ' ').strip}" 25 | end 26 | -------------------------------------------------------------------------------- /langdetect/test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | require 'common.rb' 5 | require 'detect.rb' 6 | 7 | parser= LD::optparser 8 | target_id = nil 9 | alpha = 1.0 10 | parser.on('--id=VAL', Integer, 'target text id') {|v| target_id = v } 11 | parser.on('-a VAL', Float, 'alpha (additive smoothing)') {|v| alpha = v } 12 | parser.parse!(ARGV) 13 | 14 | detector = LanguageDetector::Detector.new(LD::model_filename, alpha) 15 | 16 | # Database 17 | db = LD::db_connect 18 | ps_select = if target_id 19 | detector.debug_on 20 | db.prepare("select id,title,lang,body from news where id = ?").execute target_id 21 | else 22 | db.prepare("select id,title,lang,body from news order by lang").execute 23 | end 24 | 25 | count = Hash.new(0) 26 | correct = Hash.new(0) 27 | detected = Hash.new{|h,k| h[k]=Hash.new(0)} 28 | ngramer = detector.ngramer 29 | while rs = ps_select.fetch 30 | id, title, lang, body = rs 31 | title.sub!(/ - [^\-]+$/, '') 32 | text = LD::decode_entity(title + "\n" + body) 33 | 34 | ngramer.clear 35 | detector.init 36 | text.scan(/./) do |x| 37 | ngramer.append x 38 | ngramer.each do |z| 39 | detector.append z 40 | end 41 | break if detector.maxprob > 0.99999 42 | end 43 | 44 | problist = detector.problist 45 | puts "#{id},#{lang},#{title},#{problist.inspect}" 46 | count[lang] += 1 47 | correct[lang] += 1 if problist[0][0] == lang 48 | detected[lang][problist[0][0]] += 1 49 | end 50 | 51 | sum = correct_sum = 0 52 | count.keys.sort.each do |lang| 53 | rate = (10000.0 * correct[lang] / count[lang]).to_i / 100.0 54 | list = detected[lang].to_a.sort_by{|x| -x[1]}.map{|x| x.join(':')}.join(',') 55 | puts "#{lang} #{correct[lang]} / #{count[lang]} (#{rate}) [#{list}]" 56 | 
sum += count[lang] 57 | correct_sum += correct[lang] 58 | end 59 | puts "total: #{correct_sum} / #{sum} (#{(10000.0 * correct_sum / sum).to_i / 100.0})" 60 | 61 | -------------------------------------------------------------------------------- /langdetect/train.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | require 'mysql' 5 | require 'json' 6 | require 'common.rb' 7 | require 'detect.rb' 8 | 9 | parser= LD::optparser 10 | opt = {:N=>3, :training_size=>150, :csv=>false, :json=>false} 11 | parser.on('-n VAL', Integer, 'N-gram') {|v| opt[:N] = v } 12 | parser.on('--size=VAL', Integer, 'max size of training data') {|v| opt[:training_size] = v } 13 | parser.on('--csv') {|v| opt[:csv] = true } 14 | parser.parse!(ARGV) 15 | 16 | # Database 17 | db = LD::db_connect 18 | #ps_select = db.prepare("select title,lang,body from news order by id desc") 19 | ps_select = db.prepare("select title,lang,body from news order by rand()") 20 | 21 | ps_select.execute 22 | n_k = Hash.new(0) 23 | p_ik = Hash.new{|h,k| h[k]=Hash.new(0)} 24 | ngramer = LanguageDetector::Ngramer.new(opt[:N]) 25 | while rs = ps_select.fetch 26 | title, lang, body = rs 27 | title.sub!(/ - [^\-]+$/, '') 28 | next if n_k[lang] >= opt[:training_size] 29 | n_k[lang] += 1 30 | text = LD::decode_entity(title + "\n" + body) 31 | 32 | grams = Hash.new 33 | ngramer.clear 34 | text.scan(/./) do |x| 35 | ngramer.append x 36 | ngramer.each do |z| 37 | grams[z] = 1 38 | end 39 | end 40 | grams.each do |gram, dummy| 41 | p_ik[gram][lang] += 1 42 | end 43 | end 44 | 45 | if opt[:csv] 46 | puts ","+LD::LANGLIST.join(',') 47 | p_ik.to_a.sort.each do |unigram,langs| 48 | langs.default = '' 49 | puts "'#{unigram.unpack('H*')[0]},#{LD::LANGLIST.map{|lang| langs[lang]}.join(',')}" 50 | end 51 | end 52 | 53 | keys = p_ik.keys 54 | keys.each do |chunk| 55 | langs = p_ik[chunk].keys 56 | langs.each do |lang| 57 | p_ik[chunk].delete lang if p_ik[chunk][lang] <= 2 58 | end 59 | p_ik.delete chunk if p_ik[chunk].size == 0 60 | end 61 | 62 | p_ik.default = 0 63 | open(LD::model_filename, 'w') do |f| 64 | JSON.dump([n_k, p_ik, opt[:N]], f) 65 | end 66 | 67 | -------------------------------------------------------------------------------- /lda/lda.r: -------------------------------------------------------------------------------- 1 | 2 | # Latent Dirichlet Allocation + collapsed Gibbs sampling 3 | # This code is available under the MIT License. 4 | # (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc. 
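# Collapsed Gibbs sampling, as implemented below: for each token (m, n) with word t,
# its current topic is removed from the count tables and a new topic is drawn from
#   p(z = k | rest)  proportional to  (n_z_t[k, t] + beta) / (rowSums(n_z_t)[k] + V * beta)
#                                   * (n_m_z[m, k] + alpha) / (sum(n_m_z[m, ]) + K * alpha),
# after which the counts are restored. theta and phi at the end are the normalized,
# smoothed count tables. (The two Japanese comments in the loop read "set up the
# z_{-i} state" and "sample from p(z | z_{-i})".)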
5 | 6 | K <- 50; 7 | I <- 200; 8 | 9 | filename <- "../data/gift_of_magi.txt"; 10 | argv <- commandArgs(T); 11 | if (length(argv)>0) filename <- commandArgs(T)[1]; 12 | text <- tolower(readLines(filename)); 13 | corpus <- strsplit(text, split="[[:blank:][:punct:]]", perl=T); 14 | 15 | words <- c(); 16 | words_id <- list(); 17 | docs <- list(); 18 | M <- 0; 19 | for(line in corpus) { 20 | doc <- c(); 21 | for (term in line) { 22 | if (term == "") next; 23 | if (is.null(words_id[[term]])) { 24 | words <- append(words, term); 25 | words_id[[term]] <- length(words); 26 | } 27 | doc <- append(doc, words_id[[term]]); 28 | } 29 | if (length(doc)==0) next; 30 | M <- M + 1; 31 | docs[[M]] <- doc; 32 | } 33 | V <- length(words); 34 | 35 | z_m_n <- list(); # M * N_m 36 | n_m_z <- matrix(numeric(M*K),M); 37 | n_z_t <- matrix(numeric(K*V),K); 38 | n_z <- numeric(K); 39 | n_terms <- 0; 40 | 41 | for(m in 1:M) { 42 | doc <- docs[[m]]; 43 | N_m <- length(doc); 44 | 45 | z_n <- sample(1:K, N_m, replace=T); 46 | z_m_n[[m]] <- z_n; 47 | for(n in 1:N_m) { 48 | z <- z_n[n]; 49 | t <- doc[n]; 50 | n_m_z[m,z] <- n_m_z[m,z] + 1; 51 | n_z_t[z,t] <- n_z_t[z,t] + 1; 52 | n_z[z] <- n_z[z] + 1; 53 | } 54 | n_terms <- n_terms + N_m; 55 | } 56 | 57 | alpha <- 0.001; 58 | beta <- 0.001; 59 | 60 | for(ita in 1:I) { 61 | #print("-------------------------------------------------------------------"); 62 | #print(ita); 63 | 64 | changes <- 0; 65 | for(m in 1:M) { 66 | doc <- docs[[m]]; 67 | N_m <- length(doc); 68 | for(n in 1:N_m) { 69 | t <- doc[n]; 70 | z <- z_m_n[[m]][n]; # z_i 71 | 72 | # z_{-i} の状況を作る 73 | n_m_z[m,z] <- n_m_z[m,z] - 1; 74 | n_z_t[z,t] <- n_z_t[z,t] - 1; 75 | n_z[z] <- n_z[z] - 1; 76 | 77 | # p(z|z_{-i}) からサンプリング 78 | denom_a <- sum(n_m_z[m,]) + K * alpha; 79 | denom_b <- rowSums(n_z_t) + V * beta; 80 | p_z <- (n_z_t[,t] + beta) / denom_b * (n_m_z[m,] + alpha) / denom_a; 81 | z_i <- sample(1:K, 1, prob=p_z); 82 | 83 | z_m_n[[m]][n] <- z_i; 84 | #print(p_z); 85 | #cat(sprintf("%d,%d: %d => %d\n", m, n, z, z_i)); 86 | if (z != z_i) changes <- changes + 1; 87 | 88 | n_m_z[m,z_i] <- n_m_z[m,z_i] + 1; 89 | n_z_t[z_i,t] <- n_z_t[z_i,t] + 1; 90 | n_z[z_i] <- n_z[z_i] + 1; 91 | } 92 | } 93 | cat(sprintf("%d: %d/%d\n", ita, changes, n_terms)); 94 | } 95 | 96 | phi <- matrix(numeric(K*V), K); 97 | theta <- matrix(numeric(M*K), M); 98 | for(m in 1:M) { 99 | theta_m <- n_m_z[m,] + alpha; 100 | theta[m,] <- theta_m / sum(theta_m); 101 | } 102 | for(z in 1:K) { 103 | phi_z <- n_z_t[z,] + beta; 104 | phi[z,] <- phi_z / sum(phi_z); 105 | } 106 | colnames(phi) <- words; 107 | 108 | options(digits=5, scipen=1, width=100); 109 | sink(format(Sys.time(), "lda%m%d%H%M.txt")); 110 | 111 | for(m in 1:M) { 112 | doc <- docs[[m]]; 113 | N_m <- length(doc); 114 | cat(sprintf("\n[corpus %d]-------------------------------------\n", m)); 115 | print(theta[m,]); 116 | for(n in 1:N_m) { 117 | cat(sprintf("%s : %d\n", words[[doc[n]]], z_m_n[[m]][n])); 118 | } 119 | } 120 | 121 | print(phi); 122 | sink(); 123 | 124 | -------------------------------------------------------------------------------- /lda/lda_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This code is available under the MIT License. 4 | # (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc. 
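# Benchmark harness: runs LDA_CVB0 (collapsed variational Bayes, zeroth-order) and
# LDA (collapsed Gibbs sampling), each with and without smart initialization, on a
# range of Brown corpus files, writing training-set perplexity per iteration and the
# top 20 words of every topic (every 10 iterations) to a timestamped log file.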
5 | 6 | import numpy 7 | 8 | class FileOutput: 9 | def __init__(self, file): 10 | import datetime 11 | self.file = file + datetime.datetime.now().strftime('_%m%d_%H%M%S.txt') 12 | def out(self, st): 13 | with open(self.file, 'a') as f: 14 | print >>f, st 15 | 16 | def lda_learning(f, LDA, smartinit, options, docs, voca, plimit=1): 17 | import time 18 | t0 = time.time() 19 | 20 | if options.seed != None: numpy.random.seed(options.seed) 21 | lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), smartinit) 22 | 23 | pre_perp = lda.perplexity() 24 | f.out("alg=%s smart_init=%s initial perplexity=%f" % (LDA.__name__, smartinit, pre_perp)) 25 | 26 | pc = 0 27 | for i in range(options.iteration): 28 | if i % 10==0: output_word_topic_dist(f, lda, voca) 29 | lda.inference() 30 | perp = lda.perplexity() 31 | f.out("-%d p=%f" % (i + 1, perp)) 32 | if pre_perp is not None: 33 | if pre_perp < perp: 34 | pc += 1 35 | if pc >= plimit: 36 | output_word_topic_dist(f, lda, voca) 37 | pre_perp = None 38 | else: 39 | pc = 0 40 | pre_perp = perp 41 | output_word_topic_dist(f, lda, voca) 42 | 43 | t1 = time.time() 44 | f.out("time = %f\n" % (t1 - t0)) 45 | 46 | def output_word_topic_dist(f, lda, voca): 47 | phi = lda.worddist() 48 | for k in range(lda.K): 49 | f.out("\n-- topic: %d" % k) 50 | for w in numpy.argsort(-phi[k])[:20]: 51 | f.out("%s: %f" % (voca[w], phi[k,w])) 52 | 53 | def main(): 54 | import optparse 55 | import vocabulary 56 | import lda 57 | import lda_cvb0 58 | parser = optparse.OptionParser() 59 | parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)", default="1:100") 60 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) 61 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) 62 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) 63 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 64 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 65 | parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) 66 | parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=1) 67 | (options, args) = parser.parse_args() 68 | 69 | corpus = vocabulary.load_corpus(options.corpus) 70 | voca = vocabulary.Vocabulary(options.stopwords) 71 | docs = [voca.doc_to_ids(doc) for doc in corpus] 72 | if options.df > 0: docs = voca.cut_low_freq(docs, options.df) 73 | 74 | f = FileOutput("lda_test") 75 | f.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(docs), len(voca.vocas), options.K, options.alpha, options.beta)) 76 | 77 | lda_learning(f, lda_cvb0.LDA_CVB0, False, options, docs, voca) 78 | lda_learning(f, lda_cvb0.LDA_CVB0, True, options, docs, voca) 79 | lda_learning(f, lda.LDA, False, options, docs, voca, 2) 80 | lda_learning(f, lda.LDA, True, options, docs, voca, 2) 81 | 82 | if __name__ == "__main__": 83 | main() 84 | 85 | -------------------------------------------------------------------------------- /lda/lda_test2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This code is available under the MIT License. 4 | # (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc. 
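# Variant of lda_test.py with held-out evaluation: every 10th token of each document
# is moved to a test set (converted to (word, freq) pairs for the CVB0 models), and
# perplexity is measured on that held-out set instead of the training data.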
5 | 6 | import numpy 7 | 8 | class FileOutput: 9 | def __init__(self, file): 10 | import datetime 11 | self.file = file + datetime.datetime.now().strftime('_%m%d_%H%M%S.txt') 12 | def out(self, st): 13 | with open(self.file, 'a') as f: 14 | print >>f, st 15 | 16 | def lda_learning(f, LDA, smartinit, options, docs, test_docs, voca, plimit=1): 17 | import time 18 | t0 = time.time() 19 | 20 | if options.seed != None: numpy.random.seed(options.seed) 21 | lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), smartinit) 22 | 23 | pre_perp = lda.perplexity(test_docs) 24 | f.out("alg=%s smart_init=%s initial perplexity=%f" % (LDA.__name__, smartinit, pre_perp)) 25 | 26 | pc = 0 27 | for i in range(options.iteration): 28 | if i % 10==0: output_word_topic_dist(f, lda, voca) 29 | lda.inference() 30 | perp = lda.perplexity(test_docs) 31 | f.out("-%d p=%f" % (i + 1, perp)) 32 | if pre_perp is not None: 33 | if pre_perp < perp: 34 | pc += 1 35 | if pc >= plimit: 36 | output_word_topic_dist(f, lda, voca) 37 | pre_perp = None 38 | else: 39 | pc = 0 40 | pre_perp = perp 41 | output_word_topic_dist(f, lda, voca) 42 | 43 | t1 = time.time() 44 | f.out("time = %f\n" % (t1 - t0)) 45 | 46 | def output_word_topic_dist(f, lda, voca): 47 | phi = lda.worddist() 48 | for k in range(lda.K): 49 | f.out("\n-- topic: %d" % k) 50 | for w in numpy.argsort(-phi[k])[:20]: 51 | f.out("%s: %f" % (voca[w], phi[k,w])) 52 | 53 | def conv_word_freq(docs): 54 | result = [] 55 | for doc in docs: 56 | term_freq = dict() 57 | for w in doc: 58 | if w in term_freq: 59 | term_freq[w] += 1 60 | else: 61 | term_freq[w] = 1 62 | result.append(term_freq.items()) 63 | return result 64 | 65 | def main(): 66 | import optparse 67 | import vocabulary 68 | import lda 69 | import lda_cvb0 70 | parser = optparse.OptionParser() 71 | parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)", default="0:100") 72 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) 73 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) 74 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) 75 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 76 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 77 | parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) 78 | parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=10) 79 | (options, args) = parser.parse_args() 80 | 81 | corpus = vocabulary.load_corpus(options.corpus) 82 | voca = vocabulary.Vocabulary(options.stopwords) 83 | docs = [voca.doc_to_ids(doc) for doc in corpus] 84 | if options.df > 0: docs = voca.cut_low_freq(docs, options.df) 85 | train_docs = [[x for i, x in enumerate(doc) if i % 10 != 0] for doc in docs] 86 | test_docs = [[x for i, x in enumerate(doc) if i % 10 == 0] for doc in docs] 87 | test_docs_wf = conv_word_freq(test_docs) 88 | 89 | f = FileOutput("lda_test2") 90 | f.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(docs), len(voca.vocas), options.K, options.alpha, options.beta)) 91 | 92 | lda_learning(f, lda_cvb0.LDA_CVB0, False, options, train_docs, test_docs_wf, voca) 93 | lda_learning(f, lda_cvb0.LDA_CVB0, True, options, train_docs, test_docs_wf, voca) 94 | lda_learning(f, lda.LDA, False, options, train_docs, test_docs, voca, 2) 95 | lda_learning(f, 
lda.LDA, True, options, train_docs, test_docs, voca, 2) 96 | 97 | if __name__ == "__main__": 98 | main() 99 | 100 | -------------------------------------------------------------------------------- /lda/ldacvb0_cpp/README.md: -------------------------------------------------------------------------------- 1 | LDA CVB0 in C++ 2 | ====================== 3 | 4 | 5 | How to Build 6 | ------ 7 | 8 | git clone git://github.com/shuyo/iir.git 9 | cd iir/lda/ldacvb0_cpp 10 | git clone git://github.com/herumi/cybozulib.git 11 | MSBuild.exe ldacvb0.sln /p:Configuration=Release /p:Platform="Win32" 12 | 13 | 14 | Usage 15 | ------ 16 | 17 | On cygwin, 18 | 19 | curl http://nltk.googlecode.com/svn/trunk/nltk_data/packages/corpora/brown.zip -O 20 | unzip brown.zip 21 | Release/ldacvb0.exe brown/???? 22 | 23 | 24 | Options 25 | ------ 26 | 27 | + -k : topic size (20) 28 | + -i : number of learning iteration (100) 29 | + -a : parameter alpha (0.1) 30 | + -b : parameter beta (0.01) 31 | + -n : how many top to print in topic-word distribution (20) 32 | + -p : use corpus with POS annotation 33 | 34 | 35 | License 36 | ---------- 37 | 38 | Copyright © 2013 Nakatani Shuyo / Cybozu Labs, Inc 39 | 40 | Distributed under the [MIT License][mit]. 41 | 42 | [MIT]: http://www.opensource.org/licenses/mit-license.php 43 | -------------------------------------------------------------------------------- /lda/ldacvb0_cpp/ldacvb0.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ldacvb0", "ldacvb0\ldacvb0.vcxproj", "{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}" 5 | EndProject 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ldacvb0_test", "ldacvb0_test\ldacvb0_test.vcxproj", "{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}" 7 | ProjectSection(ProjectDependencies) = postProject 8 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25} = {7E7F27E1-8399-470B-A9F5-877EC4E8BA25} 9 | EndProjectSection 10 | EndProject 11 | Global 12 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 13 | Debug|Win32 = Debug|Win32 14 | Debug|x64 = Debug|x64 15 | Release|Win32 = Release|Win32 16 | Release|x64 = Release|x64 17 | EndGlobalSection 18 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 19 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|Win32.ActiveCfg = Debug|Win32 20 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|Win32.Build.0 = Debug|Win32 21 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|x64.ActiveCfg = Debug|x64 22 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|x64.Build.0 = Debug|x64 23 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|Win32.ActiveCfg = Release|Win32 24 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|Win32.Build.0 = Release|Win32 25 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|x64.ActiveCfg = Release|x64 26 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|x64.Build.0 = Release|x64 27 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|Win32.ActiveCfg = Debug|Win32 28 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|Win32.Build.0 = Debug|Win32 29 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|x64.ActiveCfg = Debug|x64 30 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|x64.Build.0 = Debug|x64 31 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|Win32.ActiveCfg = Release|Win32 32 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|Win32.Build.0 = Release|Win32 33 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|x64.ActiveCfg = Release|x64 34 | 
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|x64.Build.0 = Release|x64 35 | EndGlobalSection 36 | GlobalSection(SolutionProperties) = preSolution 37 | HideSolutionNode = FALSE 38 | EndGlobalSection 39 | EndGlobal 40 | -------------------------------------------------------------------------------- /lda/ldacvb0_cpp/ldacvb0/ldacvb0.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | @file 3 | @brief LDA CVB0 4 | Latent Dirichlet Allocation - Collapsed Variational Bayesian Estimation 5 | 6 | Copyright (C) 2013 Nakatani Shuyo / Cybozu Labs, Inc., all rights reserved. 7 | This code is licensed under the MIT license. 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include "cybozu/string.hpp" 14 | #include 15 | #include 16 | #include "ldacvb0.hpp" 17 | 18 | void printnwk(const cybozu::ldacvb0::LDA_CVB0& model, const std::string& word) { 19 | auto voca = model.docs_.vocabularies; 20 | size_t w = voca.id(word); 21 | auto i = model.n_wk.begin()+ w * model.K_; 22 | 23 | std::cout << "[" << word << "]" << std::endl; 24 | std::cout << "( "; 25 | for (size_t k=0;k 32 | void printHighFreqWords(const cybozu::ldacvb0::Documents &docs) { 33 | for(auto df=docs.docfreq.begin(), dfend = docs.docfreq.end(); df!=dfend; ++df) { 34 | if (df->second > (int)M/2) { 35 | std::cout << docs.vocabularies.vocalist[df->first] << " " << df->second << std::endl; 36 | } 37 | } 38 | } 39 | 40 | 41 | int main(int argc, char* argv[]) { 42 | 43 | int K = 20, I = 100, N_WORDS = 20; 44 | size_t ldf = 1, udf = 0; // lower and upper limit of document frequency 45 | double alpha = 0.1; 46 | double beta = 0.01; 47 | bool isCorpusWithPos = false; 48 | 49 | std::vector files; 50 | 51 | for(int i=1;i=argc) goto ERROR_OPT_K; 56 | K = atoi(argv[i]); 57 | } else if (st == "-i") { 58 | if (++i>=argc) goto ERROR_OPT_I; 59 | I = atoi(argv[i]); 60 | } else if (st == "-n") { 61 | if (++i>=argc) goto ERROR_OPT_N; 62 | N_WORDS = atoi(argv[i]); 63 | } else if (st == "--ldf") { 64 | if (++i>=argc) goto ERROR_OPT_DF; 65 | ldf = atoi(argv[i]); 66 | } else if (st == "--udf") { 67 | if (++i>=argc) goto ERROR_OPT_DF; 68 | udf = atoi(argv[i]); 69 | } else if (st == "-a") { 70 | if (++i>=argc) goto ERROR_OPT_A; 71 | alpha = atof(argv[i]); 72 | } else if (st == "-b") { 73 | if (++i>=argc) goto ERROR_OPT_B; 74 | beta = atof(argv[i]); 75 | } else if (st == "-p") { 76 | isCorpusWithPos = true; 77 | } else { 78 | files.push_back(st); 79 | } 80 | } 81 | 82 | { 83 | cybozu::ldacvb0::Documents orgdocs(isCorpusWithPos?cybozu::ldacvb0::REXWORD_WITH_POS:cybozu::ldacvb0::REXWORD), docs; 84 | 85 | for(auto i=files.begin(), iend=files.end();i!=iend;++i) { 86 | try { 87 | cybozu::Mmap map(*i); 88 | const char *p = map.get(); 89 | const char *end = p + map.size(); 90 | orgdocs.add(p, end); 91 | } catch (std::exception& e) { 92 | printf("%s\n", e.what()); 93 | } 94 | } 95 | 96 | size_t M = orgdocs.size(); 97 | size_t orgV = orgdocs.vocabularies.size(); 98 | if (orgV <= 0) goto ERROR_NO_VOCA; 99 | 100 | if (udf == 0) udf = M / 2; 101 | truncDocFreq(docs, orgdocs, ldf, udf); 102 | 103 | size_t V = docs.vocabularies.size(); 104 | if (V <= 0) goto ERROR_NO_VOCA; 105 | 106 | std::cout << "M = " << M; 107 | std::cout << ", N = " << docs.N; 108 | std::cout << ", V = " << V << " / " << orgV << std::endl; 109 | std::cout << "K = " << K << ", alpha = " << alpha << ", beta = " << beta << std::endl; 110 | 111 | cybozu::ldacvb0::LDA_CVB0 model(K, V, alpha, beta, docs); 112 | 113 | for(int i=0;i ts(N_WORDS); 126 | size_t id = 0; 
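        // Scan topic k's column of the word-topic distribution (elements k, k+K, k+2K, ...)
        // keeping the N_WORDS highest-scoring word ids in `ts`; the resulting table is
        // printed as word, document frequency, corpus frequency and score.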
127 | for(auto i = worddist.begin() + k; id < V; i+=K, ++id) { 128 | ts.add(*i, id); 129 | } 130 | 131 | auto table = ts.getTable(); 132 | auto tend = table.end(); 133 | for (auto t = table.begin(); t!=tend; ++t) { 134 | const std::string& w = voca.vocalist[t->idx]; 135 | std::cout << w << "\t" << docs.docfreq[t->idx] << "\t" << voca.count(w) << "\t" << t->score << std::endl; 136 | } 137 | } 138 | 139 | /* 140 | auto i = worddist.begin(); 141 | //auto i = model.n_wk->begin(); 142 | for(size_t id = 0; id < V;++id) { 143 | const std::string& w = voca.vocalist[id]; 144 | std::cout << id << "\t" << w << "\t" << docs.docfreq[id] << "\t" << voca.count(w); 145 | for (int k=0;k0: 27 | corpus.append(doc) 28 | labels.append(label) 29 | f.close() 30 | return labelmap.keys(), corpus, labels 31 | 32 | class LLDA: 33 | def __init__(self, K, alpha, beta): 34 | #self.K = K 35 | self.alpha = alpha 36 | self.beta = beta 37 | 38 | def term_to_id(self, term): 39 | if term not in self.vocas_id: 40 | voca_id = len(self.vocas) 41 | self.vocas_id[term] = voca_id 42 | self.vocas.append(term) 43 | else: 44 | voca_id = self.vocas_id[term] 45 | return voca_id 46 | 47 | def complement_label(self, label): 48 | if not label: return numpy.ones(len(self.labelmap)) 49 | vec = numpy.zeros(len(self.labelmap)) 50 | vec[0] = 1.0 51 | for x in label: vec[self.labelmap[x]] = 1.0 52 | return vec 53 | 54 | def set_corpus(self, labelset, corpus, labels): 55 | labelset.insert(0, "common") 56 | self.labelmap = dict(zip(labelset, range(len(labelset)))) 57 | self.K = len(self.labelmap) 58 | 59 | self.vocas = [] 60 | self.vocas_id = dict() 61 | self.labels = numpy.array([self.complement_label(label) for label in labels]) 62 | self.docs = [[self.term_to_id(term) for term in doc] for doc in corpus] 63 | 64 | M = len(corpus) 65 | V = len(self.vocas) 66 | 67 | self.z_m_n = [] 68 | self.n_m_z = numpy.zeros((M, self.K), dtype=int) 69 | self.n_z_t = numpy.zeros((self.K, V), dtype=int) 70 | self.n_z = numpy.zeros(self.K, dtype=int) 71 | 72 | for m, doc, label in zip(range(M), self.docs, self.labels): 73 | N_m = len(doc) 74 | #z_n = [label[x] for x in numpy.random.randint(len(label), size=N_m)] 75 | z_n = [numpy.random.multinomial(1, label / label.sum()).argmax() for x in range(N_m)] 76 | self.z_m_n.append(z_n) 77 | for t, z in zip(doc, z_n): 78 | self.n_m_z[m, z] += 1 79 | self.n_z_t[z, t] += 1 80 | self.n_z[z] += 1 81 | 82 | def inference(self): 83 | V = len(self.vocas) 84 | for m, doc, label in zip(range(len(self.docs)), self.docs, self.labels): 85 | for n in range(len(doc)): 86 | t = doc[n] 87 | z = self.z_m_n[m][n] 88 | self.n_m_z[m, z] -= 1 89 | self.n_z_t[z, t] -= 1 90 | self.n_z[z] -= 1 91 | 92 | denom_a = self.n_m_z[m].sum() + self.K * self.alpha 93 | denom_b = self.n_z_t.sum(axis=1) + V * self.beta 94 | p_z = label * (self.n_z_t[:, t] + self.beta) / denom_b * (self.n_m_z[m] + self.alpha) / denom_a 95 | new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax() 96 | 97 | self.z_m_n[m][n] = new_z 98 | self.n_m_z[m, new_z] += 1 99 | self.n_z_t[new_z, t] += 1 100 | self.n_z[new_z] += 1 101 | 102 | def phi(self): 103 | V = len(self.vocas) 104 | return (self.n_z_t + self.beta) / (self.n_z[:, numpy.newaxis] + V * self.beta) 105 | 106 | def theta(self): 107 | """document-topic distribution""" 108 | n_alpha = self.n_m_z + self.labels * self.alpha 109 | return n_alpha / n_alpha.sum(axis=1)[:, numpy.newaxis] 110 | 111 | def perplexity(self, docs=None): 112 | if docs == None: docs = self.docs 113 | phi = self.phi() 114 | thetas = self.theta() 
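        # perplexity = exp( -(1/N) * sum over tokens w of log( sum_k theta[m, k] * phi[k, w] ) )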
115 | 116 | log_per = N = 0 117 | for doc, theta in zip(docs, thetas): 118 | for w in doc: 119 | log_per -= numpy.log(numpy.inner(phi[:,w], theta)) 120 | N += len(doc) 121 | return numpy.exp(log_per / N) 122 | 123 | def main(): 124 | parser = OptionParser() 125 | parser.add_option("-f", dest="filename", help="corpus filename") 126 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001) 127 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001) 128 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) 129 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 130 | (options, args) = parser.parse_args() 131 | if not options.filename: parser.error("need corpus filename(-f)") 132 | 133 | labelset, corpus, labels = load_corpus(options.filename) 134 | 135 | llda = LLDA(options.K, options.alpha, options.beta) 136 | llda.set_corpus(labelset, corpus, labels) 137 | 138 | for i in range(options.iteration): 139 | sys.stderr.write("-- %d " % (i + 1)) 140 | llda.inference() 141 | #print llda.z_m_n 142 | 143 | phi = llda.phi() 144 | for v, voca in enumerate(llda.vocas): 145 | #print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]]) 146 | print ','.join([voca]+[str(x) for x in phi[:,v]]) 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /lda/llda_nltk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Labeled LDA using nltk.corpus.reuters as dataset 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
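# Experiment script: samples -n documents from nltk's Reuters corpus, uses each
# document's category set as its Labeled-LDA labels (plus the implicit "common"
# label added in LLDA.set_corpus), runs -i Gibbs iterations while reporting
# perplexity, and finally prints the top 20 words for each label's topic.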
7 | 8 | import sys, string, random, numpy 9 | from nltk.corpus import reuters 10 | from llda import LLDA 11 | from optparse import OptionParser 12 | 13 | parser = OptionParser() 14 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001) 15 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001) 16 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=50) 17 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 18 | parser.add_option("-s", dest="seed", type="int", help="random seed", default=None) 19 | parser.add_option("-n", dest="samplesize", type="int", help="dataset sample size", default=100) 20 | (options, args) = parser.parse_args() 21 | random.seed(options.seed) 22 | numpy.random.seed(options.seed) 23 | 24 | idlist = random.sample(reuters.fileids(), options.samplesize) 25 | 26 | labels = [] 27 | corpus = [] 28 | for id in idlist: 29 | labels.append(reuters.categories(id)) 30 | corpus.append([x.lower() for x in reuters.words(id) if x[0] in string.ascii_letters]) 31 | reuters.words(id).close() 32 | labelset = list(set(reduce(list.__add__, labels))) 33 | 34 | 35 | llda = LLDA(options.K, options.alpha, options.beta) 36 | llda.set_corpus(labelset, corpus, labels) 37 | 38 | print "M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K) 39 | 40 | for i in range(options.iteration): 41 | sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity())) 42 | llda.inference() 43 | print "perplexity : %.4f" % llda.perplexity() 44 | 45 | phi = llda.phi() 46 | for k, label in enumerate(labelset): 47 | print "\n-- label %d : %s" % (k, label) 48 | for w in numpy.argsort(-phi[k])[:20]: 49 | print "%s: %.4f" % (llda.vocas[w], phi[k,w]) 50 | 51 | -------------------------------------------------------------------------------- /lda/twentygroups.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # 20 Groups Loader 5 | # - load data at http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.html 6 | # This code is available under the MIT License. 7 | # (c)2012 Nakatani Shuyo / Cybozu Labs Inc. 
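# Loader walks the dataset directory, using each subdirectory name as a label and
# reading at most docs_threshold_each_label files per label; readTerms() lowercases
# alphabetic tokens, the built-in stop word list is optionally removed, and only
# words occurring at least freq_threshold times in the corpus are kept. main() then
# fits LDA_CVB0 on the resulting documents.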
8 | 9 | import os, codecs, re 10 | 11 | STOPWORDS = """ 12 | a b c d e f g h i j k l m n o p q r s t u v w x y z 13 | the of in and have to it was or were this that with is some on for so 14 | how you if would com be your my one not never then take for an can no 15 | but aaa when as out just from does they back up she those who another 16 | her do by must what there at very are am much way all any other me he 17 | something someone doesn his also its has into us him than about their 18 | may too will had been we them why did being over without these could 19 | out which only should even well more where after while anyone our now 20 | such under two ten else always going either each however non let done 21 | ever between anything before every same since because quite sure here 22 | nothing new don off still down yes around few many own 23 | go get know think like make say see look use said 24 | """ 25 | 26 | def readTerms(target): 27 | with codecs.open(target, 'rb', 'latin1') as f: 28 | text = re.sub(r'^(.+\n)*\n', '', f.read()) 29 | return [w.group(0).lower() for w in re.finditer(r'[A-Za-z]+', text)] 30 | 31 | class Loader: 32 | def __init__(self, dirpath, freq_threshold=1, docs_threshold_each_label=100, includes_stopwords=False): 33 | if includes_stopwords: 34 | stopwords = set(re.split(r'\s', STOPWORDS)) 35 | else: 36 | stopwords = [] 37 | 38 | self.resourcenames = [] 39 | self.labels = [] 40 | self.label2id = dict() 41 | self.doclabelids = [] 42 | vocacount = dict() 43 | tempdocs = [] 44 | 45 | dirlist = os.listdir(dirpath) 46 | for label in dirlist: 47 | path = os.path.join(dirpath, label) 48 | if os.path.isdir(path): 49 | label_id = len(self.labels) 50 | self.label2id[label] = label_id 51 | self.labels.append(label) 52 | 53 | filelist = os.listdir(path) 54 | for i, s in enumerate(filelist): 55 | if i >= docs_threshold_each_label: break 56 | 57 | self.resourcenames.append(os.path.join(label, s)) 58 | self.doclabelids.append(label_id) 59 | 60 | wordlist = readTerms(os.path.join(path, s)) 61 | tempdocs.append(wordlist) 62 | 63 | for w in wordlist: 64 | if w in vocacount: 65 | vocacount[w] += 1 66 | else: 67 | vocacount[w] = 1 68 | 69 | self.vocabulary = [] 70 | self.vocabulary2id = dict() 71 | for w in vocacount: 72 | if w not in stopwords and vocacount[w] >= freq_threshold: 73 | self.vocabulary2id[w] = len(self.vocabulary) 74 | self.vocabulary.append(w) 75 | 76 | self.docs = [] 77 | for doc in tempdocs: 78 | self.docs.append([self.vocabulary2id[w] for w in doc if w in self.vocabulary2id]) 79 | 80 | def main(): 81 | import optparse 82 | parser = optparse.OptionParser() 83 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) 84 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001) 85 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=10) 86 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=20) 87 | parser.add_option("--seed", dest="seed", type="int", help="random seed", default=None) 88 | parser.add_option("--word_freq_threshold", dest="word_freq_threshold", type="int", default=3) 89 | parser.add_option("--docs_threshold_each_label", dest="docs_threshold_each_label", type="int", default=100) 90 | parser.add_option("-d", dest="dir", help="directory of 20-newsgroups dataset", default="./20groups/mini_newsgroups/") 91 | (options, args) = parser.parse_args() 92 | import numpy 93 | numpy.random.seed(options.seed) 94 | 95 | corpus = 
Loader(options.dir, options.word_freq_threshold, options.docs_threshold_each_label, True) 96 | V = len(corpus.vocabulary) 97 | 98 | import lda_cvb0 as lda 99 | model = lda.LDA_CVB0(options.K, options.alpha, options.beta, corpus.docs, V, True) 100 | print("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus.docs), V, options.K, options.alpha, options.beta)) 101 | 102 | pre_perp = model.perplexity() 103 | print("initial perplexity=%f" % pre_perp) 104 | for i in range(options.iteration): 105 | model.inference() 106 | perp = model.perplexity() 107 | print("-%d p=%f" % (i + 1, perp)) 108 | lda.output_word_topic_dist(model, corpus.vocabulary) 109 | 110 | if __name__ == "__main__": 111 | main() 112 | 113 | -------------------------------------------------------------------------------- /lib/extract_gutenberg.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | module Gutenberg 4 | def self.extract(text) 5 | text = text.gsub(/[ \r]+$/, "") + "\n\n" 6 | $stderr.puts "Warning: HTML-formed comment in #{path}"if text.gsub!(/<-- .+? -->/m, "") 7 | $stderr.puts "Warning: HTML tag in #{path}"if text.gsub!(/.+?<\/HTML>/mi, "") 8 | 9 | negative_phrase = /http|internet|project gutenberg|mail|ocr/i 10 | separator = /^(?:.+?END\*{1,2}|\*{3} START OF THE PROJECT GUTENBERG E(?:BOOK|TEXT).*? \*{3}|\*{9}END OF .+?|\*{3} END OF THE PROJECT GUTENBERG E(?:BOOK|TEXT).+?|\*{3}START\*.+\*START\*{3}|\**\s*This file should be named .+|\*{5}These [eE](?:Books|texts) (?:Are|Were) Prepared By .+\*{5})$/ 11 | 12 | while text =~ separator 13 | pre, post = $`, $' 14 | text = if pre.length > post.length*3 then 15 | pre 16 | elsif post.length > pre.length*3 then 17 | post 18 | elsif pre.scan(negative_phrase).length < post.scan(negative_phrase).length 19 | pre 20 | else 21 | post 22 | end 23 | end 24 | 25 | text.gsub!(/^(?:Executive Director's Notes:|\[?Transcriber's Note|PREPARER'S NOTE|\[Redactor's note|\{This e-text has been prepared|As you may be aware, Project Gutenberg has been involved with|[\[\*]Portions of this header are|A note from the digitizer|ETEXT EDITOR'S BOOKMARKS|\[NOTE:|\[Project Gutenberg is|INFORMATION ABOUT THIS E-TEXT EDITION\n+|If you find any errors|This electronic edition was|Notes about this etext:|A request to all readers:|Comments on the preparation of the E-Text:|The base text for this edition has been provided by).+?\n(?:[\-\*]+)?\n\n/mi, "") 26 | text.gsub!(/^[\[\n](?:[^\[\]\n]+\n)*[^\n]*(?:Project\sGutenberg|\setext\s|\s[A-Za-z0-9]+@[a-z\-]+\.(?:com|net))[^\n]*(?:\n[^\[\]\n]+)*[\]\n]$/i, "") 27 | text.gsub!(/\{The end of etext of .+?\}/, "") 28 | text = text.strip + "\n\n" 29 | 30 | text.gsub!(/^(?:(?:End )?(?:of ?)?(?:by |This |The )?Project Gutenberg(?:'s )?(?:Etext)?|This (?:Gutenberg )?Etext).+?\n\n/mi, "") 31 | text.gsub!(/^(?:\(?E?-?(?:text )?(?:prepared|Processed|scanned|Typed|Produced|Edited|Entered|Transcribed|Converted|created) by|Transcribed from|Scanning and first proofing by|Scanned and proofed by|This e-text|This EBook of|Scanned with|This Etext created by|This eBook was (?:produced|updated) by|Image files scanned in by|\[[^\n]*mostly scanned by|This text was prepared by).+?\n\n/mi, "") 32 | 33 | if text=~/gutenberg|\setext\s|scanner|David Reed/i 34 | $stderr.puts "Warning: remain '#{$&.strip}'" 35 | elsif text=~/[^\s\*]@[^\s]+\./ 36 | $stderr.puts "Warning: maybe remain mail adress" 37 | elsif text.length < 1024 38 | $stderr.puts "Warning: too small body" 39 | end 40 | 41 | text 42 | end 43 | end 44 | 45 | puts 
Gutenberg.extract(ARGF.read) if $0 == __FILE__ 46 | -------------------------------------------------------------------------------- /lib/infinitive.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'rubygems' 4 | require 'stemmer' 5 | require 'linguistics' 6 | Linguistics::use( :en ) 7 | 8 | class Infinitive 9 | def initialize(inflist_file = nil, wordbook_file = nil) 10 | dir = caller(0)[0].sub(/\/[^\/]*:\d+:.+$/,"") 11 | inflist_file = "#{dir}/inflist.txt" unless inflist_file 12 | wordbook_file = "#{dir}/wordbook.txt" unless wordbook_file 13 | 14 | @inflist = Hash.new 15 | open(inflist_file) do |f| 16 | while line = f.gets 17 | @inflist[$1]=$2 if line =~ /^(.+)\t(.+)\n/ 18 | end 19 | end 20 | @infcache = Hash.new 21 | 22 | @wordbook = Hash.new 23 | open(wordbook_file) do |f| 24 | while line = f.gets 25 | @wordbook[line.chomp.stem.downcase]=0 if line !~ /^\s+$|^\s*#/ 26 | end 27 | end 28 | end 29 | 30 | def inf(src) 31 | return @infcache[src] if @infcache.key?(src) 32 | st = @inflist[src] || src.en.infinitive 33 | @infcache[src] = (if st == "" then src else st end).stem 34 | end 35 | 36 | def infinitive(word) 37 | st = word2 = word.downcase 38 | st = word2.stem 39 | if @wordbook.key?(st) 40 | st 41 | else 42 | inf(word2) || st 43 | end 44 | end 45 | end 46 | 47 | -------------------------------------------------------------------------------- /lib/inflist.txt: -------------------------------------------------------------------------------- 1 | women woman 2 | feet foot 3 | gentlemen gentleman 4 | policemen policeman 5 | -------------------------------------------------------------------------------- /lr/lr.r: -------------------------------------------------------------------------------- 1 | # Multi-class Logistic Regression + Stochastic Gradient Descent for R 2 | # (c)2011 Nakatani Shuyo / Cybozu Labs, Inc. 3 | # This code is available under the MIT Licence. 
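# Model sketch (as implemented below): for a feature vector phi(x) and an
# M x K weight matrix w,
#   y(phi, w)     = softmax(phi %*% w)      # class posterior probabilities
#   En(phi, t, w) = -log(sum(y * t))        # per-sample cross-entropy error
#   dEn           = outer(phi, y - t)       # its gradient with respect to w
# Training is stochastic gradient descent, w <- w - eta * dEn, with eta
# starting at 0.1 and multiplied by 0.95 after every pass until it drops
# below 1e-4; linear, quadratic and RBF feature maps are then compared on iris.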
4 | 5 | # コマンドライン処理 6 | commandline <- commandArgs(TRUE) 7 | chart <- "--chart" %in% commandline 8 | i <- match("-i", commandline) 9 | if (is.na(i)) { 10 | I <- 1 11 | } else { 12 | I <- as.numeric(commandline[i + 1]) 13 | } 14 | 15 | # iris dataset 16 | xlist <- scale(iris[1:4]) 17 | tlist <- cbind( 18 | ifelse(iris[5]=="setosa",1,0), 19 | ifelse(iris[5]=="versicolor",1,0), 20 | ifelse(iris[5]=="virginica",1,0) 21 | ) 22 | N <- nrow(xlist) # データ件数 23 | 24 | # 事後確率 25 | y <- function(phi, w) { 26 | y <- c(phi %*% w) 27 | y <- exp(y - max(y)) # exp の中身から、その最大値を引く(オーバーフロー対策) 28 | return(y / sum(y)) 29 | } 30 | 31 | # 誤差関数&勾配 32 | En <- function(phi, t, w) -log(sum(y(phi, w) * t)) 33 | dEn <- function(phi, t, w) outer(phi, y(phi, w) - t) 34 | 35 | inference <- function(title, xlist, tlist, phi) { 36 | PHI <- t(apply(xlist, 1, phi)) # NxM - design matrix 37 | M <- ncol(PHI) # 特徴数(特徴空間の次元) 38 | K <- ncol(tlist) # クラス数 39 | 40 | for (i in 1:I) { 41 | # 重み初期化 42 | w <- matrix(rnorm(M * K), M) 43 | 44 | eta <- 0.1 # 学習率 45 | while (eta > 0.0001) { 46 | for(n in sample(N)) { 47 | w <- w - eta * dEn(PHI[n,], tlist[n,], w) # 確率的勾配降下法 48 | } 49 | eta <- eta * 0.95 50 | } 51 | 52 | ylist <- t(apply(PHI, 1, function(phi) y(phi, w))) 53 | error <- sum(sapply(1:nrow(PHI), function(n) En(PHI[n,], tlist[n,], w))) 54 | cat(sprintf("%s: error=%.3f", title, error), "\n") 55 | 56 | # 可視化 57 | if (chart) { 58 | pairs(xlist, col=rgb(ylist), main=title) 59 | plot(xlist[,c(1,2)], 60 | col=rgb(ylist), 61 | pch=(tlist %*% c(17,16,22)), 62 | main=title, 63 | sub=sprintf("Negative Log Likelihood = %.3f", error) 64 | ) 65 | } 66 | } 67 | 68 | return(w) 69 | } 70 | 71 | if (chart) png(width=640, height=640) 72 | 73 | # 線形特徴関数 74 | phi <- function(x) c(1, x[1], x[2], x[3], x[4]) 75 | w <- inference("Linear Features", xlist, tlist, phi) 76 | 77 | # 二次特徴関数 78 | phi <- function(x) c(1, x[1], x[2], x[3], x[4], 79 | x[1]*x[1], x[1]*x[2], x[1]*x[3], x[1]*x[4], x[2]*x[2], 80 | x[2]*x[3], x[2]*x[4], x[3]*x[3], x[3]*x[4], x[4]*x[4]) 81 | w <- inference("Quadratic Features", xlist, tlist, phi) 82 | 83 | # RBF 特徴関数 84 | for (s in 1:10) { 85 | phi <- function(x) { 86 | c <- seq(-2.5,2.5,by=1) 87 | d <- outer(c,x,"-")^2 88 | return(exp(-c(0, outer(c(outer(c(outer(d[,1],d[,2],"+")),d[,3],"+")),d[,4],"+"))/s)) 89 | } 90 | w <- inference(sprintf("RBF Features (s=%d)", s), xlist, tlist, phi) 91 | } 92 | 93 | if (chart) dev.off() 94 | 95 | 96 | -------------------------------------------------------------------------------- /misc/linear_regression.r: -------------------------------------------------------------------------------- 1 | 2 | polynomial_basis_func <- function(M) { 3 | lapply(1:M, function(u){ u1=u; function(x) x^(u1-1) }) 4 | } 5 | gaussian_basis_func <- function(M, has_bias=T, s=0.1) { 6 | phi <- c() 7 | if (has_bias) phi <- function(x) 0*x+1 # bias 8 | u_i <- seq(0,1,length=ifelse(has_bias, M-1, M)) 9 | append(phi, lapply(u_i, function(u){ u1=u; function(x) exp(-(x-u1)^2/(2*s*s)) })) 10 | } 11 | sigmoid_basis_func <- function(M, has_bias=T, s=0.1) { 12 | phi <- c() 13 | if (has_bias) phi <- function(x) 0*x+1 # bias rep(1, length(x)) 14 | u_i <- seq(0,1,length=ifelse(has_bias, M-1, M)) 15 | append(phi, lapply(u_i, function(u){ u1=u; function(x) 1/(1+exp(-(x-u1)/s)) } )) 16 | } 17 | 18 | xlist <- seq(0, 1, length=250) 19 | tlist <- sin(2*pi*xlist)+rnorm(length(xlist), sd=0.2) 20 | D <- data.frame(x=xlist, t=tlist) 21 | 22 | # PRML's synthetic data set 23 | curve_fitting <- data.frame( 24 | 
x=c(0.000000,0.111111,0.222222,0.333333,0.444444,0.555556,0.666667,0.777778,0.888889,1.000000), 25 | t=c(0.349486,0.830839,1.007332,0.971507,0.133066,0.166823,-0.848307,-0.445686,-0.563567,0.261502) 26 | ) 27 | 28 | calc_evidence <- function(phi, D, alpha=2, beta=25, graph=NULL) { 29 | M <- length(phi) 30 | N <- length(D$x) 31 | PHI <- sapply(phi, function(f)f(D$x)) 32 | 33 | if (!is.null(graph)) { 34 | plot(graph, lty=2, col="blue", xlim=c(0,1), ylim=c(-1.1,1.1), ylab="") 35 | par(new=T) 36 | plot(D, xlim=c(0,1), ylim=c(-1.1,1.1), xlab="", ylab="") 37 | } 38 | 39 | if (beta=="ml") { 40 | w_ML <- solve(t(PHI) %*% PHI) %*% t(PHI) %*% D$t 41 | loss_ML <- D$t - PHI %*% w_ML 42 | beta <- N / sum(loss^2) 43 | if (!is.null(graph)) { 44 | par(new=T) 45 | plot( function(x) sapply(phi, function(f)f(x)) %*% w_ML , col="red", xlim=c(0,1), ylim=c(-1.1,1.1), ylab="") 46 | } 47 | } 48 | 49 | A <- alpha * diag(M) + beta * t(PHI) %*% PHI # equal to S_N(PRML 3.54) 50 | m_N <- beta * solve(A) %*% t(PHI) %*% D$t 51 | loss_m_N <- D$t - PHI %*% m_N 52 | E_m_N <- beta / 2 * sum(loss_m_N^2) + alpha / 2 * sum(m_N^2) 53 | 54 | if (!is.null(graph)) { 55 | par(new=T) 56 | plot( function(x) sapply(phi, function(f)f(x)) %*% m_N, xlim=c(0,1), ylim=c(-1.1,1.1), ylab="") 57 | } 58 | 59 | # model evidence 60 | c(M/2*log(alpha) + N/2*log(beta) - E_m_N - 1/2*log(det(A)) - N/2*log(2*pi), beta) 61 | } 62 | 63 | 64 | a<-sapply(1:9, function(n) calc_evidence(polynomial_basis_func(n), curve_fitting, alpha=5e-3)) 65 | 66 | orig_func <- function(x)sin(2*pi*x) 67 | calc_evidence(gaussian_basis_func(9, F, s=0.37), D, beta="ml", alpha=2, graph=orig_func) 68 | calc_evidence(polynomial_basis_func(4), curve_fitting, alpha=5e-3, beta="ml", graph=orig_func) 69 | 70 | calc_evidence(gaussian_basis_func(6, F), D0, alpha=2, beta="ml", graph=orig_func) 71 | 72 | # ---- 73 | 74 | > a<-sapply(1:9, function(n) calc_evidence(polynomial_basis_func(n), curve_fitting, alpha=5e-3, beta="ml")) 75 | > data.frame(M=0:8, evidence=a[1,], beta_ML=a[2,]) 76 | M evidence beta_ML 77 | 1 0 -13.60463 2.649926 78 | 2 1 -14.48098 4.680463 79 | 3 2 -16.60761 4.752649 80 | 4 3 -14.38654 28.600038 81 | 5 4 -14.20562 28.651286 82 | 6 5 -15.12706 29.206330 83 | 7 6 -15.86874 30.294868 84 | 8 7 -16.43925 30.954700 85 | 9 8 -17.37590 35.353486 86 | > a<-sapply(1:9, function(n) calc_evidence(polynomial_basis_func(n), curve_fitting, alpha=5e-3, beta=11.1)) 87 | > data.frame(M=0:8, evidence=a[1,], beta_ML=a[2,]) 88 | M evidence beta_ML 89 | 1 0 -23.10268 11.1 90 | 2 1 -17.88419 11.1 91 | 3 2 -20.30879 11.1 92 | 4 3 -13.93411 11.1 93 | 5 4 -13.71294 11.1 94 | 6 5 -14.35868 11.1 95 | 7 6 -14.98120 11.1 96 | 8 7 -15.48112 11.1 97 | 9 8 -15.90587 11.1 98 | 99 | # ---- 100 | 101 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n), D0, alpha=2, beta="ml")) 102 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 103 | 104 | n evidence beta_ML 105 | 1 1 -24.15372 1.991818 106 | 2 2 -26.00534 2.080833 107 | 3 3 -28.32203 2.199902 108 | 4 4 -28.98627 2.204309 109 | 5 5 -141.40382 11.622494 110 | 6 6 -262.51315 20.230023 111 | 7 7 -531.38789 39.600405 112 | 8 8 -558.69096 41.400126 113 | 9 9 -566.97144 41.904756 114 | 115 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n, F), D0, alpha=2, beta="ml")) 116 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 117 | n evidence beta_ML 118 | 1 1 -23.54826 2.069294 119 | 2 2 -27.36097 2.194946 120 | 3 3 -28.18806 2.194992 121 | 4 4 -139.53185 11.503610 122 | 5 5 -260.09231 20.131365 123 | 6 6 -523.95049 39.104824 
124 | 7 7 -555.47110 41.216737 125 | 8 8 -564.31398 41.761091 126 | 9 9 -587.98582 43.413693 127 | 128 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n, F), curve_fitting, alpha=2, beta="ml")) 129 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 130 | n evidence beta_ML 131 | 1 1 -9.947638 2.844855 132 | 2 2 -10.661711 2.849207 133 | 3 3 -10.915992 2.895102 134 | 4 4 -32.215789 9.113815 135 | 5 5 -84.841899 21.990299 136 | 6 6 -84.348342 21.365737 137 | 7 7 -110.852386 27.425745 138 | 8 8 -119.979257 29.427823 139 | 9 9 -131.627930 32.218532 140 | 141 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n), curve_fitting, alpha=2, beta="ml")) 142 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 143 | n evidence beta_ML 144 | 1 1 -8.439416 2.649926 145 | 2 2 -9.732442 2.903392 146 | 3 3 -10.598375 2.953347 147 | 4 4 -11.048473 2.954229 148 | 5 5 -32.992820 9.370027 149 | 6 6 -93.909226 24.235243 150 | 7 7 -85.088391 21.481192 151 | 8 8 -132.661228 32.884408 152 | 9 9 -145.904973 35.879891 153 | 154 | -------------------------------------------------------------------------------- /misc/linear_regression.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/misc/linear_regression.xlsx -------------------------------------------------------------------------------- /misc/zipf.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | list = " etaonisrhdlucmfwgypbvkxjqz".split(//) 4 | prob = if ARGV[0] == "unif" 5 | [0.2] + Array.new(26){ 0.8 / 26.0 } 6 | elsif ARGV[0] == "linear" 7 | (1..27).map{|i| (28.0 - i) / (14 * 27) } 8 | elsif ARGV[0] == "zipf" 9 | (1..27).map{|i| 0.256973175704523 / i } 10 | else 11 | [0.1918182,0.1041442,0.0729357,0.0651738,0.0596302,0.0564513,0.0558094,0.0515760,0.0497563, 12 | 0.0492888,0.0349835,0.0331490,0.0225134,0.0217339,0.0202124,0.0197881,0.0171272,0.0158610, 13 | 0.0145984,0.0137645,0.0124248,0.0082903,0.0050529,0.0013692,0.0009033,0.0008606,0.0007836] 14 | end 15 | name = ARGV[0] || "orig" 16 | size = (ARGV[1] || 5000000).to_i 17 | 18 | prob_sum = 0 19 | cum_prob = [] 20 | prob.each do |x| 21 | cum_prob << prob_sum 22 | prob_sum += x 23 | end 24 | cum_prob << 1.0 25 | 26 | module R;def self.rand;Kernel::rand;end;end 27 | random = Random.new rescue R 28 | 29 | map = Hash.new(0) 30 | word = "" 31 | while true 32 | r = random.rand 33 | l = 0 34 | h = prob.size 35 | while h>l+1 36 | m = (h+l)/2 37 | if r < cum_prob[m] 38 | h = m 39 | else 40 | l = m 41 | end 42 | end 43 | x = list[l] 44 | 45 | if x == " " 46 | if word.length > 0 47 | map[word] += 1 48 | break if map.size == size 49 | end 50 | word = "" 51 | else 52 | word += x 53 | end 54 | end 55 | 56 | open("#{name}#{map.size/1000}k.txt", "w") do |f| 57 | f.puts "rank,word,freq,rank*freq,freq_freq" 58 | freq = rank = 1 59 | map.to_a.sort_by{|x| -x[1]}.each_with_index do |x, r| 60 | if freq != x[1] 61 | f.puts "#{rank},#{x[0]},#{x[1]},#{rank*x[1]},#{r-rank+2}" if rank>0 62 | freq = x[1] 63 | rank = r + 2 64 | end 65 | end 66 | end 67 | 68 | -------------------------------------------------------------------------------- /neural/adult.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | #OUTPUT_CODE = true 5 | LOGFILE = "adult.log" 6 | 7 | # training data 8 | CLZ = {"<=50K"=>0, ">50K"=>1} 9 | dataset = [] 10 | categories = 
(1..14).map{[]} 11 | open("adult.data") do |f| 12 | while line = f.gets 13 | buf = line.chomp.split(',').map{|x| x.strip} 14 | clz = buf.pop 15 | break unless clz 16 | buf.each_with_index do |x, i| 17 | if x !~ /^[0-9]+$/ 18 | categories[i] << x if !categories[i].include?(x) 19 | end 20 | end 21 | dataset << [buf, CLZ[clz]] 22 | end 23 | end 24 | 25 | dataset.each_with_index do |data, idx| 26 | vector = [] 27 | data[0].each_with_index do |x, i| 28 | if categories[i].length > 0 29 | one_of_k = [0] * categories[i].length 30 | one_of_k[categories[i].index(x)] = 1 31 | vector.concat(one_of_k) 32 | else 33 | vector << x.to_f 34 | end 35 | end 36 | dataset[idx] = [vector, data[1]] 37 | end 38 | 39 | #dataset = dataset[0, 100] 40 | 41 | 42 | # units 43 | in_units = (1..dataset[0][0].length).map{|i| Unit.new("x#{i}")} 44 | hiddenunits1 = (1..20).map{|i| TanhUnit.new("z#{i}")} 45 | #hiddenunits2 = (1..30).map{|i| TanhUnit.new("w#{i}")} 46 | out_unit = [SigUnit.new("y1")] 47 | 48 | # network 49 | network = Network.new(:error_func=>ErrorFunction::CrossEntropy) 50 | network.in = in_units 51 | network.link in_units, hiddenunits1 52 | network.link hiddenunits1, out_unit 53 | network.out = out_unit 54 | 55 | open(LOGFILE, "a") {|f| f.puts "==== start (#{Time.now})" } 56 | 57 | max_correct = 0 58 | 10.times do |trial| 59 | network.weights.init_parameters 60 | 61 | 100.times do |tau| 62 | eta = if tau<10 then 0.1 elsif tau<50 then 0.05 elsif tau<100 then 0.01 else 0.005 end 63 | t = Time.now.to_i 64 | dataset.sort{rand}.each do |data| 65 | grad = network.gradient_E(data[0], [data[1]]) 66 | network.weights.descent eta, grad 67 | #e += network.error_function(data[0], [data[1]]) 68 | end 69 | puts "#{tau}: #{Time.now.to_i - t}s" 70 | end 71 | 72 | correct = 0 73 | dataset.each do |data| 74 | y = network.apply(*data[0]) 75 | correct += 1 if (data[1]==0 && y[0]<0.5) || (data[1]==1 && y[0]>0.5) 76 | end 77 | 78 | #log 79 | log = "#{trial+1}: correct = #{correct}, mistake = #{dataset.length - correct}, rate = #{(10000.0*correct/dataset.length).to_i/100.0}" 80 | puts log 81 | open(LOGFILE, "a") do |f| 82 | f.puts log 83 | if max_correct < correct 84 | max_correct = correct 85 | f.puts network.weights.dump 86 | end 87 | end 88 | end 89 | open(LOGFILE, "a") {|f| f.puts "==== end (#{Time.now})" } 90 | 91 | -------------------------------------------------------------------------------- /neural/classification.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require "neural.rb" 3 | 4 | data = [] 5 | open("classification.txt") do |f| 6 | while line = f.gets 7 | x1, x2, t = line.split 8 | data << [[x1.to_f, x2.to_f], [t.to_f]] 9 | end 10 | end 11 | 12 | # units 13 | in_units = [Unit.new("x1"), Unit.new("x2")] 14 | hiddenunits = (1..6).map{|i| TanhUnit.new("z1#{i}")} 15 | hiddenunits2 = (1..6).map{|i| TanhUnit.new("z2#{i}")} 16 | out_unit = [SigUnit.new("y1")] 17 | 18 | # network 19 | network = Network.new(:error_func=>ErrorFunction::CrossEntropy, :code_generate=>true) 20 | network.in = in_units 21 | network.link in_units, hiddenunits 22 | network.link hiddenunits, hiddenunits2 23 | network.link hiddenunits2, out_unit 24 | network.out = out_unit 25 | 26 | eta = 0.1 27 | sum_e = 999999 28 | 1000.times do |tau| 29 | =begin 30 | s = 0 31 | data.each do |d| 32 | s += network.error_function(d[0], d[1]) 33 | end 34 | puts "sum of errors: #{tau} => #{s}" 35 | break if s > sum_e 36 | sum_e = s 37 | =end 38 | data.sort{rand}.each do |d| 39 | grad = 
network.gradient_E(d[0], d[1]) 40 | network.weights.descent eta, grad 41 | end 42 | end 43 | network.weights.dump 44 | -------------------------------------------------------------------------------- /neural/classification.txt: -------------------------------------------------------------------------------- 1 | 1.208985 0.421448 0.000000 2 | 0.504542 -0.285730 1.000000 3 | 0.630568 1.054712 0.000000 4 | 1.056364 0.601873 0.000000 5 | 1.095326 -1.447579 1.000000 6 | -0.210165 0.000284 1.000000 7 | -0.367151 -1.255189 1.000000 8 | 0.868013 -1.063465 0.000000 9 | 1.704441 -0.644833 0.000000 10 | 0.565619 -1.637858 1.000000 11 | 0.598389 -1.477808 0.000000 12 | 0.580927 -0.783898 1.000000 13 | 1.183283 -1.797936 0.000000 14 | 0.331843 -1.869486 0.000000 15 | -0.051195 0.989475 1.000000 16 | 2.427090 0.173557 0.000000 17 | 1.603778 -0.030691 1.000000 18 | 1.286206 -1.079916 0.000000 19 | -1.243951 1.005355 1.000000 20 | 1.181748 1.523744 0.000000 21 | 0.896222 1.899568 0.000000 22 | -0.366207 -0.664987 1.000000 23 | -0.078800 1.007368 1.000000 24 | -1.351435 1.766786 1.000000 25 | -0.220423 -0.442405 1.000000 26 | 0.836253 -1.927526 0.000000 27 | 0.039899 -1.435842 0.000000 28 | 0.256755 0.946722 0.000000 29 | 0.974836 -0.944967 0.000000 30 | 0.705256 -2.618644 0.000000 31 | 0.738188 -1.666242 0.000000 32 | 1.245931 -2.200826 0.000000 33 | 0.297604 0.159463 1.000000 34 | -2.210680 1.195815 1.000000 35 | -0.872624 -0.131252 1.000000 36 | 1.112762 -0.653777 0.000000 37 | 1.123989 -1.347470 0.000000 38 | 0.750833 0.811870 0.000000 39 | -0.183497 1.416116 1.000000 40 | 0.287582 -1.342512 0.000000 41 | 1.092719 1.380559 0.000000 42 | 0.719502 1.594624 0.000000 43 | -1.016254 0.651607 1.000000 44 | 0.379677 2.802498 0.000000 45 | 0.150675 0.474679 1.000000 46 | -0.116477 0.437483 1.000000 47 | 1.122528 0.698541 0.000000 48 | 0.953551 1.088368 0.000000 49 | -0.000228 0.347187 1.000000 50 | 0.505024 0.455407 1.000000 51 | 0.113753 0.559572 1.000000 52 | -0.677993 0.322716 1.000000 53 | 1.114811 -0.735813 0.000000 54 | 0.344114 -1.770137 0.000000 55 | 0.684242 -0.636027 1.000000 56 | -0.684629 -0.300568 1.000000 57 | -0.362677 -0.669101 1.000000 58 | 0.604984 -1.558581 0.000000 59 | 0.514202 -0.225827 0.000000 60 | 0.227014 -1.579346 1.000000 61 | 1.044068 -1.491114 0.000000 62 | 0.314855 -2.535762 1.000000 63 | 1.187904 -1.367278 0.000000 64 | 0.517132 1.375811 0.000000 65 | 1.244285 -0.764164 0.000000 66 | -0.831841 1.728708 1.000000 67 | 1.719616 -2.491282 1.000000 68 | 0.594216 1.137571 1.000000 69 | 0.939919 -0.474988 0.000000 70 | -0.918736 -0.748474 1.000000 71 | 0.913760 -1.194336 0.000000 72 | 0.893221 -1.569459 0.000000 73 | 0.653152 0.510498 0.000000 74 | 0.766890 -1.577565 0.000000 75 | 0.868315 -1.966740 1.000000 76 | 0.874218 0.514959 1.000000 77 | -0.559543 1.749552 1.000000 78 | 1.526669 -1.797734 1.000000 79 | 1.843439 -0.363161 0.000000 80 | 1.163746 2.062245 0.000000 81 | 0.565749 -2.432301 1.000000 82 | 1.016715 2.878822 0.000000 83 | 1.433979 -1.944960 1.000000 84 | -0.510225 0.295742 1.000000 85 | -0.385261 0.278145 1.000000 86 | 1.042889 -0.564351 0.000000 87 | -0.607265 1.885851 1.000000 88 | -0.355286 -1.813131 1.000000 89 | -0.790644 -0.790761 1.000000 90 | 1.372382 0.879619 0.000000 91 | 1.133019 -0.300956 0.000000 92 | 1.395009 -1.006842 0.000000 93 | 0.887843 0.222319 1.000000 94 | 1.484690 0.095074 0.000000 95 | 1.268061 1.832532 0.000000 96 | 0.124568 0.910824 1.000000 97 | 1.061504 -0.768175 1.000000 98 | 0.298551 2.573175 0.000000 99 | 0.241114 -0.613155 0.000000 100 | 
-0.423781 -1.524901 1.000000 101 | 0.528691 -0.939526 0.000000 102 | 1.601252 1.791658 0.000000 103 | 0.793609 0.812783 1.000000 104 | 0.327097 0.326998 0.000000 105 | 1.131868 -0.985696 1.000000 106 | 1.273154 1.656441 0.000000 107 | -0.816691 0.961580 1.000000 108 | 0.669064 1.162614 0.000000 109 | -0.453759 -1.146883 1.000000 110 | 2.055105 0.025811 0.000000 111 | 0.463119 -0.813294 1.000000 112 | 0.802392 -0.140807 1.000000 113 | -0.730255 -0.145175 1.000000 114 | 0.569256 0.567628 1.000000 115 | 0.486947 1.130519 0.000000 116 | 1.793588 -1.426926 0.000000 117 | 1.178831 -0.581314 1.000000 118 | 0.480055 1.257981 0.000000 119 | 0.683732 0.190071 1.000000 120 | -0.119082 -0.004020 1.000000 121 | -1.251554 -0.176027 1.000000 122 | 1.094741 -1.099305 0.000000 123 | -0.238250 -1.277484 1.000000 124 | -0.661556 1.327722 1.000000 125 | 1.442837 1.241720 0.000000 126 | 1.202320 0.489702 0.000000 127 | 0.932890 0.296430 0.000000 128 | 0.665568 -1.314006 0.000000 129 | -0.058993 1.322294 1.000000 130 | 0.209525 -1.006357 0.000000 131 | 1.023340 0.219375 0.000000 132 | 1.324444 0.446567 1.000000 133 | 1.453910 -1.151325 0.000000 134 | 0.616303 0.974796 0.000000 135 | 1.492010 -0.885984 0.000000 136 | 1.738658 0.686807 1.000000 137 | 0.900582 -0.280724 0.000000 138 | 0.961914 -0.053991 1.000000 139 | 1.819706 -0.953273 1.000000 140 | 1.581289 -0.340552 0.000000 141 | 0.520837 -0.680639 1.000000 142 | 1.433771 -0.914798 0.000000 143 | 0.611594 -1.691685 0.000000 144 | 1.591513 -0.978986 1.000000 145 | 1.282094 0.113769 0.000000 146 | 0.985715 0.275551 0.000000 147 | -1.805143 2.628696 1.000000 148 | 1.473100 -0.241372 0.000000 149 | -0.242212 -1.040151 1.000000 150 | 1.175525 -1.662026 0.000000 151 | 0.696040 0.154387 0.000000 152 | 1.457713 1.608681 0.000000 153 | 0.883215 1.330538 0.000000 154 | -0.681209 0.622394 1.000000 155 | -0.355082 0.432941 1.000000 156 | 0.633011 -1.194431 0.000000 157 | 0.782723 1.060008 1.000000 158 | 0.670180 -0.766999 1.000000 159 | -0.047154 0.698693 1.000000 160 | 0.287385 -1.097756 0.000000 161 | 0.069561 1.632585 1.000000 162 | 1.013230 1.111551 0.000000 163 | 0.639065 -0.697237 0.000000 164 | 1.174621 2.240022 1.000000 165 | 1.322020 0.040277 1.000000 166 | 0.019127 0.105667 1.000000 167 | 0.584584 1.101914 0.000000 168 | 1.157265 -0.665947 0.000000 169 | 1.565230 -0.840790 0.000000 170 | 1.759315 0.963703 1.000000 171 | 1.687068 -1.086466 0.000000 172 | 0.578314 -0.340961 1.000000 173 | 0.118925 -1.487694 1.000000 174 | 0.471201 0.330872 1.000000 175 | -0.268209 -0.353477 0.000000 176 | 1.625390 -1.718798 0.000000 177 | 1.117791 2.752549 0.000000 178 | -0.194552 -0.752687 1.000000 179 | 0.769548 -2.066152 0.000000 180 | 0.186062 0.022072 1.000000 181 | 1.771337 -0.393550 0.000000 182 | -1.300597 0.962803 1.000000 183 | 0.708730 -1.013371 0.000000 184 | -0.624235 -0.892995 1.000000 185 | 0.377055 -1.296098 0.000000 186 | 0.804404 -0.856253 1.000000 187 | 1.359887 -0.974291 0.000000 188 | -0.115505 0.228439 1.000000 189 | 0.913645 -0.344936 1.000000 190 | 0.318875 -0.886290 1.000000 191 | 0.822157 0.102548 0.000000 192 | -0.281208 1.302572 1.000000 193 | 0.044639 -1.107980 1.000000 194 | -0.029205 -2.033973 0.000000 195 | 0.879914 -2.000582 1.000000 196 | 0.601936 -0.503923 0.000000 197 | -0.490114 -0.841122 1.000000 198 | 1.847075 2.362322 0.000000 199 | -0.279703 0.753196 1.000000 200 | 1.953357 -0.746632 0.000000 201 | -------------------------------------------------------------------------------- /neural/curve_fitting.rb: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | 5 | # training data ( y = sin(2 PI x) + N(0, 0.3) ) 6 | D = [ 7 | [0.000000, 0.349486], [0.111111, 0.830839], 8 | [0.222222, 1.007332], [0.333333, 0.971507], 9 | [0.444444, 0.133066], [0.555556, 0.166823], 10 | [0.666667, -0.848307], [0.777778, -0.445686], 11 | [0.888889, -0.563567], [1.000000, 0.261502], 12 | ] 13 | 14 | # units 15 | in_unit = [Unit.new("x1")] 16 | hiddenunits = [TanhUnit.new("z1"), TanhUnit.new("z2"), TanhUnit.new("z3"), TanhUnit.new("z4")] 17 | out_unit = [IdentityUnit.new("y1")] 18 | 19 | # network 20 | network = Network.new 21 | network.in = in_unit 22 | network.link in_unit, hiddenunits 23 | network.link hiddenunits, out_unit 24 | network.out = out_unit 25 | 26 | eta = 0.1 27 | sum_e = 999999 28 | 1000.times do |tau| 29 | error = 0 30 | D.sort{rand}.each do |data| 31 | error += network.error_function([data[0]], [data[1]]) 32 | grad = network.gradient_E([data[0]], [data[1]]) 33 | network.weights.descent eta, grad 34 | end 35 | puts "error func(#{tau}): #{error}" 36 | break if sum_e < error 37 | sum_e = error 38 | end 39 | network.weights.dump 40 | 41 | 42 | 43 | =begin 44 | x = 0.0 45 | while x < 1.0 46 | y = network.apply(x) 47 | p [x, y] 48 | x += 0.05 49 | end 50 | =end 51 | -------------------------------------------------------------------------------- /neural/iris.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | #OUTPUT_CODE = true 5 | LOGFILE = "iris.log" 6 | 7 | # training data 8 | CLZ = {"Iris-setosa"=>[1,0,0], "Iris-versicolor"=>[0,1,0], "Iris-virginica"=>[0,0,1]} 9 | dataset = [] 10 | open("iris.data") do |f| 11 | while line = f.gets 12 | buf = line.chomp.split(',') 13 | clz = buf.pop 14 | break unless clz 15 | x = buf.map{|x| x.to_f} 16 | dataset << [x, CLZ[clz]] 17 | end 18 | end 19 | 20 | def generate_network(network_type) 21 | # units 22 | in_units = (1..4).map{|i| Unit.new("x#{i}")} 23 | hiddenunits1 = (1..6).map{|i| TanhUnit.new("z#{i}")} 24 | hiddenunits2 = (1..6).map{|i| TanhUnit.new("w#{i}")} 25 | out_unit = (1..3).map{|i| SoftMaxUnit.new("y#{i}")} 26 | 27 | # network 28 | network = Network.new(:error_func=>ErrorFunction::SoftMax) 29 | network.in = in_units 30 | 31 | name = nil 32 | case network_type 33 | when 0 34 | name = "full link(6)" 35 | network.link in_units, hiddenunits1 36 | network.link hiddenunits1, out_unit 37 | when 1 38 | name = "full link(12)" 39 | network.link in_units, hiddenunits1+hiddenunits2 40 | network.link hiddenunits1+hiddenunits2, out_unit 41 | when 2 42 | name = "full link(6+6)" 43 | network.link in_units, hiddenunits1 44 | network.link hiddenunits1, hiddenunits2 45 | network.link hiddenunits2, out_unit 46 | when 3 47 | name = "each 2-input-units" 48 | network.link [in_units[0], in_units[1]], [hiddenunits1[0]] 49 | network.link [in_units[0], in_units[2]], [hiddenunits1[1]] 50 | network.link [in_units[0], in_units[3]], [hiddenunits1[2]] 51 | network.link [in_units[1], in_units[2]], [hiddenunits1[3]] 52 | network.link [in_units[1], in_units[3]], [hiddenunits1[4]] 53 | network.link [in_units[2], in_units[3]], [hiddenunits1[5]] 54 | network.link hiddenunits1[0, 6], out_unit 55 | end 56 | 57 | network.out = out_unit 58 | [name, network] 59 | end 60 | 61 | N_TRIALS = 100 62 | 63 | open(LOGFILE, "a") {|f| f.puts "==== start (#{Time.now})" } 64 | 65 | 4.times do |network_type| 66 | name, network = 
generate_network(network_type) 67 | max_correct = 0 68 | sum_correct = 0 69 | 70 | open(LOGFILE, "a") {|f| f.puts "-- #{name}" } 71 | t0 = Time.now.to_i 72 | N_TRIALS.times do |trial| 73 | network.weights.init_parameters 0, 3 74 | 200.times do |tau| 75 | eta = if tau<10 then 0.1 elsif tau<50 then 0.05 elsif tau<100 then 0.01 else 0.005 end 76 | dataset.sort{rand}.each do |data| 77 | grad = network.gradient_E(data[0], data[1]) 78 | network.weights.descent eta, grad 79 | end 80 | end 81 | 82 | correct = 0 83 | dataset.each do |data| 84 | y = network.apply(*data[0]) 85 | predict = (0..2).max_by{|i| y[i]} 86 | #puts "y = #{y.map{|x| (x*10000).to_i/10000.0}.inspect}, answer = #{data[1].inspect}" 87 | correct += 1 if data[1][predict]==1 88 | end 89 | sum_correct += correct 90 | 91 | #log 92 | log = "#{trial+1}: correct = #{correct}, mistake = #{dataset.length - correct}, rate = #{(10000.0*correct/dataset.length).to_i/100.0}" 93 | puts log 94 | open(LOGFILE, "a") do |f| 95 | f.puts log 96 | if max_correct < correct 97 | max_correct = correct 98 | f.puts network.weights.dump 99 | end 100 | end 101 | end 102 | open(LOGFILE, "a") do |f| 103 | f.puts "max of rate = #{(10000*max_correct/dataset.length).to_i/100.0}, average of rate = #{(10000*sum_correct/(dataset.length*N_TRIALS)).to_i/100.0} (#{Time.now.to_i - t0}sec)" 104 | end 105 | end 106 | open(LOGFILE, "a") {|f| f.puts "==== end (#{Time.now})" } 107 | 108 | 109 | -------------------------------------------------------------------------------- /neural/mnist.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'zlib' 4 | require "neural.rb" 5 | 6 | n_rows = n_cols = nil 7 | images = [] 8 | labels = [] 9 | Zlib::GzipReader.open('train-images-idx3-ubyte.gz') do |f| 10 | magic, n_images = f.read(8).unpack('N2') 11 | raise 'This is not MNIST image file' if magic != 2051 12 | n_rows, n_cols = f.read(8).unpack('N2') 13 | n_images.times do 14 | images << f.read(n_rows * n_cols) 15 | end 16 | end 17 | 18 | Zlib::GzipReader.open('train-labels-idx1-ubyte.gz') do |f| 19 | magic, n_labels = f.read(8).unpack('N2') 20 | raise 'This is not MNIST label file' if magic != 2049 21 | labels = f.read(n_labels).unpack('C*') 22 | end 23 | 24 | # output pgm 25 | def output_pgm(filename, images, n_rows, n_cols, n_width, n_height) 26 | open(filename, "wb") do |f| 27 | f.printf("P5 %d %d %d ", n_rows*n_width, n_cols*n_height, 0xff) 28 | offset = 0 29 | buf = "" 30 | n_height.times do 31 | n_cols.times do |y| 32 | n_width.times do |idx| 33 | st = images[offset + idx][y * n_rows, n_rows].unpack('C*').map{|p| 0xff - p }.pack("C*") 34 | #p images[offset + idx][y * n_rows, n_rows].unpack('C*') if st.length != 28 35 | buf << st 36 | end 37 | end 38 | offset += n_width 39 | end 40 | f.puts buf 41 | end 42 | end 43 | #output_pgm "mnist.pgm", images, n_rows, n_cols, 300, 200 44 | 45 | # units 46 | in_units = (1..(28*28)).map{|i| Unit.new("x#{i}")} 47 | hiddenunits = (1..100).map{|i| TanhUnit.new("z#{i}")} 48 | out_unit = (1..10).map{|i| SoftMaxUnit.new("y#{i}")} 49 | 50 | # network 51 | network = Network.new(:error_func=>ErrorFunction::SoftMax, :code_generate=>true) 52 | network.in = in_units 53 | network.link in_units, hiddenunits 54 | network.link hiddenunits, out_unit 55 | network.out = out_unit 56 | 57 | # training 58 | t1 = Time.now.to_f 59 | N_IMAGES = 1000 60 | 10.times do |n| 61 | eta = if n<2 then 0.1 elsif n<5 then 0.05 else 0.01 end 62 | (0..(N_IMAGES-1)).sort_by{rand}.each do |idx| 63 | image = 
images[idx].unpack('C*') 64 | target = [0]*10 65 | target[labels[idx]] = 1 66 | 67 | puts "(#{n+1}, #{idx}): correct: #{labels[idx]}" 68 | #puts "#{network.apply(*image).map{|x| (x*10000).floor/10000.0}.inspect}, e=#{(network.error_function(image, target)*1000)/1000.0}" 69 | 70 | grad = Gradient::BackPropagate.call(network, image, target) 71 | network.weights.descent eta, grad 72 | 73 | #puts "#{network.apply(*image).map{|x| (x*10000).floor/10000.0}.inspect}, e=#{(network.error_function(image, target)*1000)/1000.0}" 74 | end 75 | end 76 | t2 = Time.now.to_f 77 | 78 | # test 79 | puts "------------------------------" 80 | correct = mistake = 0 81 | (0..(N_IMAGES*2-1)).each do |idx| 82 | image = images[idx].unpack('C*') 83 | target = [0]*10 84 | target[labels[idx]] = 1 85 | 86 | y = network.apply(*image) 87 | predict = (0..9).max_by{|i| y[i]} 88 | puts "#{idx}: predict = #{predict}, expect = #{labels[idx]}" 89 | puts "#{y.map{|x| (x*10000).floor/10000.0}.inspect}, e=#{(network.error_function(image, target)*1000)/1000.0}" 90 | if labels[idx] == predict 91 | correct += 1 92 | else 93 | mistake += 1 94 | end 95 | end 96 | t3 = Time.now.to_f 97 | 98 | # 99 | puts "correct = #{correct}, mistake = #{mistake}, rate = #{(correct.to_f/(correct+mistake)*10000).floor/100.0}%" 100 | 101 | puts "learning: #{t2-t1}" 102 | puts "testing: #{t3-t2}" 103 | 104 | 105 | -------------------------------------------------------------------------------- /neural/xor.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | OUTPUT_CODE = false 5 | 6 | # training data 7 | D = [ 8 | [[0, 0], [0]], 9 | [[1, 1], [0]], 10 | [[0, 1], [1]], 11 | [[1, 0], [1]], 12 | ] 13 | 14 | # units 15 | in_units = [Unit.new("x1"), Unit.new("x2")] 16 | hiddenunits = [TanhUnit.new("z1"), TanhUnit.new("z2"), TanhUnit.new("z3"), TanhUnit.new("z4")] 17 | out_unit = [SigUnit.new("y1")] 18 | 19 | # network 20 | network = Network.new(:error_func=>ErrorFunction::CrossEntropy) 21 | network.in = in_units 22 | network.link in_units, hiddenunits 23 | network.link hiddenunits, out_unit 24 | network.out = out_unit 25 | 26 | t1 = Time.now.to_f 27 | eta = 0.1 28 | sum_e = 999999 29 | 10000.times do |tau| 30 | s = 0 31 | D.each do |data| 32 | s += network.error_function(data[0], data[1]) 33 | end 34 | #puts "sum of errors: #{tau} => #{s}" 35 | break if s > sum_e 36 | sum_e = s 37 | 38 | D.sort{rand}.each do |data| 39 | grad = network.gradient_E(data[0], data[1]) 40 | network.weights.descent eta, grad 41 | end 42 | end 43 | #network.weights.dump 44 | 45 | t2 = Time.now.to_f 46 | puts "#{RUBY_VERSION}(#{RUBY_RELEASE_DATE})[#{RUBY_PLATFORM}] #{((t2-t1)*1000).to_i/1000.0} sec" 47 | 48 | #puts "(0, 0) => #{network.apply(0, 0)}, (1, 1) => #{network.apply(1, 1)}" 49 | #puts "(0, 1) => #{network.apply(0, 1)}, (1, 0) => #{network.apply(1, 0)}" 50 | 51 | -------------------------------------------------------------------------------- /ngram/knlm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # n-Gram Language Model with Knerser-Ney Smoother 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
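# Smoothing sketch (interpolated Kneser-Ney, as implemented in probKN() below;
# D is the discount in [0, 1], h' is the history h with its oldest character
# dropped):
#   P(w | h) = ( max(c(h,w) - D, 0) + D * N1+(h) * P(w | h') ) / c(h)
# where N1+(h) is the number of characters following h whose count exceeds D.
# The recursion bottoms out at raw unigram relative frequencies rather than the
# textbook continuation counts, and Generator.inc() keeps one trie pointer per
# suffix of the recent history so each character updates all orders up to N.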
7 | 8 | import sys, codecs, re, numpy 9 | 10 | class NGram(dict): 11 | def __init__(self, N, depth=1): 12 | self.freq = 0 13 | self.N = N 14 | self.depth = depth 15 | def inc(self, v): 16 | if self.depth <= self.N: 17 | if v not in self: 18 | self[v] = NGram(self.N, self.depth + 1) 19 | self[v].freq += 1 20 | return self[v] 21 | def dump(self): 22 | if self.depth <= self.N: 23 | return "%d:{%s}" % (self.freq, ",".join("'%s':%s" % (k,d.dump()) for k,d in self.iteritems())) 24 | return "%d" % self.freq 25 | 26 | def probKN(self, D, given=""): 27 | assert D >= 0.0 and D <= 1.0 28 | if given == "": 29 | voca = self.keys() 30 | n = float(self.freq) 31 | return voca, [self[v].freq / n for v in voca] 32 | else: 33 | if len(given) >= self.N: 34 | given = given[-(self.N-1):] 35 | voca, low_prob = self.probKN(D, given[1:]) 36 | cur_ngram = self 37 | for v in given: 38 | if v not in cur_ngram: return voca, low_prob 39 | cur_ngram = cur_ngram[v] 40 | g = 0.0 # for normalization 41 | freq = [] 42 | for v in voca: 43 | c = cur_ngram[v].freq if v in cur_ngram else 0 44 | if c > D: 45 | g += D 46 | c -= D 47 | freq.append(c) 48 | n = float(cur_ngram.freq) 49 | return voca, [(c + g * lp) / n for c, lp in zip(freq, low_prob)] 50 | 51 | class Generator(object): 52 | def __init__(self, ngram): 53 | self.ngram = ngram 54 | self.start() 55 | def start(self): 56 | self.pointers = [] 57 | def inc(self, v): 58 | pointers = self.pointers + [self.ngram] 59 | self.pointers = [d.inc(v) for d in pointers if d != None] 60 | self.ngram.freq += 1 61 | 62 | def main(): 63 | import optparse 64 | 65 | parser = optparse.OptionParser() 66 | parser.add_option("-n", dest="ngram", type="int", help="n-gram", default=7) 67 | parser.add_option("-d", dest="discount", type="float", help="discount parameter of Knerser-Ney", default=0.5) 68 | parser.add_option("-i", dest="numgen", type="int", help="number of texts to generate", default=100) 69 | parser.add_option("-e", dest="encode", help="character code of input file(s)", default='utf-8') 70 | parser.add_option("-o", dest="output", help="output filename", default="generated.txt") 71 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 72 | (opt, args) = parser.parse_args() 73 | 74 | numpy.random.seed(opt.seed) 75 | 76 | START = u"\u0001" 77 | END = u"\u0002" 78 | 79 | ngram = NGram(opt.ngram) 80 | gen = Generator(ngram) 81 | for filename in args: 82 | with codecs.open(filename, "rb", opt.encode) as f: 83 | for s in f: 84 | s = s.strip() 85 | if len(s) == 0: continue 86 | s = START + s + END 87 | gen.start() 88 | for c in s: 89 | gen.inc(c) 90 | 91 | D = opt.discount 92 | with codecs.open(opt.output, "wb", "utf-8") as f: 93 | for n in xrange(opt.numgen): 94 | st = START 95 | for i in xrange(1000): 96 | voca, prob = ngram.probKN(D, st) 97 | i = numpy.random.multinomial(1, prob).argmax() 98 | v = voca[i] 99 | if v == END: break 100 | st += v 101 | f.write(st[1:]) 102 | f.write("\n") 103 | 104 | if __name__ == "__main__": 105 | main() 106 | 107 | -------------------------------------------------------------------------------- /ngram/ngram.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | testtext = open(ARGV[0]) {|f| f.read } 4 | traintext = open(ARGV[1]) {|f| f.read } 5 | 6 | count = 0 7 | freq = Hash.new(0) 8 | freq2 = Hash.new(0) 9 | pre = nil 10 | traintext.scan(/\w+/) do |word| 11 | word.downcase! 
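  # accumulate unigram counts in freq and bigram counts (keyed "prev cur") in
  # freq2; these raw counts feed the MLE / Laplace / ELE models defined below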
12 | freq[word] += 1 13 | freq2[pre+" "+word] += 1 if pre 14 | pre = word 15 | count += 1 16 | end 17 | 18 | class MLE 19 | def initialize(freq, freq2, count) 20 | @prob = Hash.new(1) 21 | @prob2 = Hash.new(1) 22 | count = count.to_f 23 | freq.each do |word, f| 24 | @prob[word] = f / count 25 | end 26 | freq2.each do |word, f| 27 | @prob2[word] = f / (count-1) 28 | end 29 | end 30 | def set_target(text);end 31 | def key?(word);@prob.key?(word);end 32 | def [](word);@prob[word];end 33 | def bigram(word, word2);@prob2[word+" "+word2]/@prob[word];end 34 | end 35 | class Laplace 36 | def initialize(freq, freq2, count, lambda=1.0) 37 | @freq = freq.clone 38 | @freq2 = freq2.clone 39 | @count = count 40 | @lambda = lambda 41 | end 42 | def set_target(text) 43 | @voca = @freq.clone 44 | text.scan(/\w+/) do |word| 45 | @voca[word] = 0 unless @voca.key?(word) 46 | end 47 | @V = @voca.size 48 | end 49 | def key?(word);@voca.key?(word);end 50 | def [](word);(@voca[word]+@lambda)/(@count+@V*@lambda);end 51 | def bigram(word, word2) 52 | (@freq2[word+" "+word2]+@lambda)/(@count-1+@V*@V*@lambda)/self[word] 53 | end 54 | end 55 | 56 | model_MLE = MLE.new(freq, freq2, count) 57 | model_Laplace = Laplace.new(freq, freq2, count) 58 | model_Laplace.set_target(testtext) 59 | model_ELE = Laplace.new(freq, freq2, count, 0.5) 60 | model_ELE.set_target(testtext) 61 | 62 | def loglikelihood(text, model) 63 | logL = 0.0 # unigram 64 | logL2 = 0.0 # bigram 65 | count = 0 66 | pre = nil 67 | text.scan(/\w+/) do |word| 68 | word.downcase! 69 | if model.key?(word) 70 | logL += Math.log(model[word]) 71 | count += 1 72 | if pre 73 | logL2 += Math.log(model.bigram(pre, word)) 74 | else 75 | logL2 += Math.log(model[word]) 76 | end 77 | pre = word 78 | end 79 | end 80 | [logL, logL2, count] 81 | end 82 | 83 | def entropy(text, model) 84 | ent = 0.0 85 | text.scan(/\w+/) do |word| 86 | w = word.downcase 87 | ent -= model[w] * Math.log(model[w]) 88 | end 89 | ent 90 | end 91 | 92 | 93 | puts "MLE:" 94 | logL, logL2, count = loglikelihood(traintext, model_MLE) 95 | crossent = -logL / count / Math.log(2) 96 | crossent2 = -logL2 / count / Math.log(2) 97 | puts "train text(#{count} words)" 98 | puts "logL of unigrams = #{logL}, cross entropy = #{crossent}" 99 | puts "logL of bigrams = #{logL2}, cross entropy = #{crossent2}" 100 | #logL, count = loglikelihood(testtext, model_MLE) 101 | #crossent = -logL / count / Math.log(2) 102 | #puts "logL of test text(#{count} words, ignore unseen words) = #{logL}, cross entropy = #{crossent}" 103 | 104 | puts "Laplace:" 105 | logL, logL2, count = loglikelihood(traintext, model_Laplace) 106 | crossent = -logL / count / Math.log(2) 107 | crossent2 = -logL2 / count / Math.log(2) 108 | puts "logL of train text(#{count} words) = #{logL}, cross entropy = #{crossent}" 109 | puts "logL2 of train text(#{count} words) = #{logL2}, cross entropy 2 = #{crossent2}" 110 | logL, logL2, count = loglikelihood(testtext, model_Laplace) 111 | crossent = -logL / count / Math.log(2) 112 | crossent2 = -logL2 / count / Math.log(2) 113 | puts "logL of test text(#{count} words) = #{logL}, cross entropy = #{crossent}" 114 | puts "logL2 of test text(#{count} words) = #{logL2}, cross entropy 2 = #{crossent2}" 115 | 116 | puts "ELE:" 117 | logL, logL2, count = loglikelihood(traintext, model_ELE) 118 | crossent = -logL / count / Math.log(2) 119 | crossent2 = -logL2 / count / Math.log(2) 120 | puts "logL of train text(#{count} words) = #{logL}, cross entropy = #{crossent}" 121 | puts "logL2 of train text(#{count} words) = 
#{logL2}, cross entropy 2 = #{crossent2}" 122 | logL, logL2, count = loglikelihood(testtext, model_ELE) 123 | crossent = -logL / count / Math.log(2) 124 | crossent2 = -logL2 / count / Math.log(2) 125 | puts "logL of test text(#{count} words) = #{logL}, cross entropy = #{crossent}" 126 | puts "logL2 of test text(#{count} words) = #{logL2}, cross entropy 2 = #{crossent2}" 127 | 128 | 129 | #puts "logL of train text = #{loglikelihood(traintext, model_MLE)}, entropy = #{entropy(traintext, model_MLE)}" 130 | #puts "logL of test text(ignore unseen words) = #{loglikelihood(testtext, model_MLE)}, entropy = #{entropy(testtext, model_MLE)}" 131 | 132 | 133 | -------------------------------------------------------------------------------- /ngram/wordcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import re, sys 5 | 6 | class NaiveCounting: 7 | def __init__(self): 8 | self.map = dict() 9 | def add(self, word): 10 | if word in self.map: 11 | self.map[word] += 1 12 | else: 13 | self.map[word] = 1 14 | 15 | class SpaceSaving: 16 | def __init__(self, k): 17 | self.k = k 18 | self.map = dict() 19 | def add(self, word): 20 | if word in self.map: 21 | self.map[word] += 1 22 | elif len(self.map) < self.k: 23 | self.map[word] = 1 24 | else: 25 | j = min(self.map, key=lambda x:self.map[x]) 26 | cj = self.map.pop(j) 27 | self.map[word] = cj + 1 28 | 29 | 30 | text = "" 31 | for filename in sys.argv: 32 | with open(filename, "rb") as f: 33 | text += f.read() 34 | 35 | c1 = NaiveCounting() 36 | c2 = SpaceSaving(1000) 37 | c3 = SpaceSaving(100) 38 | 39 | n = 0 40 | for m in re.finditer(r'[A-Za-z]+', text): 41 | word = m.group(0).lower() 42 | c1.add(word) 43 | c2.add(word) 44 | c3.add(word) 45 | n += 1 46 | 47 | print "total words = %d" % n 48 | 49 | words = c1.map.items() 50 | words.sort(key=lambda x:(-x[1], x[0])) 51 | m2 = c2.map 52 | m3 = c3.map 53 | for i, x in enumerate(words): 54 | print "%d\t%s\t%d\t%d\t%d" % (i+1, x[0], x[1], m2.get(x[0],0), m3.get(x[0],0)) 55 | 56 | 57 | -------------------------------------------------------------------------------- /pca/bayes.r: -------------------------------------------------------------------------------- 1 | # Bayesian PPCA for R 2 | 3 | M <- 2; 4 | I <- 50; 5 | splits <- 1; 6 | 7 | directory <- "."; 8 | 9 | argv <- commandArgs(T); 10 | if (length(argv)>0) directory <- commandArgs(T)[1]; 11 | if (length(argv)>1) M <- as.integer(commandArgs(T)[2]); 12 | if (length(argv)>2) I <- as.integer(commandArgs(T)[3]); 13 | if (length(argv)>3) splits <- as.integer(commandArgs(T)[4]); 14 | 15 | oilflow <- as.matrix(read.table(sprintf("%s/DataTrn.txt", directory))); 16 | oilflow.labels <- read.table(sprintf("%s/DataTrnLbls.txt", directory)); 17 | 18 | # density function of multivariate Gaussian 19 | dmnorm <- function(x, mu, sig) { 20 | D <- length(mu); 21 | 1/((2 * pi)^D * sqrt(det(sig))) * exp(- t(x-mu) %*% solve(sig) %*% (x-mu) / 2)[1]; 22 | } 23 | 24 | ppca_bayes <- function() { 25 | D <- ncol(oilflow); 26 | N <- nrow(oilflow); 27 | col <- colSums(t(oilflow.labels) * c(4,3,2)); 28 | pch <- colSums(t(oilflow.labels) * c(3,1,4)); 29 | 30 | # initialize parameters 31 | W <- matrix(rnorm(M*D), D); 32 | sigma2 <- rgamma(1,1); 33 | alpha <- c(1, rep(10000, M-1)); 34 | 35 | # mu = mean x_bar 36 | mu <- colMeans(oilflow); 37 | xn_minus_x_bar <- t(oilflow) - mu; # DxN-matrix 38 | S <- var(oilflow); 39 | 40 | # iteration 41 | for(i in 0:I) { 42 | # M = W^T W + sigma^2 I (PRML 12.41) 43 | M_inv <- solve(t(W) 
%*% W + sigma2 * diag(M)); 44 | 45 | ### E-step: 46 | 47 | # E[z_n] = M^-1 W^T (x_n - x^bar) (PRML 12.54) 48 | Ez <- t(M_inv %*% t(W) %*% xn_minus_x_bar); 49 | 50 | # E[z_n z_n^T] = sigma^2 M^-1 + E[z_n]E[z_n]^T (PRML 12.55) 51 | Ezz <- list(); 52 | sum_Ezz <- matrix(numeric(M*M), M); 53 | for(n in 1:N) { 54 | ezz <- sigma2 * M_inv + Ez[n,] %*% t(Ez[n,]); 55 | Ezz[[n]] <- ezz; 56 | sum_Ezz <- sum_Ezz + ezz; 57 | } 58 | 59 | ### M-step: 60 | 61 | # W_new = {sum (x_n - x^bar)E[z_n]^T}{sum E[z_n z_n^T] + sigma^2 A}^-1 (PRML 12.63) 62 | W <- xn_minus_x_bar %*% Ez %*% solve(sum_Ezz + diag(sigma2 * alpha)); 63 | 64 | # sigma_new^2 = 1/ND sum{ |x_n-x^bar|^2 - 2E[z_n]^T W^T (x_n-x^bar) + Tr(E[z_n z_n^T] W^T W) } (PRML 12.57) 65 | sigma2 <- sum(xn_minus_x_bar^2) - 2 * sum(diag(t(W) %*% xn_minus_x_bar %*% Ez)); 66 | for(n in 1:N) { 67 | sigma2 <- sigma2 + sum(diag(Ezz[[n]] %*% t(W) %*% W)); 68 | } 69 | sigma2 <- sigma2 / N / D; 70 | 71 | # alpha_i = D / w_i^T w_i (PRML 12.62) 72 | if (i>0) alpha <- D / diag(t(W) %*% W); 73 | 74 | cat(sprintf("M=%d, I=%d, alpha=(%s)\n", M, i, paste(sprintf(" %.2f",alpha),collapse=","))); 75 | if (sum(alpha>1e6)){ 76 | W <- W[,alpha<1e6]; 77 | alpha <- alpha[alpha<1e6]; 78 | M <- length(alpha); 79 | } 80 | } 81 | 82 | # draw chart 83 | draw_chart <- function(targets) { 84 | plot(Ez[,targets], col=col, pch=pch, xlim=c(-3,3), ylim=c(-3,3), 85 | #main=sprintf("M=%d, I=%d, alpha=(%s)\n", M, i, paste(sprintf(" %.2f",alpha), collapse=",")), 86 | xlab=sprintf("alpha=%.2f", alpha[targets[1]]), 87 | ylab=sprintf("alpha=%.2f", alpha[targets[2]]) 88 | ); 89 | }; 90 | png(width=640, height=640); 91 | par(mfrow=c(splits, splits), mar=c(4, 4, 2, 2)); 92 | #for(i in 1:(M-1)) for(j in (i+1):M) draw_chart(c(i, j)); 93 | #for(angle in 10:80) scatterplot3d(Ez[,1], Ez[,2], Ez[,3], color=col, pch=pch, xlim=c(-3,3), ylim=c(-3,3), zlim=c(-3,3), angle=angle*2); 94 | }; 95 | 96 | ppca_bayes() 97 | 98 | #library(scatterplot3d); 99 | #library(animation); 100 | #saveMovie(ppca_bayes(), interval=0.05, moviename="ppca_bayes", movietype="gif", outdir=getwd(),width=480, height=480); 101 | 102 | -------------------------------------------------------------------------------- /pca/ema.r: -------------------------------------------------------------------------------- 1 | # Probability Principal Component Analysis with EM Algorithm for R 2 | 3 | M <- 2; 4 | I <- 50; 5 | directory <- "."; 6 | 7 | argv <- commandArgs(T); 8 | if (length(argv)>0) directory <- commandArgs(T)[1]; 9 | if (length(argv)>1) M <- as.integer(commandArgs(T)[2]); 10 | 11 | oilflow <- as.matrix(read.table(sprintf("%s/DataTrn.txt", directory))); 12 | oilflow.labels <- read.table(sprintf("%s/DataTrnLbls.txt", directory)); 13 | likelihood.pre <- -999999; 14 | 15 | ppca_em <- function(oilflow, oilflow.labels, M, I) { 16 | D <- ncol(oilflow); 17 | N <- nrow(oilflow); 18 | col <- colSums(t(oilflow.labels) * c(4,3,2)); 19 | pch <- colSums(t(oilflow.labels) * c(3,1,4)); 20 | 21 | # initialize parameters 22 | W <- matrix(rnorm(M*D), D); 23 | sigma2 <- rgamma(1,1,1); 24 | 25 | # mu = mean x_bar 26 | mu <- colMeans(oilflow); 27 | xn_minus_x_bar <- t(oilflow) - mu; # DxN-matrix 28 | S <- var(oilflow); 29 | 30 | # iteration 31 | for(i in 0:I) { 32 | # M = W^T W + sigma^2 I (PRML 12.41) 33 | M_inv <- solve(t(W) %*% W + sigma2 * diag(M)); 34 | 35 | ### E-step: 36 | 37 | # E[z_n] = M^-1 W^T (x_n - x^bar) (PRML 12.54) 38 | Ez <- t(M_inv %*% t(W) %*% xn_minus_x_bar); 39 | 40 | # E[z_n z_n^T] = sigma^2 M^-1 + E[z_n]E[z_n]^T (PRML 12.55) 41 | Ezz <- 
list(); 42 | sum_Ezz <- matrix(numeric(M*M), M); 43 | for(n in 1:N) { 44 | ezz <- sigma2 * M_inv + Ez[n,] %*% t(Ez[n,]); 45 | Ezz[[n]] <- ezz; 46 | sum_Ezz <- sum_Ezz + ezz; 47 | } 48 | 49 | # likelihood 50 | C <- W %*% t(W) + diag(D) * sigma2; # (PRML 12.36) 51 | C_inv <- (diag(D) - W %*% M_inv %*% t(W)) / sigma2; # (PRML 12.40) 52 | likelihood <- - N / 2 * ( D * log(2 * pi) + log(det(C)) + sum(diag(C_inv %*% S)) ); # (PRML 12.44) 53 | plot(Ez, col=col, pch=pch, xlim=c(-3,3),ylim=c(-3,3),ylab="", 54 | xlab=sprintf("I=%d, log likelihood=%.3f", i, likelihood)) 55 | if (i>5 && (likelihood - likelihood.pre) < 0.001) break; 56 | likelihood.pre <- likelihood; 57 | 58 | ### M-step: 59 | 60 | # W_new = {sum (x_n - x^bar)E[z_n]^T}{sum E[z_n z_n^T]}^-1 (PRML 12.56) 61 | W <- xn_minus_x_bar %*% Ez %*% solve(sum_Ezz); 62 | 63 | # sigma_new^2 = 1/ND sum{ |x_n-x^bar|^2 - 2E[z_n]^T W^T (x_n-x^bar) + Tr(E[z_n z_n^T] W^T W) } (PRML 12.57) 64 | sigma2 <- sum(xn_minus_x_bar^2) - 2 * sum(diag(t(W) %*% xn_minus_x_bar %*% Ez)); 65 | for(n in 1:N) { 66 | sigma2 <- sigma2 + sum(diag(Ezz[[n]] %*% t(W) %*% W)); 67 | } 68 | sigma2 <- sigma2 / N / D; 69 | 70 | } 71 | print(likelihood); 72 | }; 73 | 74 | library(animation); 75 | saveMovie(ppca_em(oilflow, oilflow.labels, M, I), interval=1, moviename="ppca_em", 76 | movietype="gif", outdir=getwd(),width=480, height=480); 77 | 78 | -------------------------------------------------------------------------------- /pca/ppca.r: -------------------------------------------------------------------------------- 1 | # Probability Principal Component Analysis for R 2 | 3 | M <- 2; 4 | directory <- "."; 5 | 6 | argv <- commandArgs(T); 7 | if (length(argv)>0) directory <- commandArgs(T)[1]; 8 | if (length(argv)>1) M <- as.integer(commandArgs(T)[2]); 9 | 10 | oilflow <- as.matrix(read.table(sprintf("%s/DataTrn.txt", directory))); 11 | oilflow.labels <- read.table(sprintf("%s/DataTrnLbls.txt", directory)); 12 | D <- ncol(oilflow); 13 | 14 | # mu = mean x_bar 15 | mu <- colMeans(oilflow); 16 | 17 | # eigenvalues and eigenvectors of covariance S 18 | e <- eigen(var(oilflow)) 19 | 20 | # sigma^2 = sum(rest of eigenvalues) / (D - M) 21 | sigma2 <- mean(e$values[-(1:M)]); 22 | 23 | # W_ML = U_M(L_M - sigma^2 I)R, (now R = I) 24 | W_ML <- e$vectors[,1:M] %*% diag(e$values[1:M] - sigma2) %*% diag(c(1,-1)) 25 | 26 | # M = W^T W + sigma^2 I 27 | M_inv <- solve(t(W_ML) %*% W_ML + sigma2 * diag(M)); 28 | 29 | # projection into principal subspace 30 | z <- t(M_inv %*% t(W_ML) %*% (t(oilflow) - mu)) 31 | 32 | # draw chart 33 | col <- colSums(t(oilflow.labels) * c(4,3,2)); # ラベルごとに色を指定 34 | pch <- colSums(t(oilflow.labels) * c(3,1,4)); # ラベルごとにマーカーを指定 35 | plot(z, col=col, pch=pch, xlim=c(-2,4),ylim=c(-4,2)) 36 | 37 | -------------------------------------------------------------------------------- /perceptron/avg_percep_test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | traindata = [ # AND 4 | [[0, 0], -1], 5 | [[1, 0], -1], 6 | [[0, 1], -1], 7 | [[1, 1], +1], 8 | ] 9 | 10 | degree = traindata[0][0].size + 1 11 | w = Array.new(degree, 0) 12 | 13 | 20.times do |c| 14 | # shuffle 15 | traindata = traindata.sort_by{rand} 16 | 17 | # training 18 | n_errors = 0 19 | w_a = Array.new(degree, 0) 20 | n = 0 21 | traindata.each do |x, t| 22 | px = [1] + x # phai(x) 23 | s = 0 # sigma w^T phai(x_n) 24 | px.each_with_index do |x_i, i| 25 | s += w[i] * x_i 26 | end 27 | if s * t <= 0 # error 28 | #if (t>0)?(s<0):(s>=0) # 0 is also positive. 
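      # on a mistake, w receives the usual perceptron update t * phi(x), while
      # w_a accumulates the same update weighted by the step counter n; after
      # the pass, subtracting w_a / n from w leaves (approximately) the average
      # of the intermediate weight vectors, i.e. the averaged perceptron trick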
29 | puts [c+1, w, px, s, t].inspect 30 | n_errors += 1 31 | px.each_with_index do |x_i, i| 32 | w[i] += t * x_i 33 | w_a[i] += t * x_i * n 34 | end 35 | end 36 | n += 1 37 | end 38 | w_a.each_with_index do |w_i, i| 39 | w[i] -= w_i.to_f / n 40 | end 41 | 42 | if n_errors == 0 43 | puts "convergence: #{c}" 44 | break 45 | end 46 | end 47 | 48 | puts "w= #{w.inspect}" 49 | 50 | -------------------------------------------------------------------------------- /perceptron/percep_test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | traindata = [ # AND 4 | [[0, 0], -1], 5 | [[1, 0], -1], 6 | [[0, 1], -1], 7 | [[1, 1], +1], 8 | ] 9 | 10 | degree = traindata[0][0].size + 1 11 | w = Array.new(degree, 0) 12 | 13 | 20.times do |c| 14 | # shuffle 15 | traindata = traindata.sort_by{rand} 16 | 17 | # training 18 | n_errors = 0 19 | traindata.each do |x, t| 20 | px = [1] + x # phai(x) 21 | s = 0 # sigma w^T phai(x_n) 22 | px.each_with_index do |x_i, i| 23 | s += w[i] * x_i 24 | end 25 | if s * t <= 0 # error 26 | #if (t>0)?(s<0):(s>=0) # 0 is also positive. 27 | puts [c+1, w, px, s, t].inspect 28 | n_errors += 1 29 | px.each_with_index do |x_i, i| 30 | w[i] += t * x_i 31 | end 32 | end 33 | end 34 | 35 | if n_errors == 0 36 | puts "convergence: #{c}" 37 | break 38 | end 39 | end 40 | 41 | puts "w= #{w.inspect}" 42 | 43 | 44 | -------------------------------------------------------------------------------- /perceptron/test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | if ARGV.length < 2 3 | puts "#$0 testfile modelfile" 4 | exit 1 5 | end 6 | 7 | w = open(ARGV[1]){|f| Marshal.load(f) } 8 | 9 | # load test data 10 | data = [] 11 | open(ARGV[0]) do |f| 12 | while line = f.gets 13 | features = line.split 14 | sign = features.shift.to_i 15 | map = Hash.new 16 | features.each do |feature| 17 | if feature =~ /^([0-9]+):([\+\-]?[0-9\.]+)$/ 18 | map[$1.to_i] = $2.to_f 19 | end 20 | end 21 | data << [map, sign] 22 | end 23 | end 24 | 25 | result = Array.new(4, 0) 26 | data.each do |x, t| 27 | x[w.size-1] = 1 # bias 28 | s = 0 29 | x.each do |i, x_i| 30 | s += w[i] * x_i if i < w.size 31 | end 32 | result[(t>0?2:0)+(s>0?1:0)] += 1 33 | end 34 | 35 | puts "Accuracy #{((result[3]+result[0]).to_f/data.size*100000).round/1000.0}% (#{result[3]+result[0]}/#{data.size})" 36 | puts "(Answer, Predict): (p,p):#{result[3]} (p,n):#{result[2]} (n,p):#{result[1]} (n,n):#{result[0]}" 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /perceptron/train.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'optparse' 4 | opt = {:algo=>:P, :iteration=>10, :regularity=>1.0} 5 | parser = OptionParser.new 6 | parser.banner = "Usage: #$0 [options] trainfile modelfile" 7 | parser.on('-i [VAL]', Integer, 'number of iteration') {|v| opt[:iteration] = v } 8 | parser.on('-a [VAL]', [:P, :AP, :PA, :PA1, :PA2], 'algorism(P/AP/PA/PA1/PA2)') {|v| opt[:algo] = v } 9 | parser.on('-C [VAL]', Float, 'regularization parameter (for PA1/PA2)') {|v| opt[:regularity] = v } 10 | parser.parse!(ARGV) 11 | if ARGV.length < 2 12 | $stderr.puts parser 13 | exit(1) 14 | end 15 | 16 | 17 | # common 18 | 19 | def square_abs(x) 20 | square_abs_x = 0 21 | x.each do |i, x_i| 22 | square_abs_x += x_i * x_i 23 | end 24 | square_abs_x 25 | end 26 | 27 | class Train 28 | def initialize(degree) 29 | @degree = degree 30 | @w = 
Array.new(degree + 1, 0) 31 | end 32 | attr_accessor :w 33 | 34 | def loop(traindata, iteration=1, will_shuffle=true) 35 | pre_w = @w.dup 36 | iteration.times do |c| 37 | 38 | traindata = traindata.sort_by{rand} if will_shuffle 39 | traindata.each do |x, t| 40 | x[@degree] = 1 # bias 41 | s = 0 # sigma w^T phai(x_n) 42 | x.each do |i, x_i| 43 | s += @w[i] * x_i 44 | end 45 | yield @w, x, t, s 46 | end 47 | 48 | return c if pre_w == @w 49 | end 50 | nil 51 | end 52 | end 53 | 54 | 55 | # algorism 56 | 57 | def perceptron(traindata, degree, iteration) 58 | training = Train.new(degree) 59 | c = training.loop(traindata, iteration) do |w, x, t, s| 60 | if s * t <= 0 # error 61 | x.each do |i, x_i| 62 | w[i] += t * x_i 63 | end 64 | end 65 | end 66 | return [training, c] 67 | end 68 | 69 | def average_perceptron(traindata, degree, iteration) 70 | training = Train.new(degree) 71 | iteration.times do |c| 72 | w_a = Array.new(degree + 1, 0) # for average perceptron 73 | n = 0 74 | is_convergenced = training.loop(traindata) do |w, x, t, s| 75 | if s * t <= 0 # error 76 | x.each do |i, x_i| 77 | w[i] += t * x_i 78 | w_a[i] += t * x_i * n # for average perceptron 79 | end 80 | end 81 | n += 1 82 | end 83 | return [training, c] if is_convergenced 84 | w_a.each_with_index do |w_i, i| 85 | training.w[i] -= w_i.to_f / n # for averate perceptron 86 | end 87 | end 88 | return [training, nil] 89 | end 90 | 91 | def passive_aggressive(traindata, degree, iteration, aggressiveness=nil, regularity=0) 92 | training = Train.new(degree) 93 | c = training.loop(traindata, iteration) do |w, x, correct, predict| 94 | loss = 1 - correct * predict 95 | if loss > 0 96 | tau = loss.to_f / (square_abs(x) + regularity) 97 | tau = aggressiveness if aggressiveness && tau > aggressiveness 98 | x.each do |i, x_i| 99 | w[i] += tau * correct * x_i 100 | end 101 | end 102 | end 103 | return [training, c] 104 | end 105 | 106 | 107 | 108 | # load training data 109 | traindata = [] 110 | degree = 0 111 | open(ARGV[0]) do |f| 112 | while line = f.gets 113 | features = line.split 114 | sign = features.shift.to_i 115 | map = Hash.new 116 | features.each do |feature| 117 | if feature =~ /^([0-9]+):([\+\-]?[0-9\.]+)$/ 118 | term_id = $1.to_i 119 | map[term_id] = $2.to_f 120 | degree = term_id + 1 if degree <= term_id 121 | end 122 | end 123 | traindata << [map, sign] 124 | end 125 | end 126 | 127 | # training 128 | 129 | training, convergence = if opt[:algo] == :P 130 | perceptron(traindata, degree, opt[:iteration]) 131 | elsif opt[:algo] == :AP 132 | average_perceptron(traindata, degree, opt[:iteration]) 133 | elsif opt[:algo] == :PA 134 | passive_aggressive(traindata, degree, opt[:iteration]) 135 | elsif opt[:algo] == :PA1 136 | passive_aggressive(traindata, degree, opt[:iteration], opt[:regularity]) 137 | elsif opt[:algo] == :PA2 138 | passive_aggressive(traindata, degree, opt[:iteration], nil, 0.5 / opt[:regularity]) 139 | end 140 | 141 | puts "convergence: #{convergence}" if convergence 142 | open(ARGV[1], 'w'){|f| Marshal.dump(training.w, f) } 143 | 144 | -------------------------------------------------------------------------------- /privacy/randomized-response/rr-gibbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 - 3 | 4 | # Randomized Response (Gibbs Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2021 Nakatani Shuyo / Cybozu Labs Inc. 
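# Outline of gibbs_sampling() below: the observed counts c are generated by
# pushing the true counts through the perturbation matrix P (randomized
# response), and each Gibbs sweep then alternates
#   X | Y, pi : for every observed category j, draw counts of the true
#               category from P(X=i | Y=j) = pi_i P[i,j] / sum_k pi_k P[k,j]
#   pi | X    : pi ~ Dirichlet(alpha + X)
# The first 200 of 400 sweeps are discarded as burn-in and the kept X/N
# samples are averaged into a single estimate per trial.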
7 | 8 | nlist = [100, 1000, 10000] 9 | 10 | import numpy 11 | import matplotlib.pyplot as plt 12 | numpy.set_printoptions(precision=2, suppress=True) 13 | 14 | true_prob = numpy.array([0.1, 0.2, 0.3, 0.4]) 15 | #true_prob /= true_prob.sum() 16 | D = true_prob.size 17 | legend = [str(x) for x in true_prob] 18 | 19 | p = 1/5 20 | pii = (1 + (D - 1) * p) / D # P(Y=i|X=i) 21 | pij = (1 - p) / D # P(Y=j|X=i) 22 | P = pij * numpy.ones((D, D)) + (pii - pij) * numpy.eye(D) 23 | 24 | def gibbs_sampling(N, alpha): 25 | true_count = numpy.array(N * true_prob, dtype=int) 26 | true_count[-1] += N - true_count.sum() 27 | 28 | predicts = [] 29 | for _ in range(10000): 30 | c = sum(numpy.random.multinomial(n, P[i,:]) for i, n in enumerate(true_count)) 31 | 32 | pi = numpy.ones(D) + numpy.random.random(D) # initial 33 | pi /= pi.sum() 34 | sample = [] 35 | for epoch in range(400): 36 | Q = pi * P.T # _ij = pi_j * P_ji 37 | cond = Q.T / Q.sum(axis=1) # _ij = pi_i * P_ij / Σ_k pi_k * P_kj 38 | 39 | # sampling X 40 | X = numpy.sum([numpy.random.multinomial(n, cond[:,i]) for i, n in enumerate(c)], axis=0) 41 | 42 | # sampling pi 43 | pi = numpy.random.dirichlet(alpha + X) 44 | 45 | if epoch >= 200: sample.append(X) 46 | 47 | predicts.append(numpy.mean(sample, axis=0)/N) 48 | 49 | return numpy.array(predicts) 50 | 51 | for N in nlist: 52 | for alpha in [1.0, 0.1, 0.01]: 53 | predicts = gibbs_sampling(N, alpha) 54 | start = predicts.min() 55 | end = predicts.max() 56 | bins = 40 57 | step = (end - start)/bins 58 | 59 | plt.hist(predicts, bins=numpy.arange(start, end, step), density=True) 60 | plt.title("N = %d, alpha = %.2f" % (N, alpha)) 61 | plt.legend(legend) 62 | plt.tight_layout() 63 | plt.savefig("rr-gibbs-%d-%.2f.png" % (N, alpha)) 64 | plt.close() 65 | 66 | print("N=%d, alpha=%.2f, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median" % (N, alpha)) 67 | print(numpy.vstack(( 68 | [true_prob, numpy.mean(predicts, axis=0), numpy.std(predicts, axis=0)], 69 | numpy.quantile(predicts, [0.025,0.975,0.5], axis=0)))) 70 | -------------------------------------------------------------------------------- /privacy/randomized-response/rr-mle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 - 3 | 4 | # Randomized Response (Maximum Likelihood Estimation) 5 | # This code is available under the MIT License. 6 | # (c)2021 Nakatani Shuyo / Cybozu Labs Inc. 
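# With the settings below (D = 4, p = 1/5) the perturbation matrix gives
#   P(Y=i|X=i) = (1 + 3*0.2)/4 = 0.4,   P(Y=j|X=i) = (1 - 0.2)/4 = 0.2,
# so the observed counts satisfy E[c] = P^T t for the true counts t.  P is
# symmetric here, so numpy.linalg.solve(P, c) below gives the unbiased
# matrix-inversion estimate; its components can fall outside [0, 1] when N
# is small.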
7 | 8 | import numpy 9 | import matplotlib.pyplot as plt 10 | numpy.set_printoptions(precision=2,suppress=True) 11 | 12 | true_prob = numpy.array([0.1, 0.2, 0.3, 0.4]) 13 | #true_prob /= true_prob.sum() 14 | D = true_prob.size 15 | legend = [str(x) for x in true_prob] 16 | 17 | p = 1/5 18 | pii = (1 + (D - 1) * p) / D # P(Y=i|X=i) 19 | pij = (1 - p) / D # P(Y=j|X=i) 20 | P = pij * numpy.ones((D, D)) + (pii - pij) * numpy.eye(D) 21 | 22 | for N in [100, 1000, 10000]: 23 | true_count = numpy.array(N * true_prob, dtype=int) 24 | true_count[-1] += N - true_count.sum() 25 | 26 | predicts = [] 27 | for _ in range(10000): 28 | c = sum(numpy.random.multinomial(n, P[i,:]) for i, n in enumerate(true_count)) # Randomized Response 29 | t = numpy.linalg.solve(P, c) # MLE 30 | predicts.append(t/N) 31 | predicts = numpy.array(predicts) 32 | 33 | start = predicts.min() 34 | end = predicts.max() 35 | bins = 40 36 | step = (end - start)/bins 37 | 38 | plt.hist(predicts, bins=numpy.arange(start, end, step), density=True) 39 | plt.title("N = %d" % N) 40 | plt.legend(legend) 41 | plt.tight_layout() 42 | plt.savefig("rr-mle-%d.png" % N) 43 | plt.close() 44 | 45 | print("N=%d, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median" % N) 46 | print(numpy.vstack(( 47 | [true_prob, numpy.mean(predicts, axis=0), numpy.std(predicts, axis=0)], 48 | numpy.quantile(predicts, [0.025,0.975,0.5], axis=0)))) 49 | 50 | -------------------------------------------------------------------------------- /privacy/randomized-response/rr-vb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 - 3 | """Randomized Response (Collapsed Variational Bayesian) 4 | 5 | This code is available under the MIT License. 6 | (c)2021 Nakatani Shuyo / Cybozu Labs Inc. 
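The collapsed update in variational_bayes() removes respondent n's current
responsibility from the aggregated counts c and then sets

    q(X_n = i)  propto  P(Y_n | X_n = i) * (alpha + c_i)

before adding it back; sweeps stop when the implied pi = c / sum(c) moves by
less than 1e-7 (squared distance) between sweeps, or after 200 sweeps.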
7 | 8 | Usage: 9 | experiments 10 | $ python rr-vb.py 100 11 | 12 | summary 13 | $ python rr-vb.py 14 | """ 15 | 16 | import sys, os, itertools, json 17 | from multiprocessing import Pool 18 | import numpy 19 | from scipy.stats import gaussian_kde 20 | import matplotlib.pyplot as plt 21 | numpy.set_printoptions(precision=3, suppress=True) 22 | 23 | def iterproducts(list1, list2, cycles): 24 | for _ in range(cycles): 25 | iters = itertools.product(list1, list2) 26 | for x in iters: yield x 27 | 28 | true_prob = numpy.array([0.1, 0.2, 0.3, 0.4]) 29 | legend = [str(x) for x in true_prob] 30 | D = true_prob.size 31 | #true_prob /= true_prob.sum() 32 | 33 | p = 1/5 34 | pii = (1 + (D - 1) * p) / D # P(Y=i|X=i) 35 | pij = (1 - p) / D # P(Y=j|X=i) 36 | P = pij * numpy.ones((D, D)) + (pii - pij) * numpy.eye(D) 37 | 38 | datapath = "rr-vb.txt" 39 | 40 | def variational_bayes(args): 41 | print("start:(%d, %.2f)" % args) 42 | N, alpha = args 43 | true_count = numpy.array(N * true_prob, dtype=int) 44 | true_count[-1] += N - true_count.sum() 45 | true_cum = numpy.cumsum(true_count) 46 | 47 | predicts = [] 48 | for _ in range(10): 49 | Y = numpy.concatenate([numpy.random.choice(D, n, p=pb) for pb, n in zip(P, true_count)]) 50 | numpy.random.shuffle(Y) 51 | 52 | X = numpy.random.random((N,D)) # P(X_n) 53 | X = (X.T/X.sum(axis=1)).T 54 | c = X.sum(axis=0) 55 | 56 | pre = c / c.sum() 57 | for epoch in range(200): 58 | for n in range(N): 59 | c -= X[n,:] 60 | x = P[:,Y[n]] * (alpha + c) 61 | z = X[n,:] = x / x.sum() 62 | c += z 63 | pi = c / c.sum() 64 | if ((pi - pre)**2).sum() < 1e-7: break 65 | pre = pi 66 | #print(epoch, pi) 67 | predicts.append((c/N).tolist()) 68 | print("end:(%d, %.2f)" % args) 69 | return {"N":N, "alpha":alpha, "predicts":predicts} 70 | 71 | if __name__ == '__main__': 72 | if len(sys.argv)>1: 73 | I = int(sys.argv[1]) 74 | tasks = iterproducts([10000, 1000, 100], [1.0, 0.1, 0.01], I) 75 | with Pool(os.cpu_count()-1) as pool: 76 | for outputs in pool.imap(variational_bayes, tasks): 77 | #print(outputs) 78 | with open(datapath, "a") as f: 79 | json.dump(outputs, f) 80 | f.write("\n") 81 | else: 82 | data = dict() 83 | with open(datapath) as f: 84 | for s in f: 85 | x = json.loads(s) 86 | N = x["N"] 87 | alpha = x["alpha"] 88 | predicts = x["predicts"] 89 | key = (N, alpha) 90 | if key in data: 91 | data[key].extend(predicts) 92 | else: 93 | data[key] = predicts 94 | 95 | cm = plt.get_cmap("tab10") 96 | for key, predicts in data.items(): 97 | N, alpha = key 98 | print("VB: N=%d, alpha=%.2f, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median (trials=%d)" % (N, alpha, len(predicts))) 99 | predicts = numpy.array(predicts) 100 | print(numpy.vstack(([true_prob, numpy.mean(predicts, axis=0), numpy.std(predicts, axis=0)], numpy.quantile(predicts, [0.025,0.975,0.5], axis=0)))) 101 | 102 | start = numpy.min(predicts) 103 | end = numpy.max(predicts) 104 | xseq = numpy.arange(start, end, 0.001) 105 | pdfs = [gaussian_kde(predicts[:,i])(xseq) for i in range(D)] 106 | bins = 50 107 | step = (end - start)/bins 108 | 109 | plt.hist(predicts, bins=numpy.arange(start, end, step), density=True) 110 | plt.legend(legend) 111 | for i in range(D): 112 | plt.plot(xseq, pdfs[i], color=cm.colors[i], linewidth=0.5) 113 | plt.title("VB : N = %d, alpha = %.2f" % (N, alpha)) 114 | plt.tight_layout() 115 | plt.savefig("rr-vb-%d-%.2f.png" % (N, alpha)) 116 | plt.close() 117 | 118 | -------------------------------------------------------------------------------- /sampling/hmc.r: 
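# (hmc.r) The sampler below proposes a new state by simulating Hamiltonian
# dynamics for the energy E(z) with a leapfrog integrator whose step size and
# step count are jittered on every iteration, accepts the proposal with
# probability min(1, exp(H_old - H_new)), and redraws the momentum r from
# N(0,1) after each accepted move; E and dE/dz are supplied per target density.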
-------------------------------------------------------------------------------- 1 | # Hybrid Monte Carlo sampling 2 | 3 | N <- 10000; # number of sampling 4 | 5 | hmc_sampling <- function(N, E, partial_E, leapfrog_count=100, leapfrog_epsilon=0.01) { 6 | r <- rnorm(1,0,1); 7 | z <- 1; 8 | zlist <- c(); 9 | 10 | for(i in 1:N) { 11 | H <- E(z) + r^2/2; 12 | # leapfrog 13 | e <- sample(c(-1,1), 1) * runif(1, 0.9, 1.1) * leapfrog_epsilon; 14 | z2 <- z; 15 | r2 <- r - e * partial_E(z2) / 2; 16 | count = sample(leapfrog_count:(leapfrog_count*2), 1) 17 | for(j in 1:count) { 18 | z2 <- z2 + e * r2; 19 | r2 <- r2 - e * partial_E(z2); 20 | } 21 | r2 <- r2 - e * partial_E(z2) / 2; 22 | dH <- H - (E(z2) + r2^2/2); 23 | if (dH > 0 || runif(1) < exp(dH)) { 24 | z <- z2; 25 | zlist <- append(zlist, z); 26 | 27 | # resampling of r from p(r|z) = p(r) = N(0,1) 28 | r <- rnorm(1,0,1); 29 | } else { 30 | #cat(sprintf("%d: rejected\n", i)); 31 | } 32 | } 33 | cat(sprintf("reject: %d / %d\n", N-length(zlist), N)); 34 | zlist; 35 | } 36 | 37 | png(); 38 | 39 | # p(z) = N(0,1) = exp(-z^2/2)/sqrt(2pi) 40 | # E(z) = z^2/2, Zp = sqrt(2pi), dE/dz = z 41 | # Hamiltonian: H(z,r) = E(z) + K(r) = z^2/2 + r^2/2 42 | zlist <- hmc_sampling(N, function(z)z**2/2, function(z)z); 43 | hist(zlist, breaks=20, main=sprintf("N(1,0), mean=%.3f, var=%.3f", mean(zlist), var(zlist))); 44 | acf(zlist); 45 | 46 | # p(z) = Gamma(a,b) = 1/Zp * exp((a-1)ln z - bz) 47 | # E(z) = -(a-1)ln z + bz, dE/dz = b - (a-1)/z 48 | a <- 3; 49 | b <- 2; 50 | zlist <- hmc_sampling(N, function(z)b*z-(a-1)*log(z), function(z)b-(a-1)/z); 51 | hist(zlist, breaks=20, main=sprintf("Gamma(%d,%d), mean=%.3f, var=%.3f", a, b, mean(zlist), var(zlist))); 52 | acf(zlist); 53 | 54 | # E(z) = 1/12 z(z-1)(z-4)(z-6) = (z^4-11z^3+34z^2-24z)/12 55 | # dE/dz = (4z^3-33z^2+68z-24)/12 56 | N <- 1000 57 | zlist <- hmc_sampling(N, function(z)z*(z-1)*(z-4)*(z-6)/12, function(z)(4*z**3-33*z**2+68*z-24)/12, 58 | leapfrog_count=400, leapfrog_epsilon=0.005); 59 | hist(zlist, breaks=30, main=sprintf("E(z)=z(z-1)(z-4)(z-6)/12, mean=%.3f, var=%.3f", mean(zlist), var(zlist)), freq=F, xlim=c(-2,8)); 60 | par(new=T); 61 | plot(function(z)exp(-z*(z-1)*(z-4)*(z-6)/12), xlim=c(-2,8), col="red", ann=F, yaxt="n"); 62 | 63 | plot(zlist, type="l"); 64 | acf(zlist); 65 | -------------------------------------------------------------------------------- /semisupervised/ssnb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Semi-Supervised Naive Bayes Classifier with EM-Algorithm 5 | # [K. Nigam, A. McCallum, S. Thrun, and T. Mitchell 2000] Text Classifcation from Labeled and Unlabeled Documents using EM. Machine Learning 6 | 7 | # This code is available under the MIT License. 8 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
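# Outline of estimate() below: a multinomial naive Bayes model is first fit on
# n labeled documents per class (their word counts plus beta smoothing are kept
# in phi0 as fixed pseudo-counts), then EM runs over the whole corpus: the
# E-step computes class posteriors z for every document from the current phi
# and theta, and the M-step re-estimates the class priors theta (with Dirichlet
# prior alpha) and the word distributions phi from those soft counts.  Test-set
# accuracy is printed after every iteration.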
9 | 10 | 11 | import optparse 12 | import numpy, scipy 13 | import sklearn.datasets 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | def performance(i, test, phi, theta): 17 | z = test.data * numpy.log(phi) + numpy.log(theta) # M * K 18 | z -= z.max(axis=1)[:, None] 19 | z = numpy.exp(z) 20 | z /= z.sum(axis=1)[:, None] 21 | predict = z.argmax(axis=1) 22 | correct = (test.target == predict).sum() 23 | T = test.data.shape[0] 24 | accuracy = float(correct) / T 25 | log_likelihood = numpy.log(numpy.choose(test.target, z.T) + 1e-14).sum() / T 26 | 27 | print "%d : %d / %d = %.3f, average of log likelihood = %.3f" % (i, correct, T, accuracy, log_likelihood) 28 | return accuracy 29 | 30 | def estimate(data, test, alpha, beta, n, K=None): 31 | M, V = data.data.shape 32 | if not K: 33 | K = data.target.max() + 1 34 | #if opt.training: 35 | # train = [int(x) for x in opt.training.split(",")] 36 | #else: 37 | train = [] 38 | for k in xrange(K): 39 | train.extend(numpy.random.choice((data.target==k).nonzero()[0], n)) 40 | 41 | theta = numpy.ones(K) / K 42 | phi0 = numpy.zeros((V, K)) + beta 43 | for n in train: 44 | phi0[:, data.target[n]] += data.data[n, :].toarray().flatten() 45 | phi = phi0 / phi0.sum(axis=0) 46 | accuracy0 = performance(0, test, phi, theta) 47 | 48 | for i in xrange(20): 49 | # E-step 50 | z = data.data * numpy.log(phi) + numpy.log(theta) # M * K 51 | z -= z.max(axis=1)[:, None] 52 | z = numpy.exp(z) 53 | z /= z.sum(axis=1)[:, None] 54 | 55 | # M-step 56 | theta = z.sum(axis=0) + alpha 57 | theta /= theta.sum() 58 | phi = phi0 + data.data.T * z 59 | phi = phi / phi.sum(axis=0) 60 | 61 | accuracy = performance(i+1, test, phi, theta) 62 | 63 | return len(train), accuracy0, accuracy 64 | 65 | def main(): 66 | parser = optparse.OptionParser() 67 | 68 | parser.add_option("-K", dest="class_size", type="int", help="number of class") 69 | parser.add_option("-a", dest="alpha", type="float", help="parameter alpha", default=0.05) 70 | parser.add_option("-b", dest="beta", type="float", help="parameter beta", default=0.001) 71 | #parser.add_option("-n", dest="n", type="int", help="training size for each label", default=1) 72 | #parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 73 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 74 | (opt, args) = parser.parse_args() 75 | numpy.random.seed(opt.seed) 76 | 77 | data = sklearn.datasets.fetch_20newsgroups() 78 | test = sklearn.datasets.fetch_20newsgroups(subset='test') 79 | 80 | vec = CountVectorizer() 81 | data.data = vec.fit_transform(data.data).tocsr() 82 | test.data = vec.transform(test.data).tocsr() # use the same vocaburary of training data 83 | 84 | print "(data size, voca size) : (%d, %d)" % data.data.shape 85 | print "(test size, voca size) : (%d, %d)" % test.data.shape 86 | 87 | if opt.class_size: 88 | """ 89 | index = data.target < opt.class_size 90 | a = data.data.toarray()[index, :] 91 | data.data = scipy.sparse.csr_matrix(a) 92 | data.target = data.target[index] 93 | print "(shrinked data size, voca size) : (%d, %d)" % data.data.shape 94 | """ 95 | 96 | index = test.target < opt.class_size 97 | a = test.data.toarray()[index, :] 98 | test.data = scipy.sparse.csr_matrix(a) 99 | test.target = test.target[index] 100 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 101 | 102 | 103 | result = [] 104 | for n in xrange(50): 105 | result.append(estimate(data, test, opt.alpha, opt.beta, n+1, 2)) 106 | for x in result: 107 | print 
x 108 | 109 | if __name__ == "__main__": 110 | main() 111 | 112 | -------------------------------------------------------------------------------- /sequence/testcrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Project Gutenberg Content Extractor with CRF 4 | 5 | import numpy 6 | import time 7 | from optparse import OptionParser 8 | from crf import CRF, Features, FeatureVector 9 | 10 | 11 | def main(): 12 | def load_data(data): 13 | texts = [] 14 | labels = [] 15 | text = [] 16 | data = "\n" + data + "\n" 17 | for line in data.split("\n"): 18 | line = line.strip() 19 | if len(line) == 0: 20 | if len(text)>0: 21 | texts.append(text) 22 | labels.append(label) 23 | text = [] 24 | label = [] 25 | else: 26 | token, info, chunk = line.split() 27 | text.append((token, info)) 28 | label.append(chunk) 29 | return (texts, labels) 30 | 31 | texts, labels = load_data(""" 32 | This DT B-NP 33 | temblor-prone JJ I-NP 34 | city NN I-NP 35 | dispatched VBD B-VP 36 | inspectors NNS B-NP 37 | , , O 38 | 39 | firefighters NNS B-NP 40 | and CC O 41 | other JJ B-NP 42 | earthquake-trained JJ I-NP 43 | personnel NNS I-NP 44 | to TO B-VP 45 | aid VB I-VP 46 | San NNP B-NP 47 | Francisco NNP I-NP 48 | . . O 49 | """) 50 | 51 | print texts, labels 52 | 53 | test_texts, test_labels = load_data(""" 54 | Rockwell NNP B-NP 55 | said VBD B-VP 56 | the DT B-NP 57 | agreement NN I-NP 58 | calls VBZ B-VP 59 | for IN B-SBAR 60 | it PRP B-NP 61 | to TO B-VP 62 | supply VB I-VP 63 | 200 CD B-NP 64 | additional JJ I-NP 65 | so-called JJ I-NP 66 | shipsets NNS I-NP 67 | for IN B-PP 68 | the DT B-NP 69 | planes NNS I-NP 70 | . . O 71 | """) 72 | 73 | features = Features(labels) 74 | tokens = dict([(i[0],1) for x in texts for i in x]).keys() 75 | infos = dict([(i[1],1) for x in texts for i in x]).keys() 76 | 77 | for label in features.labels: 78 | for token in tokens: 79 | features.add_feature( lambda x, y, l=label, t=token: 1 if y==l and x[0]==t else 0 ) 80 | for info in infos: 81 | features.add_feature( lambda x, y, l=label, i=info: 1 if y==l and x[1]==i else 0 ) 82 | features.add_feature_edge( lambda y_, y: 0 ) 83 | 84 | fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)] 85 | fv = fvs[0] 86 | text_fv = FeatureVector(features, test_texts[0]) # text sequence without labels 87 | 88 | 89 | crf = CRF(features, 0) 90 | theta0 = crf.random_param() 91 | print "initial log likelihood:", crf.likelihood(fvs, theta0) 92 | 93 | 94 | print ">> Steepest Descent" 95 | theta = theta0.copy() 96 | eta = 0.5 97 | t = time.time() 98 | for i in range(20): 99 | theta += eta * crf.gradient_likelihood(fvs, theta) 100 | print i, "log likelihood:", crf.likelihood(fvs, theta) 101 | eta *= 0.95 102 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 103 | 104 | print ">> SGD" 105 | theta = theta0.copy() 106 | eta = 0.5 107 | t = time.time() 108 | for i in range(20): 109 | for fv in fvs: 110 | theta += eta * crf.gradient_likelihood([fv], theta) 111 | print i, "log likelihood:", crf.likelihood(fvs, theta) 112 | eta *= 0.95 113 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 114 | 115 | print ">> SGD + FOBOS L1" 116 | theta = theta0.copy() 117 | eta = 0.5 118 | lmd = 0.01 119 | t = time.time() 120 | for i in range(20): 121 | lmd_eta = lmd * eta 122 | for fv in fvs: 123 | theta += eta * 
crf.gradient_likelihood([fv], theta) 124 | theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta) 125 | print i, "log likelihood:", crf.likelihood(fvs, theta) 126 | eta *= 0.95 127 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 128 | 129 | print ">> Steepest Descent + FOBOS L1" 130 | theta = theta0.copy() 131 | eta = 0.2 132 | lmd = 0.5 133 | t = time.time() 134 | for i in range(20): 135 | theta += eta * crf.gradient_likelihood(fvs, theta) 136 | lmd_eta = lmd * eta 137 | theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta) 138 | print i, "log likelihood:", crf.likelihood(fvs, theta) 139 | eta *= 0.9 140 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 141 | #print theta 142 | 143 | print ">> BFGS" 144 | t = time.time() 145 | theta = crf.inference(fvs, theta0) 146 | print "log likelihood:", crf.likelihood(fvs, theta) 147 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 148 | 149 | 150 | if __name__ == "__main__": 151 | main() 152 | 153 | -------------------------------------------------------------------------------- /trie/da.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import collections 5 | import numpy 6 | 7 | # Double Array for static ordered data 8 | # This code is available under the MIT License. 9 | # (c)2011 Nakatani Shuyo / Cybozu Labs Inc. 10 | 11 | class DoubleArray(object): 12 | def __init__(self, verbose=False): 13 | self.verbose = verbose 14 | 15 | def validate_list(self, list): 16 | pre = "" 17 | for i, line in enumerate(list): 18 | if pre >= line: 19 | raise Exception, "list has not ascent order at %d" % (i+1) 20 | pre = line 21 | 22 | def initialize(self, list): 23 | self.validate_list(list) 24 | 25 | self.N = 1 26 | self.base = [-1] 27 | self.check = [-1] 28 | self.value = [-1] 29 | 30 | max_index = 0 31 | queue = collections.deque([(0, 0, len(list), 0)]) 32 | while len(queue) > 0: 33 | index, left, right, depth = queue.popleft() 34 | if depth >= len(list[left]): 35 | self.value[index] = left 36 | left += 1 37 | if left >= right: continue 38 | 39 | # get branches of current node 40 | stack = collections.deque([(right, -1)]) 41 | cur, c1 = (left, ord(list[left][depth])) 42 | result = [] 43 | while len(stack) >= 1: 44 | while c1 == stack[-1][1]: 45 | cur, c1 = stack.pop() 46 | mid = (cur + stack[-1][0]) / 2 47 | if cur == mid: 48 | result.append((cur + 1, c1)) 49 | cur, c1 = stack.pop() 50 | else: 51 | c2 = ord(list[mid][depth]) 52 | if c1 != c2: 53 | stack.append((mid, c2)) 54 | else: 55 | cur = mid 56 | 57 | # search empty index for current node 58 | v0 = result[0][1] 59 | j = - self.check[0] - v0 60 | while any(j + v < self.N and self.check[j + v] >= 0 for right, v in result): 61 | j = - self.check[j + v0] - v0 62 | tail_index = j + result[-1][1] 63 | if max_index < tail_index: 64 | max_index = tail_index 65 | self.extend_array(tail_index + 2) 66 | 67 | # insert current node into DA 68 | self.base[index] = j 69 | depth += 1 70 | for right, v in result: 71 | child = j + v 72 | self.check[self.base[child]] = self.check[child] 73 | self.base[-self.check[child]] = self.base[child] 74 | self.check[child] = index 75 | queue.append((child, left, right, depth)) 76 | left = right 77 | 78 | 
self.shrink_array(max_index) 79 | 80 | def extend_array(self, max_cand): 81 | if self.N < max_cand: 82 | new_N = 2 ** int(numpy.ceil(numpy.log2(max_cand))) 83 | self.log("extend DA : %d => (%d) => %d", (self.N, max_cand, new_N)) 84 | self.base.extend( n - 1 for n in xrange(self.N, new_N)) 85 | self.check.extend( - n - 1 for n in xrange(self.N, new_N)) 86 | self.value.extend( - 1 for n in xrange(self.N, new_N)) 87 | self.N = new_N 88 | 89 | def shrink_array(self, max_index): 90 | self.log("shrink DA : %d => %d", (self.N, max_index + 1)) 91 | self.N = max_index + 1 92 | self.check = numpy.array(self.check[:self.N]) 93 | self.base = numpy.array(self.base[:self.N]) 94 | self.value = numpy.array(self.value[:self.N]) 95 | 96 | not_used = self.check < 0 97 | self.check[not_used] = -1 98 | not_used[0] = False 99 | self.base[not_used] = self.N 100 | 101 | def log(self, format, param): 102 | if self.verbose: 103 | import time 104 | print "-- %s, %s" % (time.strftime("%Y/%m/%d %H:%M:%S"), format % param) 105 | 106 | def save(self, filename): 107 | numpy.savez(filename, base=self.base, check=self.check, value=self.value) 108 | 109 | def load(self, filename): 110 | loaded = numpy.load(filename) 111 | self.base = loaded['base'] 112 | self.check = loaded['check'] 113 | self.value = loaded['value'] 114 | self.N = self.base.size 115 | 116 | def add_element(self, s, v): 117 | pass 118 | 119 | def get_subtree(self, s): 120 | cur = 0 121 | for c in iter(s): 122 | v = ord(c) 123 | next = self.base[cur] + v 124 | if next >= self.N or self.check[next] != cur: 125 | return None 126 | cur = next 127 | return cur 128 | 129 | def get_child(self, c, subtree): 130 | v = ord(c) 131 | next = self.base[subtree] + v 132 | if next >= self.N or self.check[next] != subtree: 133 | return None 134 | return next 135 | 136 | def get(self, s): 137 | cur = self.get_subtree(s) 138 | if cur >= 0: 139 | value = self.value[cur] 140 | if value >= 0: return value 141 | return None 142 | 143 | def get_value(self, subtree): 144 | return self.value[subtree] 145 | 146 | def extract_features(self, st): 147 | events = dict() 148 | pointers = [] 149 | for c in iter(st): 150 | pointers.append(0) 151 | new_pointers = [] 152 | for pointer in pointers: 153 | p = self.get_child(c, pointer) 154 | if p is not None: 155 | new_pointers.append(p) 156 | id = self.value[p] 157 | if id >= 0: 158 | events[id] = events.get(id, 0) + 1 159 | pointers = new_pointers 160 | return events 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /trie/test_da.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import unittest 5 | import da 6 | 7 | class TestDoubleArray(unittest.TestCase): 8 | def test1(self): 9 | trie = da.DoubleArray(verbose=False) 10 | trie.initialize(["cat"]) 11 | self.assertEqual(trie.N, 4) 12 | self.assert_(trie.get("ca") is None) 13 | self.assert_(trie.get("xxx") is None) 14 | self.assertEqual(trie.get("cat"), 0) 15 | 16 | def test2(self): 17 | trie = da.DoubleArray() 18 | trie.initialize(["cat", "dog"]) 19 | self.assertEqual(trie.N, 7) 20 | self.assert_(trie.get("ca") is None) 21 | self.assert_(trie.get("xxx") is None) 22 | self.assertEqual(trie.get("cat"), 0) 23 | self.assertEqual(trie.get("dog"), 1) 24 | 25 | def test3(self): 26 | trie = da.DoubleArray(verbose=False) 27 | trie.initialize(["ca", "cat", "deer", "dog", "fox", "rat"]) 28 | print trie.base 29 | print trie.check 30 | print trie.value 31 | 
self.assertEqual(trie.N, 17) 32 | self.assert_(trie.get("c") is None) 33 | self.assertEqual(trie.get("ca"), 0) 34 | self.assertEqual(trie.get("cat"), 1) 35 | self.assertEqual(trie.get("deer"), 2) 36 | self.assertEqual(trie.get("dog"), 3) 37 | self.assert_(trie.get("xxx") is None) 38 | 39 | def test4(self): 40 | trie = da.DoubleArray() 41 | self.assertRaises(Exception, trie.initialize, ["cat", "ant"]) 42 | 43 | unittest.main() 44 | 45 | -------------------------------------------------------------------------------- /trie/trie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Naive Trie 5 | # This code is available under the MIT License. 6 | # (c)2011 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | class Trie(object): 9 | def initialize(self): 10 | self.root = dict() 11 | def add_element(self, s, v): 12 | x = self.root 13 | for c in s: 14 | if c not in x: x[c] = dict() 15 | x = x[c] 16 | x[""] = v 17 | def get_subtree(self, s): 18 | x = self.root 19 | for c in iter(st): 20 | if c not in x: return None 21 | x = x[c] 22 | return x 23 | def get_child(self, c, subtree): 24 | if c not in x: return None 25 | return subtree[c] 26 | def get(self, s): 27 | return self.get_value(self.get_subtree(s)) 28 | def get_value(self, subtree): 29 | return subtree[""] 30 | 31 | -------------------------------------------------------------------------------- /unsupervised/bs.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -KN 2 | # ./bs.rb [corpus files] 3 | 4 | begin 5 | #raise 6 | require '../lib/infinitive.rb' 7 | INF = Infinitive.new 8 | rescue 9 | module INF 10 | def self.infinitive(word) 11 | word.downcase 12 | end 13 | end 14 | end 15 | 16 | docs = Array.new 17 | words = Hash.new{|h,k| h[k]=Hash.new } 18 | worddocs = Hash.new{|h,k| h[k]=Hash.new } 19 | while filename = ARGV.shift 20 | puts "loading: #{filename}" 21 | vec = Hash.new(0) 22 | doc_id = docs.length 23 | open(filename) do |f| 24 | while line = f.gets 25 | line.scan(/[A-Za-z]+/) do |word| 26 | infword = INF.infinitive(word) 27 | vec[infword] = 1 28 | words[infword][word] = 1 29 | worddocs[infword][doc_id] = 1 30 | end 31 | if vec.size > 100 32 | docs << vec 33 | doc_id = docs.length 34 | vec = Hash.new(0) 35 | end 36 | end 37 | end 38 | docs << vec if vec.size > 0 39 | end 40 | 41 | class BayesianSet 42 | C = 2.0 43 | def initialize(docs, words, worddocs) 44 | @docs = docs 45 | @words = words 46 | @worddocs = worddocs 47 | @alpha = docs.map{|vec| C * vec.size / words.length } 48 | @beta = @alpha.map{|a| C - a } 49 | puts "# of words = #{words.size}, # of docs = #{docs.length}" 50 | end 51 | 52 | def search(query) 53 | query = query.map{|x| INF.infinitive(x)}.uniq 54 | n = query.length 55 | alpha_tild = Array.new # ln(alpha~/alpha) 56 | beta_tild = Array.new # ln(beta~/beta) 57 | @alpha.each_with_index do |a, i| 58 | s = query.select{|w| @docs[i].key?(w) }.length 59 | alpha_tild << Math.log(1 + s / a) 60 | beta_tild << Math.log(1 + (n - s) / @beta[i]) 61 | end 62 | 63 | @worddocs.map do |w, docs| 64 | score = 0 65 | =begin 66 | # method of original paper 67 | @docs.each_with_index do |vec, j| 68 | score += Math.log(@alpha[j]+@beta[j])-Math.log(@alpha[j]+@beta[j]+n) 69 | score += if vec.key?(w) then alpha_tild[j] else beta_tild[j] end 70 | end 71 | =end 72 | # simple & fast 73 | docs.each do |j, dummy| 74 | score += alpha_tild[j] - beta_tild[j] 75 | end 76 | 77 | [w, score] 78 | end.sort_by{|x| 
-x[1]}[0..9].each do |w, score| 79 | puts "#{w}: #{score} (#{@words[w].keys.join(',')})" 80 | end 81 | end 82 | end 83 | 84 | bs = BayesianSet.new(docs, words, worddocs) 85 | #if ARGV.length > 1 86 | # bs.search(ARGV[1..-1]) 87 | #else 88 | while input = $stdin.gets 89 | bs.search(input.split) 90 | puts 91 | end 92 | #end 93 | 94 | -------------------------------------------------------------------------------- /unsupervised/ema.r: -------------------------------------------------------------------------------- 1 | # EM algorithm and Online EMA 2 | 3 | 4 | argv <- commandArgs(T); 5 | if (length(argv[argv=="faithful"])) { 6 | # Old Faithful dataset を取得して正規化 7 | data("faithful"); 8 | xx <- scale(faithful, apply(faithful, 2, mean), apply(faithful, 2, sd)); 9 | K <- 2; 10 | } else { 11 | # 3次元&3峰のテストデータを生成 12 | library(MASS); 13 | xx <- rbind( 14 | mvrnorm(100, c(1,3,0), matrix(c(0.7324,-0.9193,0.5092,-0.9193,2.865,-0.2976,0.5092,-0.2976,3.294),3)), 15 | mvrnorm(150, c(4,-1,-2), matrix(c(2.8879,-0.2560,0.5875,-0.2560,3.0338,1.2960,0.5875,1.2960,1.7438),3)), 16 | mvrnorm(200, c(0,2,1), matrix(c(3.1178,1.7447,0.6726,1.7447,2.3693,0.0521,0.6726,0.0521,0.7917),3)) 17 | ); 18 | xx <- xx[sample(nrow(xx)),] 19 | K <- 3; 20 | } 21 | N <- nrow(xx); 22 | 23 | 24 | # パラメータの初期化(平均、共分散、混合率) 25 | init_param <- function(K, D) { 26 | sig <- list(); 27 | for(k in 1:K) sig[[k]] <- diag(K); 28 | list(mu = matrix(rnorm(K * D), D), mix = numeric(K)+1/K, sig = sig); 29 | } 30 | 31 | # 多次元正規分布密度関数 32 | dmnorm <- function(x, mu, sig) { 33 | D <- length(mu); 34 | 1/((2 * pi)^D * sqrt(det(sig))) * exp(- t(x-mu) %*% solve(sig) %*% (x-mu) / 2)[1]; 35 | } 36 | 37 | # EM アルゴリズムの E ステップ 38 | Estep <- function(xx, param) { 39 | K <- nrow(param$mu); 40 | t(apply(xx, 1, function(x){ 41 | numer <- param$mix * sapply(1:K, function(k) { 42 | dmnorm(x, param$mu[k,], param$sig[[k]]) 43 | }); 44 | numer / sum(numer); 45 | })) 46 | } 47 | 48 | # EM アルゴリズムの M ステップ 49 | Mstep <- function(xx, gamma_nk) { 50 | K <- ncol(gamma_nk); 51 | D <- ncol(xx); 52 | N <- nrow(xx); 53 | 54 | N_k <- colSums(gamma_nk); 55 | new_mix <- N_k / N; 56 | new_mu <- (t(gamma_nk) %*% xx) / N_k; 57 | 58 | new_sig <- list(); 59 | for(k in 1:K) { 60 | sig <- matrix(numeric(D^2), D); 61 | for(n in 1:N) { 62 | x <- xx[n,] - new_mu[k,]; 63 | sig <- sig + gamma_nk[n, k] * (x %*% t(x)); 64 | } 65 | new_sig[[k]] <- sig / N_k[k] 66 | } 67 | 68 | list(mu=new_mu, sig=new_sig, mix=new_mix); 69 | } 70 | 71 | # 対数尤度関数 72 | Likelihood <- function(xx, param) { 73 | K <- nrow(param$mu); 74 | sum(apply(xx, 1, function(x){ 75 | log(sum(param$mix * sapply(1:K, function(k) dmnorm(x, param$mu[k,], param$sig[[k]])))); 76 | })) 77 | } 78 | 79 | OnlineEM <- function(xx, m, param) { 80 | N <- nrow(xx); 81 | K <- nrow(param$mu); 82 | 83 | new_gamma <- param$mix * sapply(1:K, function(k) { 84 | dmnorm(xx[m, ], param$mu[k,], param$sig[[k]]); 85 | }); 86 | new_gamma <- new_gamma / sum(new_gamma); 87 | delta <- new_gamma - param$gamma[m,]; 88 | param$gamma[m,] <- new_gamma; 89 | 90 | param$mix <- param$mix + delta / N; 91 | N_k <- param$mix * N; 92 | for(k in 1:K) { 93 | x <- xx[m,] - param$mu[k,]; 94 | d <- delta[k] / N_k[k]; 95 | param$mu[k,] <- param$mu[k,] + d * x; 96 | param$sig[[k]] <- (1 - d) * (param$sig[[k]] + d * x %*% t(x)); 97 | } 98 | param; 99 | } 100 | 101 | 102 | for (n in 1:10) { 103 | # 初期値 104 | param0 <- init_param(K, ncol(xx)); 105 | 106 | # normal EM 107 | timing <- system.time({ 108 | param <- param0; 109 | 110 | # 収束するまで繰り返し 111 | likeli <- -999999; 112 | for (j in 
1:999) { 113 | gamma_nk <- Estep(xx, param); 114 | param <- Mstep(xx, gamma_nk); 115 | 116 | cat(sprintf(" %d: %.3f\n", j, (l <- Likelihood(xx, param)))); 117 | if (l - likeli < 0.001) break; 118 | likeli <- l; 119 | } 120 | }); 121 | cat(sprintf("Normal %d:convergence=%d, likelihood=%.4f, %1.2fsec\n", n, j, likeli, timing[3])); 122 | #print(param$mu); 123 | 124 | # incremental EM 125 | timing <- system.time({ 126 | param <- param0; 127 | 128 | # 最初の一周は通常の EM 129 | gamma_nk <- Estep(xx, param); 130 | param <- Mstep(xx, gamma_nk); 131 | param$gamma <- gamma_nk; 132 | 133 | # online EM 134 | likeli <- -999999; 135 | for (j in 2:100) { 136 | randomlist <- sample(1:N); 137 | for(m in randomlist) param <- OnlineEM(xx, m, param); 138 | 139 | cat(sprintf(" %d: %.3f\n", j, (l <- Likelihood(xx, param)))); 140 | if (l - likeli < 0.001) break; 141 | likeli <- l; 142 | } 143 | }); 144 | cat(sprintf("Online %d:convergence=%d, likelihood=%.4f, %1.2fsec\n", n, j, likeli, timing[3])); 145 | #print(param$mu); 146 | } 147 | 148 | # plot(xx, col=rgb(gamma_nk[,1],0,gamma_nk[,2]), xlab=paste(sprintf("%1.3f",t(param$mu)),collapse=","), ylab=""); 149 | # points(param$mu, pch = 8); 150 | 151 | -------------------------------------------------------------------------------- /unsupervised/plsi.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -KN 2 | # ./plsi.rb [corpus files] 3 | 4 | begin 5 | require '../lib/infinitive.rb' 6 | INF = Infinitive.new 7 | rescue 8 | module INF 9 | def self.infinitive(word);word.downcase;end 10 | end 11 | end 12 | 13 | def dump(obj) 14 | if obj.is_a?(Numeric) 15 | (obj*1000).round/1000.0 16 | elsif obj.is_a?(String) 17 | obj 18 | elsif obj.is_a?(Array) 19 | "[#{obj.map{|x| dump(x)}.join(',')}]\n" 20 | elsif obj.is_a?(Hash) 21 | "{#{obj.map{|k,v| "#{k}=>#{dump(v)}"}.join(',')}}\n" 22 | end 23 | end 24 | 25 | docs = Array.new 26 | words = Hash.new{|h,k| h[k]=Hash.new(0) } 27 | worddocs = Hash.new{|h,k| h[k]=Hash.new(0) } 28 | while filename = ARGV.shift 29 | puts "loading: #{filename}" 30 | texts = open(filename) {|f| f.read }.split(/\n\n+/) 31 | 32 | texts.each_with_index do |text, doc_id| 33 | vec = Hash.new(0) 34 | docs << vec 35 | text.scan(/[A-Za-z]+/) do |word| 36 | infword = INF.infinitive(word) 37 | vec[infword] += 1 38 | words[infword][word] += 1 39 | worddocs[infword][doc_id] += 1 40 | end 41 | end 42 | end 43 | puts "# of words = #{words.size}, # of docs = #{docs.length}" 44 | 45 | class PLSI 46 | K = 20 47 | def initialize(docs, words, worddocs) 48 | @docs = docs 49 | @words = words 50 | @worddocs = worddocs 51 | 52 | @z_k = Array.new(K){1.0/K} 53 | @d_i_z_k = Array.new(K){ Array.new(docs.length){1.0/docs.length} } 54 | @w_j_z_k = Array.new(K){ 55 | h = Hash.new 56 | s = 0 57 | worddocs.each{|j,x| s+=(h[j]=rand) } 58 | worddocs.each{|j,x| h[j]/=s } 59 | h 60 | } 61 | end 62 | 63 | def stepEM 64 | new_z_k_numer = Array.new(K){0} 65 | new_z_k_denom = 0 66 | new_d_i_numer = Array.new(K){ Array.new(@docs.length){0} } 67 | new_w_j_numer = Array.new(K){ Hash.new(0) } 68 | 69 | @worddocs.each do |j, n_w_j| 70 | #(0..@docs.length-1).each do |i| 71 | #n_w_j_d_i = n_w_j[i] 72 | n_w_j.each do |i, n_w_j_d_i| 73 | 74 | # E-step 75 | posterior_denom = 0 76 | posterior_numers = Array.new(K) 77 | (0..K-1).each do |k| 78 | # p(z=k)p(x|z)p(y|z) 79 | posterior_denom += (posterior_numers[k] = @z_k[k] * @d_i_z_k[k][i] * @w_j_z_k[k][j]) 80 | end 81 | 82 | # M-step 83 | posterior_numers.each_with_index do |posterior_numer, k| 84 | x = 
n_w_j_d_i * posterior_numer / posterior_denom 85 | new_z_k_numer[k] += x 86 | new_d_i_numer[k][i] += x 87 | new_w_j_numer[k][j] += x 88 | end 89 | new_z_k_denom += n_w_j_d_i 90 | end 91 | end 92 | 93 | @z_k = new_z_k_numer.map{|x| x / new_z_k_denom } 94 | 95 | new_d_i_numer.each_with_index do |d_i, k| 96 | d_i.each_with_index do |numer, i| 97 | @d_i_z_k[k][i] = numer / new_z_k_numer[k] 98 | end 99 | end 100 | 101 | new_w_j_numer.each_with_index do |w_j, k| 102 | w_j.each do |j, numer| 103 | @w_j_z_k[k][j] = numer / new_z_k_numer[k] 104 | end 105 | end 106 | 107 | #puts "----" 108 | puts dump(@z_k) 109 | #puts dump(@d_i_z_k) 110 | #puts dump(@w_j_z_k) 111 | end 112 | 113 | def max_z_k_w_j 114 | cluster = Array.new(K){ Array.new } 115 | @worddocs.each do |j, n_w_j| 116 | argmax_k = nil 117 | max_z_k = 0 118 | sum = 0 119 | (0..K-1).each do |k| 120 | p_z_k_w_j = @z_k[k] * @w_j_z_k[k][j] 121 | sum += p_z_k_w_j 122 | if max_z_k < p_z_k_w_j 123 | max_z_k = p_z_k_w_j 124 | argmax_k = k 125 | end 126 | end 127 | cluster[argmax_k] << [j, max_z_k / sum] 128 | end 129 | cluster 130 | end 131 | end 132 | 133 | plsi = PLSI.new(docs, words, worddocs) 134 | 200.times{ plsi.stepEM } 135 | 136 | cluster = plsi.max_z_k_w_j 137 | cluster.each_with_index do |words, k| 138 | puts " cluster: #{k}" 139 | sep = 1.0 140 | output = [] 141 | words.sort_by{|x| -x[1] }.each do |x| 142 | while sep >= x[1] 143 | output << (sep*10).round 144 | sep -= 0.1 145 | end 146 | output << x[0] 147 | end 148 | puts output.join(',') 149 | end 150 | 151 | -------------------------------------------------------------------------------- /unsupervised/vb.r: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/unsupervised/vb.r --------------------------------------------------------------------------------