├── activelearn ├── mmms.py ├── mmpm.py ├── oracle.py ├── qbc4.py ├── qbc_dist.py ├── uncert_dist.py ├── uncertain.py ├── uncertain2.py ├── uncertain3.py └── uncertain4.py ├── clustering ├── dpm.py └── irm.py ├── data ├── 4million.corpus ├── gen_cluto.rb ├── gen_corpus.rb ├── gen_libsvm.rb ├── henry-poe └── ohenry.corpus ├── difficulty └── wordbook.txt ├── dnn ├── README.md ├── cdcgan-svhn.ini ├── cdcgan-svhn.py ├── cgan-mnist.py ├── dcgan-svhn.py ├── e2emn.py └── gan-mnist.py ├── extractcontent ├── test.py ├── train.py └── webextract.py ├── hac ├── fselect.rb ├── hac.rb └── naive_hac.rb ├── irt └── irt.rb ├── langdetect ├── common.rb ├── crawler.rb ├── detect.rb ├── filetest.rb ├── model.json ├── test.rb └── train.rb ├── lda ├── hdp_online.py ├── hdplda.py ├── hdplda2.py ├── itm.py ├── lda.py ├── lda.r ├── lda_cvb0.py ├── lda_test.py ├── lda_test2.py ├── ldacvb0_cpp │ ├── README.md │ ├── ldacvb0.sln │ ├── ldacvb0 │ │ ├── ldacvb0.cpp │ │ ├── ldacvb0.hpp │ │ └── ldacvb0.vcxproj │ └── ldacvb0_test │ │ ├── ldacvb0_test.vcxproj │ │ └── test.cpp ├── llda.py ├── llda_nltk.py ├── test_hdplda2.py ├── twentygroups.py └── vocabulary.py ├── lib ├── extract_gutenberg.rb ├── infinitive.rb ├── inflist.txt └── wordbook.txt ├── lr └── lr.r ├── misc ├── linear_regression.r ├── linear_regression.xlsx └── zipf.rb ├── neural ├── adult.rb ├── classification.rb ├── classification.txt ├── curve_fitting.rb ├── iris.rb ├── mnist.rb ├── mnist2.rb ├── neural.rb └── xor.rb ├── ngram ├── knlm.py ├── knsmooth.py ├── ngram.rb ├── rnnlm.py └── wordcount.py ├── pca ├── bayes.r ├── ema.r └── ppca.r ├── perceptron ├── avg_percep_test.rb ├── percep_test.rb ├── test.rb └── train.rb ├── privacy └── randomized-response │ ├── rr-gibbs.py │ ├── rr-mle.py │ └── rr-vb.py ├── sampling └── hmc.r ├── semisupervised └── ssnb.py ├── sequence ├── crf.py ├── hmm.py ├── pg.py └── testcrf.py ├── trie ├── da.py ├── test_da.py └── trie.py └── unsupervised ├── bs.rb ├── ema.r ├── plsi.rb ├── vb.r └── vb_result.csv /activelearn/mmms.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning for 20 newsgroups : MCMI[min] with margin sampling 5 | # MCMI[min] refers to (Guo+ IJCAI-07) 6 | # Yuhong Guo and Russ Greiner, Optimistic Active Learning using Mutual Information, IJCAI-07 7 | 8 | # This code is available under the MIT License. 9 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
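#
# Summary of the code below: after an initial fit on one random seed document per
# class, each round the current classifier scores the unlabeled pool and keeps the
# 30 candidates with the smallest margin; for each candidate x and every possible
# label y, a classifier is retrained on the labeled set plus (x, y), and the pair
# minimizing total predictive entropy over the pool is queried (the optimistic
# MCMI[min] criterion). Margin sampling is only used to pre-filter the candidates.
#
# Usage sketch -- a hypothetical invocation, not from the original repository;
# the options are the ones defined in main() below:
#   python mmms.py --lr2 1.0 -K 4 -n 100 -N 10 --seed 0
# i.e. L2-regularized logistic regression (C=1.0), the first 4 newsgroups, grow
# the training set to 100 examples, 10 random restarts, fixed random seed.
# Per-trial accuracies are appended to output_mmms_<K>_<n>.txt.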
10 | 11 | import optparse 12 | import numpy 13 | import scipy.sparse 14 | import sklearn.datasets 15 | from sklearn.linear_model import LogisticRegression 16 | from sklearn.naive_bayes import MultinomialNB 17 | 18 | def activelearn(data, test, train, pool, classifier_factory, max_train, seed): 19 | numpy.random.seed(seed) 20 | 21 | # copy initial indexes of training and pool 22 | train = list(train) 23 | pool = list(pool) 24 | 25 | accuracies = [] 26 | Z = len(test.target) 27 | K = data.target.max() + 1 28 | while len(train) < max_train: 29 | if len(accuracies) > 0: 30 | predict = classifier.predict_proba(data.data[pool,:]) 31 | predict.sort(axis=1) 32 | margin = predict[:,-1] - predict[:,-2] 33 | candidate = margin.argsort()[:30] 34 | 35 | i_star = y_i_star = None 36 | f_i_star = 1e300 37 | print "i\ty_i\t(actual)\tf_i\tmargin" 38 | for i in candidate: 39 | x = pool[i] 40 | L_x_i = data.data[train + [x], :] 41 | L_y = data.target[train] 42 | entropies = numpy.zeros(K) 43 | for y in xrange(K): 44 | l = list(L_y) 45 | l.append(y) 46 | phi_i = classifier_factory().fit(L_x_i, l) 47 | 48 | p = phi_i.predict_proba(data.data[pool]) 49 | entropies[y] = -(numpy.nan_to_num(numpy.log(p)) * p).sum() 50 | y_i = entropies.argmin() 51 | f_i = entropies[y_i] 52 | print "%d\t%d\t%d\t%f\t%f" % (x, y_i, data.target[x], f_i, margin[i]) 53 | if f_i < f_i_star: 54 | i_star = i 55 | y_i_star = y_i 56 | f_i_star = f_i 57 | 58 | x = pool[i_star] 59 | print "select : %d (MM=%f, predict=%d, actual=%d)" % (x, f_i_star, y_i_star, data.target[x]) 60 | train.append(x) 61 | del pool[i_star] 62 | 63 | classifier = classifier_factory().fit(data.data[train,:], data.target[train]) 64 | accuracy = classifier.score(test.data, test.target) 65 | print "%d : %f" % (len(train), accuracy) 66 | accuracies.append((len(train), accuracy)) 67 | 68 | return accuracies 69 | 70 | def main(): 71 | parser = optparse.OptionParser() 72 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 73 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 74 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 75 | 76 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=4) 77 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=100) 78 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 79 | parser.add_option("-N", dest="trying", type="int", help="number of trying", default=100) 80 | 81 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 82 | (opt, args) = parser.parse_args() 83 | 84 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 85 | print "(train size, voca size) : (%d, %d)" % data.data.shape 86 | 87 | if opt.class_size: 88 | index = data.target < opt.class_size 89 | a = data.data.toarray()[index, :] 90 | data.data = scipy.sparse.csr_matrix(a) 91 | data.target = data.target[index] 92 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 93 | 94 | 95 | N_CLASS = data.target.max() + 1 96 | if opt.training: 97 | train = [int(x) for x in opt.training.split(",")] 98 | else: 99 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 100 | print "indexes of training set : ", ",".join("%d" % x for x in train) 101 | 102 | pool = range(data.data.shape[0]) 103 | 
for x in train: pool.remove(x) 104 | 105 | classifier_factory = None 106 | if opt.logistic_l1: 107 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 108 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 109 | elif opt.logistic_l2: 110 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 111 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 112 | elif opt.naive_bayes: 113 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 114 | classifier_factory = lambda: MultinomialNB(alpha=opt.naive_bayes) 115 | 116 | if classifier_factory: 117 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 118 | print "(test size, voca size) : (%d, %d)" % test.data.shape 119 | if opt.class_size: 120 | index = test.target < opt.class_size 121 | a = test.data.toarray()[index, :] 122 | test.data = scipy.sparse.csr_matrix(a) 123 | test.target = test.target[index] 124 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 125 | 126 | print "score for all data: %f" % classifier_factory().fit(data.data, data.target).score(test.data, test.target) 127 | 128 | for n in xrange(opt.trying): 129 | print "trying.. %d" % n 130 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 131 | pool = range(data.data.shape[0]) 132 | for x in train: pool.remove(x) 133 | results = activelearn(data, test, train, pool, classifier_factory, opt.max_train, opt.seed) 134 | 135 | with open("output_mmms_%d_%d.txt" % (opt.class_size, opt.max_train), "ab") as f: 136 | f.write("\t".join("%f" % x[1] for x in results)) 137 | f.write("\n") 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /activelearn/mmpm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning for 20 newsgroups with MM+M (Guo+ IJCAI-07) 5 | # Yuhong Guo and Russ Greiner, Optimistic Active Learning using Mutual Information, IJCAI-07 6 | 7 | # This code is available under the MIT License. 8 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
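#
# Summary of the code below: unlike mmms.py, every pool point is scored each round
# (no margin pre-filter). For each candidate x the optimistic label y* minimizing
# pool entropy is found; if the oracle's actual label disagrees with y*, a second
# point -- the one with maximum predictive entropy -- is queried as well, which is
# the "+M" step of MM+M.
#
# Usage sketch (hypothetical invocation; the options are defined in main() below --
# note that only --lr1/--lr2 install a classifier factory in this script, while
# the --nb option is parsed but not used):
#   python mmpm.py --lr2 1.0 -K 4 -n 30 --seed 0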
9 | 10 | import optparse 11 | import numpy 12 | import scipy.sparse 13 | import sklearn.datasets 14 | from sklearn.linear_model import LogisticRegression 15 | 16 | def activelearn(data, test, train, pool, classifier_factory, max_train, seed): 17 | numpy.random.seed(seed) 18 | 19 | # copy initial indexes of training and pool 20 | train = list(train) 21 | pool = list(pool) 22 | 23 | accuracies = [] 24 | Z = len(test.target) 25 | K = data.target.max() + 1 26 | while len(train) < max_train: 27 | if len(accuracies) > 0: 28 | predict = classifier.predict_proba(data.data[pool,:]) 29 | entbase = -numpy.nan_to_num(predict * numpy.log(predict)).sum(axis=1) 30 | predict.sort(axis=1) 31 | margin = predict[:,-1] - predict[:,-2] 32 | uncertain = predict[:,-1] 33 | 34 | i_star = y_i_star = None 35 | f_i_star = 1e300 36 | print "i\ty_i\tf_i\tuncertain\tmargin\tent" 37 | for i, x in enumerate(pool): 38 | L_x_i = data.data[train + [x], :] 39 | L_y = data.target[train] 40 | entropies = numpy.zeros(K) 41 | for y in xrange(K): 42 | l = list(L_y) 43 | l.append(y) 44 | phi_i = classifier_factory().fit(L_x_i, l) 45 | 46 | p = phi_i.predict_proba(data.data[pool]) 47 | entropies[y] = -(numpy.nan_to_num(numpy.log(p)) * p).sum() 48 | y_i = entropies.argmin() 49 | f_i = entropies[y_i] 50 | print "%d\t%d\t%f\t%f\t%f\t%f" % (x, y_i, f_i, uncertain[i], margin[i], entbase[i]) 51 | if f_i < f_i_star: 52 | i_star = i 53 | y_i_star = y_i 54 | f_i_star = f_i 55 | 56 | x = pool[i_star] 57 | print "select : %d (MM=%f, predict=%d, actual=%d)" % (x, f_i_star, y_i_star, data.target[x]) 58 | train.append(x) 59 | del pool[i_star] 60 | 61 | if data.target[x] != y_i_star: 62 | phi = classifier_factory().fit(data.data[train, :], data.target[train]) 63 | p = phi_i.predict_proba(data.data[pool]) 64 | i_star = (numpy.nan_to_num(numpy.log(p)) * p).sum(axis=1).argmin() 65 | 66 | x = pool[i_star] 67 | print "select : %d (actual=%d)" % (x, data.target[x]) 68 | train.append(x) 69 | del pool[i_star] 70 | 71 | classifier = classifier_factory().fit(data.data[train,:], data.target[train]) 72 | accuracy = classifier.score(test.data, test.target) 73 | print "%d : %f" % (len(train), accuracy) 74 | accuracies.append((len(train), accuracy)) 75 | 76 | return accuracies 77 | 78 | def main(): 79 | parser = optparse.OptionParser() 80 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 81 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 82 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 83 | 84 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 85 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=30) 86 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 87 | 88 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 89 | (opt, args) = parser.parse_args() 90 | 91 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 92 | print "(train size, voca size) : (%d, %d)" % data.data.shape 93 | 94 | if opt.class_size: 95 | index = data.target < opt.class_size 96 | a = data.data.toarray()[index, :] 97 | data.data = scipy.sparse.csr_matrix(a) 98 | data.target = data.target[index] 99 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 100 | 101 | 102 | N_CLASS = data.target.max() + 1 103 
| if opt.training: 104 | train = [int(x) for x in opt.training.split(",")] 105 | else: 106 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 107 | print "indexes of training set : ", ",".join("%d" % x for x in train) 108 | 109 | pool = range(data.data.shape[0]) 110 | for x in train: pool.remove(x) 111 | 112 | classifier_factory = None 113 | if opt.logistic_l1: 114 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 115 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 116 | elif opt.logistic_l2: 117 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 118 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 119 | else: 120 | pass 121 | 122 | if classifier_factory: 123 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 124 | print "(test size, voca size) : (%d, %d)" % test.data.shape 125 | if opt.class_size: 126 | index = test.target < opt.class_size 127 | a = test.data.toarray()[index, :] 128 | test.data = scipy.sparse.csr_matrix(a) 129 | test.target = test.target[index] 130 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 131 | 132 | print "score for all data: %f" % classifier_factory().fit(data.data, data.target).score(test.data, test.target) 133 | 134 | results = activelearn(data, test, train, pool, classifier_factory, opt.max_train, opt.seed) 135 | 136 | for x in results: 137 | print "%d\t%f" % x 138 | 139 | if __name__ == "__main__": 140 | main() 141 | -------------------------------------------------------------------------------- /activelearn/oracle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning for 20 newsgroups with Oracle and testset 5 | 6 | # This code is available under the MIT License. 7 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
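#
# Summary of the code below: an "oracle" upper-bound baseline. Each round it
# tentatively adds candidate pool points to the training set, retrains, and keeps
# the point that maximizes accuracy on the held-out test set itself (so it peeks
# at the test labels by design). -T restricts the search to a random subsample of
# the pool per round; the default (-1) searches the whole pool.
#
# Usage sketch (hypothetical invocation; the options are defined in main() below):
#   python oracle.py --lr2 1.0 -K 4 -n 30 -T 100 --seed 0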
8 | 9 | import optparse 10 | import numpy 11 | import scipy.sparse 12 | import sklearn.datasets 13 | from sklearn.linear_model import LogisticRegression 14 | from sklearn.naive_bayes import MultinomialNB 15 | 16 | def activelearn(data, test, train, pool, classifier_factory, max_train, n_candidate, seed): 17 | numpy.random.seed(seed) 18 | 19 | # copy initial indexes of training and pool 20 | train = list(train) 21 | pool = list(pool) 22 | 23 | accuracies = [] 24 | Z = len(test.target) 25 | K = data.target.max() + 1 26 | while len(train) < max_train: 27 | if len(accuracies) > 0: 28 | i_star = None 29 | max_score = 0.0 30 | candidate = pool 31 | if 0 < n_candidate < len(pool): 32 | numpy.random.shuffle(pool) 33 | candidate = pool[:n_candidate] 34 | for i, x in enumerate(candidate): 35 | t = train + [x] 36 | s = classifier_factory().fit(data.data[t, :], data.target[t]).score(test.data, test.target) 37 | if max_score < s: 38 | print "%d\t%f" % (x, s) 39 | max_score = s 40 | i_star = i 41 | train.append(pool[i_star]) 42 | del pool[i_star] 43 | 44 | classifier = classifier_factory().fit(data.data[train,:], data.target[train]) 45 | accuracy = classifier.score(test.data, test.target) 46 | print "%d : %f" % (len(train), accuracy) 47 | accuracies.append((len(train), accuracy)) 48 | 49 | return accuracies 50 | 51 | def main(): 52 | parser = optparse.OptionParser() 53 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 54 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 55 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 56 | 57 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 58 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=30) 59 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 60 | parser.add_option("-T", dest="candidate", type="int", help="candidate size", default=-1) 61 | 62 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 63 | (opt, args) = parser.parse_args() 64 | 65 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 66 | print "(train size, voca size) : (%d, %d)" % data.data.shape 67 | 68 | if opt.class_size: 69 | index = data.target < opt.class_size 70 | a = data.data.toarray()[index, :] 71 | data.data = scipy.sparse.csr_matrix(a) 72 | data.target = data.target[index] 73 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 74 | 75 | 76 | N_CLASS = data.target.max() + 1 77 | if opt.training: 78 | train = [int(x) for x in opt.training.split(",")] 79 | else: 80 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 81 | print "indexes of training set : ", ",".join("%d" % x for x in train) 82 | 83 | pool = range(data.data.shape[0]) 84 | for x in train: pool.remove(x) 85 | 86 | classifier_factory = None 87 | if opt.logistic_l1: 88 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 89 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 90 | elif opt.logistic_l2: 91 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 92 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 93 | elif opt.naive_bayes: 94 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 95 | 
classifier_factory = lambda: MultinomialNB(alpha=opt.naive_bayes) 96 | 97 | if classifier_factory: 98 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 99 | print "(test size, voca size) : (%d, %d)" % test.data.shape 100 | if opt.class_size: 101 | index = test.target < opt.class_size 102 | a = test.data.toarray()[index, :] 103 | test.data = scipy.sparse.csr_matrix(a) 104 | test.target = test.target[index] 105 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 106 | 107 | print "score for all data: %f" % classifier_factory().fit(data.data, data.target).score(test.data, test.target) 108 | 109 | results = activelearn(data, test, train, pool, classifier_factory, opt.max_train, opt.candidate, opt.seed) 110 | 111 | for x in results: 112 | print "%d\t%f" % x 113 | 114 | if __name__ == "__main__": 115 | main() 116 | -------------------------------------------------------------------------------- /activelearn/qbc4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Query-By-Committee) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | import optparse 9 | import numpy 10 | import sklearn.datasets 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.naive_bayes import MultinomialNB 13 | 14 | def activelearn(results, data, test, strategy, train, pool, classifier_factories, max_train, densities): 15 | print strategy 16 | 17 | # copy initial indexes of training and pool 18 | train = list(train) 19 | pool = list(pool) 20 | 21 | accuracies = [] 22 | Z = len(test.target) 23 | while len(train) < max_train: 24 | if len(accuracies) > 0: 25 | if strategy == "random": 26 | x = numpy.random.randint(len(pool)) 27 | else: 28 | if strategy == "vote entropy": 29 | p = numpy.array([c.predict(data.data[pool,:]) for c in classifiers]) 30 | # This is equivalent to Vote Entropy when # of classifiers = 3 31 | x = ((p[:,0:2]==p[:,1:3]).sum(axis=1) + (p[:,0]==p[:,2])) 32 | elif strategy == "average KL": 33 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 34 | pc = p.mean(axis=0) # N * K 35 | x = numpy.nan_to_num(p * numpy.log(pc / p)).sum(axis=2).sum(axis=0) 36 | elif strategy == "qbc+margin sampling": 37 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 38 | pc = p.mean(axis=0) # N * K 39 | pc.sort(axis=1) 40 | x = pc[:,-1] - pc[:,-2] 41 | if densities != None: x *= densities[pool] 42 | x = x.argmin() 43 | train.append(pool[x]) 44 | del pool[x] 45 | 46 | classifiers = [f().fit(data.data[train,:], data.target[train]) for f in classifier_factories] 47 | 48 | predict = sum(c.predict_proba(test.data) for c in classifiers) 49 | correct = (predict.argmax(axis=1) == test.target).sum() 50 | accuracy = float(correct) / Z 51 | print "%s %d : %d / %d = %f" % (strategy, len(train), correct, Z, accuracy) 52 | accuracies.append(accuracy) 53 | 54 | results.append((strategy, accuracies)) 55 | 56 | def main(): 57 | parser = optparse.OptionParser() 58 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 59 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 60 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 61 | 
62 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=300) 63 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 64 | 65 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 66 | 67 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 68 | (opt, args) = parser.parse_args() 69 | numpy.random.seed(opt.seed) 70 | 71 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 72 | print "(train size, voca size) : (%d, %d)" % data.data.shape 73 | 74 | N_CLASS = data.target.max() + 1 75 | if opt.training: 76 | train = [int(x) for x in opt.training.split(",")] 77 | else: 78 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 79 | print "indexes of training set : ", ",".join("%d" % x for x in train) 80 | 81 | pool = range(data.data.shape[0]) 82 | for x in train: pool.remove(x) 83 | 84 | classifier_factories = [] 85 | if opt.logistic_l1: 86 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 87 | classifier_factories.append(lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1)) 88 | if opt.logistic_l2: 89 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 90 | classifier_factories.append(lambda: LogisticRegression(C=opt.logistic_l2)) 91 | if opt.naive_bayes: 92 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 93 | classifier_factories.append(lambda: MultinomialNB(alpha=opt.naive_bayes)) 94 | 95 | if len(classifier_factories) >= 2: 96 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 97 | print "(test size, voca size) : (%d, %d)" % test.data.shape 98 | 99 | densities = None 100 | if opt.beta > 0: 101 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 102 | 103 | methods = ["random", "vote entropy", "average KL", "qbc+margin sampling", ] 104 | results = [] 105 | for x in methods: 106 | activelearn(results, data, test, x, train, pool, classifier_factories, opt.max_train, densities) 107 | 108 | print "\t%s" % "\t".join(x[0] for x in results) 109 | d = len(train) 110 | for i in xrange(len(results[0][1])): 111 | print "%d\t%s" % (i+d, "\t".join("%f" % x[1][i] for x in results)) 112 | 113 | if __name__ == "__main__": 114 | main() 115 | -------------------------------------------------------------------------------- /activelearn/qbc_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Query-By-Committee) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
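#
# Summary of the code below: query-by-committee, where the committee is built from
# whichever of --lr1/--lr2/--nb are given (at least two are required). Strategies
# compared: random, vote entropy (the branch assumes a committee of exactly three
# members, as noted in activelearn), average KL divergence to the committee mean,
# and committee margin sampling. With -b > 0 the query score is weighted by an
# information-density term (each document's mean similarity to the corpus, raised
# to beta). Each of the -N trials appends one row per strategy to
# output_qbc_<max_train>.txt.
#
# Usage sketch (hypothetical invocation; the options are defined in main() below):
#   python qbc_dist.py --lr1 0.1 --lr2 1.0 --nb 0.01 -n 300 -N 10 -b 1.0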
7 | 8 | import optparse 9 | import numpy 10 | import sklearn.datasets 11 | from sklearn.linear_model import LogisticRegression 12 | from sklearn.naive_bayes import MultinomialNB 13 | 14 | def activelearn(data, test, strategy, train, pool, classifier_factories, max_train, densities): 15 | # copy initial indexes of training and pool 16 | train = list(train) 17 | pool = list(pool) 18 | 19 | accuracies = [] 20 | Z = len(test.target) 21 | while len(train) < max_train: 22 | if len(accuracies) > 0: 23 | if strategy == "random": 24 | x = numpy.random.randint(len(pool)) 25 | else: 26 | if strategy == "vote entropy": 27 | p = numpy.array([c.predict(data.data[pool,:]) for c in classifiers]) 28 | # This is equivalent to Vote Entropy when # of classifiers = 3 29 | x = ((p[:,0:2]==p[:,1:3]).sum(axis=1) + (p[:,0]==p[:,2])) 30 | elif strategy == "average KL": 31 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 32 | pc = p.mean(axis=0) # N * K 33 | x = numpy.nan_to_num(p * numpy.log(pc / p)).sum(axis=2).sum(axis=0) 34 | elif strategy == "qbc+margin sampling": 35 | p = numpy.array([c.predict_proba(data.data[pool,:]) for c in classifiers]) # 3 * N * K 36 | pc = p.mean(axis=0) # N * K 37 | pc.sort(axis=1) 38 | x = pc[:,-1] - pc[:,-2] 39 | if densities != None: x *= densities[pool] 40 | x = x.argmin() 41 | train.append(pool[x]) 42 | del pool[x] 43 | 44 | classifiers = [f().fit(data.data[train,:], data.target[train]) for f in classifier_factories] 45 | 46 | predict = sum(c.predict_proba(test.data) for c in classifiers) 47 | correct = (predict.argmax(axis=1) == test.target).sum() 48 | accuracy = float(correct) / Z 49 | print "%d : %d / %d = %f" % (len(train), correct, Z, accuracy) 50 | accuracies.append(accuracy) 51 | return accuracies 52 | 53 | def main(): 54 | parser = optparse.OptionParser() 55 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 56 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 57 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 58 | 59 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=300) 60 | parser.add_option("-N", dest="trying", type="int", help="number of trying", default=100) 61 | 62 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 63 | 64 | (opt, args) = parser.parse_args() 65 | 66 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 67 | print "(train size, voca size) : (%d, %d)" % data.data.shape 68 | 69 | N_CLASS = data.target.max() + 1 70 | 71 | classifier_factories = [] 72 | if opt.logistic_l1: 73 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 74 | classifier_factories.append(lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1)) 75 | if opt.logistic_l2: 76 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 77 | classifier_factories.append(lambda: LogisticRegression(C=opt.logistic_l2)) 78 | if opt.naive_bayes: 79 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 80 | classifier_factories.append(lambda: MultinomialNB(alpha=opt.naive_bayes)) 81 | 82 | if len(classifier_factories) >= 2: 83 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 84 | print "(test size, voca size) : (%d, %d)" % test.data.shape 85 | 86 | densities = None 87 | if opt.beta > 
0: 88 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 89 | 90 | methods = ["random", "vote entropy", "average KL", "qbc+margin sampling", ] 91 | results = [] 92 | for n in xrange(opt.trying): 93 | for method in methods: 94 | print "%s : %d" % (method, n) 95 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 96 | pool = range(data.data.shape[0]) 97 | for x in train: pool.remove(x) 98 | 99 | results = activelearn(data, test, method, train, pool, classifier_factories, opt.max_train, densities) 100 | 101 | d = len(train) 102 | with open("output_qbc_%d.txt" % opt.max_train, "ab") as f: 103 | f.write("%s\t%s\n" % (method, "\t".join("%f" % x for x in results))) 104 | 105 | 106 | if __name__ == "__main__": 107 | main() 108 | -------------------------------------------------------------------------------- /activelearn/uncert_dist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling and Information Density) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | import optparse 9 | import numpy 10 | import scipy.sparse 11 | import sklearn.datasets 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.naive_bayes import MultinomialNB 14 | 15 | def activelearn(results, data, test, strategy, train, pool, classifier_factory, max_train, densities): 16 | # copy initial indexes of training and pool 17 | train = list(train) 18 | pool = list(pool) 19 | 20 | accuracies = [] 21 | while len(train) < max_train: 22 | if len(accuracies) > 0: 23 | if strategy == "random": 24 | x = numpy.random.randint(len(pool)) 25 | else: 26 | predict = cl.predict_proba(data.data[pool,:]) 27 | if strategy == "least confident": 28 | x = predict.max(axis=1)-1 29 | elif strategy == "margin sampling": 30 | predict.sort(axis=1) 31 | x = (predict[:,-1] - predict[:,-2]) 32 | elif strategy == "entropy-based": 33 | x = numpy.nan_to_num(predict * numpy.log(predict)).sum(axis=1) 34 | if densities != None: x *= densities[pool] 35 | x = x.argmin() 36 | train.append(pool[x]) 37 | del pool[x] 38 | 39 | cl = classifier_factory() 40 | cl.fit(data.data[train,:], data.target[train]) 41 | accuracy = cl.score(test.data, test.target) 42 | print "%d : %f" % (len(train), accuracy) 43 | accuracies.append(accuracy) 44 | 45 | results.append((strategy, accuracies)) 46 | 47 | 48 | def main(): 49 | parser = optparse.OptionParser() 50 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 51 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 52 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 53 | 54 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 55 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=100) 56 | parser.add_option("-N", dest="trying", type="int", help="number of trying", default=100) 57 | 58 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 59 | (opt, args) = parser.parse_args() 60 | 61 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 62 | print "(train size, voca size) : (%d, %d)" % data.data.shape 63 | 64 | if opt.class_size: 65 | 
index = data.target < opt.class_size 66 | a = data.data.toarray()[index, :] 67 | data.data = scipy.sparse.csr_matrix(a) 68 | data.target = data.target[index] 69 | print "(shrinked train size, voca size) : (%d, %d)" % data.data.shape 70 | 71 | classifier_factory = clz = None 72 | if opt.logistic_l1: 73 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 74 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 75 | clz = "lrl1" 76 | elif opt.logistic_l2: 77 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 78 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 79 | clz = "lrl2" 80 | elif opt.naive_bayes: 81 | print "Naive Bayes Classifier : alpha = %f" % opt.naive_bayes 82 | classifier_factory = lambda: MultinomialNB(alpha=opt.naive_bayes) 83 | clz = "nb" 84 | 85 | if classifier_factory: 86 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 87 | print "(test size, voca size) : (%d, %d)" % test.data.shape 88 | if opt.class_size: 89 | index = test.target < opt.class_size 90 | a = test.data.toarray()[index, :] 91 | test.data = scipy.sparse.csr_matrix(a) 92 | test.target = test.target[index] 93 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 94 | 95 | densities = None 96 | if opt.beta > 0: 97 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 98 | 99 | N_CLASS = data.target.max() + 1 100 | for method in ["random", "least confident", "margin sampling", "entropy-based"]: 101 | results = [] 102 | for n in xrange(opt.trying): 103 | print "%s : %d" % (method, n) 104 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 105 | pool = range(data.data.shape[0]) 106 | for x in train: pool.remove(x) 107 | 108 | activelearn(results, data, test, method, train, pool, classifier_factory, opt.max_train, densities) 109 | 110 | d = len(train) 111 | with open("output_%s_%s.txt" % (method, clz), "wb") as f: 112 | f.write(method) 113 | f.write("\n") 114 | for i in xrange(len(results[0][1])): 115 | f.write("%d\t%s\n" % (i+d, "\t".join("%f" % x[1][i] for x in results))) 116 | 117 | 118 | if __name__ == "__main__": 119 | main() 120 | -------------------------------------------------------------------------------- /activelearn/uncertain.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | import sys, numpy 9 | from sklearn.linear_model import LogisticRegression 10 | from sklearn import cross_validation 11 | 12 | import optparse 13 | parser = optparse.OptionParser() 14 | #parser.add_option("-c", dest="corpus", help="corpus module name under nltk.corpus (e.g. 
brown, reuters)", default='brown') 15 | #parser.add_option("-r", dest="testrate", type="float", help="rate of test dataset in corpus", default=0.1) 16 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 17 | (opt, args) = parser.parse_args() 18 | numpy.random.seed(opt.seed) 19 | 20 | output = False 21 | 22 | def activelearn(data, label, strategy): 23 | #print strategy 24 | 25 | N, D = data.shape 26 | train = list(range(D)) 27 | pool = range(D,N) 28 | predict = None 29 | 30 | for i in xrange(30-D): 31 | if predict != None: 32 | if strategy == "random": 33 | x = numpy.random.randint(len(pool)) 34 | elif strategy == "least confident": 35 | x = predict.max(axis=1).argmin() 36 | elif strategy == "margin sampling": 37 | predict.sort(axis=1) 38 | x = (numpy.exp(predict[:,-1])-numpy.exp(predict[:,-2])).argmin() 39 | elif strategy == "entropy-based": 40 | x = numpy.nan_to_num(numpy.exp(predict)*predict).sum(axis=1).argmin() 41 | train.append(pool[x]) 42 | del pool[x] 43 | 44 | cl = LogisticRegression() 45 | #cl = LogisticRegression(C=0.1, penalty="l1") 46 | cl.fit(data[train,:], label[train]) 47 | predict = cl.predict_log_proba(data[pool,:]) 48 | log_likelihood = 0 49 | correct = 0 50 | for n, logprob in zip(pool,predict): 51 | c = label[n] 52 | log_likelihood += logprob[c] 53 | if c == logprob.argmax(): correct += 1 54 | 55 | Z = len(pool) 56 | precision = float(correct) / Z 57 | perplexity = numpy.exp(-log_likelihood / Z) 58 | if output: 59 | print "%d : %d / %d = %f, %f" % (len(train), correct, Z, precision, perplexity) 60 | 61 | #print data[train,:], label[train] 62 | 63 | if D==2: 64 | import matplotlib.pyplot as plt 65 | plt.plot(data[pool,0], data[pool,1], 'x', color="red") 66 | plt.plot(data[train,0], data[train,1], 'o', color="red") 67 | plt.title(strategy) 68 | plt.show() 69 | 70 | return precision, perplexity 71 | 72 | 73 | D=10 74 | N=1000 75 | presicions = [] 76 | perplexities = [] 77 | for i in xrange(100): 78 | data = numpy.random.randn(N,D) 79 | label = numpy.zeros(N, dtype=int) 80 | for n in xrange(N): 81 | c = n % D 82 | data[n, c] += 2 83 | label[n] = c 84 | 85 | result = [] 86 | result.append(activelearn(data, label, "random")) 87 | result.append(activelearn(data, label, "least confident")) 88 | result.append(activelearn(data, label, "margin sampling")) 89 | result.append(activelearn(data, label, "entropy-based")) 90 | 91 | x = numpy.array(result) 92 | presicions.append(x[:,0]) 93 | perplexities.append(x[:,1]) 94 | 95 | print numpy.mean(presicions, axis=0) 96 | 97 | -------------------------------------------------------------------------------- /activelearn/uncertain2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
7 | 8 | import re, collections, numpy 9 | from nltk.corpus import movie_reviews 10 | from nltk.stem import WordNetLemmatizer 11 | 12 | voca = dict() 13 | vocalist = [] 14 | doclist = [] 15 | labels = [] 16 | realphabet = re.compile('^[a-z]+$') 17 | wnl = WordNetLemmatizer() 18 | for id in movie_reviews.fileids(): 19 | doc = collections.defaultdict(int) 20 | for w in movie_reviews.words(id): 21 | if realphabet.match(w): 22 | w = wnl.lemmatize(w) 23 | if w not in voca: 24 | voca[w] = len(vocalist) 25 | vocalist.append(w) 26 | doc[voca[w]] += 1 27 | if len(doc) > 0: doclist.append(doc) 28 | cat = movie_reviews.categories(id)[0] 29 | labels.append(1 if cat == "pos" else 0) 30 | print len(voca) 31 | 32 | labels = numpy.array(labels) 33 | data = numpy.zeros((len(doclist), len(voca))) 34 | for j, doc in enumerate(doclist): 35 | for i, c in doc.iteritems(): 36 | data[j, i] = c 37 | 38 | 39 | from sklearn.feature_extraction.text import TfidfTransformer 40 | transformer = TfidfTransformer(norm=None) 41 | data = transformer.fit_transform(data) 42 | 43 | 44 | from sklearn import cross_validation 45 | 46 | from sklearn.linear_model import LogisticRegression 47 | cl = LogisticRegression() 48 | 49 | from sklearn.naive_bayes import MultinomialNB 50 | #cl = MultinomialNB() 51 | 52 | from sklearn.naive_bayes import BernoulliNB 53 | #cl = BernoulliNB() 54 | 55 | from sklearn.svm import SVC 56 | #cl = SVC() 57 | 58 | from sklearn.ensemble import RandomForestClassifier 59 | #cl = RandomForestClassifier() 60 | 61 | 62 | print cross_validation.cross_val_score(cl, data, labels, cv=10) 63 | 64 | 65 | 66 | """ 67 | import sys, numpy 68 | from sklearn.linear_model import LogisticRegression 69 | from sklearn import cross_validation 70 | 71 | import optparse 72 | parser = optparse.OptionParser() 73 | #parser.add_option("-c", dest="corpus", help="corpus module name under nltk.corpus (e.g. 
brown, reuters)", default='brown') 74 | #parser.add_option("-r", dest="testrate", type="float", help="rate of test dataset in corpus", default=0.1) 75 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 76 | (opt, args) = parser.parse_args() 77 | numpy.random.seed(opt.seed) 78 | 79 | output = False 80 | 81 | def activelearn(data, label, strategy): 82 | #print strategy 83 | 84 | N, D = data.shape 85 | train = list(range(D)) 86 | pool = range(D,N) 87 | predict = None 88 | 89 | for i in xrange(30-D): 90 | if predict != None: 91 | if strategy == "random": 92 | x = numpy.random.randint(len(pool)) 93 | elif strategy == "least confident": 94 | x = predict.max(axis=1).argmin() 95 | elif strategy == "margin sampling": 96 | predict.sort(axis=1) 97 | x = (numpy.exp(predict[:,-1])-numpy.exp(predict[:,-2])).argmin() 98 | elif strategy == "entropy-based": 99 | x = numpy.nan_to_num(numpy.exp(predict)*predict).sum(axis=1).argmin() 100 | train.append(pool[x]) 101 | del pool[x] 102 | 103 | cl = LogisticRegression() 104 | #cl = LogisticRegression(C=0.1, penalty="l1") 105 | cl.fit(data[train,:], label[train]) 106 | predict = cl.predict_log_proba(data[pool,:]) 107 | log_likelihood = 0 108 | correct = 0 109 | for n, logprob in zip(pool,predict): 110 | c = label[n] 111 | log_likelihood += logprob[c] 112 | if c == logprob.argmax(): correct += 1 113 | 114 | Z = len(pool) 115 | precision = float(correct) / Z 116 | perplexity = numpy.exp(-log_likelihood / Z) 117 | if output: 118 | print "%d : %d / %d = %f, %f" % (len(train), correct, Z, precision, perplexity) 119 | 120 | #print data[train,:], label[train] 121 | 122 | if D==2: 123 | import matplotlib.pyplot as plt 124 | plt.plot(data[pool,0], data[pool,1], 'x', color="red") 125 | plt.plot(data[train,0], data[train,1], 'o', color="red") 126 | plt.title(strategy) 127 | plt.show() 128 | 129 | return precision, perplexity 130 | 131 | 132 | D=10 133 | N=1000 134 | presicions = [] 135 | perplexities = [] 136 | for i in xrange(100): 137 | data = numpy.random.randn(N,D) 138 | label = numpy.zeros(N, dtype=int) 139 | for n in xrange(N): 140 | c = n % D 141 | data[n, c] += 2 142 | label[n] = c 143 | 144 | result = [] 145 | result.append(activelearn(data, label, "random")) 146 | result.append(activelearn(data, label, "least confident")) 147 | result.append(activelearn(data, label, "margin sampling")) 148 | result.append(activelearn(data, label, "entropy-based")) 149 | 150 | x = numpy.array(result) 151 | presicions.append(x[:,0]) 152 | perplexities.append(x[:,1]) 153 | 154 | print numpy.mean(presicions, axis=0) 155 | 156 | """ 157 | -------------------------------------------------------------------------------- /activelearn/uncertain3.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
7 | 8 | import numpy 9 | import dataset 10 | from sklearn.linear_model import LogisticRegression 11 | 12 | categories = ['crude', 'money-fx', 'trade', 'interest', 'ship', 'wheat', 'corn'] 13 | doclist, labels, voca, vocalist = dataset.load(categories) 14 | print "document size : %d" % len(doclist) 15 | print "vocaburary size : %d" % len(voca) 16 | 17 | data = numpy.zeros((len(doclist), len(voca))) 18 | for j, doc in enumerate(doclist): 19 | for i, c in doc.iteritems(): 20 | data[j, i] = c 21 | 22 | def activelearn(data, label, strategy, train): 23 | print strategy 24 | 25 | N, D = data.shape 26 | train = list(train) # copy initial indexes of training 27 | pool = range(N) 28 | for x in train: pool.remove(x) 29 | 30 | predict = None 31 | precisions = [] 32 | while len(train) < 300: 33 | if predict != None: 34 | if strategy == "random": 35 | x = numpy.random.randint(len(pool)) 36 | elif strategy == "least confident": 37 | x = predict.max(axis=1).argmin() 38 | elif strategy == "margin sampling": 39 | predict.sort(axis=1) 40 | x = (numpy.exp(predict[:,-1])-numpy.exp(predict[:,-2])).argmin() 41 | elif strategy == "entropy-based": 42 | x = numpy.nan_to_num(numpy.exp(predict)*predict).sum(axis=1).argmin() 43 | train.append(pool[x]) 44 | del pool[x] 45 | 46 | cl = LogisticRegression() 47 | cl.fit(data[train,:], label[train]) 48 | predict = cl.predict_log_proba(data[pool,:]) 49 | log_likelihood = 0 50 | correct = 0 51 | for n, logprob in zip(pool,predict): 52 | c = label[n] 53 | log_likelihood += logprob[c] 54 | if c == logprob.argmax(): correct += 1 55 | 56 | Z = len(pool) 57 | precision = float(correct) / Z 58 | perplexity = numpy.exp(-log_likelihood / Z) 59 | print "%d : %d / %d = %f, %f" % (len(train), correct, Z, precision, perplexity) 60 | 61 | precisions.append(precision) 62 | 63 | return precisions 64 | 65 | N_CLASS = labels.max() + 1 66 | train = [numpy.random.choice((labels==k).nonzero()[0]) for k in xrange(N_CLASS)] 67 | 68 | methods = ["random", "least confident", "margin sampling", "entropy-based"] 69 | results = [] 70 | for x in methods: 71 | results.append(activelearn(data, labels, x, train)) 72 | print "\t%s" % "\t".join(methods) 73 | d = len(categories) 74 | for i in xrange(len(results[0])): 75 | print "%d\t%s" % (i+d, "\t".join("%f" % x[i] for x in results)) 76 | -------------------------------------------------------------------------------- /activelearn/uncertain4.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Active Learning (Uncertainly Sampling and Information Density) for 20 newsgroups 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
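#
# Summary of the code below: uncertainty sampling on the vectorized 20 newsgroups
# data with four query strategies (-r random, -l least confident, -m margin
# sampling, -e entropy-based, or -a for all of them), optionally weighted by an
# information-density term (-b beta). If neither --lr1 nor --lr2 is given, a
# multinomial naive Bayes classifier is used with alpha = --nb (default 0.01).
#
# Usage sketch (hypothetical invocation; the options are defined in main() below):
#   python uncertain4.py -a --lr2 1.0 -K 4 -n 300 -b 1.0 --seed 0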
7 | 8 | import optparse 9 | import numpy 10 | import scipy.sparse 11 | import sklearn.datasets 12 | from sklearn.linear_model import LogisticRegression 13 | from sklearn.naive_bayes import MultinomialNB 14 | 15 | def activelearn(results, data, test, strategy, train, pool, classifier_factory, max_train, densities): 16 | print strategy 17 | 18 | # copy initial indexes of training and pool 19 | train = list(train) 20 | pool = list(pool) 21 | 22 | accuracies = [] 23 | while len(train) < max_train: 24 | if len(accuracies) > 0: 25 | if strategy == "random": 26 | x = numpy.random.randint(len(pool)) 27 | else: 28 | predict = cl.predict_proba(data.data[pool,:]) 29 | if strategy == "least confident": 30 | x = predict.max(axis=1)-1 31 | elif strategy == "margin sampling": 32 | predict.sort(axis=1) 33 | x = (predict[:,-1] - predict[:,-2]) 34 | elif strategy == "entropy-based": 35 | x = numpy.nan_to_num(predict * numpy.log(predict)).sum(axis=1) 36 | if densities != None: x *= densities[pool] 37 | x = x.argmin() 38 | train.append(pool[x]) 39 | del pool[x] 40 | 41 | cl = classifier_factory() 42 | cl.fit(data.data[train,:], data.target[train]) 43 | accuracy = cl.score(test.data, test.target) 44 | print "%s %d : %f" % (strategy, len(train), accuracy) 45 | accuracies.append(accuracy) 46 | 47 | results.append((strategy, accuracies)) 48 | 49 | 50 | def main(): 51 | parser = optparse.OptionParser() 52 | parser.add_option("-r", dest="method_random", action="store_true", help="use random sampling", default=False) 53 | parser.add_option("-l", dest="method_least", action="store_true", help="use least confident", default=False) 54 | parser.add_option("-m", dest="method_margin", action="store_true", help="use margin sampling", default=False) 55 | parser.add_option("-e", dest="method_entropy", action="store_true", help="use entropy-based method", default=False) 56 | parser.add_option("-a", dest="method_all", action="store_true", help="use all methods", default=False) 57 | 58 | parser.add_option("--nb", dest="naive_bayes", type="float", help="use naive bayes classifier", default=None) 59 | parser.add_option("--lr1", dest="logistic_l1", type="float", help="use logistic regression with l1-regularity", default=None) 60 | parser.add_option("--lr2", dest="logistic_l2", type="float", help="use logistic regression with l2-regularity", default=None) 61 | 62 | parser.add_option("-K", dest="class_size", type="int", help="number of class", default=None) 63 | parser.add_option("-n", dest="max_train", type="int", help="max size of training", default=300) 64 | parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 65 | 66 | parser.add_option("-b", dest="beta", type="float", help="density importance", default=0) 67 | 68 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 69 | (opt, args) = parser.parse_args() 70 | numpy.random.seed(opt.seed) 71 | 72 | data = sklearn.datasets.fetch_20newsgroups_vectorized() 73 | print "(train size, voca size) : (%d, %d)" % data.data.shape 74 | 75 | N_CLASS = data.target.max() + 1 76 | if opt.training: 77 | train = [int(x) for x in opt.training.split(",")] 78 | else: 79 | train = [numpy.random.choice((data.target==k).nonzero()[0]) for k in xrange(N_CLASS)] 80 | print "indexes of training set : ", ",".join("%d" % x for x in train) 81 | if opt.class_size: 82 | index = data.target < opt.class_size 83 | a = data.data.toarray()[index, :] 84 | data.data = scipy.sparse.csr_matrix(a) 85 | data.target = data.target[index] 86 | print "(shrinked train 
size, voca size) : (%d, %d)" % data.data.shape 87 | 88 | pool = range(data.data.shape[0]) 89 | for x in train: pool.remove(x) 90 | 91 | methods = [] 92 | if opt.method_all: 93 | methods = ["random", "least confident", "margin sampling", "entropy-based"] 94 | else: 95 | if opt.method_random: methods.append("random") 96 | if opt.method_least: methods.append("least confident") 97 | if opt.method_margin: methods.append("margin sampling") 98 | if opt.method_entropy: methods.append("entropy-based") 99 | 100 | if len(methods) > 0: 101 | test = sklearn.datasets.fetch_20newsgroups_vectorized(subset='test') 102 | print "(test size, voca size) : (%d, %d)" % test.data.shape 103 | if opt.class_size: 104 | index = test.target < opt.class_size 105 | a = test.data.toarray()[index, :] 106 | test.data = scipy.sparse.csr_matrix(a) 107 | test.target = test.target[index] 108 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 109 | 110 | densities = None 111 | if opt.beta > 0: 112 | densities = (data.data * data.data.T).mean(axis=0).A[0] ** opt.beta 113 | 114 | if opt.logistic_l1: 115 | print "Logistic Regression with L1-regularity : C = %f" % opt.logistic_l1 116 | classifier_factory = lambda: LogisticRegression(penalty='l1', C=opt.logistic_l1) 117 | elif opt.logistic_l2: 118 | print "Logistic Regression with L2-regularity : C = %f" % opt.logistic_l2 119 | classifier_factory = lambda: LogisticRegression(C=opt.logistic_l2) 120 | else: 121 | a = opt.naive_bayes or 0.01 122 | print "Naive Bayes Classifier : alpha = %f" % a 123 | classifier_factory = lambda: MultinomialNB(alpha=a) 124 | 125 | results = [] 126 | for x in methods: 127 | activelearn(results, data, test, x, train, pool, classifier_factory, opt.max_train, densities) 128 | 129 | print "\t%s" % "\t".join(x[0] for x in results) 130 | d = len(train) 131 | for i in xrange(len(results[0][1])): 132 | print "%d\t%s" % (i+d, "\t".join("%f" % x[1][i] for x in results)) 133 | 134 | 135 | if __name__ == "__main__": 136 | main() 137 | -------------------------------------------------------------------------------- /clustering/irm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Infinite Relational Model 5 | # via 石井健一郎・上田修功 "続・わかりやすいパターン認識" Chapter 14 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 
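# (The reference above is Ken-ichiro Ishii and Naonori Ueda,
# "Zoku: Wakariyasui Pattern Ninshiki" -- roughly "Continued: Easy-to-Understand
# Pattern Recognition" -- Chapter 14.)
#
# Summary of the code below: collapsed Gibbs sampling for the IRM. Row and column
# cluster assignments (s1, s2) follow Chinese-restaurant-process priors with
# concentration alpha, so an object joins an existing cluster i with probability
# proportional to its size n_i and opens a new cluster with probability
# proportional to alpha; the relation entries use a conjugate Beta(a, b) /
# Bernoulli likelihood (betaln), while the PoissonIRM subclass swaps in a
# Gamma(a, b) / Poisson likelihood (gammaln). The __main__ demo samples a
# block-structured binary matrix, shuffles its rows and columns, and keeps the
# clustering with the highest posterior found over 200 sweeps.
#
# Worked example of the prior term (hypothetical numbers): with alpha = 1.0 and
# existing row clusters of sizes (3, 1), the prior factors of the Gibbs weights
# are proportional to 3, 1, and 1 (new cluster) before the likelihood terms are
# multiplied in.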
9 | 10 | import numpy 11 | from scipy.special import betaln, gammaln 12 | 13 | def log_ps(a, n, N): 14 | c = len(n) 15 | return c * numpy.log(a) + gammaln(n).sum() - gammaln(a + N) + gammaln(a) - gammaln(c+1) 16 | 17 | class IRM(object): 18 | def __init__(self, data, alpha, a, b): 19 | self.R = data 20 | self.K, self.L = data.shape 21 | self.alpha = alpha 22 | self.a = a 23 | self.b = b 24 | self.s1 = numpy.zeros(self.K, dtype=int) - 1 25 | self.s2 = numpy.zeros(self.L, dtype=int) - 1 26 | self.n1 = [] 27 | self.n2 = [] 28 | 29 | def update(self): 30 | for k in range(self.K): 31 | p = self.update_cluster(k, self.s1, self.s2, self.n1, self.n2, self.R) 32 | for l in range(self.L): 33 | p = self.update_cluster(l, self.s2, self.s1, self.n2, self.n1, self.R.T) 34 | 35 | def update_cluster(self, k, s1, s2, n1, n2, R): 36 | now_i = s1[k] 37 | s1[k] = -1 38 | if now_i >= 0: 39 | n1[now_i] -= 1 40 | if n1[now_i] == 0: 41 | n1.pop(now_i) 42 | s1[s1>now_i] -= 1 43 | 44 | c1 = len(n1) 45 | c2 = len(n2) 46 | m1, m0, m1k, m0k = self.count_nij(R, s1, s2, c1, c2) 47 | 48 | logps = numpy.zeros(c1+1) 49 | for i in range(c1): 50 | p = numpy.log(n1[i]) 51 | p += self.logZ(self.a+m1[i]+m1k, self.b+m0[i]+m0k).sum() 52 | p -= self.logZ(self.a+m1[i], self.b+m0[i]).sum() 53 | logps[i] = p 54 | p = numpy.log(self.alpha) 55 | p += self.logZ(self.a+m1k, self.b+m0k).sum() 56 | p -= c2 * self.logZ(self.a, self.b) 57 | logps[c1] = p 58 | 59 | logps -= logps.max() 60 | ps = numpy.exp(logps) 61 | ps /= ps.sum() 62 | new_i = numpy.random.choice(c1+1, 1, p=ps) 63 | if new_i=0: 94 | m1i = m1[i] 95 | m0i = m0[i] 96 | else: 97 | m1i = m1k 98 | m0i = m0k 99 | for j, r in zip(s2, rk): 100 | if j<0: continue 101 | m1i[j] += r 102 | m0i[j] += 1-r 103 | return m1, m0, m1k, m0k 104 | 105 | class PoissonIRM(IRM): 106 | def logZ(self, a, b): 107 | "Log Normalization Constant of Gamma Distribution" 108 | return gammaln(a) - a * numpy.log(b) 109 | 110 | def count_nij(self, R, s1, s2, c1, c2): 111 | m1 = numpy.zeros((c1,c2), dtype=int) # C_(-k,+)[i,j] 112 | m0 = numpy.zeros((c1,c2), dtype=int) # m 113 | m1k = numpy.zeros(c2, dtype=int) # C_(k,+)[j] where s_k=ω_i 114 | m0k = numpy.zeros(c2, dtype=int) # m 115 | for i, rk in zip(s1, R): 116 | if i>=0: 117 | m1i = m1[i] 118 | m0i = m0[i] 119 | else: 120 | m1i = m1k 121 | m0i = m0k 122 | for j, r in zip(s2, rk): 123 | if j<0: continue 124 | m1i[j] += r 125 | m0i[j] += 1 126 | return m1, m0, m1k, m0k 127 | 128 | if __name__ == "__main__": 129 | from numpy.random import binomial 130 | from numpy import concatenate as concat 131 | 132 | numpy.random.seed(123) 133 | d = 5 134 | phi = [[0.1, 0.7, 0.2], [0.1, 0.3, 0.9], [0.8, 0.1, 0.2]] 135 | orgR = concat([concat([binomial(1, p, size=(d,d)) for p in pp], axis=1) for pp in phi]) 136 | i = numpy.arange(orgR.shape[0]) 137 | numpy.random.shuffle(i) 138 | R = orgR[i,:] 139 | i = numpy.arange(orgR.shape[1]) 140 | numpy.random.shuffle(i) 141 | R = R[:,i] 142 | 143 | model = IRM(R, alpha=1.0, a=1.0, b=1.0) 144 | maxv = -1e9 145 | for i in range(200): 146 | model.update() 147 | v = model.log_posterior() 148 | if v > maxv: 149 | maxv = v 150 | maxm = model.clone() 151 | print(i, v) 152 | RR = R[numpy.argsort(maxm.s1), :] 153 | RR = RR[:, numpy.argsort(maxm.s2)] 154 | print("--------") 155 | print(orgR) 156 | print(R) 157 | print(maxm.s1) 158 | print(maxm.s2) 159 | print(RR) 160 | print(maxv) 161 | -------------------------------------------------------------------------------- /data/4million.corpus: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/data/4million.corpus -------------------------------------------------------------------------------- /data/gen_cluto.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | # generate *.mat/*.clabel for CLUTO 4 | 5 | input = ARGV[0] || 'corpus' 6 | output = input.sub(/\.[^\.]+/, '') 7 | 8 | data = open(input){|f| Marshal.load(f) } 9 | docs = data[:docs] 10 | terms = data[:terms] 11 | 12 | total = 0 13 | terms.each do |term, map| 14 | total += map.size 15 | end 16 | termlist = terms.keys 17 | 18 | open(output+".mat", "w") do |f| 19 | f.puts "#{docs.size} #{terms.size} #{total}" 20 | (0..(docs.size-1)).each do |doc_id| 21 | row = [] 22 | termlist.each_with_index do |term, term_id| 23 | v = terms[term][doc_id] 24 | row << term_id+1 << v if v && v>0 25 | end 26 | f.puts row.join(" ") 27 | end 28 | end 29 | 30 | open(output+".clabel", "w") do |f| 31 | termlist.each do |term| 32 | f.puts term 33 | end 34 | end 35 | 36 | open(output+".rlabel", "w") do |f| 37 | docs.each do |doc| 38 | f.puts doc[:title] 39 | end 40 | end 41 | 42 | 43 | 44 | 45 | -------------------------------------------------------------------------------- /data/gen_corpus.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | # ruby gen_corpus.rb ohenry/1444.zip ohenry/1646.zip ohenry/1725.zip ohenry/2777.zip ohenry/2776.zip 3 | 4 | require 'pstore' 5 | require "../lib/extract_gutenberg.rb" 6 | require '../lib/infinitive.rb' 7 | INF = Infinitive.new 8 | 9 | DEST_DIR = "output" 10 | DEBUG = false 11 | 12 | docs = Array.new 13 | terms = Hash.new 14 | 15 | doc_id = 0 16 | ARGV.each do |path| 17 | file = path.dup 18 | file = $1 if path =~ /\/([^\/]+)$/ 19 | text = nil 20 | if path =~ /\.zip$/i 21 | file.sub!(/\.zip$/i, ".txt") 22 | text = `unzip -cq #{path} "*.txt"` 23 | open("#{DEST_DIR}/#{file}.org", "w"){|f| f.write text} 24 | else 25 | text = open(path){|f| f.read} 26 | end 27 | text = Gutenberg.extract(text) 28 | open("#{DEST_DIR}/#{file}", "w"){|f| f.write text} 29 | 30 | list = text.split(/^[IVX]+\s*\.?$/)[1..-1] 31 | list = text.split(/^\n{4}$/) if list.size<=1 32 | list.each do |x| 33 | next unless x =~ /^(.+)$/ 34 | title = $1 35 | 36 | words = x.scan(/[A-Za-z]+(?:'t)?/) 37 | next if words.size < 1000 38 | 39 | n = 0 40 | words.each do |word| 41 | word = INF.infinitive(word) 42 | terms[word] ||= Hash.new(0) 43 | terms[word][doc_id] += 1 44 | n += 1 45 | end 46 | 47 | docs[doc_id] = {:title=>title, :n_words=>n} 48 | doc_id += 1 49 | end 50 | end 51 | 52 | db = PStore.new('corpus') 53 | db.transaction do 54 | db[:docs] = docs 55 | db[:terms] = terms 56 | end 57 | -------------------------------------------------------------------------------- /data/gen_libsvm.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | # generate libsvm-format data 3 | # gen_libsvm.rb [positive] [negative] 4 | if ARGV.size<2 5 | puts "gen_libsvm.rb [positive] [negative]" 6 | exit 1 7 | end 8 | 9 | require "../lib/extract_gutenberg.rb" 10 | require '../lib/infinitive.rb' 11 | INF = Infinitive.new 12 | 13 | REG_TITLE = /^([A-Z][A-Z\-', ]+)$/ 14 | class Analyzer 15 | def initialize 16 | @terms = Hash.new 17 | @docs = Array.new 18 | end 19 | attr_reader :docs, :terms 20 | def extract_words(path, sign) 21 | file = path.dup 22 
| file = $1 if path =~ /\/([^\/]+)$/ 23 | text = if path =~ /\.zip$/i 24 | file.sub!(/\.zip$/i, ".txt") 25 | `unzip -cq #{path} "*.txt"` 26 | else 27 | open(path){|f| f.read} 28 | end 29 | text = Gutenberg.extract(text) 30 | 31 | list = text.split(REG_TITLE) 32 | 33 | title = nil 34 | list.each do |x| 35 | if x =~ REG_TITLE 36 | title = x 37 | next 38 | end 39 | words = x.scan(/[A-Za-z]+(?:'t)?/) 40 | next if words.size < 1000 41 | 42 | while words.size >= 100 43 | subwords = words.slice!(0, 100) 44 | n = 0 45 | doc_id = @docs.size 46 | subwords.each do |word| 47 | word = INF.infinitive(word) 48 | @terms[word] ||= Hash.new(0) 49 | @terms[word][doc_id] += 1 50 | n += 1 51 | end 52 | @docs << {:title=>title, :n_words=>n, :sign=>sign} 53 | end 54 | end 55 | end 56 | end 57 | 58 | ana = Analyzer.new 59 | ana.extract_words ARGV[0], "+1" 60 | ana.extract_words ARGV[1], "-1" 61 | 62 | words = ana.terms.keys 63 | ana.docs.each_with_index do |doc, doc_id| 64 | buf = [doc[:sign]] 65 | words.each_with_index do |word, word_id| 66 | if ana.terms[word] 67 | freq = ana.terms[word][doc_id] 68 | buf << "#{word_id}:#{freq}" if freq>0 69 | end 70 | end 71 | puts buf.join(' ') 72 | end 73 | 74 | -------------------------------------------------------------------------------- /data/ohenry.corpus: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/data/ohenry.corpus -------------------------------------------------------------------------------- /dnn/README.md: -------------------------------------------------------------------------------- 1 | # Deep Learning Experiment Code 2 | ## Video 3 | 4 | * SVHN Generator with DCGAN ( [dcgan-svhn.py](/dnn/dcgan-svhn.py) ) 5 | * https://youtu.be/yXyJq35w5gk 6 | 7 | * SVHN Generator based on DCGAN+Conditional GAN ( [cdcgan-svhn.py](/dnn/cdcgan-svhn.py) ) 8 | * https://youtu.be/IXaeo9wxSoQ 9 | -------------------------------------------------------------------------------- /dnn/cdcgan-svhn.ini: -------------------------------------------------------------------------------- 1 | 2 | [DEFAULT] 3 | model filename = cdcgan-svhn.model 4 | 5 | # download train_32x32.mat in advance from http://ufldl.stanford.edu/housenumbers/ 6 | SVHN path = data/train_32x32.mat 7 | number of labels = 10 8 | 9 | noise dim = 100 10 | discriminator hidden units = 64 128 256 11 | generator hidden units = 512 256 128 12 | 13 | alpha = 2e-4 14 | mini batch size = 128 15 | epoch = 10 16 | 17 | working directory = svhn_gen 18 | 19 | [small] 20 | noise dim = 10 21 | discriminator hidden units = 16 32 64 22 | generator hidden units = 128 64 32 23 | epoch = 10 24 | working directory = svhn_gen2 25 | 26 | [large] 27 | noise dim = 100 28 | discriminator hidden units = 128 256 512 29 | generator hidden units = 1024 512 256 30 | epoch = 100 31 | working directory = svhn_gen3 32 | 33 | alpha = 5e-5 34 | 35 | -------------------------------------------------------------------------------- /dnn/cgan-mnist.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # MNIST generator based on Conditional Generative Adversarial Networks with Tensorflow 5 | # (M. Mirza and S. Osindero. Conditional generative adversarial nets. CoRR, abs/1411.1784, 2014.) 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 
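# Model sketch (as built below): the generator maps noise z and a one-hot label y
# through one ReLU hidden layer to a sigmoid image; the discriminator is a maxout
# network (K pieces) over (x, y) with dropout and a sigmoid output.
# The losses implement the conditional GAN objective:
#   D minimizes  -E[log D(x, y)] - E[log(1 - D(G(z, y), y))]
#   G minimizes   E[log(1 - D(G(z, y), y))] - E[log D(G(z, y), y)]   (second term for stability)
# and each Adam optimizer updates only the variables whose names start with "D" or "G".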
9 | 10 | import numpy, math 11 | import tensorflow as tf 12 | from tensorflow.examples.tutorials.mnist import input_data 13 | import matplotlib.pyplot as plt 14 | 15 | # model parameter 16 | noise_dim = 10 # input noise size of Generator 17 | Dhidden = 256 # hidden units of Discriminator's network 18 | Ghidden = 512 # hidden units of Generator's network 19 | K = 8 # maxout units of Discriminator 20 | 21 | mini_batch_size = 50 22 | epoch = 50 23 | nsamples = 7 # drawing samples 24 | 25 | mnist = input_data.read_data_sets("data/", one_hot=True) 26 | N, num_features = mnist.train.images.shape 27 | _, num_labels = mnist.train.labels.shape 28 | period = N // mini_batch_size 29 | 30 | X = tf.placeholder(tf.float32, shape=(None, num_features)) 31 | Y = tf.placeholder(tf.float32, shape=(None, num_labels)) 32 | Z = tf.placeholder(tf.float32, shape=(None, noise_dim)) 33 | keep_prob = tf.placeholder(tf.float32) 34 | 35 | GW1z = tf.Variable(tf.random_normal([noise_dim, Ghidden], stddev=0.1), name="GW1z") 36 | GW1y = tf.Variable(tf.random_normal([num_labels, Ghidden], stddev=0.1), name="GW1y") 37 | Gb1 = tf.Variable(tf.zeros(Ghidden), name="Gb1") 38 | GW2 = tf.Variable(tf.random_normal([Ghidden, num_features], stddev=0.1), name="GW2") 39 | Gb2 = tf.Variable(tf.zeros(num_features), name="Gb2") 40 | 41 | DW1x = tf.Variable(tf.random_normal([num_features, K * Dhidden], stddev=0.01), name="DW1x") 42 | DW1y = tf.Variable(tf.random_normal([num_labels, K * Dhidden], stddev=0.01), name="DW1y") 43 | Db1 = tf.Variable(tf.zeros(K * Dhidden), name="Db1") 44 | DW2 = tf.Variable(tf.random_normal([Dhidden, 1], stddev=0.01), name="DW2") 45 | Db2 = tf.Variable(tf.zeros(1), name="Db2") 46 | 47 | def discriminator(x, y): 48 | u = tf.reshape(tf.matmul(x, DW1x) + tf.matmul(y, DW1y) + Db1, [-1, K, Dhidden]) 49 | Dh1 = tf.nn.dropout(tf.reduce_max(u, reduction_indices=[1]), keep_prob) 50 | return tf.nn.sigmoid(tf.matmul(Dh1, DW2) + Db2) 51 | 52 | Gh1 = tf.nn.relu(tf.matmul(Z, GW1z) + tf.matmul(Y, GW1y) + Gb1) 53 | G = tf.nn.sigmoid(tf.matmul(Gh1, GW2) + Gb2) 54 | DG = discriminator(G, Y) 55 | 56 | Dloss = -tf.reduce_mean(tf.log(discriminator(X, Y)) + tf.log(1 - DG)) 57 | Gloss = tf.reduce_mean(tf.log(1 - DG) - tf.log(DG + 1e-9)) # the second term for stable learning 58 | 59 | vars = tf.trainable_variables() 60 | Dvars = [v for v in vars if v.name.startswith("D")] 61 | Gvars = [v for v in vars if v.name.startswith("G")] 62 | 63 | Doptimizer = tf.train.AdamOptimizer().minimize(Dloss, var_list=Dvars) 64 | Goptimizer = tf.train.AdamOptimizer().minimize(Gloss, var_list=Gvars) 65 | 66 | sess = tf.Session() 67 | sess.run(tf.initialize_all_variables()) 68 | 69 | for e in range(epoch): 70 | dloss = gloss = 0.0 71 | for i in range(period): 72 | x, y = mnist.train.next_batch(mini_batch_size) 73 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 74 | loss, _ = sess.run([Dloss, Doptimizer], feed_dict={X:x, Y:y, Z:z, keep_prob:0.5}) 75 | dloss += loss 76 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 77 | loss, _ = sess.run([Gloss, Goptimizer], feed_dict={Y:y, Z:z, keep_prob:1.0}) 78 | gloss += loss 79 | 80 | print("%d: dloss=%.5f, gloss=%.5f" % (e+1, dloss / period, gloss / period)) 81 | if math.isnan(dloss) or math.isnan(gloss): 82 | sess.run(tf.initialize_all_variables()) # initialize & retry if NaN 83 | 84 | def save_figures(path, z): 85 | fig = plt.figure() 86 | fig.subplots_adjust(left=0,bottom=0,right=1,top=1) 87 | for i in range(num_labels): 88 | y = numpy.zeros((z.shape[0], num_labels)) 89 | 
y[:,i] = 1 90 | Gz = sess.run(G, feed_dict={Y:y, Z: z}) 91 | for j in range(nsamples): 92 | ax = fig.add_subplot(nsamples, num_labels, j * num_labels + i + 1) 93 | ax.axis("off") 94 | ax.imshow(Gz[j,:].reshape((28,28)), cmap=plt.get_cmap("gray")) 95 | fig.savefig(path) 96 | plt.close(fig) 97 | 98 | z = numpy.random.uniform(-1, 1, size=(nsamples, noise_dim)) 99 | #z[:,0] = numpy.arange(0, nsamples) / (nsamples - 1) * 2 - 1 100 | save_figures("cgan-mnist.png", z) 101 | 102 | -------------------------------------------------------------------------------- /dnn/dcgan-svhn.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # SVHN generator based on DCGAN with Tensorflow 5 | # Radford, A., Metz, L., and Chintala, S. Unsupervised representation learning with deep convolutional generative adversarial networks. 2016. 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 9 | 10 | import numpy, math, time 11 | import scipy.io 12 | import tensorflow as tf 13 | import matplotlib.pyplot as plt 14 | 15 | # model parameter 16 | noise_dim = 100 # input noise size of Generator 17 | Dhidden = [64, 128, 256] # hidden units of Discriminator's network 18 | Ghidden = [512, 256, 128] # hidden units of Generator's network 19 | 20 | mini_batch_size = 128 21 | samples=(10,12) # samples drawing size 22 | nsamples = samples[0] * samples[1] 23 | assert nsamples <= mini_batch_size 24 | epoch = 100 25 | 26 | # download train_32x32.mat in advance from http://ufldl.stanford.edu/housenumbers/ 27 | svhn = scipy.io.loadmat("data/train_32x32.mat") 28 | train_data = svhn["X"] 29 | #train_data = train_data[:, :, :, :256] # small dataset 30 | fig_width, fig_height, n_channels, N = train_data.shape 31 | train_data = train_data.reshape(fig_width * fig_height * n_channels, N) 32 | train_data -= train_data.min(axis=0) 33 | train_data = (numpy.array(train_data, dtype=numpy.float32) / train_data.max(axis=0)).T.reshape(N, fig_width, fig_height, n_channels) 34 | period = N // mini_batch_size 35 | 36 | X = tf.placeholder(tf.float32, shape=(None, fig_width, fig_height, n_channels)) 37 | Z = tf.placeholder(tf.float32, shape=(None, noise_dim)) 38 | keep_prob = tf.placeholder(tf.float32) 39 | 40 | with tf.variable_scope("G"): 41 | GW0 = tf.Variable(tf.random_normal([noise_dim, Ghidden[0]*4*4], stddev=0.01)) 42 | Gb0 = tf.Variable(tf.zeros(Ghidden[0])) 43 | GW1 = tf.Variable(tf.random_normal([5, 5, Ghidden[1], Ghidden[0]], stddev=0.01)) 44 | Gb1 = tf.Variable(tf.zeros(Ghidden[1])) 45 | GW2 = tf.Variable(tf.random_normal([5, 5, Ghidden[2], Ghidden[1]], stddev=0.01)) 46 | Gb2 = tf.Variable(tf.zeros(Ghidden[2])) 47 | GW3 = tf.Variable(tf.random_normal([5, 5, n_channels, Ghidden[2]], stddev=0.01)) 48 | Gb3 = tf.Variable(tf.zeros(n_channels)) 49 | 50 | # batch normalization & relu 51 | def bn(u): 52 | mean, variance = tf.nn.moments(u, axes=[0, 1, 2]) 53 | return tf.nn.relu(tf.nn.batch_normalization(u, mean, variance, None, None, 1e-5)) 54 | 55 | Gh0 = bn(tf.nn.bias_add(tf.reshape(tf.matmul(Z, GW0), [-1, fig_width//8, fig_height//8, Ghidden[0]]), Gb0)) 56 | Gh1 = bn(tf.nn.bias_add(tf.nn.conv2d_transpose(Gh0, GW1, [mini_batch_size, fig_width//4, fig_height//4, Ghidden[1]], [1, 2, 2, 1]), Gb1)) 57 | Gh2 = bn(tf.nn.bias_add(tf.nn.conv2d_transpose(Gh1, GW2, [mini_batch_size, fig_width//2, fig_height//2, Ghidden[2]], [1, 2, 2, 1]), Gb2)) 58 | G = tf.nn.tanh(tf.nn.bias_add(tf.nn.conv2d_transpose(Gh2, GW3, 
[mini_batch_size, fig_width, fig_height, n_channels], [1, 2, 2, 1]), Gb3)) 59 | 60 | with tf.variable_scope("D"): 61 | DW0 = tf.Variable(tf.random_normal([5, 5, n_channels, Dhidden[0]], stddev=0.01)) 62 | Db0 = tf.Variable(tf.zeros(Dhidden[0])) 63 | DW1 = tf.Variable(tf.random_normal([5, 5, Dhidden[0], Dhidden[1]], stddev=0.01)) 64 | Db1 = tf.Variable(tf.zeros(Dhidden[1])) 65 | DW2 = tf.Variable(tf.random_normal([5, 5, Dhidden[1], Dhidden[2]], stddev=0.01)) 66 | Db2 = tf.Variable(tf.zeros(Dhidden[2])) 67 | DW3 = tf.Variable(tf.random_normal([(fig_width//8)*(fig_height//8)*Dhidden[2], 1], stddev=0.01)) 68 | Db3 = tf.Variable(tf.zeros(1)) 69 | 70 | # batch normalization & leaky relu 71 | def bnl(u, a=0.2): 72 | mean, variance = tf.nn.moments(u, axes=[0, 1, 2]) 73 | b = tf.nn.batch_normalization(u, mean, variance, None, None, 1e-5) 74 | return tf.maximum(a * b, b) 75 | 76 | def discriminator(xx): 77 | Dh0 = bnl(tf.nn.bias_add(tf.nn.conv2d(xx, DW0, [1, 2, 2, 1], padding='SAME'), Db0)) 78 | Dh1 = bnl(tf.nn.bias_add(tf.nn.conv2d(Dh0, DW1, [1, 2, 2, 1], padding='SAME'), Db1)) 79 | Dh2 = bnl(tf.nn.bias_add(tf.nn.conv2d(Dh1, DW2, [1, 2, 2, 1], padding='SAME'), Db2)) 80 | return tf.nn.sigmoid(tf.matmul(tf.reshape(Dh2, [-1, (fig_width//8)*(fig_height//8)*Dhidden[2]]), DW3) + Db3) 81 | 82 | DG = discriminator(G) 83 | Dloss = -tf.reduce_mean(tf.log(discriminator(X)) + tf.log(1 - DG)) 84 | Gloss = tf.reduce_mean(tf.log(1 - DG) - tf.log(DG + 1e-9)) # the second term for stable learning 85 | 86 | vars = tf.trainable_variables() 87 | Dvars = [v for v in vars if v.name.startswith("D")] 88 | Gvars = [v for v in vars if v.name.startswith("G")] 89 | 90 | Doptimizer = tf.train.AdamOptimizer(learning_rate=2e-4).minimize(Dloss, var_list=Dvars) 91 | Goptimizer = tf.train.AdamOptimizer(learning_rate=2e-4).minimize(Gloss, var_list=Gvars) 92 | 93 | sess = tf.Session() 94 | sess.run(tf.initialize_all_variables()) 95 | 96 | def save_figure(path, z): 97 | Gz = sess.run(G, feed_dict={Z: z}) 98 | #plt.ion() 99 | fig = plt.gcf() 100 | fig.subplots_adjust(left=0,bottom=0,right=1,top=1) 101 | for i in range(nsamples): 102 | ax = fig.add_subplot(samples[0], samples[1], i + 1) 103 | ax.axis("off") 104 | ax.imshow(Gz[i,:,:,:]) 105 | plt.savefig(path) 106 | plt.draw() 107 | plt.pause(0.01) 108 | 109 | t0 = time.time() 110 | drawz = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) # nsamples < mini_batch_size 111 | for e in range(epoch): 112 | index = numpy.random.permutation(N) 113 | dloss = gloss = 0.0 114 | for i in range(period): 115 | x = train_data[index[i*mini_batch_size:(i+1)*mini_batch_size], :] 116 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 117 | loss, _ = sess.run([Dloss, Doptimizer], feed_dict={X:x, Z:z, keep_prob:0.5}) 118 | dloss += loss 119 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 120 | loss, _ = sess.run([Gloss, Goptimizer], feed_dict={Z:z, keep_prob:1.0}) 121 | gloss += loss 122 | 123 | if math.isnan(dloss) or math.isnan(gloss): 124 | sess.run(tf.initialize_all_variables()) # initialize & retry if NaN 125 | print("...initialize parameters for nan...") 126 | dloss = gloss = 0.0 127 | 128 | print("%d: dloss=%.5f, gloss=%.5f, time=%.1f" % (e+1, dloss / period, gloss / period, time.time()-t0)) 129 | save_figure("png/dcgan-svhn-%03d.png" % (e+1), drawz) 130 | 131 | saver = tf.train.Saver() 132 | saver.save(sess, "dcgan-svhn.model") 133 | -------------------------------------------------------------------------------- /dnn/gan-mnist.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # MNIST generator based on Generative Adversarial Networks with Tensorflow 5 | # (I. Goodfellow, J. Pouget-Abadie, M. Mirza, B. Xu, D. Warde-Farley, S. Ozair, A. Courville, and Y. Bengio. Generative adversarial nets. In NIPS, pages 2672–2680. 2014.) 6 | 7 | # This code is available under the MIT License. 8 | # (c)2016 Nakatani Shuyo / Cybozu Labs Inc. 9 | 10 | import numpy, math 11 | import tensorflow as tf 12 | from tensorflow.examples.tutorials.mnist import input_data 13 | import matplotlib.pyplot as plt 14 | 15 | # model parameter 16 | noise_dim = 32 # input noise size of Generator 17 | Dhidden = 256 # hidden units of Discriminator's network 18 | Ghidden = 512 # hidden units of Generator's network 19 | K = 8 # maxout units of Discriminator 20 | 21 | mini_batch_size = 50 22 | epoch = 50 23 | samples=(5,6) # samples drawing size 24 | 25 | mnist = input_data.read_data_sets("data/", one_hot=True) 26 | N, num_features = mnist.train.images.shape 27 | period = N // mini_batch_size 28 | 29 | X = tf.placeholder(tf.float32, shape=(None, num_features)) 30 | Z = tf.placeholder(tf.float32, shape=(None, noise_dim)) 31 | keep_prob = tf.placeholder(tf.float32) 32 | 33 | with tf.variable_scope("G"): 34 | GW1 = tf.Variable(tf.random_normal([noise_dim, Ghidden], stddev=0.1)) 35 | Gb1 = tf.Variable(tf.zeros(Ghidden)) 36 | GW2 = tf.Variable(tf.random_normal([Ghidden, num_features], stddev=0.1)) 37 | Gb2 = tf.Variable(tf.zeros(num_features)) 38 | 39 | with tf.variable_scope("D"): 40 | DW1 = tf.Variable(tf.random_normal([num_features, K * Dhidden], stddev=0.01)) 41 | Db1 = tf.Variable(tf.zeros(K * Dhidden)) 42 | DW2 = tf.Variable(tf.random_normal([Dhidden, 1], stddev=0.01)) 43 | Db2 = tf.Variable(tf.zeros(1)) 44 | 45 | def discriminator(xx): 46 | u = tf.reshape(tf.matmul(xx, DW1) + Db1, [-1, K, Dhidden]) 47 | Dh1 = tf.nn.dropout(tf.reduce_max(u, reduction_indices=[1]), keep_prob) 48 | return tf.nn.sigmoid(tf.matmul(Dh1, DW2) + Db2) 49 | 50 | Gh1 = tf.nn.relu(tf.matmul(Z, GW1) + Gb1) 51 | G = tf.nn.sigmoid(tf.matmul(Gh1, GW2) + Gb2) 52 | DG = discriminator(G) 53 | Dloss = -tf.reduce_mean(tf.log(discriminator(X)) + tf.log(1 - DG)) 54 | Gloss = tf.reduce_mean(tf.log(1 - DG) - tf.log(DG + 1e-9)) # the second term for stable learning 55 | 56 | vars = tf.trainable_variables() 57 | Dvars = [v for v in vars if v.name.startswith("D")] 58 | Gvars = [v for v in vars if v.name.startswith("G")] 59 | 60 | Doptimizer = tf.train.AdamOptimizer().minimize(Dloss, var_list=Dvars) 61 | Goptimizer = tf.train.AdamOptimizer().minimize(Gloss, var_list=Gvars) 62 | 63 | sess = tf.Session() 64 | sess.run(tf.initialize_all_variables()) 65 | 66 | dloss = gloss = 0.0 67 | for i in range(epoch * period): 68 | x, _ = mnist.train.next_batch(mini_batch_size) 69 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 70 | loss, _ = sess.run([Dloss, Doptimizer], feed_dict={X:x, Z:z, keep_prob:0.5}) 71 | dloss += loss 72 | z = numpy.random.uniform(-1, 1, size=(mini_batch_size, noise_dim)) 73 | loss, _ = sess.run([Gloss, Goptimizer], feed_dict={Z:z, keep_prob:1.0}) 74 | gloss += loss 75 | 76 | if (i+1) % period == 0: 77 | print("%d: dloss=%.5f, gloss=%.5f" % ((i+1)//period, dloss / period, gloss / period)) 78 | if math.isnan(dloss) or math.isnan(gloss): 79 | sess.run(tf.initialize_all_variables()) # initialize & retry if NaN 80 | dloss = gloss = 0.0 81 | 82 | nsamples = samples[0] * samples[1] 
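# save_figures renders a samples[0] x samples[1] grid of generator outputs for the
# given noise batch z and writes it to `path` as 28x28 grayscale images.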
83 | def save_figures(path, z): 84 | Gz = sess.run(G, feed_dict={Z: z}) 85 | fig = plt.figure() 86 | fig.subplots_adjust(left=0,bottom=0,right=1,top=1) 87 | for i in range(nsamples): 88 | ax = fig.add_subplot(samples[0], samples[1], i + 1) 89 | ax.axis("off") 90 | ax.imshow(Gz[i,:].reshape((28,28)), cmap=plt.get_cmap("gray")) 91 | fig.savefig(path) 92 | plt.close(fig) 93 | 94 | z = numpy.random.uniform(-1, 1, size=(nsamples, noise_dim)) 95 | #z[:,0] = numpy.arange(0, nsamples) / (nsamples - 1) * 2 - 1 96 | save_figures("gan-mnist.png", z) 97 | 98 | -------------------------------------------------------------------------------- /extractcontent/test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Extract Web Content - Test 5 | # (c)2010 Nakatani Shuyo, Cybozu Labs Inc. 6 | 7 | import sys, os, re 8 | from optparse import OptionParser 9 | sys.path.append("../hmm") 10 | from hmm import HMM 11 | 12 | def load_data(directory): 13 | import glob 14 | htmllist = glob.glob(os.path.join(directory, "*.html")) 15 | features = [] 16 | for filename in htmllist: 17 | taglist = [] 18 | f = open(filename, 'r') 19 | for line in f: 20 | tags = re.findall(r'<(\w+)',line) 21 | if len(tags)>0: taglist.extend([x.lower() for x in tags]) 22 | f.close() 23 | features.append(taglist) 24 | return features 25 | 26 | def main(): 27 | parser = OptionParser() 28 | parser.add_option("-t", dest="test", help="test data directory") 29 | parser.add_option("-m", dest="model", help="model data filename to save") 30 | (options, args) = parser.parse_args() 31 | if not options.model: parser.error("need model data filename(-m)") 32 | 33 | hmm = HMM() 34 | hmm.load(options.model) 35 | 36 | if options.test: 37 | tests = load_data(options.test) 38 | for x in tests: 39 | print zip(x, hmm.Viterbi(hmm.words2id(x))) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | 44 | -------------------------------------------------------------------------------- /extractcontent/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Extract Web Content with HMM 5 | # (c)2010 Nakatani Shuyo, Cybozu Labs Inc. 
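# Pipeline: load_data() reduces each *.html file in the training directory to the
# sequence of its opening tag names (lowercased); these tag sequences are the
# observations used to fit an HMM with K latent states (-k). Training runs up to
# -i iterations, stopping early once the log-likelihood decreases, and the model
# is saved to the -m file or dumped to stdout.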
6 | 7 | import sys, os, re 8 | from optparse import OptionParser 9 | sys.path.append("../hmm") 10 | from hmm import HMM 11 | #import numpy 12 | #from numpy.random import dirichlet, randn 13 | 14 | def load_data(directory): 15 | import glob 16 | htmllist = glob.glob(os.path.join(directory, "*.html")) 17 | features = [] 18 | for filename in htmllist: 19 | taglist = [] 20 | f = open(filename, 'r') 21 | for line in f: 22 | tags = re.findall(r'<(\w+)',line) 23 | if len(tags)>0: taglist.extend([x.lower() for x in tags]) 24 | f.close() 25 | features.append(taglist) 26 | return features 27 | 28 | def main(): 29 | parser = OptionParser() 30 | parser.add_option("-d", dest="training", help="training data directory") 31 | parser.add_option("-k", dest="K", type="int", help="number of latent states", default=6) 32 | parser.add_option("-a", dest="a", type="float", help="Dirichlet parameter", default=1.0) 33 | parser.add_option("-i", dest="I", type="int", help="iteration count", default=10) 34 | parser.add_option("-m", dest="model", help="model data filename to save") 35 | (options, args) = parser.parse_args() 36 | if not options.training: parser.error("need training data directory(-d)") 37 | 38 | features = load_data(options.training) 39 | 40 | hmm = HMM() 41 | hmm.set_corpus(features) 42 | hmm.init_inference(options.K, options.a) 43 | pre_L = -1e10 44 | for i in range(options.I): 45 | log_likelihood = hmm.inference() 46 | print i, ":", log_likelihood 47 | if pre_L > log_likelihood: break 48 | pre_L = log_likelihood 49 | if options.model: 50 | hmm.save(options.model) 51 | else: 52 | hmm.dump() 53 | 54 | if __name__ == "__main__": 55 | main() 56 | 57 | -------------------------------------------------------------------------------- /hac/fselect.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | # feature selection 3 | # (cf. 
"Feature Selection and Document Clustering" http://www.csee.umbc.edu/cadip/2002Symposium/kogan.pdf ) 4 | 5 | require '../lib/infinitive.rb' 6 | INF = Infinitive.new 7 | 8 | require 'optparse' 9 | opt = {:n_words=>1000, :type=>:q0, :stopwords=>true} 10 | parser = OptionParser.new 11 | parser.on('-n [VAL]', Integer) {|v| opt[:n_words] = v } 12 | parser.on('-t [VAL]', [:q0, :q1]) {|v| opt[:type] = v } 13 | parser.on('-s', 'exclude stop words') {|v| opt[:stopwords] = false } 14 | parser.parse!(ARGV) 15 | 16 | 17 | filename = ARGV[0] || 'corpus' 18 | data = open(filename){|f| Marshal.load(f) } 19 | docs = data[:docs] 20 | terms = data[:terms] 21 | 22 | 23 | stopwords = Hash.new(true) 24 | <a[1]}[0..opt[:n_words]-1] 75 | 76 | new_terms = {} 77 | ev.each do |term, v| 78 | new_terms[term] = terms[term] 79 | end 80 | 81 | open("#{filename}.#{opt[:type]}", "w") do |f| 82 | Marshal.dump({:docs=>docs, :terms=>new_terms}, f) 83 | end 84 | 85 | puts "#{terms.size} => #{new_terms.size}" 86 | #puts ev.map{|x| x[0]}.join(' ') 87 | -------------------------------------------------------------------------------- /hac/hac.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/hac/hac.rb -------------------------------------------------------------------------------- /hac/naive_hac.rb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/hac/naive_hac.rb -------------------------------------------------------------------------------- /irt/irt.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require 'zlib' 3 | 4 | THRESHOLD = 1.2 5 | 6 | learnfile = ARGV[0] || 'irt.data' 7 | users = Hash.new 8 | words = Hash.new 9 | begin 10 | open(learnfile) do |f| 11 | users, words = Marshal.load(f) 12 | end 13 | rescue 14 | puts "create new learning data randomizely." 
15 | end 16 | 17 | data = [] 18 | Zlib::GzipReader.open('word_scores.txt.gz') do |f| 19 | while line = f.gets 20 | if line =~ /^([0-9]+)\s+([0-9]+)\s*([0-9\.]+)$/ 21 | user_id = $1.to_i 22 | word_id = $2.to_i 23 | point = $3.to_f 24 | #next if point > 10000 25 | 26 | t = if point < THRESHOLD then 1 else 0 end 27 | data << [user_id, word_id, t] 28 | 29 | users[user_id] = rand unless users.key?(user_id) 30 | words[word_id] = rand unless words.key?(word_id) 31 | end 32 | end 33 | end 34 | 35 | 10000.times do |k| 36 | eta = 0.01 #1.0/(k+10) 37 | e = 0 38 | error = 0 39 | data.sort_by{rand}.each do |user_id, word_id, t| 40 | z = users[user_id] - words[word_id] 41 | y = 1.0/(1.0+Math.exp(-z)) 42 | e -= if t==1 then Math.log(y) else Math.log(1-y) end 43 | error += 1 if (t==1 && y<0.5) || (t==0 && y>0.5) 44 | 45 | grad_e_eta = eta*(y - t) 46 | users[user_id] -= grad_e_eta 47 | words[word_id] += grad_e_eta 48 | end 49 | puts "#{k}: #{error}, #{e}" 50 | open(learnfile+".1", 'w'){|f| Marshal.dump([users,words], f) } if (k % 50) == 0 51 | end 52 | 53 | open(learnfile, 'w'){|f| Marshal.dump([users,words], f) } 54 | 55 | -------------------------------------------------------------------------------- /langdetect/common.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -Ku 2 | 3 | require 'mysql' 4 | require 'optparse' 5 | 6 | module LD 7 | ENTITIES = { 8 | "'"=>"'", 9 | "&"=>"&", 10 | ">"=>">", 11 | "<"=>"<", 12 | """=>'"', 13 | "»"=>"" 14 | } 15 | 16 | def self.optparser 17 | @opt = { 18 | :host=>'localhost', :user=>'root', :passwd=>'', :dbname=>'googlenews', :port=>3306, 19 | :model=>'model.json' 20 | } 21 | 22 | parser = OptionParser.new 23 | parser.on('--host=VAL', String, 'database host') {|v| @opt[:host] = v } 24 | parser.on('--user=VAL', String, 'database user') {|v| @opt[:user] = v } 25 | parser.on('--password=VAL', String, 'database password') {|v| @opt[:passwd] = v } 26 | parser.on('--dbname=VAL', String, 'database name') {|v| @opt[:dbname] = v } 27 | parser.on('--port=VAL', Integer, 'database port') {|v| @opt[:port] = v } 28 | parser.on('-f VAL', String, 'model filename') {|v| @opt[:model] = v } 29 | parser 30 | end 31 | def self.model_filename 32 | @opt[:model] 33 | end 34 | 35 | def self.db_connect 36 | db = Mysql::init 37 | db.options Mysql::SET_CHARSET_NAME, 'utf8' 38 | db.real_connect @opt[:host], @opt[:user], @opt[:passwd], @opt[:dbname], @opt[:port] 39 | db 40 | end 41 | 42 | def self.decode_entity(st) 43 | st.gsub(/&[^ &]+?;/){|m| ENTITIES[m] || m} 44 | end 45 | end 46 | 47 | -------------------------------------------------------------------------------- /langdetect/crawler.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | #require 'rubygems' 5 | require 'open-uri' 6 | require 'rss/2.0' 7 | 8 | require 'common.rb' 9 | require 'detect.rb' 10 | 11 | LD::optparser.parse!(ARGV) 12 | db = LD::db_connect 13 | 14 | # Database 15 | # create database googlenews character set utf8; 16 | # create table news (id int auto_increment, title varchar(1024), lang varchar(8), body text, primary key (id)); 17 | # create index news_title on news (title); 18 | # create index news_lang on news (lang); 19 | ps_select = db.prepare("select id from news where title=?") 20 | ps_insert = db.prepare("insert into news (title,lang,body) values (?,?,?)") 21 | 22 | # Google News RSS 23 | def rssurl(lang) 24 | if lang=="ja" 25 | 
'http://news.google.com/news?hl=ja&ned=us&ie=UTF-8&oe=UTF-8&output=rss' 26 | else 27 | "http://news.google.com/news?pz=1&cf=all&hl=#{lang}&output=rss" 28 | end 29 | end 30 | 31 | langlist = LanguageDetector::LANGLIST 32 | 33 | langlist.each do |lang| 34 | url = rssurl(lang) 35 | #puts url 36 | rss = open(url) {|f| RSS::Parser.parse(f.read, false) } 37 | 38 | rss.items.each do |item| 39 | rs = ps_select.execute(item.title) 40 | if !rs.fetch 41 | body = item.description.gsub(/.*?<\/nobr>/, '').gsub(/<[^>]*>/, ' ').gsub(/ /, ' ').gsub(/[ \t]+/, ' ') 42 | ps_insert.execute item.title, lang, body 43 | end 44 | end 45 | sleep 1 46 | end 47 | 48 | -------------------------------------------------------------------------------- /langdetect/filetest.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | require 'optparse' 5 | require 'nkf' 6 | require 'detect.rb' 7 | 8 | parser = OptionParser.new 9 | model = 'model' 10 | alpha = 1.0 11 | debug_flag = false 12 | parser.on('-f VAL', String, 'model filename') {|v| model = v } 13 | parser.on('-a VAL', Float, 'alpha (additive smoothing)') {|v| alpha = v } 14 | parser.on('-d', 'debug mode') { debug_flag = true } 15 | parser.parse!(ARGV) 16 | 17 | detector = LanguageDetector::Detector.new(model) 18 | detector.debug_on if debug_flag 19 | 20 | ARGV.each do |filename| 21 | text = open(filename){|f| NKF.nkf('-w', f.read) } 22 | text.gsub!(/https?:\/\/[0-9a-zA-Z\.\/\?=\&\-]+/, '') 23 | problist = detector.detect(text, alpha) 24 | puts "#{filename},#{problist.inspect},#{text[0..100].gsub(/\s+/, ' ').strip}" 25 | end 26 | -------------------------------------------------------------------------------- /langdetect/test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | require 'common.rb' 5 | require 'detect.rb' 6 | 7 | parser= LD::optparser 8 | target_id = nil 9 | alpha = 1.0 10 | parser.on('--id=VAL', Integer, 'target text id') {|v| target_id = v } 11 | parser.on('-a VAL', Float, 'alpha (additive smoothing)') {|v| alpha = v } 12 | parser.parse!(ARGV) 13 | 14 | detector = LanguageDetector::Detector.new(LD::model_filename, alpha) 15 | 16 | # Database 17 | db = LD::db_connect 18 | ps_select = if target_id 19 | detector.debug_on 20 | db.prepare("select id,title,lang,body from news where id = ?").execute target_id 21 | else 22 | db.prepare("select id,title,lang,body from news order by lang").execute 23 | end 24 | 25 | count = Hash.new(0) 26 | correct = Hash.new(0) 27 | detected = Hash.new{|h,k| h[k]=Hash.new(0)} 28 | ngramer = detector.ngramer 29 | while rs = ps_select.fetch 30 | id, title, lang, body = rs 31 | title.sub!(/ - [^\-]+$/, '') 32 | text = LD::decode_entity(title + "\n" + body) 33 | 34 | ngramer.clear 35 | detector.init 36 | text.scan(/./) do |x| 37 | ngramer.append x 38 | ngramer.each do |z| 39 | detector.append z 40 | end 41 | break if detector.maxprob > 0.99999 42 | end 43 | 44 | problist = detector.problist 45 | puts "#{id},#{lang},#{title},#{problist.inspect}" 46 | count[lang] += 1 47 | correct[lang] += 1 if problist[0][0] == lang 48 | detected[lang][problist[0][0]] += 1 49 | end 50 | 51 | sum = correct_sum = 0 52 | count.keys.sort.each do |lang| 53 | rate = (10000.0 * correct[lang] / count[lang]).to_i / 100.0 54 | list = detected[lang].to_a.sort_by{|x| -x[1]}.map{|x| x.join(':')}.join(',') 55 | puts "#{lang} #{correct[lang]} / #{count[lang]} (#{rate}) [#{list}]" 56 | 
sum += count[lang] 57 | correct_sum += correct[lang] 58 | end 59 | puts "total: #{correct_sum} / #{sum} (#{(10000.0 * correct_sum / sum).to_i / 100.0})" 60 | 61 | -------------------------------------------------------------------------------- /langdetect/train.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | # -*- coding: utf-8 -*- 3 | 4 | require 'mysql' 5 | require 'json' 6 | require 'common.rb' 7 | require 'detect.rb' 8 | 9 | parser= LD::optparser 10 | opt = {:N=>3, :training_size=>150, :csv=>false, :json=>false} 11 | parser.on('-n VAL', Integer, 'N-gram') {|v| opt[:N] = v } 12 | parser.on('--size=VAL', Integer, 'max size of training data') {|v| opt[:training_size] = v } 13 | parser.on('--csv') {|v| opt[:csv] = true } 14 | parser.parse!(ARGV) 15 | 16 | # Database 17 | db = LD::db_connect 18 | #ps_select = db.prepare("select title,lang,body from news order by id desc") 19 | ps_select = db.prepare("select title,lang,body from news order by rand()") 20 | 21 | ps_select.execute 22 | n_k = Hash.new(0) 23 | p_ik = Hash.new{|h,k| h[k]=Hash.new(0)} 24 | ngramer = LanguageDetector::Ngramer.new(opt[:N]) 25 | while rs = ps_select.fetch 26 | title, lang, body = rs 27 | title.sub!(/ - [^\-]+$/, '') 28 | next if n_k[lang] >= opt[:training_size] 29 | n_k[lang] += 1 30 | text = LD::decode_entity(title + "\n" + body) 31 | 32 | grams = Hash.new 33 | ngramer.clear 34 | text.scan(/./) do |x| 35 | ngramer.append x 36 | ngramer.each do |z| 37 | grams[z] = 1 38 | end 39 | end 40 | grams.each do |gram, dummy| 41 | p_ik[gram][lang] += 1 42 | end 43 | end 44 | 45 | if opt[:csv] 46 | puts ","+LD::LANGLIST.join(',') 47 | p_ik.to_a.sort.each do |unigram,langs| 48 | langs.default = '' 49 | puts "'#{unigram.unpack('H*')[0]},#{LD::LANGLIST.map{|lang| langs[lang]}.join(',')}" 50 | end 51 | end 52 | 53 | keys = p_ik.keys 54 | keys.each do |chunk| 55 | langs = p_ik[chunk].keys 56 | langs.each do |lang| 57 | p_ik[chunk].delete lang if p_ik[chunk][lang] <= 2 58 | end 59 | p_ik.delete chunk if p_ik[chunk].size == 0 60 | end 61 | 62 | p_ik.default = 0 63 | open(LD::model_filename, 'w') do |f| 64 | JSON.dump([n_k, p_ik, opt[:N]], f) 65 | end 66 | 67 | -------------------------------------------------------------------------------- /lda/lda.r: -------------------------------------------------------------------------------- 1 | 2 | # Latent Dirichlet Allocation + collapsed Gibbs sampling 3 | # This code is available under the MIT License. 4 | # (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc. 
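# Collapsed Gibbs sampling, as implemented below: for each token (m, n) with word t,
# its current topic is removed from the count tables and a new topic is drawn from
#   p(z = k | rest)  proportional to  (n_z_t[k, t] + beta) / (rowSums(n_z_t)[k] + V * beta)
#                                   * (n_m_z[m, k] + alpha) / (sum(n_m_z[m, ]) + K * alpha),
# after which the counts are restored. theta and phi at the end are the normalized,
# smoothed count tables. (The two Japanese comments in the loop read "set up the
# z_{-i} state" and "sample from p(z | z_{-i})".)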
5 | 6 | K <- 50; 7 | I <- 200; 8 | 9 | filename <- "../data/gift_of_magi.txt"; 10 | argv <- commandArgs(T); 11 | if (length(argv)>0) filename <- commandArgs(T)[1]; 12 | text <- tolower(readLines(filename)); 13 | corpus <- strsplit(text, split="[[:blank:][:punct:]]", perl=T); 14 | 15 | words <- c(); 16 | words_id <- list(); 17 | docs <- list(); 18 | M <- 0; 19 | for(line in corpus) { 20 | doc <- c(); 21 | for (term in line) { 22 | if (term == "") next; 23 | if (is.null(words_id[[term]])) { 24 | words <- append(words, term); 25 | words_id[[term]] <- length(words); 26 | } 27 | doc <- append(doc, words_id[[term]]); 28 | } 29 | if (length(doc)==0) next; 30 | M <- M + 1; 31 | docs[[M]] <- doc; 32 | } 33 | V <- length(words); 34 | 35 | z_m_n <- list(); # M * N_m 36 | n_m_z <- matrix(numeric(M*K),M); 37 | n_z_t <- matrix(numeric(K*V),K); 38 | n_z <- numeric(K); 39 | n_terms <- 0; 40 | 41 | for(m in 1:M) { 42 | doc <- docs[[m]]; 43 | N_m <- length(doc); 44 | 45 | z_n <- sample(1:K, N_m, replace=T); 46 | z_m_n[[m]] <- z_n; 47 | for(n in 1:N_m) { 48 | z <- z_n[n]; 49 | t <- doc[n]; 50 | n_m_z[m,z] <- n_m_z[m,z] + 1; 51 | n_z_t[z,t] <- n_z_t[z,t] + 1; 52 | n_z[z] <- n_z[z] + 1; 53 | } 54 | n_terms <- n_terms + N_m; 55 | } 56 | 57 | alpha <- 0.001; 58 | beta <- 0.001; 59 | 60 | for(ita in 1:I) { 61 | #print("-------------------------------------------------------------------"); 62 | #print(ita); 63 | 64 | changes <- 0; 65 | for(m in 1:M) { 66 | doc <- docs[[m]]; 67 | N_m <- length(doc); 68 | for(n in 1:N_m) { 69 | t <- doc[n]; 70 | z <- z_m_n[[m]][n]; # z_i 71 | 72 | # z_{-i} の状況を作る 73 | n_m_z[m,z] <- n_m_z[m,z] - 1; 74 | n_z_t[z,t] <- n_z_t[z,t] - 1; 75 | n_z[z] <- n_z[z] - 1; 76 | 77 | # p(z|z_{-i}) からサンプリング 78 | denom_a <- sum(n_m_z[m,]) + K * alpha; 79 | denom_b <- rowSums(n_z_t) + V * beta; 80 | p_z <- (n_z_t[,t] + beta) / denom_b * (n_m_z[m,] + alpha) / denom_a; 81 | z_i <- sample(1:K, 1, prob=p_z); 82 | 83 | z_m_n[[m]][n] <- z_i; 84 | #print(p_z); 85 | #cat(sprintf("%d,%d: %d => %d\n", m, n, z, z_i)); 86 | if (z != z_i) changes <- changes + 1; 87 | 88 | n_m_z[m,z_i] <- n_m_z[m,z_i] + 1; 89 | n_z_t[z_i,t] <- n_z_t[z_i,t] + 1; 90 | n_z[z_i] <- n_z[z_i] + 1; 91 | } 92 | } 93 | cat(sprintf("%d: %d/%d\n", ita, changes, n_terms)); 94 | } 95 | 96 | phi <- matrix(numeric(K*V), K); 97 | theta <- matrix(numeric(M*K), M); 98 | for(m in 1:M) { 99 | theta_m <- n_m_z[m,] + alpha; 100 | theta[m,] <- theta_m / sum(theta_m); 101 | } 102 | for(z in 1:K) { 103 | phi_z <- n_z_t[z,] + beta; 104 | phi[z,] <- phi_z / sum(phi_z); 105 | } 106 | colnames(phi) <- words; 107 | 108 | options(digits=5, scipen=1, width=100); 109 | sink(format(Sys.time(), "lda%m%d%H%M.txt")); 110 | 111 | for(m in 1:M) { 112 | doc <- docs[[m]]; 113 | N_m <- length(doc); 114 | cat(sprintf("\n[corpus %d]-------------------------------------\n", m)); 115 | print(theta[m,]); 116 | for(n in 1:N_m) { 117 | cat(sprintf("%s : %d\n", words[[doc[n]]], z_m_n[[m]][n])); 118 | } 119 | } 120 | 121 | print(phi); 122 | sink(); 123 | 124 | -------------------------------------------------------------------------------- /lda/lda_test.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This code is available under the MIT License. 4 | # (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc. 
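# Benchmark harness: runs LDA_CVB0 (collapsed variational Bayes, zeroth-order) and
# LDA (collapsed Gibbs sampling), each with and without smart initialization, on a
# range of Brown corpus files, writing training-set perplexity per iteration and the
# top 20 words of every topic (every 10 iterations) to a timestamped log file.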
5 | 6 | import numpy 7 | 8 | class FileOutput: 9 | def __init__(self, file): 10 | import datetime 11 | self.file = file + datetime.datetime.now().strftime('_%m%d_%H%M%S.txt') 12 | def out(self, st): 13 | with open(self.file, 'a') as f: 14 | print >>f, st 15 | 16 | def lda_learning(f, LDA, smartinit, options, docs, voca, plimit=1): 17 | import time 18 | t0 = time.time() 19 | 20 | if options.seed != None: numpy.random.seed(options.seed) 21 | lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), smartinit) 22 | 23 | pre_perp = lda.perplexity() 24 | f.out("alg=%s smart_init=%s initial perplexity=%f" % (LDA.__name__, smartinit, pre_perp)) 25 | 26 | pc = 0 27 | for i in range(options.iteration): 28 | if i % 10==0: output_word_topic_dist(f, lda, voca) 29 | lda.inference() 30 | perp = lda.perplexity() 31 | f.out("-%d p=%f" % (i + 1, perp)) 32 | if pre_perp is not None: 33 | if pre_perp < perp: 34 | pc += 1 35 | if pc >= plimit: 36 | output_word_topic_dist(f, lda, voca) 37 | pre_perp = None 38 | else: 39 | pc = 0 40 | pre_perp = perp 41 | output_word_topic_dist(f, lda, voca) 42 | 43 | t1 = time.time() 44 | f.out("time = %f\n" % (t1 - t0)) 45 | 46 | def output_word_topic_dist(f, lda, voca): 47 | phi = lda.worddist() 48 | for k in range(lda.K): 49 | f.out("\n-- topic: %d" % k) 50 | for w in numpy.argsort(-phi[k])[:20]: 51 | f.out("%s: %f" % (voca[w], phi[k,w])) 52 | 53 | def main(): 54 | import optparse 55 | import vocabulary 56 | import lda 57 | import lda_cvb0 58 | parser = optparse.OptionParser() 59 | parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)", default="1:100") 60 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) 61 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) 62 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) 63 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 64 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 65 | parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) 66 | parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=1) 67 | (options, args) = parser.parse_args() 68 | 69 | corpus = vocabulary.load_corpus(options.corpus) 70 | voca = vocabulary.Vocabulary(options.stopwords) 71 | docs = [voca.doc_to_ids(doc) for doc in corpus] 72 | if options.df > 0: docs = voca.cut_low_freq(docs, options.df) 73 | 74 | f = FileOutput("lda_test") 75 | f.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(docs), len(voca.vocas), options.K, options.alpha, options.beta)) 76 | 77 | lda_learning(f, lda_cvb0.LDA_CVB0, False, options, docs, voca) 78 | lda_learning(f, lda_cvb0.LDA_CVB0, True, options, docs, voca) 79 | lda_learning(f, lda.LDA, False, options, docs, voca, 2) 80 | lda_learning(f, lda.LDA, True, options, docs, voca, 2) 81 | 82 | if __name__ == "__main__": 83 | main() 84 | 85 | -------------------------------------------------------------------------------- /lda/lda_test2.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | # This code is available under the MIT License. 4 | # (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc. 
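# Variant of lda_test.py with held-out evaluation: every 10th token of each document
# is moved to a test set (converted to (word, freq) pairs for the CVB0 models), and
# perplexity is measured on that held-out set instead of the training data.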
5 | 6 | import numpy 7 | 8 | class FileOutput: 9 | def __init__(self, file): 10 | import datetime 11 | self.file = file + datetime.datetime.now().strftime('_%m%d_%H%M%S.txt') 12 | def out(self, st): 13 | with open(self.file, 'a') as f: 14 | print >>f, st 15 | 16 | def lda_learning(f, LDA, smartinit, options, docs, test_docs, voca, plimit=1): 17 | import time 18 | t0 = time.time() 19 | 20 | if options.seed != None: numpy.random.seed(options.seed) 21 | lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), smartinit) 22 | 23 | pre_perp = lda.perplexity(test_docs) 24 | f.out("alg=%s smart_init=%s initial perplexity=%f" % (LDA.__name__, smartinit, pre_perp)) 25 | 26 | pc = 0 27 | for i in range(options.iteration): 28 | if i % 10==0: output_word_topic_dist(f, lda, voca) 29 | lda.inference() 30 | perp = lda.perplexity(test_docs) 31 | f.out("-%d p=%f" % (i + 1, perp)) 32 | if pre_perp is not None: 33 | if pre_perp < perp: 34 | pc += 1 35 | if pc >= plimit: 36 | output_word_topic_dist(f, lda, voca) 37 | pre_perp = None 38 | else: 39 | pc = 0 40 | pre_perp = perp 41 | output_word_topic_dist(f, lda, voca) 42 | 43 | t1 = time.time() 44 | f.out("time = %f\n" % (t1 - t0)) 45 | 46 | def output_word_topic_dist(f, lda, voca): 47 | phi = lda.worddist() 48 | for k in range(lda.K): 49 | f.out("\n-- topic: %d" % k) 50 | for w in numpy.argsort(-phi[k])[:20]: 51 | f.out("%s: %f" % (voca[w], phi[k,w])) 52 | 53 | def conv_word_freq(docs): 54 | result = [] 55 | for doc in docs: 56 | term_freq = dict() 57 | for w in doc: 58 | if w in term_freq: 59 | term_freq[w] += 1 60 | else: 61 | term_freq[w] = 1 62 | result.append(term_freq.items()) 63 | return result 64 | 65 | def main(): 66 | import optparse 67 | import vocabulary 68 | import lda 69 | import lda_cvb0 70 | parser = optparse.OptionParser() 71 | parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)", default="0:100") 72 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) 73 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) 74 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) 75 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 76 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 77 | parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) 78 | parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=10) 79 | (options, args) = parser.parse_args() 80 | 81 | corpus = vocabulary.load_corpus(options.corpus) 82 | voca = vocabulary.Vocabulary(options.stopwords) 83 | docs = [voca.doc_to_ids(doc) for doc in corpus] 84 | if options.df > 0: docs = voca.cut_low_freq(docs, options.df) 85 | train_docs = [[x for i, x in enumerate(doc) if i % 10 != 0] for doc in docs] 86 | test_docs = [[x for i, x in enumerate(doc) if i % 10 == 0] for doc in docs] 87 | test_docs_wf = conv_word_freq(test_docs) 88 | 89 | f = FileOutput("lda_test2") 90 | f.out("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(docs), len(voca.vocas), options.K, options.alpha, options.beta)) 91 | 92 | lda_learning(f, lda_cvb0.LDA_CVB0, False, options, train_docs, test_docs_wf, voca) 93 | lda_learning(f, lda_cvb0.LDA_CVB0, True, options, train_docs, test_docs_wf, voca) 94 | lda_learning(f, lda.LDA, False, options, train_docs, test_docs, voca, 2) 95 | lda_learning(f, 
lda.LDA, True, options, train_docs, test_docs, voca, 2) 96 | 97 | if __name__ == "__main__": 98 | main() 99 | 100 | -------------------------------------------------------------------------------- /lda/ldacvb0_cpp/README.md: -------------------------------------------------------------------------------- 1 | LDA CVB0 in C++ 2 | ====================== 3 | 4 | 5 | How to Build 6 | ------ 7 | 8 | git clone git://github.com/shuyo/iir.git 9 | cd iir/lda/ldacvb0_cpp 10 | git clone git://github.com/herumi/cybozulib.git 11 | MSBuild.exe ldacvb0.sln /p:Configuration=Release /p:Platform="Win32" 12 | 13 | 14 | Usage 15 | ------ 16 | 17 | On cygwin, 18 | 19 | curl http://nltk.googlecode.com/svn/trunk/nltk_data/packages/corpora/brown.zip -O 20 | unzip brown.zip 21 | Release/ldacvb0.exe brown/???? 22 | 23 | 24 | Options 25 | ------ 26 | 27 | + -k : topic size (20) 28 | + -i : number of learning iteration (100) 29 | + -a : parameter alpha (0.1) 30 | + -b : parameter beta (0.01) 31 | + -n : how many top to print in topic-word distribution (20) 32 | + -p : use corpus with POS annotation 33 | 34 | 35 | License 36 | ---------- 37 | 38 | Copyright © 2013 Nakatani Shuyo / Cybozu Labs, Inc 39 | 40 | Distributed under the [MIT License][mit]. 41 | 42 | [MIT]: http://www.opensource.org/licenses/mit-license.php 43 | -------------------------------------------------------------------------------- /lda/ldacvb0_cpp/ldacvb0.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 11.00 3 | # Visual Studio 2010 4 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ldacvb0", "ldacvb0\ldacvb0.vcxproj", "{7E7F27E1-8399-470B-A9F5-877EC4E8BA25}" 5 | EndProject 6 | Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "ldacvb0_test", "ldacvb0_test\ldacvb0_test.vcxproj", "{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}" 7 | ProjectSection(ProjectDependencies) = postProject 8 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25} = {7E7F27E1-8399-470B-A9F5-877EC4E8BA25} 9 | EndProjectSection 10 | EndProject 11 | Global 12 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 13 | Debug|Win32 = Debug|Win32 14 | Debug|x64 = Debug|x64 15 | Release|Win32 = Release|Win32 16 | Release|x64 = Release|x64 17 | EndGlobalSection 18 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 19 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|Win32.ActiveCfg = Debug|Win32 20 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|Win32.Build.0 = Debug|Win32 21 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|x64.ActiveCfg = Debug|x64 22 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Debug|x64.Build.0 = Debug|x64 23 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|Win32.ActiveCfg = Release|Win32 24 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|Win32.Build.0 = Release|Win32 25 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|x64.ActiveCfg = Release|x64 26 | {7E7F27E1-8399-470B-A9F5-877EC4E8BA25}.Release|x64.Build.0 = Release|x64 27 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|Win32.ActiveCfg = Debug|Win32 28 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|Win32.Build.0 = Debug|Win32 29 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|x64.ActiveCfg = Debug|x64 30 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Debug|x64.Build.0 = Debug|x64 31 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|Win32.ActiveCfg = Release|Win32 32 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|Win32.Build.0 = Release|Win32 33 | {CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|x64.ActiveCfg = Release|x64 34 | 
{CEDDBB13-7042-4FF1-8956-4F7ED0B0E1A7}.Release|x64.Build.0 = Release|x64 35 | EndGlobalSection 36 | GlobalSection(SolutionProperties) = preSolution 37 | HideSolutionNode = FALSE 38 | EndGlobalSection 39 | EndGlobal 40 | -------------------------------------------------------------------------------- /lda/ldacvb0_cpp/ldacvb0/ldacvb0.cpp: -------------------------------------------------------------------------------- 1 | /** 2 | @file 3 | @brief LDA CVB0 4 | Latent Dirichlet Allocation - Collapsed Variational Bayesian Estimation 5 | 6 | Copyright (C) 2013 Nakatani Shuyo / Cybozu Labs, Inc., all rights reserved. 7 | This code is licensed under the MIT license. 8 | */ 9 | 10 | #include 11 | #include 12 | #include 13 | #include "cybozu/string.hpp" 14 | #include 15 | #include 16 | #include "ldacvb0.hpp" 17 | 18 | void printnwk(const cybozu::ldacvb0::LDA_CVB0& model, const std::string& word) { 19 | auto voca = model.docs_.vocabularies; 20 | size_t w = voca.id(word); 21 | auto i = model.n_wk.begin()+ w * model.K_; 22 | 23 | std::cout << "[" << word << "]" << std::endl; 24 | std::cout << "( "; 25 | for (size_t k=0;k 32 | void printHighFreqWords(const cybozu::ldacvb0::Documents &docs) { 33 | for(auto df=docs.docfreq.begin(), dfend = docs.docfreq.end(); df!=dfend; ++df) { 34 | if (df->second > (int)M/2) { 35 | std::cout << docs.vocabularies.vocalist[df->first] << " " << df->second << std::endl; 36 | } 37 | } 38 | } 39 | 40 | 41 | int main(int argc, char* argv[]) { 42 | 43 | int K = 20, I = 100, N_WORDS = 20; 44 | size_t ldf = 1, udf = 0; // lower and upper limit of document frequency 45 | double alpha = 0.1; 46 | double beta = 0.01; 47 | bool isCorpusWithPos = false; 48 | 49 | std::vector files; 50 | 51 | for(int i=1;i=argc) goto ERROR_OPT_K; 56 | K = atoi(argv[i]); 57 | } else if (st == "-i") { 58 | if (++i>=argc) goto ERROR_OPT_I; 59 | I = atoi(argv[i]); 60 | } else if (st == "-n") { 61 | if (++i>=argc) goto ERROR_OPT_N; 62 | N_WORDS = atoi(argv[i]); 63 | } else if (st == "--ldf") { 64 | if (++i>=argc) goto ERROR_OPT_DF; 65 | ldf = atoi(argv[i]); 66 | } else if (st == "--udf") { 67 | if (++i>=argc) goto ERROR_OPT_DF; 68 | udf = atoi(argv[i]); 69 | } else if (st == "-a") { 70 | if (++i>=argc) goto ERROR_OPT_A; 71 | alpha = atof(argv[i]); 72 | } else if (st == "-b") { 73 | if (++i>=argc) goto ERROR_OPT_B; 74 | beta = atof(argv[i]); 75 | } else if (st == "-p") { 76 | isCorpusWithPos = true; 77 | } else { 78 | files.push_back(st); 79 | } 80 | } 81 | 82 | { 83 | cybozu::ldacvb0::Documents orgdocs(isCorpusWithPos?cybozu::ldacvb0::REXWORD_WITH_POS:cybozu::ldacvb0::REXWORD), docs; 84 | 85 | for(auto i=files.begin(), iend=files.end();i!=iend;++i) { 86 | try { 87 | cybozu::Mmap map(*i); 88 | const char *p = map.get(); 89 | const char *end = p + map.size(); 90 | orgdocs.add(p, end); 91 | } catch (std::exception& e) { 92 | printf("%s\n", e.what()); 93 | } 94 | } 95 | 96 | size_t M = orgdocs.size(); 97 | size_t orgV = orgdocs.vocabularies.size(); 98 | if (orgV <= 0) goto ERROR_NO_VOCA; 99 | 100 | if (udf == 0) udf = M / 2; 101 | truncDocFreq(docs, orgdocs, ldf, udf); 102 | 103 | size_t V = docs.vocabularies.size(); 104 | if (V <= 0) goto ERROR_NO_VOCA; 105 | 106 | std::cout << "M = " << M; 107 | std::cout << ", N = " << docs.N; 108 | std::cout << ", V = " << V << " / " << orgV << std::endl; 109 | std::cout << "K = " << K << ", alpha = " << alpha << ", beta = " << beta << std::endl; 110 | 111 | cybozu::ldacvb0::LDA_CVB0 model(K, V, alpha, beta, docs); 112 | 113 | for(int i=0;i ts(N_WORDS); 126 | size_t id = 0; 
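        // Scan topic k's column of the word-topic distribution (elements k, k+K, k+2K, ...)
        // keeping the N_WORDS highest-scoring word ids in `ts`; the resulting table is
        // printed as word, document frequency, corpus frequency and score.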
127 | for(auto i = worddist.begin() + k; id < V; i+=K, ++id) { 128 | ts.add(*i, id); 129 | } 130 | 131 | auto table = ts.getTable(); 132 | auto tend = table.end(); 133 | for (auto t = table.begin(); t!=tend; ++t) { 134 | const std::string& w = voca.vocalist[t->idx]; 135 | std::cout << w << "\t" << docs.docfreq[t->idx] << "\t" << voca.count(w) << "\t" << t->score << std::endl; 136 | } 137 | } 138 | 139 | /* 140 | auto i = worddist.begin(); 141 | //auto i = model.n_wk->begin(); 142 | for(size_t id = 0; id < V;++id) { 143 | const std::string& w = voca.vocalist[id]; 144 | std::cout << id << "\t" << w << "\t" << docs.docfreq[id] << "\t" << voca.count(w); 145 | for (int k=0;k0: 27 | corpus.append(doc) 28 | labels.append(label) 29 | f.close() 30 | return labelmap.keys(), corpus, labels 31 | 32 | class LLDA: 33 | def __init__(self, K, alpha, beta): 34 | #self.K = K 35 | self.alpha = alpha 36 | self.beta = beta 37 | 38 | def term_to_id(self, term): 39 | if term not in self.vocas_id: 40 | voca_id = len(self.vocas) 41 | self.vocas_id[term] = voca_id 42 | self.vocas.append(term) 43 | else: 44 | voca_id = self.vocas_id[term] 45 | return voca_id 46 | 47 | def complement_label(self, label): 48 | if not label: return numpy.ones(len(self.labelmap)) 49 | vec = numpy.zeros(len(self.labelmap)) 50 | vec[0] = 1.0 51 | for x in label: vec[self.labelmap[x]] = 1.0 52 | return vec 53 | 54 | def set_corpus(self, labelset, corpus, labels): 55 | labelset.insert(0, "common") 56 | self.labelmap = dict(zip(labelset, range(len(labelset)))) 57 | self.K = len(self.labelmap) 58 | 59 | self.vocas = [] 60 | self.vocas_id = dict() 61 | self.labels = numpy.array([self.complement_label(label) for label in labels]) 62 | self.docs = [[self.term_to_id(term) for term in doc] for doc in corpus] 63 | 64 | M = len(corpus) 65 | V = len(self.vocas) 66 | 67 | self.z_m_n = [] 68 | self.n_m_z = numpy.zeros((M, self.K), dtype=int) 69 | self.n_z_t = numpy.zeros((self.K, V), dtype=int) 70 | self.n_z = numpy.zeros(self.K, dtype=int) 71 | 72 | for m, doc, label in zip(range(M), self.docs, self.labels): 73 | N_m = len(doc) 74 | #z_n = [label[x] for x in numpy.random.randint(len(label), size=N_m)] 75 | z_n = [numpy.random.multinomial(1, label / label.sum()).argmax() for x in range(N_m)] 76 | self.z_m_n.append(z_n) 77 | for t, z in zip(doc, z_n): 78 | self.n_m_z[m, z] += 1 79 | self.n_z_t[z, t] += 1 80 | self.n_z[z] += 1 81 | 82 | def inference(self): 83 | V = len(self.vocas) 84 | for m, doc, label in zip(range(len(self.docs)), self.docs, self.labels): 85 | for n in range(len(doc)): 86 | t = doc[n] 87 | z = self.z_m_n[m][n] 88 | self.n_m_z[m, z] -= 1 89 | self.n_z_t[z, t] -= 1 90 | self.n_z[z] -= 1 91 | 92 | denom_a = self.n_m_z[m].sum() + self.K * self.alpha 93 | denom_b = self.n_z_t.sum(axis=1) + V * self.beta 94 | p_z = label * (self.n_z_t[:, t] + self.beta) / denom_b * (self.n_m_z[m] + self.alpha) / denom_a 95 | new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax() 96 | 97 | self.z_m_n[m][n] = new_z 98 | self.n_m_z[m, new_z] += 1 99 | self.n_z_t[new_z, t] += 1 100 | self.n_z[new_z] += 1 101 | 102 | def phi(self): 103 | V = len(self.vocas) 104 | return (self.n_z_t + self.beta) / (self.n_z[:, numpy.newaxis] + V * self.beta) 105 | 106 | def theta(self): 107 | """document-topic distribution""" 108 | n_alpha = self.n_m_z + self.labels * self.alpha 109 | return n_alpha / n_alpha.sum(axis=1)[:, numpy.newaxis] 110 | 111 | def perplexity(self, docs=None): 112 | if docs == None: docs = self.docs 113 | phi = self.phi() 114 | thetas = self.theta() 
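        # perplexity = exp( -(1/N) * sum over tokens w of log( sum_k theta[m, k] * phi[k, w] ) )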
115 | 116 | log_per = N = 0 117 | for doc, theta in zip(docs, thetas): 118 | for w in doc: 119 | log_per -= numpy.log(numpy.inner(phi[:,w], theta)) 120 | N += len(doc) 121 | return numpy.exp(log_per / N) 122 | 123 | def main(): 124 | parser = OptionParser() 125 | parser.add_option("-f", dest="filename", help="corpus filename") 126 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001) 127 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001) 128 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) 129 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 130 | (options, args) = parser.parse_args() 131 | if not options.filename: parser.error("need corpus filename(-f)") 132 | 133 | labelset, corpus, labels = load_corpus(options.filename) 134 | 135 | llda = LLDA(options.K, options.alpha, options.beta) 136 | llda.set_corpus(labelset, corpus, labels) 137 | 138 | for i in range(options.iteration): 139 | sys.stderr.write("-- %d " % (i + 1)) 140 | llda.inference() 141 | #print llda.z_m_n 142 | 143 | phi = llda.phi() 144 | for v, voca in enumerate(llda.vocas): 145 | #print ','.join([voca]+[str(x) for x in llda.n_z_t[:,v]]) 146 | print ','.join([voca]+[str(x) for x in phi[:,v]]) 147 | 148 | if __name__ == "__main__": 149 | main() 150 | -------------------------------------------------------------------------------- /lda/llda_nltk.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Labeled LDA using nltk.corpus.reuters as dataset 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
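# Experiment script: samples -n documents from nltk's Reuters corpus, uses each
# document's category set as its Labeled-LDA labels (plus the implicit "common"
# label added in LLDA.set_corpus), runs -i Gibbs iterations while reporting
# perplexity, and finally prints the top 20 words for each label's topic.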
7 | 8 | import sys, string, random, numpy 9 | from nltk.corpus import reuters 10 | from llda import LLDA 11 | from optparse import OptionParser 12 | 13 | parser = OptionParser() 14 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.001) 15 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001) 16 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=50) 17 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 18 | parser.add_option("-s", dest="seed", type="int", help="random seed", default=None) 19 | parser.add_option("-n", dest="samplesize", type="int", help="dataset sample size", default=100) 20 | (options, args) = parser.parse_args() 21 | random.seed(options.seed) 22 | numpy.random.seed(options.seed) 23 | 24 | idlist = random.sample(reuters.fileids(), options.samplesize) 25 | 26 | labels = [] 27 | corpus = [] 28 | for id in idlist: 29 | labels.append(reuters.categories(id)) 30 | corpus.append([x.lower() for x in reuters.words(id) if x[0] in string.ascii_letters]) 31 | reuters.words(id).close() 32 | labelset = list(set(reduce(list.__add__, labels))) 33 | 34 | 35 | llda = LLDA(options.K, options.alpha, options.beta) 36 | llda.set_corpus(labelset, corpus, labels) 37 | 38 | print "M=%d, V=%d, L=%d, K=%d" % (len(corpus), len(llda.vocas), len(labelset), options.K) 39 | 40 | for i in range(options.iteration): 41 | sys.stderr.write("-- %d : %.4f\n" % (i, llda.perplexity())) 42 | llda.inference() 43 | print "perplexity : %.4f" % llda.perplexity() 44 | 45 | phi = llda.phi() 46 | for k, label in enumerate(labelset): 47 | print "\n-- label %d : %s" % (k, label) 48 | for w in numpy.argsort(-phi[k])[:20]: 49 | print "%s: %.4f" % (llda.vocas[w], phi[k,w]) 50 | 51 | -------------------------------------------------------------------------------- /lda/twentygroups.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # 20 Groups Loader 5 | # - load data at http://kdd.ics.uci.edu/databases/20newsgroups/20newsgroups.html 6 | # This code is available under the MIT License. 7 | # (c)2012 Nakatani Shuyo / Cybozu Labs Inc. 
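# Loader walks the dataset directory, using each subdirectory name as a label and
# reading at most docs_threshold_each_label files per label; readTerms() lowercases
# alphabetic tokens, the built-in stop word list is optionally removed, and only
# words occurring at least freq_threshold times in the corpus are kept. main() then
# fits LDA_CVB0 on the resulting documents.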
8 | 9 | import os, codecs, re 10 | 11 | STOPWORDS = """ 12 | a b c d e f g h i j k l m n o p q r s t u v w x y z 13 | the of in and have to it was or were this that with is some on for so 14 | how you if would com be your my one not never then take for an can no 15 | but aaa when as out just from does they back up she those who another 16 | her do by must what there at very are am much way all any other me he 17 | something someone doesn his also its has into us him than about their 18 | may too will had been we them why did being over without these could 19 | out which only should even well more where after while anyone our now 20 | such under two ten else always going either each however non let done 21 | ever between anything before every same since because quite sure here 22 | nothing new don off still down yes around few many own 23 | go get know think like make say see look use said 24 | """ 25 | 26 | def readTerms(target): 27 | with codecs.open(target, 'rb', 'latin1') as f: 28 | text = re.sub(r'^(.+\n)*\n', '', f.read()) 29 | return [w.group(0).lower() for w in re.finditer(r'[A-Za-z]+', text)] 30 | 31 | class Loader: 32 | def __init__(self, dirpath, freq_threshold=1, docs_threshold_each_label=100, includes_stopwords=False): 33 | if includes_stopwords: 34 | stopwords = set(re.split(r'\s', STOPWORDS)) 35 | else: 36 | stopwords = [] 37 | 38 | self.resourcenames = [] 39 | self.labels = [] 40 | self.label2id = dict() 41 | self.doclabelids = [] 42 | vocacount = dict() 43 | tempdocs = [] 44 | 45 | dirlist = os.listdir(dirpath) 46 | for label in dirlist: 47 | path = os.path.join(dirpath, label) 48 | if os.path.isdir(path): 49 | label_id = len(self.labels) 50 | self.label2id[label] = label_id 51 | self.labels.append(label) 52 | 53 | filelist = os.listdir(path) 54 | for i, s in enumerate(filelist): 55 | if i >= docs_threshold_each_label: break 56 | 57 | self.resourcenames.append(os.path.join(label, s)) 58 | self.doclabelids.append(label_id) 59 | 60 | wordlist = readTerms(os.path.join(path, s)) 61 | tempdocs.append(wordlist) 62 | 63 | for w in wordlist: 64 | if w in vocacount: 65 | vocacount[w] += 1 66 | else: 67 | vocacount[w] = 1 68 | 69 | self.vocabulary = [] 70 | self.vocabulary2id = dict() 71 | for w in vocacount: 72 | if w not in stopwords and vocacount[w] >= freq_threshold: 73 | self.vocabulary2id[w] = len(self.vocabulary) 74 | self.vocabulary.append(w) 75 | 76 | self.docs = [] 77 | for doc in tempdocs: 78 | self.docs.append([self.vocabulary2id[w] for w in doc if w in self.vocabulary2id]) 79 | 80 | def main(): 81 | import optparse 82 | parser = optparse.OptionParser() 83 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.1) 84 | parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.001) 85 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=10) 86 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=20) 87 | parser.add_option("--seed", dest="seed", type="int", help="random seed", default=None) 88 | parser.add_option("--word_freq_threshold", dest="word_freq_threshold", type="int", default=3) 89 | parser.add_option("--docs_threshold_each_label", dest="docs_threshold_each_label", type="int", default=100) 90 | parser.add_option("-d", dest="dir", help="directory of 20-newsgroups dataset", default="./20groups/mini_newsgroups/") 91 | (options, args) = parser.parse_args() 92 | import numpy 93 | numpy.random.seed(options.seed) 94 | 95 | corpus = 
Loader(options.dir, options.word_freq_threshold, options.docs_threshold_each_label, True) 96 | V = len(corpus.vocabulary) 97 | 98 | import lda_cvb0 as lda 99 | model = lda.LDA_CVB0(options.K, options.alpha, options.beta, corpus.docs, V, True) 100 | print("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus.docs), V, options.K, options.alpha, options.beta)) 101 | 102 | pre_perp = model.perplexity() 103 | print("initial perplexity=%f" % pre_perp) 104 | for i in range(options.iteration): 105 | model.inference() 106 | perp = model.perplexity() 107 | print("-%d p=%f" % (i + 1, perp)) 108 | lda.output_word_topic_dist(model, corpus.vocabulary) 109 | 110 | if __name__ == "__main__": 111 | main() 112 | 113 | -------------------------------------------------------------------------------- /lib/extract_gutenberg.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | module Gutenberg 4 | def self.extract(text) 5 | text = text.gsub(/[ \r]+$/, "") + "\n\n" 6 | $stderr.puts "Warning: HTML-formed comment in #{path}"if text.gsub!(/<-- .+? -->/m, "") 7 | $stderr.puts "Warning: HTML tag in #{path}"if text.gsub!(/.+?<\/HTML>/mi, "") 8 | 9 | negative_phrase = /http|internet|project gutenberg|mail|ocr/i 10 | separator = /^(?:.+?END\*{1,2}|\*{3} START OF THE PROJECT GUTENBERG E(?:BOOK|TEXT).*? \*{3}|\*{9}END OF .+?|\*{3} END OF THE PROJECT GUTENBERG E(?:BOOK|TEXT).+?|\*{3}START\*.+\*START\*{3}|\**\s*This file should be named .+|\*{5}These [eE](?:Books|texts) (?:Are|Were) Prepared By .+\*{5})$/ 11 | 12 | while text =~ separator 13 | pre, post = $`, $' 14 | text = if pre.length > post.length*3 then 15 | pre 16 | elsif post.length > pre.length*3 then 17 | post 18 | elsif pre.scan(negative_phrase).length < post.scan(negative_phrase).length 19 | pre 20 | else 21 | post 22 | end 23 | end 24 | 25 | text.gsub!(/^(?:Executive Director's Notes:|\[?Transcriber's Note|PREPARER'S NOTE|\[Redactor's note|\{This e-text has been prepared|As you may be aware, Project Gutenberg has been involved with|[\[\*]Portions of this header are|A note from the digitizer|ETEXT EDITOR'S BOOKMARKS|\[NOTE:|\[Project Gutenberg is|INFORMATION ABOUT THIS E-TEXT EDITION\n+|If you find any errors|This electronic edition was|Notes about this etext:|A request to all readers:|Comments on the preparation of the E-Text:|The base text for this edition has been provided by).+?\n(?:[\-\*]+)?\n\n/mi, "") 26 | text.gsub!(/^[\[\n](?:[^\[\]\n]+\n)*[^\n]*(?:Project\sGutenberg|\setext\s|\s[A-Za-z0-9]+@[a-z\-]+\.(?:com|net))[^\n]*(?:\n[^\[\]\n]+)*[\]\n]$/i, "") 27 | text.gsub!(/\{The end of etext of .+?\}/, "") 28 | text = text.strip + "\n\n" 29 | 30 | text.gsub!(/^(?:(?:End )?(?:of ?)?(?:by |This |The )?Project Gutenberg(?:'s )?(?:Etext)?|This (?:Gutenberg )?Etext).+?\n\n/mi, "") 31 | text.gsub!(/^(?:\(?E?-?(?:text )?(?:prepared|Processed|scanned|Typed|Produced|Edited|Entered|Transcribed|Converted|created) by|Transcribed from|Scanning and first proofing by|Scanned and proofed by|This e-text|This EBook of|Scanned with|This Etext created by|This eBook was (?:produced|updated) by|Image files scanned in by|\[[^\n]*mostly scanned by|This text was prepared by).+?\n\n/mi, "") 32 | 33 | if text=~/gutenberg|\setext\s|scanner|David Reed/i 34 | $stderr.puts "Warning: remain '#{$&.strip}'" 35 | elsif text=~/[^\s\*]@[^\s]+\./ 36 | $stderr.puts "Warning: maybe remain mail adress" 37 | elsif text.length < 1024 38 | $stderr.puts "Warning: too small body" 39 | end 40 | 41 | text 42 | end 43 | end 44 | 45 | puts 
Gutenberg.extract(ARGF.read) if $0 == __FILE__ 46 | -------------------------------------------------------------------------------- /lib/infinitive.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'rubygems' 4 | require 'stemmer' 5 | require 'linguistics' 6 | Linguistics::use( :en ) 7 | 8 | class Infinitive 9 | def initialize(inflist_file = nil, wordbook_file = nil) 10 | dir = caller(0)[0].sub(/\/[^\/]*:\d+:.+$/,"") 11 | inflist_file = "#{dir}/inflist.txt" unless inflist_file 12 | wordbook_file = "#{dir}/wordbook.txt" unless wordbook_file 13 | 14 | @inflist = Hash.new 15 | open(inflist_file) do |f| 16 | while line = f.gets 17 | @inflist[$1]=$2 if line =~ /^(.+)\t(.+)\n/ 18 | end 19 | end 20 | @infcache = Hash.new 21 | 22 | @wordbook = Hash.new 23 | open(wordbook_file) do |f| 24 | while line = f.gets 25 | @wordbook[line.chomp.stem.downcase]=0 if line !~ /^\s+$|^\s*#/ 26 | end 27 | end 28 | end 29 | 30 | def inf(src) 31 | return @infcache[src] if @infcache.key?(src) 32 | st = @inflist[src] || src.en.infinitive 33 | @infcache[src] = (if st == "" then src else st end).stem 34 | end 35 | 36 | def infinitive(word) 37 | st = word2 = word.downcase 38 | st = word2.stem 39 | if @wordbook.key?(st) 40 | st 41 | else 42 | inf(word2) || st 43 | end 44 | end 45 | end 46 | 47 | -------------------------------------------------------------------------------- /lib/inflist.txt: -------------------------------------------------------------------------------- 1 | women woman 2 | feet foot 3 | gentlemen gentleman 4 | policemen policeman 5 | -------------------------------------------------------------------------------- /lr/lr.r: -------------------------------------------------------------------------------- 1 | # Multi-class Logistic Regression + Stochastic Gradient Descent for R 2 | # (c)2011 Nakatani Shuyo / Cybozu Labs, Inc. 3 | # This code is available under the MIT Licence. 
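# Model sketch (as implemented below): for a feature vector phi(x) and an
# M x K weight matrix w,
#   y(phi, w)     = softmax(phi %*% w)      # class posterior probabilities
#   En(phi, t, w) = -log(sum(y * t))        # per-sample cross-entropy error
#   dEn           = outer(phi, y - t)       # its gradient with respect to w
# Training is stochastic gradient descent, w <- w - eta * dEn, with eta
# starting at 0.1 and multiplied by 0.95 after every pass until it drops
# below 1e-4; linear, quadratic and RBF feature maps are then compared on iris.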
4 | 5 | # コマンドライン処理 6 | commandline <- commandArgs(TRUE) 7 | chart <- "--chart" %in% commandline 8 | i <- match("-i", commandline) 9 | if (is.na(i)) { 10 | I <- 1 11 | } else { 12 | I <- as.numeric(commandline[i + 1]) 13 | } 14 | 15 | # iris dataset 16 | xlist <- scale(iris[1:4]) 17 | tlist <- cbind( 18 | ifelse(iris[5]=="setosa",1,0), 19 | ifelse(iris[5]=="versicolor",1,0), 20 | ifelse(iris[5]=="virginica",1,0) 21 | ) 22 | N <- nrow(xlist) # データ件数 23 | 24 | # 事後確率 25 | y <- function(phi, w) { 26 | y <- c(phi %*% w) 27 | y <- exp(y - max(y)) # exp の中身から、その最大値を引く(オーバーフロー対策) 28 | return(y / sum(y)) 29 | } 30 | 31 | # 誤差関数&勾配 32 | En <- function(phi, t, w) -log(sum(y(phi, w) * t)) 33 | dEn <- function(phi, t, w) outer(phi, y(phi, w) - t) 34 | 35 | inference <- function(title, xlist, tlist, phi) { 36 | PHI <- t(apply(xlist, 1, phi)) # NxM - design matrix 37 | M <- ncol(PHI) # 特徴数(特徴空間の次元) 38 | K <- ncol(tlist) # クラス数 39 | 40 | for (i in 1:I) { 41 | # 重み初期化 42 | w <- matrix(rnorm(M * K), M) 43 | 44 | eta <- 0.1 # 学習率 45 | while (eta > 0.0001) { 46 | for(n in sample(N)) { 47 | w <- w - eta * dEn(PHI[n,], tlist[n,], w) # 確率的勾配降下法 48 | } 49 | eta <- eta * 0.95 50 | } 51 | 52 | ylist <- t(apply(PHI, 1, function(phi) y(phi, w))) 53 | error <- sum(sapply(1:nrow(PHI), function(n) En(PHI[n,], tlist[n,], w))) 54 | cat(sprintf("%s: error=%.3f", title, error), "\n") 55 | 56 | # 可視化 57 | if (chart) { 58 | pairs(xlist, col=rgb(ylist), main=title) 59 | plot(xlist[,c(1,2)], 60 | col=rgb(ylist), 61 | pch=(tlist %*% c(17,16,22)), 62 | main=title, 63 | sub=sprintf("Negative Log Likelihood = %.3f", error) 64 | ) 65 | } 66 | } 67 | 68 | return(w) 69 | } 70 | 71 | if (chart) png(width=640, height=640) 72 | 73 | # 線形特徴関数 74 | phi <- function(x) c(1, x[1], x[2], x[3], x[4]) 75 | w <- inference("Linear Features", xlist, tlist, phi) 76 | 77 | # 二次特徴関数 78 | phi <- function(x) c(1, x[1], x[2], x[3], x[4], 79 | x[1]*x[1], x[1]*x[2], x[1]*x[3], x[1]*x[4], x[2]*x[2], 80 | x[2]*x[3], x[2]*x[4], x[3]*x[3], x[3]*x[4], x[4]*x[4]) 81 | w <- inference("Quadratic Features", xlist, tlist, phi) 82 | 83 | # RBF 特徴関数 84 | for (s in 1:10) { 85 | phi <- function(x) { 86 | c <- seq(-2.5,2.5,by=1) 87 | d <- outer(c,x,"-")^2 88 | return(exp(-c(0, outer(c(outer(c(outer(d[,1],d[,2],"+")),d[,3],"+")),d[,4],"+"))/s)) 89 | } 90 | w <- inference(sprintf("RBF Features (s=%d)", s), xlist, tlist, phi) 91 | } 92 | 93 | if (chart) dev.off() 94 | 95 | 96 | -------------------------------------------------------------------------------- /misc/linear_regression.r: -------------------------------------------------------------------------------- 1 | 2 | polynomial_basis_func <- function(M) { 3 | lapply(1:M, function(u){ u1=u; function(x) x^(u1-1) }) 4 | } 5 | gaussian_basis_func <- function(M, has_bias=T, s=0.1) { 6 | phi <- c() 7 | if (has_bias) phi <- function(x) 0*x+1 # bias 8 | u_i <- seq(0,1,length=ifelse(has_bias, M-1, M)) 9 | append(phi, lapply(u_i, function(u){ u1=u; function(x) exp(-(x-u1)^2/(2*s*s)) })) 10 | } 11 | sigmoid_basis_func <- function(M, has_bias=T, s=0.1) { 12 | phi <- c() 13 | if (has_bias) phi <- function(x) 0*x+1 # bias rep(1, length(x)) 14 | u_i <- seq(0,1,length=ifelse(has_bias, M-1, M)) 15 | append(phi, lapply(u_i, function(u){ u1=u; function(x) 1/(1+exp(-(x-u1)/s)) } )) 16 | } 17 | 18 | xlist <- seq(0, 1, length=250) 19 | tlist <- sin(2*pi*xlist)+rnorm(length(xlist), sd=0.2) 20 | D <- data.frame(x=xlist, t=tlist) 21 | 22 | # PRML's synthetic data set 23 | curve_fitting <- data.frame( 24 | 
x=c(0.000000,0.111111,0.222222,0.333333,0.444444,0.555556,0.666667,0.777778,0.888889,1.000000), 25 | t=c(0.349486,0.830839,1.007332,0.971507,0.133066,0.166823,-0.848307,-0.445686,-0.563567,0.261502) 26 | ) 27 | 28 | calc_evidence <- function(phi, D, alpha=2, beta=25, graph=NULL) { 29 | M <- length(phi) 30 | N <- length(D$x) 31 | PHI <- sapply(phi, function(f)f(D$x)) 32 | 33 | if (!is.null(graph)) { 34 | plot(graph, lty=2, col="blue", xlim=c(0,1), ylim=c(-1.1,1.1), ylab="") 35 | par(new=T) 36 | plot(D, xlim=c(0,1), ylim=c(-1.1,1.1), xlab="", ylab="") 37 | } 38 | 39 | if (beta=="ml") { 40 | w_ML <- solve(t(PHI) %*% PHI) %*% t(PHI) %*% D$t 41 | loss_ML <- D$t - PHI %*% w_ML 42 | beta <- N / sum(loss^2) 43 | if (!is.null(graph)) { 44 | par(new=T) 45 | plot( function(x) sapply(phi, function(f)f(x)) %*% w_ML , col="red", xlim=c(0,1), ylim=c(-1.1,1.1), ylab="") 46 | } 47 | } 48 | 49 | A <- alpha * diag(M) + beta * t(PHI) %*% PHI # equal to S_N(PRML 3.54) 50 | m_N <- beta * solve(A) %*% t(PHI) %*% D$t 51 | loss_m_N <- D$t - PHI %*% m_N 52 | E_m_N <- beta / 2 * sum(loss_m_N^2) + alpha / 2 * sum(m_N^2) 53 | 54 | if (!is.null(graph)) { 55 | par(new=T) 56 | plot( function(x) sapply(phi, function(f)f(x)) %*% m_N, xlim=c(0,1), ylim=c(-1.1,1.1), ylab="") 57 | } 58 | 59 | # model evidence 60 | c(M/2*log(alpha) + N/2*log(beta) - E_m_N - 1/2*log(det(A)) - N/2*log(2*pi), beta) 61 | } 62 | 63 | 64 | a<-sapply(1:9, function(n) calc_evidence(polynomial_basis_func(n), curve_fitting, alpha=5e-3)) 65 | 66 | orig_func <- function(x)sin(2*pi*x) 67 | calc_evidence(gaussian_basis_func(9, F, s=0.37), D, beta="ml", alpha=2, graph=orig_func) 68 | calc_evidence(polynomial_basis_func(4), curve_fitting, alpha=5e-3, beta="ml", graph=orig_func) 69 | 70 | calc_evidence(gaussian_basis_func(6, F), D0, alpha=2, beta="ml", graph=orig_func) 71 | 72 | # ---- 73 | 74 | > a<-sapply(1:9, function(n) calc_evidence(polynomial_basis_func(n), curve_fitting, alpha=5e-3, beta="ml")) 75 | > data.frame(M=0:8, evidence=a[1,], beta_ML=a[2,]) 76 | M evidence beta_ML 77 | 1 0 -13.60463 2.649926 78 | 2 1 -14.48098 4.680463 79 | 3 2 -16.60761 4.752649 80 | 4 3 -14.38654 28.600038 81 | 5 4 -14.20562 28.651286 82 | 6 5 -15.12706 29.206330 83 | 7 6 -15.86874 30.294868 84 | 8 7 -16.43925 30.954700 85 | 9 8 -17.37590 35.353486 86 | > a<-sapply(1:9, function(n) calc_evidence(polynomial_basis_func(n), curve_fitting, alpha=5e-3, beta=11.1)) 87 | > data.frame(M=0:8, evidence=a[1,], beta_ML=a[2,]) 88 | M evidence beta_ML 89 | 1 0 -23.10268 11.1 90 | 2 1 -17.88419 11.1 91 | 3 2 -20.30879 11.1 92 | 4 3 -13.93411 11.1 93 | 5 4 -13.71294 11.1 94 | 6 5 -14.35868 11.1 95 | 7 6 -14.98120 11.1 96 | 8 7 -15.48112 11.1 97 | 9 8 -15.90587 11.1 98 | 99 | # ---- 100 | 101 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n), D0, alpha=2, beta="ml")) 102 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 103 | 104 | n evidence beta_ML 105 | 1 1 -24.15372 1.991818 106 | 2 2 -26.00534 2.080833 107 | 3 3 -28.32203 2.199902 108 | 4 4 -28.98627 2.204309 109 | 5 5 -141.40382 11.622494 110 | 6 6 -262.51315 20.230023 111 | 7 7 -531.38789 39.600405 112 | 8 8 -558.69096 41.400126 113 | 9 9 -566.97144 41.904756 114 | 115 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n, F), D0, alpha=2, beta="ml")) 116 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 117 | n evidence beta_ML 118 | 1 1 -23.54826 2.069294 119 | 2 2 -27.36097 2.194946 120 | 3 3 -28.18806 2.194992 121 | 4 4 -139.53185 11.503610 122 | 5 5 -260.09231 20.131365 123 | 6 6 -523.95049 39.104824 
124 | 7 7 -555.47110 41.216737 125 | 8 8 -564.31398 41.761091 126 | 9 9 -587.98582 43.413693 127 | 128 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n, F), curve_fitting, alpha=2, beta="ml")) 129 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 130 | n evidence beta_ML 131 | 1 1 -9.947638 2.844855 132 | 2 2 -10.661711 2.849207 133 | 3 3 -10.915992 2.895102 134 | 4 4 -32.215789 9.113815 135 | 5 5 -84.841899 21.990299 136 | 6 6 -84.348342 21.365737 137 | 7 7 -110.852386 27.425745 138 | 8 8 -119.979257 29.427823 139 | 9 9 -131.627930 32.218532 140 | 141 | a<-sapply(1:9, function(n) calc_evidence(gaussian_basis_func(n), curve_fitting, alpha=2, beta="ml")) 142 | data.frame(n=1:9, evidence=a[1,], beta_ML=a[2,]) 143 | n evidence beta_ML 144 | 1 1 -8.439416 2.649926 145 | 2 2 -9.732442 2.903392 146 | 3 3 -10.598375 2.953347 147 | 4 4 -11.048473 2.954229 148 | 5 5 -32.992820 9.370027 149 | 6 6 -93.909226 24.235243 150 | 7 7 -85.088391 21.481192 151 | 8 8 -132.661228 32.884408 152 | 9 9 -145.904973 35.879891 153 | 154 | -------------------------------------------------------------------------------- /misc/linear_regression.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/misc/linear_regression.xlsx -------------------------------------------------------------------------------- /misc/zipf.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | list = " etaonisrhdlucmfwgypbvkxjqz".split(//) 4 | prob = if ARGV[0] == "unif" 5 | [0.2] + Array.new(26){ 0.8 / 26.0 } 6 | elsif ARGV[0] == "linear" 7 | (1..27).map{|i| (28.0 - i) / (14 * 27) } 8 | elsif ARGV[0] == "zipf" 9 | (1..27).map{|i| 0.256973175704523 / i } 10 | else 11 | [0.1918182,0.1041442,0.0729357,0.0651738,0.0596302,0.0564513,0.0558094,0.0515760,0.0497563, 12 | 0.0492888,0.0349835,0.0331490,0.0225134,0.0217339,0.0202124,0.0197881,0.0171272,0.0158610, 13 | 0.0145984,0.0137645,0.0124248,0.0082903,0.0050529,0.0013692,0.0009033,0.0008606,0.0007836] 14 | end 15 | name = ARGV[0] || "orig" 16 | size = (ARGV[1] || 5000000).to_i 17 | 18 | prob_sum = 0 19 | cum_prob = [] 20 | prob.each do |x| 21 | cum_prob << prob_sum 22 | prob_sum += x 23 | end 24 | cum_prob << 1.0 25 | 26 | module R;def self.rand;Kernel::rand;end;end 27 | random = Random.new rescue R 28 | 29 | map = Hash.new(0) 30 | word = "" 31 | while true 32 | r = random.rand 33 | l = 0 34 | h = prob.size 35 | while h>l+1 36 | m = (h+l)/2 37 | if r < cum_prob[m] 38 | h = m 39 | else 40 | l = m 41 | end 42 | end 43 | x = list[l] 44 | 45 | if x == " " 46 | if word.length > 0 47 | map[word] += 1 48 | break if map.size == size 49 | end 50 | word = "" 51 | else 52 | word += x 53 | end 54 | end 55 | 56 | open("#{name}#{map.size/1000}k.txt", "w") do |f| 57 | f.puts "rank,word,freq,rank*freq,freq_freq" 58 | freq = rank = 1 59 | map.to_a.sort_by{|x| -x[1]}.each_with_index do |x, r| 60 | if freq != x[1] 61 | f.puts "#{rank},#{x[0]},#{x[1]},#{rank*x[1]},#{r-rank+2}" if rank>0 62 | freq = x[1] 63 | rank = r + 2 64 | end 65 | end 66 | end 67 | 68 | -------------------------------------------------------------------------------- /neural/adult.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | #OUTPUT_CODE = true 5 | LOGFILE = "adult.log" 6 | 7 | # training data 8 | CLZ = {"<=50K"=>0, ">50K"=>1} 9 | dataset = [] 10 | categories = 
(1..14).map{[]} 11 | open("adult.data") do |f| 12 | while line = f.gets 13 | buf = line.chomp.split(',').map{|x| x.strip} 14 | clz = buf.pop 15 | break unless clz 16 | buf.each_with_index do |x, i| 17 | if x !~ /^[0-9]+$/ 18 | categories[i] << x if !categories[i].include?(x) 19 | end 20 | end 21 | dataset << [buf, CLZ[clz]] 22 | end 23 | end 24 | 25 | dataset.each_with_index do |data, idx| 26 | vector = [] 27 | data[0].each_with_index do |x, i| 28 | if categories[i].length > 0 29 | one_of_k = [0] * categories[i].length 30 | one_of_k[categories[i].index(x)] = 1 31 | vector.concat(one_of_k) 32 | else 33 | vector << x.to_f 34 | end 35 | end 36 | dataset[idx] = [vector, data[1]] 37 | end 38 | 39 | #dataset = dataset[0, 100] 40 | 41 | 42 | # units 43 | in_units = (1..dataset[0][0].length).map{|i| Unit.new("x#{i}")} 44 | hiddenunits1 = (1..20).map{|i| TanhUnit.new("z#{i}")} 45 | #hiddenunits2 = (1..30).map{|i| TanhUnit.new("w#{i}")} 46 | out_unit = [SigUnit.new("y1")] 47 | 48 | # network 49 | network = Network.new(:error_func=>ErrorFunction::CrossEntropy) 50 | network.in = in_units 51 | network.link in_units, hiddenunits1 52 | network.link hiddenunits1, out_unit 53 | network.out = out_unit 54 | 55 | open(LOGFILE, "a") {|f| f.puts "==== start (#{Time.now})" } 56 | 57 | max_correct = 0 58 | 10.times do |trial| 59 | network.weights.init_parameters 60 | 61 | 100.times do |tau| 62 | eta = if tau<10 then 0.1 elsif tau<50 then 0.05 elsif tau<100 then 0.01 else 0.005 end 63 | t = Time.now.to_i 64 | dataset.sort{rand}.each do |data| 65 | grad = network.gradient_E(data[0], [data[1]]) 66 | network.weights.descent eta, grad 67 | #e += network.error_function(data[0], [data[1]]) 68 | end 69 | puts "#{tau}: #{Time.now.to_i - t}s" 70 | end 71 | 72 | correct = 0 73 | dataset.each do |data| 74 | y = network.apply(*data[0]) 75 | correct += 1 if (data[1]==0 && y[0]<0.5) || (data[1]==1 && y[0]>0.5) 76 | end 77 | 78 | #log 79 | log = "#{trial+1}: correct = #{correct}, mistake = #{dataset.length - correct}, rate = #{(10000.0*correct/dataset.length).to_i/100.0}" 80 | puts log 81 | open(LOGFILE, "a") do |f| 82 | f.puts log 83 | if max_correct < correct 84 | max_correct = correct 85 | f.puts network.weights.dump 86 | end 87 | end 88 | end 89 | open(LOGFILE, "a") {|f| f.puts "==== end (#{Time.now})" } 90 | 91 | -------------------------------------------------------------------------------- /neural/classification.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | require "neural.rb" 3 | 4 | data = [] 5 | open("classification.txt") do |f| 6 | while line = f.gets 7 | x1, x2, t = line.split 8 | data << [[x1.to_f, x2.to_f], [t.to_f]] 9 | end 10 | end 11 | 12 | # units 13 | in_units = [Unit.new("x1"), Unit.new("x2")] 14 | hiddenunits = (1..6).map{|i| TanhUnit.new("z1#{i}")} 15 | hiddenunits2 = (1..6).map{|i| TanhUnit.new("z2#{i}")} 16 | out_unit = [SigUnit.new("y1")] 17 | 18 | # network 19 | network = Network.new(:error_func=>ErrorFunction::CrossEntropy, :code_generate=>true) 20 | network.in = in_units 21 | network.link in_units, hiddenunits 22 | network.link hiddenunits, hiddenunits2 23 | network.link hiddenunits2, out_unit 24 | network.out = out_unit 25 | 26 | eta = 0.1 27 | sum_e = 999999 28 | 1000.times do |tau| 29 | =begin 30 | s = 0 31 | data.each do |d| 32 | s += network.error_function(d[0], d[1]) 33 | end 34 | puts "sum of errors: #{tau} => #{s}" 35 | break if s > sum_e 36 | sum_e = s 37 | =end 38 | data.sort{rand}.each do |d| 39 | grad = 
network.gradient_E(d[0], d[1]) 40 | network.weights.descent eta, grad 41 | end 42 | end 43 | network.weights.dump 44 | -------------------------------------------------------------------------------- /neural/classification.txt: -------------------------------------------------------------------------------- 1 | 1.208985 0.421448 0.000000 2 | 0.504542 -0.285730 1.000000 3 | 0.630568 1.054712 0.000000 4 | 1.056364 0.601873 0.000000 5 | 1.095326 -1.447579 1.000000 6 | -0.210165 0.000284 1.000000 7 | -0.367151 -1.255189 1.000000 8 | 0.868013 -1.063465 0.000000 9 | 1.704441 -0.644833 0.000000 10 | 0.565619 -1.637858 1.000000 11 | 0.598389 -1.477808 0.000000 12 | 0.580927 -0.783898 1.000000 13 | 1.183283 -1.797936 0.000000 14 | 0.331843 -1.869486 0.000000 15 | -0.051195 0.989475 1.000000 16 | 2.427090 0.173557 0.000000 17 | 1.603778 -0.030691 1.000000 18 | 1.286206 -1.079916 0.000000 19 | -1.243951 1.005355 1.000000 20 | 1.181748 1.523744 0.000000 21 | 0.896222 1.899568 0.000000 22 | -0.366207 -0.664987 1.000000 23 | -0.078800 1.007368 1.000000 24 | -1.351435 1.766786 1.000000 25 | -0.220423 -0.442405 1.000000 26 | 0.836253 -1.927526 0.000000 27 | 0.039899 -1.435842 0.000000 28 | 0.256755 0.946722 0.000000 29 | 0.974836 -0.944967 0.000000 30 | 0.705256 -2.618644 0.000000 31 | 0.738188 -1.666242 0.000000 32 | 1.245931 -2.200826 0.000000 33 | 0.297604 0.159463 1.000000 34 | -2.210680 1.195815 1.000000 35 | -0.872624 -0.131252 1.000000 36 | 1.112762 -0.653777 0.000000 37 | 1.123989 -1.347470 0.000000 38 | 0.750833 0.811870 0.000000 39 | -0.183497 1.416116 1.000000 40 | 0.287582 -1.342512 0.000000 41 | 1.092719 1.380559 0.000000 42 | 0.719502 1.594624 0.000000 43 | -1.016254 0.651607 1.000000 44 | 0.379677 2.802498 0.000000 45 | 0.150675 0.474679 1.000000 46 | -0.116477 0.437483 1.000000 47 | 1.122528 0.698541 0.000000 48 | 0.953551 1.088368 0.000000 49 | -0.000228 0.347187 1.000000 50 | 0.505024 0.455407 1.000000 51 | 0.113753 0.559572 1.000000 52 | -0.677993 0.322716 1.000000 53 | 1.114811 -0.735813 0.000000 54 | 0.344114 -1.770137 0.000000 55 | 0.684242 -0.636027 1.000000 56 | -0.684629 -0.300568 1.000000 57 | -0.362677 -0.669101 1.000000 58 | 0.604984 -1.558581 0.000000 59 | 0.514202 -0.225827 0.000000 60 | 0.227014 -1.579346 1.000000 61 | 1.044068 -1.491114 0.000000 62 | 0.314855 -2.535762 1.000000 63 | 1.187904 -1.367278 0.000000 64 | 0.517132 1.375811 0.000000 65 | 1.244285 -0.764164 0.000000 66 | -0.831841 1.728708 1.000000 67 | 1.719616 -2.491282 1.000000 68 | 0.594216 1.137571 1.000000 69 | 0.939919 -0.474988 0.000000 70 | -0.918736 -0.748474 1.000000 71 | 0.913760 -1.194336 0.000000 72 | 0.893221 -1.569459 0.000000 73 | 0.653152 0.510498 0.000000 74 | 0.766890 -1.577565 0.000000 75 | 0.868315 -1.966740 1.000000 76 | 0.874218 0.514959 1.000000 77 | -0.559543 1.749552 1.000000 78 | 1.526669 -1.797734 1.000000 79 | 1.843439 -0.363161 0.000000 80 | 1.163746 2.062245 0.000000 81 | 0.565749 -2.432301 1.000000 82 | 1.016715 2.878822 0.000000 83 | 1.433979 -1.944960 1.000000 84 | -0.510225 0.295742 1.000000 85 | -0.385261 0.278145 1.000000 86 | 1.042889 -0.564351 0.000000 87 | -0.607265 1.885851 1.000000 88 | -0.355286 -1.813131 1.000000 89 | -0.790644 -0.790761 1.000000 90 | 1.372382 0.879619 0.000000 91 | 1.133019 -0.300956 0.000000 92 | 1.395009 -1.006842 0.000000 93 | 0.887843 0.222319 1.000000 94 | 1.484690 0.095074 0.000000 95 | 1.268061 1.832532 0.000000 96 | 0.124568 0.910824 1.000000 97 | 1.061504 -0.768175 1.000000 98 | 0.298551 2.573175 0.000000 99 | 0.241114 -0.613155 0.000000 100 | 
-0.423781 -1.524901 1.000000 101 | 0.528691 -0.939526 0.000000 102 | 1.601252 1.791658 0.000000 103 | 0.793609 0.812783 1.000000 104 | 0.327097 0.326998 0.000000 105 | 1.131868 -0.985696 1.000000 106 | 1.273154 1.656441 0.000000 107 | -0.816691 0.961580 1.000000 108 | 0.669064 1.162614 0.000000 109 | -0.453759 -1.146883 1.000000 110 | 2.055105 0.025811 0.000000 111 | 0.463119 -0.813294 1.000000 112 | 0.802392 -0.140807 1.000000 113 | -0.730255 -0.145175 1.000000 114 | 0.569256 0.567628 1.000000 115 | 0.486947 1.130519 0.000000 116 | 1.793588 -1.426926 0.000000 117 | 1.178831 -0.581314 1.000000 118 | 0.480055 1.257981 0.000000 119 | 0.683732 0.190071 1.000000 120 | -0.119082 -0.004020 1.000000 121 | -1.251554 -0.176027 1.000000 122 | 1.094741 -1.099305 0.000000 123 | -0.238250 -1.277484 1.000000 124 | -0.661556 1.327722 1.000000 125 | 1.442837 1.241720 0.000000 126 | 1.202320 0.489702 0.000000 127 | 0.932890 0.296430 0.000000 128 | 0.665568 -1.314006 0.000000 129 | -0.058993 1.322294 1.000000 130 | 0.209525 -1.006357 0.000000 131 | 1.023340 0.219375 0.000000 132 | 1.324444 0.446567 1.000000 133 | 1.453910 -1.151325 0.000000 134 | 0.616303 0.974796 0.000000 135 | 1.492010 -0.885984 0.000000 136 | 1.738658 0.686807 1.000000 137 | 0.900582 -0.280724 0.000000 138 | 0.961914 -0.053991 1.000000 139 | 1.819706 -0.953273 1.000000 140 | 1.581289 -0.340552 0.000000 141 | 0.520837 -0.680639 1.000000 142 | 1.433771 -0.914798 0.000000 143 | 0.611594 -1.691685 0.000000 144 | 1.591513 -0.978986 1.000000 145 | 1.282094 0.113769 0.000000 146 | 0.985715 0.275551 0.000000 147 | -1.805143 2.628696 1.000000 148 | 1.473100 -0.241372 0.000000 149 | -0.242212 -1.040151 1.000000 150 | 1.175525 -1.662026 0.000000 151 | 0.696040 0.154387 0.000000 152 | 1.457713 1.608681 0.000000 153 | 0.883215 1.330538 0.000000 154 | -0.681209 0.622394 1.000000 155 | -0.355082 0.432941 1.000000 156 | 0.633011 -1.194431 0.000000 157 | 0.782723 1.060008 1.000000 158 | 0.670180 -0.766999 1.000000 159 | -0.047154 0.698693 1.000000 160 | 0.287385 -1.097756 0.000000 161 | 0.069561 1.632585 1.000000 162 | 1.013230 1.111551 0.000000 163 | 0.639065 -0.697237 0.000000 164 | 1.174621 2.240022 1.000000 165 | 1.322020 0.040277 1.000000 166 | 0.019127 0.105667 1.000000 167 | 0.584584 1.101914 0.000000 168 | 1.157265 -0.665947 0.000000 169 | 1.565230 -0.840790 0.000000 170 | 1.759315 0.963703 1.000000 171 | 1.687068 -1.086466 0.000000 172 | 0.578314 -0.340961 1.000000 173 | 0.118925 -1.487694 1.000000 174 | 0.471201 0.330872 1.000000 175 | -0.268209 -0.353477 0.000000 176 | 1.625390 -1.718798 0.000000 177 | 1.117791 2.752549 0.000000 178 | -0.194552 -0.752687 1.000000 179 | 0.769548 -2.066152 0.000000 180 | 0.186062 0.022072 1.000000 181 | 1.771337 -0.393550 0.000000 182 | -1.300597 0.962803 1.000000 183 | 0.708730 -1.013371 0.000000 184 | -0.624235 -0.892995 1.000000 185 | 0.377055 -1.296098 0.000000 186 | 0.804404 -0.856253 1.000000 187 | 1.359887 -0.974291 0.000000 188 | -0.115505 0.228439 1.000000 189 | 0.913645 -0.344936 1.000000 190 | 0.318875 -0.886290 1.000000 191 | 0.822157 0.102548 0.000000 192 | -0.281208 1.302572 1.000000 193 | 0.044639 -1.107980 1.000000 194 | -0.029205 -2.033973 0.000000 195 | 0.879914 -2.000582 1.000000 196 | 0.601936 -0.503923 0.000000 197 | -0.490114 -0.841122 1.000000 198 | 1.847075 2.362322 0.000000 199 | -0.279703 0.753196 1.000000 200 | 1.953357 -0.746632 0.000000 201 | -------------------------------------------------------------------------------- /neural/curve_fitting.rb: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | 5 | # training data ( y = sin(2 PI x) + N(0, 0.3) ) 6 | D = [ 7 | [0.000000, 0.349486], [0.111111, 0.830839], 8 | [0.222222, 1.007332], [0.333333, 0.971507], 9 | [0.444444, 0.133066], [0.555556, 0.166823], 10 | [0.666667, -0.848307], [0.777778, -0.445686], 11 | [0.888889, -0.563567], [1.000000, 0.261502], 12 | ] 13 | 14 | # units 15 | in_unit = [Unit.new("x1")] 16 | hiddenunits = [TanhUnit.new("z1"), TanhUnit.new("z2"), TanhUnit.new("z3"), TanhUnit.new("z4")] 17 | out_unit = [IdentityUnit.new("y1")] 18 | 19 | # network 20 | network = Network.new 21 | network.in = in_unit 22 | network.link in_unit, hiddenunits 23 | network.link hiddenunits, out_unit 24 | network.out = out_unit 25 | 26 | eta = 0.1 27 | sum_e = 999999 28 | 1000.times do |tau| 29 | error = 0 30 | D.sort{rand}.each do |data| 31 | error += network.error_function([data[0]], [data[1]]) 32 | grad = network.gradient_E([data[0]], [data[1]]) 33 | network.weights.descent eta, grad 34 | end 35 | puts "error func(#{tau}): #{error}" 36 | break if sum_e < error 37 | sum_e = error 38 | end 39 | network.weights.dump 40 | 41 | 42 | 43 | =begin 44 | x = 0.0 45 | while x < 1.0 46 | y = network.apply(x) 47 | p [x, y] 48 | x += 0.05 49 | end 50 | =end 51 | -------------------------------------------------------------------------------- /neural/iris.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | #OUTPUT_CODE = true 5 | LOGFILE = "iris.log" 6 | 7 | # training data 8 | CLZ = {"Iris-setosa"=>[1,0,0], "Iris-versicolor"=>[0,1,0], "Iris-virginica"=>[0,0,1]} 9 | dataset = [] 10 | open("iris.data") do |f| 11 | while line = f.gets 12 | buf = line.chomp.split(',') 13 | clz = buf.pop 14 | break unless clz 15 | x = buf.map{|x| x.to_f} 16 | dataset << [x, CLZ[clz]] 17 | end 18 | end 19 | 20 | def generate_network(network_type) 21 | # units 22 | in_units = (1..4).map{|i| Unit.new("x#{i}")} 23 | hiddenunits1 = (1..6).map{|i| TanhUnit.new("z#{i}")} 24 | hiddenunits2 = (1..6).map{|i| TanhUnit.new("w#{i}")} 25 | out_unit = (1..3).map{|i| SoftMaxUnit.new("y#{i}")} 26 | 27 | # network 28 | network = Network.new(:error_func=>ErrorFunction::SoftMax) 29 | network.in = in_units 30 | 31 | name = nil 32 | case network_type 33 | when 0 34 | name = "full link(6)" 35 | network.link in_units, hiddenunits1 36 | network.link hiddenunits1, out_unit 37 | when 1 38 | name = "full link(12)" 39 | network.link in_units, hiddenunits1+hiddenunits2 40 | network.link hiddenunits1+hiddenunits2, out_unit 41 | when 2 42 | name = "full link(6+6)" 43 | network.link in_units, hiddenunits1 44 | network.link hiddenunits1, hiddenunits2 45 | network.link hiddenunits2, out_unit 46 | when 3 47 | name = "each 2-input-units" 48 | network.link [in_units[0], in_units[1]], [hiddenunits1[0]] 49 | network.link [in_units[0], in_units[2]], [hiddenunits1[1]] 50 | network.link [in_units[0], in_units[3]], [hiddenunits1[2]] 51 | network.link [in_units[1], in_units[2]], [hiddenunits1[3]] 52 | network.link [in_units[1], in_units[3]], [hiddenunits1[4]] 53 | network.link [in_units[2], in_units[3]], [hiddenunits1[5]] 54 | network.link hiddenunits1[0, 6], out_unit 55 | end 56 | 57 | network.out = out_unit 58 | [name, network] 59 | end 60 | 61 | N_TRIALS = 100 62 | 63 | open(LOGFILE, "a") {|f| f.puts "==== start (#{Time.now})" } 64 | 65 | 4.times do |network_type| 66 | name, network = 
generate_network(network_type) 67 | max_correct = 0 68 | sum_correct = 0 69 | 70 | open(LOGFILE, "a") {|f| f.puts "-- #{name}" } 71 | t0 = Time.now.to_i 72 | N_TRIALS.times do |trial| 73 | network.weights.init_parameters 0, 3 74 | 200.times do |tau| 75 | eta = if tau<10 then 0.1 elsif tau<50 then 0.05 elsif tau<100 then 0.01 else 0.005 end 76 | dataset.sort{rand}.each do |data| 77 | grad = network.gradient_E(data[0], data[1]) 78 | network.weights.descent eta, grad 79 | end 80 | end 81 | 82 | correct = 0 83 | dataset.each do |data| 84 | y = network.apply(*data[0]) 85 | predict = (0..2).max_by{|i| y[i]} 86 | #puts "y = #{y.map{|x| (x*10000).to_i/10000.0}.inspect}, answer = #{data[1].inspect}" 87 | correct += 1 if data[1][predict]==1 88 | end 89 | sum_correct += correct 90 | 91 | #log 92 | log = "#{trial+1}: correct = #{correct}, mistake = #{dataset.length - correct}, rate = #{(10000.0*correct/dataset.length).to_i/100.0}" 93 | puts log 94 | open(LOGFILE, "a") do |f| 95 | f.puts log 96 | if max_correct < correct 97 | max_correct = correct 98 | f.puts network.weights.dump 99 | end 100 | end 101 | end 102 | open(LOGFILE, "a") do |f| 103 | f.puts "max of rate = #{(10000*max_correct/dataset.length).to_i/100.0}, average of rate = #{(10000*sum_correct/(dataset.length*N_TRIALS)).to_i/100.0} (#{Time.now.to_i - t0}sec)" 104 | end 105 | end 106 | open(LOGFILE, "a") {|f| f.puts "==== end (#{Time.now})" } 107 | 108 | 109 | -------------------------------------------------------------------------------- /neural/mnist.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'zlib' 4 | require "neural.rb" 5 | 6 | n_rows = n_cols = nil 7 | images = [] 8 | labels = [] 9 | Zlib::GzipReader.open('train-images-idx3-ubyte.gz') do |f| 10 | magic, n_images = f.read(8).unpack('N2') 11 | raise 'This is not MNIST image file' if magic != 2051 12 | n_rows, n_cols = f.read(8).unpack('N2') 13 | n_images.times do 14 | images << f.read(n_rows * n_cols) 15 | end 16 | end 17 | 18 | Zlib::GzipReader.open('train-labels-idx1-ubyte.gz') do |f| 19 | magic, n_labels = f.read(8).unpack('N2') 20 | raise 'This is not MNIST label file' if magic != 2049 21 | labels = f.read(n_labels).unpack('C*') 22 | end 23 | 24 | # output pgm 25 | def output_pgm(filename, images, n_rows, n_cols, n_width, n_height) 26 | open(filename, "wb") do |f| 27 | f.printf("P5 %d %d %d ", n_rows*n_width, n_cols*n_height, 0xff) 28 | offset = 0 29 | buf = "" 30 | n_height.times do 31 | n_cols.times do |y| 32 | n_width.times do |idx| 33 | st = images[offset + idx][y * n_rows, n_rows].unpack('C*').map{|p| 0xff - p }.pack("C*") 34 | #p images[offset + idx][y * n_rows, n_rows].unpack('C*') if st.length != 28 35 | buf << st 36 | end 37 | end 38 | offset += n_width 39 | end 40 | f.puts buf 41 | end 42 | end 43 | #output_pgm "mnist.pgm", images, n_rows, n_cols, 300, 200 44 | 45 | # units 46 | in_units = (1..(28*28)).map{|i| Unit.new("x#{i}")} 47 | hiddenunits = (1..100).map{|i| TanhUnit.new("z#{i}")} 48 | out_unit = (1..10).map{|i| SoftMaxUnit.new("y#{i}")} 49 | 50 | # network 51 | network = Network.new(:error_func=>ErrorFunction::SoftMax, :code_generate=>true) 52 | network.in = in_units 53 | network.link in_units, hiddenunits 54 | network.link hiddenunits, out_unit 55 | network.out = out_unit 56 | 57 | # training 58 | t1 = Time.now.to_f 59 | N_IMAGES = 1000 60 | 10.times do |n| 61 | eta = if n<2 then 0.1 elsif n<5 then 0.05 else 0.01 end 62 | (0..(N_IMAGES-1)).sort_by{rand}.each do |idx| 63 | image = 
images[idx].unpack('C*') 64 | target = [0]*10 65 | target[labels[idx]] = 1 66 | 67 | puts "(#{n+1}, #{idx}): correct: #{labels[idx]}" 68 | #puts "#{network.apply(*image).map{|x| (x*10000).floor/10000.0}.inspect}, e=#{(network.error_function(image, target)*1000)/1000.0}" 69 | 70 | grad = Gradient::BackPropagate.call(network, image, target) 71 | network.weights.descent eta, grad 72 | 73 | #puts "#{network.apply(*image).map{|x| (x*10000).floor/10000.0}.inspect}, e=#{(network.error_function(image, target)*1000)/1000.0}" 74 | end 75 | end 76 | t2 = Time.now.to_f 77 | 78 | # test 79 | puts "------------------------------" 80 | correct = mistake = 0 81 | (0..(N_IMAGES*2-1)).each do |idx| 82 | image = images[idx].unpack('C*') 83 | target = [0]*10 84 | target[labels[idx]] = 1 85 | 86 | y = network.apply(*image) 87 | predict = (0..9).max_by{|i| y[i]} 88 | puts "#{idx}: predict = #{predict}, expect = #{labels[idx]}" 89 | puts "#{y.map{|x| (x*10000).floor/10000.0}.inspect}, e=#{(network.error_function(image, target)*1000)/1000.0}" 90 | if labels[idx] == predict 91 | correct += 1 92 | else 93 | mistake += 1 94 | end 95 | end 96 | t3 = Time.now.to_f 97 | 98 | # 99 | puts "correct = #{correct}, mistake = #{mistake}, rate = #{(correct.to_f/(correct+mistake)*10000).floor/100.0}%" 100 | 101 | puts "learning: #{t2-t1}" 102 | puts "testing: #{t3-t2}" 103 | 104 | 105 | -------------------------------------------------------------------------------- /neural/xor.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require "neural.rb" 4 | OUTPUT_CODE = false 5 | 6 | # training data 7 | D = [ 8 | [[0, 0], [0]], 9 | [[1, 1], [0]], 10 | [[0, 1], [1]], 11 | [[1, 0], [1]], 12 | ] 13 | 14 | # units 15 | in_units = [Unit.new("x1"), Unit.new("x2")] 16 | hiddenunits = [TanhUnit.new("z1"), TanhUnit.new("z2"), TanhUnit.new("z3"), TanhUnit.new("z4")] 17 | out_unit = [SigUnit.new("y1")] 18 | 19 | # network 20 | network = Network.new(:error_func=>ErrorFunction::CrossEntropy) 21 | network.in = in_units 22 | network.link in_units, hiddenunits 23 | network.link hiddenunits, out_unit 24 | network.out = out_unit 25 | 26 | t1 = Time.now.to_f 27 | eta = 0.1 28 | sum_e = 999999 29 | 10000.times do |tau| 30 | s = 0 31 | D.each do |data| 32 | s += network.error_function(data[0], data[1]) 33 | end 34 | #puts "sum of errors: #{tau} => #{s}" 35 | break if s > sum_e 36 | sum_e = s 37 | 38 | D.sort{rand}.each do |data| 39 | grad = network.gradient_E(data[0], data[1]) 40 | network.weights.descent eta, grad 41 | end 42 | end 43 | #network.weights.dump 44 | 45 | t2 = Time.now.to_f 46 | puts "#{RUBY_VERSION}(#{RUBY_RELEASE_DATE})[#{RUBY_PLATFORM}] #{((t2-t1)*1000).to_i/1000.0} sec" 47 | 48 | #puts "(0, 0) => #{network.apply(0, 0)}, (1, 1) => #{network.apply(1, 1)}" 49 | #puts "(0, 1) => #{network.apply(0, 1)}, (1, 0) => #{network.apply(1, 0)}" 50 | 51 | -------------------------------------------------------------------------------- /ngram/knlm.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # n-Gram Language Model with Knerser-Ney Smoother 5 | # This code is available under the MIT License. 6 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
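# Smoothing sketch (interpolated Kneser-Ney, as implemented in probKN() below;
# D is the discount in [0, 1], h' is the history h with its oldest character
# dropped):
#   P(w | h) = ( max(c(h,w) - D, 0) + D * N1+(h) * P(w | h') ) / c(h)
# where N1+(h) is the number of characters following h whose count exceeds D.
# The recursion bottoms out at raw unigram relative frequencies rather than the
# textbook continuation counts, and Generator.inc() keeps one trie pointer per
# suffix of the recent history so each character updates all orders up to N.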
7 | 8 | import sys, codecs, re, numpy 9 | 10 | class NGram(dict): 11 | def __init__(self, N, depth=1): 12 | self.freq = 0 13 | self.N = N 14 | self.depth = depth 15 | def inc(self, v): 16 | if self.depth <= self.N: 17 | if v not in self: 18 | self[v] = NGram(self.N, self.depth + 1) 19 | self[v].freq += 1 20 | return self[v] 21 | def dump(self): 22 | if self.depth <= self.N: 23 | return "%d:{%s}" % (self.freq, ",".join("'%s':%s" % (k,d.dump()) for k,d in self.iteritems())) 24 | return "%d" % self.freq 25 | 26 | def probKN(self, D, given=""): 27 | assert D >= 0.0 and D <= 1.0 28 | if given == "": 29 | voca = self.keys() 30 | n = float(self.freq) 31 | return voca, [self[v].freq / n for v in voca] 32 | else: 33 | if len(given) >= self.N: 34 | given = given[-(self.N-1):] 35 | voca, low_prob = self.probKN(D, given[1:]) 36 | cur_ngram = self 37 | for v in given: 38 | if v not in cur_ngram: return voca, low_prob 39 | cur_ngram = cur_ngram[v] 40 | g = 0.0 # for normalization 41 | freq = [] 42 | for v in voca: 43 | c = cur_ngram[v].freq if v in cur_ngram else 0 44 | if c > D: 45 | g += D 46 | c -= D 47 | freq.append(c) 48 | n = float(cur_ngram.freq) 49 | return voca, [(c + g * lp) / n for c, lp in zip(freq, low_prob)] 50 | 51 | class Generator(object): 52 | def __init__(self, ngram): 53 | self.ngram = ngram 54 | self.start() 55 | def start(self): 56 | self.pointers = [] 57 | def inc(self, v): 58 | pointers = self.pointers + [self.ngram] 59 | self.pointers = [d.inc(v) for d in pointers if d != None] 60 | self.ngram.freq += 1 61 | 62 | def main(): 63 | import optparse 64 | 65 | parser = optparse.OptionParser() 66 | parser.add_option("-n", dest="ngram", type="int", help="n-gram", default=7) 67 | parser.add_option("-d", dest="discount", type="float", help="discount parameter of Knerser-Ney", default=0.5) 68 | parser.add_option("-i", dest="numgen", type="int", help="number of texts to generate", default=100) 69 | parser.add_option("-e", dest="encode", help="character code of input file(s)", default='utf-8') 70 | parser.add_option("-o", dest="output", help="output filename", default="generated.txt") 71 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 72 | (opt, args) = parser.parse_args() 73 | 74 | numpy.random.seed(opt.seed) 75 | 76 | START = u"\u0001" 77 | END = u"\u0002" 78 | 79 | ngram = NGram(opt.ngram) 80 | gen = Generator(ngram) 81 | for filename in args: 82 | with codecs.open(filename, "rb", opt.encode) as f: 83 | for s in f: 84 | s = s.strip() 85 | if len(s) == 0: continue 86 | s = START + s + END 87 | gen.start() 88 | for c in s: 89 | gen.inc(c) 90 | 91 | D = opt.discount 92 | with codecs.open(opt.output, "wb", "utf-8") as f: 93 | for n in xrange(opt.numgen): 94 | st = START 95 | for i in xrange(1000): 96 | voca, prob = ngram.probKN(D, st) 97 | i = numpy.random.multinomial(1, prob).argmax() 98 | v = voca[i] 99 | if v == END: break 100 | st += v 101 | f.write(st[1:]) 102 | f.write("\n") 103 | 104 | if __name__ == "__main__": 105 | main() 106 | 107 | -------------------------------------------------------------------------------- /ngram/ngram.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env ruby 2 | 3 | testtext = open(ARGV[0]) {|f| f.read } 4 | traintext = open(ARGV[1]) {|f| f.read } 5 | 6 | count = 0 7 | freq = Hash.new(0) 8 | freq2 = Hash.new(0) 9 | pre = nil 10 | traintext.scan(/\w+/) do |word| 11 | word.downcase! 
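  # accumulate unigram counts in freq and bigram counts (keyed "prev cur") in
  # freq2; these raw counts feed the MLE / Laplace / ELE models defined below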
12 | freq[word] += 1 13 | freq2[pre+" "+word] += 1 if pre 14 | pre = word 15 | count += 1 16 | end 17 | 18 | class MLE 19 | def initialize(freq, freq2, count) 20 | @prob = Hash.new(1) 21 | @prob2 = Hash.new(1) 22 | count = count.to_f 23 | freq.each do |word, f| 24 | @prob[word] = f / count 25 | end 26 | freq2.each do |word, f| 27 | @prob2[word] = f / (count-1) 28 | end 29 | end 30 | def set_target(text);end 31 | def key?(word);@prob.key?(word);end 32 | def [](word);@prob[word];end 33 | def bigram(word, word2);@prob2[word+" "+word2]/@prob[word];end 34 | end 35 | class Laplace 36 | def initialize(freq, freq2, count, lambda=1.0) 37 | @freq = freq.clone 38 | @freq2 = freq2.clone 39 | @count = count 40 | @lambda = lambda 41 | end 42 | def set_target(text) 43 | @voca = @freq.clone 44 | text.scan(/\w+/) do |word| 45 | @voca[word] = 0 unless @voca.key?(word) 46 | end 47 | @V = @voca.size 48 | end 49 | def key?(word);@voca.key?(word);end 50 | def [](word);(@voca[word]+@lambda)/(@count+@V*@lambda);end 51 | def bigram(word, word2) 52 | (@freq2[word+" "+word2]+@lambda)/(@count-1+@V*@V*@lambda)/self[word] 53 | end 54 | end 55 | 56 | model_MLE = MLE.new(freq, freq2, count) 57 | model_Laplace = Laplace.new(freq, freq2, count) 58 | model_Laplace.set_target(testtext) 59 | model_ELE = Laplace.new(freq, freq2, count, 0.5) 60 | model_ELE.set_target(testtext) 61 | 62 | def loglikelihood(text, model) 63 | logL = 0.0 # unigram 64 | logL2 = 0.0 # bigram 65 | count = 0 66 | pre = nil 67 | text.scan(/\w+/) do |word| 68 | word.downcase! 69 | if model.key?(word) 70 | logL += Math.log(model[word]) 71 | count += 1 72 | if pre 73 | logL2 += Math.log(model.bigram(pre, word)) 74 | else 75 | logL2 += Math.log(model[word]) 76 | end 77 | pre = word 78 | end 79 | end 80 | [logL, logL2, count] 81 | end 82 | 83 | def entropy(text, model) 84 | ent = 0.0 85 | text.scan(/\w+/) do |word| 86 | w = word.downcase 87 | ent -= model[w] * Math.log(model[w]) 88 | end 89 | ent 90 | end 91 | 92 | 93 | puts "MLE:" 94 | logL, logL2, count = loglikelihood(traintext, model_MLE) 95 | crossent = -logL / count / Math.log(2) 96 | crossent2 = -logL2 / count / Math.log(2) 97 | puts "train text(#{count} words)" 98 | puts "logL of unigrams = #{logL}, cross entropy = #{crossent}" 99 | puts "logL of bigrams = #{logL2}, cross entropy = #{crossent2}" 100 | #logL, count = loglikelihood(testtext, model_MLE) 101 | #crossent = -logL / count / Math.log(2) 102 | #puts "logL of test text(#{count} words, ignore unseen words) = #{logL}, cross entropy = #{crossent}" 103 | 104 | puts "Laplace:" 105 | logL, logL2, count = loglikelihood(traintext, model_Laplace) 106 | crossent = -logL / count / Math.log(2) 107 | crossent2 = -logL2 / count / Math.log(2) 108 | puts "logL of train text(#{count} words) = #{logL}, cross entropy = #{crossent}" 109 | puts "logL2 of train text(#{count} words) = #{logL2}, cross entropy 2 = #{crossent2}" 110 | logL, logL2, count = loglikelihood(testtext, model_Laplace) 111 | crossent = -logL / count / Math.log(2) 112 | crossent2 = -logL2 / count / Math.log(2) 113 | puts "logL of test text(#{count} words) = #{logL}, cross entropy = #{crossent}" 114 | puts "logL2 of test text(#{count} words) = #{logL2}, cross entropy 2 = #{crossent2}" 115 | 116 | puts "ELE:" 117 | logL, logL2, count = loglikelihood(traintext, model_ELE) 118 | crossent = -logL / count / Math.log(2) 119 | crossent2 = -logL2 / count / Math.log(2) 120 | puts "logL of train text(#{count} words) = #{logL}, cross entropy = #{crossent}" 121 | puts "logL2 of train text(#{count} words) = 
#{logL2}, cross entropy 2 = #{crossent2}" 122 | logL, logL2, count = loglikelihood(testtext, model_ELE) 123 | crossent = -logL / count / Math.log(2) 124 | crossent2 = -logL2 / count / Math.log(2) 125 | puts "logL of test text(#{count} words) = #{logL}, cross entropy = #{crossent}" 126 | puts "logL2 of test text(#{count} words) = #{logL2}, cross entropy 2 = #{crossent2}" 127 | 128 | 129 | #puts "logL of train text = #{loglikelihood(traintext, model_MLE)}, entropy = #{entropy(traintext, model_MLE)}" 130 | #puts "logL of test text(ignore unseen words) = #{loglikelihood(testtext, model_MLE)}, entropy = #{entropy(testtext, model_MLE)}" 131 | 132 | 133 | -------------------------------------------------------------------------------- /ngram/wordcount.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | 3 | 4 | import re, sys 5 | 6 | class NaiveCounting: 7 | def __init__(self): 8 | self.map = dict() 9 | def add(self, word): 10 | if word in self.map: 11 | self.map[word] += 1 12 | else: 13 | self.map[word] = 1 14 | 15 | class SpaceSaving: 16 | def __init__(self, k): 17 | self.k = k 18 | self.map = dict() 19 | def add(self, word): 20 | if word in self.map: 21 | self.map[word] += 1 22 | elif len(self.map) < self.k: 23 | self.map[word] = 1 24 | else: 25 | j = min(self.map, key=lambda x:self.map[x]) 26 | cj = self.map.pop(j) 27 | self.map[word] = cj + 1 28 | 29 | 30 | text = "" 31 | for filename in sys.argv: 32 | with open(filename, "rb") as f: 33 | text += f.read() 34 | 35 | c1 = NaiveCounting() 36 | c2 = SpaceSaving(1000) 37 | c3 = SpaceSaving(100) 38 | 39 | n = 0 40 | for m in re.finditer(r'[A-Za-z]+', text): 41 | word = m.group(0).lower() 42 | c1.add(word) 43 | c2.add(word) 44 | c3.add(word) 45 | n += 1 46 | 47 | print "total words = %d" % n 48 | 49 | words = c1.map.items() 50 | words.sort(key=lambda x:(-x[1], x[0])) 51 | m2 = c2.map 52 | m3 = c3.map 53 | for i, x in enumerate(words): 54 | print "%d\t%s\t%d\t%d\t%d" % (i+1, x[0], x[1], m2.get(x[0],0), m3.get(x[0],0)) 55 | 56 | 57 | -------------------------------------------------------------------------------- /pca/bayes.r: -------------------------------------------------------------------------------- 1 | # Bayesian PPCA for R 2 | 3 | M <- 2; 4 | I <- 50; 5 | splits <- 1; 6 | 7 | directory <- "."; 8 | 9 | argv <- commandArgs(T); 10 | if (length(argv)>0) directory <- commandArgs(T)[1]; 11 | if (length(argv)>1) M <- as.integer(commandArgs(T)[2]); 12 | if (length(argv)>2) I <- as.integer(commandArgs(T)[3]); 13 | if (length(argv)>3) splits <- as.integer(commandArgs(T)[4]); 14 | 15 | oilflow <- as.matrix(read.table(sprintf("%s/DataTrn.txt", directory))); 16 | oilflow.labels <- read.table(sprintf("%s/DataTrnLbls.txt", directory)); 17 | 18 | # density function of multivariate Gaussian 19 | dmnorm <- function(x, mu, sig) { 20 | D <- length(mu); 21 | 1/((2 * pi)^D * sqrt(det(sig))) * exp(- t(x-mu) %*% solve(sig) %*% (x-mu) / 2)[1]; 22 | } 23 | 24 | ppca_bayes <- function() { 25 | D <- ncol(oilflow); 26 | N <- nrow(oilflow); 27 | col <- colSums(t(oilflow.labels) * c(4,3,2)); 28 | pch <- colSums(t(oilflow.labels) * c(3,1,4)); 29 | 30 | # initialize parameters 31 | W <- matrix(rnorm(M*D), D); 32 | sigma2 <- rgamma(1,1); 33 | alpha <- c(1, rep(10000, M-1)); 34 | 35 | # mu = mean x_bar 36 | mu <- colMeans(oilflow); 37 | xn_minus_x_bar <- t(oilflow) - mu; # DxN-matrix 38 | S <- var(oilflow); 39 | 40 | # iteration 41 | for(i in 0:I) { 42 | # M = W^T W + sigma^2 I (PRML 12.41) 43 | M_inv <- solve(t(W) 
%*% W + sigma2 * diag(M)); 44 | 45 | ### E-step: 46 | 47 | # E[z_n] = M^-1 W^T (x_n - x^bar) (PRML 12.54) 48 | Ez <- t(M_inv %*% t(W) %*% xn_minus_x_bar); 49 | 50 | # E[z_n z_n^T] = sigma^2 M^-1 + E[z_n]E[z_n]^T (PRML 12.55) 51 | Ezz <- list(); 52 | sum_Ezz <- matrix(numeric(M*M), M); 53 | for(n in 1:N) { 54 | ezz <- sigma2 * M_inv + Ez[n,] %*% t(Ez[n,]); 55 | Ezz[[n]] <- ezz; 56 | sum_Ezz <- sum_Ezz + ezz; 57 | } 58 | 59 | ### M-step: 60 | 61 | # W_new = {sum (x_n - x^bar)E[z_n]^T}{sum E[z_n z_n^T] + sigma^2 A}^-1 (PRML 12.63) 62 | W <- xn_minus_x_bar %*% Ez %*% solve(sum_Ezz + diag(sigma2 * alpha)); 63 | 64 | # sigma_new^2 = 1/ND sum{ |x_n-x^bar|^2 - 2E[z_n]^T W^T (x_n-x^bar) + Tr(E[z_n z_n^T] W^T W) } (PRML 12.57) 65 | sigma2 <- sum(xn_minus_x_bar^2) - 2 * sum(diag(t(W) %*% xn_minus_x_bar %*% Ez)); 66 | for(n in 1:N) { 67 | sigma2 <- sigma2 + sum(diag(Ezz[[n]] %*% t(W) %*% W)); 68 | } 69 | sigma2 <- sigma2 / N / D; 70 | 71 | # alpha_i = D / w_i^T w_i (PRML 12.62) 72 | if (i>0) alpha <- D / diag(t(W) %*% W); 73 | 74 | cat(sprintf("M=%d, I=%d, alpha=(%s)\n", M, i, paste(sprintf(" %.2f",alpha),collapse=","))); 75 | if (sum(alpha>1e6)){ 76 | W <- W[,alpha<1e6]; 77 | alpha <- alpha[alpha<1e6]; 78 | M <- length(alpha); 79 | } 80 | } 81 | 82 | # draw chart 83 | draw_chart <- function(targets) { 84 | plot(Ez[,targets], col=col, pch=pch, xlim=c(-3,3), ylim=c(-3,3), 85 | #main=sprintf("M=%d, I=%d, alpha=(%s)\n", M, i, paste(sprintf(" %.2f",alpha), collapse=",")), 86 | xlab=sprintf("alpha=%.2f", alpha[targets[1]]), 87 | ylab=sprintf("alpha=%.2f", alpha[targets[2]]) 88 | ); 89 | }; 90 | png(width=640, height=640); 91 | par(mfrow=c(splits, splits), mar=c(4, 4, 2, 2)); 92 | #for(i in 1:(M-1)) for(j in (i+1):M) draw_chart(c(i, j)); 93 | #for(angle in 10:80) scatterplot3d(Ez[,1], Ez[,2], Ez[,3], color=col, pch=pch, xlim=c(-3,3), ylim=c(-3,3), zlim=c(-3,3), angle=angle*2); 94 | }; 95 | 96 | ppca_bayes() 97 | 98 | #library(scatterplot3d); 99 | #library(animation); 100 | #saveMovie(ppca_bayes(), interval=0.05, moviename="ppca_bayes", movietype="gif", outdir=getwd(),width=480, height=480); 101 | 102 | -------------------------------------------------------------------------------- /pca/ema.r: -------------------------------------------------------------------------------- 1 | # Probability Principal Component Analysis with EM Algorithm for R 2 | 3 | M <- 2; 4 | I <- 50; 5 | directory <- "."; 6 | 7 | argv <- commandArgs(T); 8 | if (length(argv)>0) directory <- commandArgs(T)[1]; 9 | if (length(argv)>1) M <- as.integer(commandArgs(T)[2]); 10 | 11 | oilflow <- as.matrix(read.table(sprintf("%s/DataTrn.txt", directory))); 12 | oilflow.labels <- read.table(sprintf("%s/DataTrnLbls.txt", directory)); 13 | likelihood.pre <- -999999; 14 | 15 | ppca_em <- function(oilflow, oilflow.labels, M, I) { 16 | D <- ncol(oilflow); 17 | N <- nrow(oilflow); 18 | col <- colSums(t(oilflow.labels) * c(4,3,2)); 19 | pch <- colSums(t(oilflow.labels) * c(3,1,4)); 20 | 21 | # initialize parameters 22 | W <- matrix(rnorm(M*D), D); 23 | sigma2 <- rgamma(1,1,1); 24 | 25 | # mu = mean x_bar 26 | mu <- colMeans(oilflow); 27 | xn_minus_x_bar <- t(oilflow) - mu; # DxN-matrix 28 | S <- var(oilflow); 29 | 30 | # iteration 31 | for(i in 0:I) { 32 | # M = W^T W + sigma^2 I (PRML 12.41) 33 | M_inv <- solve(t(W) %*% W + sigma2 * diag(M)); 34 | 35 | ### E-step: 36 | 37 | # E[z_n] = M^-1 W^T (x_n - x^bar) (PRML 12.54) 38 | Ez <- t(M_inv %*% t(W) %*% xn_minus_x_bar); 39 | 40 | # E[z_n z_n^T] = sigma^2 M^-1 + E[z_n]E[z_n]^T (PRML 12.55) 41 | Ezz <- 
list(); 42 | sum_Ezz <- matrix(numeric(M*M), M); 43 | for(n in 1:N) { 44 | ezz <- sigma2 * M_inv + Ez[n,] %*% t(Ez[n,]); 45 | Ezz[[n]] <- ezz; 46 | sum_Ezz <- sum_Ezz + ezz; 47 | } 48 | 49 | # likelihood 50 | C <- W %*% t(W) + diag(D) * sigma2; # (PRML 12.36) 51 | C_inv <- (diag(D) - W %*% M_inv %*% t(W)) / sigma2; # (PRML 12.40) 52 | likelihood <- - N / 2 * ( D * log(2 * pi) + log(det(C)) + sum(diag(C_inv %*% S)) ); # (PRML 12.44) 53 | plot(Ez, col=col, pch=pch, xlim=c(-3,3),ylim=c(-3,3),ylab="", 54 | xlab=sprintf("I=%d, log likelihood=%.3f", i, likelihood)) 55 | if (i>5 && (likelihood - likelihood.pre) < 0.001) break; 56 | likelihood.pre <- likelihood; 57 | 58 | ### M-step: 59 | 60 | # W_new = {sum (x_n - x^bar)E[z_n]^T}{sum E[z_n z_n^T]}^-1 (PRML 12.56) 61 | W <- xn_minus_x_bar %*% Ez %*% solve(sum_Ezz); 62 | 63 | # sigma_new^2 = 1/ND sum{ |x_n-x^bar|^2 - 2E[z_n]^T W^T (x_n-x^bar) + Tr(E[z_n z_n^T] W^T W) } (PRML 12.57) 64 | sigma2 <- sum(xn_minus_x_bar^2) - 2 * sum(diag(t(W) %*% xn_minus_x_bar %*% Ez)); 65 | for(n in 1:N) { 66 | sigma2 <- sigma2 + sum(diag(Ezz[[n]] %*% t(W) %*% W)); 67 | } 68 | sigma2 <- sigma2 / N / D; 69 | 70 | } 71 | print(likelihood); 72 | }; 73 | 74 | library(animation); 75 | saveMovie(ppca_em(oilflow, oilflow.labels, M, I), interval=1, moviename="ppca_em", 76 | movietype="gif", outdir=getwd(),width=480, height=480); 77 | 78 | -------------------------------------------------------------------------------- /pca/ppca.r: -------------------------------------------------------------------------------- 1 | # Probability Principal Component Analysis for R 2 | 3 | M <- 2; 4 | directory <- "."; 5 | 6 | argv <- commandArgs(T); 7 | if (length(argv)>0) directory <- commandArgs(T)[1]; 8 | if (length(argv)>1) M <- as.integer(commandArgs(T)[2]); 9 | 10 | oilflow <- as.matrix(read.table(sprintf("%s/DataTrn.txt", directory))); 11 | oilflow.labels <- read.table(sprintf("%s/DataTrnLbls.txt", directory)); 12 | D <- ncol(oilflow); 13 | 14 | # mu = mean x_bar 15 | mu <- colMeans(oilflow); 16 | 17 | # eigenvalues and eigenvectors of covariance S 18 | e <- eigen(var(oilflow)) 19 | 20 | # sigma^2 = sum(rest of eigenvalues) / (D - M) 21 | sigma2 <- mean(e$values[-(1:M)]); 22 | 23 | # W_ML = U_M(L_M - sigma^2 I)R, (now R = I) 24 | W_ML <- e$vectors[,1:M] %*% diag(e$values[1:M] - sigma2) %*% diag(c(1,-1)) 25 | 26 | # M = W^T W + sigma^2 I 27 | M_inv <- solve(t(W_ML) %*% W_ML + sigma2 * diag(M)); 28 | 29 | # projection into principal subspace 30 | z <- t(M_inv %*% t(W_ML) %*% (t(oilflow) - mu)) 31 | 32 | # draw chart 33 | col <- colSums(t(oilflow.labels) * c(4,3,2)); # ラベルごとに色を指定 34 | pch <- colSums(t(oilflow.labels) * c(3,1,4)); # ラベルごとにマーカーを指定 35 | plot(z, col=col, pch=pch, xlim=c(-2,4),ylim=c(-4,2)) 36 | 37 | -------------------------------------------------------------------------------- /perceptron/avg_percep_test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | traindata = [ # AND 4 | [[0, 0], -1], 5 | [[1, 0], -1], 6 | [[0, 1], -1], 7 | [[1, 1], +1], 8 | ] 9 | 10 | degree = traindata[0][0].size + 1 11 | w = Array.new(degree, 0) 12 | 13 | 20.times do |c| 14 | # shuffle 15 | traindata = traindata.sort_by{rand} 16 | 17 | # training 18 | n_errors = 0 19 | w_a = Array.new(degree, 0) 20 | n = 0 21 | traindata.each do |x, t| 22 | px = [1] + x # phai(x) 23 | s = 0 # sigma w^T phai(x_n) 24 | px.each_with_index do |x_i, i| 25 | s += w[i] * x_i 26 | end 27 | if s * t <= 0 # error 28 | #if (t>0)?(s<0):(s>=0) # 0 is also positive. 
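      # on a mistake, w receives the usual perceptron update t * phi(x), while
      # w_a accumulates the same update weighted by the step counter n; after
      # the pass, subtracting w_a / n from w leaves (approximately) the average
      # of the intermediate weight vectors, i.e. the averaged perceptron trick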
29 | puts [c+1, w, px, s, t].inspect 30 | n_errors += 1 31 | px.each_with_index do |x_i, i| 32 | w[i] += t * x_i 33 | w_a[i] += t * x_i * n 34 | end 35 | end 36 | n += 1 37 | end 38 | w_a.each_with_index do |w_i, i| 39 | w[i] -= w_i.to_f / n 40 | end 41 | 42 | if n_errors == 0 43 | puts "convergence: #{c}" 44 | break 45 | end 46 | end 47 | 48 | puts "w= #{w.inspect}" 49 | 50 | -------------------------------------------------------------------------------- /perceptron/percep_test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | traindata = [ # AND 4 | [[0, 0], -1], 5 | [[1, 0], -1], 6 | [[0, 1], -1], 7 | [[1, 1], +1], 8 | ] 9 | 10 | degree = traindata[0][0].size + 1 11 | w = Array.new(degree, 0) 12 | 13 | 20.times do |c| 14 | # shuffle 15 | traindata = traindata.sort_by{rand} 16 | 17 | # training 18 | n_errors = 0 19 | traindata.each do |x, t| 20 | px = [1] + x # phai(x) 21 | s = 0 # sigma w^T phai(x_n) 22 | px.each_with_index do |x_i, i| 23 | s += w[i] * x_i 24 | end 25 | if s * t <= 0 # error 26 | #if (t>0)?(s<0):(s>=0) # 0 is also positive. 27 | puts [c+1, w, px, s, t].inspect 28 | n_errors += 1 29 | px.each_with_index do |x_i, i| 30 | w[i] += t * x_i 31 | end 32 | end 33 | end 34 | 35 | if n_errors == 0 36 | puts "convergence: #{c}" 37 | break 38 | end 39 | end 40 | 41 | puts "w= #{w.inspect}" 42 | 43 | 44 | -------------------------------------------------------------------------------- /perceptron/test.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | if ARGV.length < 2 3 | puts "#$0 testfile modelfile" 4 | exit 1 5 | end 6 | 7 | w = open(ARGV[1]){|f| Marshal.load(f) } 8 | 9 | # load test data 10 | data = [] 11 | open(ARGV[0]) do |f| 12 | while line = f.gets 13 | features = line.split 14 | sign = features.shift.to_i 15 | map = Hash.new 16 | features.each do |feature| 17 | if feature =~ /^([0-9]+):([\+\-]?[0-9\.]+)$/ 18 | map[$1.to_i] = $2.to_f 19 | end 20 | end 21 | data << [map, sign] 22 | end 23 | end 24 | 25 | result = Array.new(4, 0) 26 | data.each do |x, t| 27 | x[w.size-1] = 1 # bias 28 | s = 0 29 | x.each do |i, x_i| 30 | s += w[i] * x_i if i < w.size 31 | end 32 | result[(t>0?2:0)+(s>0?1:0)] += 1 33 | end 34 | 35 | puts "Accuracy #{((result[3]+result[0]).to_f/data.size*100000).round/1000.0}% (#{result[3]+result[0]}/#{data.size})" 36 | puts "(Answer, Predict): (p,p):#{result[3]} (p,n):#{result[2]} (n,p):#{result[1]} (n,n):#{result[0]}" 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /perceptron/train.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby 2 | 3 | require 'optparse' 4 | opt = {:algo=>:P, :iteration=>10, :regularity=>1.0} 5 | parser = OptionParser.new 6 | parser.banner = "Usage: #$0 [options] trainfile modelfile" 7 | parser.on('-i [VAL]', Integer, 'number of iteration') {|v| opt[:iteration] = v } 8 | parser.on('-a [VAL]', [:P, :AP, :PA, :PA1, :PA2], 'algorism(P/AP/PA/PA1/PA2)') {|v| opt[:algo] = v } 9 | parser.on('-C [VAL]', Float, 'regularization parameter (for PA1/PA2)') {|v| opt[:regularity] = v } 10 | parser.parse!(ARGV) 11 | if ARGV.length < 2 12 | $stderr.puts parser 13 | exit(1) 14 | end 15 | 16 | 17 | # common 18 | 19 | def square_abs(x) 20 | square_abs_x = 0 21 | x.each do |i, x_i| 22 | square_abs_x += x_i * x_i 23 | end 24 | square_abs_x 25 | end 26 | 27 | class Train 28 | def initialize(degree) 29 | @degree = degree 30 | @w = 
Array.new(degree + 1, 0) 31 | end 32 | attr_accessor :w 33 | 34 | def loop(traindata, iteration=1, will_shuffle=true) 35 | pre_w = @w.dup 36 | iteration.times do |c| 37 | 38 | traindata = traindata.sort_by{rand} if will_shuffle 39 | traindata.each do |x, t| 40 | x[@degree] = 1 # bias 41 | s = 0 # sigma w^T phai(x_n) 42 | x.each do |i, x_i| 43 | s += @w[i] * x_i 44 | end 45 | yield @w, x, t, s 46 | end 47 | 48 | return c if pre_w == @w 49 | end 50 | nil 51 | end 52 | end 53 | 54 | 55 | # algorism 56 | 57 | def perceptron(traindata, degree, iteration) 58 | training = Train.new(degree) 59 | c = training.loop(traindata, iteration) do |w, x, t, s| 60 | if s * t <= 0 # error 61 | x.each do |i, x_i| 62 | w[i] += t * x_i 63 | end 64 | end 65 | end 66 | return [training, c] 67 | end 68 | 69 | def average_perceptron(traindata, degree, iteration) 70 | training = Train.new(degree) 71 | iteration.times do |c| 72 | w_a = Array.new(degree + 1, 0) # for average perceptron 73 | n = 0 74 | is_convergenced = training.loop(traindata) do |w, x, t, s| 75 | if s * t <= 0 # error 76 | x.each do |i, x_i| 77 | w[i] += t * x_i 78 | w_a[i] += t * x_i * n # for average perceptron 79 | end 80 | end 81 | n += 1 82 | end 83 | return [training, c] if is_convergenced 84 | w_a.each_with_index do |w_i, i| 85 | training.w[i] -= w_i.to_f / n # for averate perceptron 86 | end 87 | end 88 | return [training, nil] 89 | end 90 | 91 | def passive_aggressive(traindata, degree, iteration, aggressiveness=nil, regularity=0) 92 | training = Train.new(degree) 93 | c = training.loop(traindata, iteration) do |w, x, correct, predict| 94 | loss = 1 - correct * predict 95 | if loss > 0 96 | tau = loss.to_f / (square_abs(x) + regularity) 97 | tau = aggressiveness if aggressiveness && tau > aggressiveness 98 | x.each do |i, x_i| 99 | w[i] += tau * correct * x_i 100 | end 101 | end 102 | end 103 | return [training, c] 104 | end 105 | 106 | 107 | 108 | # load training data 109 | traindata = [] 110 | degree = 0 111 | open(ARGV[0]) do |f| 112 | while line = f.gets 113 | features = line.split 114 | sign = features.shift.to_i 115 | map = Hash.new 116 | features.each do |feature| 117 | if feature =~ /^([0-9]+):([\+\-]?[0-9\.]+)$/ 118 | term_id = $1.to_i 119 | map[term_id] = $2.to_f 120 | degree = term_id + 1 if degree <= term_id 121 | end 122 | end 123 | traindata << [map, sign] 124 | end 125 | end 126 | 127 | # training 128 | 129 | training, convergence = if opt[:algo] == :P 130 | perceptron(traindata, degree, opt[:iteration]) 131 | elsif opt[:algo] == :AP 132 | average_perceptron(traindata, degree, opt[:iteration]) 133 | elsif opt[:algo] == :PA 134 | passive_aggressive(traindata, degree, opt[:iteration]) 135 | elsif opt[:algo] == :PA1 136 | passive_aggressive(traindata, degree, opt[:iteration], opt[:regularity]) 137 | elsif opt[:algo] == :PA2 138 | passive_aggressive(traindata, degree, opt[:iteration], nil, 0.5 / opt[:regularity]) 139 | end 140 | 141 | puts "convergence: #{convergence}" if convergence 142 | open(ARGV[1], 'w'){|f| Marshal.dump(training.w, f) } 143 | 144 | -------------------------------------------------------------------------------- /privacy/randomized-response/rr-gibbs.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 - 3 | 4 | # Randomized Response (Gibbs Sampling) 5 | # This code is available under the MIT License. 6 | # (c)2021 Nakatani Shuyo / Cybozu Labs Inc. 
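# Outline of gibbs_sampling() below: the observed counts c are generated by
# pushing the true counts through the perturbation matrix P (randomized
# response), and each Gibbs sweep then alternates
#   X | Y, pi : for every observed category j, draw counts of the true
#               category from P(X=i | Y=j) = pi_i P[i,j] / sum_k pi_k P[k,j]
#   pi | X    : pi ~ Dirichlet(alpha + X)
# The first 200 of 400 sweeps are discarded as burn-in and the kept X/N
# samples are averaged into a single estimate per trial.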
7 | 8 | nlist = [100, 1000, 10000] 9 | 10 | import numpy 11 | import matplotlib.pyplot as plt 12 | numpy.set_printoptions(precision=2, suppress=True) 13 | 14 | true_prob = numpy.array([0.1, 0.2, 0.3, 0.4]) 15 | #true_prob /= true_prob.sum() 16 | D = true_prob.size 17 | legend = [str(x) for x in true_prob] 18 | 19 | p = 1/5 20 | pii = (1 + (D - 1) * p) / D # P(Y=i|X=i) 21 | pij = (1 - p) / D # P(Y=j|X=i) 22 | P = pij * numpy.ones((D, D)) + (pii - pij) * numpy.eye(D) 23 | 24 | def gibbs_sampling(N, alpha): 25 | true_count = numpy.array(N * true_prob, dtype=int) 26 | true_count[-1] += N - true_count.sum() 27 | 28 | predicts = [] 29 | for _ in range(10000): 30 | c = sum(numpy.random.multinomial(n, P[i,:]) for i, n in enumerate(true_count)) 31 | 32 | pi = numpy.ones(D) + numpy.random.random(D) # initial 33 | pi /= pi.sum() 34 | sample = [] 35 | for epoch in range(400): 36 | Q = pi * P.T # _ij = pi_j * P_ji 37 | cond = Q.T / Q.sum(axis=1) # _ij = pi_i * P_ij / Σ_k pi_k * P_kj 38 | 39 | # sampling X 40 | X = numpy.sum([numpy.random.multinomial(n, cond[:,i]) for i, n in enumerate(c)], axis=0) 41 | 42 | # sampling pi 43 | pi = numpy.random.dirichlet(alpha + X) 44 | 45 | if epoch >= 200: sample.append(X) 46 | 47 | predicts.append(numpy.mean(sample, axis=0)/N) 48 | 49 | return numpy.array(predicts) 50 | 51 | for N in nlist: 52 | for alpha in [1.0, 0.1, 0.01]: 53 | predicts = gibbs_sampling(N, alpha) 54 | start = predicts.min() 55 | end = predicts.max() 56 | bins = 40 57 | step = (end - start)/bins 58 | 59 | plt.hist(predicts, bins=numpy.arange(start, end, step), density=True) 60 | plt.title("N = %d, alpha = %.2f" % (N, alpha)) 61 | plt.legend(legend) 62 | plt.tight_layout() 63 | plt.savefig("rr-gibbs-%d-%.2f.png" % (N, alpha)) 64 | plt.close() 65 | 66 | print("N=%d, alpha=%.2f, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median" % (N, alpha)) 67 | print(numpy.vstack(( 68 | [true_prob, numpy.mean(predicts, axis=0), numpy.std(predicts, axis=0)], 69 | numpy.quantile(predicts, [0.025,0.975,0.5], axis=0)))) 70 | -------------------------------------------------------------------------------- /privacy/randomized-response/rr-mle.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 - 3 | 4 | # Randomized Response (Maximum Likelihood Estimation) 5 | # This code is available under the MIT License. 6 | # (c)2021 Nakatani Shuyo / Cybozu Labs Inc. 
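# With the settings below (D = 4, p = 1/5) the perturbation matrix gives
#   P(Y=i|X=i) = (1 + 3*0.2)/4 = 0.4,   P(Y=j|X=i) = (1 - 0.2)/4 = 0.2,
# so the observed counts satisfy E[c] = P^T t for the true counts t.  P is
# symmetric here, so numpy.linalg.solve(P, c) below gives the unbiased
# matrix-inversion estimate; its components can fall outside [0, 1] when N
# is small.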
7 | 8 | import numpy 9 | import matplotlib.pyplot as plt 10 | numpy.set_printoptions(precision=2,suppress=True) 11 | 12 | true_prob = numpy.array([0.1, 0.2, 0.3, 0.4]) 13 | #true_prob /= true_prob.sum() 14 | D = true_prob.size 15 | legend = [str(x) for x in true_prob] 16 | 17 | p = 1/5 18 | pii = (1 + (D - 1) * p) / D # P(Y=i|X=i) 19 | pij = (1 - p) / D # P(Y=j|X=i) 20 | P = pij * numpy.ones((D, D)) + (pii - pij) * numpy.eye(D) 21 | 22 | for N in [100, 1000, 10000]: 23 | true_count = numpy.array(N * true_prob, dtype=int) 24 | true_count[-1] += N - true_count.sum() 25 | 26 | predicts = [] 27 | for _ in range(10000): 28 | c = sum(numpy.random.multinomial(n, P[i,:]) for i, n in enumerate(true_count)) # Randomized Response 29 | t = numpy.linalg.solve(P, c) # MLE 30 | predicts.append(t/N) 31 | predicts = numpy.array(predicts) 32 | 33 | start = predicts.min() 34 | end = predicts.max() 35 | bins = 40 36 | step = (end - start)/bins 37 | 38 | plt.hist(predicts, bins=numpy.arange(start, end, step), density=True) 39 | plt.title("N = %d" % N) 40 | plt.legend(legend) 41 | plt.tight_layout() 42 | plt.savefig("rr-mle-%d.png" % N) 43 | plt.close() 44 | 45 | print("N=%d, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median" % N) 46 | print(numpy.vstack(( 47 | [true_prob, numpy.mean(predicts, axis=0), numpy.std(predicts, axis=0)], 48 | numpy.quantile(predicts, [0.025,0.975,0.5], axis=0)))) 49 | 50 | -------------------------------------------------------------------------------- /privacy/randomized-response/rr-vb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 - 3 | """Randomized Response (Collapsed Variational Bayesian) 4 | 5 | This code is available under the MIT License. 6 | (c)2021 Nakatani Shuyo / Cybozu Labs Inc. 
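The collapsed update in variational_bayes() removes respondent n's current
responsibility from the aggregated counts c and then sets

    q(X_n = i)  propto  P(Y_n | X_n = i) * (alpha + c_i)

before adding it back; sweeps stop when the implied pi = c / sum(c) moves by
less than 1e-7 (squared distance) between sweeps, or after 200 sweeps.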
7 | 8 | Usage: 9 | experiments 10 | $ python rr-vb.py 100 11 | 12 | summary 13 | $ python rr-vb.py 14 | """ 15 | 16 | import sys, os, itertools, json 17 | from multiprocessing import Pool 18 | import numpy 19 | from scipy.stats import gaussian_kde 20 | import matplotlib.pyplot as plt 21 | numpy.set_printoptions(precision=3, suppress=True) 22 | 23 | def iterproducts(list1, list2, cycles): 24 | for _ in range(cycles): 25 | iters = itertools.product(list1, list2) 26 | for x in iters: yield x 27 | 28 | true_prob = numpy.array([0.1, 0.2, 0.3, 0.4]) 29 | legend = [str(x) for x in true_prob] 30 | D = true_prob.size 31 | #true_prob /= true_prob.sum() 32 | 33 | p = 1/5 34 | pii = (1 + (D - 1) * p) / D # P(Y=i|X=i) 35 | pij = (1 - p) / D # P(Y=j|X=i) 36 | P = pij * numpy.ones((D, D)) + (pii - pij) * numpy.eye(D) 37 | 38 | datapath = "rr-vb.txt" 39 | 40 | def variational_bayes(args): 41 | print("start:(%d, %.2f)" % args) 42 | N, alpha = args 43 | true_count = numpy.array(N * true_prob, dtype=int) 44 | true_count[-1] += N - true_count.sum() 45 | true_cum = numpy.cumsum(true_count) 46 | 47 | predicts = [] 48 | for _ in range(10): 49 | Y = numpy.concatenate([numpy.random.choice(D, n, p=pb) for pb, n in zip(P, true_count)]) 50 | numpy.random.shuffle(Y) 51 | 52 | X = numpy.random.random((N,D)) # P(X_n) 53 | X = (X.T/X.sum(axis=1)).T 54 | c = X.sum(axis=0) 55 | 56 | pre = c / c.sum() 57 | for epoch in range(200): 58 | for n in range(N): 59 | c -= X[n,:] 60 | x = P[:,Y[n]] * (alpha + c) 61 | z = X[n,:] = x / x.sum() 62 | c += z 63 | pi = c / c.sum() 64 | if ((pi - pre)**2).sum() < 1e-7: break 65 | pre = pi 66 | #print(epoch, pi) 67 | predicts.append((c/N).tolist()) 68 | print("end:(%d, %.2f)" % args) 69 | return {"N":N, "alpha":alpha, "predicts":predicts} 70 | 71 | if __name__ == '__main__': 72 | if len(sys.argv)>1: 73 | I = int(sys.argv[1]) 74 | tasks = iterproducts([10000, 1000, 100], [1.0, 0.1, 0.01], I) 75 | with Pool(os.cpu_count()-1) as pool: 76 | for outputs in pool.imap(variational_bayes, tasks): 77 | #print(outputs) 78 | with open(datapath, "a") as f: 79 | json.dump(outputs, f) 80 | f.write("\n") 81 | else: 82 | data = dict() 83 | with open(datapath) as f: 84 | for s in f: 85 | x = json.loads(s) 86 | N = x["N"] 87 | alpha = x["alpha"] 88 | predicts = x["predicts"] 89 | key = (N, alpha) 90 | if key in data: 91 | data[key].extend(predicts) 92 | else: 93 | data[key] = predicts 94 | 95 | cm = plt.get_cmap("tab10") 96 | for key, predicts in data.items(): 97 | N, alpha = key 98 | print("VB: N=%d, alpha=%.2f, 1.true, 2.mean, 3.std, 4-5.95%%, 6.median (trials=%d)" % (N, alpha, len(predicts))) 99 | predicts = numpy.array(predicts) 100 | print(numpy.vstack(([true_prob, numpy.mean(predicts, axis=0), numpy.std(predicts, axis=0)], numpy.quantile(predicts, [0.025,0.975,0.5], axis=0)))) 101 | 102 | start = numpy.min(predicts) 103 | end = numpy.max(predicts) 104 | xseq = numpy.arange(start, end, 0.001) 105 | pdfs = [gaussian_kde(predicts[:,i])(xseq) for i in range(D)] 106 | bins = 50 107 | step = (end - start)/bins 108 | 109 | plt.hist(predicts, bins=numpy.arange(start, end, step), density=True) 110 | plt.legend(legend) 111 | for i in range(D): 112 | plt.plot(xseq, pdfs[i], color=cm.colors[i], linewidth=0.5) 113 | plt.title("VB : N = %d, alpha = %.2f" % (N, alpha)) 114 | plt.tight_layout() 115 | plt.savefig("rr-vb-%d-%.2f.png" % (N, alpha)) 116 | plt.close() 117 | 118 | -------------------------------------------------------------------------------- /sampling/hmc.r: 
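# (hmc.r) The sampler below proposes a new state by simulating Hamiltonian
# dynamics for the energy E(z) with a leapfrog integrator whose step size and
# step count are jittered on every iteration, accepts the proposal with
# probability min(1, exp(H_old - H_new)), and redraws the momentum r from
# N(0,1) after each accepted move; E and dE/dz are supplied per target density.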
-------------------------------------------------------------------------------- 1 | # Hybrid Monte Carlo sampling 2 | 3 | N <- 10000; # number of sampling 4 | 5 | hmc_sampling <- function(N, E, partial_E, leapfrog_count=100, leapfrog_epsilon=0.01) { 6 | r <- rnorm(1,0,1); 7 | z <- 1; 8 | zlist <- c(); 9 | 10 | for(i in 1:N) { 11 | H <- E(z) + r^2/2; 12 | # leapfrog 13 | e <- sample(c(-1,1), 1) * runif(1, 0.9, 1.1) * leapfrog_epsilon; 14 | z2 <- z; 15 | r2 <- r - e * partial_E(z2) / 2; 16 | count = sample(leapfrog_count:(leapfrog_count*2), 1) 17 | for(j in 1:count) { 18 | z2 <- z2 + e * r2; 19 | r2 <- r2 - e * partial_E(z2); 20 | } 21 | r2 <- r2 - e * partial_E(z2) / 2; 22 | dH <- H - (E(z2) + r2^2/2); 23 | if (dH > 0 || runif(1) < exp(dH)) { 24 | z <- z2; 25 | zlist <- append(zlist, z); 26 | 27 | # resampling of r from p(r|z) = p(r) = N(0,1) 28 | r <- rnorm(1,0,1); 29 | } else { 30 | #cat(sprintf("%d: rejected\n", i)); 31 | } 32 | } 33 | cat(sprintf("reject: %d / %d\n", N-length(zlist), N)); 34 | zlist; 35 | } 36 | 37 | png(); 38 | 39 | # p(z) = N(0,1) = exp(-z^2/2)/sqrt(2pi) 40 | # E(z) = z^2/2, Zp = sqrt(2pi), dE/dz = z 41 | # Hamiltonian: H(z,r) = E(z) + K(r) = z^2/2 + r^2/2 42 | zlist <- hmc_sampling(N, function(z)z**2/2, function(z)z); 43 | hist(zlist, breaks=20, main=sprintf("N(1,0), mean=%.3f, var=%.3f", mean(zlist), var(zlist))); 44 | acf(zlist); 45 | 46 | # p(z) = Gamma(a,b) = 1/Zp * exp((a-1)ln z - bz) 47 | # E(z) = -(a-1)ln z + bz, dE/dz = b - (a-1)/z 48 | a <- 3; 49 | b <- 2; 50 | zlist <- hmc_sampling(N, function(z)b*z-(a-1)*log(z), function(z)b-(a-1)/z); 51 | hist(zlist, breaks=20, main=sprintf("Gamma(%d,%d), mean=%.3f, var=%.3f", a, b, mean(zlist), var(zlist))); 52 | acf(zlist); 53 | 54 | # E(z) = 1/12 z(z-1)(z-4)(z-6) = (z^4-11z^3+34z^2-24z)/12 55 | # dE/dz = (4z^3-33z^2+68z-24)/12 56 | N <- 1000 57 | zlist <- hmc_sampling(N, function(z)z*(z-1)*(z-4)*(z-6)/12, function(z)(4*z**3-33*z**2+68*z-24)/12, 58 | leapfrog_count=400, leapfrog_epsilon=0.005); 59 | hist(zlist, breaks=30, main=sprintf("E(z)=z(z-1)(z-4)(z-6)/12, mean=%.3f, var=%.3f", mean(zlist), var(zlist)), freq=F, xlim=c(-2,8)); 60 | par(new=T); 61 | plot(function(z)exp(-z*(z-1)*(z-4)*(z-6)/12), xlim=c(-2,8), col="red", ann=F, yaxt="n"); 62 | 63 | plot(zlist, type="l"); 64 | acf(zlist); 65 | -------------------------------------------------------------------------------- /semisupervised/ssnb.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # encode: utf-8 3 | 4 | # Semi-Supervised Naive Bayes Classifier with EM-Algorithm 5 | # [K. Nigam, A. McCallum, S. Thrun, and T. Mitchell 2000] Text Classifcation from Labeled and Unlabeled Documents using EM. Machine Learning 6 | 7 | # This code is available under the MIT License. 8 | # (c)2013 Nakatani Shuyo / Cybozu Labs Inc. 
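# Outline of estimate() below: a multinomial naive Bayes model is first fit on
# n labeled documents per class (their word counts plus beta smoothing are kept
# in phi0 as fixed pseudo-counts), then EM runs over the whole corpus: the
# E-step computes class posteriors z for every document from the current phi
# and theta, and the M-step re-estimates the class priors theta (with Dirichlet
# prior alpha) and the word distributions phi from those soft counts.  Test-set
# accuracy is printed after every iteration.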
9 | 10 | 11 | import optparse 12 | import numpy, scipy 13 | import sklearn.datasets 14 | from sklearn.feature_extraction.text import CountVectorizer 15 | 16 | def performance(i, test, phi, theta): 17 | z = test.data * numpy.log(phi) + numpy.log(theta) # M * K 18 | z -= z.max(axis=1)[:, None] 19 | z = numpy.exp(z) 20 | z /= z.sum(axis=1)[:, None] 21 | predict = z.argmax(axis=1) 22 | correct = (test.target == predict).sum() 23 | T = test.data.shape[0] 24 | accuracy = float(correct) / T 25 | log_likelihood = numpy.log(numpy.choose(test.target, z.T) + 1e-14).sum() / T 26 | 27 | print "%d : %d / %d = %.3f, average of log likelihood = %.3f" % (i, correct, T, accuracy, log_likelihood) 28 | return accuracy 29 | 30 | def estimate(data, test, alpha, beta, n, K=None): 31 | M, V = data.data.shape 32 | if not K: 33 | K = data.target.max() + 1 34 | #if opt.training: 35 | # train = [int(x) for x in opt.training.split(",")] 36 | #else: 37 | train = [] 38 | for k in xrange(K): 39 | train.extend(numpy.random.choice((data.target==k).nonzero()[0], n)) 40 | 41 | theta = numpy.ones(K) / K 42 | phi0 = numpy.zeros((V, K)) + beta 43 | for n in train: 44 | phi0[:, data.target[n]] += data.data[n, :].toarray().flatten() 45 | phi = phi0 / phi0.sum(axis=0) 46 | accuracy0 = performance(0, test, phi, theta) 47 | 48 | for i in xrange(20): 49 | # E-step 50 | z = data.data * numpy.log(phi) + numpy.log(theta) # M * K 51 | z -= z.max(axis=1)[:, None] 52 | z = numpy.exp(z) 53 | z /= z.sum(axis=1)[:, None] 54 | 55 | # M-step 56 | theta = z.sum(axis=0) + alpha 57 | theta /= theta.sum() 58 | phi = phi0 + data.data.T * z 59 | phi = phi / phi.sum(axis=0) 60 | 61 | accuracy = performance(i+1, test, phi, theta) 62 | 63 | return len(train), accuracy0, accuracy 64 | 65 | def main(): 66 | parser = optparse.OptionParser() 67 | 68 | parser.add_option("-K", dest="class_size", type="int", help="number of class") 69 | parser.add_option("-a", dest="alpha", type="float", help="parameter alpha", default=0.05) 70 | parser.add_option("-b", dest="beta", type="float", help="parameter beta", default=0.001) 71 | #parser.add_option("-n", dest="n", type="int", help="training size for each label", default=1) 72 | #parser.add_option("-t", dest="training", help="specify indexes of training", default=None) 73 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 74 | (opt, args) = parser.parse_args() 75 | numpy.random.seed(opt.seed) 76 | 77 | data = sklearn.datasets.fetch_20newsgroups() 78 | test = sklearn.datasets.fetch_20newsgroups(subset='test') 79 | 80 | vec = CountVectorizer() 81 | data.data = vec.fit_transform(data.data).tocsr() 82 | test.data = vec.transform(test.data).tocsr() # use the same vocaburary of training data 83 | 84 | print "(data size, voca size) : (%d, %d)" % data.data.shape 85 | print "(test size, voca size) : (%d, %d)" % test.data.shape 86 | 87 | if opt.class_size: 88 | """ 89 | index = data.target < opt.class_size 90 | a = data.data.toarray()[index, :] 91 | data.data = scipy.sparse.csr_matrix(a) 92 | data.target = data.target[index] 93 | print "(shrinked data size, voca size) : (%d, %d)" % data.data.shape 94 | """ 95 | 96 | index = test.target < opt.class_size 97 | a = test.data.toarray()[index, :] 98 | test.data = scipy.sparse.csr_matrix(a) 99 | test.target = test.target[index] 100 | print "(shrinked test size, voca size) : (%d, %d)" % test.data.shape 101 | 102 | 103 | result = [] 104 | for n in xrange(50): 105 | result.append(estimate(data, test, opt.alpha, opt.beta, n+1, 2)) 106 | for x in result: 107 | print 
x 108 | 109 | if __name__ == "__main__": 110 | main() 111 | 112 | -------------------------------------------------------------------------------- /sequence/testcrf.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # Project Gutenberg Content Extractor with CRF 4 | 5 | import numpy 6 | import time 7 | from optparse import OptionParser 8 | from crf import CRF, Features, FeatureVector 9 | 10 | 11 | def main(): 12 | def load_data(data): 13 | texts = [] 14 | labels = [] 15 | text = [] 16 | data = "\n" + data + "\n" 17 | for line in data.split("\n"): 18 | line = line.strip() 19 | if len(line) == 0: 20 | if len(text)>0: 21 | texts.append(text) 22 | labels.append(label) 23 | text = [] 24 | label = [] 25 | else: 26 | token, info, chunk = line.split() 27 | text.append((token, info)) 28 | label.append(chunk) 29 | return (texts, labels) 30 | 31 | texts, labels = load_data(""" 32 | This DT B-NP 33 | temblor-prone JJ I-NP 34 | city NN I-NP 35 | dispatched VBD B-VP 36 | inspectors NNS B-NP 37 | , , O 38 | 39 | firefighters NNS B-NP 40 | and CC O 41 | other JJ B-NP 42 | earthquake-trained JJ I-NP 43 | personnel NNS I-NP 44 | to TO B-VP 45 | aid VB I-VP 46 | San NNP B-NP 47 | Francisco NNP I-NP 48 | . . O 49 | """) 50 | 51 | print texts, labels 52 | 53 | test_texts, test_labels = load_data(""" 54 | Rockwell NNP B-NP 55 | said VBD B-VP 56 | the DT B-NP 57 | agreement NN I-NP 58 | calls VBZ B-VP 59 | for IN B-SBAR 60 | it PRP B-NP 61 | to TO B-VP 62 | supply VB I-VP 63 | 200 CD B-NP 64 | additional JJ I-NP 65 | so-called JJ I-NP 66 | shipsets NNS I-NP 67 | for IN B-PP 68 | the DT B-NP 69 | planes NNS I-NP 70 | . . O 71 | """) 72 | 73 | features = Features(labels) 74 | tokens = dict([(i[0],1) for x in texts for i in x]).keys() 75 | infos = dict([(i[1],1) for x in texts for i in x]).keys() 76 | 77 | for label in features.labels: 78 | for token in tokens: 79 | features.add_feature( lambda x, y, l=label, t=token: 1 if y==l and x[0]==t else 0 ) 80 | for info in infos: 81 | features.add_feature( lambda x, y, l=label, i=info: 1 if y==l and x[1]==i else 0 ) 82 | features.add_feature_edge( lambda y_, y: 0 ) 83 | 84 | fvs = [FeatureVector(features, x, y) for x, y in zip(texts, labels)] 85 | fv = fvs[0] 86 | text_fv = FeatureVector(features, test_texts[0]) # text sequence without labels 87 | 88 | 89 | crf = CRF(features, 0) 90 | theta0 = crf.random_param() 91 | print "initial log likelihood:", crf.likelihood(fvs, theta0) 92 | 93 | 94 | print ">> Steepest Descent" 95 | theta = theta0.copy() 96 | eta = 0.5 97 | t = time.time() 98 | for i in range(20): 99 | theta += eta * crf.gradient_likelihood(fvs, theta) 100 | print i, "log likelihood:", crf.likelihood(fvs, theta) 101 | eta *= 0.95 102 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 103 | 104 | print ">> SGD" 105 | theta = theta0.copy() 106 | eta = 0.5 107 | t = time.time() 108 | for i in range(20): 109 | for fv in fvs: 110 | theta += eta * crf.gradient_likelihood([fv], theta) 111 | print i, "log likelihood:", crf.likelihood(fvs, theta) 112 | eta *= 0.95 113 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 114 | 115 | print ">> SGD + FOBOS L1" 116 | theta = theta0.copy() 117 | eta = 0.5 118 | lmd = 0.01 119 | t = time.time() 120 | for i in range(20): 121 | lmd_eta = lmd * eta 122 | for fv in fvs: 123 | theta += eta * 
crf.gradient_likelihood([fv], theta) 124 | theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta) 125 | print i, "log likelihood:", crf.likelihood(fvs, theta) 126 | eta *= 0.95 127 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 128 | 129 | print ">> Steepest Descent + FOBOS L1" 130 | theta = theta0.copy() 131 | eta = 0.2 132 | lmd = 0.5 133 | t = time.time() 134 | for i in range(20): 135 | theta += eta * crf.gradient_likelihood(fvs, theta) 136 | lmd_eta = lmd * eta 137 | theta = (theta > lmd_eta) * (theta - lmd_eta) + (theta < -lmd_eta) * (theta + lmd_eta) 138 | print i, "log likelihood:", crf.likelihood(fvs, theta) 139 | eta *= 0.9 140 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 141 | #print theta 142 | 143 | print ">> BFGS" 144 | t = time.time() 145 | theta = crf.inference(fvs, theta0) 146 | print "log likelihood:", crf.likelihood(fvs, theta) 147 | print "time = %.3f, relevant features = %d / %d" % (time.time() - t, (numpy.abs(theta) > 0.00001).sum(), theta.size) 148 | 149 | 150 | if __name__ == "__main__": 151 | main() 152 | 153 | -------------------------------------------------------------------------------- /trie/da.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import collections 5 | import numpy 6 | 7 | # Double Array for static ordered data 8 | # This code is available under the MIT License. 9 | # (c)2011 Nakatani Shuyo / Cybozu Labs Inc. 10 | 11 | class DoubleArray(object): 12 | def __init__(self, verbose=False): 13 | self.verbose = verbose 14 | 15 | def validate_list(self, list): 16 | pre = "" 17 | for i, line in enumerate(list): 18 | if pre >= line: 19 | raise Exception, "list has not ascent order at %d" % (i+1) 20 | pre = line 21 | 22 | def initialize(self, list): 23 | self.validate_list(list) 24 | 25 | self.N = 1 26 | self.base = [-1] 27 | self.check = [-1] 28 | self.value = [-1] 29 | 30 | max_index = 0 31 | queue = collections.deque([(0, 0, len(list), 0)]) 32 | while len(queue) > 0: 33 | index, left, right, depth = queue.popleft() 34 | if depth >= len(list[left]): 35 | self.value[index] = left 36 | left += 1 37 | if left >= right: continue 38 | 39 | # get branches of current node 40 | stack = collections.deque([(right, -1)]) 41 | cur, c1 = (left, ord(list[left][depth])) 42 | result = [] 43 | while len(stack) >= 1: 44 | while c1 == stack[-1][1]: 45 | cur, c1 = stack.pop() 46 | mid = (cur + stack[-1][0]) / 2 47 | if cur == mid: 48 | result.append((cur + 1, c1)) 49 | cur, c1 = stack.pop() 50 | else: 51 | c2 = ord(list[mid][depth]) 52 | if c1 != c2: 53 | stack.append((mid, c2)) 54 | else: 55 | cur = mid 56 | 57 | # search empty index for current node 58 | v0 = result[0][1] 59 | j = - self.check[0] - v0 60 | while any(j + v < self.N and self.check[j + v] >= 0 for right, v in result): 61 | j = - self.check[j + v0] - v0 62 | tail_index = j + result[-1][1] 63 | if max_index < tail_index: 64 | max_index = tail_index 65 | self.extend_array(tail_index + 2) 66 | 67 | # insert current node into DA 68 | self.base[index] = j 69 | depth += 1 70 | for right, v in result: 71 | child = j + v 72 | self.check[self.base[child]] = self.check[child] 73 | self.base[-self.check[child]] = self.base[child] 74 | self.check[child] = index 75 | queue.append((child, left, right, depth)) 76 | left = right 77 | 78 | 
self.shrink_array(max_index) 79 | 80 | def extend_array(self, max_cand): 81 | if self.N < max_cand: 82 | new_N = 2 ** int(numpy.ceil(numpy.log2(max_cand))) 83 | self.log("extend DA : %d => (%d) => %d", (self.N, max_cand, new_N)) 84 | self.base.extend( n - 1 for n in xrange(self.N, new_N)) 85 | self.check.extend( - n - 1 for n in xrange(self.N, new_N)) 86 | self.value.extend( - 1 for n in xrange(self.N, new_N)) 87 | self.N = new_N 88 | 89 | def shrink_array(self, max_index): 90 | self.log("shrink DA : %d => %d", (self.N, max_index + 1)) 91 | self.N = max_index + 1 92 | self.check = numpy.array(self.check[:self.N]) 93 | self.base = numpy.array(self.base[:self.N]) 94 | self.value = numpy.array(self.value[:self.N]) 95 | 96 | not_used = self.check < 0 97 | self.check[not_used] = -1 98 | not_used[0] = False 99 | self.base[not_used] = self.N 100 | 101 | def log(self, format, param): 102 | if self.verbose: 103 | import time 104 | print "-- %s, %s" % (time.strftime("%Y/%m/%d %H:%M:%S"), format % param) 105 | 106 | def save(self, filename): 107 | numpy.savez(filename, base=self.base, check=self.check, value=self.value) 108 | 109 | def load(self, filename): 110 | loaded = numpy.load(filename) 111 | self.base = loaded['base'] 112 | self.check = loaded['check'] 113 | self.value = loaded['value'] 114 | self.N = self.base.size 115 | 116 | def add_element(self, s, v): 117 | pass 118 | 119 | def get_subtree(self, s): 120 | cur = 0 121 | for c in iter(s): 122 | v = ord(c) 123 | next = self.base[cur] + v 124 | if next >= self.N or self.check[next] != cur: 125 | return None 126 | cur = next 127 | return cur 128 | 129 | def get_child(self, c, subtree): 130 | v = ord(c) 131 | next = self.base[subtree] + v 132 | if next >= self.N or self.check[next] != subtree: 133 | return None 134 | return next 135 | 136 | def get(self, s): 137 | cur = self.get_subtree(s) 138 | if cur >= 0: 139 | value = self.value[cur] 140 | if value >= 0: return value 141 | return None 142 | 143 | def get_value(self, subtree): 144 | return self.value[subtree] 145 | 146 | def extract_features(self, st): 147 | events = dict() 148 | pointers = [] 149 | for c in iter(st): 150 | pointers.append(0) 151 | new_pointers = [] 152 | for pointer in pointers: 153 | p = self.get_child(c, pointer) 154 | if p is not None: 155 | new_pointers.append(p) 156 | id = self.value[p] 157 | if id >= 0: 158 | events[id] = events.get(id, 0) + 1 159 | pointers = new_pointers 160 | return events 161 | 162 | 163 | 164 | -------------------------------------------------------------------------------- /trie/test_da.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | import unittest 5 | import da 6 | 7 | class TestDoubleArray(unittest.TestCase): 8 | def test1(self): 9 | trie = da.DoubleArray(verbose=False) 10 | trie.initialize(["cat"]) 11 | self.assertEqual(trie.N, 4) 12 | self.assert_(trie.get("ca") is None) 13 | self.assert_(trie.get("xxx") is None) 14 | self.assertEqual(trie.get("cat"), 0) 15 | 16 | def test2(self): 17 | trie = da.DoubleArray() 18 | trie.initialize(["cat", "dog"]) 19 | self.assertEqual(trie.N, 7) 20 | self.assert_(trie.get("ca") is None) 21 | self.assert_(trie.get("xxx") is None) 22 | self.assertEqual(trie.get("cat"), 0) 23 | self.assertEqual(trie.get("dog"), 1) 24 | 25 | def test3(self): 26 | trie = da.DoubleArray(verbose=False) 27 | trie.initialize(["ca", "cat", "deer", "dog", "fox", "rat"]) 28 | print trie.base 29 | print trie.check 30 | print trie.value 31 | 
self.assertEqual(trie.N, 17) 32 | self.assert_(trie.get("c") is None) 33 | self.assertEqual(trie.get("ca"), 0) 34 | self.assertEqual(trie.get("cat"), 1) 35 | self.assertEqual(trie.get("deer"), 2) 36 | self.assertEqual(trie.get("dog"), 3) 37 | self.assert_(trie.get("xxx") is None) 38 | 39 | def test4(self): 40 | trie = da.DoubleArray() 41 | self.assertRaises(Exception, trie.initialize, ["cat", "ant"]) 42 | 43 | unittest.main() 44 | 45 | -------------------------------------------------------------------------------- /trie/trie.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # Naive Trie 5 | # This code is available under the MIT License. 6 | # (c)2011 Nakatani Shuyo / Cybozu Labs Inc. 7 | 8 | class Trie(object): 9 | def initialize(self): 10 | self.root = dict() 11 | def add_element(self, s, v): 12 | x = self.root 13 | for c in s: 14 | if c not in x: x[c] = dict() 15 | x = x[c] 16 | x[""] = v 17 | def get_subtree(self, s): 18 | x = self.root 19 | for c in iter(st): 20 | if c not in x: return None 21 | x = x[c] 22 | return x 23 | def get_child(self, c, subtree): 24 | if c not in x: return None 25 | return subtree[c] 26 | def get(self, s): 27 | return self.get_value(self.get_subtree(s)) 28 | def get_value(self, subtree): 29 | return subtree[""] 30 | 31 | -------------------------------------------------------------------------------- /unsupervised/bs.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -KN 2 | # ./bs.rb [corpus files] 3 | 4 | begin 5 | #raise 6 | require '../lib/infinitive.rb' 7 | INF = Infinitive.new 8 | rescue 9 | module INF 10 | def self.infinitive(word) 11 | word.downcase 12 | end 13 | end 14 | end 15 | 16 | docs = Array.new 17 | words = Hash.new{|h,k| h[k]=Hash.new } 18 | worddocs = Hash.new{|h,k| h[k]=Hash.new } 19 | while filename = ARGV.shift 20 | puts "loading: #{filename}" 21 | vec = Hash.new(0) 22 | doc_id = docs.length 23 | open(filename) do |f| 24 | while line = f.gets 25 | line.scan(/[A-Za-z]+/) do |word| 26 | infword = INF.infinitive(word) 27 | vec[infword] = 1 28 | words[infword][word] = 1 29 | worddocs[infword][doc_id] = 1 30 | end 31 | if vec.size > 100 32 | docs << vec 33 | doc_id = docs.length 34 | vec = Hash.new(0) 35 | end 36 | end 37 | end 38 | docs << vec if vec.size > 0 39 | end 40 | 41 | class BayesianSet 42 | C = 2.0 43 | def initialize(docs, words, worddocs) 44 | @docs = docs 45 | @words = words 46 | @worddocs = worddocs 47 | @alpha = docs.map{|vec| C * vec.size / words.length } 48 | @beta = @alpha.map{|a| C - a } 49 | puts "# of words = #{words.size}, # of docs = #{docs.length}" 50 | end 51 | 52 | def search(query) 53 | query = query.map{|x| INF.infinitive(x)}.uniq 54 | n = query.length 55 | alpha_tild = Array.new # ln(alpha~/alpha) 56 | beta_tild = Array.new # ln(beta~/beta) 57 | @alpha.each_with_index do |a, i| 58 | s = query.select{|w| @docs[i].key?(w) }.length 59 | alpha_tild << Math.log(1 + s / a) 60 | beta_tild << Math.log(1 + (n - s) / @beta[i]) 61 | end 62 | 63 | @worddocs.map do |w, docs| 64 | score = 0 65 | =begin 66 | # method of original paper 67 | @docs.each_with_index do |vec, j| 68 | score += Math.log(@alpha[j]+@beta[j])-Math.log(@alpha[j]+@beta[j]+n) 69 | score += if vec.key?(w) then alpha_tild[j] else beta_tild[j] end 70 | end 71 | =end 72 | # simple & fast 73 | docs.each do |j, dummy| 74 | score += alpha_tild[j] - beta_tild[j] 75 | end 76 | 77 | [w, score] 78 | end.sort_by{|x| 
-x[1]}[0..9].each do |w, score| 79 | puts "#{w}: #{score} (#{@words[w].keys.join(',')})" 80 | end 81 | end 82 | end 83 | 84 | bs = BayesianSet.new(docs, words, worddocs) 85 | #if ARGV.length > 1 86 | # bs.search(ARGV[1..-1]) 87 | #else 88 | while input = $stdin.gets 89 | bs.search(input.split) 90 | puts 91 | end 92 | #end 93 | 94 | -------------------------------------------------------------------------------- /unsupervised/ema.r: -------------------------------------------------------------------------------- 1 | # EM algorithm and Online EMA 2 | 3 | 4 | argv <- commandArgs(T); 5 | if (length(argv[argv=="faithful"])) { 6 | # Old Faithful dataset を取得して正規化 7 | data("faithful"); 8 | xx <- scale(faithful, apply(faithful, 2, mean), apply(faithful, 2, sd)); 9 | K <- 2; 10 | } else { 11 | # 3次元&3峰のテストデータを生成 12 | library(MASS); 13 | xx <- rbind( 14 | mvrnorm(100, c(1,3,0), matrix(c(0.7324,-0.9193,0.5092,-0.9193,2.865,-0.2976,0.5092,-0.2976,3.294),3)), 15 | mvrnorm(150, c(4,-1,-2), matrix(c(2.8879,-0.2560,0.5875,-0.2560,3.0338,1.2960,0.5875,1.2960,1.7438),3)), 16 | mvrnorm(200, c(0,2,1), matrix(c(3.1178,1.7447,0.6726,1.7447,2.3693,0.0521,0.6726,0.0521,0.7917),3)) 17 | ); 18 | xx <- xx[sample(nrow(xx)),] 19 | K <- 3; 20 | } 21 | N <- nrow(xx); 22 | 23 | 24 | # パラメータの初期化(平均、共分散、混合率) 25 | init_param <- function(K, D) { 26 | sig <- list(); 27 | for(k in 1:K) sig[[k]] <- diag(K); 28 | list(mu = matrix(rnorm(K * D), D), mix = numeric(K)+1/K, sig = sig); 29 | } 30 | 31 | # 多次元正規分布密度関数 32 | dmnorm <- function(x, mu, sig) { 33 | D <- length(mu); 34 | 1/((2 * pi)^D * sqrt(det(sig))) * exp(- t(x-mu) %*% solve(sig) %*% (x-mu) / 2)[1]; 35 | } 36 | 37 | # EM アルゴリズムの E ステップ 38 | Estep <- function(xx, param) { 39 | K <- nrow(param$mu); 40 | t(apply(xx, 1, function(x){ 41 | numer <- param$mix * sapply(1:K, function(k) { 42 | dmnorm(x, param$mu[k,], param$sig[[k]]) 43 | }); 44 | numer / sum(numer); 45 | })) 46 | } 47 | 48 | # EM アルゴリズムの M ステップ 49 | Mstep <- function(xx, gamma_nk) { 50 | K <- ncol(gamma_nk); 51 | D <- ncol(xx); 52 | N <- nrow(xx); 53 | 54 | N_k <- colSums(gamma_nk); 55 | new_mix <- N_k / N; 56 | new_mu <- (t(gamma_nk) %*% xx) / N_k; 57 | 58 | new_sig <- list(); 59 | for(k in 1:K) { 60 | sig <- matrix(numeric(D^2), D); 61 | for(n in 1:N) { 62 | x <- xx[n,] - new_mu[k,]; 63 | sig <- sig + gamma_nk[n, k] * (x %*% t(x)); 64 | } 65 | new_sig[[k]] <- sig / N_k[k] 66 | } 67 | 68 | list(mu=new_mu, sig=new_sig, mix=new_mix); 69 | } 70 | 71 | # 対数尤度関数 72 | Likelihood <- function(xx, param) { 73 | K <- nrow(param$mu); 74 | sum(apply(xx, 1, function(x){ 75 | log(sum(param$mix * sapply(1:K, function(k) dmnorm(x, param$mu[k,], param$sig[[k]])))); 76 | })) 77 | } 78 | 79 | OnlineEM <- function(xx, m, param) { 80 | N <- nrow(xx); 81 | K <- nrow(param$mu); 82 | 83 | new_gamma <- param$mix * sapply(1:K, function(k) { 84 | dmnorm(xx[m, ], param$mu[k,], param$sig[[k]]); 85 | }); 86 | new_gamma <- new_gamma / sum(new_gamma); 87 | delta <- new_gamma - param$gamma[m,]; 88 | param$gamma[m,] <- new_gamma; 89 | 90 | param$mix <- param$mix + delta / N; 91 | N_k <- param$mix * N; 92 | for(k in 1:K) { 93 | x <- xx[m,] - param$mu[k,]; 94 | d <- delta[k] / N_k[k]; 95 | param$mu[k,] <- param$mu[k,] + d * x; 96 | param$sig[[k]] <- (1 - d) * (param$sig[[k]] + d * x %*% t(x)); 97 | } 98 | param; 99 | } 100 | 101 | 102 | for (n in 1:10) { 103 | # 初期値 104 | param0 <- init_param(K, ncol(xx)); 105 | 106 | # normal EM 107 | timing <- system.time({ 108 | param <- param0; 109 | 110 | # 収束するまで繰り返し 111 | likeli <- -999999; 112 | for (j in 
1:999) { 113 | gamma_nk <- Estep(xx, param); 114 | param <- Mstep(xx, gamma_nk); 115 | 116 | cat(sprintf(" %d: %.3f\n", j, (l <- Likelihood(xx, param)))); 117 | if (l - likeli < 0.001) break; 118 | likeli <- l; 119 | } 120 | }); 121 | cat(sprintf("Normal %d:convergence=%d, likelihood=%.4f, %1.2fsec\n", n, j, likeli, timing[3])); 122 | #print(param$mu); 123 | 124 | # incremental EM 125 | timing <- system.time({ 126 | param <- param0; 127 | 128 | # 最初の一周は通常の EM 129 | gamma_nk <- Estep(xx, param); 130 | param <- Mstep(xx, gamma_nk); 131 | param$gamma <- gamma_nk; 132 | 133 | # online EM 134 | likeli <- -999999; 135 | for (j in 2:100) { 136 | randomlist <- sample(1:N); 137 | for(m in randomlist) param <- OnlineEM(xx, m, param); 138 | 139 | cat(sprintf(" %d: %.3f\n", j, (l <- Likelihood(xx, param)))); 140 | if (l - likeli < 0.001) break; 141 | likeli <- l; 142 | } 143 | }); 144 | cat(sprintf("Online %d:convergence=%d, likelihood=%.4f, %1.2fsec\n", n, j, likeli, timing[3])); 145 | #print(param$mu); 146 | } 147 | 148 | # plot(xx, col=rgb(gamma_nk[,1],0,gamma_nk[,2]), xlab=paste(sprintf("%1.3f",t(param$mu)),collapse=","), ylab=""); 149 | # points(param$mu, pch = 8); 150 | 151 | -------------------------------------------------------------------------------- /unsupervised/plsi.rb: -------------------------------------------------------------------------------- 1 | #!/usr/bin/ruby -KN 2 | # ./plsi.rb [corpus files] 3 | 4 | begin 5 | require '../lib/infinitive.rb' 6 | INF = Infinitive.new 7 | rescue 8 | module INF 9 | def self.infinitive(word);word.downcase;end 10 | end 11 | end 12 | 13 | def dump(obj) 14 | if obj.is_a?(Numeric) 15 | (obj*1000).round/1000.0 16 | elsif obj.is_a?(String) 17 | obj 18 | elsif obj.is_a?(Array) 19 | "[#{obj.map{|x| dump(x)}.join(',')}]\n" 20 | elsif obj.is_a?(Hash) 21 | "{#{obj.map{|k,v| "#{k}=>#{dump(v)}"}.join(',')}}\n" 22 | end 23 | end 24 | 25 | docs = Array.new 26 | words = Hash.new{|h,k| h[k]=Hash.new(0) } 27 | worddocs = Hash.new{|h,k| h[k]=Hash.new(0) } 28 | while filename = ARGV.shift 29 | puts "loading: #{filename}" 30 | texts = open(filename) {|f| f.read }.split(/\n\n+/) 31 | 32 | texts.each_with_index do |text, doc_id| 33 | vec = Hash.new(0) 34 | docs << vec 35 | text.scan(/[A-Za-z]+/) do |word| 36 | infword = INF.infinitive(word) 37 | vec[infword] += 1 38 | words[infword][word] += 1 39 | worddocs[infword][doc_id] += 1 40 | end 41 | end 42 | end 43 | puts "# of words = #{words.size}, # of docs = #{docs.length}" 44 | 45 | class PLSI 46 | K = 20 47 | def initialize(docs, words, worddocs) 48 | @docs = docs 49 | @words = words 50 | @worddocs = worddocs 51 | 52 | @z_k = Array.new(K){1.0/K} 53 | @d_i_z_k = Array.new(K){ Array.new(docs.length){1.0/docs.length} } 54 | @w_j_z_k = Array.new(K){ 55 | h = Hash.new 56 | s = 0 57 | worddocs.each{|j,x| s+=(h[j]=rand) } 58 | worddocs.each{|j,x| h[j]/=s } 59 | h 60 | } 61 | end 62 | 63 | def stepEM 64 | new_z_k_numer = Array.new(K){0} 65 | new_z_k_denom = 0 66 | new_d_i_numer = Array.new(K){ Array.new(@docs.length){0} } 67 | new_w_j_numer = Array.new(K){ Hash.new(0) } 68 | 69 | @worddocs.each do |j, n_w_j| 70 | #(0..@docs.length-1).each do |i| 71 | #n_w_j_d_i = n_w_j[i] 72 | n_w_j.each do |i, n_w_j_d_i| 73 | 74 | # E-step 75 | posterior_denom = 0 76 | posterior_numers = Array.new(K) 77 | (0..K-1).each do |k| 78 | # p(z=k)p(x|z)p(y|z) 79 | posterior_denom += (posterior_numers[k] = @z_k[k] * @d_i_z_k[k][i] * @w_j_z_k[k][j]) 80 | end 81 | 82 | # M-step 83 | posterior_numers.each_with_index do |posterior_numer, k| 84 | x = 
n_w_j_d_i * posterior_numer / posterior_denom 85 | new_z_k_numer[k] += x 86 | new_d_i_numer[k][i] += x 87 | new_w_j_numer[k][j] += x 88 | end 89 | new_z_k_denom += n_w_j_d_i 90 | end 91 | end 92 | 93 | @z_k = new_z_k_numer.map{|x| x / new_z_k_denom } 94 | 95 | new_d_i_numer.each_with_index do |d_i, k| 96 | d_i.each_with_index do |numer, i| 97 | @d_i_z_k[k][i] = numer / new_z_k_numer[k] 98 | end 99 | end 100 | 101 | new_w_j_numer.each_with_index do |w_j, k| 102 | w_j.each do |j, numer| 103 | @w_j_z_k[k][j] = numer / new_z_k_numer[k] 104 | end 105 | end 106 | 107 | #puts "----" 108 | puts dump(@z_k) 109 | #puts dump(@d_i_z_k) 110 | #puts dump(@w_j_z_k) 111 | end 112 | 113 | def max_z_k_w_j 114 | cluster = Array.new(K){ Array.new } 115 | @worddocs.each do |j, n_w_j| 116 | argmax_k = nil 117 | max_z_k = 0 118 | sum = 0 119 | (0..K-1).each do |k| 120 | p_z_k_w_j = @z_k[k] * @w_j_z_k[k][j] 121 | sum += p_z_k_w_j 122 | if max_z_k < p_z_k_w_j 123 | max_z_k = p_z_k_w_j 124 | argmax_k = k 125 | end 126 | end 127 | cluster[argmax_k] << [j, max_z_k / sum] 128 | end 129 | cluster 130 | end 131 | end 132 | 133 | plsi = PLSI.new(docs, words, worddocs) 134 | 200.times{ plsi.stepEM } 135 | 136 | cluster = plsi.max_z_k_w_j 137 | cluster.each_with_index do |words, k| 138 | puts " cluster: #{k}" 139 | sep = 1.0 140 | output = [] 141 | words.sort_by{|x| -x[1] }.each do |x| 142 | while sep >= x[1] 143 | output << (sep*10).round 144 | sep -= 0.1 145 | end 146 | output << x[0] 147 | end 148 | puts output.join(',') 149 | end 150 | 151 | -------------------------------------------------------------------------------- /unsupervised/vb.r: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shuyo/iir/a9b133f27e8ab5b8ef6f528c1f212717399d852f/unsupervised/vb.r --------------------------------------------------------------------------------