├── LICENSE
├── README.md
├── lda.py
├── requirements.txt
├── stm.py
├── stm_main.py
├── utils.py
└── vocabulary.py

/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2020 Retrieva, Inc.

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Structured topic model

Paper: [A Model of Text for Experimentation in the Social Sciences](https://scholar.princeton.edu/sites/default/files/bstewart/files/a_model_of_text_for_experimentation_in_the_social_sciences.pdf)

```
python stm_main.py -f <corpus.csv> -k <num_topics> -i <num_iterations>
```

```
optional arguments:
  -h, --help        show this help message and exit
  -f FILENAME       Set corpus filepath. File format is CSV
  -d DOCUMENT       Set document field name
  -c CORPUS         Using range of Brown corpus' files (start:end)
  --alpha ALPHA     Parameter alpha for LDA (default=1.0)
  --beta BETA       Parameter beta for LDA (default=0.1)
  -k TOPICS         Number of topics (default=20)
  -i ITERATION      Iteration count (default=100)
  -x X              Set prevalence covariate column name(s), comma-separated
  -y Y              Set content covariate column name
  --parser PARSER   Select parser: eng_nltk or mecab (default=mecab)
  --sigma SIGMA     Initial value of Sigma diagonals (default=0.1)
  --stopwords       Exclude stop words by using the corpus from nltk
  --seed SEED       Random seed
  --df DF           Threshold of document frequency to cut words
  --interact        Consider interaction between covariates and topics
  --sinit           Smart initialization of parameters for LDA
```

Note
- Specifying `-y` slows down the computation. Perplexity also tends not to decrease much in that case.
--------------------------------------------------------------------------------
/lda.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Latent Dirichlet Allocation + collapsed Gibbs sampling
# This code is available under the MIT License.
# (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc.
7 | 8 | import numpy 9 | 10 | class LDA: 11 | def __init__(self, K, alpha, beta, docs, V, smartinit=True): 12 | self.K = K 13 | self.alpha = alpha # parameter of topics prior 14 | self.beta = beta # parameter of words prior 15 | self.docs = docs 16 | self.V = V 17 | 18 | self.z_m_n = [] # topics of words of documents 19 | self.n_m_z = numpy.full((len(self.docs), K), alpha) # word count of each document and topic 20 | self.n_z_t = numpy.full((K, V), beta) # word count of each topic and vocabulary 21 | self.n_z = numpy.full(K, V * beta) # word count of each topic 22 | 23 | self.N = 0 24 | for m, doc in enumerate(docs): 25 | self.N += len(doc) 26 | z_n = [] 27 | for t in doc: 28 | if smartinit: 29 | p_z = self.n_z_t[:, t] * self.n_m_z[m] / self.n_z 30 | z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax() 31 | else: 32 | z = numpy.random.randint(0, K) 33 | z_n.append(z) 34 | self.n_m_z[m, z] += 1 35 | self.n_z_t[z, t] += 1 36 | self.n_z[z] += 1 37 | self.z_m_n.append(numpy.array(z_n)) 38 | 39 | def inference(self): 40 | """learning once iteration""" 41 | for m, doc in enumerate(self.docs): 42 | z_n = self.z_m_n[m] 43 | n_m_z = self.n_m_z[m] 44 | for n, t in enumerate(doc): 45 | # discount for n-th word t with topic z 46 | z = z_n[n] 47 | n_m_z[z] -= 1 48 | self.n_z_t[z, t] -= 1 49 | self.n_z[z] -= 1 50 | 51 | # sampling topic new_z for t 52 | p_z = self.n_z_t[:, t] * n_m_z / self.n_z 53 | new_z = numpy.random.multinomial(1, p_z / p_z.sum()).argmax() 54 | 55 | # set z the new topic and increment counters 56 | z_n[n] = new_z 57 | n_m_z[new_z] += 1 58 | self.n_z_t[new_z, t] += 1 59 | self.n_z[new_z] += 1 60 | 61 | def worddist(self): 62 | """get topic-word distribution""" 63 | return self.n_z_t / self.n_z[:, numpy.newaxis] 64 | 65 | def perplexity(self, docs=None): 66 | if docs is None: docs = self.docs 67 | phi = self.worddist() 68 | log_per = 0 69 | N = 0 70 | Kalpha = self.K * self.alpha 71 | for m, doc in enumerate(docs): 72 | theta = self.n_m_z[m] / (len(self.docs[m]) + Kalpha) 73 | for w in doc: 74 | log_per -= numpy.log(numpy.inner(phi[:,w], theta)) 75 | N += len(doc) 76 | return numpy.exp(log_per / N) 77 | 78 | def learning(self, iteration, voca): 79 | pre_perp = self.perplexity() 80 | print ("initial perplexity=%f" % pre_perp) 81 | for i in range(iteration): 82 | self.inference() 83 | perp = self.perplexity() 84 | print ("-%d p=%f" % (i + 1, perp)) 85 | if pre_perp is not None: 86 | if pre_perp < perp: 87 | self.output_word_topic_dist(voca) 88 | pre_perp = None 89 | else: 90 | pre_perp = perp 91 | self.output_word_topic_dist(voca) 92 | 93 | def output_word_topic_dist(self, voca): 94 | zcount = numpy.zeros(self.K, dtype=int) 95 | wordcount = [dict() for k in range(self.K)] 96 | for xlist, zlist in zip(self.docs, self.z_m_n): 97 | for x, z in zip(xlist, zlist): 98 | zcount[z] += 1 99 | if x in wordcount[z]: 100 | wordcount[z][x] += 1 101 | else: 102 | wordcount[z][x] = 1 103 | 104 | phi = self.worddist() 105 | for k in range(self.K): 106 | print ("\n-- topic: %d (%d words)" % (k, zcount[k])) 107 | for w in numpy.argsort(-phi[k])[:20]: 108 | print ("%s: %f (%d)" % (voca[w], phi[k,w], wordcount[k].get(w,0))) 109 | 110 | def main(): 111 | import optparse 112 | import vocabulary 113 | parser = optparse.OptionParser() 114 | parser.add_option("-f", dest="filename", help="corpus filename") 115 | parser.add_option("-c", dest="corpus", help="using range of Brown corpus' files(start:end)") 116 | parser.add_option("--alpha", dest="alpha", type="float", help="parameter alpha", default=0.5) 117 | 
parser.add_option("--beta", dest="beta", type="float", help="parameter beta", default=0.5) 118 | parser.add_option("-k", dest="K", type="int", help="number of topics", default=20) 119 | parser.add_option("-i", dest="iteration", type="int", help="iteration count", default=100) 120 | parser.add_option("-s", dest="smartinit", action="store_true", help="smart initialize of parameters", default=False) 121 | parser.add_option("--stopwords", dest="stopwords", help="exclude stop words", action="store_true", default=False) 122 | parser.add_option("--seed", dest="seed", type="int", help="random seed") 123 | parser.add_option("--df", dest="df", type="int", help="threshold of document freaquency to cut words", default=0) 124 | (options, args) = parser.parse_args() 125 | if not (options.filename or options.corpus): parser.error("need corpus filename(-f) or corpus range(-c)") 126 | 127 | if options.filename: 128 | corpus = vocabulary.load_file(options.filename) 129 | else: 130 | corpus = vocabulary.load_corpus(options.corpus) 131 | if not corpus: parser.error("corpus range(-c) forms 'start:end'") 132 | if options.seed != None: 133 | numpy.random.seed(options.seed) 134 | 135 | voca = vocabulary.Vocabulary(options.stopwords) 136 | print(corpus) 137 | docs = [voca.doc_to_ids(doc) for doc in corpus] 138 | if options.df > 0: docs = voca.cut_low_freq(docs, options.df) 139 | 140 | lda = LDA(options.K, options.alpha, options.beta, docs, voca.size(), options.smartinit) 141 | print ("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.K, options.alpha, options.beta)) 142 | 143 | lda.learning(options.iteration, voca) 144 | 145 | if __name__ == "__main__": 146 | main() 147 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | bandit==1.6.2 2 | entrypoints==0.3 3 | flake8==3.7.9 4 | gitdb2==2.0.6 5 | GitPython==3.0.5 6 | mccabe==0.6.1 7 | mecab==0.996 8 | nltk==3.4.5 9 | numpy==1.18.1 10 | pandas==1.0.0 11 | pbr==5.4.4 12 | pycodestyle==2.5.0 13 | pyflakes==2.1.1 14 | python-dateutil==2.8.1 15 | pytz==2019.3 16 | PyYAML==5.3 17 | scipy==1.4.1 18 | six==1.14.0 19 | smmap2==2.0.5 20 | stevedore==1.31.0 21 | -------------------------------------------------------------------------------- /stm.py: -------------------------------------------------------------------------------- 1 | # MIT License 2 | # 3 | # Copyright (c) 2018-2019 Hiroki Iida / Retrieva, Inc. 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a copy 6 | # of this software and associated documentation files (the "Software"), to deal 7 | # in the Software without restriction, including without limitation the rights 8 | # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | # copies of the Software, and to permit persons to whom the Software is 10 | # furnished to do so, subject to the following conditions: 11 | # 12 | # The above copyright notice and this permission notice shall be included in all 13 | # copies or substantial portions of the Software. 14 | # 15 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | # SOFTWARE. 22 | # This code is available under the MIT License. 23 | 24 | 25 | import numpy as np 26 | import scipy as sp 27 | from scipy.special import logsumexp 28 | import lda 29 | import utils 30 | from abc import ABCMeta, abstractmethod 31 | 32 | 33 | class STM_base(metaclass=ABCMeta): 34 | def __init__(self, K, X, Y, docs, V, sigma, interact=True): 35 | self.X = X # DxPx matrix (Px is the num of tags) 36 | self.K = K 37 | self.D = len(docs) 38 | if X is not None: 39 | P = X.shape[1] 40 | self.Gamma = np.zeros((P, K-1)) # parameter of topics prior 41 | 42 | if Y is None: 43 | Y = np.zeros(self.D, dtype=int) 44 | 45 | self.Y = Y # Dx1 matrix 46 | 47 | self.mu = np.zeros((self.D, K)) 48 | self.Sigma = np.diag(np.ones(K-1)) * sigma # if zero, no update. so using diag. 49 | self.c_dv = np.zeros((self.D, V), dtype=int) 50 | self.wd = np.zeros(self.D, dtype=int) 51 | self.mv = np.zeros(V, dtype=int) 52 | for m, doc in enumerate(docs): 53 | for t in doc: 54 | self.c_dv[m, t] += 1 55 | self.wd[m] += 1 56 | self.mv[t] += 1 57 | 58 | self.mv = np.log(self.mv) - np.log(np.sum(self.mv)) 59 | self.docs = docs 60 | self.docs_vocab = [] 61 | for doc in docs: 62 | self.docs_vocab.append(sorted(list(set(doc)))) 63 | 64 | self.V = V 65 | self.eta = np.zeros((self.D, K)) 66 | self.theta = np.exp(self.eta) / np.sum(np.exp(self.eta), axis=1)[:, np.newaxis] 67 | 68 | self.A = np.unique(np.array(Y)) 69 | self.len_A = len(self.A) 70 | self.phi = np.zeros((self.len_A, self.K, self.V)) 71 | 72 | def lda_initialize(self, alpha, beta, itr, voca, smartinit=True): 73 | lda_init = lda.LDA(self.K, alpha, beta, self.docs, self.V, smartinit) 74 | lda_init.learning(itr, voca) 75 | 76 | Kalpha = self.K * alpha 77 | self.theta = lda_init.n_m_z / (np.vectorize(len)(np.array(self.docs)) + Kalpha)[:, np.newaxis] 78 | 79 | self.phi += lda_init.worddist() 80 | 81 | del lda_init 82 | 83 | def output_word_topic_dist(self, voca): 84 | def output(phi, voca): 85 | for k in range(self.K): 86 | print("\n-- topic: {}".format(k)) 87 | for w in np.argsort(-phi[k])[:20]: 88 | print("{}: {}".format(voca[w], phi[k, w])) 89 | 90 | phi = np.average(self.phi, axis=0) 91 | output(phi, voca) 92 | 93 | def perplexity(self, docs=None, Y=None): 94 | if docs is None: 95 | docs = self.docs 96 | if Y is None: 97 | Y = self.Y 98 | log_per = 0 99 | N = 0 100 | 101 | for m, (doc, a) in enumerate(zip(docs, Y)): 102 | for w in doc: 103 | log_per -= np.log(np.dot(self.phi[a, :, w], self.theta[m])) 104 | N += len(doc) 105 | return np.exp(log_per / N) 106 | 107 | def learning(self, iteration, voca): 108 | pre_perp = self.perplexity() 109 | print("initial perplexity=%f" % pre_perp) 110 | for i in range(iteration): 111 | self.inference(i) 112 | perp = self.perplexity() 113 | print("-%d p=%f" % (i + 1, perp)) 114 | if pre_perp: 115 | if pre_perp < perp: 116 | self.output_word_topic_dist(voca) 117 | pre_perp = None 118 | else: 119 | pre_perp = perp 120 | self.output_word_topic_dist(voca) 121 | 122 | def inference(self, iter_num): 123 | """learning once iteration""" 124 | # E-step 125 | # update q_eta and q_z 126 | phi_updater, q_v, variance_topics = self.update_Estep() 127 | # M-step 128 | self.update_mu_and_Gamma() 129 | 130 | # update Sigma 131 | if iter_num > 10: 132 | 
self.update_Sigma(q_v, variance_topics) 133 | 134 | # update phi 135 | self.update_phi(phi_updater) 136 | 137 | def update_Estep(self): 138 | E_count = np.zeros((len(self.A), self.K, self.V)) 139 | q_v = np.zeros((self.K - 1, self.K - 1)) 140 | variance_topics = np.zeros((self.K - 1, self.K - 1)) 141 | inv_Sigma = np.linalg.inv(self.Sigma) 142 | 143 | for m, (_, i, a) in enumerate(zip(self.docs, self.docs_vocab, self.Y)): 144 | # because fuzzy index induces copy 145 | phi_a = self.phi[a, :, i].T 146 | c_dv_d = self.c_dv[m, i] 147 | self.eta[m], self.theta[m], q_z_d \ 148 | = utils.update_eta(m, self.K, self.eta[m], 149 | phi_a, self.Sigma, 150 | self.mu, c_dv_d, self.wd) 151 | 152 | # prepare update Sigma(calc q_v) and phi(calc phi_tmp) 153 | E_count[a, :, i] += (c_dv_d * q_z_d).T 154 | hessian = utils.update_Hessian(self.K, q_z_d, c_dv_d, self.wd[m], self.theta[m], inv_Sigma) 155 | q_v += np.linalg.inv(hessian) 156 | diff_var_and_mean = self.calc_diff_var_and_mean(m) 157 | variance_topics += np.outer(diff_var_and_mean, diff_var_and_mean) 158 | return (E_count, q_v, variance_topics) 159 | 160 | @abstractmethod 161 | def update_mu_and_Gamma(self): 162 | pass 163 | 164 | def update_Sigma(self, q_v, variance_topics): 165 | self.Sigma = (q_v + variance_topics) / len(self.docs) 166 | 167 | @abstractmethod 168 | def update_phi(self, E_count): 169 | pass 170 | 171 | 172 | class STM_jeff_base(STM_base): 173 | def __init__(self, K, X, Y, docs, V, sigma, interact=True): 174 | super().__init__(K, X, Y, docs, V, sigma, interact) 175 | 176 | self.aspectmod = self.len_A > 1.0 177 | self.interact = interact 178 | self.coef_row = self.K + self.len_A * self.aspectmod + self.len_A * self.K * self.interact 179 | 180 | self.kappa_params = np.zeros((self.coef_row, V)) 181 | self.kappa_sum = np.full((self.len_A, self.K, self.V), self.mv) 182 | 183 | def jeffereysKappa(self, E_count): 184 | def kappa_obj(kappa_param, kappa_other, c_k, bigC_k, gaussprec): 185 | p1 = -1 * np.sum(c_k * kappa_param) 186 | demon_kappas = kappa_other * np.exp(kappa_param) 187 | lseout = np.log(np.sum(demon_kappas, axis=1)) 188 | p2 = np.sum(bigC_k * lseout) 189 | p3 = 0.5 * np.sum(kappa_param**2 * gaussprec) 190 | return p1 + p2 + p3 191 | 192 | def kappa_grad(kappa_param, kappa_other, c_k, bigC_k, gaussprec): 193 | denom_kappas = kappa_other * np.exp(kappa_param) 194 | betaout = denom_kappas / np.sum(denom_kappas, axis=1)[:, np.newaxis] 195 | p2 = np.sum(bigC_k[:, np.newaxis] * betaout, axis=0) # sum up the non focus axis 196 | p3 = kappa_param * gaussprec 197 | return -c_k + p2 + p3 198 | 199 | if(not(self.aspectmod)): 200 | KbyV = E_count[0] 201 | KbyA = np.sum(KbyV, axis=1) 202 | else: 203 | KbyV = np.sum(E_count, axis=0) 204 | KbyA = np.sum(E_count, axis=2).T 205 | 206 | max_it = 3 207 | tol = .001 208 | kappamax_it = 1000 209 | taumax_it = 1000 210 | tautol = 1e-5 211 | 212 | # define update indicater upmost 213 | i_update_kv = self.K 214 | if (self.aspectmod and self.interact): 215 | i_update_ka = self.K + self.len_A 216 | i_update_kav = self.coef_row 217 | else: 218 | i_update_ka = self.coef_row 219 | i_update_kav = 0 220 | 221 | opt_tau = np.vectorize(lambda x: 1/x**2 if x**2 > 1e-5 else 1e5) 222 | 223 | for it in range(max_it): 224 | compare = np.abs(self.kappa_params) < .001 225 | for i in range(self.coef_row): # i:0~K-1=>update kv, K~K+A-1=>update ka, K+A~K+A+K*A-1=>update kav 226 | kappa_init = self.kappa_params[i] 227 | if i < i_update_kv: 228 | k = i 229 | c_k = KbyV[k, :] 230 | bigC_k = KbyA[k, :] 231 | 
self.kappa_sum[:, k, :] -= kappa_init 232 | kappa_other = np.exp(self.kappa_sum[:, k, :]) 233 | elif i < i_update_ka: 234 | a = i - self.K 235 | c_k = np.sum(E_count[a], axis=0) 236 | bigC_k = KbyA[:, a] 237 | self.kappa_sum[a, :, :] -= kappa_init 238 | kappa_other = np.exp(self.kappa_sum[a, :, :]) 239 | elif i < i_update_kav: 240 | a, k = divmod(i-self.K-self.len_A, self.K) 241 | c_k = E_count[a, k, :] 242 | bigC_k = KbyA[k, a][np.newaxis] 243 | self.kappa_sum[a, k, :] -= kappa_init 244 | kappa_other = np.exp(self.kappa_sum[a, k, :])[np.newaxis, :] 245 | 246 | converged = False 247 | for j in range(taumax_it): 248 | if(not(np.any(kappa_init))): 249 | gaussprec = 1 250 | else: 251 | gaussprec = opt_tau(kappa_init) 252 | 253 | result = sp.optimize.minimize(fun=kappa_obj, x0=kappa_init, 254 | args=(kappa_other, c_k, bigC_k, gaussprec), 255 | jac=kappa_grad, method="L-BFGS-B", options={'maxiter': kappamax_it}) 256 | kappa_init = result.x 257 | converged = np.mean(np.abs(self.kappa_params[i] - kappa_init)) 258 | self.kappa_params[i] = kappa_init 259 | if converged <= tautol: 260 | break 261 | 262 | if i < i_update_kv: 263 | self.kappa_sum[:, k, :] += self.kappa_params[i] 264 | elif i < i_update_ka: 265 | self.kappa_sum[a, :, :] += self.kappa_params[i] 266 | elif i < i_update_kav: 267 | self.kappa_sum[a, k, :] += self.kappa_params[i] 268 | 269 | current = np.abs(self.kappa_params) < .001 270 | sparseagree = np.average(compare == current) 271 | self.phi = np.exp(self.kappa_sum - logsumexp(self.kappa_sum, axis=2)[:, :, np.newaxis]) 272 | if sparseagree > tol: 273 | break 274 | 275 | def update_phi(self, E_count): 276 | self.jeffereysKappa(E_count) 277 | 278 | @abstractmethod 279 | def calc_diff_var_and_mean(self, m): 280 | pass 281 | 282 | 283 | class STM_jeff_reg(STM_jeff_base): 284 | def __init__(self, K, X, Y, docs, V, sigma, interact=True): 285 | super().__init__(K, X, Y, docs, V, sigma, interact) 286 | 287 | def calc_diff_var_and_mean(self, m): 288 | return (self.eta[m, 0:self.K-1] - np.dot(self.X, self.Gamma)[m]) 289 | 290 | def update_mu_and_Gamma(self): 291 | tmp_Gamma = utils.RVM_regression(self.eta, self.X, self.K) 292 | self.Gamma = tmp_Gamma[:self.D, :self.K-1] 293 | self.mu = np.dot(self.X, self.Gamma) 294 | 295 | 296 | class STM_jeff_noX(STM_jeff_base): 297 | def __init__(self, K, X, Y, docs, V, sigma, interact=True): 298 | super().__init__(K, X, Y, docs, V, sigma, interact) 299 | 300 | def calc_diff_var_and_mean(self, m): 301 | return (self.eta[m, 0:self.K-1] - self.mu[m, 0:self.K-1]) 302 | 303 | def update_mu_and_Gamma(self): 304 | self.mu = np.tile(np.sum(self.eta, axis=0) / self.D, (self.D, 1)) 305 | 306 | 307 | class STM_noY_base(STM_base): 308 | def __init__(self, K, X, Y, docs, V, sigma, interact=True): 309 | super().__init__(K, X, Y, docs, V, sigma, interact) 310 | 311 | def calc_diff_var_and_mean(self, m): 312 | pass 313 | 314 | def update_phi(self, q_z): 315 | # ref: Variational EM Algorithms for Correlated Topic Models / Mohhammad Emtiaz Khan et al 316 | for k in range(self.K): 317 | self.phi[0, k, :] = q_z[0, k, :] 318 | 319 | self.phi[0, :, :] = q_z[0] / np.sum(q_z[0, :, :], axis=1)[:, np.newaxis] 320 | 321 | 322 | class STM_noY_reg(STM_noY_base): 323 | def __init__(self, K, X, Y, docs, V, sigma, interact=True): 324 | super().__init__(K, X, Y, docs, V, sigma, interact) 325 | 326 | def calc_diff_var_and_mean(self, m): 327 | return (self.eta[m, 0:self.K-1] - np.dot(self.X, self.Gamma)[m]) 328 | 329 | def update_mu_and_Gamma(self): 330 | tmp_Gamma = 
utils.RVM_regression(self.eta, self.X, self.K)
        self.Gamma = tmp_Gamma[:self.D, :self.K-1]
        self.mu = np.dot(self.X, self.Gamma)


class STM_noY_noX(STM_noY_base):
    def __init__(self, K, X, Y, docs, V, sigma, interact=True):
        super().__init__(K, X, Y, docs, V, sigma, interact)

    def calc_diff_var_and_mean(self, m):
        return (self.eta[m, 0:self.K-1] - self.mu[m, 0:self.K-1])

    def update_mu_and_Gamma(self):
        self.mu = np.tile(np.sum(self.eta, axis=0) / self.D, (self.D, 1))


def STM_factory_method(K, X, Y, docs, V, sigma, interact=True):
    if Y is None:
        if X is None:
            return STM_noY_noX(K, X, Y, docs, V, sigma, interact)
        else:
            return STM_noY_reg(K, X, Y, docs, V, sigma, interact)
    else:
        if X is None:
            return STM_jeff_noX(K, X, Y, docs, V, sigma, interact)
        else:
            return STM_jeff_reg(K, X, Y, docs, V, sigma, interact)
--------------------------------------------------------------------------------
/stm_main.py:
--------------------------------------------------------------------------------
# This code is available under the MIT License.
# (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc.
# (c)2018-2019 Hiroki Iida / Retrieva Inc.

import numpy as np
import pandas as pd
import stm


def main():
    import argparse
    import vocabulary
    parser = argparse.ArgumentParser()
    parser.add_argument("-f", dest="filename", help="Set corpus filepath. File format is CSV")
    parser.add_argument("-d", dest="document", help="Set document field name")
    parser.add_argument("-c", dest="corpus", help="Using range of Brown corpus' files (start:end)")
    parser.add_argument("--alpha", dest="alpha", type=float, help="Parameter alpha for LDA (default=1.0)", default=1.0)
    parser.add_argument("--beta", dest="beta", type=float, help="Parameter beta for LDA (default=0.1)", default=0.1)
    parser.add_argument("-k", dest="topics", type=int, help="Number of topics (default=20)", default=20)
    parser.add_argument("-i", dest="iteration", type=int, help="Iteration count (default=100)", default=100)
    parser.add_argument("-x", dest="X", type=str, help="Set prevalence covariate column name(s), comma-separated", default=None)
    parser.add_argument("-y", dest="Y", type=str, help="Set content covariate column name", default=None)
    parser.add_argument("--parser", dest="parser", help="Select parser: eng_nltk or mecab (default=mecab)", default="mecab")
    parser.add_argument("--sigma", dest="sigma", type=float, help="Initial value of Sigma diagonals (default=0.1)", default=0.1)
    parser.add_argument("--stopwords", dest="stopwords", help="Exclude stop words by using the corpus from nltk",
                        action="store_true", default=False)
    parser.add_argument("--seed", dest="seed", type=int, help="Random seed")
    parser.add_argument("--df", dest="df", type=int, help="Threshold of document frequency to cut words", default=0)
    parser.add_argument("--interact", dest="interact", action="store_true",
                        help="Consider interaction between covariates and topics", default=False)
    parser.add_argument("--sinit", dest="smartinit", action="store_true",
                        help="Smart initialization of parameters for LDA", default=False)
    options = parser.parse_args()
    if not (options.filename or options.corpus):
        parser.error("need corpus filename(-f) or corpus range(-c)")

    if options.filename:
        load_doc = pd.read_csv(options.filename)
        if options.parser.lower() == "eng_nltk":
            corpus = vocabulary.load_dataframe(load_doc[options.document])
        elif options.parser.lower() == "mecab":
            corpus = vocabulary.load_dataframe_jp(load_doc[options.document])
    else:
        corpus = vocabulary.load_corpus(options.corpus)
        if not corpus:
            parser.error("corpus range(-c) forms 'start:end'")

    if options.seed is not None:
        np.random.seed(options.seed)

    print("proc voca")
    voca = vocabulary.Vocabulary(options.stopwords)
    docs = [voca.doc_to_ids(doc) for doc in corpus]

    # process prevalence covariates, if specified
    print("proc X")
    if options.X is not None:
        X = pd.get_dummies(load_doc[options.X.split(',')], drop_first=True).values
        X = np.concatenate((np.ones(X.shape[0])[:, np.newaxis], X), axis=1)
    else:
        X = options.X

    print("proc Y")
    if options.Y is not None:
        Y = pd.get_dummies(load_doc[[options.Y]], drop_first=True).values.flatten()
    else:
        Y = options.Y

    if options.df > 0:
        docs = voca.cut_low_freq(docs, options.df)

    print("set STM obj")
    stm_obj = stm.STM_factory_method(options.topics, X, Y, docs, voca.size(), options.sigma, options.interact)
    print("corpus=%d, words=%d, K=%d, a=%f, b=%f" % (len(corpus), len(voca.vocas), options.topics, options.alpha, options.beta))

    # import cProfile
    # cProfile.runctx('lda_learning(lda, options.iteration, voca)', globals(), locals(), 'lda.profile')
    print("lda_initialize")
    stm_obj.lda_initialize(options.alpha, options.beta, 10, voca, options.smartinit)
    print("stm_learning")
    stm_obj.learning(options.iteration, voca)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/utils.py:
--------------------------------------------------------------------------------
# This code is available under the MIT License.
# (c)2018-2019 Hiroki Iida / Retrieva Inc.


import numpy as np
import scipy as sp


def update_Hessian(K, q_z, c_dv, wd, theta, inv_Sigma):
    hessian = np.diag(-1.0 * np.dot(q_z[0:K-1, :], c_dv))
    hessian += np.dot(np.sqrt(c_dv) * q_z[0:K-1, :], (np.sqrt(c_dv) * q_z[0:K-1, :]).T)
    hessian += wd * np.diag(theta[0:K-1])
    hessian -= wd * np.outer(theta[0:K-1], theta[0:K-1]) + inv_Sigma
    return hessian


def eta_optim_obj(ndoc, K, x, phi, Sigma, mu, c_dv, wd):
    """
    ndoc: int
    x: K-dim numpy array
    phi: KxV numpy array
    """
    diff = x[:K-1] - mu[ndoc, :K-1]
    x -= x.max()
    obj_fn = 0.5 * np.dot(diff.T, np.dot(np.linalg.inv(Sigma), diff))
    obj_fn -= np.dot(c_dv, np.log(np.dot(np.exp(x), phi)))
    obj_fn += wd[ndoc] * np.log(np.sum(np.exp(x)))
    return obj_fn


def eta_optim_grad(ndoc, K, x, phi, Sigma, mu, c_dv, wd):
    """
    ndoc: int
    x: K-dim numpy array
    phi: KxV numpy array
    """
    diff = x[:K-1] - mu[ndoc, :K-1]
    x -= x.max()
    q_z = np.exp(x)[:, np.newaxis] * phi
    q_z /= np.sum(q_z, axis=0)
    theta = np.exp(x) / np.sum(np.exp(x))
    grad_fn = -1.0 * np.dot(q_z, c_dv) + wd[ndoc] * theta
    # gradient of the Gaussian prior term; uses the inverse covariance, matching the objective
    grad_fn += np.append(np.dot(np.linalg.inv(Sigma), diff), 0.0)
    return grad_fn


def update_eta(m, K, eta, phi, Sigma, mu, c_dv, wd):
    eta_sol_options = {"maxiter": 500, "gtol": 1e-6}
    obj = lambda x: eta_optim_obj(m, K, x, phi, Sigma, mu, c_dv, wd)
    grad = lambda x: eta_optim_grad(m, K, x, phi, Sigma, mu, c_dv, wd)
    result = sp.optimize.minimize(fun=obj, x0=eta, method='BFGS', jac=grad, options=eta_sol_options)
    eta = result.x
    eta_max = eta.max()
    eta -= eta_max
    theta = np.exp(eta)/np.sum(np.exp(eta))
    q_z = np.exp(eta)[:, np.newaxis] * phi
    q_z /= np.sum(q_z, axis=0)
    eta += eta_max  # restore the shift applied above for numerical stability
    return (eta, theta, q_z)


def RVM_regression(Y, X, K, it_num=100):
    """
    Parameters
    ---------
    Y: NxK matrix of target values

    X: NxD matrix of data

    K: number of topics

    it_num: number of iterations

    Note: N is the number of data points (equal to the number of documents)
          and D is the data dimension.

    Returns:
    --------
    W: updated weights of the linear regression

    ref: "Variational Bayesian Inference for Linear and Logistic Regression", Jan Drugowitsch
    """
    # inverse-gamma priors from the reference
    N = X.shape[0]
    D = X.shape[1]

    a0 = np.full(K, 0.01)
    b0 = np.full(K, 0.0001)
    c0 = np.full(K, 0.01)
    d0 = np.full((K, D), 0.001)

    a_N = a0 + 0.5 * N
    b_N = b0
    c_N = c0 + 0.5
    d_N = d0
    updater_inv_V_N = np.dot(X.T, X)

    W = np.zeros((D, K))

    updater_W = np.dot(X.T, Y)

    updater_b_N = np.sum(Y*Y, axis=0)

    for _ in range(it_num):
        inv_V_N = np.zeros((K, D, D))
        for k in range(K):
            inv_V_N[k, :, :] += np.diag(np.ones(D) * c_N[k] / d_N[k, :]) + updater_inv_V_N

        for k in range(K):
            W[:, k] = np.dot(np.linalg.inv(inv_V_N[k]), updater_W[:, k])

        for k in range(K):
            b_N[k] = b0[k] + 0.5 * (updater_b_N[k] - np.dot(W[:, k].T, np.dot(inv_V_N[k], W[:, k])))

        for k in range(K):
            d_N[k] = d0[k] + 0.5 * W[:, k] * W[:, k] * a_N[k] / b_N[k]

    return W
--------------------------------------------------------------------------------
/vocabulary.py:
--------------------------------------------------------------------------------
# This code is available under the MIT License.
2 | # (c)2010-2011 Nakatani Shuyo / Cybozu Labs Inc. 3 | # (c)2018-2019 Hiroki Iida / Retrieva Inc. 4 | 5 | import nltk 6 | import re 7 | import MeCab 8 | 9 | 10 | stopwords_list = nltk.corpus.stopwords.words('english') 11 | recover_list = {"wa":"was", "ha":"has"} 12 | wl = nltk.WordNetLemmatizer() 13 | 14 | 15 | def load_corpus(ranges): 16 | """ 17 | load data from corpus 18 | """ 19 | tmp = re.match(r'(\d+):(\d+)$', ranges) 20 | if tmp: 21 | start = int(tmp.group(1)) 22 | end = int(tmp.group(2)) 23 | from nltk.corpus import brown as corpus 24 | return [corpus.words(fileid) for fileid in corpus.fileids()[start:end]] 25 | 26 | 27 | def load_dataframe(documents): 28 | corpus = [] 29 | for doc in documents: 30 | sentences = re.findall(r'\w+(?:\'\w+)?', doc) 31 | if len(sentences) > 0: 32 | corpus.append(sentences) 33 | 34 | return corpus 35 | 36 | 37 | def load_dataframe_jp(documents): 38 | corpus = [] 39 | tagger = MeCab.Tagger('-O wakati') 40 | tagger.parse("") 41 | for doc in documents: 42 | tokens = tagger.parse(doc.strip()).split() 43 | corpus.append(tokens) 44 | return corpus 45 | 46 | 47 | def load_file(filename): 48 | """ 49 | for one file 50 | one line corresponds to one doc 51 | """ 52 | corpus = [] 53 | f = open(filename, 'r') 54 | for line in f: 55 | doc = re.findall(r'\w+(?:\'\w+)?', line) 56 | if len(doc) > 0: 57 | corpus.append(doc) 58 | f.close() 59 | return corpus 60 | 61 | 62 | def is_stopword(w): 63 | return w in stopwords_list 64 | 65 | 66 | def lemmatize(w0): 67 | w = wl.lemmatize(w0.lower()) 68 | if w in recover_list: return recover_list[w] 69 | return w 70 | 71 | 72 | class Vocabulary: 73 | def __init__(self, excluds_stopwords=False): 74 | self.vocas = [] # id to word 75 | self.vocas_id = dict() # word to id 76 | self.docfreq = [] # id to document frequency 77 | self.excluds_stopwords = excluds_stopwords 78 | 79 | def term_to_id(self, term0): 80 | term = lemmatize(term0) 81 | if self.excluds_stopwords and is_stopword(term): 82 | return None 83 | if term not in self.vocas_id: 84 | voca_id = len(self.vocas) 85 | self.vocas_id[term] = voca_id 86 | self.vocas.append(term) 87 | self.docfreq.append(0) 88 | else: 89 | voca_id = self.vocas_id[term] 90 | return voca_id 91 | 92 | def doc_to_ids(self, doc): 93 | ids_list = [] 94 | words = dict() 95 | for term in doc: 96 | id = self.term_to_id(term) 97 | if id is not None: 98 | ids_list.append(id) 99 | if id not in words: 100 | words[id] = 1 101 | self.docfreq[id] += 1 102 | if "close" in dir(doc): 103 | doc.close() 104 | return ids_list 105 | 106 | def cut_low_freq(self, corpus, threshold=1): 107 | new_vocas = [] 108 | new_docfreq = [] 109 | self.vocas_id = dict() 110 | conv_map = dict() 111 | for id, term in enumerate(self.vocas): 112 | freq = self.docfreq[id] 113 | if freq > threshold: 114 | new_id = len(new_vocas) 115 | self.vocas_id[term] = new_id 116 | new_vocas.append(term) 117 | new_docfreq.append(freq) 118 | conv_map[id] = new_id 119 | self.vocas = new_vocas 120 | self.docfreq = new_docfreq 121 | 122 | def conv(doc): 123 | new_doc = [] 124 | for id in doc: 125 | if id in conv_map: new_doc.append(conv_map[id]) 126 | return new_doc 127 | return [conv(doc) for doc in corpus] 128 | 129 | def __getitem__(self, v): 130 | return self.vocas[v] 131 | 132 | def size(self): 133 | return len(self.vocas) 134 | 135 | def is_stopword_id(self, id): 136 | return self.vocas[id] in stopwords_list 137 | --------------------------------------------------------------------------------
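
For reference, below is a minimal end-to-end sketch of the same pipeline that `stm_main.py` drives from the command line. It is not part of the repository; the file name `reviews.csv` and the columns `text` and `rating` are hypothetical example names, and the hyperparameters simply mirror the defaults above.

```python
# Illustrative sketch only (mirrors stm_main.py); "reviews.csv", "text" and
# "rating" are hypothetical names for a CSV corpus and its columns.
import numpy as np
import pandas as pd

import stm
import vocabulary

df = pd.read_csv("reviews.csv")                        # hypothetical input file
corpus = vocabulary.load_dataframe(df["text"])         # English tokenizer (use load_dataframe_jp for Japanese)

voca = vocabulary.Vocabulary(excluds_stopwords=True)   # drop NLTK stop words
docs = [voca.doc_to_ids(doc) for doc in corpus]
docs = voca.cut_low_freq(docs, 2)                      # drop words with document frequency <= 2

# prevalence design matrix: dummy-coded covariate plus an intercept column
X = pd.get_dummies(df[["rating"]], drop_first=True).values
X = np.concatenate((np.ones(X.shape[0])[:, np.newaxis], X), axis=1)

K = 20
model = stm.STM_factory_method(K, X, None, docs, voca.size(), sigma=0.1, interact=False)
model.lda_initialize(alpha=1.0, beta=0.1, itr=10, voca=voca, smartinit=True)   # warm start with LDA
model.learning(iteration=100, voca=voca)               # prints perplexity and top words per topic
```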