├── model
│   ├── __init__.py
│   └── labeled_lda.py
├── requirements.txt
├── .gitignore
├── assets
│   ├── gibbs-sampling-equation.png
│   ├── graphical-of-labeled-lda.png
│   └── generative-process-for-labeled-lda.png
├── LICENSE
├── example
│   └── example.py
└── README.md

/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.22.0
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | *.pyc
3 | .idea/
4 | data/
5 | example/test.py
6 |
7 |
--------------------------------------------------------------------------------
/assets/gibbs-sampling-equation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/gibbs-sampling-equation.png
--------------------------------------------------------------------------------
/assets/graphical-of-labeled-lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/graphical-of-labeled-lda.png
--------------------------------------------------------------------------------
/assets/generative-process-for-labeled-lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/generative-process-for-labeled-lda.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jiahong Zhou
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /example/example.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | import model.labeled_lda as llda 4 | 5 | # initialize data 6 | labeled_documents = [("example example example example example"*10, ["example"]), 7 | ("test llda model test llda model test llda model"*10, ["test", "llda_model"]), 8 | ("example test example test example test example test"*10, ["example", "test"]), 9 | ("good perfect good good perfect good good perfect good "*10, ["positive"]), 10 | ("bad bad down down bad bad down"*10, ["negative"])] 11 | 12 | # new a Labeled LDA model 13 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector="50_div_K", eta_vector=0.001) 14 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.02, eta_vector=0.002) 15 | llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.01) 16 | print(llda_model) 17 | 18 | # training 19 | # llda_model.training(iteration=10, log=True) 20 | while True: 21 | print("iteration %s sampling..." % (llda_model.iteration + 1)) 22 | llda_model.training(1) 23 | print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity())) 24 | print("delta beta: %s" % llda_model.delta_beta) 25 | if llda_model.is_convergent(method="beta", delta=0.01): 26 | break 27 | 28 | # update 29 | print("before updating: ", llda_model) 30 | update_labeled_documents = [("new example test example test example test example test", ["example", "test"])] 31 | llda_model.update(labeled_documents=update_labeled_documents) 32 | print("after updating: ", llda_model) 33 | 34 | # train again 35 | # llda_model.training(iteration=10, log=True) 36 | while True: 37 | print("iteration %s sampling..." 
% (llda_model.iteration + 1))
38 |     llda_model.training(1)
39 |     print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
40 |     print("delta beta: %s" % llda_model.delta_beta)
41 |     if llda_model.is_convergent(method="beta", delta=0.01):
42 |         break
43 |
44 | # inference
45 | # note: the resulting topics may differ between runs, because Gibbs sampling is a randomized algorithm
46 | document = "example llda model example example good perfect good perfect good perfect" * 100
47 |
48 | topics = llda_model.inference(document=document, iteration=100, times=10)
49 | print(topics)
50 |
51 | # perplexity
52 | # calculate perplexity on test data
53 | perplexity = llda_model.perplexity(documents=["example example example example example",
54 |                                               "test llda model test llda model test llda model",
55 |                                               "example test example test example test example test",
56 |                                               "good perfect good good perfect good good perfect good",
57 |                                               "bad bad down down bad bad down"],
58 |                                    iteration=30,
59 |                                    times=10)
60 | print("perplexity on test data: %s" % perplexity)
61 | # calculate perplexity on training data
62 | print("perplexity on training data: %s" % llda_model.perplexity())
63 |
64 | # save to disk
65 | save_model_dir = "../data/model"
66 | # llda_model.save_model_to_dir(save_model_dir, save_derivative_properties=True)
67 | llda_model.save_model_to_dir(save_model_dir)
68 |
69 | # load from disk
70 | llda_model_new = llda.LldaModel()
71 | llda_model_new.load_model_from_dir(save_model_dir, load_derivative_properties=False)
72 | print("llda_model_new", llda_model_new)
73 | print("llda_model", llda_model)
74 | print("Top-5 terms of topic 'negative': ", llda_model.top_terms_of_topic("negative", 5, False))
75 | print("Doc-Topic Matrix: \n", llda_model.theta)
76 | print("Topic-Term Matrix: \n", llda_model.beta)
77 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Implementation of the L-LDA Model (Labeled Latent Dirichlet Allocation) in Python
2 |
3 |
4 | References:
5 | * *Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...*
6 | * *Parameter estimation for text analysis, Gregor Heinrich.*
7 | * *Latent Dirichlet Allocation, David M. Blei, Andrew Y. Ng...*
8 |
9 | ### An efficient implementation based on Gibbs sampling
10 |
11 | **The following descriptions come from *Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...***
12 |
13 | ##### Introduction:
14 | Labeled LDA is a topic model that constrains Latent Dirichlet Allocation by defining a one-to-one correspondence between LDA’s latent topics and user tags.
15 | This allows Labeled LDA to learn word–tag (topic) correspondences directly.
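In this implementation, inference is carried out with collapsed Gibbs sampling (next section). As a sketch, written in the notation of the class docstring in `model/labeled_lda.py` (the exact equation from the paper is shown as an image below), each word occurrence is reassigned a topic with probability

$$ p(z_{m,n} = k \mid \mathbf{z}_{\neg(m,n)}, \mathbf{w}) \;\propto\; \frac{n^{(t)}_{k,\neg(m,n)} + \eta_t}{\sum_{t'} \big( n^{(t')}_{k,\neg(m,n)} + \eta_{t'} \big)} \cdot \big( n^{(k)}_{m,\neg(m,n)} + \alpha_k \Lambda_{m,k} \big) $$

where $n^{(t)}_{k}$ counts how often term $t$ is assigned to topic $k$ (`Topic2TermCount`), $n^{(k)}_{m}$ counts how often topic $k$ occurs in document $m$ (`Doc2TopicCount`), both counted without the current assignment, and $\Lambda_{m,k} \in \{0, 1\}$ restricts the candidate topics to the labels of document $m$.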
16 |
17 | ##### Gibbs sampling:
18 | * Graphical model of Labeled LDA:
19 |
20 |   ![Graphical model of Labeled LDA](assets/graphical-of-labeled-lda.png)
21 |
22 |
23 | * Generative process for Labeled LDA:
24 |
25 |   ![Generative process for Labeled LDA](assets/generative-process-for-labeled-lda.png)
26 |
27 | * Gibbs sampling equation:
28 |
29 |   ![Gibbs sampling equation](assets/gibbs-sampling-equation.png)
30 |
31 | ### Usage
32 | * create a new LLDA model
33 | * training
34 | * check convergence (is_convergent)
35 | * update
36 | * inference
37 | * save model to disk
38 | * load model from disk
39 | * get the top-k terms of a target topic
40 |
41 |
42 | ### Example
43 | ```
44 | # @source code: example/example.py
45 |
46 | import sys
47 | sys.path.append('../')
48 | import model.labeled_lda as llda
49 |
50 | # initialize data
51 | labeled_documents = [("example example example example example"*10, ["example"]),
52 |                      ("test llda model test llda model test llda model"*10, ["test", "llda_model"]),
53 |                      ("example test example test example test example test"*10, ["example", "test"]),
54 |                      ("good perfect good good perfect good good perfect good "*10, ["positive"]),
55 |                      ("bad bad down down bad bad down"*10, ["negative"])]
56 |
57 | # new a Labeled LDA model
58 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector="50_div_K", eta_vector=0.001)
59 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.02, eta_vector=0.002)
60 | llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.01)
61 | print(llda_model)
62 |
63 | # training
64 | # llda_model.training(iteration=10, log=True)
65 | while True:
66 |     print("iteration %s sampling..." % (llda_model.iteration + 1))
67 |     llda_model.training(1)
68 |     print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
69 |     print("delta beta: %s" % llda_model.delta_beta)
70 |     if llda_model.is_convergent(method="beta", delta=0.01):
71 |         break
72 |
73 | # update
74 | print("before updating: ", llda_model)
75 | update_labeled_documents = [("new example test example test example test example test", ["example", "test"])]
76 | llda_model.update(labeled_documents=update_labeled_documents)
77 | print("after updating: ", llda_model)
78 |
79 | # train again
80 | # llda_model.training(iteration=10, log=True)
81 | while True:
82 |     print("iteration %s sampling..." % (llda_model.iteration + 1))
83 |     llda_model.training(1)
84 |     print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
85 |     print("delta beta: %s" % llda_model.delta_beta)
86 |     if llda_model.is_convergent(method="beta", delta=0.01):
87 |         break
88 |
89 | # inference
90 | # note: the resulting topics may differ between runs, because Gibbs sampling is a randomized algorithm
91 | document = "example llda model example example good perfect good perfect good perfect" * 100
92 |
93 | topics = llda_model.inference(document=document, iteration=100, times=10)
94 | print(topics)
95 |
96 | # perplexity
97 | # calculate perplexity on test data
98 | perplexity = llda_model.perplexity(documents=["example example example example example",
99 |                                               "test llda model test llda model test llda model",
100 |                                              "example test example test example test example test",
101 |                                              "good perfect good good perfect good good perfect good",
102 |                                              "bad bad down down bad bad down"],
103 |                                    iteration=30,
104 |                                    times=10)
105 | print("perplexity on test data: %s" % perplexity)
106 | # calculate perplexity on training data
107 | print("perplexity on training data: %s" % llda_model.perplexity())
108 |
109 | # save to disk
110 | save_model_dir = "../data/model"
111 | # llda_model.save_model_to_dir(save_model_dir, save_derivative_properties=True)
112 | llda_model.save_model_to_dir(save_model_dir)
113 |
114 | # load from disk
115 | llda_model_new = llda.LldaModel()
116 | llda_model_new.load_model_from_dir(save_model_dir, load_derivative_properties=False)
117 | print("llda_model_new", llda_model_new)
118 | print("llda_model", llda_model)
119 | print("Top-5 terms of topic 'negative': ", llda_model.top_terms_of_topic("negative", 5, False))
120 | print("Doc-Topic Matrix: \n", llda_model.theta)
121 | print("Topic-Term Matrix: \n", llda_model.beta)
122 | ```
123 |
124 |
125 |
--------------------------------------------------------------------------------
/model/labeled_lda.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # @Author: Jiahong Zhou
4 | # @Date: 2018-10-20
5 | # @Email: JoeZJiahong@gmail.com
6 | # implementation of the L-LDA model (Labeled Latent Dirichlet Allocation)
7 | # References:
8 | #   i.  Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...
9 | #   ii. Parameter estimation for text analysis, Gregor Heinrich.
10 | #   iii. Latent Dirichlet Allocation, David M. Blei, Andrew Y. Ng...
11 | import numpy
12 | import numpy as np
13 | import os
14 | import json
15 | from concurrent import futures
16 | try:
17 |     import copy_reg
18 | except Exception:
19 |     import copyreg as copy_reg
20 |
21 | import types
22 |
23 |
24 | class NpEncoder(json.JSONEncoder):
25 |     def default(self, obj):
26 |         if isinstance(obj, np.integer):
27 |             return int(obj)
28 |         elif isinstance(obj, np.floating):
29 |             return float(obj)
30 |         elif isinstance(obj, np.ndarray):
31 |             return obj.tolist()
32 |         else:
33 |             return super(NpEncoder, self).default(obj)
34 |
35 |
36 | class LldaModel:
37 |     """
38 |     L-LDA (Labeled Latent Dirichlet Allocation Model)
39 |
40 |     @field K: the number of topics
41 |     @field alpha_vector: the prior distribution of theta_m
42 |                          str("50_div_K"): means [50/K, 50/K, ...],
43 |                          this value comes from Parameter estimation for text analysis, Gregor Heinrich.
44 |                          int or float: means [alpha_vector, alpha_vector, ...]
45 |                          None: means [0.001, 0.001, ...]
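                         (worked example of the rules above: with K == 5 topics, "50_div_K" resolves to
                          [10.0, 10.0, 10.0, 10.0, 10.0], and alpha_vector=0.01 resolves to [0.01, 0.01, 0.01, 0.01, 0.01])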
46 | @field eta_vector: the prior distribution of beta_k 47 | int or float: means [eta_vector, eta_vector, ...] 48 | None: means [0.001, 0.001, ...] 49 | @field terms: a list of the all terms 50 | @field vocabulary: a dict of , vocabulary[terms[id]] == id 51 | @field topics: a list of the all topics 52 | @field topic_vocabulary: a dict of , topic_vocabulary[topics[id]] == id 53 | @field W: the corpus, a list of terms list, 54 | W[m] is the document vector, W[m][n] is the id of the term 55 | @field Z: the topic corpus, just same as W, 56 | except Z[m][n] is the id of the topic of the term 57 | @field M: the number of documents 58 | @field T: the number of terms 59 | @field WN: the number of all words in W 60 | @field LN: the number of all original labels 61 | @field iteration: the times of iteration 62 | @field all_perplexities: a list of all perplexities (one training iteration one perplexity) 63 | @field last_beta: the parameter `beta` of last training iteration 64 | @field Lambda: a matrix, shape is M * K, 65 | Lambda[m][k] is 1 means topic k is a label of document m 66 | 67 | # derivative fields 68 | @field Doc2TopicCount: a matrix, shape is M * K, 69 | Doc2TopicCount[m][k] is the times of topic k sampled in document m 70 | @field Topic2TermCount: a matrix, shape is K * T, 71 | Topic2TermCount[k][t] is the times of term t generated from topic k 72 | @field Doc2TopicCountSum: a vector, shape is M, self.Doc2TopicCount.sum(axis=1) 73 | Doc2TopicCountSum[m] is the count of all topic, 74 | i.e., Doc2TopicCountSum[m] is the number of words in document m 75 | @field alpha_vector_Lambda: a matrix, self.alpha_vector * self.Lambda 76 | @field alpha_vector_Lambda_sum: a vector, self.alpha_vector_Lambda.sum(axis=1) 77 | @field eta_vector_sum: float value, sum(self.eta_vector) 78 | @field Topic2TermCountSum: a vector, self.Topic2TermCount.sum(axis=1) 79 | 80 | """ 81 | def __init__(self, alpha_vector="50_div_K", eta_vector=None, labeled_documents=None): 82 | """ 83 | 84 | :param alpha_vector: the prior distribution of theta_m 85 | :param eta_vector: the prior distribution of beta_k 86 | :param labeled_documents: a iterable of tuple(doc, iterable of label), contains all doc and their labels 87 | """ 88 | self.alpha_vector = alpha_vector 89 | self.eta_vector = eta_vector 90 | self.terms = [] 91 | self.vocabulary = {} 92 | self.topics = [] 93 | self.topic_vocabulary = {} 94 | self.W = [] 95 | self.Z = [] 96 | self.K = 0 97 | self.M = 0 98 | self.T = 0 99 | self.WN = 0 100 | self.LN = 0 101 | self.iteration = 0 102 | self.all_perplexities = [] 103 | self.last_beta = None 104 | self.Lambda = None 105 | 106 | # derivative fields: 107 | # the following fields could reduce operations in training and inference 108 | # it is not necessary to save them to file, we can recover them by other fields 109 | 110 | self.Doc2TopicCount = None 111 | self.Topic2TermCount = None 112 | # self.Doc2TopicCountSum = None 113 | self.alpha_vector_Lambda = None 114 | # self.alpha_vector_Lambda_sum = None 115 | self.eta_vector_sum = 0.0 116 | self.Topic2TermCountSum = None 117 | 118 | if labeled_documents is not None: 119 | self._load_labeled_documents(labeled_documents) 120 | 121 | pass 122 | 123 | def _initialize_derivative_fields(self): 124 | """ 125 | initialize derivative fields 126 | :return: None 127 | """ 128 | # TODO: Doc2TopicCount could be reduced to a smaller matrix, 129 | # TODO: because some vector in Doc2TopicCount will always been 0 130 | self.Doc2TopicCount = np.zeros((self.M, self.K), dtype=int) 131 | 
self.Topic2TermCount = np.zeros((self.K, self.T), dtype=int) 132 | for m in range(self.M): 133 | # print self.Z[m] 134 | for t, z in zip(self.W[m], self.Z[m]): 135 | k = z 136 | # print "[m=%s, k=%s]" % (m, k) 137 | # print "[k=%s, t=%s]" % (k, t) 138 | self.Doc2TopicCount[m, k] += 1 139 | self.Topic2TermCount[k, t] += 1 140 | 141 | # self.Doc2TopicCountSum = self.Doc2TopicCount.sum(axis=1) 142 | self.alpha_vector_Lambda = self.alpha_vector * self.Lambda 143 | # self.alpha_vector_Lambda_sum = self.alpha_vector_Lambda.sum(axis=1) 144 | self.eta_vector_sum = sum(self.eta_vector) 145 | self.Topic2TermCountSum = self.Topic2TermCount.sum(axis=1) 146 | 147 | def _load_labeled_documents(self, labeled_documents): 148 | """ 149 | input labeled corpus, which contains all documents and their corresponding labels 150 | :param labeled_documents: a iterable of tuple(doc, iterable of label), contains all doc and their labels 151 | :return: 152 | """ 153 | # self.documents = [] 154 | all_labels = [] 155 | all_words = [] 156 | doc_corpus = [] 157 | labels_corpus = [] 158 | for document, labels in labeled_documents: 159 | document = LldaModel._document_preprocess(document) 160 | doc_words = document.split() 161 | doc_corpus.append(doc_words) 162 | if labels is None: 163 | labels = [] 164 | labels.append("common_topic") 165 | labels_corpus.append(labels) 166 | all_words.extend(doc_words) 167 | all_labels.extend(labels) 168 | self.terms = list(set(all_words)) 169 | self.vocabulary = {term: index for index, term in enumerate(self.terms)} 170 | self.topics = list(set(all_labels)) 171 | self.topic_vocabulary = {topic: index for index, topic in enumerate(self.topics)} 172 | self.K = len(self.topics) 173 | self.T = len(self.terms) 174 | self.W = [[self.vocabulary[term] for term in doc_words] for doc_words in doc_corpus] 175 | self.M = len(self.W) 176 | self.WN = len(all_words) 177 | # we appended topic "common_topic" to each doc at the beginning 178 | # so we need minus the number of "common_topic" 179 | # LN is the number of original labels 180 | self.LN = len(all_labels) - self.M 181 | 182 | self.Lambda = np.zeros((self.M, self.K), dtype=float) 183 | for m in range(self.M): 184 | if len(labels_corpus[m]) == 1: 185 | labels_corpus[m] = self.topics 186 | for label in labels_corpus[m]: 187 | k = self.topic_vocabulary[label] 188 | self.Lambda[m, k] = 1.0 189 | 190 | if self.alpha_vector is None: 191 | self.alpha_vector = [0.001 for _ in range(self.K)] 192 | elif type(self.alpha_vector) is str and self.alpha_vector == "50_div_K": 193 | self.alpha_vector = [50.0/self.K for _ in range(self.K)] 194 | elif type(self.alpha_vector) is float or type(self.alpha_vector) is int: 195 | self.alpha_vector = [self.alpha_vector for _ in range(self.K)] 196 | else: 197 | message = "error alpha_vector: %s" % self.alpha_vector 198 | raise Exception(message) 199 | 200 | if self.eta_vector is None: 201 | self.eta_vector = [0.001 for _ in range(self.T)] 202 | elif type(self.eta_vector) is float or type(self.eta_vector) is int: 203 | self.eta_vector = [self.eta_vector for _ in range(self.T)] 204 | else: 205 | message = "error eta_vector: %s" % self.eta_vector 206 | raise Exception(message) 207 | 208 | self.Z = [] 209 | for m in range(self.M): 210 | # print "self.Lambda[m]: ", self.Lambda[m] 211 | numerator_vector = self.Lambda[m] * self.alpha_vector 212 | p_vector = 1.0 * numerator_vector / sum(numerator_vector) 213 | # print p_vector 214 | # print "p_vector: ", p_vector 215 | # z_vector is a vector of a document, 216 | # just like [2, 3, 
6, 0], which means this doc have 4 word and them generated 217 | # from the 2nd, 3rd, 6th, 0th topic, respectively 218 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in range(len(self.W[m]))] 219 | self.Z.append(z_vector) 220 | 221 | self._initialize_derivative_fields() 222 | pass 223 | 224 | @staticmethod 225 | def _multinomial_sample(p_vector, random_state=None): 226 | """ 227 | sample a number from multinomial distribution 228 | :param p_vector: the probabilities 229 | :return: a int value 230 | """ 231 | if random_state is not None: 232 | return random_state.multinomial(1, p_vector).argmax() 233 | return np.random.multinomial(1, p_vector).argmax() 234 | 235 | def _gibbs_sample_training(self): 236 | """ 237 | sample a topic(k) for each word(t) of all documents, Generate a new matrix Z 238 | :return: None 239 | """ 240 | # TODO: the operations of addition and multiplication could be reduced, because some 241 | self.last_beta = self.beta 242 | count = 0 243 | for m in range(self.M): 244 | 245 | # doc_m_eta_vector = self.eta_vector 246 | # doc_m_alpha_vector = self.alpha_vector * self.Lambda[m] 247 | doc_m_alpha_vector = self.alpha_vector_Lambda[m] 248 | # assert (doc_m_alpha_vector == self.alpha_vector_Lambda[m]).all() 249 | 250 | # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector) 251 | # sum_doc_m_alpha_vector = self.alpha_vector_Lambda_sum[m] 252 | # assert sum_doc_m_alpha_vector == self.alpha_vector_Lambda_sum[m] 253 | 254 | for t, z, n in zip(self.W[m], self.Z[m], range(len(self.W[m]))): 255 | k = z 256 | self.Doc2TopicCount[m, k] -= 1 257 | self.Topic2TermCount[k, t] -= 1 258 | self.Topic2TermCountSum[k] -= 1 259 | 260 | numerator_theta_vector = self.Doc2TopicCount[m] + doc_m_alpha_vector 261 | # denominator_theta = sum(self.Doc2TopicCount[m]) + sum_doc_m_alpha_vector 262 | # denominator_theta = self.Doc2TopicCountSum[m]-1 + sum_doc_m_alpha_vector 263 | # assert sum(self.Doc2TopicCount[m]) == self.Doc2TopicCountSum[m]-1 264 | 265 | numerator_beta_vector = self.Topic2TermCount[:, t] + self.eta_vector[t] 266 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector) 267 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + self.eta_vector_sum 268 | denominator_beta = self.Topic2TermCountSum + self.eta_vector_sum 269 | # assert (self.Topic2TermCount.sum(axis=1) == self.Topic2TermCountSum).all() 270 | # assert sum(self.eta_vector) == self.eta_vector_sum 271 | 272 | beta_vector = 1.0 * numerator_beta_vector / denominator_beta 273 | # theta_vector = 1.0 * numerator_theta_vector / denominator_theta 274 | # denominator_theta is independent with t and k, so denominator could be any value except 0 275 | # will set denominator_theta as 1.0 276 | theta_vector = numerator_theta_vector 277 | 278 | p_vector = beta_vector * theta_vector 279 | # print p_vector 280 | """ 281 | for some special document m (only have one word) p_vector may be zero here, sum(p_vector) will be zero too 282 | 1.0 * p_vector / sum(p_vector) will be [...nan...] 
283 | so we should avoid inputting the special document 284 | """ 285 | p_vector = 1.0 * p_vector / sum(p_vector) 286 | # print p_vector 287 | sample_z = LldaModel._multinomial_sample(p_vector) 288 | self.Z[m][n] = sample_z 289 | 290 | k = sample_z 291 | self.Doc2TopicCount[m, k] += 1 292 | self.Topic2TermCount[k, t] += 1 293 | self.Topic2TermCountSum[k] += 1 294 | count += 1 295 | assert count == self.WN 296 | print("gibbs sample count: ", self.WN) 297 | self.iteration += 1 298 | self.all_perplexities.append(self.perplexity()) 299 | pass 300 | 301 | def _gibbs_sample_inference(self, term_vector, iteration=300, times=10): 302 | """ 303 | inference with gibbs sampling 304 | :param term_vector: the term vector of document 305 | :param iteration: the times of iteration until Markov chain converges 306 | :param times: the number of samples of the target distribution 307 | (one whole iteration(sample for all words) generates a sample, the ) 308 | #times = #samples, 309 | after Markov chain converges, the next #times samples as the samples of the target distribution, 310 | we drop the samples before the Markov chain converges, 311 | the result is the average value of #times samples 312 | :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 313 | theta_new, a theta_vector, the doc-topic distribution 314 | """ 315 | doc_topic_count = np.zeros(self.K, dtype=int) 316 | accumulated_doc_topic_count = np.zeros(self.K, dtype=int) 317 | p_vector = np.ones(self.K, dtype=int) 318 | p_vector = p_vector * 1.0 / sum(p_vector) 319 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in term_vector] 320 | for n, t in enumerate(term_vector): 321 | k = z_vector[n] 322 | doc_topic_count[k] += 1 323 | self.Topic2TermCount[k, t] += 1 324 | self.Topic2TermCountSum[k] += 1 325 | 326 | # sum_doc_topic_count = sum(doc_topic_count) 327 | doc_m_alpha_vector = self.alpha_vector 328 | # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector) 329 | for i in range(iteration+times): 330 | for n, t in enumerate(term_vector): 331 | k = z_vector[n] 332 | doc_topic_count[k] -= 1 333 | self.Topic2TermCount[k, t] -= 1 334 | self.Topic2TermCountSum[k] -= 1 335 | 336 | numerator_theta_vector = doc_topic_count + doc_m_alpha_vector 337 | # denominator_theta = sum_doc_topic_count - 1 + sum_doc_m_alpha_vector 338 | 339 | numerator_beta_vector = self.Topic2TermCount[:, t] + self.eta_vector[t] 340 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector) 341 | denominator_beta = self.Topic2TermCountSum + self.eta_vector_sum 342 | 343 | beta_vector = 1.0 * numerator_beta_vector / denominator_beta 344 | # theta_vector = 1.0 numerator_theta_vector / denominator_theta 345 | # denominator_theta is independent with t and k, so denominator could be any value except 0 346 | # will set denominator_theta as 1.0 347 | theta_vector = numerator_theta_vector 348 | 349 | p_vector = beta_vector * theta_vector 350 | # print p_vector 351 | p_vector = 1.0 * p_vector / sum(p_vector) 352 | # print p_vector 353 | sample_z = LldaModel._multinomial_sample(p_vector) 354 | z_vector[n] = sample_z 355 | 356 | k = sample_z 357 | doc_topic_count[k] += 1 358 | self.Topic2TermCount[k, t] += 1 359 | self.Topic2TermCountSum[k] += 1 360 | if i >= iteration: 361 | accumulated_doc_topic_count += doc_topic_count 362 | # reset self.Topic2TermCount 363 | for n, t in enumerate(term_vector): 364 | k = z_vector[n] 365 | self.Topic2TermCount[k, t] -= 1 366 | self.Topic2TermCountSum[k] -= 1 367 | 368 | 
numerator_theta_vector = accumulated_doc_topic_count/times + doc_m_alpha_vector 369 | # denominator_theta = sum(doc_topic_count) + sum(doc_m_alpha_vector) 370 | denominator_theta = sum(numerator_theta_vector) 371 | theta_new = 1.0 * numerator_theta_vector / denominator_theta 372 | return theta_new 373 | 374 | # def _gibbs_sample_inference_multi_processors(self, term_vector, iteration=30): 375 | # """ 376 | # inference with gibbs sampling 377 | # :param term_vector: the term vector of document 378 | # :param iteration: the times of iteration 379 | # :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 380 | # theta_new, a theta_vector, the doc-topic distribution 381 | # """ 382 | # # print("gibbs sample inference iteration: %s" % iteration) 383 | # # TODO: complete multi-processors code here 384 | # # we copy all the shared variables may be modified on runtime 385 | # random_state = np.random.RandomState() 386 | # topic2term_count = self.Topic2TermCount.copy() 387 | # topic2term_count_sum = self.Topic2TermCountSum.copy() 388 | # 389 | # doc_topic_count = np.zeros(self.K, dtype=int) 390 | # p_vector = np.ones(self.K, dtype=int) 391 | # p_vector = p_vector * 1.0 / sum(p_vector) 392 | # z_vector = [LldaModel._multinomial_sample(p_vector, random_state=random_state) for _ in term_vector] 393 | # for n, t in enumerate(term_vector): 394 | # k = z_vector[n] 395 | # doc_topic_count[k] += 1 396 | # topic2term_count[k, t] += 1 397 | # topic2term_count_sum[k] += 1 398 | # 399 | # # sum_doc_topic_count = sum(doc_topic_count) 400 | # doc_m_alpha_vector = self.alpha_vector 401 | # # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector) 402 | # for i in range(iteration): 403 | # for n, t in enumerate(term_vector): 404 | # k = z_vector[n] 405 | # doc_topic_count[k] -= 1 406 | # topic2term_count[k, t] -= 1 407 | # topic2term_count_sum[k] -= 1 408 | # 409 | # numerator_theta_vector = doc_topic_count + doc_m_alpha_vector 410 | # # denominator_theta = sum_doc_topic_count - 1 + sum_doc_m_alpha_vector 411 | # 412 | # numerator_beta_vector = topic2term_count[:, t] + self.eta_vector[t] 413 | # # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector) 414 | # denominator_beta = topic2term_count_sum + self.eta_vector_sum 415 | # 416 | # beta_vector = 1.0 * numerator_beta_vector / denominator_beta 417 | # # theta_vector = 1.0 numerator_theta_vector / denominator_theta 418 | # # denominator_theta is independent with t and k, so denominator could be any value except 0 419 | # # will set denominator_theta as 1.0 420 | # theta_vector = numerator_theta_vector 421 | # 422 | # p_vector = beta_vector * theta_vector 423 | # # print p_vector 424 | # p_vector = 1.0 * p_vector / sum(p_vector) 425 | # # print p_vector 426 | # sample_z = LldaModel._multinomial_sample(p_vector, random_state) 427 | # z_vector[n] = sample_z 428 | # 429 | # k = sample_z 430 | # doc_topic_count[k] += 1 431 | # topic2term_count[k, t] += 1 432 | # topic2term_count_sum[k] += 1 433 | # # reset self.Topic2TermCount 434 | # # for n, t in enumerate(term_vector): 435 | # # k = z_vector[n] 436 | # # self.Topic2TermCount[k, t] -= 1 437 | # # self.Topic2TermCountSum[k] -= 1 438 | # 439 | # numerator_theta_vector = doc_topic_count + doc_m_alpha_vector 440 | # # denominator_theta = sum(doc_topic_count) + sum(doc_m_alpha_vector) 441 | # denominator_theta = sum(numerator_theta_vector) 442 | # theta_new = 1.0 * numerator_theta_vector / denominator_theta 443 | # return theta_new 444 | 445 | def 
training(self, iteration=10, log=False): 446 | """ 447 | training this model with gibbs sampling 448 | :param log: print perplexity after every gibbs sampling if True 449 | :param iteration: the times of iteration 450 | :return: None 451 | """ 452 | for i in range(iteration): 453 | if log: 454 | print("after iteration: %s, perplexity: %s" % (self.iteration, self.perplexity())) 455 | self._gibbs_sample_training() 456 | pass 457 | 458 | def inference(self, document, iteration=30, times=10): 459 | # TODO: inference of a document 460 | """ 461 | inference for one document 462 | :param document: some sentence like "this is a method for inference" 463 | :param times: the number of samples of the target distribution 464 | (one whole iteration(sample for all words) generates a sample, the ) 465 | #times = #samples, 466 | after Markov chain converges, the next #times samples as the samples of the target distribution, 467 | we drop the samples before the Markov chain converges, 468 | the result is the average value of #times samples 469 | :param iteration: the times of iteration until Markov chain converges 470 | :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 471 | theta_new, a theta_vector, the doc-topic distribution 472 | """ 473 | document = LldaModel._document_preprocess(document) 474 | doc_words = document.split() 475 | term_vector = [self.vocabulary[word] for word in doc_words if word in self.vocabulary] 476 | theta_new = self._gibbs_sample_inference(term_vector, iteration=iteration, times=times) 477 | doc_topic_new = [(self.topics[k], probability) for k, probability in enumerate(theta_new)] 478 | sorted_doc_topic_new = sorted(doc_topic_new, 479 | key=lambda topic_probability: topic_probability[1], 480 | reverse=True) 481 | return sorted_doc_topic_new 482 | pass 483 | 484 | # def inference_multi_processors(self, document, iteration=30, times=8, max_workers=8): 485 | # # TODO: inference of a document with multi processors 486 | # """ 487 | # inference for one document 488 | # :param times: the times of gibbs sampling, the result is the average value of all times(gibbs sampling) 489 | # :param iteration: the times of iteration 490 | # :param document: some sentence like "this is a method for inference" 491 | # :param max_workers: the max number of processors(workers) 492 | # :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 493 | # theta_new, a theta_vector, the doc-topic distribution 494 | # """ 495 | # 496 | # def _pickle_method(m): 497 | # if m.im_self is None: 498 | # return getattr, (m.im_class, m.im_func.func_name) 499 | # else: 500 | # return getattr, (m.im_self, m.im_func.func_name) 501 | # copy_reg.pickle(types.MethodType, _pickle_method) 502 | # 503 | # words = document.split() 504 | # term_vector = [self.vocabulary[word] for word in words if word in self.vocabulary] 505 | # term_vectors = [term_vector for _ in range(times)] 506 | # iterations = [iteration for _ in range(times)] 507 | # 508 | # with futures.ProcessPoolExecutor(max_workers) as executor: 509 | # # print "executor.map" 510 | # res = executor.map(self._gibbs_sample_inference_multi_processors, term_vectors, iterations) 511 | # theta_new_accumulation = np.zeros(self.K, float) 512 | # for theta_new in res: 513 | # theta_new_accumulation += theta_new 514 | # theta_new = 1.0 * theta_new_accumulation / times 515 | # # print "avg: \n", theta_new 516 | # doc_topic_new = [(self.topics[k], probability) for k, 
probability in enumerate(theta_new)] 517 | # sorted_doc_topic_new = sorted(doc_topic_new, 518 | # key=lambda topic_probability: topic_probability[1], 519 | # reverse=True) 520 | # return sorted_doc_topic_new 521 | # pass 522 | 523 | def beta_k(self, k): 524 | """ 525 | topic-term distribution 526 | beta_k[t] is the probability of term t(word) to be generated from topic k 527 | :return: a vector, shape is T 528 | """ 529 | numerator_vector = self.Topic2TermCount[k] + self.eta_vector 530 | # denominator = sum(self.Topic2TermCount[k]) + sum(self.eta_vector) 531 | denominator = sum(numerator_vector) 532 | return 1.0 * numerator_vector / denominator 533 | 534 | def theta_m(self, m): 535 | """ 536 | doc-topic distribution 537 | theta_m[k] is the probability of doc m to be generated from topic k 538 | :return: a vector, shape is K 539 | """ 540 | numerator_vector = self.Doc2TopicCount[m] + self.alpha_vector * self.Lambda[m] 541 | # denominator = sum(self.Doc2TopicCount[m]) + sum(self.alpha_vector * self.Lambda[m]) 542 | denominator = sum(numerator_vector) 543 | return 1.0 * numerator_vector / denominator 544 | 545 | @property 546 | def beta(self): 547 | """ 548 | This name "beta" comes from 549 | "Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage..." 550 | topic-term distribution 551 | beta[k, t] is the probability of term t(word) to be generated from topic k 552 | :return: a matrix, shape is K * T 553 | """ 554 | numerator_matrix = self.Topic2TermCount + self.eta_vector 555 | # column vector 556 | # denominator_vector = self.Topic2TermCount.sum(axis=1).reshape(self.K, 1) + sum(self.eta_vector) 557 | denominator_vector = numerator_matrix.sum(axis=1).reshape(self.K, 1) 558 | return 1.0 * numerator_matrix / denominator_vector 559 | 560 | pass 561 | 562 | @property 563 | def theta(self): 564 | """ 565 | doc-topic distribution 566 | theta[m, k] is the probability of doc m to be generated from topic k 567 | :return: a matrix, shape is M * K 568 | """ 569 | numerator_matrix = self.Doc2TopicCount + self.alpha_vector * self.Lambda 570 | denominator_vector = numerator_matrix.sum(axis=1).reshape(self.M, 1) 571 | # column vector 572 | return 1.0 * numerator_matrix / denominator_vector 573 | pass 574 | 575 | def log_perplexity(self, documents=None, iteration=30, times=10): 576 | """ 577 | log perplexity of LDA topic model, use the training data if documents is None 578 | Reference: Parameter estimation for text analysis, Gregor Heinrich. 
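        In terms of the fields above (this simply restates the loop below):
            log_perplexity = -(1 / WN) * sum_m sum_{t in W[m]} log( dot(theta[m], beta[:, t]) )
        i.e., the negative average per-word log-likelihood; perplexity() returns exp(log_perplexity).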
579 | :param: documents: test set 580 | :return: a float value 581 | """ 582 | beta, theta, W, WN, log_likelihood = self.beta, None, None, None, 0 583 | # theta is the doc-topic distribution matrix 584 | # W is the list of term_vector, each term_vector represents a document 585 | # WN is the number of all word in W 586 | # difference test set means difference theta, W, WN 587 | 588 | if not documents: 589 | theta = self.theta 590 | W = self.W 591 | WN = self.WN 592 | else: 593 | # generate the term_vector of document 594 | documents = [LldaModel._document_preprocess(document) for document in documents] 595 | test_corpus = [document.split() for document in documents] 596 | W = [[self.vocabulary[term] for term in doc_words if term in self.vocabulary] for doc_words in test_corpus] 597 | WN = sum([len(term_vector) for term_vector in W]) 598 | theta = [] 599 | for term_vector in W: 600 | # sample on term_vector until Markov chain converges 601 | theta_new = self._gibbs_sample_inference(term_vector, iteration=iteration, times=times) 602 | theta.append(theta_new) 603 | 604 | # caculate the log_perplexity of current documents 605 | for m, theta_m in enumerate(theta): 606 | for t in W[m]: 607 | likelihood_t = np.inner(theta_m, beta[:, t]) 608 | log_likelihood += -np.log(likelihood_t) 609 | return 1.0 * log_likelihood / WN 610 | 611 | def perplexity(self, documents=None, iteration=30, times=10): 612 | """ 613 | perplexity of LDA topic model, we use the training data if documents is None 614 | Reference: Parameter estimation for text analysis, Gregor Heinrich. 615 | :param: documents: test set 616 | :return: a float value, perplexity = exp{log_perplexity} 617 | """ 618 | return np.exp(self.log_perplexity(documents=documents, iteration=iteration, times=times)) 619 | 620 | def __repr__(self): 621 | return "\nLabeled-LDA Model:\n" \ 622 | "\tK = %s\n" \ 623 | "\tM = %s\n" \ 624 | "\tT = %s\n" \ 625 | "\tWN = %s\n" \ 626 | "\tLN = %s\n" \ 627 | "\talpha = %s\n" \ 628 | "\teta = %s\n" \ 629 | "\tperplexity = %s\n" \ 630 | "\t" % (self.K, self.M, self.T, self.WN, self.LN, self.alpha_vector[0], self.eta_vector[0], 631 | self.perplexity()) 632 | pass 633 | 634 | class SaveModel: 635 | def __init__(self, save_model_dict=None): 636 | self.alpha_vector = [] 637 | self.eta_vector = [] 638 | self.terms = [] 639 | self.vocabulary = {} 640 | self.topics = [] 641 | self.topic_vocabulary = {} 642 | self.W = [] 643 | self.Z = [] 644 | self.K = 0 645 | self.M = 0 646 | self.T = 0 647 | self.WN = 0 648 | self.LN = 0 649 | self.iteration = 0 650 | 651 | # the following fields cannot be dumped into json file 652 | # we need write them with np.save() and read them with np.load() 653 | # self.Doc2TopicCount = None 654 | # self.Topic2TermCount = None 655 | self.Lambda = None 656 | 657 | if save_model_dict is not None: 658 | self.__dict__ = save_model_dict 659 | pass 660 | 661 | @staticmethod 662 | def _document_preprocess(document): 663 | """ 664 | process document before inputting it into the model(both training, update and inference) 665 | :param document: the target document 666 | :return: the word we change 667 | """ 668 | document = document.lower() 669 | return document 670 | 671 | @staticmethod 672 | def _read_object_from_file(file_name): 673 | """ 674 | read an object from json file 675 | :param file_name: json file name 676 | :return: None if file doesn't exist or can not convert to an object by json, else return the object 677 | """ 678 | if os.path.exists(file_name) is False: 679 | print ("Error read path: [%s]" % 
file_name) 680 | return None 681 | with open(file_name, 'r') as f: 682 | try: 683 | obj = json.load(f) 684 | except Exception: 685 | print ("Error json: [%s]" % f.read()[0:10]) 686 | return None 687 | return obj 688 | 689 | @staticmethod 690 | def _write_object_to_file(file_name, target_object): 691 | """ 692 | write the object to file with json(if the file exists, this function will overwrite it) 693 | :param file_name: the name of new file 694 | :param target_object: the target object for writing 695 | :return: True if success else False 696 | """ 697 | dirname = os.path.dirname(file_name) 698 | LldaModel._find_and_create_dirs(dirname) 699 | try: 700 | with open(file_name, "w") as f: 701 | json.dump(target_object, f, skipkeys=False, ensure_ascii=False, check_circular=True, allow_nan=True, 702 | cls=NpEncoder, indent=True, separators=None, default=None, sort_keys=False) 703 | except Exception as e: 704 | message = "Write [%s...] to file [%s] error: json.dump error" % (str(target_object)[0:10], file_name) 705 | print ("%s: %s" % (e, message)) 706 | return False 707 | else: 708 | # print ("Write %s" % file_name) 709 | return True 710 | 711 | @staticmethod 712 | def _find_and_create_dirs(dir_name): 713 | """ 714 | find dir, create it if it doesn't exist 715 | :param dir_name: the name of dir 716 | :return: the name of dir 717 | """ 718 | if os.path.exists(dir_name) is False: 719 | os.makedirs(dir_name) 720 | return dir_name 721 | 722 | def save_model_to_dir(self, dir_name, save_derivative_properties=False): 723 | """ 724 | save model to directory dir_name 725 | :param save_derivative_properties: save derivative properties if True 726 | some properties are not necessary save to disk, they could be derived from some basic properties, 727 | we call they derivative properties. 
728 |             to save derivative properties to disk:
729 |             it will reduce the time needed to load the model from disk (properties are read directly instead of recomputed),
730 |             but it will take up more disk space
731 |         :param dir_name: the target directory name
732 |         :return: None
733 |         """
734 |         save_model = LldaModel.SaveModel()
735 |         save_model.alpha_vector = self.alpha_vector
736 |         save_model.eta_vector = self.eta_vector
737 |         save_model.terms = self.terms
738 |         save_model.vocabulary = self.vocabulary
739 |         save_model.topics = self.topics
740 |         save_model.topic_vocabulary = self.topic_vocabulary
741 |         save_model.W = self.W
742 |         save_model.Z = self.Z
743 |         save_model.K = self.K
744 |         save_model.M = self.M
745 |         save_model.T = self.T
746 |         save_model.WN = self.WN
747 |         save_model.LN = self.LN
748 |         save_model.iteration = self.iteration
749 |
750 |         save_model_path = os.path.join(dir_name, "llda_model.json")
751 |         LldaModel._write_object_to_file(save_model_path, save_model.__dict__)
752 |
753 |         np.save(os.path.join(dir_name, "Lambda.npy"), self.Lambda)
754 |         # save derivative properties
755 |         if save_derivative_properties:
756 |             np.save(os.path.join(dir_name, "Doc2TopicCount.npy"), self.Doc2TopicCount)
757 |             np.save(os.path.join(dir_name, "Topic2TermCount.npy"), self.Topic2TermCount)
758 |             np.save(os.path.join(dir_name, "alpha_vector_Lambda.npy"), self.alpha_vector_Lambda)
759 |             np.save(os.path.join(dir_name, "eta_vector_sum.npy"), self.eta_vector_sum)
760 |             np.save(os.path.join(dir_name, "Topic2TermCountSum.npy"), self.Topic2TermCountSum)
761 |         pass
762 |
763 |     def load_model_from_dir(self, dir_name, load_derivative_properties=True):
764 |         """
765 |         load model from directory dir_name
766 |         :param load_derivative_properties: load derivative properties from disk if True
767 |         :param dir_name: the target directory name
768 |         :return: None
769 |         """
770 |         save_model_path = os.path.join(dir_name, "llda_model.json")
771 |         save_model_dict = LldaModel._read_object_from_file(save_model_path)
772 |         save_model = LldaModel.SaveModel(save_model_dict=save_model_dict)
773 |         self.alpha_vector = save_model.alpha_vector
774 |         self.eta_vector = save_model.eta_vector
775 |         self.terms = save_model.terms
776 |         self.vocabulary = save_model.vocabulary
777 |         self.topics = save_model.topics
778 |         self.topic_vocabulary = save_model.topic_vocabulary
779 |         self.W = save_model.W
780 |         self.Z = save_model.Z
781 |         self.K = save_model.K
782 |         self.M = save_model.M
783 |         self.T = save_model.T
784 |         self.WN = save_model.WN
785 |         self.LN = save_model.LN
786 |         self.iteration = save_model.iteration
787 |
788 |         self.Lambda = np.load(os.path.join(dir_name, "Lambda.npy"))
789 |
790 |         # load derivative properties
791 |         if load_derivative_properties:
792 |             try:
793 |                 self.Doc2TopicCount = np.load(os.path.join(dir_name, "Doc2TopicCount.npy"))
794 |                 self.Topic2TermCount = np.load(os.path.join(dir_name, "Topic2TermCount.npy"))
795 |                 self.alpha_vector_Lambda = np.load(os.path.join(dir_name, "alpha_vector_Lambda.npy"))
796 |                 self.eta_vector_sum = np.load(os.path.join(dir_name, "eta_vector_sum.npy"))
797 |                 self.Topic2TermCountSum = np.load(os.path.join(dir_name, "Topic2TermCountSum.npy"))
798 |             except (IOError, ValueError) as e:
799 |                 print("%s: load derivative properties failed, initializing them from basic properties" % e)
800 |                 self._initialize_derivative_fields()
801 |         else:
802 |             self._initialize_derivative_fields()
803 |         pass
804 |
805 |     def update(self, labeled_documents=None):
806 |         """
807 |         update the model with additional labeled documents; this is an
incremental update 808 | :return: None 809 | """ 810 | self.all_perplexities = [] 811 | if labeled_documents is None: 812 | pass 813 | 814 | new_labels = [] 815 | new_words = [] 816 | new_doc_corpus = [] 817 | new_labels_corpus = [] 818 | for document, labels in labeled_documents: 819 | document = LldaModel._document_preprocess(document) 820 | doc_words = document.split() 821 | new_doc_corpus.append(doc_words) 822 | if labels is None: 823 | labels = [] 824 | labels.append("common_topic") 825 | new_labels_corpus.append(labels) 826 | new_words.extend(doc_words) 827 | new_labels.extend(labels) 828 | # self.terms = list(set(new_words)) 829 | new_terms = set(new_words) - set(self.terms) 830 | self.terms.extend(new_terms) 831 | self.vocabulary = {term: index for index, term in enumerate(self.terms)} 832 | 833 | # self.topics = list(set(new_labels)) 834 | new_topics = set(new_labels) - set(self.topics) 835 | self.topics.extend(new_topics) 836 | self.topic_vocabulary = {topic: index for index, topic in enumerate(self.topics)} 837 | 838 | old_K = self.K 839 | old_T = self.T 840 | self.K = len(self.topics) 841 | self.T = len(self.terms) 842 | 843 | # self.W = [[self.vocabulary[term] for term in doc_words] for doc_words in new_doc_corpus] 844 | new_w_vectors = [[self.vocabulary[term] for term in doc_words] for doc_words in new_doc_corpus] 845 | for new_w_vector in new_w_vectors: 846 | self.W.append(new_w_vector) 847 | 848 | old_M = self.M 849 | old_WN = self.WN 850 | self.M = len(self.W) 851 | self.WN += len(new_words) 852 | # we appended topic "common_topic" to each doc at the beginning 853 | # so we need minus the number of "common_topic" 854 | # LN is the number of original labels 855 | old_LN = self.LN 856 | 857 | self.LN += len(new_labels) + len(new_labels_corpus) 858 | 859 | old_Lambda = self.Lambda 860 | self.Lambda = np.zeros((self.M, self.K), dtype=float) 861 | for m in range(self.M): 862 | if m < old_M: 863 | # if the old document has no topic, we also init it to all topics here 864 | if sum(old_Lambda[m]) == old_K: 865 | # set all value of self.Lambda[m] to 1.0 866 | self.Lambda[m] += 1.0 867 | continue 868 | # print m, old_M 869 | if len(new_labels_corpus[m-old_M]) == 1: 870 | new_labels_corpus[m-old_M] = self.topics 871 | for label in new_labels_corpus[m-old_M]: 872 | k = self.topic_vocabulary[label] 873 | self.Lambda[m, k] = 1.0 874 | 875 | # TODO: the following 2 fields should be modified again if alpha_vector is not constant vector 876 | self.alpha_vector = [self.alpha_vector[0] for _ in range(self.K)] 877 | self.eta_vector = [self.eta_vector[0] for _ in range(self.T)] 878 | 879 | # self.Z = [] 880 | for m in range(old_M, self.M): 881 | # print "self.Lambda[m]: ", self.Lambda[m] 882 | numerator_vector = self.Lambda[m] * self.alpha_vector 883 | p_vector = numerator_vector / sum(numerator_vector) 884 | # print p_vector 885 | # print "p_vector: ", p_vector 886 | # z_vector is a vector of a document, 887 | # just like [2, 3, 6, 0], which means this doc have 4 word and them generated 888 | # from the 2nd, 3rd, 6th, 0th topic, respectively 889 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in range(len(self.W[m]))] 890 | self.Z.append(z_vector) 891 | 892 | self._initialize_derivative_fields() 893 | pass 894 | 895 | @staticmethod 896 | def _extend_matrix(origin=None, shape=None, padding_value=0): 897 | """ 898 | for quickly extend the matrices when update 899 | extend origin matrix with shape, padding with padding_value 900 | :type shape: the shape of new matrix 901 | :param 
origin: np.ndarray, the original matrix
902 |         :return: np.ndarray, a matrix with new shape
903 |         """
904 |         new_matrix = np.zeros(shape, dtype=origin.dtype)
905 |
906 |         for row in range(new_matrix.shape[0]):
907 |             for col in range(new_matrix.shape[1]):
908 |                 if row < origin.shape[0] and col < origin.shape[1]:
909 |                     new_matrix[row, col] = origin[row, col]
910 |                 else:
911 |                     new_matrix[row, col] = padding_value
912 |
913 |         return new_matrix
914 |         pass
915 |
916 |     def is_convergent(self, method="PPL", delta=0.001):
917 |         """
918 |         is this model convergent?
919 |         use the perplexities (or the parameter `beta`) to determine whether the Markov chain has converged
920 |         :param method: the method of determining whether the Markov chain converges
921 |                        "PPL": use the perplexities of training data
922 |                        "beta": use the parameter 'beta'
923 |         :param delta: if the change is less than or equal to `delta`, the Markov chain is considered converged
924 |         :return: True if model is convergent
925 |         """
926 |         if method == "PPL":
927 |             if len(self.all_perplexities) < 10:
928 |                 return False
929 |             perplexities = self.all_perplexities[-10:]
930 |             if max(perplexities) - min(perplexities) <= delta:
931 |                 return True
932 |             return False
933 |         elif method == "beta":
934 |             if self.delta_beta <= delta:
935 |                 return True
936 |             return False
937 |         else:
938 |             raise Exception("parameter 'method=\"%s\"' is illegal" % method)
939 |
940 |     @property
941 |     def delta_beta(self):
942 |         """
943 |         calculate the change of the parameter `beta` since the last training iteration
944 |         :return: the sum of absolute changes of the parameter `beta`
945 |         """
946 |         return np.sum(np.abs(self.beta - self.last_beta))
947 |
948 |     def top_terms_of_topic(self, topic, k, with_probabilities=True):
949 |         """
950 |         get top-k terms of topic
951 |         :param with_probabilities: if True, return (term, probability) pairs,
952 |                                    else return only the terms
953 |         :param topic: str, the name of topic
954 |         :param k: int, the number of terms
955 |         :return: the top-k terms of topic
956 |         """
957 |         if topic not in self.topic_vocabulary:
958 |             raise Exception("Cannot find topic \"%s\"" % topic)
959 |         beta = self.beta_k(self.topic_vocabulary[topic])
960 |         terms = sorted(list(zip(self.terms, beta)), key=lambda x: x[1], reverse=True)
961 |         if with_probabilities:
962 |             return terms[:k]
963 |         return [term for term, p in terms[:k]]
964 |
965 |
966 | if __name__ == "__main__":
967 |     pass
968 |
969 |
--------------------------------------------------------------------------------