├── model
│   ├── __init__.py
│   └── labeled_lda.py
├── requirements.txt
├── .gitignore
├── assets
│   ├── gibbs-sampling-equation.png
│   ├── graphical-of-labeled-lda.png
│   └── generative-process-for-labeled-lda.png
├── LICENSE
├── example
│   └── example.py
└── README.md
/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.22.0
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | *.pyc
3 | .idea/
4 | data/
5 | example/test.py
6 |
7 |
--------------------------------------------------------------------------------
/assets/gibbs-sampling-equation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/gibbs-sampling-equation.png
--------------------------------------------------------------------------------
/assets/graphical-of-labeled-lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/graphical-of-labeled-lda.png
--------------------------------------------------------------------------------
/assets/generative-process-for-labeled-lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/generative-process-for-labeled-lda.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jiahong Zhou
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/example/example.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.append('../')
3 | import model.labeled_lda as llda
4 |
5 | # initialize data
6 | labeled_documents = [("example example example example example"*10, ["example"]),
7 | ("test llda model test llda model test llda model"*10, ["test", "llda_model"]),
8 | ("example test example test example test example test"*10, ["example", "test"]),
9 | ("good perfect good good perfect good good perfect good "*10, ["positive"]),
10 | ("bad bad down down bad bad down"*10, ["negative"])]
11 |
12 | # create a new Labeled LDA model
13 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector="50_div_K", eta_vector=0.001)
14 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.02, eta_vector=0.002)
15 | llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.01)
16 | print(llda_model)
17 |
18 | # training
19 | # llda_model.training(iteration=10, log=True)
20 | while True:
21 | print("iteration %s sampling..." % (llda_model.iteration + 1))
22 | llda_model.training(1)
23 | print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
24 | print("delta beta: %s" % llda_model.delta_beta)
25 | if llda_model.is_convergent(method="beta", delta=0.01):
26 | break
27 |
28 | # update
29 | print("before updating: ", llda_model)
30 | update_labeled_documents = [("new example test example test example test example test", ["example", "test"])]
31 | llda_model.update(labeled_documents=update_labeled_documents)
32 | print("after updating: ", llda_model)
33 |
34 | # train again
35 | # llda_model.training(iteration=10, log=True)
36 | while True:
37 | print("iteration %s sampling..." % (llda_model.iteration + 1))
38 | llda_model.training(1)
39 | print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
40 | print("delta beta: %s" % llda_model.delta_beta)
41 | if llda_model.is_convergent(method="beta", delta=0.01):
42 | break
43 |
44 | # inference
45 | # note: the resulting topics may differ between runs, because Gibbs sampling is a randomized algorithm
46 | document = "example llda model example example good perfect good perfect good perfect" * 100
47 |
48 | topics = llda_model.inference(document=document, iteration=100, times=10)
49 | print(topics)
50 |
51 | # perplexity
52 | # calculate perplexity on test data
53 | perplexity = llda_model.perplexity(documents=["example example example example example",
54 | "test llda model test llda model test llda model",
55 | "example test example test example test example test",
56 | "good perfect good good perfect good good perfect good",
57 | "bad bad down down bad bad down"],
58 | iteration=30,
59 | times=10)
60 | print("perplexity on test data: %s" % perplexity)
61 | # calculate perplexity on training data
62 | print("perplexity on training data: %s" % llda_model.perplexity())
63 |
64 | # save to disk
65 | save_model_dir = "../data/model"
66 | # llda_model.save_model_to_dir(save_model_dir, save_derivative_properties=True)
67 | llda_model.save_model_to_dir(save_model_dir)
68 |
69 | # load from disk
70 | llda_model_new = llda.LldaModel()
71 | llda_model_new.load_model_from_dir(save_model_dir, load_derivative_properties=False)
72 | print("llda_model_new", llda_model_new)
73 | print("llda_model", llda_model)
74 | print("Top-5 terms of topic 'negative': ", llda_model.top_terms_of_topic("negative", 5, False))
75 | print("Doc-Topic Matrix: \n", llda_model.theta)
76 | print("Topic-Term Matrix: \n", llda_model.beta)
77 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Implementation of the L-LDA Model (Labeled Latent Dirichlet Allocation) in Python
2 |
3 |
4 | References:
5 | * *Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...*
6 | * *Parameter estimation for text analysis, Gregor Heinrich.*
7 | * *Latent Dirichlet Allocation, David M. Blei, Andrew Y. Ng...*
8 |
9 | ### An efficient implementation based on Gibbs sampling
10 |
11 | **The following descriptions come from *Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...***
12 |
13 | ##### Introduction:
14 | Labeled LDA is a topic model that constrains Latent Dirichlet Allocation by defining a one-to-one correspondence between LDA’s latent topics and user tags.
15 | This allows Labeled LDA to directly learn word–tag correspondences.
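
In this implementation a labeled corpus is simply an iterable of `(document, labels)` tuples. For instance, a toy corpus (adapted from `example/example.py`):

```
labeled_documents = [
    ("good perfect good good perfect good", ["positive"]),
    ("bad bad down down bad bad down", ["negative"]),
]
```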
16 |
17 | ##### Gibbs sampling:
18 | * Graphical model of Labeled LDA:
19 | ![Graphical model of Labeled LDA](assets/graphical-of-labeled-lda.png)
20 |
21 |
22 |
23 | * Generative process for Labeled LDA:
24 | ![Generative process for Labeled LDA](assets/generative-process-for-labeled-lda.png)
25 |
26 |
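Roughly, paraphrasing the generative process from Ramage et al. (the exact statement is in the figure above): each topic k draws a word distribution β_k ~ Dir(η); each document d fixes its label set Λ^(d), draws a topic mixture θ^(d) ~ Dir(α) restricted to those labels, and then generates each word by drawing a topic z ~ Mult(θ^(d)) followed by a word w ~ Mult(β_z).
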
27 | * Gibbs sampling equation:
28 | ![Gibbs sampling equation for Labeled LDA](assets/gibbs-sampling-equation.png)
29 |
30 |
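For reference, the sampling update implemented in `_gibbs_sample_training` takes the standard collapsed-Gibbs form sketched below (notation roughly follows Heinrich's report; the exact equation used by the paper is the one in the figure above):

```
p(z_{m,n} = k \mid \mathbf{z}_{\neg(m,n)}, \mathbf{w})
  \;\propto\;
  \frac{n^{(t)}_{k,\neg(m,n)} + \eta_t}
       {\sum_{t'} \left( n^{(t')}_{k,\neg(m,n)} + \eta_{t'} \right)}
  \cdot \left( n^{(k)}_{m,\neg(m,n)} + \alpha_k \right),
  \qquad k \in \lambda^{(m)}
```

where n_k^(t) counts how often term t is assigned to topic k, n_m^(k) counts how often topic k occurs in document m, and λ^(m) is the label set of document m; topics outside λ^(m) receive zero probability because `alpha_vector * Lambda[m]` zeroes their α_k.
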
31 | ### Usage
32 | * new llda model
33 | * training
34 | * is_convergent
35 | * update
36 | * inference
37 | * save model to disk
38 | * load model from disk
39 | * get top-k terms of target topic
40 |
41 |
42 | ### Example
43 | ```
44 | # @source code: example/example.py
45 |
46 | import sys
47 | sys.path.append('../')
48 | import model.labeled_lda as llda
49 |
50 | # initialize data
51 | labeled_documents = [("example example example example example"*10, ["example"]),
52 | ("test llda model test llda model test llda model"*10, ["test", "llda_model"]),
53 | ("example test example test example test example test"*10, ["example", "test"]),
54 | ("good perfect good good perfect good good perfect good "*10, ["positive"]),
55 | ("bad bad down down bad bad down"*10, ["negative"])]
56 |
57 | # create a new Labeled LDA model
58 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector="50_div_K", eta_vector=0.001)
59 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.02, eta_vector=0.002)
60 | llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.01)
61 | print(llda_model)
62 |
63 | # training
64 | # llda_model.training(iteration=10, log=True)
65 | while True:
66 | print("iteration %s sampling..." % (llda_model.iteration + 1))
67 | llda_model.training(1)
68 | print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
69 | print("delta beta: %s" % llda_model.delta_beta)
70 | if llda_model.is_convergent(method="beta", delta=0.01):
71 | break
72 |
73 | # update
74 | print("before updating: ", llda_model)
75 | update_labeled_documents = [("new example test example test example test example test", ["example", "test"])]
76 | llda_model.update(labeled_documents=update_labeled_documents)
77 | print("after updating: ", llda_model)
78 |
79 | # train again
80 | # llda_model.training(iteration=10, log=True)
81 | while True:
82 | print("iteration %s sampling..." % (llda_model.iteration + 1))
83 | llda_model.training(1)
84 | print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
85 | print("delta beta: %s" % llda_model.delta_beta)
86 | if llda_model.is_convergent(method="beta", delta=0.01):
87 | break
88 |
89 | # inference
90 | # note: the resulting topics may differ between runs, because Gibbs sampling is a randomized algorithm
91 | document = "example llda model example example good perfect good perfect good perfect" * 100
92 |
93 | topics = llda_model.inference(document=document, iteration=100, times=10)
94 | print(topics)
95 |
96 | # perplexity
97 | # calculate perplexity on test data
98 | perplexity = llda_model.perplexity(documents=["example example example example example",
99 | "test llda model test llda model test llda model",
100 | "example test example test example test example test",
101 | "good perfect good good perfect good good perfect good",
102 | "bad bad down down bad bad down"],
103 | iteration=30,
104 | times=10)
105 | print("perplexity on test data: %s" % perplexity)
106 | # calculate perplexity on training data
107 | print("perplexity on training data: %s" % llda_model.perplexity())
108 |
109 | # save to disk
110 | save_model_dir = "../data/model"
111 | # llda_model.save_model_to_dir(save_model_dir, save_derivative_properties=True)
112 | llda_model.save_model_to_dir(save_model_dir)
113 |
114 | # load from disk
115 | llda_model_new = llda.LldaModel()
116 | llda_model_new.load_model_from_dir(save_model_dir, load_derivative_properties=False)
117 | print("llda_model_new", llda_model_new)
118 | print("llda_model", llda_model)
119 | print("Top-5 terms of topic 'negative': ", llda_model.top_terms_of_topic("negative", 5, False))
120 | print("Doc-Topic Matrix: \n", llda_model.theta)
121 | print("Topic-Term Matrix: \n", llda_model.beta)
122 | ```
123 |
124 |
125 |
--------------------------------------------------------------------------------
/model/labeled_lda.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # @Author: Jiahong Zhou
4 | # @Date: 2018-10-20
5 | # @Email: JoeZJiahong@gmail.com
6 | # implementation of the L-LDA Model (Labeled Latent Dirichlet Allocation Model)
7 | # References:
8 | # i. Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...
9 | # ii. Parameter estimation for text analysis, Gregor Heinrich.
10 | # iii. Latent Dirichlet Allocation, David M. Blei, Andrew Y. Ng...
11 | import numpy as np
13 | import os
14 | import json
15 | from concurrent import futures
16 | try:
17 | import copy_reg
18 | except Exception:
19 | import copyreg as copy_reg
20 |
21 | import types
22 |
23 |
24 | class NpEncoder(json.JSONEncoder):
25 | def default(self, obj):
26 | if isinstance(obj, np.integer):
27 | return int(obj)
28 | elif isinstance(obj, np.floating):
29 | return float(obj)
30 | elif isinstance(obj, np.ndarray):
31 | return obj.tolist()
32 | else:
33 | return super(NpEncoder, self).default(obj)
34 |
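# Usage sketch (illustrative): NpEncoder lets json.dumps serialize numpy scalars and arrays, e.g.
#   json.dumps({"n": np.int64(3), "x": np.arange(2)}, cls=NpEncoder)  ->  '{"n": 3, "x": [0, 1]}'
# whereas the default encoder would raise a TypeError on these numpy types.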
35 |
36 | class LldaModel:
37 | """
38 | L-LDA(Labeled Latent Dirichlet Allocation Model)
39 |
40 | @field K: the number of topics
41 | @field alpha_vector: the prior distribution of theta_m
42 | str("50_div_K"): means [50/K, 50/K, ...],
43 | this value comes from "Parameter estimation for text analysis", Gregor Heinrich.
44 | int or float: means [alpha_vector, alpha_vector, ...]
45 | None: means [0.001, 0.001, ...]
46 | @field eta_vector: the prior distribution of beta_k
47 | int or float: means [eta_vector, eta_vector, ...]
48 | None: means [0.001, 0.001, ...]
49 | @field terms: a list of all terms
50 | @field vocabulary: a dict of {term: id}, vocabulary[terms[id]] == id
51 | @field topics: a list of all topics
52 | @field topic_vocabulary: a dict of {topic: id}, topic_vocabulary[topics[id]] == id
53 | @field W: the corpus, a list of terms list,
54 | W[m] is the document vector, W[m][n] is the id of the term
55 | @field Z: the topic corpus, just same as W,
56 | except Z[m][n] is the id of the topic of the term
57 | @field M: the number of documents
58 | @field T: the number of terms
59 | @field WN: the number of all words in W
60 | @field LN: the number of all original labels
61 | @field iteration: the number of training iterations performed so far
62 | @field all_perplexities: a list of all perplexities (one training iteration one perplexity)
63 | @field last_beta: the parameter `beta` of last training iteration
64 | @field Lambda: a matrix, shape is M * K,
65 | Lambda[m][k] == 1 means that topic k is a label of document m
66 |
67 | # derivative fields
68 | @field Doc2TopicCount: a matrix, shape is M * K,
69 | Doc2TopicCount[m][k] is the number of times topic k was sampled in document m
70 | @field Topic2TermCount: a matrix, shape is K * T,
71 | Topic2TermCount[k][t] is the number of times term t was generated from topic k
72 | @field Doc2TopicCountSum: a vector, shape is M, self.Doc2TopicCount.sum(axis=1)
73 | Doc2TopicCountSum[m] is the total count over all topics,
74 | i.e., Doc2TopicCountSum[m] is the number of words in document m
75 | @field alpha_vector_Lambda: a matrix, self.alpha_vector * self.Lambda
76 | @field alpha_vector_Lambda_sum: a vector, self.alpha_vector_Lambda.sum(axis=1)
77 | @field eta_vector_sum: float value, sum(self.eta_vector)
78 | @field Topic2TermCountSum: a vector, self.Topic2TermCount.sum(axis=1)
79 |
80 | """
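# A minimal construction/training sketch (toy data, mirroring example/example.py):
#   model = LldaModel(labeled_documents=[("good perfect good", ["positive"]),
#                                        ("bad bad down", ["negative"])],
#                     alpha_vector=0.01)
#   model.training(iteration=10, log=True)
#   print(model.top_terms_of_topic("positive", 3, False))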
81 | def __init__(self, alpha_vector="50_div_K", eta_vector=None, labeled_documents=None):
82 | """
83 |
84 | :param alpha_vector: the prior distribution of theta_m
85 | :param eta_vector: the prior distribution of beta_k
86 | :param labeled_documents: an iterable of (doc, labels) tuples, containing all documents and their labels
87 | """
88 | self.alpha_vector = alpha_vector
89 | self.eta_vector = eta_vector
90 | self.terms = []
91 | self.vocabulary = {}
92 | self.topics = []
93 | self.topic_vocabulary = {}
94 | self.W = []
95 | self.Z = []
96 | self.K = 0
97 | self.M = 0
98 | self.T = 0
99 | self.WN = 0
100 | self.LN = 0
101 | self.iteration = 0
102 | self.all_perplexities = []
103 | self.last_beta = None
104 | self.Lambda = None
105 |
106 | # derivative fields:
107 | # the following fields could reduce operations in training and inference
108 | # it is not necessary to save them to file, we can recover them by other fields
109 |
110 | self.Doc2TopicCount = None
111 | self.Topic2TermCount = None
112 | # self.Doc2TopicCountSum = None
113 | self.alpha_vector_Lambda = None
114 | # self.alpha_vector_Lambda_sum = None
115 | self.eta_vector_sum = 0.0
116 | self.Topic2TermCountSum = None
117 |
118 | if labeled_documents is not None:
119 | self._load_labeled_documents(labeled_documents)
120 |
121 | pass
122 |
123 | def _initialize_derivative_fields(self):
124 | """
125 | initialize derivative fields
126 | :return: None
127 | """
128 | # TODO: Doc2TopicCount could be reduced to a smaller matrix,
129 | # TODO: because some row vectors in Doc2TopicCount will always be 0
130 | self.Doc2TopicCount = np.zeros((self.M, self.K), dtype=int)
131 | self.Topic2TermCount = np.zeros((self.K, self.T), dtype=int)
132 | for m in range(self.M):
133 | # print self.Z[m]
134 | for t, z in zip(self.W[m], self.Z[m]):
135 | k = z
136 | # print "[m=%s, k=%s]" % (m, k)
137 | # print "[k=%s, t=%s]" % (k, t)
138 | self.Doc2TopicCount[m, k] += 1
139 | self.Topic2TermCount[k, t] += 1
140 |
141 | # self.Doc2TopicCountSum = self.Doc2TopicCount.sum(axis=1)
142 | self.alpha_vector_Lambda = self.alpha_vector * self.Lambda
143 | # self.alpha_vector_Lambda_sum = self.alpha_vector_Lambda.sum(axis=1)
144 | self.eta_vector_sum = sum(self.eta_vector)
145 | self.Topic2TermCountSum = self.Topic2TermCount.sum(axis=1)
146 |
147 | def _load_labeled_documents(self, labeled_documents):
148 | """
149 | input labeled corpus, which contains all documents and their corresponding labels
150 | :param labeled_documents: an iterable of (doc, labels) tuples, containing all documents and their labels
151 | :return:
152 | """
153 | # self.documents = []
154 | all_labels = []
155 | all_words = []
156 | doc_corpus = []
157 | labels_corpus = []
158 | for document, labels in labeled_documents:
159 | document = LldaModel._document_preprocess(document)
160 | doc_words = document.split()
161 | doc_corpus.append(doc_words)
162 | if labels is None:
163 | labels = []
164 | labels.append("common_topic")
165 | labels_corpus.append(labels)
166 | all_words.extend(doc_words)
167 | all_labels.extend(labels)
168 | self.terms = list(set(all_words))
169 | self.vocabulary = {term: index for index, term in enumerate(self.terms)}
170 | self.topics = list(set(all_labels))
171 | self.topic_vocabulary = {topic: index for index, topic in enumerate(self.topics)}
172 | self.K = len(self.topics)
173 | self.T = len(self.terms)
174 | self.W = [[self.vocabulary[term] for term in doc_words] for doc_words in doc_corpus]
175 | self.M = len(self.W)
176 | self.WN = len(all_words)
177 | # we appended topic "common_topic" to each doc at the beginning
178 | # so we need to subtract the number of "common_topic" labels
179 | # LN is the number of original labels
180 | self.LN = len(all_labels) - self.M
181 |
182 | self.Lambda = np.zeros((self.M, self.K), dtype=float)
183 | for m in range(self.M):
184 | if len(labels_corpus[m]) == 1:
185 | labels_corpus[m] = self.topics
186 | for label in labels_corpus[m]:
187 | k = self.topic_vocabulary[label]
188 | self.Lambda[m, k] = 1.0
189 |
190 | if self.alpha_vector is None:
191 | self.alpha_vector = [0.001 for _ in range(self.K)]
192 | elif type(self.alpha_vector) is str and self.alpha_vector == "50_div_K":
193 | self.alpha_vector = [50.0/self.K for _ in range(self.K)]
194 | elif type(self.alpha_vector) is float or type(self.alpha_vector) is int:
195 | self.alpha_vector = [self.alpha_vector for _ in range(self.K)]
196 | else:
197 | message = "error alpha_vector: %s" % self.alpha_vector
198 | raise Exception(message)
199 |
200 | if self.eta_vector is None:
201 | self.eta_vector = [0.001 for _ in range(self.T)]
202 | elif type(self.eta_vector) is float or type(self.eta_vector) is int:
203 | self.eta_vector = [self.eta_vector for _ in range(self.T)]
204 | else:
205 | message = "error eta_vector: %s" % self.eta_vector
206 | raise Exception(message)
207 |
208 | self.Z = []
209 | for m in range(self.M):
210 | # print "self.Lambda[m]: ", self.Lambda[m]
211 | numerator_vector = self.Lambda[m] * self.alpha_vector
212 | p_vector = 1.0 * numerator_vector / sum(numerator_vector)
213 | # print p_vector
214 | # print "p_vector: ", p_vector
215 | # z_vector is a vector of a document,
216 | # just like [2, 3, 6, 0], which means this doc has 4 words and they were generated
217 | # from the 2nd, 3rd, 6th, 0th topic, respectively
218 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in range(len(self.W[m]))]
219 | self.Z.append(z_vector)
220 |
221 | self._initialize_derivative_fields()
222 | pass
223 |
224 | @staticmethod
225 | def _multinomial_sample(p_vector, random_state=None):
226 | """
227 | sample a number from multinomial distribution
228 | :param p_vector: the probabilities
229 | :return: an int value
230 | """
231 | if random_state is not None:
232 | return random_state.multinomial(1, p_vector).argmax()
233 | return np.random.multinomial(1, p_vector).argmax()
234 |
235 | def _gibbs_sample_training(self):
236 | """
237 | sample a topic (k) for each word (t) of all documents, generating a new matrix Z
238 | :return: None
239 | """
240 | # TODO: the operations of addition and multiplication could be reduced, because some
241 | self.last_beta = self.beta
242 | count = 0
243 | for m in range(self.M):
244 |
245 | # doc_m_eta_vector = self.eta_vector
246 | # doc_m_alpha_vector = self.alpha_vector * self.Lambda[m]
247 | doc_m_alpha_vector = self.alpha_vector_Lambda[m]
248 | # assert (doc_m_alpha_vector == self.alpha_vector_Lambda[m]).all()
249 |
250 | # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector)
251 | # sum_doc_m_alpha_vector = self.alpha_vector_Lambda_sum[m]
252 | # assert sum_doc_m_alpha_vector == self.alpha_vector_Lambda_sum[m]
253 |
254 | for t, z, n in zip(self.W[m], self.Z[m], range(len(self.W[m]))):
255 | k = z
256 | self.Doc2TopicCount[m, k] -= 1
257 | self.Topic2TermCount[k, t] -= 1
258 | self.Topic2TermCountSum[k] -= 1
259 |
260 | numerator_theta_vector = self.Doc2TopicCount[m] + doc_m_alpha_vector
261 | # denominator_theta = sum(self.Doc2TopicCount[m]) + sum_doc_m_alpha_vector
262 | # denominator_theta = self.Doc2TopicCountSum[m]-1 + sum_doc_m_alpha_vector
263 | # assert sum(self.Doc2TopicCount[m]) == self.Doc2TopicCountSum[m]-1
264 |
265 | numerator_beta_vector = self.Topic2TermCount[:, t] + self.eta_vector[t]
266 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector)
267 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + self.eta_vector_sum
268 | denominator_beta = self.Topic2TermCountSum + self.eta_vector_sum
269 | # assert (self.Topic2TermCount.sum(axis=1) == self.Topic2TermCountSum).all()
270 | # assert sum(self.eta_vector) == self.eta_vector_sum
271 |
272 | beta_vector = 1.0 * numerator_beta_vector / denominator_beta
273 | # theta_vector = 1.0 * numerator_theta_vector / denominator_theta
274 | # denominator_theta is independent of t and k, so the denominator could be any value except 0
275 | # we therefore set denominator_theta to 1.0
276 | theta_vector = numerator_theta_vector
277 |
278 | p_vector = beta_vector * theta_vector
279 | # print p_vector
280 | """
281 | for some special documents m (with only one word), p_vector may be all zeros here, so sum(p_vector) will be zero too
282 | and 1.0 * p_vector / sum(p_vector) will be [...nan...]
283 | so we should avoid feeding in such special documents
284 | """
285 | p_vector = 1.0 * p_vector / sum(p_vector)
286 | # print p_vector
287 | sample_z = LldaModel._multinomial_sample(p_vector)
288 | self.Z[m][n] = sample_z
289 |
290 | k = sample_z
291 | self.Doc2TopicCount[m, k] += 1
292 | self.Topic2TermCount[k, t] += 1
293 | self.Topic2TermCountSum[k] += 1
294 | count += 1
295 | assert count == self.WN
296 | print("gibbs sample count: ", self.WN)
297 | self.iteration += 1
298 | self.all_perplexities.append(self.perplexity())
299 | pass
300 |
301 | def _gibbs_sample_inference(self, term_vector, iteration=300, times=10):
302 | """
303 | inference with gibbs sampling
304 | :param term_vector: the term vector of document
305 | :param iteration: the number of burn-in iterations to run until the Markov chain converges
306 | :param times: the number of samples drawn from the target distribution
307 |               (one whole iteration, i.e. sampling every word once, generates one sample);
308 |               after the Markov chain converges, the next #times samples are taken as
309 |               samples of the target distribution, samples before convergence are dropped,
310 |               and the result is the average of these #times samples
311 | :return: theta_new, a vector; theta_new[k] is the probability that the doc (term_vector)
312 |          is generated from topic k, i.e. the doc-topic distribution
314 | """
315 | doc_topic_count = np.zeros(self.K, dtype=int)
316 | accumulated_doc_topic_count = np.zeros(self.K, dtype=int)
317 | p_vector = np.ones(self.K, dtype=int)
318 | p_vector = p_vector * 1.0 / sum(p_vector)
319 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in term_vector]
320 | for n, t in enumerate(term_vector):
321 | k = z_vector[n]
322 | doc_topic_count[k] += 1
323 | self.Topic2TermCount[k, t] += 1
324 | self.Topic2TermCountSum[k] += 1
325 |
326 | # sum_doc_topic_count = sum(doc_topic_count)
327 | doc_m_alpha_vector = self.alpha_vector
328 | # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector)
329 | for i in range(iteration+times):
330 | for n, t in enumerate(term_vector):
331 | k = z_vector[n]
332 | doc_topic_count[k] -= 1
333 | self.Topic2TermCount[k, t] -= 1
334 | self.Topic2TermCountSum[k] -= 1
335 |
336 | numerator_theta_vector = doc_topic_count + doc_m_alpha_vector
337 | # denominator_theta = sum_doc_topic_count - 1 + sum_doc_m_alpha_vector
338 |
339 | numerator_beta_vector = self.Topic2TermCount[:, t] + self.eta_vector[t]
340 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector)
341 | denominator_beta = self.Topic2TermCountSum + self.eta_vector_sum
342 |
343 | beta_vector = 1.0 * numerator_beta_vector / denominator_beta
344 | # theta_vector = 1.0 * numerator_theta_vector / denominator_theta
345 | # denominator_theta is independent of t and k, so the denominator could be any value except 0
346 | # we therefore set denominator_theta to 1.0
347 | theta_vector = numerator_theta_vector
348 |
349 | p_vector = beta_vector * theta_vector
350 | # print p_vector
351 | p_vector = 1.0 * p_vector / sum(p_vector)
352 | # print p_vector
353 | sample_z = LldaModel._multinomial_sample(p_vector)
354 | z_vector[n] = sample_z
355 |
356 | k = sample_z
357 | doc_topic_count[k] += 1
358 | self.Topic2TermCount[k, t] += 1
359 | self.Topic2TermCountSum[k] += 1
360 | if i >= iteration:
361 | accumulated_doc_topic_count += doc_topic_count
362 | # reset self.Topic2TermCount
363 | for n, t in enumerate(term_vector):
364 | k = z_vector[n]
365 | self.Topic2TermCount[k, t] -= 1
366 | self.Topic2TermCountSum[k] -= 1
367 |
368 | numerator_theta_vector = accumulated_doc_topic_count/times + doc_m_alpha_vector
369 | # denominator_theta = sum(doc_topic_count) + sum(doc_m_alpha_vector)
370 | denominator_theta = sum(numerator_theta_vector)
371 | theta_new = 1.0 * numerator_theta_vector / denominator_theta
372 | return theta_new
373 |
374 | # def _gibbs_sample_inference_multi_processors(self, term_vector, iteration=30):
375 | # """
376 | # inference with gibbs sampling
377 | # :param term_vector: the term vector of document
378 | # :param iteration: the times of iteration
379 | # :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k
380 | # theta_new, a theta_vector, the doc-topic distribution
381 | # """
382 | # # print("gibbs sample inference iteration: %s" % iteration)
383 | # # TODO: complete multi-processors code here
384 | # # we copy all the shared variables may be modified on runtime
385 | # random_state = np.random.RandomState()
386 | # topic2term_count = self.Topic2TermCount.copy()
387 | # topic2term_count_sum = self.Topic2TermCountSum.copy()
388 | #
389 | # doc_topic_count = np.zeros(self.K, dtype=int)
390 | # p_vector = np.ones(self.K, dtype=int)
391 | # p_vector = p_vector * 1.0 / sum(p_vector)
392 | # z_vector = [LldaModel._multinomial_sample(p_vector, random_state=random_state) for _ in term_vector]
393 | # for n, t in enumerate(term_vector):
394 | # k = z_vector[n]
395 | # doc_topic_count[k] += 1
396 | # topic2term_count[k, t] += 1
397 | # topic2term_count_sum[k] += 1
398 | #
399 | # # sum_doc_topic_count = sum(doc_topic_count)
400 | # doc_m_alpha_vector = self.alpha_vector
401 | # # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector)
402 | # for i in range(iteration):
403 | # for n, t in enumerate(term_vector):
404 | # k = z_vector[n]
405 | # doc_topic_count[k] -= 1
406 | # topic2term_count[k, t] -= 1
407 | # topic2term_count_sum[k] -= 1
408 | #
409 | # numerator_theta_vector = doc_topic_count + doc_m_alpha_vector
410 | # # denominator_theta = sum_doc_topic_count - 1 + sum_doc_m_alpha_vector
411 | #
412 | # numerator_beta_vector = topic2term_count[:, t] + self.eta_vector[t]
413 | # # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector)
414 | # denominator_beta = topic2term_count_sum + self.eta_vector_sum
415 | #
416 | # beta_vector = 1.0 * numerator_beta_vector / denominator_beta
417 | # # theta_vector = 1.0 numerator_theta_vector / denominator_theta
418 | # # denominator_theta is independent with t and k, so denominator could be any value except 0
419 | # # will set denominator_theta as 1.0
420 | # theta_vector = numerator_theta_vector
421 | #
422 | # p_vector = beta_vector * theta_vector
423 | # # print p_vector
424 | # p_vector = 1.0 * p_vector / sum(p_vector)
425 | # # print p_vector
426 | # sample_z = LldaModel._multinomial_sample(p_vector, random_state)
427 | # z_vector[n] = sample_z
428 | #
429 | # k = sample_z
430 | # doc_topic_count[k] += 1
431 | # topic2term_count[k, t] += 1
432 | # topic2term_count_sum[k] += 1
433 | # # reset self.Topic2TermCount
434 | # # for n, t in enumerate(term_vector):
435 | # # k = z_vector[n]
436 | # # self.Topic2TermCount[k, t] -= 1
437 | # # self.Topic2TermCountSum[k] -= 1
438 | #
439 | # numerator_theta_vector = doc_topic_count + doc_m_alpha_vector
440 | # # denominator_theta = sum(doc_topic_count) + sum(doc_m_alpha_vector)
441 | # denominator_theta = sum(numerator_theta_vector)
442 | # theta_new = 1.0 * numerator_theta_vector / denominator_theta
443 | # return theta_new
444 |
445 | def training(self, iteration=10, log=False):
446 | """
447 | training this model with gibbs sampling
448 | :param log: print the perplexity after every Gibbs sampling iteration if True
449 | :param iteration: the number of iterations to run
450 | :return: None
451 | """
452 | for i in range(iteration):
453 | if log:
454 | print("after iteration: %s, perplexity: %s" % (self.iteration, self.perplexity()))
455 | self._gibbs_sample_training()
456 | pass
457 |
458 | def inference(self, document, iteration=30, times=10):
459 | # TODO: inference of a document
460 | """
461 | inference for one document
462 | :param document: some sentence like "this is a method for inference"
463 | :param times: the number of samples drawn from the target distribution
464 |               (one whole iteration, i.e. sampling every word once, generates one sample);
465 |               after the Markov chain converges, the next #times samples are taken as
466 |               samples of the target distribution, samples before convergence are dropped,
467 |               and the result is the average of these #times samples
468 | :param iteration: the number of burn-in iterations to run until the Markov chain converges
469 | :return: a list of (topic, probability) pairs, sorted by probability in descending order
472 | """
473 | document = LldaModel._document_preprocess(document)
474 | doc_words = document.split()
475 | term_vector = [self.vocabulary[word] for word in doc_words if word in self.vocabulary]
476 | theta_new = self._gibbs_sample_inference(term_vector, iteration=iteration, times=times)
477 | doc_topic_new = [(self.topics[k], probability) for k, probability in enumerate(theta_new)]
478 | sorted_doc_topic_new = sorted(doc_topic_new,
479 | key=lambda topic_probability: topic_probability[1],
480 | reverse=True)
481 | return sorted_doc_topic_new
482 | pass
483 |
484 | # def inference_multi_processors(self, document, iteration=30, times=8, max_workers=8):
485 | # # TODO: inference of a document with multi processors
486 | # """
487 | # inference for one document
488 | # :param times: the times of gibbs sampling, the result is the average value of all times(gibbs sampling)
489 | # :param iteration: the times of iteration
490 | # :param document: some sentence like "this is a method for inference"
491 | # :param max_workers: the max number of processors(workers)
492 | # :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k
493 | # theta_new, a theta_vector, the doc-topic distribution
494 | # """
495 | #
496 | # def _pickle_method(m):
497 | # if m.im_self is None:
498 | # return getattr, (m.im_class, m.im_func.func_name)
499 | # else:
500 | # return getattr, (m.im_self, m.im_func.func_name)
501 | # copy_reg.pickle(types.MethodType, _pickle_method)
502 | #
503 | # words = document.split()
504 | # term_vector = [self.vocabulary[word] for word in words if word in self.vocabulary]
505 | # term_vectors = [term_vector for _ in range(times)]
506 | # iterations = [iteration for _ in range(times)]
507 | #
508 | # with futures.ProcessPoolExecutor(max_workers) as executor:
509 | # # print "executor.map"
510 | # res = executor.map(self._gibbs_sample_inference_multi_processors, term_vectors, iterations)
511 | # theta_new_accumulation = np.zeros(self.K, float)
512 | # for theta_new in res:
513 | # theta_new_accumulation += theta_new
514 | # theta_new = 1.0 * theta_new_accumulation / times
515 | # # print "avg: \n", theta_new
516 | # doc_topic_new = [(self.topics[k], probability) for k, probability in enumerate(theta_new)]
517 | # sorted_doc_topic_new = sorted(doc_topic_new,
518 | # key=lambda topic_probability: topic_probability[1],
519 | # reverse=True)
520 | # return sorted_doc_topic_new
521 | # pass
522 |
523 | def beta_k(self, k):
524 | """
525 | topic-term distribution
526 | beta_k[t] is the probability of term t(word) to be generated from topic k
527 | :return: a vector, shape is T
528 | """
529 | numerator_vector = self.Topic2TermCount[k] + self.eta_vector
530 | # denominator = sum(self.Topic2TermCount[k]) + sum(self.eta_vector)
531 | denominator = sum(numerator_vector)
532 | return 1.0 * numerator_vector / denominator
533 |
534 | def theta_m(self, m):
535 | """
536 | doc-topic distribution
537 | theta_m[k] is the probability of doc m to be generated from topic k
538 | :return: a vector, shape is K
539 | """
540 | numerator_vector = self.Doc2TopicCount[m] + self.alpha_vector * self.Lambda[m]
541 | # denominator = sum(self.Doc2TopicCount[m]) + sum(self.alpha_vector * self.Lambda[m])
542 | denominator = sum(numerator_vector)
543 | return 1.0 * numerator_vector / denominator
544 |
545 | @property
546 | def beta(self):
547 | """
548 | This name "beta" comes from
549 | "Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage..."
550 | topic-term distribution
551 | beta[k, t] is the probability of term t(word) to be generated from topic k
552 | :return: a matrix, shape is K * T
553 | """
554 | numerator_matrix = self.Topic2TermCount + self.eta_vector
555 | # column vector
556 | # denominator_vector = self.Topic2TermCount.sum(axis=1).reshape(self.K, 1) + sum(self.eta_vector)
557 | denominator_vector = numerator_matrix.sum(axis=1).reshape(self.K, 1)
558 | return 1.0 * numerator_matrix / denominator_vector
559 |
560 | pass
561 |
562 | @property
563 | def theta(self):
564 | """
565 | doc-topic distribution
566 | theta[m, k] is the probability of doc m to be generated from topic k
567 | :return: a matrix, shape is M * K
568 | """
569 | numerator_matrix = self.Doc2TopicCount + self.alpha_vector * self.Lambda
570 | denominator_vector = numerator_matrix.sum(axis=1).reshape(self.M, 1)
571 | # column vector
572 | return 1.0 * numerator_matrix / denominator_vector
573 | pass
574 |
575 | def log_perplexity(self, documents=None, iteration=30, times=10):
576 | """
577 | log perplexity of LDA topic model, use the training data if documents is None
578 | Reference: Parameter estimation for text analysis, Gregor Heinrich.
579 | :param documents: the test set; the training data is used if documents is None
580 | :return: a float value, -(1/WN) * sum_{m,n} log p(w_{m,n}) (the negative average log-likelihood per word)
581 | """
582 | beta, theta, W, WN, log_likelihood = self.beta, None, None, None, 0
583 | # theta is the doc-topic distribution matrix
584 | # W is the list of term_vector, each term_vector represents a document
585 | # WN is the number of all words in W
586 | # a different test set means different theta, W and WN
587 |
588 | if not documents:
589 | theta = self.theta
590 | W = self.W
591 | WN = self.WN
592 | else:
593 | # generate the term_vector of document
594 | documents = [LldaModel._document_preprocess(document) for document in documents]
595 | test_corpus = [document.split() for document in documents]
596 | W = [[self.vocabulary[term] for term in doc_words if term in self.vocabulary] for doc_words in test_corpus]
597 | WN = sum([len(term_vector) for term_vector in W])
598 | theta = []
599 | for term_vector in W:
600 | # sample on term_vector until Markov chain converges
601 | theta_new = self._gibbs_sample_inference(term_vector, iteration=iteration, times=times)
602 | theta.append(theta_new)
603 |
604 | # calculate the log perplexity of the current documents
605 | for m, theta_m in enumerate(theta):
606 | for t in W[m]:
607 | likelihood_t = np.inner(theta_m, beta[:, t])
608 | log_likelihood += -np.log(likelihood_t)
609 | return 1.0 * log_likelihood / WN
610 |
611 | def perplexity(self, documents=None, iteration=30, times=10):
612 | """
613 | perplexity of LDA topic model, we use the training data if documents is None
614 | Reference: Parameter estimation for text analysis, Gregor Heinrich.
615 | :param documents: the test set; the training data is used if documents is None
616 | :return: a float value, perplexity = exp{log_perplexity}
617 | """
618 | return np.exp(self.log_perplexity(documents=documents, iteration=iteration, times=times))
619 |
620 | def __repr__(self):
621 | return "\nLabeled-LDA Model:\n" \
622 | "\tK = %s\n" \
623 | "\tM = %s\n" \
624 | "\tT = %s\n" \
625 | "\tWN = %s\n" \
626 | "\tLN = %s\n" \
627 | "\talpha = %s\n" \
628 | "\teta = %s\n" \
629 | "\tperplexity = %s\n" \
630 | "\t" % (self.K, self.M, self.T, self.WN, self.LN, self.alpha_vector[0], self.eta_vector[0],
631 | self.perplexity())
632 | pass
633 |
634 | class SaveModel:
635 | def __init__(self, save_model_dict=None):
636 | self.alpha_vector = []
637 | self.eta_vector = []
638 | self.terms = []
639 | self.vocabulary = {}
640 | self.topics = []
641 | self.topic_vocabulary = {}
642 | self.W = []
643 | self.Z = []
644 | self.K = 0
645 | self.M = 0
646 | self.T = 0
647 | self.WN = 0
648 | self.LN = 0
649 | self.iteration = 0
650 |
651 | # the following fields cannot be dumped into json file
652 | # we need write them with np.save() and read them with np.load()
653 | # self.Doc2TopicCount = None
654 | # self.Topic2TermCount = None
655 | self.Lambda = None
656 |
657 | if save_model_dict is not None:
658 | self.__dict__ = save_model_dict
659 | pass
660 |
661 | @staticmethod
662 | def _document_preprocess(document):
663 | """
664 | process a document before inputting it into the model (training, update and inference)
665 | :param document: the target document
666 | :return: the processed document
667 | """
668 | document = document.lower()
669 | return document
670 |
671 | @staticmethod
672 | def _read_object_from_file(file_name):
673 | """
674 | read an object from json file
675 | :param file_name: json file name
676 | :return: None if the file doesn't exist or cannot be parsed as JSON, otherwise the loaded object
677 | """
678 | if os.path.exists(file_name) is False:
679 | print ("Error read path: [%s]" % file_name)
680 | return None
681 | with open(file_name, 'r') as f:
682 | try:
683 | obj = json.load(f)
684 | except Exception:
685 | print ("Error json: [%s]" % f.read()[0:10])
686 | return None
687 | return obj
688 |
689 | @staticmethod
690 | def _write_object_to_file(file_name, target_object):
691 | """
692 | write the object to file with json(if the file exists, this function will overwrite it)
693 | :param file_name: the name of new file
694 | :param target_object: the target object for writing
695 | :return: True if success else False
696 | """
697 | dirname = os.path.dirname(file_name)
698 | LldaModel._find_and_create_dirs(dirname)
699 | try:
700 | with open(file_name, "w") as f:
701 | json.dump(target_object, f, skipkeys=False, ensure_ascii=False, check_circular=True, allow_nan=True,
702 | cls=NpEncoder, indent=True, separators=None, default=None, sort_keys=False)
703 | except Exception as e:
704 | message = "Write [%s...] to file [%s] error: json.dump error" % (str(target_object)[0:10], file_name)
705 | print ("%s: %s" % (e, message))
706 | return False
707 | else:
708 | # print ("Write %s" % file_name)
709 | return True
710 |
711 | @staticmethod
712 | def _find_and_create_dirs(dir_name):
713 | """
714 | find dir, create it if it doesn't exist
715 | :param dir_name: the name of dir
716 | :return: the name of dir
717 | """
718 | if os.path.exists(dir_name) is False:
719 | os.makedirs(dir_name)
720 | return dir_name
721 |
722 | def save_model_to_dir(self, dir_name, save_derivative_properties=False):
723 | """
724 | save model to directory dir_name
725 | :param save_derivative_properties: save derivative properties if True
726 | some properties do not need to be saved to disk, because they can be derived from the basic properties;
727 | we call them derivative properties.
728 | saving derivative properties to disk
729 | reduces the time needed to load the model (they are read directly rather than recomputed),
730 | but it takes up more disk space
731 | :param dir_name: the target directory name
732 | :return: None
733 | """
734 | save_model = LldaModel.SaveModel()
735 | save_model.alpha_vector = self.alpha_vector
736 | save_model.eta_vector = self.eta_vector
737 | save_model.terms = self.terms
738 | save_model.vocabulary = self.vocabulary
739 | save_model.topics = self.topics
740 | save_model.topic_vocabulary = self.topic_vocabulary
741 | save_model.W = self.W
742 | save_model.Z = self.Z
743 | save_model.K = self.K
744 | save_model.M = self.M
745 | save_model.T = self.T
746 | save_model.WN = self.WN
747 | save_model.LN = self.LN
748 | save_model.iteration = self.iteration
749 |
750 | save_model_path = os.path.join(dir_name, "llda_model.json")
751 | LldaModel._write_object_to_file(save_model_path, save_model.__dict__)
752 |
753 | np.save(os.path.join(dir_name, "Lambda.npy"), self.Lambda)
754 | # save derivative properties
755 | if save_derivative_properties:
756 | np.save(os.path.join(dir_name, "Doc2TopicCount.npy"), self.Doc2TopicCount)
757 | np.save(os.path.join(dir_name, "Topic2TermCount.npy"), self.Topic2TermCount)
758 | np.save(os.path.join(dir_name, "alpha_vector_Lambda.npy"), self.alpha_vector_Lambda)
759 | np.save(os.path.join(dir_name, "eta_vector_sum.npy"), self.eta_vector_sum)
760 | np.save(os.path.join(dir_name, "Topic2TermCountSum.npy"), self.Topic2TermCountSum)
761 | pass
762 |
763 | def load_model_from_dir(self, dir_name, load_derivative_properties=True):
764 | """
765 | load model from directory dir_name
766 | :param load_derivative_properties: load derivative properties from disk if True
767 | :param dir_name: the target directory name
768 | :return: None
769 | """
770 | save_model_path = os.path.join(dir_name, "llda_model.json")
771 | save_model_dict = LldaModel._read_object_from_file(save_model_path)
772 | save_model = LldaModel.SaveModel(save_model_dict=save_model_dict)
773 | self.alpha_vector = save_model.alpha_vector
774 | self.eta_vector = save_model.eta_vector
775 | self.terms = save_model.terms
776 | self.vocabulary = save_model.vocabulary
777 | self.topics = save_model.topics
778 | self.topic_vocabulary = save_model.topic_vocabulary
779 | self.W = save_model.W
780 | self.Z = save_model.Z
781 | self.K = save_model.K
782 | self.M = save_model.M
783 | self.T = save_model.T
784 | self.WN = save_model.WN
785 | self.LN = save_model.LN
786 | self.iteration = save_model.iteration
787 |
788 | self.Lambda = np.load(os.path.join(dir_name, "Lambda.npy"))
789 |
790 | # load derivative properties
791 | if load_derivative_properties:
792 | try:
793 | self.Doc2TopicCount = np.load(os.path.join(dir_name, "Doc2TopicCount.npy"))
794 | self.Topic2TermCount = np.load(os.path.join(dir_name, "Topic2TermCount.npy"))
795 | self.alpha_vector_Lambda = np.load(os.path.join(dir_name, "alpha_vector_Lambda.npy"))
796 | self.eta_vector_sum = np.load(os.path.join(dir_name, "eta_vector_sum.npy"))
797 | self.Topic2TermCountSum = np.load(os.path.join(dir_name, "Topic2TermCountSum.npy"))
798 | except (IOError, ValueError) as e:
799 | print("%s: load derivative properties fail, initialize them with basic properties" % e)
800 | self._initialize_derivative_fields()
801 | else:
802 | self._initialize_derivative_fields()
803 | pass
804 |
805 | def update(self, labeled_documents=None):
806 | """
807 | update model with labeled documents, incremental update
808 | :return: None
809 | """
810 | self.all_perplexities = []
811 | if labeled_documents is None:
812 | return
813 |
814 | new_labels = []
815 | new_words = []
816 | new_doc_corpus = []
817 | new_labels_corpus = []
818 | for document, labels in labeled_documents:
819 | document = LldaModel._document_preprocess(document)
820 | doc_words = document.split()
821 | new_doc_corpus.append(doc_words)
822 | if labels is None:
823 | labels = []
824 | labels.append("common_topic")
825 | new_labels_corpus.append(labels)
826 | new_words.extend(doc_words)
827 | new_labels.extend(labels)
828 | # self.terms = list(set(new_words))
829 | new_terms = set(new_words) - set(self.terms)
830 | self.terms.extend(new_terms)
831 | self.vocabulary = {term: index for index, term in enumerate(self.terms)}
832 |
833 | # self.topics = list(set(new_labels))
834 | new_topics = set(new_labels) - set(self.topics)
835 | self.topics.extend(new_topics)
836 | self.topic_vocabulary = {topic: index for index, topic in enumerate(self.topics)}
837 |
838 | old_K = self.K
839 | old_T = self.T
840 | self.K = len(self.topics)
841 | self.T = len(self.terms)
842 |
843 | # self.W = [[self.vocabulary[term] for term in doc_words] for doc_words in new_doc_corpus]
844 | new_w_vectors = [[self.vocabulary[term] for term in doc_words] for doc_words in new_doc_corpus]
845 | for new_w_vector in new_w_vectors:
846 | self.W.append(new_w_vector)
847 |
848 | old_M = self.M
849 | old_WN = self.WN
850 | self.M = len(self.W)
851 | self.WN += len(new_words)
852 | # we appended topic "common_topic" to each doc at the beginning
853 | # so we need to subtract the number of "common_topic" labels
854 | # LN is the number of original labels
855 | old_LN = self.LN
856 |
857 | self.LN += len(new_labels) - len(new_labels_corpus)
858 |
859 | old_Lambda = self.Lambda
860 | self.Lambda = np.zeros((self.M, self.K), dtype=float)
861 | for m in range(self.M):
862 | if m < old_M:
863 | # if the old document had no original label (its Lambda row was all ones), assign it to all topics again (including the new ones)
864 | if sum(old_Lambda[m]) == old_K:
865 | # set all value of self.Lambda[m] to 1.0
866 | self.Lambda[m] += 1.0
867 | continue
868 | # print m, old_M
869 | if len(new_labels_corpus[m-old_M]) == 1:
870 | new_labels_corpus[m-old_M] = self.topics
871 | for label in new_labels_corpus[m-old_M]:
872 | k = self.topic_vocabulary[label]
873 | self.Lambda[m, k] = 1.0
874 |
875 | # TODO: the following 2 fields should be modified again if alpha_vector is not constant vector
876 | self.alpha_vector = [self.alpha_vector[0] for _ in range(self.K)]
877 | self.eta_vector = [self.eta_vector[0] for _ in range(self.T)]
878 |
879 | # self.Z = []
880 | for m in range(old_M, self.M):
881 | # print "self.Lambda[m]: ", self.Lambda[m]
882 | numerator_vector = self.Lambda[m] * self.alpha_vector
883 | p_vector = numerator_vector / sum(numerator_vector)
884 | # print p_vector
885 | # print "p_vector: ", p_vector
886 | # z_vector is a vector of a document,
887 | # just like [2, 3, 6, 0], which means this doc has 4 words and they were generated
888 | # from the 2nd, 3rd, 6th, 0th topic, respectively
889 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in range(len(self.W[m]))]
890 | self.Z.append(z_vector)
891 |
892 | self._initialize_derivative_fields()
893 | pass
894 |
895 | @staticmethod
896 | def _extend_matrix(origin=None, shape=None, padding_value=0):
897 | """
898 | for quickly extending the matrices during update
899 | extend the origin matrix to the given shape, padding with padding_value
900 | :param shape: the shape of the new matrix
901 | :param origin: np.ndarray, the original matrix
902 | :return: np.ndarray, a matrix with new shape
903 | """
904 | new_matrix = np.zeros(shape, dtype=origin.dtype)
905 |
906 | for row in range(new_matrix.shape[0]):
907 | for col in range(new_matrix.shape[1]):
908 | if row < origin.shape[0] and col < origin.shape[1]:
909 | new_matrix[row, col] = origin[row, col]
910 | else:
911 | new_matrix[row, col] = padding_value
912 |
913 | return new_matrix
914 | pass
915 |
916 | def is_convergent(self, method="PPL", delta=0.001):
917 | """
918 | is this model convergent?
919 | use the perplexities to determine whether the Markov chain converges
920 | :param method: the method of determining whether the Markov chain converges
921 | "PPL": use the perplexities of training data
922 | "beta": use the parameter 'beta'
923 | :param delta: if the changes are less than or equal to `delta`, means that the Markov chain converges
924 | :return: True if model is convergent
925 | """
926 | if method == "PPL":
927 | if len(self.all_perplexities) < 10:
928 | return False
929 | perplexities = self.all_perplexities[-10:]
930 | if max(perplexities) - min(perplexities) <= delta:
931 | return True
932 | return False
933 | elif method == "beta":
934 | if self.delta_beta <= delta:
935 | return True
936 | return False
937 | else:
938 | raise Exception("parameter 'method=\"%s\"' is illegal" % method)
939 |
940 | @property
941 | def delta_beta(self):
942 | """
943 | calculate the changes of the parameter `beta`
944 | :return: the sum of changes of the parameter `beta`
945 | """
946 | return np.sum(np.abs(self.beta - self.last_beta))
947 |
948 | def top_terms_of_topic(self, topic, k, with_probabilities=True):
949 | """
950 | get top-k terms of topic
951 | :param with_probabilities: True means return the probabilities of a term generated by topic,
952 | else return only terms
953 | :param topic: str, the name of topic
954 | :param k: int, the number of terms
955 | :return: the top-k terms of topic
956 | """
957 | if topic not in self.topic_vocabulary:
958 | raise Exception("Cannot find topic \"%s\"" % topic)
959 | beta = self.beta_k(self.topic_vocabulary[topic])
960 | terms = sorted(list(zip(self.terms, beta)), key=lambda x: x[1], reverse=True)
961 | if with_probabilities:
962 | return terms[:k]
963 | return [term for term, p in terms[:k]]
964 |
965 |
966 | if __name__ == "__main__":
967 | pass
968 |
969 |
--------------------------------------------------------------------------------