├── model
│   ├── __init__.py
│   └── labeled_lda.py
├── requirements.txt
├── .gitignore
├── assets
│   ├── gibbs-sampling-equation.png
│   ├── graphical-of-labeled-lda.png
│   └── generative-process-for-labeled-lda.png
├── LICENSE
├── example
│   └── example.py
└── README.md

/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy==1.22.0
2 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.DS_Store
2 | *.pyc
3 | .idea/
4 | data/
5 | example/test.py
6 |
7 |
--------------------------------------------------------------------------------
/assets/gibbs-sampling-equation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/gibbs-sampling-equation.png
--------------------------------------------------------------------------------
/assets/graphical-of-labeled-lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/graphical-of-labeled-lda.png
--------------------------------------------------------------------------------
/assets/generative-process-for-labeled-lda.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/JoeZJH/Labeled-LDA-Python/HEAD/assets/generative-process-for-labeled-lda.png
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Jiahong Zhou
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | -------------------------------------------------------------------------------- /example/example.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.append('../') 3 | import model.labeled_lda as llda 4 | 5 | # initialize data 6 | labeled_documents = [("example example example example example"*10, ["example"]), 7 | ("test llda model test llda model test llda model"*10, ["test", "llda_model"]), 8 | ("example test example test example test example test"*10, ["example", "test"]), 9 | ("good perfect good good perfect good good perfect good "*10, ["positive"]), 10 | ("bad bad down down bad bad down"*10, ["negative"])] 11 | 12 | # new a Labeled LDA model 13 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector="50_div_K", eta_vector=0.001) 14 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.02, eta_vector=0.002) 15 | llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.01) 16 | print(llda_model) 17 | 18 | # training 19 | # llda_model.training(iteration=10, log=True) 20 | while True: 21 | print("iteration %s sampling..." % (llda_model.iteration + 1)) 22 | llda_model.training(1) 23 | print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity())) 24 | print("delta beta: %s" % llda_model.delta_beta) 25 | if llda_model.is_convergent(method="beta", delta=0.01): 26 | break 27 | 28 | # update 29 | print("before updating: ", llda_model) 30 | update_labeled_documents = [("new example test example test example test example test", ["example", "test"])] 31 | llda_model.update(labeled_documents=update_labeled_documents) 32 | print("after updating: ", llda_model) 33 | 34 | # train again 35 | # llda_model.training(iteration=10, log=True) 36 | while True: 37 | print("iteration %s sampling..." 
% (llda_model.iteration + 1))
38 |     llda_model.training(1)
39 |     print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
40 |     print("delta beta: %s" % llda_model.delta_beta)
41 |     if llda_model.is_convergent(method="beta", delta=0.01):
42 |         break
43 |
44 | # inference
45 | # note: the resulting topics may differ between runs, because Gibbs sampling is a randomized algorithm
46 | document = "example llda model example example good perfect good perfect good perfect" * 100
47 |
48 | topics = llda_model.inference(document=document, iteration=100, times=10)
49 | print(topics)
50 |
51 | # perplexity
52 | # calculate perplexity on test data
53 | perplexity = llda_model.perplexity(documents=["example example example example example",
54 |                                               "test llda model test llda model test llda model",
55 |                                               "example test example test example test example test",
56 |                                               "good perfect good good perfect good good perfect good",
57 |                                               "bad bad down down bad bad down"],
58 |                                    iteration=30,
59 |                                    times=10)
60 | print("perplexity on test data: %s" % perplexity)
61 | # calculate perplexity on training data
62 | print("perplexity on training data: %s" % llda_model.perplexity())
63 |
64 | # save to disk
65 | save_model_dir = "../data/model"
66 | # llda_model.save_model_to_dir(save_model_dir, save_derivative_properties=True)
67 | llda_model.save_model_to_dir(save_model_dir)
68 |
69 | # load from disk
70 | llda_model_new = llda.LldaModel()
71 | llda_model_new.load_model_from_dir(save_model_dir, load_derivative_properties=False)
72 | print("llda_model_new", llda_model_new)
73 | print("llda_model", llda_model)
74 | print("Top-5 terms of topic 'negative': ", llda_model.top_terms_of_topic("negative", 5, False))
75 | print("Doc-Topic Matrix: \n", llda_model.theta)
76 | print("Topic-Term Matrix: \n", llda_model.beta)
77 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Implementation of the L-LDA Model (Labeled Latent Dirichlet Allocation) in Python
2 |
3 |
4 | References:
5 | * *Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...*
6 | * *Parameter estimation for text analysis, Gregor Heinrich.*
7 | * *Latent Dirichlet Allocation, David M. Blei, Andrew Y. Ng...*
8 |
9 | ### An efficient implementation based on Gibbs sampling
10 |
11 | **The following descriptions come from *Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...***
12 |
13 | ##### Introduction:
14 | Labeled LDA is a topic model that constrains Latent Dirichlet Allocation by defining a one-to-one correspondence between LDA’s latent topics and user tags.
15 | This allows Labeled LDA to learn word–tag (topic) correspondences directly.
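In this implementation, inference is carried out with collapsed Gibbs sampling (next section). As a sketch, written in the notation of the class docstring in `model/labeled_lda.py` (the exact equation from the paper is shown as an image below), each word occurrence is reassigned a topic with probability

$$ p(z_{m,n} = k \mid \mathbf{z}_{\neg(m,n)}, \mathbf{w}) \;\propto\; \frac{n^{(t)}_{k,\neg(m,n)} + \eta_t}{\sum_{t'} \big( n^{(t')}_{k,\neg(m,n)} + \eta_{t'} \big)} \cdot \big( n^{(k)}_{m,\neg(m,n)} + \alpha_k \Lambda_{m,k} \big) $$

where $n^{(t)}_{k}$ counts how often term $t$ is assigned to topic $k$ (`Topic2TermCount`), $n^{(k)}_{m}$ counts how often topic $k$ occurs in document $m$ (`Doc2TopicCount`), both counted without the current assignment, and $\Lambda_{m,k} \in \{0, 1\}$ restricts the candidate topics to the labels of document $m$.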
16 |
17 | ##### Gibbs sampling:
18 | * Graphical model of Labeled LDA:
19 |
20 |   ![Graphical model of Labeled LDA](assets/graphical-of-labeled-lda.png)
21 |
22 |
23 | * Generative process for Labeled LDA:
24 |
25 |   ![Generative process for Labeled LDA](assets/generative-process-for-labeled-lda.png)
26 |
27 | * Gibbs sampling equation:
28 |
29 |   ![Gibbs sampling equation](assets/gibbs-sampling-equation.png)
30 |
31 | ### Usage
32 | * create a new LLDA model
33 | * training
34 | * check convergence (is_convergent)
35 | * update
36 | * inference
37 | * save model to disk
38 | * load model from disk
39 | * get the top-k terms of a target topic
40 |
41 |
42 | ### Example
43 | ```
44 | # @source code: example/example.py
45 |
46 | import sys
47 | sys.path.append('../')
48 | import model.labeled_lda as llda
49 |
50 | # initialize data
51 | labeled_documents = [("example example example example example"*10, ["example"]),
52 |                      ("test llda model test llda model test llda model"*10, ["test", "llda_model"]),
53 |                      ("example test example test example test example test"*10, ["example", "test"]),
54 |                      ("good perfect good good perfect good good perfect good "*10, ["positive"]),
55 |                      ("bad bad down down bad bad down"*10, ["negative"])]
56 |
57 | # new a Labeled LDA model
58 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector="50_div_K", eta_vector=0.001)
59 | # llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.02, eta_vector=0.002)
60 | llda_model = llda.LldaModel(labeled_documents=labeled_documents, alpha_vector=0.01)
61 | print(llda_model)
62 |
63 | # training
64 | # llda_model.training(iteration=10, log=True)
65 | while True:
66 |     print("iteration %s sampling..." % (llda_model.iteration + 1))
67 |     llda_model.training(1)
68 |     print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
69 |     print("delta beta: %s" % llda_model.delta_beta)
70 |     if llda_model.is_convergent(method="beta", delta=0.01):
71 |         break
72 |
73 | # update
74 | print("before updating: ", llda_model)
75 | update_labeled_documents = [("new example test example test example test example test", ["example", "test"])]
76 | llda_model.update(labeled_documents=update_labeled_documents)
77 | print("after updating: ", llda_model)
78 |
79 | # train again
80 | # llda_model.training(iteration=10, log=True)
81 | while True:
82 |     print("iteration %s sampling..." % (llda_model.iteration + 1))
83 |     llda_model.training(1)
84 |     print("after iteration: %s, perplexity: %s" % (llda_model.iteration, llda_model.perplexity()))
85 |     print("delta beta: %s" % llda_model.delta_beta)
86 |     if llda_model.is_convergent(method="beta", delta=0.01):
87 |         break
88 |
89 | # inference
90 | # note: the resulting topics may differ between runs, because Gibbs sampling is a randomized algorithm
91 | document = "example llda model example example good perfect good perfect good perfect" * 100
92 |
93 | topics = llda_model.inference(document=document, iteration=100, times=10)
94 | print(topics)
95 |
96 | # perplexity
97 | # calculate perplexity on test data
98 | perplexity = llda_model.perplexity(documents=["example example example example example",
99 |                                               "test llda model test llda model test llda model",
100 |                                              "example test example test example test example test",
101 |                                              "good perfect good good perfect good good perfect good",
102 |                                              "bad bad down down bad bad down"],
103 |                                    iteration=30,
104 |                                    times=10)
105 | print("perplexity on test data: %s" % perplexity)
106 | # calculate perplexity on training data
107 | print("perplexity on training data: %s" % llda_model.perplexity())
108 |
109 | # save to disk
110 | save_model_dir = "../data/model"
111 | # llda_model.save_model_to_dir(save_model_dir, save_derivative_properties=True)
112 | llda_model.save_model_to_dir(save_model_dir)
113 |
114 | # load from disk
115 | llda_model_new = llda.LldaModel()
116 | llda_model_new.load_model_from_dir(save_model_dir, load_derivative_properties=False)
117 | print("llda_model_new", llda_model_new)
118 | print("llda_model", llda_model)
119 | print("Top-5 terms of topic 'negative': ", llda_model.top_terms_of_topic("negative", 5, False))
120 | print("Doc-Topic Matrix: \n", llda_model.theta)
121 | print("Topic-Term Matrix: \n", llda_model.beta)
122 | ```
123 |
124 |
125 |
--------------------------------------------------------------------------------
/model/labeled_lda.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | # @Author: Jiahong Zhou
4 | # @Date: 2018-10-20
5 | # @Email: JoeZJiahong@gmail.com
6 | # implementation of the L-LDA model (Labeled Latent Dirichlet Allocation)
7 | # References:
8 | #   i.  Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage...
9 | #   ii. Parameter estimation for text analysis, Gregor Heinrich.
10 | #   iii. Latent Dirichlet Allocation, David M. Blei, Andrew Y. Ng...
11 | import numpy
12 | import numpy as np
13 | import os
14 | import json
15 | from concurrent import futures
16 | try:
17 |     import copy_reg
18 | except Exception:
19 |     import copyreg as copy_reg
20 |
21 | import types
22 |
23 |
24 | class NpEncoder(json.JSONEncoder):
25 |     def default(self, obj):
26 |         if isinstance(obj, np.integer):
27 |             return int(obj)
28 |         elif isinstance(obj, np.floating):
29 |             return float(obj)
30 |         elif isinstance(obj, np.ndarray):
31 |             return obj.tolist()
32 |         else:
33 |             return super(NpEncoder, self).default(obj)
34 |
35 |
36 | class LldaModel:
37 |     """
38 |     L-LDA (Labeled Latent Dirichlet Allocation Model)
39 |
40 |     @field K: the number of topics
41 |     @field alpha_vector: the prior distribution of theta_m
42 |                          str("50_div_K"): means [50/K, 50/K, ...],
43 |                          this value comes from Parameter estimation for text analysis, Gregor Heinrich.
44 |                          int or float: means [alpha_vector, alpha_vector, ...]
45 |                          None: means [0.001, 0.001, ...]
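                         (worked example of the rules above: with K == 5 topics, "50_div_K" resolves to
                          [10.0, 10.0, 10.0, 10.0, 10.0], and alpha_vector=0.01 resolves to [0.01, 0.01, 0.01, 0.01, 0.01])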
46 | @field eta_vector: the prior distribution of beta_k 47 | int or float: means [eta_vector, eta_vector, ...] 48 | None: means [0.001, 0.001, ...] 49 | @field terms: a list of the all terms 50 | @field vocabulary: a dict of , vocabulary[terms[id]] == id 51 | @field topics: a list of the all topics 52 | @field topic_vocabulary: a dict of , topic_vocabulary[topics[id]] == id 53 | @field W: the corpus, a list of terms list, 54 | W[m] is the document vector, W[m][n] is the id of the term 55 | @field Z: the topic corpus, just same as W, 56 | except Z[m][n] is the id of the topic of the term 57 | @field M: the number of documents 58 | @field T: the number of terms 59 | @field WN: the number of all words in W 60 | @field LN: the number of all original labels 61 | @field iteration: the times of iteration 62 | @field all_perplexities: a list of all perplexities (one training iteration one perplexity) 63 | @field last_beta: the parameter `beta` of last training iteration 64 | @field Lambda: a matrix, shape is M * K, 65 | Lambda[m][k] is 1 means topic k is a label of document m 66 | 67 | # derivative fields 68 | @field Doc2TopicCount: a matrix, shape is M * K, 69 | Doc2TopicCount[m][k] is the times of topic k sampled in document m 70 | @field Topic2TermCount: a matrix, shape is K * T, 71 | Topic2TermCount[k][t] is the times of term t generated from topic k 72 | @field Doc2TopicCountSum: a vector, shape is M, self.Doc2TopicCount.sum(axis=1) 73 | Doc2TopicCountSum[m] is the count of all topic, 74 | i.e., Doc2TopicCountSum[m] is the number of words in document m 75 | @field alpha_vector_Lambda: a matrix, self.alpha_vector * self.Lambda 76 | @field alpha_vector_Lambda_sum: a vector, self.alpha_vector_Lambda.sum(axis=1) 77 | @field eta_vector_sum: float value, sum(self.eta_vector) 78 | @field Topic2TermCountSum: a vector, self.Topic2TermCount.sum(axis=1) 79 | 80 | """ 81 | def __init__(self, alpha_vector="50_div_K", eta_vector=None, labeled_documents=None): 82 | """ 83 | 84 | :param alpha_vector: the prior distribution of theta_m 85 | :param eta_vector: the prior distribution of beta_k 86 | :param labeled_documents: a iterable of tuple(doc, iterable of label), contains all doc and their labels 87 | """ 88 | self.alpha_vector = alpha_vector 89 | self.eta_vector = eta_vector 90 | self.terms = [] 91 | self.vocabulary = {} 92 | self.topics = [] 93 | self.topic_vocabulary = {} 94 | self.W = [] 95 | self.Z = [] 96 | self.K = 0 97 | self.M = 0 98 | self.T = 0 99 | self.WN = 0 100 | self.LN = 0 101 | self.iteration = 0 102 | self.all_perplexities = [] 103 | self.last_beta = None 104 | self.Lambda = None 105 | 106 | # derivative fields: 107 | # the following fields could reduce operations in training and inference 108 | # it is not necessary to save them to file, we can recover them by other fields 109 | 110 | self.Doc2TopicCount = None 111 | self.Topic2TermCount = None 112 | # self.Doc2TopicCountSum = None 113 | self.alpha_vector_Lambda = None 114 | # self.alpha_vector_Lambda_sum = None 115 | self.eta_vector_sum = 0.0 116 | self.Topic2TermCountSum = None 117 | 118 | if labeled_documents is not None: 119 | self._load_labeled_documents(labeled_documents) 120 | 121 | pass 122 | 123 | def _initialize_derivative_fields(self): 124 | """ 125 | initialize derivative fields 126 | :return: None 127 | """ 128 | # TODO: Doc2TopicCount could be reduced to a smaller matrix, 129 | # TODO: because some vector in Doc2TopicCount will always been 0 130 | self.Doc2TopicCount = np.zeros((self.M, self.K), dtype=int) 131 | 
self.Topic2TermCount = np.zeros((self.K, self.T), dtype=int) 132 | for m in range(self.M): 133 | # print self.Z[m] 134 | for t, z in zip(self.W[m], self.Z[m]): 135 | k = z 136 | # print "[m=%s, k=%s]" % (m, k) 137 | # print "[k=%s, t=%s]" % (k, t) 138 | self.Doc2TopicCount[m, k] += 1 139 | self.Topic2TermCount[k, t] += 1 140 | 141 | # self.Doc2TopicCountSum = self.Doc2TopicCount.sum(axis=1) 142 | self.alpha_vector_Lambda = self.alpha_vector * self.Lambda 143 | # self.alpha_vector_Lambda_sum = self.alpha_vector_Lambda.sum(axis=1) 144 | self.eta_vector_sum = sum(self.eta_vector) 145 | self.Topic2TermCountSum = self.Topic2TermCount.sum(axis=1) 146 | 147 | def _load_labeled_documents(self, labeled_documents): 148 | """ 149 | input labeled corpus, which contains all documents and their corresponding labels 150 | :param labeled_documents: a iterable of tuple(doc, iterable of label), contains all doc and their labels 151 | :return: 152 | """ 153 | # self.documents = [] 154 | all_labels = [] 155 | all_words = [] 156 | doc_corpus = [] 157 | labels_corpus = [] 158 | for document, labels in labeled_documents: 159 | document = LldaModel._document_preprocess(document) 160 | doc_words = document.split() 161 | doc_corpus.append(doc_words) 162 | if labels is None: 163 | labels = [] 164 | labels.append("common_topic") 165 | labels_corpus.append(labels) 166 | all_words.extend(doc_words) 167 | all_labels.extend(labels) 168 | self.terms = list(set(all_words)) 169 | self.vocabulary = {term: index for index, term in enumerate(self.terms)} 170 | self.topics = list(set(all_labels)) 171 | self.topic_vocabulary = {topic: index for index, topic in enumerate(self.topics)} 172 | self.K = len(self.topics) 173 | self.T = len(self.terms) 174 | self.W = [[self.vocabulary[term] for term in doc_words] for doc_words in doc_corpus] 175 | self.M = len(self.W) 176 | self.WN = len(all_words) 177 | # we appended topic "common_topic" to each doc at the beginning 178 | # so we need minus the number of "common_topic" 179 | # LN is the number of original labels 180 | self.LN = len(all_labels) - self.M 181 | 182 | self.Lambda = np.zeros((self.M, self.K), dtype=float) 183 | for m in range(self.M): 184 | if len(labels_corpus[m]) == 1: 185 | labels_corpus[m] = self.topics 186 | for label in labels_corpus[m]: 187 | k = self.topic_vocabulary[label] 188 | self.Lambda[m, k] = 1.0 189 | 190 | if self.alpha_vector is None: 191 | self.alpha_vector = [0.001 for _ in range(self.K)] 192 | elif type(self.alpha_vector) is str and self.alpha_vector == "50_div_K": 193 | self.alpha_vector = [50.0/self.K for _ in range(self.K)] 194 | elif type(self.alpha_vector) is float or type(self.alpha_vector) is int: 195 | self.alpha_vector = [self.alpha_vector for _ in range(self.K)] 196 | else: 197 | message = "error alpha_vector: %s" % self.alpha_vector 198 | raise Exception(message) 199 | 200 | if self.eta_vector is None: 201 | self.eta_vector = [0.001 for _ in range(self.T)] 202 | elif type(self.eta_vector) is float or type(self.eta_vector) is int: 203 | self.eta_vector = [self.eta_vector for _ in range(self.T)] 204 | else: 205 | message = "error eta_vector: %s" % self.eta_vector 206 | raise Exception(message) 207 | 208 | self.Z = [] 209 | for m in range(self.M): 210 | # print "self.Lambda[m]: ", self.Lambda[m] 211 | numerator_vector = self.Lambda[m] * self.alpha_vector 212 | p_vector = 1.0 * numerator_vector / sum(numerator_vector) 213 | # print p_vector 214 | # print "p_vector: ", p_vector 215 | # z_vector is a vector of a document, 216 | # just like [2, 3, 
6, 0], which means this doc have 4 word and them generated 217 | # from the 2nd, 3rd, 6th, 0th topic, respectively 218 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in range(len(self.W[m]))] 219 | self.Z.append(z_vector) 220 | 221 | self._initialize_derivative_fields() 222 | pass 223 | 224 | @staticmethod 225 | def _multinomial_sample(p_vector, random_state=None): 226 | """ 227 | sample a number from multinomial distribution 228 | :param p_vector: the probabilities 229 | :return: a int value 230 | """ 231 | if random_state is not None: 232 | return random_state.multinomial(1, p_vector).argmax() 233 | return np.random.multinomial(1, p_vector).argmax() 234 | 235 | def _gibbs_sample_training(self): 236 | """ 237 | sample a topic(k) for each word(t) of all documents, Generate a new matrix Z 238 | :return: None 239 | """ 240 | # TODO: the operations of addition and multiplication could be reduced, because some 241 | self.last_beta = self.beta 242 | count = 0 243 | for m in range(self.M): 244 | 245 | # doc_m_eta_vector = self.eta_vector 246 | # doc_m_alpha_vector = self.alpha_vector * self.Lambda[m] 247 | doc_m_alpha_vector = self.alpha_vector_Lambda[m] 248 | # assert (doc_m_alpha_vector == self.alpha_vector_Lambda[m]).all() 249 | 250 | # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector) 251 | # sum_doc_m_alpha_vector = self.alpha_vector_Lambda_sum[m] 252 | # assert sum_doc_m_alpha_vector == self.alpha_vector_Lambda_sum[m] 253 | 254 | for t, z, n in zip(self.W[m], self.Z[m], range(len(self.W[m]))): 255 | k = z 256 | self.Doc2TopicCount[m, k] -= 1 257 | self.Topic2TermCount[k, t] -= 1 258 | self.Topic2TermCountSum[k] -= 1 259 | 260 | numerator_theta_vector = self.Doc2TopicCount[m] + doc_m_alpha_vector 261 | # denominator_theta = sum(self.Doc2TopicCount[m]) + sum_doc_m_alpha_vector 262 | # denominator_theta = self.Doc2TopicCountSum[m]-1 + sum_doc_m_alpha_vector 263 | # assert sum(self.Doc2TopicCount[m]) == self.Doc2TopicCountSum[m]-1 264 | 265 | numerator_beta_vector = self.Topic2TermCount[:, t] + self.eta_vector[t] 266 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector) 267 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + self.eta_vector_sum 268 | denominator_beta = self.Topic2TermCountSum + self.eta_vector_sum 269 | # assert (self.Topic2TermCount.sum(axis=1) == self.Topic2TermCountSum).all() 270 | # assert sum(self.eta_vector) == self.eta_vector_sum 271 | 272 | beta_vector = 1.0 * numerator_beta_vector / denominator_beta 273 | # theta_vector = 1.0 * numerator_theta_vector / denominator_theta 274 | # denominator_theta is independent with t and k, so denominator could be any value except 0 275 | # will set denominator_theta as 1.0 276 | theta_vector = numerator_theta_vector 277 | 278 | p_vector = beta_vector * theta_vector 279 | # print p_vector 280 | """ 281 | for some special document m (only have one word) p_vector may be zero here, sum(p_vector) will be zero too 282 | 1.0 * p_vector / sum(p_vector) will be [...nan...] 
283 | so we should avoid inputting the special document 284 | """ 285 | p_vector = 1.0 * p_vector / sum(p_vector) 286 | # print p_vector 287 | sample_z = LldaModel._multinomial_sample(p_vector) 288 | self.Z[m][n] = sample_z 289 | 290 | k = sample_z 291 | self.Doc2TopicCount[m, k] += 1 292 | self.Topic2TermCount[k, t] += 1 293 | self.Topic2TermCountSum[k] += 1 294 | count += 1 295 | assert count == self.WN 296 | print("gibbs sample count: ", self.WN) 297 | self.iteration += 1 298 | self.all_perplexities.append(self.perplexity()) 299 | pass 300 | 301 | def _gibbs_sample_inference(self, term_vector, iteration=300, times=10): 302 | """ 303 | inference with gibbs sampling 304 | :param term_vector: the term vector of document 305 | :param iteration: the times of iteration until Markov chain converges 306 | :param times: the number of samples of the target distribution 307 | (one whole iteration(sample for all words) generates a sample, the ) 308 | #times = #samples, 309 | after Markov chain converges, the next #times samples as the samples of the target distribution, 310 | we drop the samples before the Markov chain converges, 311 | the result is the average value of #times samples 312 | :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 313 | theta_new, a theta_vector, the doc-topic distribution 314 | """ 315 | doc_topic_count = np.zeros(self.K, dtype=int) 316 | accumulated_doc_topic_count = np.zeros(self.K, dtype=int) 317 | p_vector = np.ones(self.K, dtype=int) 318 | p_vector = p_vector * 1.0 / sum(p_vector) 319 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in term_vector] 320 | for n, t in enumerate(term_vector): 321 | k = z_vector[n] 322 | doc_topic_count[k] += 1 323 | self.Topic2TermCount[k, t] += 1 324 | self.Topic2TermCountSum[k] += 1 325 | 326 | # sum_doc_topic_count = sum(doc_topic_count) 327 | doc_m_alpha_vector = self.alpha_vector 328 | # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector) 329 | for i in range(iteration+times): 330 | for n, t in enumerate(term_vector): 331 | k = z_vector[n] 332 | doc_topic_count[k] -= 1 333 | self.Topic2TermCount[k, t] -= 1 334 | self.Topic2TermCountSum[k] -= 1 335 | 336 | numerator_theta_vector = doc_topic_count + doc_m_alpha_vector 337 | # denominator_theta = sum_doc_topic_count - 1 + sum_doc_m_alpha_vector 338 | 339 | numerator_beta_vector = self.Topic2TermCount[:, t] + self.eta_vector[t] 340 | # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector) 341 | denominator_beta = self.Topic2TermCountSum + self.eta_vector_sum 342 | 343 | beta_vector = 1.0 * numerator_beta_vector / denominator_beta 344 | # theta_vector = 1.0 numerator_theta_vector / denominator_theta 345 | # denominator_theta is independent with t and k, so denominator could be any value except 0 346 | # will set denominator_theta as 1.0 347 | theta_vector = numerator_theta_vector 348 | 349 | p_vector = beta_vector * theta_vector 350 | # print p_vector 351 | p_vector = 1.0 * p_vector / sum(p_vector) 352 | # print p_vector 353 | sample_z = LldaModel._multinomial_sample(p_vector) 354 | z_vector[n] = sample_z 355 | 356 | k = sample_z 357 | doc_topic_count[k] += 1 358 | self.Topic2TermCount[k, t] += 1 359 | self.Topic2TermCountSum[k] += 1 360 | if i >= iteration: 361 | accumulated_doc_topic_count += doc_topic_count 362 | # reset self.Topic2TermCount 363 | for n, t in enumerate(term_vector): 364 | k = z_vector[n] 365 | self.Topic2TermCount[k, t] -= 1 366 | self.Topic2TermCountSum[k] -= 1 367 | 368 | 
numerator_theta_vector = accumulated_doc_topic_count/times + doc_m_alpha_vector 369 | # denominator_theta = sum(doc_topic_count) + sum(doc_m_alpha_vector) 370 | denominator_theta = sum(numerator_theta_vector) 371 | theta_new = 1.0 * numerator_theta_vector / denominator_theta 372 | return theta_new 373 | 374 | # def _gibbs_sample_inference_multi_processors(self, term_vector, iteration=30): 375 | # """ 376 | # inference with gibbs sampling 377 | # :param term_vector: the term vector of document 378 | # :param iteration: the times of iteration 379 | # :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 380 | # theta_new, a theta_vector, the doc-topic distribution 381 | # """ 382 | # # print("gibbs sample inference iteration: %s" % iteration) 383 | # # TODO: complete multi-processors code here 384 | # # we copy all the shared variables may be modified on runtime 385 | # random_state = np.random.RandomState() 386 | # topic2term_count = self.Topic2TermCount.copy() 387 | # topic2term_count_sum = self.Topic2TermCountSum.copy() 388 | # 389 | # doc_topic_count = np.zeros(self.K, dtype=int) 390 | # p_vector = np.ones(self.K, dtype=int) 391 | # p_vector = p_vector * 1.0 / sum(p_vector) 392 | # z_vector = [LldaModel._multinomial_sample(p_vector, random_state=random_state) for _ in term_vector] 393 | # for n, t in enumerate(term_vector): 394 | # k = z_vector[n] 395 | # doc_topic_count[k] += 1 396 | # topic2term_count[k, t] += 1 397 | # topic2term_count_sum[k] += 1 398 | # 399 | # # sum_doc_topic_count = sum(doc_topic_count) 400 | # doc_m_alpha_vector = self.alpha_vector 401 | # # sum_doc_m_alpha_vector = sum(doc_m_alpha_vector) 402 | # for i in range(iteration): 403 | # for n, t in enumerate(term_vector): 404 | # k = z_vector[n] 405 | # doc_topic_count[k] -= 1 406 | # topic2term_count[k, t] -= 1 407 | # topic2term_count_sum[k] -= 1 408 | # 409 | # numerator_theta_vector = doc_topic_count + doc_m_alpha_vector 410 | # # denominator_theta = sum_doc_topic_count - 1 + sum_doc_m_alpha_vector 411 | # 412 | # numerator_beta_vector = topic2term_count[:, t] + self.eta_vector[t] 413 | # # denominator_beta = self.Topic2TermCount.sum(axis=1) + sum(self.eta_vector) 414 | # denominator_beta = topic2term_count_sum + self.eta_vector_sum 415 | # 416 | # beta_vector = 1.0 * numerator_beta_vector / denominator_beta 417 | # # theta_vector = 1.0 numerator_theta_vector / denominator_theta 418 | # # denominator_theta is independent with t and k, so denominator could be any value except 0 419 | # # will set denominator_theta as 1.0 420 | # theta_vector = numerator_theta_vector 421 | # 422 | # p_vector = beta_vector * theta_vector 423 | # # print p_vector 424 | # p_vector = 1.0 * p_vector / sum(p_vector) 425 | # # print p_vector 426 | # sample_z = LldaModel._multinomial_sample(p_vector, random_state) 427 | # z_vector[n] = sample_z 428 | # 429 | # k = sample_z 430 | # doc_topic_count[k] += 1 431 | # topic2term_count[k, t] += 1 432 | # topic2term_count_sum[k] += 1 433 | # # reset self.Topic2TermCount 434 | # # for n, t in enumerate(term_vector): 435 | # # k = z_vector[n] 436 | # # self.Topic2TermCount[k, t] -= 1 437 | # # self.Topic2TermCountSum[k] -= 1 438 | # 439 | # numerator_theta_vector = doc_topic_count + doc_m_alpha_vector 440 | # # denominator_theta = sum(doc_topic_count) + sum(doc_m_alpha_vector) 441 | # denominator_theta = sum(numerator_theta_vector) 442 | # theta_new = 1.0 * numerator_theta_vector / denominator_theta 443 | # return theta_new 444 | 445 | def 
training(self, iteration=10, log=False): 446 | """ 447 | training this model with gibbs sampling 448 | :param log: print perplexity after every gibbs sampling if True 449 | :param iteration: the times of iteration 450 | :return: None 451 | """ 452 | for i in range(iteration): 453 | if log: 454 | print("after iteration: %s, perplexity: %s" % (self.iteration, self.perplexity())) 455 | self._gibbs_sample_training() 456 | pass 457 | 458 | def inference(self, document, iteration=30, times=10): 459 | # TODO: inference of a document 460 | """ 461 | inference for one document 462 | :param document: some sentence like "this is a method for inference" 463 | :param times: the number of samples of the target distribution 464 | (one whole iteration(sample for all words) generates a sample, the ) 465 | #times = #samples, 466 | after Markov chain converges, the next #times samples as the samples of the target distribution, 467 | we drop the samples before the Markov chain converges, 468 | the result is the average value of #times samples 469 | :param iteration: the times of iteration until Markov chain converges 470 | :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 471 | theta_new, a theta_vector, the doc-topic distribution 472 | """ 473 | document = LldaModel._document_preprocess(document) 474 | doc_words = document.split() 475 | term_vector = [self.vocabulary[word] for word in doc_words if word in self.vocabulary] 476 | theta_new = self._gibbs_sample_inference(term_vector, iteration=iteration, times=times) 477 | doc_topic_new = [(self.topics[k], probability) for k, probability in enumerate(theta_new)] 478 | sorted_doc_topic_new = sorted(doc_topic_new, 479 | key=lambda topic_probability: topic_probability[1], 480 | reverse=True) 481 | return sorted_doc_topic_new 482 | pass 483 | 484 | # def inference_multi_processors(self, document, iteration=30, times=8, max_workers=8): 485 | # # TODO: inference of a document with multi processors 486 | # """ 487 | # inference for one document 488 | # :param times: the times of gibbs sampling, the result is the average value of all times(gibbs sampling) 489 | # :param iteration: the times of iteration 490 | # :param document: some sentence like "this is a method for inference" 491 | # :param max_workers: the max number of processors(workers) 492 | # :return: theta_new, a vector, theta_new[k] is the probability of doc(term_vector) to be generated from topic k 493 | # theta_new, a theta_vector, the doc-topic distribution 494 | # """ 495 | # 496 | # def _pickle_method(m): 497 | # if m.im_self is None: 498 | # return getattr, (m.im_class, m.im_func.func_name) 499 | # else: 500 | # return getattr, (m.im_self, m.im_func.func_name) 501 | # copy_reg.pickle(types.MethodType, _pickle_method) 502 | # 503 | # words = document.split() 504 | # term_vector = [self.vocabulary[word] for word in words if word in self.vocabulary] 505 | # term_vectors = [term_vector for _ in range(times)] 506 | # iterations = [iteration for _ in range(times)] 507 | # 508 | # with futures.ProcessPoolExecutor(max_workers) as executor: 509 | # # print "executor.map" 510 | # res = executor.map(self._gibbs_sample_inference_multi_processors, term_vectors, iterations) 511 | # theta_new_accumulation = np.zeros(self.K, float) 512 | # for theta_new in res: 513 | # theta_new_accumulation += theta_new 514 | # theta_new = 1.0 * theta_new_accumulation / times 515 | # # print "avg: \n", theta_new 516 | # doc_topic_new = [(self.topics[k], probability) for k, 
probability in enumerate(theta_new)] 517 | # sorted_doc_topic_new = sorted(doc_topic_new, 518 | # key=lambda topic_probability: topic_probability[1], 519 | # reverse=True) 520 | # return sorted_doc_topic_new 521 | # pass 522 | 523 | def beta_k(self, k): 524 | """ 525 | topic-term distribution 526 | beta_k[t] is the probability of term t(word) to be generated from topic k 527 | :return: a vector, shape is T 528 | """ 529 | numerator_vector = self.Topic2TermCount[k] + self.eta_vector 530 | # denominator = sum(self.Topic2TermCount[k]) + sum(self.eta_vector) 531 | denominator = sum(numerator_vector) 532 | return 1.0 * numerator_vector / denominator 533 | 534 | def theta_m(self, m): 535 | """ 536 | doc-topic distribution 537 | theta_m[k] is the probability of doc m to be generated from topic k 538 | :return: a vector, shape is K 539 | """ 540 | numerator_vector = self.Doc2TopicCount[m] + self.alpha_vector * self.Lambda[m] 541 | # denominator = sum(self.Doc2TopicCount[m]) + sum(self.alpha_vector * self.Lambda[m]) 542 | denominator = sum(numerator_vector) 543 | return 1.0 * numerator_vector / denominator 544 | 545 | @property 546 | def beta(self): 547 | """ 548 | This name "beta" comes from 549 | "Labeled LDA: A supervised topic model for credit attribution in multi-labeled corpora, Daniel Ramage..." 550 | topic-term distribution 551 | beta[k, t] is the probability of term t(word) to be generated from topic k 552 | :return: a matrix, shape is K * T 553 | """ 554 | numerator_matrix = self.Topic2TermCount + self.eta_vector 555 | # column vector 556 | # denominator_vector = self.Topic2TermCount.sum(axis=1).reshape(self.K, 1) + sum(self.eta_vector) 557 | denominator_vector = numerator_matrix.sum(axis=1).reshape(self.K, 1) 558 | return 1.0 * numerator_matrix / denominator_vector 559 | 560 | pass 561 | 562 | @property 563 | def theta(self): 564 | """ 565 | doc-topic distribution 566 | theta[m, k] is the probability of doc m to be generated from topic k 567 | :return: a matrix, shape is M * K 568 | """ 569 | numerator_matrix = self.Doc2TopicCount + self.alpha_vector * self.Lambda 570 | denominator_vector = numerator_matrix.sum(axis=1).reshape(self.M, 1) 571 | # column vector 572 | return 1.0 * numerator_matrix / denominator_vector 573 | pass 574 | 575 | def log_perplexity(self, documents=None, iteration=30, times=10): 576 | """ 577 | log perplexity of LDA topic model, use the training data if documents is None 578 | Reference: Parameter estimation for text analysis, Gregor Heinrich. 
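        In terms of the fields above (this simply restates the loop below):
            log_perplexity = -(1 / WN) * sum_m sum_{t in W[m]} log( dot(theta[m], beta[:, t]) )
        i.e., the negative average per-word log-likelihood; perplexity() returns exp(log_perplexity).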
579 | :param: documents: test set 580 | :return: a float value 581 | """ 582 | beta, theta, W, WN, log_likelihood = self.beta, None, None, None, 0 583 | # theta is the doc-topic distribution matrix 584 | # W is the list of term_vector, each term_vector represents a document 585 | # WN is the number of all word in W 586 | # difference test set means difference theta, W, WN 587 | 588 | if not documents: 589 | theta = self.theta 590 | W = self.W 591 | WN = self.WN 592 | else: 593 | # generate the term_vector of document 594 | documents = [LldaModel._document_preprocess(document) for document in documents] 595 | test_corpus = [document.split() for document in documents] 596 | W = [[self.vocabulary[term] for term in doc_words if term in self.vocabulary] for doc_words in test_corpus] 597 | WN = sum([len(term_vector) for term_vector in W]) 598 | theta = [] 599 | for term_vector in W: 600 | # sample on term_vector until Markov chain converges 601 | theta_new = self._gibbs_sample_inference(term_vector, iteration=iteration, times=times) 602 | theta.append(theta_new) 603 | 604 | # caculate the log_perplexity of current documents 605 | for m, theta_m in enumerate(theta): 606 | for t in W[m]: 607 | likelihood_t = np.inner(theta_m, beta[:, t]) 608 | log_likelihood += -np.log(likelihood_t) 609 | return 1.0 * log_likelihood / WN 610 | 611 | def perplexity(self, documents=None, iteration=30, times=10): 612 | """ 613 | perplexity of LDA topic model, we use the training data if documents is None 614 | Reference: Parameter estimation for text analysis, Gregor Heinrich. 615 | :param: documents: test set 616 | :return: a float value, perplexity = exp{log_perplexity} 617 | """ 618 | return np.exp(self.log_perplexity(documents=documents, iteration=iteration, times=times)) 619 | 620 | def __repr__(self): 621 | return "\nLabeled-LDA Model:\n" \ 622 | "\tK = %s\n" \ 623 | "\tM = %s\n" \ 624 | "\tT = %s\n" \ 625 | "\tWN = %s\n" \ 626 | "\tLN = %s\n" \ 627 | "\talpha = %s\n" \ 628 | "\teta = %s\n" \ 629 | "\tperplexity = %s\n" \ 630 | "\t" % (self.K, self.M, self.T, self.WN, self.LN, self.alpha_vector[0], self.eta_vector[0], 631 | self.perplexity()) 632 | pass 633 | 634 | class SaveModel: 635 | def __init__(self, save_model_dict=None): 636 | self.alpha_vector = [] 637 | self.eta_vector = [] 638 | self.terms = [] 639 | self.vocabulary = {} 640 | self.topics = [] 641 | self.topic_vocabulary = {} 642 | self.W = [] 643 | self.Z = [] 644 | self.K = 0 645 | self.M = 0 646 | self.T = 0 647 | self.WN = 0 648 | self.LN = 0 649 | self.iteration = 0 650 | 651 | # the following fields cannot be dumped into json file 652 | # we need write them with np.save() and read them with np.load() 653 | # self.Doc2TopicCount = None 654 | # self.Topic2TermCount = None 655 | self.Lambda = None 656 | 657 | if save_model_dict is not None: 658 | self.__dict__ = save_model_dict 659 | pass 660 | 661 | @staticmethod 662 | def _document_preprocess(document): 663 | """ 664 | process document before inputting it into the model(both training, update and inference) 665 | :param document: the target document 666 | :return: the word we change 667 | """ 668 | document = document.lower() 669 | return document 670 | 671 | @staticmethod 672 | def _read_object_from_file(file_name): 673 | """ 674 | read an object from json file 675 | :param file_name: json file name 676 | :return: None if file doesn't exist or can not convert to an object by json, else return the object 677 | """ 678 | if os.path.exists(file_name) is False: 679 | print ("Error read path: [%s]" % 
file_name) 680 | return None 681 | with open(file_name, 'r') as f: 682 | try: 683 | obj = json.load(f) 684 | except Exception: 685 | print ("Error json: [%s]" % f.read()[0:10]) 686 | return None 687 | return obj 688 | 689 | @staticmethod 690 | def _write_object_to_file(file_name, target_object): 691 | """ 692 | write the object to file with json(if the file exists, this function will overwrite it) 693 | :param file_name: the name of new file 694 | :param target_object: the target object for writing 695 | :return: True if success else False 696 | """ 697 | dirname = os.path.dirname(file_name) 698 | LldaModel._find_and_create_dirs(dirname) 699 | try: 700 | with open(file_name, "w") as f: 701 | json.dump(target_object, f, skipkeys=False, ensure_ascii=False, check_circular=True, allow_nan=True, 702 | cls=NpEncoder, indent=True, separators=None, default=None, sort_keys=False) 703 | except Exception as e: 704 | message = "Write [%s...] to file [%s] error: json.dump error" % (str(target_object)[0:10], file_name) 705 | print ("%s: %s" % (e, message)) 706 | return False 707 | else: 708 | # print ("Write %s" % file_name) 709 | return True 710 | 711 | @staticmethod 712 | def _find_and_create_dirs(dir_name): 713 | """ 714 | find dir, create it if it doesn't exist 715 | :param dir_name: the name of dir 716 | :return: the name of dir 717 | """ 718 | if os.path.exists(dir_name) is False: 719 | os.makedirs(dir_name) 720 | return dir_name 721 | 722 | def save_model_to_dir(self, dir_name, save_derivative_properties=False): 723 | """ 724 | save model to directory dir_name 725 | :param save_derivative_properties: save derivative properties if True 726 | some properties are not necessary save to disk, they could be derived from some basic properties, 727 | we call they derivative properties. 
728 |             to save derivative properties to disk:
729 |             it will reduce the time needed to load the model from disk (properties are read directly instead of recomputed),
730 |             but it will take up more disk space
731 |         :param dir_name: the target directory name
732 |         :return: None
733 |         """
734 |         save_model = LldaModel.SaveModel()
735 |         save_model.alpha_vector = self.alpha_vector
736 |         save_model.eta_vector = self.eta_vector
737 |         save_model.terms = self.terms
738 |         save_model.vocabulary = self.vocabulary
739 |         save_model.topics = self.topics
740 |         save_model.topic_vocabulary = self.topic_vocabulary
741 |         save_model.W = self.W
742 |         save_model.Z = self.Z
743 |         save_model.K = self.K
744 |         save_model.M = self.M
745 |         save_model.T = self.T
746 |         save_model.WN = self.WN
747 |         save_model.LN = self.LN
748 |         save_model.iteration = self.iteration
749 |
750 |         save_model_path = os.path.join(dir_name, "llda_model.json")
751 |         LldaModel._write_object_to_file(save_model_path, save_model.__dict__)
752 |
753 |         np.save(os.path.join(dir_name, "Lambda.npy"), self.Lambda)
754 |         # save derivative properties
755 |         if save_derivative_properties:
756 |             np.save(os.path.join(dir_name, "Doc2TopicCount.npy"), self.Doc2TopicCount)
757 |             np.save(os.path.join(dir_name, "Topic2TermCount.npy"), self.Topic2TermCount)
758 |             np.save(os.path.join(dir_name, "alpha_vector_Lambda.npy"), self.alpha_vector_Lambda)
759 |             np.save(os.path.join(dir_name, "eta_vector_sum.npy"), self.eta_vector_sum)
760 |             np.save(os.path.join(dir_name, "Topic2TermCountSum.npy"), self.Topic2TermCountSum)
761 |         pass
762 |
763 |     def load_model_from_dir(self, dir_name, load_derivative_properties=True):
764 |         """
765 |         load model from directory dir_name
766 |         :param load_derivative_properties: load derivative properties from disk if True
767 |         :param dir_name: the target directory name
768 |         :return: None
769 |         """
770 |         save_model_path = os.path.join(dir_name, "llda_model.json")
771 |         save_model_dict = LldaModel._read_object_from_file(save_model_path)
772 |         save_model = LldaModel.SaveModel(save_model_dict=save_model_dict)
773 |         self.alpha_vector = save_model.alpha_vector
774 |         self.eta_vector = save_model.eta_vector
775 |         self.terms = save_model.terms
776 |         self.vocabulary = save_model.vocabulary
777 |         self.topics = save_model.topics
778 |         self.topic_vocabulary = save_model.topic_vocabulary
779 |         self.W = save_model.W
780 |         self.Z = save_model.Z
781 |         self.K = save_model.K
782 |         self.M = save_model.M
783 |         self.T = save_model.T
784 |         self.WN = save_model.WN
785 |         self.LN = save_model.LN
786 |         self.iteration = save_model.iteration
787 |
788 |         self.Lambda = np.load(os.path.join(dir_name, "Lambda.npy"))
789 |
790 |         # load derivative properties
791 |         if load_derivative_properties:
792 |             try:
793 |                 self.Doc2TopicCount = np.load(os.path.join(dir_name, "Doc2TopicCount.npy"))
794 |                 self.Topic2TermCount = np.load(os.path.join(dir_name, "Topic2TermCount.npy"))
795 |                 self.alpha_vector_Lambda = np.load(os.path.join(dir_name, "alpha_vector_Lambda.npy"))
796 |                 self.eta_vector_sum = np.load(os.path.join(dir_name, "eta_vector_sum.npy"))
797 |                 self.Topic2TermCountSum = np.load(os.path.join(dir_name, "Topic2TermCountSum.npy"))
798 |             except (IOError, ValueError) as e:
799 |                 print("%s: load derivative properties failed, initializing them from basic properties" % e)
800 |                 self._initialize_derivative_fields()
801 |         else:
802 |             self._initialize_derivative_fields()
803 |         pass
804 |
805 |     def update(self, labeled_documents=None):
806 |         """
807 |         update the model with additional labeled documents; this is an
incremental update 808 | :return: None 809 | """ 810 | self.all_perplexities = [] 811 | if labeled_documents is None: 812 | pass 813 | 814 | new_labels = [] 815 | new_words = [] 816 | new_doc_corpus = [] 817 | new_labels_corpus = [] 818 | for document, labels in labeled_documents: 819 | document = LldaModel._document_preprocess(document) 820 | doc_words = document.split() 821 | new_doc_corpus.append(doc_words) 822 | if labels is None: 823 | labels = [] 824 | labels.append("common_topic") 825 | new_labels_corpus.append(labels) 826 | new_words.extend(doc_words) 827 | new_labels.extend(labels) 828 | # self.terms = list(set(new_words)) 829 | new_terms = set(new_words) - set(self.terms) 830 | self.terms.extend(new_terms) 831 | self.vocabulary = {term: index for index, term in enumerate(self.terms)} 832 | 833 | # self.topics = list(set(new_labels)) 834 | new_topics = set(new_labels) - set(self.topics) 835 | self.topics.extend(new_topics) 836 | self.topic_vocabulary = {topic: index for index, topic in enumerate(self.topics)} 837 | 838 | old_K = self.K 839 | old_T = self.T 840 | self.K = len(self.topics) 841 | self.T = len(self.terms) 842 | 843 | # self.W = [[self.vocabulary[term] for term in doc_words] for doc_words in new_doc_corpus] 844 | new_w_vectors = [[self.vocabulary[term] for term in doc_words] for doc_words in new_doc_corpus] 845 | for new_w_vector in new_w_vectors: 846 | self.W.append(new_w_vector) 847 | 848 | old_M = self.M 849 | old_WN = self.WN 850 | self.M = len(self.W) 851 | self.WN += len(new_words) 852 | # we appended topic "common_topic" to each doc at the beginning 853 | # so we need minus the number of "common_topic" 854 | # LN is the number of original labels 855 | old_LN = self.LN 856 | 857 | self.LN += len(new_labels) + len(new_labels_corpus) 858 | 859 | old_Lambda = self.Lambda 860 | self.Lambda = np.zeros((self.M, self.K), dtype=float) 861 | for m in range(self.M): 862 | if m < old_M: 863 | # if the old document has no topic, we also init it to all topics here 864 | if sum(old_Lambda[m]) == old_K: 865 | # set all value of self.Lambda[m] to 1.0 866 | self.Lambda[m] += 1.0 867 | continue 868 | # print m, old_M 869 | if len(new_labels_corpus[m-old_M]) == 1: 870 | new_labels_corpus[m-old_M] = self.topics 871 | for label in new_labels_corpus[m-old_M]: 872 | k = self.topic_vocabulary[label] 873 | self.Lambda[m, k] = 1.0 874 | 875 | # TODO: the following 2 fields should be modified again if alpha_vector is not constant vector 876 | self.alpha_vector = [self.alpha_vector[0] for _ in range(self.K)] 877 | self.eta_vector = [self.eta_vector[0] for _ in range(self.T)] 878 | 879 | # self.Z = [] 880 | for m in range(old_M, self.M): 881 | # print "self.Lambda[m]: ", self.Lambda[m] 882 | numerator_vector = self.Lambda[m] * self.alpha_vector 883 | p_vector = numerator_vector / sum(numerator_vector) 884 | # print p_vector 885 | # print "p_vector: ", p_vector 886 | # z_vector is a vector of a document, 887 | # just like [2, 3, 6, 0], which means this doc have 4 word and them generated 888 | # from the 2nd, 3rd, 6th, 0th topic, respectively 889 | z_vector = [LldaModel._multinomial_sample(p_vector) for _ in range(len(self.W[m]))] 890 | self.Z.append(z_vector) 891 | 892 | self._initialize_derivative_fields() 893 | pass 894 | 895 | @staticmethod 896 | def _extend_matrix(origin=None, shape=None, padding_value=0): 897 | """ 898 | for quickly extend the matrices when update 899 | extend origin matrix with shape, padding with padding_value 900 | :type shape: the shape of new matrix 901 | :param 
origin: np.ndarray, the original matrix
902 |         :return: np.ndarray, a matrix with new shape
903 |         """
904 |         new_matrix = np.zeros(shape, dtype=origin.dtype)
905 |
906 |         for row in range(new_matrix.shape[0]):
907 |             for col in range(new_matrix.shape[1]):
908 |                 if row < origin.shape[0] and col < origin.shape[1]:
909 |                     new_matrix[row, col] = origin[row, col]
910 |                 else:
911 |                     new_matrix[row, col] = padding_value
912 |
913 |         return new_matrix
914 |         pass
915 |
916 |     def is_convergent(self, method="PPL", delta=0.001):
917 |         """
918 |         is this model convergent?
919 |         use the perplexities (or the parameter `beta`) to determine whether the Markov chain has converged
920 |         :param method: the method of determining whether the Markov chain converges
921 |                        "PPL": use the perplexities of training data
922 |                        "beta": use the parameter 'beta'
923 |         :param delta: if the change is less than or equal to `delta`, the Markov chain is considered converged
924 |         :return: True if model is convergent
925 |         """
926 |         if method == "PPL":
927 |             if len(self.all_perplexities) < 10:
928 |                 return False
929 |             perplexities = self.all_perplexities[-10:]
930 |             if max(perplexities) - min(perplexities) <= delta:
931 |                 return True
932 |             return False
933 |         elif method == "beta":
934 |             if self.delta_beta <= delta:
935 |                 return True
936 |             return False
937 |         else:
938 |             raise Exception("parameter 'method=\"%s\"' is illegal" % method)
939 |
940 |     @property
941 |     def delta_beta(self):
942 |         """
943 |         calculate the change of the parameter `beta` since the last training iteration
944 |         :return: the sum of absolute changes of the parameter `beta`
945 |         """
946 |         return np.sum(np.abs(self.beta - self.last_beta))
947 |
948 |     def top_terms_of_topic(self, topic, k, with_probabilities=True):
949 |         """
950 |         get top-k terms of topic
951 |         :param with_probabilities: if True, return (term, probability) pairs,
952 |                                    else return only the terms
953 |         :param topic: str, the name of topic
954 |         :param k: int, the number of terms
955 |         :return: the top-k terms of topic
956 |         """
957 |         if topic not in self.topic_vocabulary:
958 |             raise Exception("Cannot find topic \"%s\"" % topic)
959 |         beta = self.beta_k(self.topic_vocabulary[topic])
960 |         terms = sorted(list(zip(self.terms, beta)), key=lambda x: x[1], reverse=True)
961 |         if with_probabilities:
962 |             return terms[:k]
963 |         return [term for term, p in terms[:k]]
964 |
965 |
966 | if __name__ == "__main__":
967 |     pass
968 |
969 |
--------------------------------------------------------------------------------