├── .idea
│   ├── inspectionProfiles
│   │   └── Project_Default.xml
│   └── vcs.xml
├── CascadeLDA.py
├── HSLDA.py
├── LabeledLDA.py
├── LocalLDA.py
├── README.md
├── abstracts_data.csv
├── evaluate_CascadeLDA.py
├── evaluate_LabeledLDA.py
├── requirements.txt
└── thesis_kenhbs.pdf
/CascadeLDA.py:
--------------------------------------------------------------------------------
1 | import gensim.parsing.preprocessing as gensimm
2 | from gensim.corpora import dictionary
3 | import numpy as np
4 | import re
5 | multinom_draw = np.random.multinomial
6 |
7 |
8 | def load_corpus(filename, d=3):
9 | import csv, sys
10 |
11 | # Increase max line length for csv.reader:
12 | max_int = sys.maxsize
13 | decrement = True
14 | while decrement:
15 | decrement = False
16 | try:
17 | csv.field_size_limit(max_int)
18 | except OverflowError:
19 | max_int = int(max_int/10)
20 | decrement = True
21 |
22 | docs = []
23 | labs = []
24 | labelmap = dict()
25 | pat = re.compile(r"[A-Z]\d{2}")
26 | f = open(filename, 'r')
27 | reader = csv.reader(f)
28 | for row in reader:
29 | doc = row[1]
30 | lab = row[2]
31 | if len(lab) > 3:
32 | lab = lab.split(" ")
33 | lab = list(filter(lambda i: pat.search(i), lab))
34 | lab = [partition_label(x, d) for x in lab]
35 | lab = [item for sublist in lab for item in sublist]
36 | lab = list(set(lab))
37 | for x in lab:
38 | labelmap[x] = 1
39 | else:
40 | lab = partition_label(lab, d)
41 | for x in lab:
42 | labelmap[x] = 1
43 | # lab = [lab]
44 | docs.append(doc)
45 | labs.append(lab)
46 | f.close()
47 | print("Stemming documents ....")
48 | docs = gensimm.preprocess_documents(docs)
49 | return docs, labs, list(labelmap.keys())
50 |
51 |
52 | def partition_label(lab, d):
53 | return [lab[:i+1] for i in range(d)]
54 |
55 |
56 | class CascadeLDA(object):
57 | def __init__(self, docs, labs, labelset, dicti, alpha=0.001, beta=0.001):
58 | labelset.insert(0, 'root')
59 | self.labelmap = dict(zip(labelset, range(len(labelset))))
60 | self.dicti = dicti
61 | self.K = len(self.labelmap)
62 | self.lablist = labelset
63 |
64 | self.alpha = alpha
65 | self.beta = beta
66 |
67 | self.vocab = list(dicti.values())
68 | self.w_to_v = dicti.token2id
69 | self.v_to_w = dicti.id2token
70 |
71 | self.labs = np.array([self.set_label(lab) for lab in labs])
72 | self.doc_tups = [dicti.doc2bow(x) for x in docs]
73 |
74 | self.docs = []
75 | self.freqs = []
76 | for doc in self.doc_tups:
77 | ids, freqs = zip(*doc)
78 | self.docs.append(ids)
79 | self.freqs.append(freqs)
80 |
81 | self.D = len(docs)
82 | self.V = len(self.vocab)
83 |
84 | self.ph = np.zeros((self.K, self.V), dtype=float)
85 | self.perplx = []
86 |
87 | self.l1 = [[l1 for l1 in lab if len(l1) == 1] for lab in labs]
88 | self.l2 = [[l2 for l2 in lab if len(l2) == 2] for lab in labs]
89 | self.l3 = [[l3 for l3 in lab if len(l3) == 3] for lab in labs]
90 |
91 | self.lablist_l1 = [x for x in self.lablist if len(x) == 1]
92 | self.lablist_l2 = [x for x in self.lablist if len(x) == 2]
93 | self.lablist_l3 = [x for x in self.lablist if len(x) == 3]
94 |
95 | self.rawlabs = labs
96 |
97 | def set_label(self, label):
98 | vec = np.zeros(len(self.labelmap))
99 | vec[0] = 1.0
100 | for x in label:
101 | vec[self.labelmap[x]] = 1.0
102 | return vec
103 |
104 | def term_to_id(self, term):
105 | if term not in self.w_to_v:
106 | voca_id = len(self.vocab)
107 | self.w_to_v[term] = voca_id
108 | self.vocab.append(term)
109 | else:
110 | voca_id = self.w_to_v[term]
111 | return voca_id
112 |
113 | def sub_corpus(self, parent):
114 | level = len(parent)
115 | if level == 1:
116 | lab_level = self.l2
117 | elif level == 2:
118 | lab_level = self.l3
119 | present = np.where([[parent in lab] for lab in self.rawlabs])[0]
120 | doc_tups = [self.doc_tups[p] for p in present]
121 | labs = [lab_level[p] for p in present]
122 |
123 | # Only keep the target labels, remove all other labels: they will be
124 | # gathered as the 'generic' topic
125 | labs = [[x for x in lab if x[:level] == parent] for lab in labs]
126 | labset = sorted(list(set([x for sub in labs for x in sub])))
127 | return doc_tups, labs, labset
128 |
129 | def get_sub_ph(self, subdocs, sublabs, sublabset, it=150, thinning=12):
130 | sublda = SubLDA(subdocs, sublabs, sublabset, self.dicti,
131 | alpha=self.alpha, beta=self.beta)
132 | sublda.run_training(it=it, thinning=thinning)
133 | return sublda.get_ph()
134 |
135 | def go_down_tree(self, it, s):
136 | # Starting at 'root' as parent node:
137 | doc_tups = self.doc_tups
138 | labs = self.l1
139 | labset = self.lablist_l1
140 |
141 | sub_ph = self.get_sub_ph(doc_tups, labs, labset, it=it, thinning=s)
142 |
143 | label_ids = [self.labelmap[x] for x in labset]
144 | self.ph[label_ids, :] = sub_ph
145 |
146 | # Only for this root-level we retain the topic-word distr ph for 'root'
147 | labset.remove('root')
148 |
149 | for l in labset:
150 | print(" --- ")
151 | print("Working on parent node", l)
152 | # Take subset of the entire corpus. With label "l*"
153 | doc_tups, labs, sublabset = self.sub_corpus(parent=l)
154 |
155 | # Run local LDA on subset - get those label-word distr.
156 | # This function also adds 'root' to sublabset
157 | sub_ph = self.get_sub_ph(doc_tups, labs, sublabset, it, s)
158 |
159 | # Get the local label ids and insert into global label-word:
160 | # Disregard "root" of every local label-word distr.
161 | sublabset.remove("root")
162 | label_ids = [self.labelmap[x] for x in sublabset]
163 |
164 | sub_ph = sub_ph[1:, :]
165 | self.ph[label_ids, :] = sub_ph
166 |
167 | one_down = [x for x in self.lablist_l2 if x[0] == l]
168 | for l2 in one_down:
169 | print(" --- ")
170 | print("Working on parent node", l2)
171 | # Take subset of the entire corpus. With label "l*"
172 | doc_tups, labs, sublabset = self.sub_corpus(parent=l2)
173 |
174 | # Run local LDA on subset - get those label-word distr.
175 | # This function also adds 'root' to sublabset
176 | sub_ph = self.get_sub_ph(doc_tups, labs, sublabset, it, s)
177 |
178 | # Get the local label ids and insert into global label-word:
179 | # Disregard "root" of every local label-word distr.
180 | sublabset.remove('root')
181 | label_ids = [self.labelmap[x] for x in sublabset]
182 |
183 | sub_ph = sub_ph[1:, :]
184 | self.ph[label_ids, :] = sub_ph
185 |
186 | def prep4test(self, doc, ph):
187 | doc_tups = self.dicti.doc2bow(doc)
188 | doc, freqs = zip(*doc_tups)
189 | ld = len(doc)
190 |
191 | n_dk = np.zeros(ph.shape[0], dtype=int)
192 | z_dn = []
193 |
194 | probs = ph[:, doc]
195 | probs += self.beta
196 | probs /= probs.sum(axis=0)
197 | # Initiate with the 'garbage'/'root' label uniformly:
198 | probs[0, :] = 1 / ld
199 | for n, freq in enumerate(freqs):
200 | prob = probs[:, n]
201 | while prob.sum() > 1:
202 | prob /= 1.0000005
203 | new_z = multinom_draw(1, prob).argmax()
204 |
205 | z_dn.append(new_z)
206 | n_dk[new_z] += freq
207 | start_state = (doc, freqs, z_dn, n_dk)
208 | return start_state
209 |
210 | def cascade_test(self, doc, it, thinning, labels):
211 | ids = [self.labelmap[x] for x in labels]
212 | ph = self.ph[ids, :]
213 | doc, freqs, z_dn, n_dk = self.prep4test(doc, ph)
214 |
215 | avg_state = np.zeros(len(ids), dtype=float)
216 | for i in range(it):
217 | for n, (v, f, z) in enumerate(zip(doc, freqs, z_dn)):
218 | n_dk[z] -= f
219 |
220 | num_a = n_dk + self.alpha
221 | b = ph[:, v]
222 | prob = num_a * b
223 | # In CascadeLDA it can occur that prob.sum() = 0. This is forced
224 | # to raise an error here, where it would otherwise only be a warning:
225 | try:
226 | with np.errstate(invalid="raise"):
227 | prob /= prob.sum()
228 | except FloatingPointError:
229 | prob = num_a * (b + self.beta)
230 | prob /= prob.sum()
231 | while prob.sum() > 1:
232 | prob /= 1.000005
233 | new_z = multinom_draw(1, prob).argmax()
234 |
235 | z_dn[n] = new_z
236 | n_dk[new_z] += f
237 | s = (i+1) / thinning
238 | s2 = int(s)
239 | if s == s2:
240 | this_state = n_dk / n_dk.sum()
241 | if s2 == 1:
242 | avg_state = this_state
243 | else:
244 | old = (s2 - 1) / s2 * avg_state
245 | new = (1 / s2) * this_state
246 | avg_state = old + new
247 | return avg_state
248 |
249 | def test_down_tree(self, doc, it, thinning, threshold):
250 | labels = self.lablist_l1
251 | th_hat = self.cascade_test(doc, it, thinning, labels)
252 |
253 | top_loads = np.sort(th_hat)[::-1]
254 | n = sum(np.cumsum(top_loads) < threshold) + 1
255 |
256 | top_n_load = top_loads[:n]
257 | top_n_labs = np.argsort(th_hat)[::-1][:n]
258 | top_n_labs = [labels[i] for i in top_n_labs]
259 |
260 | level_1 = list(zip(top_n_labs, top_n_load))
261 | level_2 = []
262 | level_3 = []
263 |
264 | if 'root' in top_n_labs:
265 | top_n_labs.remove('root')
266 | next_levels = top_n_labs
267 | for next_level in next_levels:
268 | pat = re.compile('^' + next_level + "[0-9]{1}$")
269 | labels = list(filter(pat.match, self.lablist))
270 | labels.insert(0, next_level)
271 | th_hat = self.cascade_test(doc, it, thinning, labels)
272 |
273 | top_loads = np.sort(th_hat)[::-1]
274 | n = sum(np.cumsum(top_loads) < threshold) + 1
275 |
276 | top_n_load = top_loads[:n]
277 | top_n_labs = np.argsort(th_hat)[::-1][:n]
278 | top_n_labs = [labels[i] for i in top_n_labs]
279 |
280 | tups = list(zip(top_n_labs, top_n_load))
281 | level_2.append(tups)
282 |
283 | if next_level in top_n_labs:
284 | top_n_labs.remove(next_level)
285 | last_levels = top_n_labs
286 | for newlab in last_levels:
287 | pat = re.compile('^' + newlab + "[0-9]{1}$")
288 | labels = list(filter(pat.match, self.lablist))
289 | labels.insert(0, newlab)
290 | th_hat = self.cascade_test(doc, it, thinning, labels)
291 |
292 | top_loads = np.sort(th_hat)[::-1]
293 | n = sum(np.cumsum(top_loads) < threshold) + 1
294 |
295 | top_n_load = top_loads[:n]
296 | top_n_labs = np.argsort(th_hat)[::-1][:n]
297 | top_n_labs = [labels[i] for i in top_n_labs]
298 | tups = list(zip(top_n_labs, top_n_load))
299 |
300 | level_3.append(tups)
301 | return level_1, level_2, level_3
302 |
303 | def run_test(self, docs, it, thinning, depth="all"):
304 | inds = None
305 | if depth in [1, 2, 3]:
306 | inds = np.where([len(x) in [depth, 4] for x in self.lablist])[0]
307 | elif depth == "all":
308 | inds = range(self.K)
309 |
310 | ph = self.ph[inds, :]
311 | th_hat = np.zeros((len(docs), len(inds)), dtype=float)
312 |
313 | for d, doc in enumerate(docs):
314 | new_d, new_f, z_dn, n_zk = self.prep4test(doc, ph)
315 | for i in range(it):
316 | for n, (v, f) in enumerate(zip(new_d, new_f)):
317 | # v = int(v)
318 | z = z_dn[n]
319 | n_zk[z] -= f
320 |
321 | num_a = n_zk + self.alpha
322 | b = ph[:, v]
323 | prob = num_a * b
324 | prob /= prob.sum()
325 | while prob.sum() > 1:
326 | prob /= 1.000005
327 | new_z = multinom_draw(1, prob).argmax()
328 |
329 | z_dn[n] = new_z
330 | n_zk[new_z] += f
331 |
332 | # Save the current state in MC chain and calc. average state:
333 | s = (i+1) / thinning
334 | if s == int(s):
335 | print("----")
336 | print("Testing iteration #", i+1)
337 | cur_th = n_zk / n_zk.sum()
338 | if s > 1:
339 | m = (s-1)/s
340 | th = m * th + (1-m) * cur_th
341 | else:
342 | th = cur_th
343 | th_hat[d, :] = th
344 | return th_hat
345 |
346 |
347 | class SubLDA(object):
348 | def __init__(self, docs, labs, labelset, dicti, alpha=0.001, beta=0.001):
349 | labelset.insert(0, 'root')
350 | self.labelmap = dict(zip(labelset, range(len(labelset))))
351 | self.K = len(self.labelmap)
352 | self.dicti = dicti
353 | self.lablist = labelset
354 |
355 | self.alpha = alpha
356 | self.beta = beta
357 |
358 | self.labs = np.array([self.set_label(lab) for lab in labs])
359 | self.doc_tups = docs
360 |
361 | self.V = len(dicti)
362 | self.D = len(docs)
363 |
364 | self.z_dn = []
365 | self.n_zk = np.zeros(self.K, dtype=int)
366 | self.n_d_k = np.zeros((self.D, self.K), dtype=int)
367 | self.n_k_v = np.zeros((self.K, self.V), dtype=int)
368 |
369 | self.ph = np.zeros((self.K, self.V), dtype=float)
370 |
371 | self.docs = []
372 | self.freqs = []
373 | for d, (doc, lab) in enumerate(zip(self.doc_tups, self.labs)):
374 | ids, freqs = zip(*doc)
375 | self.docs.append(list(ids))
376 | self.freqs.append(list(freqs))
377 |
378 | ld = len(doc)
379 | prob = lab / lab.sum()
380 | zets = np.random.choice(self.K, size=ld, p=prob)
381 | self.z_dn.append(zets)
382 | for v, z, f in zip(doc, zets, freqs):
383 | self.n_zk[z] += f
384 | self.n_d_k[d, z] += f
385 | self.n_k_v[z, v] += f
386 |
387 | def set_label(self, label):
388 | vec = np.zeros(len(self.labelmap))
389 | vec[0] = 1.0
390 | for x in label:
391 | vec[self.labelmap[x]] = 1.0
392 | return vec
393 |
394 | def get_ph(self):
395 | return self.n_k_v / self.n_k_v.sum(axis=1, keepdims=True)
396 |
397 | def training_iteration(self):
398 | docs = self.docs
399 | freqs = self.freqs
400 | zdn = self.z_dn
401 | labs = self.labs
402 | for d, (doc, freq, zet, lab) in enumerate(zip(docs, freqs, zdn, labs)):
403 | doc_n_d_k = self.n_d_k[d]
404 | for n, (v, f, z) in enumerate(zip(doc, freq, zet)):
405 | self.n_k_v[z, v] -= f
406 | doc_n_d_k[z] -= f
407 | self.n_zk[z] -= f
408 |
409 | a = doc_n_d_k + self.alpha
410 | num_b = self.n_k_v[:, v] + self.beta
411 | den_b = self.n_zk + self.V * self.beta
412 |
413 | prob = lab * a * (num_b/den_b)
414 | prob /= np.sum(prob)
415 | z_new = multinom_draw(1, prob).argmax()
416 |
417 | self.z_dn[d][n] = z_new
418 |
419 | self.n_k_v[z_new, v] += f
420 | doc_n_d_k[z_new] += f
421 | self.n_zk[z_new] += f
422 |
423 | def run_training(self, it=120, thinning=15):
424 | for i in range(it):
425 | self.training_iteration()
426 | s = (i+1) / thinning
427 | if s == int(s):
428 | print("Training iteration #", i+1)
429 | cur_ph = self.get_ph()
430 | if s > 1:
431 | m = (s-1)/s
432 | self.ph = m * self.ph + (1-m) * cur_ph
433 | else:
434 | self.ph = cur_ph
435 |
436 |
437 | def split_data(f="thesis_data.csv", d=3):
438 | a, b, c = load_corpus(f, d)
439 |
440 | zipped = list(zip(a, b))
441 | np.random.shuffle(zipped)
442 | a, b = zip(*zipped)
443 |
444 | split = int(len(a) * 0.9)
445 | train_data = (a[:split], b[:split], c)
446 | test_data = (a[split:], b[split:], c)
447 | return train_data, test_data
448 |
449 |
450 | def prune_dict(docs, lower=0.1, upper=0.9):
451 | dicti = dictionary.Dictionary(docs)
452 | lower *= len(docs)
453 | dicti.filter_extremes(no_above=upper, no_below=int(lower))
454 | return dicti
455 |
456 |
457 | def train_it(train_data, it=150, s=12, l=0.02, u=0.98, al=0.001, be=0.001):
458 | a, b, c = train_data
459 | dicti = prune_dict(a, lower=l, upper=u)
460 | cascade = CascadeLDA(a, b, c, dicti, alpha=al, beta=be)
461 | cascade.go_down_tree(it=it, s=s)
462 | return cascade
463 |
--------------------------------------------------------------------------------
/HSLDA.py:
--------------------------------------------------------------------------------
1 | import gensim.parsing.preprocessing as gensimm
2 | import numpy as np
3 | from scipy.stats import truncnorm
4 | import scipy
5 | import scipy.special
6 | multinom_draw = np.random.multinomial
7 | rvs = truncnorm.rvs
8 |
9 |
10 | def partition_label(lab, d):
11 | return [lab[:i+1] for i in range(d)]
12 |
13 |
14 | def phi(x):
15 | return 1/2 * (1 + scipy.special.erf(x / np.sqrt(2)))
16 |
17 |
18 | def vect_multinom(prob_matrix):
19 | s = prob_matrix.cumsum(axis=0)
20 | r = np.random.rand(prob_matrix.shape[1])
21 | k = (s < r).sum(axis=0)
22 | return k
23 |
24 |
25 | def get_stirling_numbers(n):
26 | mat = np.identity(int(n))
27 | mat[1, 0] = 0
28 | mat[2, 1] = 1
29 | for m in range(3, n):
30 | for k in range(1, m):
31 | l = mat[m-1, k-1]
32 | r = (m-1) * mat[m-1, k]
33 | mat[m, k] = l + r
34 | h = mat.max(axis=1)
35 | res = mat / h[:, None]
36 | return res
37 |
38 |
39 | def load_corpus(filename, d=3):
40 | import csv, sys, re
41 |
42 | # Increase max line length for csv.reader:
43 | max_int = sys.maxsize
44 | decrement = True
45 | while decrement:
46 | decrement = False
47 | try:
48 | csv.field_size_limit(max_int)
49 | except OverflowError:
50 | max_int = int(max_int/10)
51 | decrement = True
52 |
53 | docs = []
54 | labs = []
55 | labelmap = dict()
56 | pat = re.compile(r"[A-Z]\d{2}")
57 | f = open(filename, 'r')
58 | reader = csv.reader(f)
59 | for row in reader:
60 | doc = row[1]
61 | lab = row[2]
62 | if len(lab) > 3:
63 | lab = lab.split(" ")
64 | lab = list(filter(lambda i: pat.search(i), lab))
65 | lab = [partition_label(x, d) for x in lab]
66 | lab = [item for sublist in lab for item in sublist]
67 | lab = list(set(lab))
68 | for x in lab:
69 | labelmap[x] = 1
70 | else:
71 | lab = partition_label(lab, d)
72 | for x in lab:
73 | labelmap[x] = 1
74 | docs.append(doc)
75 | labs.append(lab)
76 | f.close()
77 | print("Stemming documents .... ")
78 | docs = gensimm.preprocess_documents(docs)
79 | return docs, labs, list(labelmap.keys())
80 |
81 |
82 | class HSLDA(object):
83 | def __init__(self, docs, labs, labelset, k=15,
84 | alpha_prime=1, alpha=1, gamma=1, mu=0, sigma=1, xi=0):
85 |
86 | self.labelmap = dict(zip(labelset, range(len(labelset))))
87 | self.labelmap[''] = 0
88 |
89 | self.lablist = labelset
90 |
91 | self.aprime = alpha_prime
92 | self.alpha = alpha
93 | self.gamma = gamma
94 | self.mu = mu
95 | self.sigma = sigma
96 | self.xi = xi
97 | self.K = k
98 |
99 | self.vocab = []
100 | self.w_to_v = dict()
101 | self.labs = np.array([self.set_label(lab) for lab in labs])
102 | self.docs = [[self.term_to_id(term) for term in doc] for doc in docs]
103 | self.v_to_w = {v:w for w, v in self.w_to_v.items()}
104 |
105 | self.D = len(docs)
106 | self.L = len(self.labelmap)
107 | self.V = len(self.vocab)
108 |
109 | k_ones = np.repeat(1, self.K)
110 | v_ones = np.repeat(1, self.V)
111 | mu_par = self.mu * k_ones
112 | self.eta = np.random.normal(mu_par, 1, size=(self.L, self.K))
113 | self.beta = np.random.dirichlet(self.aprime * k_ones)
114 | self.ph = np.random.dirichlet(self.gamma * v_ones, size=self.K)
115 | self.th = np.random.dirichlet(self.beta * self.alpha, size=self.D)
116 |
117 | self.z_dn = []
118 | self.n_d_k = np.zeros((self.D, self.K), dtype=int)
119 | self.n_k_v = np.zeros((self.K, self.V), dtype=int)
120 | self.n_zk = np.zeros(self.K, dtype=int)
121 |
122 | for d, doc in enumerate(self.docs):
123 | nd = len(doc)
124 | prob = self.th[d, :]
125 | zets = np.random.choice(self.K, size=nd, p=prob)
126 | self.z_dn.append(zets)
127 | for v, z in zip(doc, zets):
128 | self.n_d_k[d, z] += 1
129 | self.n_k_v[z, v] += 1
130 | self.n_zk[z] += 1
131 |
132 | self.zbar = self.get_zbar()
133 | self.mean_a = np.dot(self.zbar, self.eta.T)
134 |
135 | border_left = np.where(self.labs == 1, -self.mean_a, -np.inf)
136 | border_right = np.where(self.labs == 1, np.inf, -self.mean_a)
137 | self.a = rvs(border_left, border_right, self.mean_a)
138 |
139 | parents = [x[:-1] for x in labelset]
140 | parents = [self.labelmap[x] for x in parents]
141 | own = [self.labelmap[x] for x in labelset]
142 | self.child_to_parent = dict(zip(own, parents))
143 |
144 | self.stirling = get_stirling_numbers(150)
145 | self.mdot = np.zeros(self.K)
146 | self.m_aux = np.zeros((self.D, self.K))
147 |
148 | def get_zbar(self):
149 | return self.n_d_k / self.n_d_k.sum(axis=1, keepdims=True)
150 |
151 | def get_ph(self):
152 | return self.n_k_v / self.n_k_v.sum(axis=1, keepdims=True)
153 |
154 | def set_label(self, label):
155 | l = len(self.labelmap)
156 | vec = np.zeros(l, dtype=int)
157 | vec[0] = 1
158 | for x in label:
159 | vec[self.labelmap[x]] = 1
160 | return vec
161 |
162 | def term_to_id(self, term):
163 | if term not in self.w_to_v:
164 | voca_id = len(self.vocab)
165 | self.w_to_v[term] = voca_id
166 | self.vocab.append(term)
167 | else:
168 | voca_id = self.w_to_v[term]
169 | return voca_id
170 |
171 | def sample_z(self, opt=1):
172 | """
173 | Draws new values for all word-topic assignments in the corpus, based on
174 | Eq. (1) in the Perotte '11 HSLDA paper. Two variations have been added
175 | for mathematical and theoretical precision and comparison
176 | (see :param opt below).
177 | This function contains two loops: the outer loop collects doc-level
178 | data from the HSLDA object to avoid lengthy and superfluous computation.
179 | The inner loop uses those subsets to first deduct the current token's
180 | topic assignment from all relevant counts, then calculates probabilities
181 | for k = 1, 2, ..., K and then draws a new value based on those probs.
182 | opt=1 stands for Eq. (1) as presented in the paper.
183 |
184 | val_a: L' x 1 np.array(floats):
185 | The values of the running variable a. Only the
186 | relevant values for document d are used here
187 | mean_a: L' x 1 np.array(floats):
188 | The mean of the running variable a. That is,
189 | np.dot(zbar.T, eta).
190 | dif_mean: L' x K np.array(floats):
191 | This is the reduction in mean_a, due to new topic
192 | assignment z_{d,n}. This implicitly affects zbar, then
193 | np.dot(zbar, eta), which is mean_a. Every column
194 | represents the hypothetical change in mean_a caused
195 | by a reassignment of topic k.
196 |
197 | labs: L x 1 np.array(binary):
198 | An L-dimensional vector with zeros and ones,
199 | indicating whether label l is part of document d's
200 | labelset, or not
201 | relevant_labs: L' x 1 np.array(int):
202 | Vector containing the label ID of the labels in
203 | document d's labelset
204 |
205 |
206 | :param opt: 1 calculates p(a_{l,d} = x) for l positive labels only
207 | 2 calculates p(a_{l,d} > 0) for l positive labels only
208 | 3 calculates p(a_{l',d} > 0) for all positive labels l' and
209 | p(a_{l'', d} < 0) for all negative labels l''
210 | :return: K-dimensional probability vector
211 | """
212 | for d, doc in enumerate(self.docs):
213 |
214 | # Identify the labelset of document doc:
215 | labs = self.labs[d]
216 | if opt in [1, 2]:
217 | relevant_labs = np.where(labs == 1)[0]
218 | elif opt == 3:
219 | relevant_labs = range(self.L)
220 |
221 | # Select relevant data subsets in outer loop
222 | z_dn = self.z_dn[d]
223 | n_d_k = self.n_d_k[d, :]
224 | eta = self.eta[relevant_labs, :]
225 | val_a = self.a[d, relevant_labs, np.newaxis]
226 | mean_a = self.mean_a[d, relevant_labs, np.newaxis]
227 |
228 | # Calculate the implicit update of a's mean.
229 | n_d = len(doc)
230 | dif_mean = eta / n_d
231 | means_a = mean_a + dif_mean
232 | for n, v in enumerate(doc):
233 | # Find and deduct the word-topic assignment:
234 | old_z = z_dn[n]
235 | means_a[:, old_z] -= dif_mean[:, old_z]
236 | n_d_k[old_z] -= 1
237 | self.n_k_v[old_z, v] -= 1
238 | self.n_zk[old_z] -= 1
239 |
240 | # Calculate probability of first part of Eq. (1)
241 | l = n_d_k + self.alpha * self.beta
242 | r_num = self.n_k_v[:, v] + self.gamma
243 | r_den = self.n_zk + self.V * self.gamma
244 | p1 = l * r_num / r_den
245 |
246 | # Calculate probability of second part of Eq. (1)
247 | if opt == 1:
248 | p2 = np.exp((means_a - val_a) ** 2 * (-1 / 2))
249 | elif opt in [2, 3]:
250 | labcheck = labs[relevant_labs]
251 | labcheck = labcheck[:, np.newaxis]
252 | means_a -= self.xi
253 | signed_mean = np.where(labcheck == 1, means_a, -means_a)
254 | p2 = phi(signed_mean)
255 | p2 *= 2
256 | p2 = p2.prod(axis=0)
257 |
258 | # Combine two parts and draw new word-topic assignment z_{d,n}
259 | prob = p1 * p2
260 | prob /= prob.sum()
261 | new_z = multinom_draw(1, prob).argmax()
262 |
263 | # Add back z_new to all relevant containers:
264 | z_dn[n] = new_z
265 | means_a[:, new_z] += dif_mean[:, new_z]
266 | n_d_k[new_z] += 1
267 | self.n_k_v[new_z, v] += 1
268 | self.n_zk[new_z] += 1
269 | self.n_d_k[d, :] = n_d_k
270 | self.z_dn[d] = z_dn
271 | self.zbar[d, :] = n_d_k / n_d
272 | self.mean_a = np.dot(self.zbar, self.eta.T)
273 |
274 | def sample_eta(self):
275 | sig_prior = np.identity(self.K) / self.sigma
276 | sig_data = np.dot(self.zbar.T, self.zbar)
277 | sigma_hat = scipy.linalg.inv(sig_prior + sig_data)
278 |
279 | mu_prior = self.mu / self.sigma
280 | mu_data = np.dot(self.zbar.T, self.a)
281 | raw_mean = mu_prior + mu_data
282 | mu_hat = np.dot(sigma_hat, raw_mean)
283 |
284 | for l in range(self.L):
285 | mu = mu_hat[:, l]
286 | eta_l = np.random.multivariate_normal(mu, sigma_hat)
287 | self.eta[l, :] = eta_l
288 |
289 | def sample_a(self):
290 | border_left = np.where(self.labs > 0, -self.mean_a, -np.inf)
291 | border_right = np.where(self.labs > 0, np.inf, -self.mean_a)
292 | self.a = rvs(border_left, border_right, self.mean_a)
293 |
294 | def sample_beta(self):
295 | param = self.mdot + self.aprime
296 | self.beta = np.random.dirichlet(param)
297 |
298 | def sample_m(self):
299 | ab = self.alpha * self.beta
300 | for d in range(self.D):
301 | n_d_k = self.n_d_k[d]
302 | for k, n_k in enumerate(n_d_k):
303 | if n_k >= self.stirling.shape[0]:
304 | self.stirling = get_stirling_numbers(n_k+1)
305 | ms = self.stirling[n_k, :(n_k+1)]
306 | m_probs = [s * ab[k]**m for m, s in enumerate(ms)]
307 | m_probs /= sum(m_probs)
308 | draw = np.random.choice(len(m_probs), p=m_probs)
309 | self.m_aux[d, k] = draw
310 | self.mdot = self.m_aux.mean(axis=0)
311 |
312 | def run_training(self, it=25, thinning=5, opt=1):
313 | for i in range(it):
314 | self.sample_z(opt=opt)
315 | self.sample_eta()
316 | self.sample_a()
317 | self.sample_m()
318 | self.sample_beta()
319 | s = ((i+1) / thinning)
320 | if s == int(s):
321 | print("Training iteration #", i)
322 | p = i / it * 100
323 | print("Progress is %.2f %%" % p)
324 | print("-----")
325 | cur_ph = self.get_ph()
326 | cur_th = self.get_zbar()
327 | if s > 1:
328 | m = (s-1)/s
329 | self.ph = m * self.ph + (1-m) * cur_ph
330 | self.th = m * self.th + (1-m) * cur_th
331 | else:
332 | self.ph = cur_ph
333 | self.th = cur_th
334 |
335 | def z_for_newdoc(self, newdoc):
336 | newdoc = [self.term_to_id(t) for t in newdoc if t in self.w_to_v]
337 | prob_matrix = self.ph[:, newdoc]
338 | prob_matrix /= prob_matrix.sum(axis=0, keepdims=True)
339 | z_dn = vect_multinom(prob_matrix)
340 | n_d_k = np.zeros(self.K)
341 | for z in z_dn:
342 | n_d_k[z] += 1
343 |
344 | return z_dn, n_d_k, newdoc
345 |
346 | def run_test(self, newdoc, it=250, s=25):
347 | z_dn, n_d_k, newdoc = self.z_for_newdoc(newdoc)
348 | ph_hat = self.n_k_v + self.gamma
349 | ph_hat = ph_hat / ph_hat.sum(axis=1, keepdims=True)
350 | n_d = len(newdoc)
351 | for i in range(it):
352 | for n, v in enumerate(newdoc):
353 | # Find and deduct the word-topic assignment:
354 | old_z = z_dn[n]
355 | n_d_k[old_z] -= 1
356 |
357 | # Calculate probability of first part of Eq. (1)
358 | l = n_d_k + self.alpha * self.beta
359 | r = ph_hat[:, v]
360 | p1 = l * r
361 | p1 /= p1.sum()
362 | new_z = multinom_draw(1, p1).argmax()
363 |
364 | z_dn[n] = new_z
365 | n_d_k[new_z] += 1
366 |
367 | c = ((i+1) / s)
368 | if c == int(c):
369 | cur_th = n_d_k / n_d
370 | if c > 1:
371 | m = (c-1)/c
372 | zbar = m * zbar + (1-m) * cur_th
373 | else:
374 | zbar = cur_th
375 | means_a = np.dot(self.eta, zbar)
376 | means_a -= self.xi
377 | probs = phi(means_a)
378 | return probs
379 |
380 | def display_topics(self, n=10):
381 | top_v = np.argsort(-self.ph)[:, :n]
382 | return [[self.v_to_w[v] for v in top] for top in top_v]
383 |
384 | def label_predictions(self, probs):
385 | return sorted(zip(probs, self.lablist))[::-1]
386 |
387 | def run_tests(self, newdocs, it=250, s=25):
388 | if len(newdocs) == 1:
389 | return self.run_test(newdocs[0], it=it, s=s)
390 | else:
391 | lab_probs = np.empty((len(newdocs), self.L))
392 | for d, doc in enumerate(newdocs):
393 | lab_probs[d, :] = self.run_test(doc, it=it, s=s)
394 | return lab_probs
395 |
396 |
397 | def split_data(f="thesis_data.csv", d=3):
398 | a, b, c = load_corpus(filename=f, d=d)
399 | split = int(len(a) * 0.9)
400 |
401 | train_data = (a[:split], b[:split], c)
402 | test_data = (a[split:], b[split:], c)
403 | return train_data, test_data
404 |
405 |
406 | def train_it(traindata, it=150, s=25, opt=1):
407 | a, b, c = traindata[0], traindata[1], traindata[2]
408 | hs = HSLDA(a, b, c)
409 | hs.run_training(it=it, thinning=s, opt=opt)
410 | return hs
411 |
412 |
413 | def test_it(model, testdata, it=500, s=25):
414 | testdocs = testdata[0]
415 | testdocs = [[x for x in doc if x in model.vocab] for doc in testdocs]
416 | lab_probs = model.run_tests(testdocs, it=it, s=s)
417 | return lab_probs
418 |
--------------------------------------------------------------------------------
/LabeledLDA.py:
--------------------------------------------------------------------------------
1 | import gensim.parsing.preprocessing as gensimm
2 | from gensim.corpora import dictionary
3 | import numpy as np
4 | from numpy.random import multinomial as multinom_draw
5 |
6 |
7 | def load_corpus(filename, d):
8 | import csv, sys, re
9 |
10 | # Increase max line length for csv.reader:
11 | max_int = sys.maxsize
12 | decrement = True
13 | while decrement:
14 | decrement = False
15 | try:
16 | csv.field_size_limit(max_int)
17 | except OverflowError:
18 | max_int = int(max_int/10)
19 | decrement = True
20 |
21 | docs = []
22 | labs = []
23 | labelmap = dict()
24 | pat = re.compile(r"[A-Z]\d{2}")
25 | f = open(filename, 'r')
26 | reader = csv.reader(f)
27 | for row in reader:
28 | doc = row[1]
29 | lab = row[2]
30 | if len(lab) > 3:
31 | lab = lab.split(" ")
32 | lab = list(filter(lambda i: pat.search(i), lab))
33 | lab = [x[:d] for x in lab]
34 | for x in lab:
35 | labelmap[x] = 1
36 | else:
37 | lab = lab[:d]
38 | labelmap[lab] = 1
39 | lab = [lab]
40 | lab = list(set(lab))
41 | docs.append(doc)
42 | labs.append(lab)
43 | f.close()
44 | print("Stemming documents ....")
45 | docs = gensimm.preprocess_documents(docs)
46 | return docs, labs, list(labelmap.keys())
47 |
48 |
49 | class LabeledLDA(object):
50 | def __init__(self, docs, labs, labelset, dicti, alpha, beta):
51 | labelset.insert(0, 'root')
52 | self.labelmap = dict(zip(labelset, range(len(labelset))))
53 | self.K = len(self.labelmap)
54 | self.dicti = dicti
55 |
56 | self.alpha = alpha
57 | self.beta = beta
58 |
59 | self.vocab = list(dicti.values())
60 | self.w_to_v = dicti.token2id
61 | self.v_to_w = dicti.id2token
62 |
63 | self.labs = np.array([self.set_label(lab) for lab in labs])
64 | self.doc_tups = [dicti.doc2bow(x) for x in docs]
65 |
66 | self.D = len(docs)
67 | self.V = len(self.vocab)
68 |
69 | self.ph_hat = np.zeros((self.K, self.V), dtype=float)
70 | self.th_hat = np.zeros((self.D, self.K), dtype=float)
71 | self.cur_perplx = []
72 |
73 | self.z_dn = []
74 | self.n_zk = np.zeros(self.K, dtype=int)
75 | self.n_d_k = np.zeros((self.D, self.K), dtype=int)
76 | self.n_k_v = np.zeros((self.K, self.V), dtype=int)
77 |
78 | self.docs = []
79 | self.freqs = []
80 | for d, (doc, lab) in enumerate(zip(self.doc_tups, self.labs)):
81 | ids, freqs = zip(*doc)
82 | self.docs.append(list(ids))
83 | self.freqs.append(list(freqs))
84 |
85 | ld = len(doc)
86 | prob = lab/lab.sum()
87 | zets = np.random.choice(self.K, size=ld, p=prob)
88 | self.z_dn.append(zets)
89 | for v, z, freq in zip(ids, zets, freqs):
90 | self.n_zk[z] += freq
91 | self.n_d_k[d, z] += freq
92 | self.n_k_v[z, v] += freq
93 |
94 | def set_label(self, label):
95 | vec = np.zeros(len(self.labelmap))
96 | vec[0] = 1.0
97 | for x in label:
98 | vec[self.labelmap[x]] = 1.0
99 | return vec
100 |
101 | def training_iteration(self):
102 | docs = self.docs
103 | freqs = self.freqs
104 | zdn = self.z_dn
105 | labs = self.labs
106 | for d, (doc, freq, zet, lab) in enumerate(zip(docs, freqs, zdn, labs)):
107 | doc_n_d_k = self.n_d_k[d]
108 | for n, (v, f, z) in enumerate(zip(doc, freq, zet)):
109 | self.n_k_v[z, v] -= f
110 | doc_n_d_k[z] -= f
111 | self.n_zk[z] -= f
112 |
113 | a = doc_n_d_k + self.alpha
114 | num_b = self.n_k_v[:, v] + self.beta
115 | den_b = self.n_zk + self.V * self.beta
116 |
117 | prob = lab * a * (num_b/den_b)
118 | prob /= np.sum(prob)
119 | z_new = multinom_draw(1, prob).argmax()
120 |
121 | self.z_dn[d][n] = z_new
122 |
123 | self.n_k_v[z_new, v] += f
124 | doc_n_d_k[z_new] += f
125 | self.n_zk[z_new] += f
126 |
127 | def run_training(self, iters, thinning):
128 | for n in range(iters):
129 | self.training_iteration()
130 | print('Running iteration # %d ' % (n+1))
131 | if (n+1) % thinning == 0:
132 | cur_ph = self.get_phi()
133 | cur_th = self.get_theta()
134 |
135 | cur_perp = self.perplexity()
136 | self.cur_perplx.append(cur_perp)
137 |
138 | s = (n+1) / thinning
139 | if s == 1:
140 | self.ph_hat = cur_ph
141 | self.th_hat = cur_th
142 | elif s > 1:
143 | factor = (s-1)/s
144 | self.ph_hat = factor*self.ph_hat + (1/s * cur_ph)
145 | self.th_hat = factor*self.th_hat + (1/s * cur_th)
146 | if np.any(self.ph_hat < 0):
147 | raise ValueError('A negative value occurred in self.ph_hat '
148 | 'while saving iteration %d ' % n)
149 | if np.any([np.isnan(x) for x in self.ph_hat]):
150 | raise ValueError('A nan has crept into ph_hat')
151 | wordload = self.ph_hat.sum(axis=0)
152 | if np.any([x == 0 for x in wordload]):
153 | raise ValueError('A word in dictionary has no z-value')
154 |
155 | def prep4test(self, doc):
156 | doc_tups = self.dicti.doc2bow(doc)
157 | doc, freqs = zip(*doc_tups)
158 |
159 | z_dn = []
160 | n_dk = np.zeros(self.K, dtype=int)
161 |
162 | probs = self.ph_hat[:, doc]
163 | with np.errstate(divide="raise", invalid="raise"):
164 | try:
165 | probs /= probs.sum(axis=0)
166 | except FloatingPointError:
167 | probs = 1/self.K * np.ones_like(probs)
168 | for n, f in enumerate(freqs):
169 | prob = probs[:, n]
170 | while prob.sum() > 1:
171 | prob /= 1.0000000005
172 | new_z = multinom_draw(1, prob).argmax()
173 |
174 | z_dn.append(new_z)
175 | n_dk[new_z] += f
176 | start_state = (doc, freqs, z_dn, n_dk)
177 | return start_state
178 |
179 | def run_test(self, newdocs, it, thinning):
180 | nr = len(newdocs)
181 | th_hat = np.zeros((nr, self.K), dtype=float)
182 | for d, newdoc in enumerate(newdocs):
183 | doc, freqs, z_dn, n_dk = self.prep4test(newdoc)
184 | for i in range(it):
185 | for n, (v, f, z) in enumerate(zip(doc, freqs, z_dn)):
186 | n_dk[z] -= f
187 |
188 | num_a = n_dk + self.alpha
189 | b = self.ph_hat[:, v]
190 | prob = num_a * b
191 | prob /= prob.sum()
192 | while prob.sum() > 1:
193 | prob /= 1.0000005
194 | new_z = multinom_draw(1, prob).argmax()
195 |
196 | z_dn[n] = new_z
197 | n_dk[new_z] += f
198 |
199 | # Save the current state in MC chain and calc. average state:
200 | # Only the document-topic distribution estimate theta is saved
201 | s = (i + 1) / thinning
202 | s2 = int(s)
203 | if s == s2:
204 | this_state = n_dk / n_dk.sum()
205 | if s2 == 1:
206 | avg_state = this_state
207 | else:
208 | old = (s2 - 1) / s2 * avg_state
209 | new = (1 / s2) * this_state
210 | avg_state = old + new
211 | th_hat[d, :] = avg_state
212 | return th_hat
213 |
214 | def get_pred(self, single_th, n=5):
215 | labs = np.array(list(self.labelmap.keys()))
216 | top_tops = np.argsort(-single_th)[:n]
217 | top_load = np.flip(np.sort(single_th), axis=0)[:n]
218 |
219 | top_tops = labs[top_tops]
220 | return list(zip(top_tops, top_load))
221 |
222 | def get_preds(self, all_th, n=5):
223 | preds = []
224 | nr = all_th.shape[0]
225 | for d in range(nr):
226 | one_th = all_th[d, :]
227 | pred = self.get_pred(one_th, n)
228 | preds.append(pred)
229 | return preds
230 |
231 | def get_phi(self):
232 | num = self.n_k_v + self.beta
233 | den = self.n_zk[:, np.newaxis] + self.V * self.beta
234 | return num / den
235 |
236 | def get_theta(self):
237 | num = self.n_d_k + self.labs * self.alpha
238 | den = num.sum(axis=1)[:, np.newaxis]
239 | return num / den
240 |
241 | def topwords_per_topic(self, topwords=10):
242 | n = topwords
243 | ph = self.get_phi()
244 | topiclist = []
245 | label_list = list(self.labelmap.keys())
246 | for k in range(self.K):
247 | v_inds = np.argsort(-ph[k, :])[:n]
248 | top_n = [self.v_to_w[x] for x in v_inds]
249 |
250 | topic_name = label_list[k]
251 | top_n.insert(0, topic_name)
252 |
253 | topiclist += [top_n]
254 | return topiclist
255 |
256 | def perplexity(self):
257 | phis = self.get_phi()
258 | thetas = self.get_theta()
259 |
260 | log_per = l = 0
261 | for doc, th in zip(self.docs, thetas):
262 | for w in doc:
263 | log_per -= np.log(np.inner(phis[:, w], th))
264 | l += len(doc)
265 | return np.exp(log_per / l)
266 |
267 |
268 | def split_data(f, d=2):
269 | a, b, c = load_corpus(f, d)
270 |
271 | zipped = list(zip(a, b))
272 | np.random.shuffle(zipped)
273 | a, b = zip(*zipped)
274 |
275 | split = int(len(a) * 0.9)
276 | train_data = (a[:split], b[:split], c)
277 | test_data = (a[split:], b[split:], c)
278 | return train_data, test_data
279 |
280 |
281 | def prune_dict(docs, lower=0.1, upper=0.9):
282 | dicti = dictionary.Dictionary(docs)
283 | lower *= len(docs)
284 | dicti.filter_extremes(no_above=upper, no_below=lower)
285 | return dicti
286 |
287 |
288 | def train_it(traindata, it=30, s=3, al=0.001, be=0.001, l=0.05, u=0.95):
289 | a, b, c = traindata
290 | dicti = prune_dict(a, lower=l, upper=u)
291 | llda = LabeledLDA(a, b, c, dicti, al, be)
292 | llda.run_training(it, s)
293 | return llda
294 |
295 |
296 | def test_it(model, testdata, it=500, thinning=25, n=5):
297 | testdocs = testdata[0]
298 | testdocs = [[x for x in doc if x in model.vocab] for doc in testdocs]
299 | th_hat = model.run_test(testdocs, it, thinning)
300 | preds = model.get_preds(th_hat, n)
301 | th_hat = [[round(x, 4) for x in single_th] for single_th in th_hat]
302 | return th_hat, preds
303 |
--------------------------------------------------------------------------------
/LocalLDA.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import gensim
4 | import re
5 |
6 | from numpy.random import multinomial as multinom_draw
7 | from gensim.parsing.preprocessing import STOPWORDS as stopwords
8 | from nltk.stem import WordNetLemmatizer
9 |
10 |
11 | class LocalLDA:
12 | def __init__(self, docs, alpha, beta, K,
13 | localLDA=True, lemma=True, stem=False):
14 | self.a = alpha
15 | self.b = beta
16 |
17 | if localLDA:
18 | sentences = []
19 | for doc in docs:
20 | s = splitdocs(doc)
21 | sentences.extend(s)
22 | docs = sentences
23 |
24 | # Preprocess the documents, create word2id mapping & map words to IDs
25 | prepped_corp = prep_docs(docs, stem=stem, lemma=lemma)
26 | self.word2id = gensim.corpora.dictionary.Dictionary(prepped_corp)
27 | self.doc_tups = [self.word2id.doc2bow(doc) for doc in prepped_corp]
28 | self.doc_tups = [doc for doc in self.doc_tups if len(doc) > 1]
29 |
30 | # Gather some general LDA parameters
31 | self.V = len(self.word2id)
32 | self.K = K
33 | self.D = len(self.doc_tups)
34 |
35 | self.w_to_v = self.word2id.token2id
36 | self.v_to_w = self.word2id
37 |
38 | self.z_dn = []
39 | self.n_zk = np.zeros(self.K, dtype=int)
40 | self.n_d_k = np.zeros((self.D, self.K), dtype=int)
41 | self.n_k_v = np.zeros((self.K, self.V), dtype=int)
42 |
43 | self.docs = []
44 | self.freqs = []
45 | for d, doctup in enumerate(self.doc_tups):
46 | ids, freqs = zip(*doctup)
47 | self.docs.append(list(ids))
48 | self.freqs.append(list(freqs))
49 |
50 | zets = np.random.choice(self.K, size=len(ids))
51 | self.z_dn.append(zets)
52 | for v, z, freq in zip(ids, zets, freqs):
53 | self.n_zk[z] += freq
54 | self.n_d_k[d, z] += freq
55 | self.n_k_v[z, v] += freq
56 |
57 | self.th_hat = None # will be filled during training
58 | self.ph_hat = None # will be filled during training
59 |
60 | def training_iteration(self):
61 | docs = self.docs
62 | freqs = self.freqs
63 |
64 | zdn = self.z_dn
65 | for d, (doc, freq, zet) in enumerate(zip(docs, freqs, zdn)):
66 | doc_n_d_k = self.n_d_k[d]
67 | for n, (v, f, z) in enumerate(zip(doc, freq, zet)):
68 | self.n_k_v[z, v] -= f
69 | doc_n_d_k[z] -= f
70 | self.n_zk[z] -= f
71 |
72 | a = doc_n_d_k + self.a
73 | num_b = self.n_k_v[:, v] + self.b
74 | den_b = self.n_zk + self.V * self.b
75 |
76 | prob = a * (num_b / den_b)
77 | prob /= np.sum(prob)
78 | z_new = multinom_draw(1, prob).argmax()
79 |
80 | self.z_dn[d][n] = z_new
81 |
82 | self.n_k_v[z_new, v] += f
83 | doc_n_d_k[z_new] += f
84 | self.n_zk[z_new] += f
85 |
86 | def run_training(self, iters, thinning):
87 | for n in range(iters):
88 | self.training_iteration()
89 | print('Running iteration # %d ' % (n + 1))
90 | if (n + 1) % thinning == 0:
91 | cur_ph = self.get_phi()
92 | cur_th = self.get_theta()
93 |
94 | s = (n + 1) / thinning
95 | if s == 1:
96 | self.ph_hat = cur_ph
97 | self.th_hat = cur_th
98 | elif s > 1:
99 | factor = (s - 1) / s
100 | self.ph_hat = factor * self.ph_hat + (1 / s * cur_ph)
101 | self.th_hat = factor * self.th_hat + (1 / s * cur_th)
102 | if np.any(self.ph_hat < 0):
103 | raise ValueError('A negative value occurred in self.ph_hat '
104 | 'while saving iteration %d ' % n)
105 | if np.any([np.isnan(x) for x in self.ph_hat]):
106 | raise ValueError('A nan has crept into ph_hat')
107 | wordload = self.ph_hat.sum(axis=0)
108 | if np.any([x == 0 for x in wordload]):
109 | raise ValueError('A word in dictionary has no z-value')
110 |
111 | def get_phi(self):
112 | num = self.n_k_v + self.b
113 | den = self.n_zk[:, np.newaxis] + self.V * self.b
114 | return num / den
115 |
116 | def get_theta(self):
117 | num = self.n_d_k + self.a
118 | den = num.sum(axis=1)[:, np.newaxis]
119 | return num / den
120 |
121 | def print_topwords(self, n=10):
122 | ph = self.get_phi()
123 | topiclist = []
124 | for k in range(self.K):
125 | v_ind = np.argsort(-ph[k, :])[:n]
126 | top_n = [self.v_to_w[x] for x in v_ind]
127 | top_n.insert(0, str(k))
128 | topiclist += [top_n]
129 | print(topiclist)
130 | pass
131 |
132 |
133 | def prep_docs(docs, stem=False, lemma=True):
134 | return [prep_doc(doc, stem=stem, lemma=lemma) for doc in docs]
135 |
136 |
137 | def prep_doc(doc, stem=False, lemma=True):
138 | doc = doc.lower()
139 | doc = re.sub(r'[^\w\s]', '', doc)
140 | doc = doc.split()
141 | # remove stopwords and short words
142 | doc = [word for word in doc if word not in stopwords and len(word) > 2]
143 |
144 | if stem:
145 | p = gensim.parsing.PorterStemmer()
146 | return [p.stem(word) for word in doc]
147 | elif lemma:
148 | lm = WordNetLemmatizer()
149 | return [lm.lemmatize(word, pos='v') for word in doc]
150 | else:
151 | return doc
152 |
153 |
154 | def splitdocs(doc):
155 | sentences = re.split(r'[!.?,-]', doc)
156 | return sentences
157 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Usage
2 | This code can be used for multi-label topic modelling with prior knowledge. It uses Latent Dirichlet Allocation (LDA) as a baseline and implements the following LDA-based models:
3 |
4 | 1) Labeled LDA (Ramage et al, 2009)
5 | 2) Hierarchical Supervised LDA (Perotte et al, 2011)
6 | 3) CascadeLDA
7 | 4) LocalLDA
8 |
9 | The workflow of each model is roughly divided into four parts: loading and preparing the data, training a model, testing the model and finally evaluating its predictive quality. A minimal sketch of this workflow is shown below.
10 |
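The sketch uses the Labeled LDA variant and assumes `abstracts_data.csv` sits in the working directory; the iteration counts are illustrative, not tuned values:

```
# Minimal end-to-end sketch using the module-level helpers in LabeledLDA.py.
# Assumptions: abstracts_data.csv is present; small iteration counts for illustration.
from LabeledLDA import split_data, train_it, test_it

train, test = split_data(f="abstracts_data.csv", d=2)     # load data and make a 90/10 split
model = train_it(train, it=30, s=3)                       # train with collapsed Gibbs sampling
theta, preds = test_it(model, test, it=100, thinning=25)  # infer document-topic loadings
print(preds[0])                                           # top label predictions for the first test doc
```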
11 | I also added a training option for LocalLDA, a sentence-based version of LDA. It is very useful for short texts such as online reviews, but less useful for longer, more coherent texts.
12 |
13 | ## Input
14 |
15 | Each model takes a `.csv` document as input. Each line must consist of three columns:
16 |
17 | 1) Document ID
18 | 2) One string containing the entire document
19 | 3) Labels contained in a single string, separated by spaces
20 |
21 | See `abstracts_data.csv` for an example. Any other structure will not be accepted as input.
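For illustration only, a hypothetical input line (with the document string shortened) could look like this:

```
42,"This paper studies monetary policy transmission in emerging markets ...",E52 E58 F41
```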
22 |
23 | ## How to run & output
24 |
25 | To run Labeled LDA, see the example below. Simply replace `evaluate_LabeledLDA.py` with `evaluate_CascadeLDA.py` to run CascadeLDA instead.
26 |
27 |
28 | ```
29 | $ python3 evaluate_LabeledLDA.py --help
30 |
31 | Usage: evaluate_LabeledLDA.py [options]
32 |
33 | Options:
34 | -h, --help show this help message and exit
35 | -f FILE dataset location
36 | -d LVL depth of lab level
37 | -i IT # of iterations
38 | -s THINNING save frequency
39 | -l LOWER lower threshold for dictionary pruning
40 | -u UPPER upper threshold for dictionary pruning
41 | -a ALPHA alpha prior
42 | -b BETA beta prior
43 | -p Save the model as pickle?
44 |
45 | ```
46 |
47 | So for example:
48 |
49 | ```
50 | $ python3 evaluate_LabeledLDA.py -f "abstracts_data.csv" -d 3 -i 4 -s 4 -l 0 -u 1 -a 0.1 -b 0.01 -p
51 |
52 | Stemming documents ....
53 | Starting training...
54 | Running iteration # 1
55 | Running iteration # 2
56 | Running iteration # 3
57 | Running iteration # 4
58 | Testing test data, this may take a while...
59 | Saved the model and predictions as pickles!
60 | Model: Labeled LDA
61 | Corpus: Abstracts
62 | Label depth 3
63 | # of Gibbs samples: 4
64 | -----------------------------------
65 | AUC ROC: 0.696858414365
66 | one error: 0.47198275862068967
67 | two error: 0.5862068965517241
68 | F1 score (macro average) 0.378575246979
69 |
70 | ```
71 |
72 | ## Datasets
73 |
74 | Two datasets were used in the thesis. For copyright reasons, only the abstracts dataset is made available here. It consists of 4,500 labeled academic abstracts from the economics literature. The papers are labeled according to the JEL classification.
75 |
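The `-d` option controls how deep into the JEL label hierarchy the codes are expanded. As a small illustration, the `partition_label` helper in `CascadeLDA.py` turns a single code into its ancestor labels:

```
# partition_label as defined in CascadeLDA.py / HSLDA.py
def partition_label(lab, d):
    return [lab[:i+1] for i in range(d)]

print(partition_label("C32", 3))   # ['C', 'C3', 'C32']
```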
76 |
77 | # Multilabel hierarchical topic modelling with prior knowledge
78 |
79 | ## CascadeLDA - Thesis abstract
80 |
81 | A new multi-label document classification technique called CascadeLDA is introduced
82 | in this thesis. Rather than focusing on discriminative modelling techniques, CascadeLDA
83 | extends a baseline generative model by incorporating two types of prior information.
84 | Firstly, knowledge from a labeled training dataset is used to direct the generative model.
85 | Secondly, the implicit tree structure of the labels is exploited to emphasise discriminative
86 | features between closely related labels. By segregating the classification problem into an
87 | ensemble of smaller problems, out-of-sample results are achieved at about 25 times the
88 | speed of the baseline model. In this thesis, CascadeLDA is performed on datasets with
89 | academic abstracts and full academic papers. The model is employed to assist authors in
90 | tagging their newly published articles.
91 |
92 | A formal and detailed coverage of baseline LDA, L-LDA, HSLDA and CascadeLDA can be found in `thesis_kenhbs.pdf`. The paper also gives an in-depth explanation and derivation of Gibbs sampling and variational inference in the LDA setting.
93 |
94 |
95 | ## Summary of Challenges
96 |
97 | In order to solve the classification problem of academic papers, the main extensions to LDA can be summarised in the following categories:
98 |
99 | 1) Instead of latent topics, we need the topics to correspond exactly to the JEL code descriptions (i.e. explicit topic modelling).
100 | 2) Incorporating prior knowledge on document-topic assignment (i.e. we have a training dataset)
101 | 3) Many labels are very closely related and barely distinguishable. Even though topic-word distributions are accurate, they are nearly identical and do not allow for discrimination.
102 |
103 | ## License
104 |
105 | The code and thesis are licensed under Attribution-NonCommercial-ShareAlike 3.0 Germany (CC BY-NC-SA 3.0 DE)
106 |
--------------------------------------------------------------------------------
/evaluate_CascadeLDA.py:
--------------------------------------------------------------------------------
1 | from CascadeLDA import *
2 | from sklearn.metrics import auc
3 | from optparse import OptionParser
4 | import pickle
5 |
6 |
7 | def one_roc(prob, real_binary):
8 | resorted = np.argsort(prob)[::-1]
9 |
10 | reals = real_binary[resorted]
11 | probs = prob[resorted]
12 | thresholds = np.sort(list(set(probs)))[::-1]
13 |
14 | tp = []
15 | tn = []
16 | fp = []
17 | fn = []
18 | for c in thresholds:
19 | preds = [1 if x >= c else 0 for x in probs]
20 | zipped = list(zip(preds, reals))
21 |
22 | tp_pre = sum([x == y for (x, y) in zipped if x == 1])
23 | tn_pre = sum([x == y for (x, y) in zipped if x == 0])
24 | fp_pre = sum([x != y for (x, y) in zipped if x == 1])
25 | fn_pre = sum([x != y for (x, y) in zipped if x == 0])
26 |
27 | tp.append(tp_pre)
28 | tn.append(tn_pre)
29 | fp.append(fp_pre)
30 | fn.append(fn_pre)
31 | return tp, tn, fp, fn
32 |
33 |
34 | def fpr_tpr(tp, fp, tn, fn):
35 | fpr = [x / (x + y) for (x, y) in zip(fp, tn)]
36 | tpr = [x / (x + y) for (x, y) in zip(tp, fn)]
37 | return fpr, tpr
38 |
39 |
40 | def precision_recall(tp, fp, tn, fn):
41 | precis = [x / (x + y) for (x, y) in zip(tp, fp)]
42 | recall = [x / (x + y) for (x, y) in zip(tp, fn)]
43 | return precis, recall
44 |
45 |
46 | def rates(y_prob, y_real_binary):
47 | tps = []
48 | tns = []
49 | fps = []
50 | fns = []
51 | fprs = []
52 | tprs = []
53 | for d_prob, d_real in zip(y_prob, y_real_binary):
54 | tp, tn, fp, fn = one_roc(d_prob, d_real)
55 | fpr, tpr = fpr_tpr(tp, fp, tn, fn)
56 |
57 | tps.append(tp)
58 | tns.append(tn)
59 | fps.append(fp)
60 | fns.append(fn)
61 | fprs.append(fpr)
62 | tprs.append(tpr)
63 | return tps, tns, fps, fns, fprs, tprs
64 |
65 |
66 | def macro_auc_roc(fprs, tprs):
67 | areas_under_curve = [auc(fpr, tpr) for (fpr, tpr) in zip(fprs, tprs)]
68 | return np.mean(areas_under_curve)
69 |
70 |
71 | def n_error(th_hat, y_real_binary, n):
72 | ndocs = th_hat.shape[0]
73 | counter = 0
74 | for i in range(ndocs):
75 | ordered = np.argsort(th_hat[i, :])[::-1]
76 | toplabs = ordered[:n]
77 | sub_y = y_real_binary[i, :]
78 | hit = sum(sub_y[toplabs]) > 0
79 | if hit:
80 | counter += 1
81 | return counter / ndocs
82 |
83 |
84 | def get_f1(tps, fps, tns, fns):
85 | f1 = []
86 | for tp, fp, tn, fn in zip(tps, fps, tns, fns):
87 | prec, rec = precision_recall(tp, fp, tn, fn)
88 | with np.errstate(invalid='ignore'):
89 | raw_f1 = [(2 * p * r)/(p + r) for p, r in zip(prec, rec)]
90 | opt_f1 = np.nanmax(raw_f1)
91 | f1.append(opt_f1)
92 | return np.mean(f1)
93 |
94 |
95 | def setup_theta(l1p, l2p, l3p, model):
96 | # Start adding the lowest labs and just add the 'rest', too. It will be
97 | # overwritten later on with the correct value from the upper level
98 | n = len(l1p)
99 | k = len(model.labelmap)
100 | th_hat = np.zeros((n, k), dtype=float)
101 |
102 | for d in range(n):
103 | sub_th = th_hat[d, :]
104 | levels = dict()
105 | for tuplist in l3p[d]:
106 | levels.update(tuplist)
107 | for tuplist in l2p[d]:
108 | levels.update(tuplist)
109 | levels.update(l1p[d])
110 |
111 | # Multiply probs of local scope with the prob of upper level:
112 | predecessors = [s for (s, t) in l1p[d]]
113 | lookup = " ".join(list(levels.keys()))
114 | for p in predecessors:
115 | pat = re.compile("(" + p + "[0-9])(?:[^0-9]|$)")
116 | currents = re.findall(pat, lookup)
117 | for c in currents:
118 | levels[c] *= levels[p]
119 | pat = re.compile(c + "[0-9]")
120 | finals = re.findall(pat, lookup)
121 | for f in finals:
122 | levels[f] *= levels[c]
123 |
124 | labs, probs = zip(*levels.items())
125 | inds = [model.labelmap[x] for x in labs]
126 | sub_th[inds] = probs
127 | return th_hat
128 |
129 |
130 | def binary_yreal(label_strings, label_dict):
131 | ndoc = len(label_strings)
132 | ntop = len(label_dict)
133 | y_true = np.zeros((ndoc, ntop), dtype=int)
134 | for d, lab in enumerate(label_strings):
135 | for l in lab:
136 | try:
137 | ind = label_dict[l]
138 | y_true[d, ind] = 1
139 | except KeyError:
140 | pass
141 | return y_true
142 |
143 |
144 | def main():
145 | parser = OptionParser()
146 | parser.add_option("-f", dest="file", help="dataset location")
147 | parser.add_option("-d", dest="lvl", type="int",
148 | help="depth of label level", default=3)
149 | parser.add_option("-i", dest="it", type="int",
150 | help="# of iterations - train and test")
151 | parser.add_option("-s", dest="thinning", type="int",
152 | help="inter saving frequency", default=0)
153 | parser.add_option("-a", dest="alpha", type="float", help="alpha prior",
154 | default=0.1)
155 | parser.add_option("-b", dest="beta", type="float", help="beta prior",
156 | default=0.01)
157 | parser.add_option("-l", dest="lower", type="float",
158 | help="lower threshold for dictionary pruning", default=0)
159 | parser.add_option("-u", dest="upper", type="float",
160 | help="upper threshold for dictionary pruning", default=1)
161 | parser.add_option("-p", action="store_true", dest="pickle",
162 | help="save pickle of model?", default=False)
163 |
164 | (opt, arg) = parser.parse_args()
165 |
166 | if opt.thinning == 0:
167 | opt.thinning = opt.it
168 | train, test = split_data(f=opt.file)
169 | model = train_it(train, it=opt.it, s=opt.thinning,
170 | l=opt.lower, u=opt.upper, al=opt.alpha, be=opt.beta)
171 |
172 | print("Testing test data, this may take a while")
173 | l1, l2, l3 = zip(*[model.test_down_tree(x, it=opt.it, thinning=opt.thinning, threshold=0.95) for x in test[0]])
174 | if opt.pickle:
175 | pickle.dump(model, open("Cascade_model.pkl", "wb"))
176 | pickle.dump(test, open("Cascade_testset.pkl", "wb"))
177 | pickle.dump(l1, open("Cascade_d1_pred.pkl", "wb"))
178 | pickle.dump(l2, open("Cascade_d2_pred.pkl", "wb"))
179 | pickle.dump(l3, open("Cascade_d3_pred.pkl", "wb"))
180 | print("Saved the model and predictions as pickles!")
181 |
182 | # Evaluate quality for all label depths:
183 | d = int(opt.lvl)
184 | label_depths = list(range(1, d+1))
185 | for depth in label_depths:
186 | c = "Full texts"
187 | if opt.file == "thesis_data3.csv":
188 | c = "Abstracts"
189 |
190 | print("Model: CascadeLDA")
191 | print("Corpus: ", c)
192 | print("Label depth ", depth)
193 | print("# of Gibbs samples: ", int(opt.it))
194 | print("-----------------------------------")
195 |
196 | lab_level = [len(x) == depth for x in model.labelmap.keys()]
197 | inds = np.where(lab_level)[0]
198 |
199 | y_bin = binary_yreal(test[1], model.labelmap)
200 | th_hat = setup_theta(l1, l2, l3, model)
201 |
202 | # Selecting the relevant labels
203 | y_bin = y_bin[:, inds]
204 | th_hat = th_hat[:, inds]
205 |
206 | # Remove no-prediction and no-label documents
207 | doc_id1 = np.where(th_hat.sum(axis=1) != 0)[0]
208 | doc_id2 = np.where(y_bin.sum(axis=1) != 0)[0]
209 | valid = np.intersect1d(doc_id1, doc_id2)
210 |
211 | y_bin = y_bin[valid, :]
212 | th_hat = th_hat[valid, :]
213 |
214 | tps, tns, fps, fns, fprs, tprs = rates(th_hat, y_bin)
215 |
216 | one_err = n_error(th_hat, y_bin, 1)
217 | two_err = n_error(th_hat, y_bin, 2)
218 | auc_roc = macro_auc_roc(fprs, tprs)
219 | f1_macro = get_f1(tps, fps, tns, fns)
220 |
221 | print("AUC ROC: ", auc_roc)
222 | print("one error: ", one_err)
223 | print("two error: ", two_err)
224 | print("F1 score (macro average) ", f1_macro)
225 |
226 |
227 | if __name__ == "__main__":
228 | main()
229 |
--------------------------------------------------------------------------------
/evaluate_LabeledLDA.py:
--------------------------------------------------------------------------------
1 | from LabeledLDA import *
2 | from sklearn.metrics import auc
3 | from optparse import OptionParser
4 | import pickle
5 | import numpy as np
6 |
7 |
8 | def one_roc(prob, real_binary):
9 | resorted = np.argsort(prob)[::-1]
10 |
11 | reals = real_binary[resorted]
12 | probs = prob[resorted]
13 | thresholds = np.sort(list(set(probs)))[::-1]
14 |
15 | tp = []
16 | tn = []
17 | fp = []
18 | fn = []
19 | for c in thresholds:
20 | preds = [1 if x >= c else 0 for x in probs]
21 | zipped = list(zip(preds, reals))
22 |
23 | tp_pre = sum([x == y for (x, y) in zipped if x == 1])
24 | tn_pre = sum([x == y for (x, y) in zipped if x == 0])
25 | fp_pre = sum([x != y for (x, y) in zipped if x == 1])
26 | fn_pre = sum([x != y for (x, y) in zipped if x == 0])
27 |
28 | tp.append(tp_pre)
29 | tn.append(tn_pre)
30 | fp.append(fp_pre)
31 | fn.append(fn_pre)
32 | return tp, tn, fp, fn
33 |
34 |
35 | def fpr_tpr(tp, fp, tn, fn):
36 | fpr = [x / (x + y) for (x, y) in zip(fp, tn)]
37 | tpr = [x / (x + y) for (x, y) in zip(tp, fn)]
38 | return fpr, tpr
39 |
40 |
41 | def precision_recall(tp, fp, tn, fn):
42 | precis = [x / (x + y) for (x, y) in zip(tp, fp)]
43 | recall = [x / (x + y) for (x, y) in zip(tp, fn)]
44 | return precis, recall
45 |
46 |
47 | def rates(y_prob, y_real_binary):
48 | tps = []
49 | tns = []
50 | fps = []
51 | fns = []
52 | fprs = []
53 | tprs = []
54 | for d_prob, d_real in zip(y_prob, y_real_binary):
55 | tp, tn, fp, fn = one_roc(d_prob, d_real)
56 | fpr, tpr = fpr_tpr(tp, fp, tn, fn)
57 |
58 | tps.append(tp)
59 | tns.append(tn)
60 | fps.append(fp)
61 | fns.append(fn)
62 | fprs.append(fpr)
63 | tprs.append(tpr)
64 | return tps, tns, fps, fns, fprs, tprs
65 |
66 |
67 | def macro_auc_roc(fprs, tprs):
68 | areas_under_curve = [auc(fpr, tpr) for (fpr, tpr) in zip(fprs, tprs)]
69 | return np.mean(areas_under_curve)
70 |
71 |
72 | def n_error(th_hat, y_real_binary, n):
73 | ndocs = th_hat.shape[0]
74 | counter = 0
75 | for i in range(ndocs):
76 | ordered = np.argsort(th_hat[i, :])[::-1]
77 | toplabs = ordered[:n]
78 | sub_y = y_real_binary[i, :]
79 | hit = sum(sub_y[toplabs]) > 0
80 | if hit:
81 | counter += 1
82 | return counter / ndocs
83 |
84 |
85 | def get_f1(tps, fps, tns, fns):
86 | f1 = []
87 | for tp, fp, tn, fn in zip(tps, fps, tns, fns):
88 | prec, rec = precision_recall(tp, fp, tn, fn)
89 | with np.errstate(invalid='ignore'):
90 | raw_f1 = [(2 * p * r)/(p + r) for p, r in zip(prec, rec)]
91 | opt_f1 = np.nanmax(raw_f1)
92 | f1.append(opt_f1)
93 | return np.mean(f1)
94 |
95 |
96 | def binary_yreal(label_strings, label_dict):
97 | ndoc = len(label_strings)
98 | ntop = len(label_dict)
99 | y_true = np.zeros((ndoc, ntop), dtype=int)
100 | for d, lab in enumerate(label_strings):
101 | for l in lab:
102 | try:
103 | ind = label_dict[l]
104 | y_true[d, ind] = 1
105 | except KeyError:
106 | pass
107 | return y_true
108 |
109 |
110 | def main():
111 | parser = OptionParser()
112 | parser.add_option("-f", dest="file", help="dataset location")
113 | parser.add_option("-d", dest="lvl", type="int", default=3,
114 | help="depth of lab level")
115 | parser.add_option("-i", dest="it", type="int", help="# of iterations")
116 | parser.add_option("-s", dest="thinning", type="int", default=0,
117 | help="save frequency")
118 | parser.add_option("-l", dest="lower", type="float", default=0,
119 | help="lower threshold for dictionary pruning")
120 | parser.add_option("-u", dest="upper", type="float", default=1,
121 | help="upper threshold for dictionary pruning")
122 | parser.add_option("-a", dest="alpha", type="float", default=0.1,
123 | help="alpha prior")
124 | parser.add_option("-b", dest="beta", type="float", default=0.01,
125 | help="beta prior")
126 | parser.add_option("-p", action="store_true", dest="pickle", default=False,
127 | help="Save the model as pickle?")
128 |
129 | (opt, arg) = parser.parse_args()
130 | if opt.thinning == 0:
131 | opt.thinning = opt.it
132 |
133 | train, test = split_data(f=opt.file, d=opt.lvl)
134 |
135 | print("Starting training...")
136 | model = train_it(train, it=opt.it, s=opt.thinning,
137 | al=opt.alpha, be=opt.beta, l=opt.lower, u=opt.upper)
138 |
139 | print("Testing test data, this may take a while...")
140 | th, _ = test_it(model, test, it=opt.it, thinning=opt.thinning)
141 | th = np.array(th)
142 | if opt.pickle:
143 | pickle.dump(model, open("LabeledLDA_model.pkl", "wb"))
144 | pickle.dump(test, open("LabeledLDA_testset.pkl", "wb"))
145 | pickle.dump(th, open("LabeledLDA_theta.pkl", "wb"))
146 |
147 | c = "Full Texts"
148 | if opt.file == "thesis_data3.csv":
149 | c = "Abstracts"
150 |
151 | print("Model: Labeled LDA")
152 | print("Corpus: ", c)
153 | print("Label depth ", opt.lvl)
154 | print("# of Gibbs samples: ", int(opt.it))
155 | print("-----------------------------------")
156 |
157 | y_bin = binary_yreal(test[1], model.labelmap)
158 |
159 | # Remove root label from predictions (also not included in label sets)
160 | y_bin = y_bin[:, 1:]
161 | th = th[:, 1:]
162 |
163 | # Remove docs that were assigned to 'root' completely:
164 | nonzero_load = [x != 0 for x in th.sum(axis=1)]
165 | nonzero_load = np.where(nonzero_load)[0]
166 | y_bin = y_bin[nonzero_load, :]
167 | th = th[nonzero_load, :]
168 |
169 | tps, tns, fps, fns, fprs, tprs = rates(th, y_bin)
170 |
171 | one_err = n_error(th, y_bin, 1)
172 | two_err = n_error(th, y_bin, 2)
173 | auc_roc = macro_auc_roc(fprs, tprs)
174 | f1_macro = get_f1(tps, fps, tns, fns)
175 |
176 | print("AUC ROC: ", auc_roc)
177 | print("one error: ", one_err)
178 | print("two error: ", two_err)
179 | print("F1 score (macro average) ", f1_macro)
180 |
181 |
182 | if __name__ == "__main__":
183 | main()
184 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | boto==2.48.0
2 | bz2file==0.98
3 | certifi==2017.7.27.1
4 | chardet==3.0.4
5 | gensim==2.3.0
6 | idna==2.6
7 | nltk==3.6.6
8 | numpy==1.22.0
9 | regex==2017.7.28
10 | requests==2.20.0
11 | scipy==0.19.1
12 | six==1.10.0
13 | smart-open==1.5.3
14 | stop-words==2015.2.23.1
15 | urllib3==1.26.5
16 | sklearn==0.0
--------------------------------------------------------------------------------
/thesis_kenhbs.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/KenHBS/LDA_thesis/9128551af929f6f2692e8edd79de72dec5c5f7ce/thesis_kenhbs.pdf
--------------------------------------------------------------------------------