├── .idea ├── inspectionProfiles │ └── Project_Default.xml └── vcs.xml ├── CascadeLDA.py ├── HSLDA.py ├── LabeledLDA.py ├── LocalLDA.py ├── README.md ├── abstracts_data.csv ├── evaluate_CascadeLDA.py ├── evaluate_LabeledLDA.py ├── requirements.txt └── thesis_kenhbs.pdf /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 15 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CascadeLDA.py: -------------------------------------------------------------------------------- 1 | import gensim.parsing.preprocessing as gensimm 2 | from gensim.corpora import dictionary 3 | import numpy as np 4 | import re 5 | multinom_draw = np.random.multinomial 6 | 7 | 8 | def load_corpus(filename, d=3): 9 | import csv, sys 10 | 11 | # Increase max line length for csv.reader: 12 | max_int = sys.maxsize 13 | decrement = True 14 | while decrement: 15 | decrement = False 16 | try: 17 | csv.field_size_limit(max_int) 18 | except OverflowError: 19 | max_int = int(max_int/10) 20 | decrement = True 21 | 22 | docs = [] 23 | labs = [] 24 | labelmap = dict() 25 | pat = re.compile("[A-Z]\d{2}") 26 | f = open(filename, 'r') 27 | reader = csv.reader(f) 28 | for row in reader: 29 | doc = row[1] 30 | lab = row[2] 31 | if len(lab) > 3: 32 | lab = lab.split(" ") 33 | lab = list(filter(lambda i: pat.search(i), lab)) 34 | lab = [partition_label(x, d) for x in lab] 35 | lab = [item for sublist in lab for item in sublist] 36 | lab = list(set(lab)) 37 | for x in lab: 38 | labelmap[x] = 1 39 | else: 40 | lab = partition_label(lab, d) 41 | for x in lab: 42 | labelmap[x] = 1 43 | # lab = [lab] 44 | docs.append(doc) 45 | labs.append(lab) 46 | f.close() 47 | print("Stemming documents ....") 48 | docs = gensimm.preprocess_documents(docs) 49 | return docs, labs, list(labelmap.keys()) 50 | 51 | 52 | def partition_label(lab, d): 53 | return [lab[:i+1] for i in range(d)] 54 | 55 | 56 | class CascadeLDA(object): 57 | def __init__(self, docs, labs, labelset, dicti, alpha=0.001, beta=0.001): 58 | labelset.insert(0, 'root') 59 | self.labelmap = dict(zip(labelset, range(len(labelset)))) 60 | self.dicti = dicti 61 | self.K = len(self.labelmap) 62 | self.lablist = labelset 63 | 64 | self.alpha = alpha 65 | self.beta = beta 66 | 67 | self.vocab = list(dicti.values()) 68 | self.w_to_v = dicti.token2id 69 | self.v_to_w = dicti.id2token 70 | 71 | self.labs = np.array([self.set_label(lab) for lab in labs]) 72 | self.doc_tups = [dicti.doc2bow(x) for x in docs] 73 | 74 | self.docs = [] 75 | self.freqs = [] 76 | for doc in self.doc_tups: 77 | ids, freqs = zip(*doc) 78 | self.docs.append(ids) 79 | self.freqs.append(freqs) 80 | 81 | self.D = len(docs) 82 | self.V = len(self.vocab) 83 | 84 | self.ph = np.zeros((self.K, self.V), dtype=float) 85 | self.perplx = [] 86 | 87 | self.l1 = [[l1 for l1 in lab if len(l1) == 1] for lab in labs] 88 | self.l2 = [[l2 for l2 in lab if len(l2) == 2] for lab in labs] 89 | self.l3 = [[l3 for l3 in lab if len(l3) == 3] for lab in labs] 90 | 91 | self.lablist_l1 = [x for x in self.lablist if len(x) == 1] 92 | self.lablist_l2 = [x for x in self.lablist if len(x) == 2] 93 | self.lablist_l3 = [x for x in self.lablist if len(x) == 3] 94 | 95 | self.rawlabs = labs 96 | 97 | def set_label(self, label): 98 | vec = 
np.zeros(len(self.labelmap)) 99 | vec[0] = 1.0 100 | for x in label: 101 | vec[self.labelmap[x]] = 1.0 102 | return vec 103 | 104 | def term_to_id(self, term): 105 | if term not in self.w_to_v: 106 | voca_id = len(self.vocab) 107 | self.w_to_v[term] = voca_id 108 | self.vocab.append(term) 109 | else: 110 | voca_id = self.w_to_v[term] 111 | return voca_id 112 | 113 | def sub_corpus(self, parent): 114 | level = len(parent) 115 | if level == 1: 116 | lab_level = self.l2 117 | elif level == 2: 118 | lab_level = self.l3 119 | present = np.where([[parent in lab] for lab in self.rawlabs])[0] 120 | doc_tups = [self.doc_tups[p] for p in present] 121 | labs = [lab_level[p] for p in present] 122 | 123 | # Only keep the target labels, remove all other labels: they will be 124 | # gathered as the 'generic' topic 125 | labs = [[x for x in lab if x[:level] == parent] for lab in labs] 126 | labset = sorted(list(set([x for sub in labs for x in sub]))) 127 | return doc_tups, labs, labset 128 | 129 | def get_sub_ph(self, subdocs, sublabs, sublabset, it=150, thinning=12): 130 | sublda = SubLDA(subdocs, sublabs, sublabset, self.dicti, 131 | alpha=self.alpha, beta=self.beta) 132 | sublda.run_training(it=it, thinning=thinning) 133 | return sublda.get_ph() 134 | 135 | def go_down_tree(self, it, s): 136 | # Starting at 'root' as parent node: 137 | doc_tups = self.doc_tups 138 | labs = self.l1 139 | labset = self.lablist_l1 140 | 141 | sub_ph = self.get_sub_ph(doc_tups, labs, labset, it=it, thinning=s) 142 | 143 | label_ids = [self.labelmap[x] for x in labset] 144 | self.ph[label_ids, :] = sub_ph 145 | 146 | # Only for this root-level we retain the topic-word distr ph for 'root' 147 | labset.remove('root') 148 | 149 | for l in labset: 150 | print(" --- ") 151 | print("Working on parent node", l) 152 | # Take subset of the entire corpus. With label "l*" 153 | doc_tups, labs, sublabset = self.sub_corpus(parent=l) 154 | 155 | # Run local LDA on subset - get those label-word distr. 156 | # This function also adds 'root' to sublabset 157 | sub_ph = self.get_sub_ph(doc_tups, labs, sublabset, it, s) 158 | 159 | # Get the local label ids and insert into global label-word: 160 | # Disregard "root" of every local label-word distr. 161 | sublabset.remove("root") 162 | label_ids = [self.labelmap[x] for x in sublabset] 163 | 164 | sub_ph = sub_ph[1:, :] 165 | self.ph[label_ids, :] = sub_ph 166 | 167 | one_down = [x for x in self.lablist_l2 if x[0] == l] 168 | for l2 in one_down: 169 | print(" --- ") 170 | print("Working on parent node", l2) 171 | # Take subset of the entire corpus. With label "l*" 172 | doc_tups, labs, sublabset = self.sub_corpus(parent=l2) 173 | 174 | # Run local LDA on subset - get those label-word distr. 175 | # This function also adds 'root' to sublabset 176 | sub_ph = self.get_sub_ph(doc_tups, labs, sublabset, it, s) 177 | 178 | # Get the local label ids and insert into global label-word: 179 | # Disregard "root" of every local label-word distr. 
180 | sublabset.remove('root') 181 | label_ids = [self.labelmap[x] for x in sublabset] 182 | 183 | sub_ph = sub_ph[1:, :] 184 | self.ph[label_ids, :] = sub_ph 185 | 186 | def prep4test(self, doc, ph): 187 | doc_tups = self.dicti.doc2bow(doc) 188 | doc, freqs = zip(*doc_tups) 189 | ld = len(doc) 190 | 191 | n_dk = np.zeros(ph.shape[0], dtype=int) 192 | z_dn = [] 193 | 194 | probs = ph[:, doc] 195 | probs += self.beta 196 | probs /= probs.sum(axis=0) 197 | # Initiate with the 'garbage'/'root' label uniformly: 198 | probs[0, :] = 1 / ld 199 | for n, freq in enumerate(freqs): 200 | prob = probs[:, n] 201 | while prob.sum() > 1: 202 | prob /= 1.0000005 203 | new_z = multinom_draw(1, prob).argmax() 204 | 205 | z_dn.append(new_z) 206 | n_dk[new_z] += freq 207 | start_state = (doc, freqs, z_dn, n_dk) 208 | return start_state 209 | 210 | def cascade_test(self, doc, it, thinning, labels): 211 | ids = [self.labelmap[x] for x in labels] 212 | ph = self.ph[ids, :] 213 | doc, freqs, z_dn, n_dk = self.prep4test(doc, ph) 214 | 215 | avg_state = np.zeros(len(ids), dtype=float) 216 | for i in range(it): 217 | for n, (v, f, z) in enumerate(zip(doc, freqs, z_dn)): 218 | n_dk[z] -= f 219 | 220 | num_a = n_dk + self.alpha 221 | b = ph[:, v] 222 | prob = num_a * b 223 | # In CascadeLDA it can occur that prob.sum() = 0. This 224 | # is forced to throw an error, else would have been warning: 225 | try: 226 | with np.errstate(invalid="raise"): 227 | prob /= prob.sum() 228 | except FloatingPointError: 229 | prob = num_a * (b + self.beta) 230 | prob /= prob.sum() 231 | while prob.sum() > 1: 232 | prob /= 1.000005 233 | new_z = multinom_draw(1, prob).argmax() 234 | 235 | z_dn[n] = new_z 236 | n_dk[new_z] += f 237 | s = (i+1) / thinning 238 | s2 = int(s) 239 | if s == s2: 240 | this_state = n_dk / n_dk.sum() 241 | if s2 == 1: 242 | avg_state = this_state 243 | else: 244 | old = (s2 - 1) / s2 * avg_state 245 | new = (1 / s2) * this_state 246 | avg_state = old + new 247 | return avg_state 248 | 249 | def test_down_tree(self, doc, it, thinning, threshold): 250 | labels = self.lablist_l1 251 | th_hat = self.cascade_test(doc, it, thinning, labels) 252 | 253 | top_loads = np.sort(th_hat)[::-1] 254 | n = sum(np.cumsum(top_loads) < threshold) + 1 255 | 256 | top_n_load = top_loads[:n] 257 | top_n_labs = np.argsort(th_hat)[::-1][:n] 258 | top_n_labs = [labels[i] for i in top_n_labs] 259 | 260 | level_1 = list(zip(top_n_labs, top_n_load)) 261 | level_2 = [] 262 | level_3 = [] 263 | 264 | if 'root' in top_n_labs: 265 | top_n_labs.remove('root') 266 | next_levels = top_n_labs 267 | for next_level in next_levels: 268 | pat = re.compile('^' + next_level + "[0-9]{1}$") 269 | labels = list(filter(pat.match, self.lablist)) 270 | labels.insert(0, next_level) 271 | th_hat = self.cascade_test(doc, it, thinning, labels) 272 | 273 | top_loads = np.sort(th_hat)[::-1] 274 | n = sum(np.cumsum(top_loads) < threshold) + 1 275 | 276 | top_n_load = top_loads[:n] 277 | top_n_labs = np.argsort(th_hat)[::-1][:n] 278 | top_n_labs = [labels[i] for i in top_n_labs] 279 | 280 | tups = list(zip(top_n_labs, top_n_load)) 281 | level_2.append(tups) 282 | 283 | if next_level in top_n_labs: 284 | top_n_labs.remove(next_level) 285 | last_levels = top_n_labs 286 | for newlab in last_levels: 287 | pat = re.compile('^' + newlab + "[0-9]{1}$") 288 | labels = list(filter(pat.match, self.lablist)) 289 | labels.insert(0, newlab) 290 | th_hat = self.cascade_test(doc, it, thinning, labels) 291 | 292 | top_loads = np.sort(th_hat)[::-1] 293 | n = sum(np.cumsum(top_loads) 
< threshold) + 1 294 | 295 | top_n_load = top_loads[:n] 296 | top_n_labs = np.argsort(th_hat)[::-1][:n] 297 | top_n_labs = [labels[i] for i in top_n_labs] 298 | tups = list(zip(top_n_labs, top_n_load)) 299 | 300 | level_3.append(tups) 301 | return level_1, level_2, level_3 302 | 303 | def run_test(self, docs, it, thinning, depth="all"): 304 | inds = None 305 | if depth in [1, 2, 3]: 306 | inds = np.where([len(x) in [depth, 4] for x in self.lablist])[0] 307 | elif depth == "all": 308 | inds = range(self.K) 309 | 310 | ph = self.ph[inds, :] 311 | th_hat = np.zeros((len(docs), len(inds)), dtype=float) 312 | 313 | for d, doc in enumerate(docs): 314 | new_d, new_f, z_dn, n_zk = self.prep4test(doc, ph) 315 | for i in range(it): 316 | for n, (v, f) in enumerate(zip(new_d, new_f)): 317 | # v = int(v) 318 | z = z_dn[n] 319 | n_zk[z] -= f 320 | 321 | num_a = n_zk + self.alpha 322 | b = ph[:, v] 323 | prob = num_a * b 324 | prob /= prob.sum() 325 | while prob.sum() > 1: 326 | prob /= 1.000005 327 | new_z = multinom_draw(1, prob).argmax() 328 | 329 | z_dn[n] = new_z 330 | n_zk[new_z] += f 331 | 332 | # Save the current state in MC chain and calc. average state: 333 | s = (i+1) / thinning 334 | if s == int(s): 335 | print("----") 336 | print("Testing iteration #", i+1) 337 | cur_th = n_zk / n_zk.sum() 338 | if s > 1: 339 | m = (s-1)/s 340 | th = m * th + (1-m) * cur_th 341 | else: 342 | th = cur_th 343 | th_hat[d, :] = th 344 | return th_hat 345 | 346 | 347 | class SubLDA(object): 348 | def __init__(self, docs, labs, labelset, dicti, alpha=0.001, beta=0.001): 349 | labelset.insert(0, 'root') 350 | self.labelmap = dict(zip(labelset, range(len(labelset)))) 351 | self.K = len(self.labelmap) 352 | self.dicti = dicti 353 | self.lablist = labelset 354 | 355 | self.alpha = alpha 356 | self.beta = beta 357 | 358 | self.labs = np.array([self.set_label(lab) for lab in labs]) 359 | self.doc_tups = docs 360 | 361 | self.V = len(dicti) 362 | self.D = len(docs) 363 | 364 | self.z_dn = [] 365 | self.n_zk = np.zeros(self.K, dtype=int) 366 | self.n_d_k = np.zeros((self.D, self.K), dtype=int) 367 | self.n_k_v = np.zeros((self.K, self.V), dtype=int) 368 | 369 | self.ph = np.zeros((self.K, self.V), dtype=float) 370 | 371 | self.docs = [] 372 | self.freqs = [] 373 | for d, (doc, lab) in enumerate(zip(self.doc_tups, self.labs)): 374 | ids, freqs = zip(*doc) 375 | self.docs.append(list(ids)) 376 | self.freqs.append(list(freqs)) 377 | 378 | ld = len(doc) 379 | prob = lab / lab.sum() 380 | zets = np.random.choice(self.K, size=ld, p=prob) 381 | self.z_dn.append(zets) 382 | for v, z, f in zip(doc, zets, freqs): 383 | self.n_zk[z] += f 384 | self.n_d_k[d, z] += f 385 | self.n_k_v[z, v] += f 386 | 387 | def set_label(self, label): 388 | vec = np.zeros(len(self.labelmap)) 389 | vec[0] = 1.0 390 | for x in label: 391 | vec[self.labelmap[x]] = 1.0 392 | return vec 393 | 394 | def get_ph(self): 395 | return self.n_k_v / self.n_k_v.sum(axis=1, keepdims=True) 396 | 397 | def training_iteration(self): 398 | docs = self.docs 399 | freqs = self.freqs 400 | zdn = self.z_dn 401 | labs = self.labs 402 | for d, (doc, freq, zet, lab) in enumerate(zip(docs, freqs, zdn, labs)): 403 | doc_n_d_k = self.n_d_k[d] 404 | for n, (v, f, z) in enumerate(zip(doc, freq, zet)): 405 | self.n_k_v[z, v] -= f 406 | doc_n_d_k[z] -= f 407 | self.n_zk[z] -= f 408 | 409 | a = doc_n_d_k + self.alpha 410 | num_b = self.n_k_v[:, v] + self.beta 411 | den_b = self.n_zk + self.V * self.beta 412 | 413 | prob = lab * a * (num_b/den_b) 414 | prob /= np.sum(prob) 415 | z_new 
= multinom_draw(1, prob).argmax() 416 | 417 | self.z_dn[d][n] = z_new 418 | 419 | self.n_k_v[z_new, v] += f 420 | doc_n_d_k[z_new] += f 421 | self.n_zk[z_new] += f 422 | 423 | def run_training(self, it=120, thinning=15): 424 | for i in range(it): 425 | self.training_iteration() 426 | s = (i+1) / thinning 427 | if s == int(s): 428 | print("Training iteration #", i+1) 429 | cur_ph = self.get_ph() 430 | if s > 1: 431 | m = (s-1)/s 432 | self.ph = m * self.ph + (1-m) * cur_ph 433 | else: 434 | self.ph = cur_ph 435 | 436 | 437 | def split_data(f="thesis_data.csv", d=3): 438 | a, b, c = load_corpus(f, d) 439 | 440 | zipped = list(zip(a, b)) 441 | np.random.shuffle(zipped) 442 | a, b, = zip(*zipped) 443 | 444 | split = int(len(a) * 0.9) 445 | train_data = (a[:split], b[:split], c) 446 | test_data = (a[split:], b[split:], c) 447 | return train_data, test_data 448 | 449 | 450 | def prune_dict(docs, lower=0.1, upper=0.9): 451 | dicti = dictionary.Dictionary(docs) 452 | lower *= len(docs) 453 | dicti.filter_extremes(no_above=upper, no_below=int(lower)) 454 | return dicti 455 | 456 | 457 | def train_it(train_data, it=150, s=12, l=0.02, u=0.98, al=0.001, be=0.001): 458 | a, b, c = train_data 459 | dicti = prune_dict(a, lower=l, upper=u) 460 | cascade = CascadeLDA(a, b, c, dicti, alpha=al, beta=be) 461 | cascade.go_down_tree(it=it, s=s) 462 | return cascade 463 | -------------------------------------------------------------------------------- /HSLDA.py: -------------------------------------------------------------------------------- 1 | import gensim.parsing.preprocessing as gensimm 2 | import numpy as np 3 | from scipy.stats import truncnorm 4 | import scipy 5 | import scipy.special 6 | multinom_draw = np.random.multinomial 7 | rvs = truncnorm.rvs 8 | 9 | 10 | def partition_label(lab, d): 11 | return [lab[:i+1] for i in range(d)] 12 | 13 | 14 | def phi(x): 15 | return 1/2 * (1 + scipy.special.erf(x / np.sqrt(2))) 16 | 17 | 18 | def vect_multinom(prob_matrix): 19 | s = prob_matrix.cumsum(axis=0) 20 | r = np.random.rand(prob_matrix.shape[1]) 21 | k = (s < r).sum(axis=0) 22 | return k 23 | 24 | 25 | def get_stirling_numbers(n): 26 | mat = np.identity(int(n)) 27 | mat[1, 0] = 0 28 | mat[2, 1] = 1 29 | for m in range(3, n): 30 | for k in range(1, m): 31 | l = mat[m-1, k-1] 32 | r = (m-1) * mat[m-1, k] 33 | mat[m, k] = l + r 34 | h = mat.max(axis=1) 35 | res = mat / h[:, None] 36 | return res 37 | 38 | 39 | def load_corpus(filename, d=3): 40 | import csv, sys, re 41 | 42 | # Increase max line length for csv.reader: 43 | max_int = sys.maxsize 44 | decrement = True 45 | while decrement: 46 | decrement = False 47 | try: 48 | csv.field_size_limit(max_int) 49 | except OverflowError: 50 | max_int = int(max_int/10) 51 | decrement = True 52 | 53 | docs = [] 54 | labs = [] 55 | labelmap = dict() 56 | pat = re.compile("[A-Z]\d{2}") 57 | f = open(filename, 'r') 58 | reader = csv.reader(f) 59 | for row in reader: 60 | doc = row[1] 61 | lab = row[2] 62 | if len(lab) > 3: 63 | lab = lab.split(" ") 64 | lab = list(filter(lambda i: pat.search(i), lab)) 65 | lab = [partition_label(x, d) for x in lab] 66 | lab = [item for sublist in lab for item in sublist] 67 | lab = list(set(lab)) 68 | for x in lab: 69 | labelmap[x] = 1 70 | else: 71 | lab = partition_label(lab, d) 72 | for x in lab: 73 | labelmap[x] = 1 74 | docs.append(doc) 75 | labs.append(lab) 76 | f.close() 77 | print("Stemming documents .... 
") 78 | docs = gensimm.preprocess_documents(docs) 79 | return docs, labs, list(labelmap.keys()) 80 | 81 | 82 | class HSLDA(object): 83 | def __init__(self, docs, labs, labelset, k=15, 84 | alpha_prime=1, alpha=1, gamma=1, mu=0, sigma=1, xi=0): 85 | 86 | self.labelmap = dict(zip(labelset, range(len(labelset)))) 87 | self.labelmap[''] = 0 88 | 89 | self.lablist = labelset 90 | 91 | self.aprime = alpha_prime 92 | self.alpha = alpha 93 | self.gamma = gamma 94 | self.mu = mu 95 | self.sigma = sigma 96 | self.xi = xi 97 | self.K = k 98 | 99 | self.vocab = [] 100 | self.w_to_v = dict() 101 | self.labs = np.array([self.set_label(lab) for lab in labs]) 102 | self.docs = [[self.term_to_id(term) for term in doc] for doc in docs] 103 | self.v_to_w = {v:w for w, v in self.w_to_v.items()} 104 | 105 | self.D = len(docs) 106 | self.L = len(self.labelmap) 107 | self.V = len(self.vocab) 108 | 109 | k_ones = np.repeat(1, self.K) 110 | v_ones = np.repeat(1, self.V) 111 | mu_par = self.mu * k_ones 112 | self.eta = np.random.normal(mu_par, 1, size=(self.L, self.K)) 113 | self.beta = np.random.dirichlet(self.aprime * k_ones) 114 | self.ph = np.random.dirichlet(self.gamma * v_ones, size=self.K) 115 | self.th = np.random.dirichlet(self.beta * self.alpha, size=self.D) 116 | 117 | self.z_dn = [] 118 | self.n_d_k = np.zeros((self.D, self.K), dtype=int) 119 | self.n_k_v = np.zeros((self.K, self.V), dtype=int) 120 | self.n_zk = np.zeros(self.K, dtype=int) 121 | 122 | for d, doc in enumerate(self.docs): 123 | nd = len(doc) 124 | prob = self.th[d, :] 125 | zets = np.random.choice(self.K, size=nd, p=prob) 126 | self.z_dn.append(zets) 127 | for v, z in zip(doc, zets): 128 | self.n_d_k[d, z] += 1 129 | self.n_k_v[z, v] += 1 130 | self.n_zk[z] += 1 131 | 132 | self.zbar = self.get_zbar() 133 | self.mean_a = np.dot(self.zbar, self.eta.T) 134 | 135 | border_left = np.where(self.labs == 1, -self.mean_a, -np.inf) 136 | border_right = np.where(self.labs == 1, np.inf, -self.mean_a) 137 | self.a = rvs(border_left, border_right, self.mean_a) 138 | 139 | parents = [x[:-1] for x in labelset] 140 | parents = [self.labelmap[x] for x in parents] 141 | own = [self.labelmap[x] for x in labelset] 142 | self.child_to_parent = dict(zip(own, parents)) 143 | 144 | self.stirling = get_stirling_numbers(150) 145 | self.mdot = np.zeros(self.K) 146 | self.m_aux = np.zeros((self.D, self.K)) 147 | 148 | def get_zbar(self): 149 | return self.n_d_k / self.n_d_k.sum(axis=1, keepdims=True) 150 | 151 | def get_ph(self): 152 | return self.n_k_v / self.n_k_v.sum(axis=1, keepdims=True) 153 | 154 | def set_label(self, label): 155 | l = len(self.labelmap) 156 | vec = np.zeros(l, dtype=int) 157 | vec[0] = 1 158 | for x in label: 159 | vec[self.labelmap[x]] = 1 160 | return vec 161 | 162 | def term_to_id(self, term): 163 | if term not in self.w_to_v: 164 | voca_id = len(self.vocab) 165 | self.w_to_v[term] = voca_id 166 | self.vocab.append(term) 167 | else: 168 | voca_id = self.w_to_v[term] 169 | return voca_id 170 | 171 | def sample_z(self, opt=1): 172 | """ 173 | Draws new values for all word-topic assignments in the corpus, based on 174 | Eq. (1) in Perotte '11 HSLDA paper. Two variations have been added 175 | for mathematical and theoretical precision and comparison 176 | (see :param opt below). 
177 | This function contains two loops: the outer loop collects doc-level 178 | data from the HSLDA-object to avoid lengthy and superfluous computation 179 | The inner loop uses those subsets to first deduct the current token's 180 | topic assignment in all relevant subsets, then calculate probabilities 181 | for k = 1, 2, ... K and then draw a random values, based on those probs 182 | opt=1 stands for Eq. (1) as presented in the paper. 183 | 184 | val_a: L' x 1 np.array(floats): 185 | The values of the running variable a. Only the 186 | relevant values for document d are used here 187 | mean_a: L' x 1 np.array(floats): 188 | The mean of the running variable a. That is, 189 | np.dot(zbar.T, eta). 190 | dif_mean: L' x K np.array(floats): 191 | This is the reduction in mean_a, due to new topic 192 | assignment z_{d,n}. This implicitly affects zbar, then 193 | np.dot(zbar, eta), which is mean_a. Every column 194 | represents the hypothetical change in mean_a caused 195 | by a reassignment of topic k. 196 | 197 | labs: L x 1 np.array(binary): 198 | An L-dimensional vector with zeros and ones, 199 | indicating whether label l is part of document d's 200 | labelset, or not 201 | relevant_labs: L' x 1 np.array(int): 202 | Vector containing the label ID of the labels in 203 | document d's labelset 204 | 205 | 206 | :param opt: 1 calculates p(a_{l,d} = x) for l positive labels only 207 | 2 calculates p(a_{l,d} > 0) for l positive labels only 208 | 3 calculates p(a_{l',d} > 0) for all l' positive label and 209 | p(a_{l'', d} < 0) for all l'' negative label 210 | :return: K-dimensional probability vector 211 | """ 212 | for d, doc in enumerate(self.docs): 213 | 214 | # Identify the labelset of document doc: 215 | labs = self.labs[d] 216 | if opt in [1, 2]: 217 | relevant_labs = np.where(labs == 1)[0] 218 | elif opt == 3: 219 | relevant_labs = range(self.L) 220 | 221 | # Select relevant data subsets in outer loop 222 | z_dn = self.z_dn[d] 223 | n_d_k = self.n_d_k[d, :] 224 | eta = self.eta[relevant_labs, :] 225 | val_a = self.a[d, relevant_labs, np.newaxis] 226 | mean_a = self.mean_a[d, relevant_labs, np.newaxis] 227 | 228 | # Calculate the implicit update of a's mean. 229 | n_d = len(doc) 230 | dif_mean = eta / n_d 231 | means_a = mean_a + dif_mean 232 | for n, v in enumerate(doc): 233 | # Find and deduct the word-topic assignment: 234 | old_z = z_dn[n] 235 | means_a[:, old_z] -= dif_mean[:, old_z] 236 | n_d_k[old_z] -= 1 237 | self.n_k_v[old_z, v] -= 1 238 | self.n_zk[old_z] -= 1 239 | 240 | # Calculate probability of first part of Eq. (1) 241 | l = n_d_k + self.alpha * self.beta 242 | r_num = self.n_k_v[:, v] + self.gamma 243 | r_den = self.n_zk + self.V * self.gamma 244 | p1 = l * r_num / r_den 245 | 246 | # Calculate probability of second part of Eq. 
(1) 247 | if opt == 1: 248 | p2 = np.exp((means_a - val_a) ** 2 * (-1 / 2)) 249 | elif opt in [2, 3]: 250 | labcheck = labs[relevant_labs] 251 | labcheck = labcheck[:, np.newaxis] 252 | means_a -= self.xi 253 | signed_mean = np.where(labcheck == 1, means_a, -means_a) 254 | p2 = phi(signed_mean) 255 | p2 *= 2 256 | p2 = p2.prod(axis=0) 257 | 258 | # Combine two parts and draw new word-topic assignment z_{d,n} 259 | prob = p1 * p2 260 | prob /= prob.sum() 261 | new_z = multinom_draw(1, prob).argmax() 262 | 263 | # Add back z_new to all relevant containers: 264 | z_dn[n] = new_z 265 | means_a[:, new_z] += dif_mean[:, new_z] 266 | n_d_k[new_z] += 1 267 | self.n_k_v[new_z, v] += 1 268 | self.n_zk[new_z] += 1 269 | self.n_d_k[d, :] = n_d_k 270 | self.z_dn[d] = z_dn 271 | self.zbar[d, :] = n_d_k / n_d 272 | self.mean_a = np.dot(self.zbar, self.eta.T) 273 | 274 | def sample_eta(self): 275 | sig_prior = np.identity(self.K) / self.sigma 276 | sig_data = np.dot(self.zbar.T, self.zbar) 277 | sigma_hat = scipy.linalg.inv(sig_prior + sig_data) 278 | 279 | mu_prior = self.mu / self.sigma 280 | mu_data = np.dot(self.zbar.T, self.a) 281 | raw_mean = mu_prior + mu_data 282 | mu_hat = np.dot(sigma_hat, raw_mean) 283 | 284 | for l in range(self.L): 285 | mu = mu_hat[:, l] 286 | eta_l = np.random.multivariate_normal(mu, sigma_hat) 287 | self.eta[l, :] = eta_l 288 | 289 | def sample_a(self): 290 | border_left = np.where(self.labs > 0, -self.mean_a, -np.inf) 291 | border_right = np.where(self.labs > 0, np.inf, -self.mean_a) 292 | self.a = rvs(border_left, border_right, self.mean_a) 293 | 294 | def sample_beta(self): 295 | param = self.mdot + self.aprime 296 | self.beta = np.random.dirichlet(param) 297 | 298 | def sample_m(self): 299 | ab = self.alpha * self.beta 300 | for d in range(self.D): 301 | n_d_k = self.n_d_k[d] 302 | for k, n_k in enumerate(n_d_k): 303 | if n_k-1 > self.stirling.shape[0]: 304 | self.stirling = get_stirling_numbers(n_k+1) 305 | ms = self.stirling[n_k, :(n_k+1)] 306 | m_probs = [s * ab[k]**m for m, s in enumerate(ms)] 307 | m_probs /= sum(m_probs) 308 | draw = np.random.choice(m_probs) 309 | self.m_aux[d, k] = draw 310 | self.mdot = self.m_aux.mean(axis=0) 311 | 312 | def run_training(self, it=25, thinning=5, opt=1): 313 | for i in range(it): 314 | self.sample_z(opt=opt) 315 | self.sample_eta() 316 | self.sample_a() 317 | self.sample_m() 318 | self.sample_beta() 319 | s = ((i+1) / thinning) 320 | if s == int(s): 321 | print("Training iteration #", i) 322 | p = i / it * 100 323 | print("Progress is %.2f %%" % p) 324 | print("-----") 325 | cur_ph = self.get_ph() 326 | cur_th = self.get_zbar() 327 | if s > 1: 328 | m = (s-1)/s 329 | self.ph = m * self.ph + (1-m) * cur_ph 330 | self.th = m * self.th + (1-m) * cur_th 331 | else: 332 | self.ph = cur_ph 333 | self.th = cur_th 334 | 335 | def z_for_newdoc(self, newdoc): 336 | newdoc = [self.term_to_id(t) for t in newdoc if t in self.w_to_v] 337 | prob_matrix = self.ph[:, newdoc] 338 | prob_matrix /= prob_matrix.sum(axis=0, keepdims=True) 339 | z_dn = vect_multinom(prob_matrix) 340 | n_d_k = np.zeros(self.K) 341 | for z in z_dn: 342 | n_d_k[z] += 1 343 | 344 | return z_dn, n_d_k, newdoc 345 | 346 | def run_test(self, newdoc, it=250, s=25): 347 | z_dn, n_d_k, newdoc = self.z_for_newdoc(newdoc) 348 | ph_hat = self.n_k_v + self.gamma 349 | ph_hat = ph_hat / ph_hat.sum(axis=1, keepdims=True) 350 | n_d = len(newdoc) 351 | for i in range(it): 352 | for n, v in enumerate(newdoc): 353 | # Find and deduct the word-topic assignment: 354 | old_z = z_dn[n] 355 
| n_d_k[old_z] -= 1 356 | 357 | # Calculate probability of first part of Eq. (1) 358 | l = n_d_k + self.alpha * self.beta 359 | r = ph_hat[:, v] 360 | p1 = l * r 361 | p1 /= p1.sum() 362 | new_z = multinom_draw(1, p1).argmax() 363 | 364 | z_dn[n] = new_z 365 | n_d_k[new_z] += 1 366 | 367 | c = ((i+1) / s) 368 | if c == int(c): 369 | cur_th = n_d_k / n_d 370 | if c > 1: 371 | m = (c-1)/c 372 | zbar = m * zbar + (1-m) * cur_th 373 | else: 374 | zbar = cur_th 375 | means_a = np.dot(self.eta, zbar) 376 | means_a -= self.xi 377 | probs = phi(means_a) 378 | return probs 379 | 380 | def display_topics(self, n=10): 381 | top_v = np.argsort(-self.ph)[:, :n] 382 | return [[self.v_to_w[v] for v in top] for top in top_v] 383 | 384 | def label_predictions(self, probs): 385 | return sorted(zip(probs, self.lablist))[::-1] 386 | 387 | def run_tests(self, newdocs, it=250, s=25): 388 | if len(newdocs) == 1: 389 | return self.run_test(newdocs, it=it, s=s) 390 | else: 391 | lab_probs = np.empty((len(newdocs), self.L)) 392 | for d, doc in enumerate(newdocs): 393 | lab_probs[d, :] = self.run_test(doc, it=it, s=s) 394 | return lab_probs 395 | 396 | 397 | def split_data(f="thesis_data.csv", d=3): 398 | a, b, c = load_corpus(filename=f, d=d) 399 | split = int(len(a) * 0.9) 400 | 401 | train_data = (a[:split], b[:split], c) 402 | test_data = (a[split:], b[split:], c) 403 | return train_data, test_data 404 | 405 | 406 | def train_it(traindata, it=150, s=25, opt=1): 407 | a, b, c = traindata[0], traindata[1], traindata[2] 408 | hs = HSLDA(a, b, c) 409 | hs.run_training(it=it, thinning=s, opt=opt) 410 | return hs 411 | 412 | 413 | def test_it(model, testdata, it=500, s=25): 414 | testdocs = testdata[0] 415 | testdocs = [[x for x in doc if x in model.vocab] for doc in testdocs] 416 | lab_probs = model.run_tests(testdocs, it=it, s=s) 417 | return lab_probs 418 | -------------------------------------------------------------------------------- /LabeledLDA.py: -------------------------------------------------------------------------------- 1 | import gensim.parsing.preprocessing as gensimm 2 | from gensim.corpora import dictionary 3 | import numpy as np 4 | from numpy.random import multinomial as multinom_draw 5 | 6 | 7 | def load_corpus(filename, d): 8 | import csv, sys, re 9 | 10 | # Increase max line length for csv.reader: 11 | max_int = sys.maxsize 12 | decrement = True 13 | while decrement: 14 | decrement = False 15 | try: 16 | csv.field_size_limit(max_int) 17 | except OverflowError: 18 | max_int = int(max_int/10) 19 | decrement = True 20 | 21 | docs = [] 22 | labs = [] 23 | labelmap = dict() 24 | pat = re.compile("[A-Z]\d{2}") 25 | f = open(filename, 'r') 26 | reader = csv.reader(f) 27 | for row in reader: 28 | doc = row[1] 29 | lab = row[2] 30 | if len(lab) > 3: 31 | lab = lab.split(" ") 32 | lab = list(filter(lambda i: pat.search(i), lab)) 33 | lab = [x[:d] for x in lab] 34 | for x in lab: 35 | labelmap[x] = 1 36 | else: 37 | lab = lab[:d] 38 | labelmap[lab] = 1 39 | lab = [lab] 40 | lab = list(set(lab)) 41 | docs.append(doc) 42 | labs.append(lab) 43 | f.close() 44 | print("Stemming documents ....") 45 | docs = gensimm.preprocess_documents(docs) 46 | return docs, labs, list(labelmap.keys()) 47 | 48 | 49 | class LabeledLDA(object): 50 | def __init__(self, docs, labs, labelset, dicti, alpha, beta): 51 | labelset.insert(0, 'root') 52 | self.labelmap = dict(zip(labelset, range(len(labelset)))) 53 | self.K = len(self.labelmap) 54 | self.dicti = dicti 55 | 56 | self.alpha = alpha 57 | self.beta = beta 58 | 59 | 
self.vocab = list(dicti.values()) 60 | self.w_to_v = dicti.token2id 61 | self.v_to_w = dicti.id2token 62 | 63 | self.labs = np.array([self.set_label(lab) for lab in labs]) 64 | self.doc_tups = [dicti.doc2bow(x) for x in docs] 65 | 66 | self.D = len(docs) 67 | self.V = len(self.vocab) 68 | 69 | self.ph_hat = np.zeros((self.K, self.V), dtype=float) 70 | self.th_hat = np.zeros((self.D, self.K), dtype=float) 71 | self.cur_perplx = [] 72 | 73 | self.z_dn = [] 74 | self.n_zk = np.zeros(self.K, dtype=int) 75 | self.n_d_k = np.zeros((self.D, self.K), dtype=int) 76 | self.n_k_v = np.zeros((self.K, self.V), dtype=int) 77 | 78 | self.docs = [] 79 | self.freqs = [] 80 | for d, (doc, lab) in enumerate(zip(self.doc_tups, self.labs)): 81 | ids, freqs = zip(*doc) 82 | self.docs.append(list(ids)) 83 | self.freqs.append(list(freqs)) 84 | 85 | ld = len(doc) 86 | prob = lab/lab.sum() 87 | zets = np.random.choice(self.K, size=ld, p=prob) 88 | self.z_dn.append(zets) 89 | for v, z, freq in zip(ids, zets, freqs): 90 | self.n_zk[z] += freq 91 | self.n_d_k[d, z] += freq 92 | self.n_k_v[z, v] += freq 93 | 94 | def set_label(self, label): 95 | vec = np.zeros(len(self.labelmap)) 96 | vec[0] = 1.0 97 | for x in label: 98 | vec[self.labelmap[x]] = 1.0 99 | return vec 100 | 101 | def training_iteration(self): 102 | docs = self.docs 103 | freqs = self.freqs 104 | zdn = self.z_dn 105 | labs = self.labs 106 | for d, (doc, freq, zet, lab) in enumerate(zip(docs, freqs, zdn, labs)): 107 | doc_n_d_k = self.n_d_k[d] 108 | for n, (v, f, z) in enumerate(zip(doc, freq, zet)): 109 | self.n_k_v[z, v] -= f 110 | doc_n_d_k[z] -= f 111 | self.n_zk[z] -= f 112 | 113 | a = doc_n_d_k + self.alpha 114 | num_b = self.n_k_v[:, v] + self.beta 115 | den_b = self.n_zk + self.V * self.beta 116 | 117 | prob = lab * a * (num_b/den_b) 118 | prob /= np.sum(prob) 119 | z_new = multinom_draw(1, prob).argmax() 120 | 121 | self.z_dn[d][n] = z_new 122 | 123 | self.n_k_v[z_new, v] += f 124 | doc_n_d_k[z_new] += f 125 | self.n_zk[z_new] += f 126 | 127 | def run_training(self, iters, thinning): 128 | for n in range(iters): 129 | self.training_iteration() 130 | print('Running iteration # %d ' % (n+1)) 131 | if (n+1) % thinning == 0: 132 | cur_ph = self.get_phi() 133 | cur_th = self.get_theta() 134 | 135 | cur_perp = self.perplexity() 136 | self.cur_perplx.append(cur_perp) 137 | 138 | s = (n+1) / thinning 139 | if s == 1: 140 | self.ph_hat = cur_ph 141 | self.th_hat = cur_th 142 | elif s > 1: 143 | factor = (s-1)/s 144 | self.ph_hat = factor*self.ph_hat + (1/s * cur_ph) 145 | self.th_hat = factor*self.th_hat + (1/s * cur_th) 146 | if np.any(self.ph_hat < 0): 147 | raise ValueError('A negative value occurred in self.ph_hat' 148 | 'while saving iteration %d ' % n) 149 | if np.any([np.isnan(x) for x in self.ph_hat]): 150 | raise ValueError('A nan has creeped into ph_hat') 151 | wordload = self.ph_hat.sum(axis=0) 152 | if np.any([x == 0 for x in wordload]): 153 | raise ValueError('A word in dictionary has no z-value') 154 | 155 | def prep4test(self, doc): 156 | doc_tups = self.dicti.doc2bow(doc) 157 | doc, freqs = zip(*doc_tups) 158 | 159 | z_dn = [] 160 | n_dk = np.zeros(self.K, dtype=int) 161 | 162 | probs = self.ph_hat[:, doc] 163 | with np.errstate(divide="raise", invalid="raise"): 164 | try: 165 | probs /= probs.sum(axis=0) 166 | except FloatingPointError: 167 | probs = 1/self.K * np.ones_like(probs) 168 | for n, f in enumerate(freqs): 169 | prob = probs[:, n] 170 | while prob.sum() > 1: 171 | prob /= 1.0000000005 172 | new_z = multinom_draw(1, prob).argmax() 
173 | 174 | z_dn.append(new_z) 175 | n_dk[new_z] += f 176 | start_state = (doc, freqs, z_dn, n_dk) 177 | return start_state 178 | 179 | def run_test(self, newdocs, it, thinning): 180 | nr = len(newdocs) 181 | th_hat = np.zeros((nr, self.K), dtype=float) 182 | for d, newdoc in enumerate(newdocs): 183 | doc, freqs, z_dn, n_dk = self.prep4test(newdoc) 184 | for i in range(it): 185 | for n, (v, f, z) in enumerate(zip(doc, freqs, z_dn)): 186 | n_dk[z] -= f 187 | 188 | num_a = n_dk + self.alpha 189 | b = self.ph_hat[:, v] 190 | prob = num_a * b 191 | prob /= prob.sum() 192 | while prob.sum() > 1: 193 | prob /= 1.0000005 194 | new_z = multinom_draw(1, prob).argmax() 195 | 196 | z_dn[n] = new_z 197 | n_dk[new_z] += f 198 | 199 | # Save the current state in MC chain and calc. average state: 200 | # Only the document-topic distribution estimate theta is saved 201 | s = (i + 1) / thinning 202 | s2 = int(s) 203 | if s == s2: 204 | this_state = n_dk / n_dk.sum() 205 | if s2 == 1: 206 | avg_state = this_state 207 | else: 208 | old = (s2 - 1) / s2 * avg_state 209 | new = (1 / s2) * this_state 210 | avg_state = old + new 211 | th_hat[d, :] = avg_state 212 | return th_hat 213 | 214 | def get_pred(self, single_th, n=5): 215 | labs = np.array(list(self.labelmap.keys())) 216 | top_tops = np.argsort(-single_th)[:n] 217 | top_load = np.flip(np.sort(single_th), axis=0)[:n] 218 | 219 | top_tops = labs[top_tops] 220 | return list(zip(top_tops, top_load)) 221 | 222 | def get_preds(self, all_th, n=5): 223 | preds = [] 224 | nr = all_th.shape[0] 225 | for d in range(nr): 226 | one_th = all_th[d, :] 227 | pred = self.get_pred(one_th, n) 228 | preds.append(pred) 229 | return preds 230 | 231 | def get_phi(self): 232 | num = self.n_k_v + self.beta 233 | den = self.n_zk[:, np.newaxis] + self.V * self.beta 234 | return num / den 235 | 236 | def get_theta(self): 237 | num = self.n_d_k + self.labs * self.alpha 238 | den = num.sum(axis=1)[:, np.newaxis] 239 | return num / den 240 | 241 | def topwords_per_topic(self, topwords=10): 242 | n = topwords 243 | ph = self.get_phi() 244 | topiclist = [] 245 | label_list = list(self.labelmap.keys()) 246 | for k in range(self.K): 247 | v_inds = np.argsort(-ph[k, :])[:n] 248 | top_n = [self.v_to_w[x] for x in v_inds] 249 | 250 | topic_name = label_list[k] 251 | top_n.insert(0, topic_name) 252 | 253 | topiclist += [top_n] 254 | return topiclist 255 | 256 | def perplexity(self): 257 | phis = self.get_phi() 258 | thetas = self.get_theta() 259 | 260 | log_per = l = 0 261 | for doc, th in zip(self.docs, thetas): 262 | for w in doc: 263 | log_per -= np.log(np.inner(phis[:, w], th)) 264 | l += len(doc) 265 | return np.exp(log_per / l) 266 | 267 | 268 | def split_data(f, d=2): 269 | a, b, c = load_corpus(f, d) 270 | 271 | zipped = list(zip(a, b)) 272 | np.random.shuffle(zipped) 273 | a, b = zip(*zipped) 274 | 275 | split = int(len(a) * 0.9) 276 | train_data = (a[:split], b[:split], c) 277 | test_data = (a[split:], b[split:], c) 278 | return train_data, test_data 279 | 280 | 281 | def prune_dict(docs, lower=0.1, upper=0.9): 282 | dicti = dictionary.Dictionary(docs) 283 | lower *= len(docs) 284 | dicti.filter_extremes(no_above=upper, no_below=lower) 285 | return dicti 286 | 287 | 288 | def train_it(traindata, it=30, s=3, al=0.001, be=0.001, l=0.05, u=0.95): 289 | a, b, c = traindata 290 | dicti = prune_dict(a, lower=l, upper=u) 291 | llda = LabeledLDA(a, b, c, dicti, al, be) 292 | llda.run_training(it, s) 293 | return llda 294 | 295 | 296 | def test_it(model, testdata, it=500, thinning=25, n=5): 
297 | testdocs = testdata[0] 298 | testdocs = [[x for x in doc if x in model.vocab] for doc in testdocs] 299 | th_hat = model.run_test(testdocs, it, thinning) 300 | preds = model.get_preds(th_hat, n) 301 | th_hat = [[round(x, 4) for x in single_th] for single_th in th_hat] 302 | return th_hat, preds 303 | -------------------------------------------------------------------------------- /LocalLDA.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import gensim 4 | import re 5 | 6 | from numpy.random import multinomial as multinom_draw 7 | from gensim.parsing.preprocessing import STOPWORDS as stopwords 8 | from nltk.stem import WordNetLemmatizer 9 | 10 | 11 | class LocalLDA: 12 | def __init__(self, docs, alpha, beta, K, 13 | localLDA=True, lemma=True, stem=False): 14 | self.a = alpha 15 | self.b = beta 16 | 17 | if localLDA: 18 | sentences = [] 19 | for doc in docs: 20 | s = splitdocs(doc) 21 | sentences.extend(s) 22 | docs = sentences 23 | 24 | # Preprocess the documents, create word2id mapping & map words to IDs 25 | prepped_corp = prep_docs(docs, stem=stem, lemma=lemma) 26 | self.word2id = gensim.corpora.dictionary.Dictionary(prepped_corp) 27 | self.doc_tups = [self.word2id.doc2bow(doc) for doc in prepped_corp] 28 | self.doc_tups = [doc for doc in self.doc_tups if len(doc) > 1] 29 | 30 | # Gather some general LDA parameters 31 | self.V = len(self.word2id) 32 | self.K = K 33 | self.D = len(self.doc_tups) 34 | 35 | self.w_to_v = self.word2id.token2id 36 | self.v_to_w = self.word2id 37 | 38 | self.z_dn = [] 39 | self.n_zk = np.zeros(self.K, dtype=int) 40 | self.n_d_k = np.zeros((self.D, self.K), dtype=int) 41 | self.n_k_v = np.zeros((self.K, self.V), dtype=int) 42 | 43 | self.docs = [] 44 | self.freqs = [] 45 | for d, doctup in enumerate(self.doc_tups): 46 | ids, freqs = zip(*doctup) 47 | self.docs.append(list(ids)) 48 | self.freqs.append(list(freqs)) 49 | 50 | zets = np.random.choice(self.K, self.K) 51 | self.z_dn.append(zets) 52 | for v, z, freq in zip(ids, zets, freqs): 53 | self.n_zk[z] += freq 54 | self.n_d_k[d, z] += freq 55 | self.n_k_v[z, v] += freq 56 | 57 | self.th_hat = None # will be filled during training 58 | self.ph_hat = None # will be filled during training 59 | 60 | def training_iteration(self): 61 | docs = self.docs 62 | freqs = self.freqs 63 | 64 | zdn = self.z_dn 65 | for d, (doc, freq, zet) in enumerate(zip(docs, freqs, zdn)): 66 | doc_n_d_k = self.n_d_k[d] 67 | for n, (v, f, z) in enumerate(zip(doc, freq, zet)): 68 | self.n_k_v[z, v] -= f 69 | doc_n_d_k[z] -= f 70 | self.n_zk[z] -= f 71 | 72 | a = doc_n_d_k + self.a 73 | num_b = self.n_k_v[:, v] + self.b 74 | den_b = self.n_zk + self.V * self.b 75 | 76 | prob = a * (num_b / den_b) 77 | prob /= np.sum(prob) 78 | z_new = multinom_draw(1, prob).argmax() 79 | 80 | self.z_dn[d][n] = z_new 81 | 82 | self.n_k_v[z_new, v] += f 83 | doc_n_d_k[z_new] += f 84 | self.n_zk[z_new] += f 85 | 86 | def run_training(self, iters, thinning): 87 | for n in range(iters): 88 | self.training_iteration() 89 | print('Running iteration # %d ' % (n + 1)) 90 | if (n + 1) % thinning == 0: 91 | cur_ph = self.get_phi() 92 | cur_th = self.get_theta() 93 | 94 | s = (n + 1) / thinning 95 | if s == 1: 96 | self.ph_hat = cur_ph 97 | self.th_hat = cur_th 98 | elif s > 1: 99 | factor = (s - 1) / s 100 | self.ph_hat = factor * self.ph_hat + (1 / s * cur_ph) 101 | self.th_hat = factor * self.th_hat + (1 / s * cur_th) 102 | if np.any(self.ph_hat < 0): 103 | raise ValueError('A 
negative value occurred in self.ph_hat' 104 | 'while saving iteration %d ' % n) 105 | if np.any([np.isnan(x) for x in self.ph_hat]): 106 | raise ValueError('A NaN has crept into ph_hat') 107 | wordload = self.ph_hat.sum(axis=0) 108 | if np.any([x == 0 for x in wordload]): 109 | raise ValueError('A word in dictionary has no z-value') 110 | 111 | def get_phi(self): 112 | num = self.n_k_v + self.b 113 | den = self.n_zk[:, np.newaxis] + self.V * self.b 114 | return num / den 115 | 116 | def get_theta(self): 117 | num = self.n_d_k + self.a 118 | den = num.sum(axis=1)[:, np.newaxis] 119 | return num / den 120 | 121 | def print_topwords(self, n=10): 122 | ph = self.get_phi() 123 | topiclist = [] 124 | for k in range(self.K): 125 | v_ind = np.argsort(-ph[k, :])[:n] 126 | top_n = [self.v_to_w[x] for x in v_ind] 127 | top_n.insert(0, str(k)) 128 | topiclist += [top_n] 129 | print(topiclist) 130 | 131 | 132 | 133 | def prep_docs(docs, stem=False, lemma=True): 134 | return [prep_doc(doc, stem=stem, lemma=lemma) for doc in docs] 135 | 136 | 137 | def prep_doc(doc, stem=False, lemma=True): 138 | doc = doc.lower() 139 | doc = re.sub(r'[^\w\s]', '', doc) 140 | doc = doc.split() 141 | # remove stopwords and short words 142 | doc = [word for word in doc if word not in stopwords and len(word) > 2] 143 | 144 | if stem: 145 | p = gensim.parsing.PorterStemmer() 146 | return [p.stem(word) for word in doc] 147 | elif lemma: 148 | lm = WordNetLemmatizer() 149 | return [lm.lemmatize(word, pos='v') for word in doc] 150 | else: 151 | return doc 152 | 153 | 154 | def splitdocs(doc): 155 | sentences = re.split(r'!|\.|\?|,|-', doc) 156 | return sentences 157 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Usage 2 | This code can be used for multi-label topic modelling with prior knowledge. It uses Latent Dirichlet Allocation (LDA) as a baseline and implements the following LDA-based models: 3 | 4 | 1) Labeled LDA (Ramage et al., 2009) 5 | 2) Hierarchical Supervised LDA (Perotte et al., 2011) 6 | 3) CascadeLDA 7 | 4) LocalLDA 8 | 9 | The workflow of each model is roughly divided into four parts: loading and preparing the data, training a model, testing the model, and finally evaluating its predictive quality. 10 | 11 | I also added a training option for LocalLDA, a sentence-based version of LDA. It is very useful for short texts such as online reviews, but less so for longer, more coherent texts. 12 | 13 | ## Input 14 | 15 | Each model takes a `.csv` file as input. Each line must consist of three columns: 16 | 17 | Column 1) Document ID 18 | Column 2) One string containing the entire document 19 | Column 3) All labels in a single string, separated by spaces 20 | 21 | See `abstracts_data.csv` for an example. Any other structure will not be accepted as input. 22 | 23 | ## How to run & output 24 | 25 | To run Labeled LDA, see the example below. To run CascadeLDA instead, simply replace `evaluate_LabeledLDA.py` with `evaluate_CascadeLDA.py`.
26 | 27 | 28 | ``` 29 | $ python3 evaluate_LabeledLDA.py --help 30 | 31 | Usage: evaluate_LabeledLDA.py [options] 32 | 33 | Options: 34 | -h, --help show this help message and exit 35 | -f FILE dataset location 36 | -d LVL depth of lab level 37 | -i IT # of iterations 38 | -s THINNING save frequency 39 | -l LOWER lower threshold for dictionary pruning 40 | -u UPPER upper threshold for dictionary pruning 41 | -a ALPHA alpha prior 42 | -b BETA beta prior 43 | -p Save the model as pickle? 44 | 45 | ``` 46 | 47 | So, for example: 48 | 49 | ``` 50 | $ python3 evaluate_LabeledLDA.py -f "abstracts_data.csv" -d 3 -i 4 -s 4 -l 0 -u 1 -a 0.1 -b 0.01 -p 51 | 52 | Stemming documents .... 53 | Starting training... 54 | Running iteration # 1 55 | Running iteration # 2 56 | Running iteration # 3 57 | Running iteration # 4 58 | Testing test data, this may take a while... 59 | Saved the model and predictions as pickles! 60 | Model: Labeled LDA 61 | Corpus: Abstracts 62 | Label depth 3 63 | # of Gibbs samples: 4 64 | ----------------------------------- 65 | AUC ROC: 0.696858414365 66 | one error: 0.47198275862068967 67 | two error: 0.5862068965517241 68 | F1 score (macro average) 0.378575246979 69 | 70 | ``` 71 | 72 | ## Datasets 73 | 74 | Two datasets were used in the thesis. For copyright reasons, only the abstracts dataset is made available here. It consists of 4,500 labeled academic abstracts from the economics literature. The papers are labeled according to the JEL classification. 75 | 76 | 77 | # Multilabel hierarchical topic modelling with prior knowledge 78 | 79 | ## CascadeLDA - Thesis abstract 80 | 81 | A new multi-label document classification technique called CascadeLDA is introduced 82 | in this thesis. Rather than focusing on discriminative modelling techniques, CascadeLDA 83 | extends a baseline generative model by incorporating two types of prior information. 84 | Firstly, knowledge from a labeled training dataset is used to direct the generative model. 85 | Secondly, the implicit tree structure of the labels is exploited to emphasise discriminative 86 | features between closely related labels. By segregating the classification problem into an 87 | ensemble of smaller problems, out-of-sample results are achieved at about 25 times the 88 | speed of the baseline model. In this thesis, CascadeLDA is applied to datasets of 89 | academic abstracts and full academic papers. The model is employed to assist authors in 90 | tagging their newly published articles. 91 | 92 | A formal and detailed coverage of baseline LDA, L-LDA, HSLDA and CascadeLDA can be found in `thesis_kenhbs.pdf`. The paper also gives an in-depth explanation and derivation of Gibbs sampling and variational inference in the LDA setting. 93 | 94 | 95 | ## Summary of Challenges 96 | 97 | To solve the classification problem for academic papers, the main extensions to LDA can be summarised in the following categories: 98 | 99 | 1) Instead of latent topics, we need the topics to correspond exactly to the JEL code descriptions (i.e. explicit topic modelling). 100 | 2) Incorporating prior knowledge on document-topic assignments (i.e. we have a training dataset). 101 | 3) Many labels are very closely related and barely distinguishable. Even though topic-word distributions are accurate, they are nearly identical and do not allow for discrimination. The label tree that these models exploit is sketched below.
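The label tree referred to in point 3 comes directly from the hierarchical structure of the JEL codes. As a minimal illustration (not part of the repository itself), the snippet below shows how `partition_label`, defined in `CascadeLDA.py` and `HSLDA.py`, expands a depth-3 code into itself and its ancestors; `E52` is used purely as an example code and is not necessarily present in the dataset.

```python
# Minimal sketch of the label tree used by the hierarchical models.
# partition_label is taken from CascadeLDA.py / HSLDA.py; "E52" is an
# arbitrary example of a depth-3 JEL code.
def partition_label(lab, d):
    return [lab[:i+1] for i in range(d)]

print(partition_label("E52", 3))  # ['E', 'E5', 'E52']
print(partition_label("C1", 2))   # ['C', 'C1']
```

CascadeLDA fits one local, label-restricted LDA model per internal node of this tree (see `go_down_tree` in `CascadeLDA.py`), which is the "ensemble of smaller problems" mentioned in the thesis abstract above.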
102 | 103 | ## License 104 | 105 | The code and thesis are licensed under Attribution-NonCommercial-ShareAlike 3.0 Germany (CC BY-NC-SA 3.0 DE) 106 | -------------------------------------------------------------------------------- /evaluate_CascadeLDA.py: -------------------------------------------------------------------------------- 1 | from CascadeLDA import * 2 | from sklearn.metrics import auc 3 | from optparse import OptionParser 4 | import pickle 5 | 6 | 7 | def one_roc(prob, real_binary): 8 | resorted = np.argsort(prob)[::-1] 9 | 10 | reals = real_binary[resorted] 11 | probs = prob[resorted] 12 | thresholds = np.sort(list(set(probs)))[::-1] 13 | 14 | tp = [] 15 | tn = [] 16 | fp = [] 17 | fn = [] 18 | for c in thresholds: 19 | preds = [1 if x >= c else 0 for x in probs] 20 | zipped = list(zip(preds, reals)) 21 | 22 | tp_pre = sum([x == y for (x, y) in zipped if x == 1]) 23 | tn_pre = sum([x == y for (x, y) in zipped if x == 0]) 24 | fp_pre = sum([x != y for (x, y) in zipped if x == 1]) 25 | fn_pre = sum([x != y for (x, y) in zipped if x == 0]) 26 | 27 | tp.append(tp_pre) 28 | tn.append(tn_pre) 29 | fp.append(fp_pre) 30 | fn.append(fn_pre) 31 | return tp, tn, fp, fn 32 | 33 | 34 | def fpr_tpr(tp, fp, tn, fn): 35 | fpr = [x / (x + y) for (x, y) in zip(fp, tn)] 36 | tpr = [x / (x + y) for (x, y) in zip(tp, fn)] 37 | return fpr, tpr 38 | 39 | 40 | def precision_recall(tp, fp, tn, fn): 41 | precis = [x / (x + y) for (x, y) in zip(tp, fp)] 42 | recall = [x / (x + y) for (x, y) in zip(tp, fn)] 43 | return precis, recall 44 | 45 | 46 | def rates(y_prob, y_real_binary): 47 | tps = [] 48 | tns = [] 49 | fps = [] 50 | fns = [] 51 | fprs = [] 52 | tprs = [] 53 | for d_prob, d_real in zip(y_prob, y_real_binary): 54 | tp, tn, fp, fn = one_roc(d_prob, d_real) 55 | fpr, tpr = fpr_tpr(tp, fp, tn, fn) 56 | 57 | tps.append(tp) 58 | tns.append(tn) 59 | fps.append(fp) 60 | fns.append(fn) 61 | fprs.append(fpr) 62 | tprs.append(tpr) 63 | return tps, tns, fps, fns, fprs, tprs 64 | 65 | 66 | def macro_auc_roc(fprs, tprs): 67 | areas_under_curve = [auc(fpr, tpr) for (fpr, tpr) in zip(fprs, tprs)] 68 | return np.mean(areas_under_curve) 69 | 70 | 71 | def n_error(th_hat, y_real_binary, n): 72 | ndocs = th_hat.shape[0] 73 | counter = 0 74 | for i in range(ndocs): 75 | ordered = np.argsort(th_hat[i, :])[::-1] 76 | toplabs = ordered[:n] 77 | sub_y = y_real_binary[i, :] 78 | hit = sum(sub_y[toplabs]) > 0 79 | if hit: 80 | counter += 1 81 | return counter / ndocs 82 | 83 | 84 | def get_f1(tps, fps, tns, fns): 85 | f1 = [] 86 | for tp, fp, tn, fn in zip(tps, fps, tns, fns): 87 | prec, rec = precision_recall(tp, fp, tn, fn) 88 | with np.errstate(invalid='ignore'): 89 | raw_f1 = [(2 * p * r)/(p + r) for p, r in zip(prec, rec)] 90 | opt_f1 = np.nanmax(raw_f1) 91 | f1.append(opt_f1) 92 | return np.mean(f1) 93 | 94 | 95 | def setup_theta(l1p, l2p, l3p, model): 96 | # Start adding the lowest labs and just add the 'rest', too. 
It will be 97 | # overwritten later on with the correct value from the upper level 98 | n = len(l1p) 99 | k = len(model.labelmap) 100 | th_hat = np.zeros((n, k), dtype=float) 101 | 102 | for d in range(n): 103 | sub_th = th_hat[d, :] 104 | levels = dict() 105 | for tuplist in l3p[d]: 106 | levels.update(tuplist) 107 | for tuplist in l2p[d]: 108 | levels.update(tuplist) 109 | levels.update(l1p[d]) 110 | 111 | # Multiple probs of local scope with the prob of upper level: 112 | predecessors = [s for (s, t) in l1p[d]] 113 | lookup = " ".join(list(levels.keys())) 114 | for p in predecessors: 115 | pat = re.compile("(" + p + "[0-9])(?:[^0-9]|$)") 116 | currents = re.findall(pat, lookup) 117 | for c in currents: 118 | levels[c] *= levels[p] 119 | pat = re.compile(c + "[0-9]") 120 | finals = re.findall(pat, lookup) 121 | for f in finals: 122 | levels[f] *= levels[c] 123 | 124 | labs, probs = zip(*levels.items()) 125 | inds = [model.labelmap[x] for x in labs] 126 | sub_th[inds] = probs 127 | return th_hat 128 | 129 | 130 | def binary_yreal(label_strings, label_dict): 131 | ndoc = len(label_strings) 132 | ntop = len(label_dict) 133 | y_true = np.zeros((ndoc, ntop), dtype=int) 134 | for d, lab in enumerate(label_strings): 135 | for l in lab: 136 | try: 137 | ind = label_dict[l] 138 | y_true[d, ind] = 1 139 | except KeyError: 140 | pass 141 | return y_true 142 | 143 | 144 | def main(): 145 | parser = OptionParser() 146 | parser.add_option("-f", dest="file", help="dataset location") 147 | parser.add_option("-d", dest="lvl", type="int", 148 | help="depth of label level", default=3) 149 | parser.add_option("-i", dest="it", type="int", 150 | help="# of iterations - train and test") 151 | parser.add_option("-s", dest="thinning", type="int", 152 | help="inter saving frequency", default=0) 153 | parser.add_option("-a", dest="alpha", type="float", help="alpha prior", 154 | default=0.1) 155 | parser.add_option("-b", dest="beta", type="float", help="beta prior", 156 | default=0.01) 157 | parser.add_option("-l", dest="lower", type="float", 158 | help="lower threshold for dictionary pruning", default=0) 159 | parser.add_option("-u", dest="upper", type="float", 160 | help="upper threshold for dictionary pruning", default=1) 161 | parser.add_option("-p", action="store_true", dest="pickle", 162 | help="save pickle of model?", default=False) 163 | 164 | (opt, arg) = parser.parse_args() 165 | 166 | if opt.thinning == 0: 167 | opt.thinning = opt.it 168 | train, test = split_data(f=opt.file) 169 | model = train_it(train, it=opt.it, s=opt.thinning, 170 | l=opt.lower, u=opt.upper, al=opt.alpha, be=opt.beta) 171 | 172 | print("Testing test data, this may take a while") 173 | l1, l2, l3 = zip(*[model.test_down_tree(x, it=opt.it, thinning=opt.thinning, threshold=0.95) for x in test[0]]) 174 | if opt.pickle: 175 | pickle.dump(model, open("Cascade_model.pkl", "wb")) 176 | pickle.dump(test, open("Cascade_testset.pkl", "wb")) 177 | pickle.dump(l1, open("Cascade_d1_pred.pkl", "wb")) 178 | pickle.dump(l2, open("Cascade_d2_pred.pkl", "wb")) 179 | pickle.dump(l3, open("Cascade_d3_pred.pkl", "wb")) 180 | print("Saved the model and predictions as pickles!") 181 | 182 | # Evaluate quality for all label depths: 183 | d = int(opt.lvl) 184 | label_depths = list(range(1, d+1)) 185 | for depth in label_depths: 186 | c = "Full texts" 187 | if opt.file == "thesis_data3.csv": 188 | c = "Abstracts" 189 | 190 | print("Model: CascadeLDA") 191 | print("Corpus: ", c) 192 | print("Label depth ", depth) 193 | print("# of Gibbs samples: ", 
int(opt.it)) 194 | print("-----------------------------------") 195 | 196 | lab_level = [len(x) == depth for x in model.labelmap.keys()] 197 | inds = np.where(lab_level)[0] 198 | 199 | y_bin = binary_yreal(test[1], model.labelmap) 200 | th_hat = setup_theta(l1, l2, l3, model) 201 | 202 | # Selecting the relevant labels 203 | y_bin = y_bin[:, inds] 204 | th_hat = th_hat[:, inds] 205 | 206 | # Remove no-prediction and no-label documents 207 | doc_id1 = np.where(th_hat.sum(axis=1) != 0)[0] 208 | doc_id2 = np.where(y_bin.sum(axis=1) != 0)[0] 209 | valid = np.intersect1d(doc_id1, doc_id2) 210 | 211 | y_bin = y_bin[valid, :] 212 | th_hat = th_hat[valid, :] 213 | 214 | tps, tns, fps, fns, fprs, tprs = rates(th_hat, y_bin) 215 | 216 | one_err = n_error(th_hat, y_bin, 1) 217 | two_err = n_error(th_hat, y_bin, 2) 218 | auc_roc = macro_auc_roc(fprs, tprs) 219 | f1_macro = get_f1(tps, fps, tns, fns) 220 | 221 | print("AUC ROC: ", auc_roc) 222 | print("one error: ", one_err) 223 | print("two error: ", two_err) 224 | print("F1 score (macro average) ", f1_macro) 225 | 226 | 227 | if __name__ == "__main__": 228 | main() 229 | -------------------------------------------------------------------------------- /evaluate_LabeledLDA.py: -------------------------------------------------------------------------------- 1 | from LabeledLDA import * 2 | from sklearn.metrics import auc 3 | from optparse import OptionParser 4 | import pickle 5 | import numpy as np 6 | 7 | 8 | def one_roc(prob, real_binary): 9 | resorted = np.argsort(prob)[::-1] 10 | 11 | reals = real_binary[resorted] 12 | probs = prob[resorted] 13 | thresholds = np.sort(list(set(probs)))[::-1] 14 | 15 | tp = [] 16 | tn = [] 17 | fp = [] 18 | fn = [] 19 | for c in thresholds: 20 | preds = [1 if x >= c else 0 for x in probs] 21 | zipped = list(zip(preds, reals)) 22 | 23 | tp_pre = sum([x == y for (x, y) in zipped if x == 1]) 24 | tn_pre = sum([x == y for (x, y) in zipped if x == 0]) 25 | fp_pre = sum([x != y for (x, y) in zipped if x == 1]) 26 | fn_pre = sum([x != y for (x, y) in zipped if x == 0]) 27 | 28 | tp.append(tp_pre) 29 | tn.append(tn_pre) 30 | fp.append(fp_pre) 31 | fn.append(fn_pre) 32 | return tp, tn, fp, fn 33 | 34 | 35 | def fpr_tpr(tp, fp, tn, fn): 36 | fpr = [x / (x + y) for (x, y) in zip(fp, tn)] 37 | tpr = [x / (x + y) for (x, y) in zip(tp, fn)] 38 | return fpr, tpr 39 | 40 | 41 | def precision_recall(tp, fp, tn, fn): 42 | precis = [x / (x + y) for (x, y) in zip(tp, fp)] 43 | recall = [x / (x + y) for (x, y) in zip(tp, fn)] 44 | return precis, recall 45 | 46 | 47 | def rates(y_prob, y_real_binary): 48 | tps = [] 49 | tns = [] 50 | fps = [] 51 | fns = [] 52 | fprs = [] 53 | tprs = [] 54 | for d_prob, d_real in zip(y_prob, y_real_binary): 55 | tp, tn, fp, fn = one_roc(d_prob, d_real) 56 | fpr, tpr = fpr_tpr(tp, fp, tn, fn) 57 | 58 | tps.append(tp) 59 | tns.append(tn) 60 | fps.append(fp) 61 | fns.append(fn) 62 | fprs.append(fpr) 63 | tprs.append(tpr) 64 | return tps, tns, fps, fns, fprs, tprs 65 | 66 | 67 | def macro_auc_roc(fprs, tprs): 68 | areas_under_curve = [auc(fpr, tpr) for (fpr, tpr) in zip(fprs, tprs)] 69 | return np.mean(areas_under_curve) 70 | 71 | 72 | def n_error(th_hat, y_real_binary, n): 73 | ndocs = th_hat.shape[0] 74 | counter = 0 75 | for i in range(ndocs): 76 | ordered = np.argsort(th_hat[i, :])[::-1] 77 | toplabs = ordered[:n] 78 | sub_y = y_real_binary[i, :] 79 | hit = sum(sub_y[toplabs]) > 0 80 | if hit: 81 | counter += 1 82 | return counter / ndocs 83 | 84 | 85 | def get_f1(tps, fps, tns, fns): 86 | f1 = [] 87 | for 
tp, fp, tn, fn in zip(tps, fps, tns, fns): 88 | prec, rec = precision_recall(tp, fp, tn, fn) 89 | with np.errstate(invalid='ignore'): 90 | raw_f1 = [(2 * p * r)/(p + r) for p, r in zip(prec, rec)] 91 | opt_f1 = np.nanmax(raw_f1) 92 | f1.append(opt_f1) 93 | return np.mean(f1) 94 | 95 | 96 | def binary_yreal(label_strings, label_dict): 97 | ndoc = len(label_strings) 98 | ntop = len(label_dict) 99 | y_true = np.zeros((ndoc, ntop), dtype=int) 100 | for d, lab in enumerate(label_strings): 101 | for l in lab: 102 | try: 103 | ind = label_dict[l] 104 | y_true[d, ind] = 1 105 | except KeyError: 106 | pass 107 | return y_true 108 | 109 | 110 | def main(): 111 | parser = OptionParser() 112 | parser.add_option("-f", dest="file", help="dataset location") 113 | parser.add_option("-d", dest="lvl", type="int", default=3, 114 | help="depth of lab level") 115 | parser.add_option("-i", dest="it", type="int", help="# of iterations") 116 | parser.add_option("-s", dest="thinning", type="int", default=0, 117 | help="save frequency") 118 | parser.add_option("-l", dest="lower", type="float", default=0, 119 | help="lower threshold for dictionary pruning") 120 | parser.add_option("-u", dest="upper", type="float", default=1, 121 | help="upper threshold for dictionary pruning") 122 | parser.add_option("-a", dest="alpha", type="float", default=0.1, 123 | help="alpha prior") 124 | parser.add_option("-b", dest="beta", type="float", default=0.01, 125 | help="beta prior") 126 | parser.add_option("-p", action="store_true", dest="pickle", default=False, 127 | help="Save the model as pickle?") 128 | 129 | (opt, arg) = parser.parse_args() 130 | if opt.thinning == 0: 131 | opt.thinning = opt.it 132 | 133 | train, test = split_data(f=opt.file, d=opt.lvl) 134 | 135 | print("Starting training...") 136 | model = train_it(train, it=opt.it, s=opt.thinning, 137 | al=opt.alpha, be=opt.beta, l=opt.lower, u=opt.upper) 138 | 139 | print("Testing test data, this may take a while...") 140 | th, _ = test_it(model, test, it=opt.it, thinning=opt.thinning) 141 | th = np.array(th) 142 | if opt.pickle: 143 | pickle.dump(model, open("LabeledLDA_model.pkl", "wb")) 144 | pickle.dump(test, open("LabeledLDA_testset.pkl", "wb")) 145 | pickle.dump(th, open("LabeledLDA_theta.pkl", "wb")) 146 | 147 | c = "Full Texts" 148 | if opt.file == "thesis_data3.csv": 149 | c = "Abstracts" 150 | 151 | print("Model: Labeled LDA") 152 | print("Corpus: ", c) 153 | print("Label depth ", opt.lvl) 154 | print("# of Gibbs samples: ", int(opt.it)) 155 | print("-----------------------------------") 156 | 157 | y_bin = binary_yreal(test[1], model.labelmap) 158 | 159 | # Remove root label from predictions (also not included in label sets) 160 | y_bin = y_bin[:, 1:] 161 | th = th[:, 1:] 162 | 163 | # Remove docs that were assigned to 'root' completely: 164 | nonzero_load = [x != 0 for x in th.sum(axis=1)] 165 | nonzero_load = np.where(nonzero_load)[0] 166 | y_bin = y_bin[nonzero_load, :] 167 | th = th[nonzero_load, :] 168 | 169 | tps, tns, fps, fns, fprs, tprs = rates(th, y_bin) 170 | 171 | one_err = n_error(th, y_bin, 1) 172 | two_err = n_error(th, y_bin, 2) 173 | auc_roc = macro_auc_roc(fprs, tprs) 174 | f1_macro = get_f1(tps, fps, tns, fns) 175 | 176 | print("AUC ROC: ", auc_roc) 177 | print("one error: ", one_err) 178 | print("two error: ", two_err) 179 | print("F1 score (macro average) ", f1_macro) 180 | 181 | 182 | if __name__ == "__main__": 183 | main() 184 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | boto==2.48.0 2 | bz2file==0.98 3 | certifi==2017.7.27.1 4 | chardet==3.0.4 5 | gensim==2.3.0 6 | idna==2.6 7 | nltk==3.6.6 8 | numpy==1.22.0 9 | regex==2017.7.28 10 | requests==2.20.0 11 | scipy==0.19.1 12 | six==1.10.0 13 | smart-open==1.5.3 14 | stop-words==2015.2.23.1 15 | urllib3==1.26.5 16 | scikit-learn -------------------------------------------------------------------------------- /thesis_kenhbs.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/KenHBS/LDA_thesis/9128551af929f6f2692e8edd79de72dec5c5f7ce/thesis_kenhbs.pdf --------------------------------------------------------------------------------