├── stoplists
│   └── en.txt
├── setup.py
├── README.md
├── LICENSE
├── lda.py
├── topicmodel.pyx
└── lda_reference.py

/stoplists/en.txt:
--------------------------------------------------------------------------------
1 | the
2 | and
3 | of
4 | in
5 | on
6 | for
7 | by
8 | to
9 | a
10 | an
11 | is
12 | are
13 | this
14 | that
15 | than
16 | from
17 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | import numpy
4 | 
5 | setup(
6 |     name = 'LDA sampler',
7 |     ext_modules = cythonize("*.pyx"),
8 |     include_dirs=[numpy.get_include()]
9 | )
10 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyMallet
2 | 
3 | This package provides tools for extracting latent semantic representations of text, particularly probabilistic topic models.
4 | 
5 | The implementation of LDA uses Gibbs sampling, which is simple but reliable. People often find the resulting models more useful than those produced by the stochastic variational inference algorithm used in Gensim.
6 | 
7 | To compile:
8 | 
9 |     python setup.py build_ext --inplace
10 | 
11 | As an example, the `sample_data` directory contains 10,000 posts from the stats Stack Exchange forum.
12 | 
13 | To run on this sample collection with 50 topics:
14 | 
15 |     python lda.py sample_data/stats_10k.txt 50
16 | 
17 | The script `lda_reference.py` contains a reference implementation in pure Python (no Cython) for speed comparison. The Cython version is currently about 100x faster.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 mimno
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/lda.py:
--------------------------------------------------------------------------------
1 | import re, sys, random, math
2 | import numpy as np
3 | from collections import Counter
4 | import topicmodel
5 | from timeit import default_timer as timer
6 | 
7 | import pstats, cProfile
8 | import pyximport
9 | pyximport.install()
10 | 
11 | word_pattern = re.compile(r"\w[\w\-']*\w|\w")
12 | 
13 | if len(sys.argv) != 3:
14 |     print("Usage: lda.py [docs file] [num topics]")
15 |     sys.exit()
16 | 
17 | num_topics = int(sys.argv[2])
18 | doc_smoothing = 0.5
19 | word_smoothing = 0.01
20 | 
21 | stoplist = set()
22 | with open("stoplists/en.txt", encoding="utf-8") as stop_reader:
23 |     for line in stop_reader:
24 |         line = line.rstrip()
25 |         stoplist.add(line)
26 | 
27 | word_counts = Counter()
28 | 
29 | documents = []
30 | word_topics = {}
31 | topic_totals = np.zeros(num_topics, dtype=int)
32 | 
33 | for line in open(sys.argv[1], encoding="utf-8"):
34 |     #line = line.lower()
35 | 
36 |     tokens = word_pattern.findall(line)
37 | 
38 |     ## remove stopwords, short words, and upper-cased words
39 |     tokens = [w for w in tokens if not w in stoplist and len(w) >= 3 and not w[0].isupper()]
40 |     word_counts.update(tokens)
41 | 
42 |     doc_topic_counts = np.zeros(num_topics, dtype=int)
43 | 
44 |     documents.append({ "original": line, "token_strings": tokens, "topic_counts": doc_topic_counts })
45 | 
46 | ## Now that we're done reading from disk, we can count the total
47 | ## number of words.
48 | 
49 | vocabulary = list(word_counts.keys())
50 | vocabulary_size = len(vocabulary)
51 | word_ids = { w: i for (i, w) in enumerate(vocabulary) }
52 | smoothing_times_vocab_size = word_smoothing * vocabulary_size
53 | 
54 | word_topics = np.zeros((len(vocabulary), num_topics), dtype=int)
55 | 
56 | for document in documents:
57 |     tokens = document["token_strings"]
58 |     doc_topic_counts = document["topic_counts"]
59 | 
60 |     doc_tokens = np.ndarray(len(tokens), dtype=int)
61 |     doc_topics = np.ndarray(len(tokens), dtype=int)
62 |     topic_changes = np.zeros(len(tokens), dtype=int)
63 | 
64 |     for i, w in enumerate(tokens):
65 |         word_id = word_ids[w]
66 |         topic = random.randrange(num_topics)
67 | 
68 |         doc_tokens[i] = word_id
69 |         doc_topics[i] = topic
70 | 
71 |         ## Update counts:
72 |         word_topics[word_id][topic] += 1
73 |         topic_totals[topic] += 1
74 |         doc_topic_counts[topic] += 1
75 | 
76 |     document["doc_tokens"] = doc_tokens
77 |     document["doc_topics"] = doc_topics
78 |     document["topic_changes"] = topic_changes
79 | 
80 | sampling_dist = np.zeros(num_topics, dtype=float)
81 | topic_normalizers = np.zeros(num_topics, dtype=float)
82 | for topic in range(num_topics):
83 |     topic_normalizers[topic] = 1.0 / (topic_totals[topic] + smoothing_times_vocab_size)
84 | 
85 | def profile():
86 | 
87 |     model = topicmodel.TopicModel(num_topics, vocabulary, doc_smoothing, word_smoothing)
88 |     document = documents[0]
89 | 
90 |     for document in documents:
91 |         c_doc = topicmodel.Document(document["doc_tokens"], document["doc_topics"], document["topic_changes"], document["topic_counts"])
92 |         model.add_document(c_doc)
93 | 
94 |     #model.sample(10)
95 | 
96 |     #cProfile.runctx("topicmodel.sample_doc(doc_tokens, doc_topics, topic_changes, doc_topic_counts, word_topics, topic_totals, sampling_dist, topic_normalizers, doc_smoothing, word_smoothing, smoothing_times_vocab_size, num_topics)", globals(), locals(), "topics.prof")
97 |     cProfile.runctx("model.sample(10)", globals(), locals(), 
"topics.prof") 98 | 99 | stats = pstats.Stats("topics.prof") 100 | stats.strip_dirs().sort_stats("time").print_stats() 101 | 102 | 103 | 104 | def sample(num_iterations): 105 | start = timer() 106 | 107 | for iteration in range(num_iterations): 108 | 109 | for document in documents: 110 | 111 | doc_topic_counts = document["topic_counts"] 112 | doc_tokens = document["doc_tokens"] 113 | doc_topics = document["doc_topics"] 114 | topic_changes = document["topic_changes"] 115 | 116 | # Pass the document to the fast C code 117 | topicmodel.sample_doc(doc_tokens, doc_topics, topic_changes, doc_topic_counts, word_topics, topic_totals, sampling_dist, topic_normalizers, doc_smoothing, word_smoothing, smoothing_times_vocab_size, num_topics) 118 | 119 | if iteration % 10 == 0: 120 | end = timer() 121 | print(end - start) 122 | start = timer() 123 | 124 | def entropy(p): 125 | ## make sure the vector is a valid probability distribution 126 | p = p / np.sum(p) 127 | 128 | result = 0.0 129 | for x in p: 130 | if x > 0.0: 131 | result += -x * math.log2(x) 132 | 133 | return result 134 | 135 | def print_topic(topic): 136 | sorted_words = sorted(zip(word_topics[:,topic], vocabulary), reverse=True) 137 | 138 | for i in range(20): 139 | w = sorted_words[i] 140 | print("{}\t{}".format(w[0], w[1])) 141 | 142 | def print_all_topics(): 143 | for topic in range(num_topics): 144 | sorted_words = sorted(zip(word_topics[:,topic], vocabulary), reverse=True) 145 | print(" ".join([w for x, w in sorted_words[:20]])) 146 | 147 | def write_state(writer): 148 | writer.write("Doc\tWordID\tWord\tTopic\tCounts\tChanges\n") 149 | 150 | for doc, document in enumerate(documents): 151 | doc_tokens = document["doc_tokens"] 152 | doc_topics = document["doc_topics"] 153 | topic_changes = document["topic_changes"] 154 | 155 | doc_length = len(doc_tokens) 156 | 157 | for i in range(doc_length): 158 | word_id = doc_tokens[i] 159 | word = vocabulary[word_id] 160 | topic = doc_topics[i] 161 | changes = topic_changes[i] 162 | 163 | writer.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(doc, word_id, word, topic, word_counts[word], changes)) 164 | 165 | 166 | #profile() 167 | 168 | model = topicmodel.TopicModel(num_topics, vocabulary, doc_smoothing, word_smoothing) 169 | document = documents[0] 170 | 171 | for document in documents: 172 | c_doc = topicmodel.Document(document["doc_tokens"], document["doc_topics"], document["topic_changes"], document["topic_counts"]) 173 | model.add_document(c_doc) 174 | 175 | for i in range(20): 176 | start = timer() 177 | model.sample(50) 178 | print(timer() - start) 179 | model.print_all_topics() 180 | 181 | 182 | #sample(1000) 183 | #topicmodel.sample(10, documents, word_topics, topic_totals, doc_smoothing, word_smoothing, smoothing_times_vocab_size, num_topics) 184 | #print_all_topics() 185 | #with open("state.txt", "w") as writer: 186 | # write_state(writer) 187 | -------------------------------------------------------------------------------- /topicmodel.pyx: -------------------------------------------------------------------------------- 1 | # cython: linetrace=True 2 | 3 | from cython.view cimport array as cvarray 4 | import numpy as np 5 | import random 6 | from timeit import default_timer as timer 7 | 8 | class Document: 9 | 10 | def __init__(self, long[:] doc_tokens, long[:] doc_topics, long[:] topic_changes, long[:] doc_topic_counts): 11 | self.doc_tokens = doc_tokens 12 | self.doc_topics = doc_topics 13 | self.topic_changes = topic_changes 14 | self.doc_topic_counts = doc_topic_counts 15 | 16 | cdef 
class TopicModel: 17 | 18 | cdef long[:] topic_totals 19 | cdef long[:,:] word_topics 20 | cdef int num_topics 21 | cdef int vocab_size 22 | 23 | cdef double[:] topic_probs 24 | cdef double[:] topic_normalizers 25 | cdef float doc_smoothing 26 | cdef float word_smoothing 27 | cdef float smoothing_times_vocab_size 28 | 29 | documents = [] 30 | vocabulary = [] 31 | 32 | def __init__(self, num_topics, vocabulary, doc_smoothing, word_smoothing): 33 | self.num_topics = num_topics 34 | self.vocabulary.extend(vocabulary) 35 | self.vocab_size = len(vocabulary) 36 | 37 | self.doc_smoothing = doc_smoothing 38 | self.word_smoothing = word_smoothing 39 | self.smoothing_times_vocab_size = word_smoothing * self.vocab_size 40 | 41 | self.topic_totals = np.zeros(num_topics, dtype=int) 42 | self.word_topics = np.zeros((self.vocab_size, num_topics), dtype=int) 43 | 44 | def add_document(self, doc): 45 | cdef int word_id, topic 46 | 47 | self.documents.append(doc) 48 | 49 | for i in range(len(doc.doc_tokens)): 50 | word_id = doc.doc_tokens[i] 51 | topic = doc.doc_topics[i] 52 | 53 | self.word_topics[word_id,topic] += 1 54 | self.topic_totals[topic] += 1 55 | doc.doc_topic_counts[topic] += 1 56 | 57 | def sample(self, iterations): 58 | cdef int old_topic, new_topic, word_id, topic, i, doc_length 59 | cdef double sampling_sum = 0 60 | cdef double sample 61 | cdef long[:] word_topic_counts 62 | 63 | cdef long[:] doc_tokens 64 | cdef long[:] doc_topics 65 | cdef long[:] doc_topic_counts 66 | cdef long[:] topic_changes 67 | 68 | cdef double[:] uniform_variates 69 | cdef double[:] topic_probs = np.zeros(self.num_topics, dtype=float) 70 | cdef double[:] topic_normalizers = np.zeros(self.num_topics, dtype=float) 71 | 72 | for topic in range(self.num_topics): 73 | topic_normalizers[topic] = 1.0 / (self.topic_totals[topic] + self.smoothing_times_vocab_size) 74 | 75 | 76 | for iteration in range(iterations): 77 | for document in self.documents: 78 | doc_tokens = document.doc_tokens 79 | doc_topics = document.doc_topics 80 | doc_topic_counts = document.doc_topic_counts 81 | topic_changes = document.topic_changes 82 | 83 | doc_length = len(document.doc_tokens) 84 | uniform_variates = np.random.random_sample(doc_length) 85 | 86 | for i in range(doc_length): 87 | word_id = doc_tokens[i] 88 | old_topic = doc_topics[i] 89 | word_topic_counts = self.word_topics[word_id,:] 90 | 91 | ## erase the effect of this token 92 | word_topic_counts[old_topic] -= 1 93 | self.topic_totals[old_topic] -= 1 94 | doc_topic_counts[old_topic] -= 1 95 | 96 | topic_normalizers[old_topic] = 1.0 / (self.topic_totals[old_topic] + self.smoothing_times_vocab_size) 97 | 98 | ### 99 | ### SAMPLING DISTRIBUTION 100 | ### 101 | 102 | sampling_sum = 0.0 103 | for topic in range(self.num_topics): 104 | topic_probs[topic] = (doc_topic_counts[topic] + self.doc_smoothing) * (word_topic_counts[topic] + self.word_smoothing) * topic_normalizers[topic] 105 | sampling_sum += topic_probs[topic] 106 | 107 | #sample = random.uniform(0, sampling_sum) 108 | #sample = np.random.random_sample() * sampling_sum 109 | sample = uniform_variates[i] * sampling_sum 110 | 111 | new_topic = 0 112 | while sample > topic_probs[new_topic]: 113 | sample -= topic_probs[new_topic] 114 | new_topic += 1 115 | 116 | ## add back in the effect of this token 117 | word_topic_counts[new_topic] += 1 118 | self.topic_totals[new_topic] += 1 119 | doc_topic_counts[new_topic] += 1 120 | topic_normalizers[new_topic] = 1.0 / (self.topic_totals[new_topic] + self.smoothing_times_vocab_size) 121 | 122 | 
doc_topics[i] = new_topic
123 | 
124 |                     if new_topic != old_topic:
125 |                         #pass
126 |                         topic_changes[i] += 1
127 | 
128 |     def print_topic(self, int topic):
129 |         sorted_words = sorted(zip(self.word_topics[:,topic], self.vocabulary), reverse=True)
130 | 
131 |         for i in range(20):
132 |             w = sorted_words[i]
133 |             print("{}\t{}".format(w[0], w[1]))
134 | 
135 |     def print_all_topics(self):
136 |         for topic in range(self.num_topics):
137 |             sorted_words = sorted(zip(self.word_topics[:,topic], self.vocabulary), reverse=True)
138 |             print(" ".join([w for x, w in sorted_words[:20]]))
139 | 
140 | 
141 | 
142 | def sample_doc(long[:] doc_tokens, long[:] doc_topics, long[:] topic_changes, long[:] doc_topic_counts, long[:,:] word_topics, long[:] topic_totals, double[:] topic_probs, double[:] topic_normalizers, float doc_smoothing, float word_smoothing, float smoothing_times_vocab_size, int num_topics):
143 | 
144 |     cdef int old_topic, new_topic, word_id, topic, i
145 |     cdef double sampling_sum = 0
146 |     cdef double sample
147 |     cdef long[:] word_topic_counts
148 | 
149 |     cdef int doc_length = len(doc_tokens)
150 |     cdef double[:] uniform_variates = np.random.random_sample(doc_length)
151 | 
152 |     for i in range(doc_length):
153 |         word_id = doc_tokens[i]
154 |         old_topic = doc_topics[i]
155 |         word_topic_counts = word_topics[word_id,:]
156 | 
157 |         ## erase the effect of this token
158 |         word_topic_counts[old_topic] -= 1
159 |         topic_totals[old_topic] -= 1
160 |         doc_topic_counts[old_topic] -= 1
161 | 
162 |         topic_normalizers[old_topic] = 1.0 / (topic_totals[old_topic] + smoothing_times_vocab_size)
163 | 
164 |         ###
165 |         ### SAMPLING DISTRIBUTION
166 |         ###
167 | 
168 |         sampling_sum = 0.0
169 |         for topic in range(num_topics):
170 |             topic_probs[topic] = (doc_topic_counts[topic] + doc_smoothing) * (word_topic_counts[topic] + word_smoothing) * topic_normalizers[topic]
171 |             sampling_sum += topic_probs[topic]
172 | 
173 |         #sample = random.uniform(0, sampling_sum)
174 |         #sample = np.random.random_sample() * sampling_sum
175 |         sample = uniform_variates[i] * sampling_sum
176 | 
177 |         new_topic = 0
178 |         while sample > topic_probs[new_topic]:
179 |             sample -= topic_probs[new_topic]
180 |             new_topic += 1
181 | 
182 |         ## add back in the effect of this token
183 |         word_topic_counts[new_topic] += 1
184 |         topic_totals[new_topic] += 1
185 |         doc_topic_counts[new_topic] += 1
186 |         topic_normalizers[new_topic] = 1.0 / (topic_totals[new_topic] + smoothing_times_vocab_size)
187 | 
188 |         doc_topics[i] = new_topic
189 | 
190 |         if new_topic != old_topic:
191 |             #pass
192 |             topic_changes[i] += 1
193 | 
--------------------------------------------------------------------------------
/lda_reference.py:
--------------------------------------------------------------------------------
1 | """
2 | 
3 | Topic models look for groups of words that occur frequently together. We can often recognize these clusters as specific themes that appear in the collection -- thus the term "topic" model.
4 | 
5 | Our example corpus today is a collection of Viking sagas. Start Python like this:
6 | 
7 |     python -i lda_reference.py sagas_en.txt 20
8 | 
9 | We will work at the Python prompt ">>>".
10 | 
11 | Today we'll be working with the simplest and most reliable topic model algorithm, Gibbs sampling.
12 | Gibbs sampling is a way to take a very complicated optimization problem and break it into little problems that are individually easy.
13 | 
14 | First, we need to have a way of describing probability distributions.
15 | A discrete distribution is a vector of numbers that are >= 0.0 and sum to 1.0. 16 | One function is called *entropy*. Entropy takes a distribution and returns a number. 17 | 18 | 1. Run `entropy(np.array([0.7, 0.1, 0.2]))`. What is the value? 19 | 20 | [Response here] 21 | 22 | 2. Run `entropy(np.array([7, 1, 2]))`. Does the value change? Why or why not? 23 | 24 | [Response here] 25 | 26 | 3. Try different (non-negative) values of the three numbers. What is the largest value you can get, and what is the smallest? 27 | 28 | [Response here] 29 | 30 | 4. Now try different (non-negative) values of *four* numbers. Can you get a larger or smaller entropy than with three? 31 | 32 | [Response here] 33 | 34 | 5. Describe in your own words what entropy is measuring. 35 | 36 | [Response here] 37 | 38 | The Gibbs sampling algorithm proceeds in multiple iterations. In each iteration, 39 | we look at all the word tokens in all the documents, one after another. 40 | For each word, we erase its current topic assignment and sample a new topic 41 | assignment given all the other word tokens' topic assignments. 42 | 43 | Now look at the lines below the "SAMPLING DISTRIBUTION" comment. These define two vectors: 44 | * The probability of each topic in the current document 45 | * The probability of the current word in each topic 46 | 47 | We'll look at a particular dramatic moment in Njal's saga. Define these variables: 48 | 49 | document = documents[1160] 50 | doc_topic_counts = document["topic_counts"] 51 | word = "sword" 52 | word_topic_counts = word_topics[word] 53 | 54 | Use this command to suppress scientific notation: 55 | 56 | np.set_printoptions(suppress=True) 57 | 58 | 6. Calculate the entropy of `doc_topic_counts` 59 | 60 | 7. Calculate the entropy of `(doc_topic_counts + doc_smoothing)`. Should this be larger or smaller than the previous value? 61 | 62 | 8. Calculate the entropy of `(word_topic_counts + word_smoothing) / (topic_totals + smoothing_times_vocab_size)` 63 | 64 | 9. Calculate the entropy of `(doc_topic_counts + doc_smoothing) * (word_topic_counts + word_smoothing) / (topic_totals + smoothing_times_vocab_size)` 65 | 66 | These values are random initializations. Let's run the algorithm 67 | over the documents a few times and see what happens. Run: 68 | 69 | sample(25) 70 | 71 | Use `print_all_topics()` to get a view of the current state of the topics. 72 | 73 | 10. This function prints the number of tokens in each topic for the sample doc. Describe how (if at all) they change. 74 | 75 | 11. Recalculate the four entropies we calculated above for the sampling distribution. How are they different? 76 | 77 | 12. What is the value of `word_smoothing`? Previously we added 1.0 in this situation. Why are we using a different value now? Use the concept of entropy in your answer. 78 | 79 | [Response here] 80 | 81 | 13. What are Norse sagas about, from the perspective of the model? 82 | 83 | [Response here] 84 | 85 | 14. I'm removing a list of frequent words, words that are too short, and 86 | words whose first letter is capitalized. Why does removing capitalized words 87 | help? What happens if you remove that check? Is this a good idea? 
88 | 
89 | [Response here]
90 | 
91 | """
92 | 
93 | import re, sys, random, math
94 | import numpy as np
95 | from collections import Counter
96 | from timeit import default_timer as timer
97 | 
98 | word_pattern = re.compile(r"\w[\w\-']*\w|\w")
99 | 
100 | if len(sys.argv) != 3:
101 |     print("Usage: lda_reference.py [docs file] [num topics]")
102 |     sys.exit()
103 | 
104 | num_topics = int(sys.argv[2])
105 | doc_smoothing = 0.5
106 | word_smoothing = 0.01
107 | 
108 | stoplist = set()
109 | with open("stoplists/en.txt", encoding="utf-8") as stop_reader:
110 |     for line in stop_reader:
111 |         line = line.rstrip()
112 |         stoplist.add(line)
113 | 
114 | word_counts = Counter()
115 | 
116 | documents = []
117 | word_topics = {}
118 | topic_totals = np.zeros(num_topics)
119 | 
120 | for line in open(sys.argv[1], encoding="utf-8"):
121 |     #line = line.lower()
122 | 
123 |     tokens = word_pattern.findall(line)
124 | 
125 |     ## remove stopwords, short words, and upper-cased words
126 |     tokens = [w for w in tokens if not w in stoplist and len(w) >= 3 and not w[0].isupper()]
127 |     word_counts.update(tokens)
128 | 
129 |     doc_topic_counts = np.zeros(num_topics)
130 |     token_topics = []
131 | 
132 |     for w in tokens:
133 | 
134 |         ## Generate a topic randomly
135 |         topic = random.randrange(num_topics)
136 |         token_topics.append({ "word": w, "topic": topic })
137 | 
138 |         ## If we haven't seen this word before, initialize it
139 |         if not w in word_topics:
140 |             word_topics[w] = np.zeros(num_topics)
141 | 
142 |         ## Update counts:
143 |         word_topics[w][topic] += 1
144 |         topic_totals[topic] += 1
145 |         doc_topic_counts[topic] += 1
146 | 
147 |     documents.append({ "original": line, "token_topics": token_topics, "topic_counts": doc_topic_counts })
148 | 
149 | ## Now that we're done reading from disk, we can count the total
150 | ## number of words.
151 | vocabulary = list(word_counts.keys())
152 | vocabulary_size = len(vocabulary)
153 | 
154 | smoothing_times_vocab_size = word_smoothing * vocabulary_size
155 | 
156 | def sample(num_iterations):
157 |     for iteration in range(num_iterations):
158 | 
159 |         start = timer()
160 | 
161 |         for document in documents:
162 | 
163 |             doc_topic_counts = document["topic_counts"]
164 |             token_topics = document["token_topics"]
165 |             doc_length = len(token_topics)
166 |             for token_topic in token_topics:
167 | 
168 |                 w = token_topic["word"]
169 |                 old_topic = token_topic["topic"]
170 |                 word_topic_counts = word_topics[w]
171 | 
172 |                 ## erase the effect of this token
173 |                 word_topic_counts[old_topic] -= 1
174 |                 topic_totals[old_topic] -= 1
175 |                 doc_topic_counts[old_topic] -= 1
176 | 
177 |                 ###
178 |                 ### SAMPLING DISTRIBUTION
179 |                 ###
180 | 
181 |                 ## Does this topic occur often in the document?
182 |                 topic_probs = (doc_topic_counts + doc_smoothing) / (doc_length + num_topics * doc_smoothing)
183 |                 ## Does this word occur often in the topic?
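                ## Together with the document factor above, the next line completes
                ## the unnormalized Gibbs sampling weight for each topic:
                ##   (doc_topic_counts + doc_smoothing)
                ##     * (word_topic_counts + word_smoothing)
                ##     / (topic_totals + smoothing_times_vocab_size)
                ## (The 1 / (doc_length + num_topics * doc_smoothing) term is the same
                ## for every topic, so it does not affect which topic gets sampled.)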
184 | topic_probs *= (word_topic_counts + word_smoothing) / (topic_totals + smoothing_times_vocab_size) 185 | 186 | ## sample from an array that doesn't sum to 1.0 187 | sample = random.uniform(0, np.sum(topic_probs)) 188 | 189 | new_topic = 0 190 | while sample > topic_probs[new_topic]: 191 | sample -= topic_probs[new_topic] 192 | new_topic += 1 193 | 194 | ## add back in the effect of this token 195 | word_topic_counts[new_topic] += 1 196 | topic_totals[new_topic] += 1 197 | doc_topic_counts[new_topic] += 1 198 | 199 | token_topic["topic"] = new_topic 200 | end = timer() 201 | print(end - start) 202 | 203 | 204 | def entropy(p): 205 | ## make sure the vector is a valid probability distribution 206 | p = p / np.sum(p) 207 | 208 | result = 0.0 209 | for x in p: 210 | if x > 0.0: 211 | result += -x * math.log2(x) 212 | 213 | return result 214 | 215 | def print_topic(topic): 216 | sorted_words = sorted(vocabulary, key=lambda w: word_topics[w][topic], reverse=True) 217 | 218 | for i in range(20): 219 | w = sorted_words[i] 220 | print("{}\t{}".format(word_topics[w][topic], w)) 221 | 222 | def print_all_topics(): 223 | for topic in range(num_topics): 224 | sorted_words = sorted(vocabulary, key=lambda w: word_topics[w][topic], reverse=True) 225 | print(" ".join(sorted_words[:20])) 226 | 227 | sample(100) --------------------------------------------------------------------------------