├── stoplists
│   └── en.txt
├── setup.py
├── README.md
├── LICENSE
├── lda.py
├── topicmodel.pyx
└── lda_reference.py

/stoplists/en.txt:
--------------------------------------------------------------------------------
1 | the
2 | and
3 | of
4 | in
5 | on
6 | for
7 | by
8 | to
9 | a
10 | an
11 | is
12 | are
13 | this
14 | that
15 | than
16 | from
17 | 
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from distutils.core import setup
2 | from Cython.Build import cythonize
3 | import numpy
4 | 
5 | setup(
6 |     name = 'LDA sampler',
7 |     ext_modules = cythonize("*.pyx"),
8 |     include_dirs=[numpy.get_include()]
9 | )
10 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # PyMallet
2 | 
3 | This package provides tools for extracting latent semantic representations of text, particularly probabilistic topic models.
4 | 
5 | The implementation of LDA uses Gibbs sampling, which is simple but reliable. People often find the resulting models more useful than those produced by the stochastic variational inference algorithm used in Gensim.
6 | 
7 | To compile:
8 | 
9 |     python setup.py build_ext --inplace
10 | 
11 | As an example, the `sample_data` directory contains 10,000 posts from the stats Stack Exchange forum.
12 | 
13 | To run on this sample collection with 50 topics:
14 | 
15 |     python lda.py sample_data/stats_10k.txt 50
16 | 
17 | The script `lda_reference.py` contains a reference implementation in pure Python (no Cython) for speed comparison. The Cython version is currently about 100x faster.
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2019 mimno
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
--------------------------------------------------------------------------------
/lda.py:
--------------------------------------------------------------------------------
1 | import re, sys, random, math
2 | import numpy as np
3 | from collections import Counter
4 | import topicmodel
5 | from timeit import default_timer as timer
6 | 
7 | import pstats, cProfile
8 | import pyximport
9 | pyximport.install()
10 | 
11 | word_pattern = re.compile(r"\w[\w\-']*\w|\w")
12 | 
13 | if len(sys.argv) != 3:
14 |     print("Usage: lda.py [docs file] [num topics]")
15 |     sys.exit()
16 | 
17 | num_topics = int(sys.argv[2])
18 | doc_smoothing = 0.5
19 | word_smoothing = 0.01
20 | 
21 | stoplist = set()
22 | with open("stoplists/en.txt", encoding="utf-8") as stop_reader:
23 |     for line in stop_reader:
24 |         line = line.rstrip()
25 |         stoplist.add(line)
26 | 
27 | word_counts = Counter()
28 | 
29 | documents = []
30 | word_topics = {}
31 | topic_totals = np.zeros(num_topics, dtype=int)
32 | 
33 | for line in open(sys.argv[1], encoding="utf-8"):
34 |     #line = line.lower()
35 | 
36 |     tokens = word_pattern.findall(line)
37 | 
38 |     ## remove stopwords, short words, and upper-cased words
39 |     tokens = [w for w in tokens if not w in stoplist and len(w) >= 3 and not w[0].isupper()]
40 |     word_counts.update(tokens)
41 | 
42 |     doc_topic_counts = np.zeros(num_topics, dtype=int)
43 | 
44 |     documents.append({ "original": line, "token_strings": tokens, "topic_counts": doc_topic_counts })
45 | 
46 | ## Now that we're done reading from disk, we can count the total
47 | ## number of words.
48 | 
49 | vocabulary = list(word_counts.keys())
50 | vocabulary_size = len(vocabulary)
51 | word_ids = { w: i for (i, w) in enumerate(vocabulary) }
52 | smoothing_times_vocab_size = word_smoothing * vocabulary_size
53 | 
54 | word_topics = np.zeros((len(vocabulary), num_topics), dtype=int)
55 | 
56 | for document in documents:
57 |     tokens = document["token_strings"]
58 |     doc_topic_counts = document["topic_counts"]
59 | 
60 |     doc_tokens = np.ndarray(len(tokens), dtype=int)
61 |     doc_topics = np.ndarray(len(tokens), dtype=int)
62 |     topic_changes = np.zeros(len(tokens), dtype=int)
63 | 
64 |     for i, w in enumerate(tokens):
65 |         word_id = word_ids[w]
66 |         topic = random.randrange(num_topics)
67 | 
68 |         doc_tokens[i] = word_id
69 |         doc_topics[i] = topic
70 | 
71 |         ## Update counts:
72 |         word_topics[word_id][topic] += 1
73 |         topic_totals[topic] += 1
74 |         doc_topic_counts[topic] += 1
75 | 
76 |     document["doc_tokens"] = doc_tokens
77 |     document["doc_topics"] = doc_topics
78 |     document["topic_changes"] = topic_changes
79 | 
80 | sampling_dist = np.zeros(num_topics, dtype=float)
81 | topic_normalizers = np.zeros(num_topics, dtype=float)
82 | for topic in range(num_topics):
83 |     topic_normalizers[topic] = 1.0 / (topic_totals[topic] + smoothing_times_vocab_size)
84 | 
85 | def profile():
86 | 
87 |     model = topicmodel.TopicModel(num_topics, vocabulary, doc_smoothing, word_smoothing)
88 |     document = documents[0]
89 | 
90 |     for document in documents:
91 |         c_doc = topicmodel.Document(document["doc_tokens"], document["doc_topics"], document["topic_changes"], document["topic_counts"])
92 |         model.add_document(c_doc)
93 | 
94 |     #model.sample(10)
95 | 
96 |     #cProfile.runctx("topicmodel.sample_doc(doc_tokens, doc_topics, topic_changes, doc_topic_counts, word_topics, topic_totals, sampling_dist, topic_normalizers, doc_smoothing, word_smoothing, smoothing_times_vocab_size, num_topics)", globals(), locals(), "topics.prof")
97 |     cProfile.runctx("model.sample(10)", globals(), locals(), 
"topics.prof") 98 | 99 | stats = pstats.Stats("topics.prof") 100 | stats.strip_dirs().sort_stats("time").print_stats() 101 | 102 | 103 | 104 | def sample(num_iterations): 105 | start = timer() 106 | 107 | for iteration in range(num_iterations): 108 | 109 | for document in documents: 110 | 111 | doc_topic_counts = document["topic_counts"] 112 | doc_tokens = document["doc_tokens"] 113 | doc_topics = document["doc_topics"] 114 | topic_changes = document["topic_changes"] 115 | 116 | # Pass the document to the fast C code 117 | topicmodel.sample_doc(doc_tokens, doc_topics, topic_changes, doc_topic_counts, word_topics, topic_totals, sampling_dist, topic_normalizers, doc_smoothing, word_smoothing, smoothing_times_vocab_size, num_topics) 118 | 119 | if iteration % 10 == 0: 120 | end = timer() 121 | print(end - start) 122 | start = timer() 123 | 124 | def entropy(p): 125 | ## make sure the vector is a valid probability distribution 126 | p = p / np.sum(p) 127 | 128 | result = 0.0 129 | for x in p: 130 | if x > 0.0: 131 | result += -x * math.log2(x) 132 | 133 | return result 134 | 135 | def print_topic(topic): 136 | sorted_words = sorted(zip(word_topics[:,topic], vocabulary), reverse=True) 137 | 138 | for i in range(20): 139 | w = sorted_words[i] 140 | print("{}\t{}".format(w[0], w[1])) 141 | 142 | def print_all_topics(): 143 | for topic in range(num_topics): 144 | sorted_words = sorted(zip(word_topics[:,topic], vocabulary), reverse=True) 145 | print(" ".join([w for x, w in sorted_words[:20]])) 146 | 147 | def write_state(writer): 148 | writer.write("Doc\tWordID\tWord\tTopic\tCounts\tChanges\n") 149 | 150 | for doc, document in enumerate(documents): 151 | doc_tokens = document["doc_tokens"] 152 | doc_topics = document["doc_topics"] 153 | topic_changes = document["topic_changes"] 154 | 155 | doc_length = len(doc_tokens) 156 | 157 | for i in range(doc_length): 158 | word_id = doc_tokens[i] 159 | word = vocabulary[word_id] 160 | topic = doc_topics[i] 161 | changes = topic_changes[i] 162 | 163 | writer.write("{}\t{}\t{}\t{}\t{}\t{}\n".format(doc, word_id, word, topic, word_counts[word], changes)) 164 | 165 | 166 | #profile() 167 | 168 | model = topicmodel.TopicModel(num_topics, vocabulary, doc_smoothing, word_smoothing) 169 | document = documents[0] 170 | 171 | for document in documents: 172 | c_doc = topicmodel.Document(document["doc_tokens"], document["doc_topics"], document["topic_changes"], document["topic_counts"]) 173 | model.add_document(c_doc) 174 | 175 | for i in range(20): 176 | start = timer() 177 | model.sample(50) 178 | print(timer() - start) 179 | model.print_all_topics() 180 | 181 | 182 | #sample(1000) 183 | #topicmodel.sample(10, documents, word_topics, topic_totals, doc_smoothing, word_smoothing, smoothing_times_vocab_size, num_topics) 184 | #print_all_topics() 185 | #with open("state.txt", "w") as writer: 186 | # write_state(writer) 187 | -------------------------------------------------------------------------------- /topicmodel.pyx: -------------------------------------------------------------------------------- 1 | # cython: linetrace=True 2 | 3 | from cython.view cimport array as cvarray 4 | import numpy as np 5 | import random 6 | from timeit import default_timer as timer 7 | 8 | class Document: 9 | 10 | def __init__(self, long[:] doc_tokens, long[:] doc_topics, long[:] topic_changes, long[:] doc_topic_counts): 11 | self.doc_tokens = doc_tokens 12 | self.doc_topics = doc_topics 13 | self.topic_changes = topic_changes 14 | self.doc_topic_counts = doc_topic_counts 15 | 16 | cdef 
class TopicModel: 17 | 18 | cdef long[:] topic_totals 19 | cdef long[:,:] word_topics 20 | cdef int num_topics 21 | cdef int vocab_size 22 | 23 | cdef double[:] topic_probs 24 | cdef double[:] topic_normalizers 25 | cdef float doc_smoothing 26 | cdef float word_smoothing 27 | cdef float smoothing_times_vocab_size 28 | 29 | documents = [] 30 | vocabulary = [] 31 | 32 | def __init__(self, num_topics, vocabulary, doc_smoothing, word_smoothing): 33 | self.num_topics = num_topics 34 | self.vocabulary.extend(vocabulary) 35 | self.vocab_size = len(vocabulary) 36 | 37 | self.doc_smoothing = doc_smoothing 38 | self.word_smoothing = word_smoothing 39 | self.smoothing_times_vocab_size = word_smoothing * self.vocab_size 40 | 41 | self.topic_totals = np.zeros(num_topics, dtype=int) 42 | self.word_topics = np.zeros((self.vocab_size, num_topics), dtype=int) 43 | 44 | def add_document(self, doc): 45 | cdef int word_id, topic 46 | 47 | self.documents.append(doc) 48 | 49 | for i in range(len(doc.doc_tokens)): 50 | word_id = doc.doc_tokens[i] 51 | topic = doc.doc_topics[i] 52 | 53 | self.word_topics[word_id,topic] += 1 54 | self.topic_totals[topic] += 1 55 | doc.doc_topic_counts[topic] += 1 56 | 57 | def sample(self, iterations): 58 | cdef int old_topic, new_topic, word_id, topic, i, doc_length 59 | cdef double sampling_sum = 0 60 | cdef double sample 61 | cdef long[:] word_topic_counts 62 | 63 | cdef long[:] doc_tokens 64 | cdef long[:] doc_topics 65 | cdef long[:] doc_topic_counts 66 | cdef long[:] topic_changes 67 | 68 | cdef double[:] uniform_variates 69 | cdef double[:] topic_probs = np.zeros(self.num_topics, dtype=float) 70 | cdef double[:] topic_normalizers = np.zeros(self.num_topics, dtype=float) 71 | 72 | for topic in range(self.num_topics): 73 | topic_normalizers[topic] = 1.0 / (self.topic_totals[topic] + self.smoothing_times_vocab_size) 74 | 75 | 76 | for iteration in range(iterations): 77 | for document in self.documents: 78 | doc_tokens = document.doc_tokens 79 | doc_topics = document.doc_topics 80 | doc_topic_counts = document.doc_topic_counts 81 | topic_changes = document.topic_changes 82 | 83 | doc_length = len(document.doc_tokens) 84 | uniform_variates = np.random.random_sample(doc_length) 85 | 86 | for i in range(doc_length): 87 | word_id = doc_tokens[i] 88 | old_topic = doc_topics[i] 89 | word_topic_counts = self.word_topics[word_id,:] 90 | 91 | ## erase the effect of this token 92 | word_topic_counts[old_topic] -= 1 93 | self.topic_totals[old_topic] -= 1 94 | doc_topic_counts[old_topic] -= 1 95 | 96 | topic_normalizers[old_topic] = 1.0 / (self.topic_totals[old_topic] + self.smoothing_times_vocab_size) 97 | 98 | ### 99 | ### SAMPLING DISTRIBUTION 100 | ### 101 | 102 | sampling_sum = 0.0 103 | for topic in range(self.num_topics): 104 | topic_probs[topic] = (doc_topic_counts[topic] + self.doc_smoothing) * (word_topic_counts[topic] + self.word_smoothing) * topic_normalizers[topic] 105 | sampling_sum += topic_probs[topic] 106 | 107 | #sample = random.uniform(0, sampling_sum) 108 | #sample = np.random.random_sample() * sampling_sum 109 | sample = uniform_variates[i] * sampling_sum 110 | 111 | new_topic = 0 112 | while sample > topic_probs[new_topic]: 113 | sample -= topic_probs[new_topic] 114 | new_topic += 1 115 | 116 | ## add back in the effect of this token 117 | word_topic_counts[new_topic] += 1 118 | self.topic_totals[new_topic] += 1 119 | doc_topic_counts[new_topic] += 1 120 | topic_normalizers[new_topic] = 1.0 / (self.topic_totals[new_topic] + self.smoothing_times_vocab_size) 121 | 122 | 
doc_topics[i] = new_topic
123 | 
124 |                     if new_topic != old_topic:
125 |                         #pass
126 |                         topic_changes[i] += 1
127 | 
128 |     def print_topic(self, int topic):
129 |         sorted_words = sorted(zip(self.word_topics[:,topic], self.vocabulary), reverse=True)
130 | 
131 |         for i in range(20):
132 |             w = sorted_words[i]
133 |             print("{}\t{}".format(w[0], w[1]))
134 | 
135 |     def print_all_topics(self):
136 |         for topic in range(self.num_topics):
137 |             sorted_words = sorted(zip(self.word_topics[:,topic], self.vocabulary), reverse=True)
138 |             print(" ".join([w for x, w in sorted_words[:20]]))
139 | 
140 | 
141 | 
142 | def sample_doc(long[:] doc_tokens, long[:] doc_topics, long[:] topic_changes, long[:] doc_topic_counts, long[:,:] word_topics, long[:] topic_totals, double[:] topic_probs, double[:] topic_normalizers, float doc_smoothing, float word_smoothing, float smoothing_times_vocab_size, int num_topics):
143 | 
144 |     cdef int old_topic, new_topic, word_id, topic, i
145 |     cdef double sampling_sum = 0
146 |     cdef double sample
147 |     cdef long[:] word_topic_counts
148 | 
149 |     cdef int doc_length = len(doc_tokens)
150 |     cdef double[:] uniform_variates = np.random.random_sample(doc_length)
151 | 
152 |     for i in range(doc_length):
153 |         word_id = doc_tokens[i]
154 |         old_topic = doc_topics[i]
155 |         word_topic_counts = word_topics[word_id,:]
156 | 
157 |         ## erase the effect of this token
158 |         word_topic_counts[old_topic] -= 1
159 |         topic_totals[old_topic] -= 1
160 |         doc_topic_counts[old_topic] -= 1
161 | 
162 |         topic_normalizers[old_topic] = 1.0 / (topic_totals[old_topic] + smoothing_times_vocab_size)
163 | 
164 |         ###
165 |         ### SAMPLING DISTRIBUTION
166 |         ###
167 | 
168 |         sampling_sum = 0.0
169 |         for topic in range(num_topics):
170 |             topic_probs[topic] = (doc_topic_counts[topic] + doc_smoothing) * (word_topic_counts[topic] + word_smoothing) * topic_normalizers[topic]
171 |             sampling_sum += topic_probs[topic]
172 | 
173 |         #sample = random.uniform(0, sampling_sum)
174 |         #sample = np.random.random_sample() * sampling_sum
175 |         sample = uniform_variates[i] * sampling_sum
176 | 
177 |         new_topic = 0
178 |         while sample > topic_probs[new_topic]:
179 |             sample -= topic_probs[new_topic]
180 |             new_topic += 1
181 | 
182 |         ## add back in the effect of this token
183 |         word_topic_counts[new_topic] += 1
184 |         topic_totals[new_topic] += 1
185 |         doc_topic_counts[new_topic] += 1
186 |         topic_normalizers[new_topic] = 1.0 / (topic_totals[new_topic] + smoothing_times_vocab_size)
187 | 
188 |         doc_topics[i] = new_topic
189 | 
190 |         if new_topic != old_topic:
191 |             #pass
192 |             topic_changes[i] += 1
193 | 
--------------------------------------------------------------------------------
/lda_reference.py:
--------------------------------------------------------------------------------
1 | """
2 | 
3 | Topic models look for groups of words that occur frequently together. We can often recognize these clusters as specific themes that appear in the collection -- thus the term "topic" model.
4 | 
5 | Our example corpus today is a collection of Viking sagas. Start Python like this:
6 | 
7 |     python -i lda_reference.py sagas_en.txt 20
8 | 
9 | We will work at the Python prompt ">>>".
10 | 
11 | Today we'll be working with the simplest and most reliable topic model algorithm, Gibbs sampling.
12 | Gibbs sampling is a way to take a very complicated optimization problem and break it into little problems that are individually easy.
13 | 
14 | First, we need to have a way of describing probability distributions.
15 | A discrete distribution is a vector of numbers that are >= 0.0 and sum to 1.0. 16 | One function is called *entropy*. Entropy takes a distribution and returns a number. 17 | 18 | 1. Run `entropy(np.array([0.7, 0.1, 0.2]))`. What is the value? 19 | 20 | [Response here] 21 | 22 | 2. Run `entropy(np.array([7, 1, 2]))`. Does the value change? Why or why not? 23 | 24 | [Response here] 25 | 26 | 3. Try different (non-negative) values of the three numbers. What is the largest value you can get, and what is the smallest? 27 | 28 | [Response here] 29 | 30 | 4. Now try different (non-negative) values of *four* numbers. Can you get a larger or smaller entropy than with three? 31 | 32 | [Response here] 33 | 34 | 5. Describe in your own words what entropy is measuring. 35 | 36 | [Response here] 37 | 38 | The Gibbs sampling algorithm proceeds in multiple iterations. In each iteration, 39 | we look at all the word tokens in all the documents, one after another. 40 | For each word, we erase its current topic assignment and sample a new topic 41 | assignment given all the other word tokens' topic assignments. 42 | 43 | Now look at the lines below the "SAMPLING DISTRIBUTION" comment. These define two vectors: 44 | * The probability of each topic in the current document 45 | * The probability of the current word in each topic 46 | 47 | We'll look at a particular dramatic moment in Njal's saga. Define these variables: 48 | 49 | document = documents[1160] 50 | doc_topic_counts = document["topic_counts"] 51 | word = "sword" 52 | word_topic_counts = word_topics[word] 53 | 54 | Use this command to suppress scientific notation: 55 | 56 | np.set_printoptions(suppress=True) 57 | 58 | 6. Calculate the entropy of `doc_topic_counts` 59 | 60 | 7. Calculate the entropy of `(doc_topic_counts + doc_smoothing)`. Should this be larger or smaller than the previous value? 61 | 62 | 8. Calculate the entropy of `(word_topic_counts + word_smoothing) / (topic_totals + smoothing_times_vocab_size)` 63 | 64 | 9. Calculate the entropy of `(doc_topic_counts + doc_smoothing) * (word_topic_counts + word_smoothing) / (topic_totals + smoothing_times_vocab_size)` 65 | 66 | These values are random initializations. Let's run the algorithm 67 | over the documents a few times and see what happens. Run: 68 | 69 | sample(25) 70 | 71 | Use `print_all_topics()` to get a view of the current state of the topics. 72 | 73 | 10. This function prints the number of tokens in each topic for the sample doc. Describe how (if at all) they change. 74 | 75 | 11. Recalculate the four entropies we calculated above for the sampling distribution. How are they different? 76 | 77 | 12. What is the value of `word_smoothing`? Previously we added 1.0 in this situation. Why are we using a different value now? Use the concept of entropy in your answer. 78 | 79 | [Response here] 80 | 81 | 13. What are Norse sagas about, from the perspective of the model? 82 | 83 | [Response here] 84 | 85 | 14. I'm removing a list of frequent words, words that are too short, and 86 | words whose first letter is capitalized. Why does removing capitalized words 87 | help? What happens if you remove that check? Is this a good idea? 
88 | 
89 | [Response here]
90 | 
91 | """
92 | 
93 | import re, sys, random, math
94 | import numpy as np
95 | from collections import Counter
96 | from timeit import default_timer as timer
97 | 
98 | word_pattern = re.compile(r"\w[\w\-']*\w|\w")
99 | 
100 | if len(sys.argv) != 3:
101 |     print("Usage: lda_reference.py [docs file] [num topics]")
102 |     sys.exit()
103 | 
104 | num_topics = int(sys.argv[2])
105 | doc_smoothing = 0.5
106 | word_smoothing = 0.01
107 | 
108 | stoplist = set()
109 | with open("stoplists/en.txt", encoding="utf-8") as stop_reader:
110 |     for line in stop_reader:
111 |         line = line.rstrip()
112 |         stoplist.add(line)
113 | 
114 | word_counts = Counter()
115 | 
116 | documents = []
117 | word_topics = {}
118 | topic_totals = np.zeros(num_topics)
119 | 
120 | for line in open(sys.argv[1], encoding="utf-8"):
121 |     #line = line.lower()
122 | 
123 |     tokens = word_pattern.findall(line)
124 | 
125 |     ## remove stopwords, short words, and upper-cased words
126 |     tokens = [w for w in tokens if not w in stoplist and len(w) >= 3 and not w[0].isupper()]
127 |     word_counts.update(tokens)
128 | 
129 |     doc_topic_counts = np.zeros(num_topics)
130 |     token_topics = []
131 | 
132 |     for w in tokens:
133 | 
134 |         ## Generate a topic randomly
135 |         topic = random.randrange(num_topics)
136 |         token_topics.append({ "word": w, "topic": topic })
137 | 
138 |         ## If we haven't seen this word before, initialize it
139 |         if not w in word_topics:
140 |             word_topics[w] = np.zeros(num_topics)
141 | 
142 |         ## Update counts:
143 |         word_topics[w][topic] += 1
144 |         topic_totals[topic] += 1
145 |         doc_topic_counts[topic] += 1
146 | 
147 |     documents.append({ "original": line, "token_topics": token_topics, "topic_counts": doc_topic_counts })
148 | 
149 | ## Now that we're done reading from disk, we can count the total
150 | ## number of words.
151 | vocabulary = list(word_counts.keys())
152 | vocabulary_size = len(vocabulary)
153 | 
154 | smoothing_times_vocab_size = word_smoothing * vocabulary_size
155 | 
156 | def sample(num_iterations):
157 |     for iteration in range(num_iterations):
158 | 
159 |         start = timer()
160 | 
161 |         for document in documents:
162 | 
163 |             doc_topic_counts = document["topic_counts"]
164 |             token_topics = document["token_topics"]
165 |             doc_length = len(token_topics)
166 |             for token_topic in token_topics:
167 | 
168 |                 w = token_topic["word"]
169 |                 old_topic = token_topic["topic"]
170 |                 word_topic_counts = word_topics[w]
171 | 
172 |                 ## erase the effect of this token
173 |                 word_topic_counts[old_topic] -= 1
174 |                 topic_totals[old_topic] -= 1
175 |                 doc_topic_counts[old_topic] -= 1
176 | 
177 |                 ###
178 |                 ### SAMPLING DISTRIBUTION
179 |                 ###
180 | 
181 |                 ## Does this topic occur often in the document?
182 |                 topic_probs = (doc_topic_counts + doc_smoothing) / (doc_length + num_topics * doc_smoothing)
183 |                 ## Does this word occur often in the topic?
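                ## Together with the document factor above, the next line completes
                ## the unnormalized Gibbs sampling weight for each topic:
                ##   (doc_topic_counts + doc_smoothing)
                ##     * (word_topic_counts + word_smoothing)
                ##     / (topic_totals + smoothing_times_vocab_size)
                ## (The 1 / (doc_length + num_topics * doc_smoothing) term is the same
                ## for every topic, so it does not affect which topic gets sampled.)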
184 | topic_probs *= (word_topic_counts + word_smoothing) / (topic_totals + smoothing_times_vocab_size) 185 | 186 | ## sample from an array that doesn't sum to 1.0 187 | sample = random.uniform(0, np.sum(topic_probs)) 188 | 189 | new_topic = 0 190 | while sample > topic_probs[new_topic]: 191 | sample -= topic_probs[new_topic] 192 | new_topic += 1 193 | 194 | ## add back in the effect of this token 195 | word_topic_counts[new_topic] += 1 196 | topic_totals[new_topic] += 1 197 | doc_topic_counts[new_topic] += 1 198 | 199 | token_topic["topic"] = new_topic 200 | end = timer() 201 | print(end - start) 202 | 203 | 204 | def entropy(p): 205 | ## make sure the vector is a valid probability distribution 206 | p = p / np.sum(p) 207 | 208 | result = 0.0 209 | for x in p: 210 | if x > 0.0: 211 | result += -x * math.log2(x) 212 | 213 | return result 214 | 215 | def print_topic(topic): 216 | sorted_words = sorted(vocabulary, key=lambda w: word_topics[w][topic], reverse=True) 217 | 218 | for i in range(20): 219 | w = sorted_words[i] 220 | print("{}\t{}".format(word_topics[w][topic], w)) 221 | 222 | def print_all_topics(): 223 | for topic in range(num_topics): 224 | sorted_words = sorted(vocabulary, key=lambda w: word_topics[w][topic], reverse=True) 225 | print(" ".join(sorted_words[:20])) 226 | 227 | sample(100) --------------------------------------------------------------------------------