├── .coveragerc ├── .gitattributes ├── .gitignore ├── LICENSE-MIT ├── MANIFEST.in ├── README.md ├── lda2vec ├── __init__.py ├── corpus.py ├── dirichlet_likelihood.py ├── embed_mixture.py ├── fake_data.py ├── lda2vec.py ├── negative_sampling.py ├── preprocess.py ├── topics.py ├── tracking.py └── utils.py ├── notebooks ├── dataset.ipynb ├── lda2vec_model.ipynb └── viz.ipynb ├── pyproject.toml ├── requirements.txt ├── setup.py ├── tests └── __init__.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | lda2vec 4 | tests 5 | branch = True 6 | omit = 7 | lda2vec/cli.py 8 | 9 | [report] 10 | exclude_lines = 11 | no cov 12 | no qa 13 | noqa 14 | pragma: no cover 15 | if __name__ == .__main__.: 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | 119 | #others 120 | nohup.out 121 | *.pkl 122 | *.bak 123 | *.dat 124 | *.npy 125 | *.dir 126 | *.npz 127 | *.hdf5 -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 ONLPS 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE-MIT 2 | include README.md 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lda2vec 2 | 3 | ----- 4 | 5 | **Table of Contents** 6 | 7 | * [Installation](#installation) 8 | * [License](#license) 9 | 10 | [![Downloads](https://pepy.tech/badge/pylda2vec)](https://pepy.tech/project/pylda2vec) 11 | 12 | ## Installation 13 | 14 | lda2vec is distributed on [PyPI](https://pypi.org) as a universal 15 | wheel and is available on Linux/macOS and Windows and supports 16 | Python 3.6+. 17 | 18 | ```bash 19 | $ pip install pylda2vec 20 | ``` 21 | 22 | ## License 23 | 24 | lda2vec is distributed under the terms of the 25 | [MIT License](https://choosealicense.com/licenses/mit). 26 | -------------------------------------------------------------------------------- /lda2vec/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.0' 2 | 3 | from .dirichlet_likelihood import dirichlet_likelihood 4 | from .embed_mixture import EmbedMixture 5 | from .tracking import Tracking 6 | from .preprocess import tokenize 7 | from .corpus import Corpus 8 | from .topics import * 9 | from .negative_sampling import NegativeSamplingFunction 10 | from .lda2vec import LDA2Vec 11 | -------------------------------------------------------------------------------- /lda2vec/corpus.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import difflib 4 | import pandas as pd 5 | from jellyfish import damerau_levenshtein_distance 6 | 7 | try: 8 | from pyxdameraulevenshtein import damerau_levenshtein_distance_withNPArray 9 | except ImportError: 10 | pass 11 | 12 | 13 | class Corpus(): 14 | _keys_frequency = None 15 | 16 | def __init__(self, out_of_vocabulary=-1, skip=-2): 17 | """ The Corpus helps with tasks involving integer representations of 18 | words. This object is used to filter, subsample, and convert loose 19 | word indices to compact word indices. 20 | 21 | 'Loose' word arrays are word indices given by a tokenizer. The word 22 | index is not necessarily representative of word's frequency rank, and 23 | so loose arrays tend to have 'gaps' of unused indices, which can make 24 | models less memory efficient. As a result, this class helps convert 25 | a loose array to a 'compact' one where the most common words have low 26 | indices, and the most infrequent have high indices. 27 | 28 | Corpus maintains a count of how many of each word it has seen so 29 | that it can later selectively filter frequent or rare words. However, 30 | since word popularity rank could change with incoming data the word 31 | index count must be updated fully and `self.finalize()` must be called 32 | before any filtering and subsampling operations can happen. 33 | 34 | Arguments 35 | --------- 36 | out_of_vocabulary : int, default=-1 37 | Token index to replace whenever we encounter a rare or unseen word. 38 | Instead of skipping the token, we mark as an out of vocabulary 39 | word. 40 | skip : int, default=-2 41 | Token index to replace whenever we want to skip the current frame. 42 | Particularly useful when subsampling words or when padding a 43 | sentence. 
44 | 45 | Examples 46 | -------- 47 | >>> corpus = Corpus() 48 | >>> words_raw = np.random.randint(100, size=25) 49 | >>> corpus.update_word_count(words_raw) 50 | >>> corpus.finalize() 51 | >>> words_compact = corpus.to_compact(words_raw) 52 | >>> words_pruned = corpus.filter_count(words_compact, min_count=2) 53 | >>> # words_sub = corpus.subsample_frequent(words_pruned, thresh=1e-5) 54 | >>> words_loose = corpus.to_loose(words_pruned) 55 | >>> not_oov = words_loose > -1 56 | >>> np.all(words_loose[not_oov] == words_raw[not_oov]) 57 | True 58 | """ 59 | self.counts_loose = defaultdict(int) 60 | self._finalized = False 61 | self.specials = dict(out_of_vocabulary=out_of_vocabulary, 62 | skip=skip) 63 | 64 | @property 65 | def n_specials(self): 66 | return len(self.specials) 67 | 68 | def update_word_count(self, loose_array): 69 | """ Update the corpus word counts given a loose array of word indices. 70 | Can be called multiple times, but once `finalize` is called the word 71 | counts cannot be updated. 72 | 73 | Arguments 74 | --------- 75 | loose_array : int array 76 | Array of word indices. 77 | 78 | Examples 79 | -------- 80 | >>> corpus = Corpus() 81 | >>> corpus.update_word_count(np.arange(10)) 82 | >>> corpus.update_word_count(np.arange(8)) 83 | >>> corpus.counts_loose[0] 84 | 2 85 | >>> corpus.counts_loose[9] 86 | 1 87 | """ 88 | self._check_unfinalized() 89 | uniques, counts = np.unique(np.ravel(loose_array), return_counts=True) 90 | msg = "Loose arrays cannot have elements below the values of special " 91 | msg += "tokens as these indices are reserved" 92 | assert uniques.min() >= min(self.specials.values()), msg 93 | for k, v in zip(uniques, counts): 94 | self.counts_loose[k] += v 95 | 96 | def _loose_keys_ordered(self): 97 | """ Get the loose keys in order of decreasing frequency""" 98 | loose_counts = sorted(self.counts_loose.items(), key=lambda x: x[1], 99 | reverse=True) 100 | keys = np.array(loose_counts)[:, 0] 101 | counts = np.array(loose_counts)[:, 1] 102 | order = np.argsort(counts)[::-1].astype('int32') 103 | keys, counts = keys[order], counts[order] 104 | # Add in the specials as a prefix to the other keys 105 | 106 | specials = np.sort(list(self.specials.values())) 107 | keys = np.concatenate((specials, keys)) 108 | empty = np.zeros(len(specials), dtype='int32') 109 | counts = np.concatenate((empty, counts)) 110 | n_keys = keys.shape[0] 111 | assert counts.min() >= 0 112 | return keys, counts, n_keys 113 | 114 | def finalize(self): 115 | """ Call `finalize` once done updating word counts. This means the 116 | object will no longer accept new word count data, but the loose 117 | to compact index mapping can be computed. This frees the object to 118 | filter, subsample, and compactify incoming word arrays. 119 | 120 | Examples 121 | -------- 122 | >>> corpus = Corpus() 123 | >>> # We'll update the word counts, making sure that word index 2 124 | >>> # is the most common word index. 125 | >>> corpus.update_word_count(np.arange(1) + 2) 126 | >>> corpus.update_word_count(np.arange(3) + 2) 127 | >>> corpus.update_word_count(np.arange(10) + 2) 128 | >>> corpus.update_word_count(np.arange(8) + 2) 129 | >>> corpus.counts_loose[2] 130 | 4 131 | >>> # The corpus has not been finalized yet, and so the compact mapping 132 | >>> # has not yet been computed. 133 | >>> corpus.keys_counts[0] 134 | Traceback (most recent call last): 135 | ... 
136 | AttributeError: Corpus instance has no attribute 'keys_counts' 137 | >>> corpus.finalize() 138 | >>> corpus.n_specials 139 | 2 140 | >>> # The special tokens are mapped to the first compact indices 141 | >>> corpus.compact_to_loose[0] 142 | -2 143 | >>> corpus.compact_to_loose[0] == corpus.specials['skip'] 144 | True 145 | >>> corpus.compact_to_loose[1] == corpus.specials['out_of_vocabulary'] 146 | True 147 | >>> corpus.compact_to_loose[2] # Most popular token is mapped next 148 | 2 149 | >>> corpus.loose_to_compact[3] # 2nd most popular token is mapped next 150 | 4 151 | >>> first_non_special = corpus.n_specials 152 | >>> corpus.keys_counts[first_non_special] # First normal token 153 | 4 154 | """ 155 | # Return the loose keys and counts in descending count order 156 | # so that the counts arrays is already in compact order 157 | self.keys_loose, self.keys_counts, n_keys = self._loose_keys_ordered() 158 | self.keys_compact = np.arange(n_keys).astype('int32') 159 | self.loose_to_compact = {l: c for l, c in 160 | zip(self.keys_loose, self.keys_compact)} 161 | self.compact_to_loose = {c: l for l, c in 162 | self.loose_to_compact.items()} 163 | self.specials_to_compact = {s: self.loose_to_compact[i] 164 | for s, i in self.specials.items()} 165 | self.compact_to_special = {c: s for c, s in 166 | self.specials_to_compact.items()} 167 | self._finalized = True 168 | 169 | @property 170 | def keys_frequency(self): 171 | if self._keys_frequency is None: 172 | f = self.keys_counts * 1.0 / np.sum(self.keys_counts) 173 | self._keys_frequency = f 174 | return self._keys_frequency 175 | 176 | def _check_finalized(self): 177 | msg = "self.finalized() must be called before any other array ops" 178 | assert self._finalized, msg 179 | 180 | def _check_unfinalized(self): 181 | msg = "Cannot update word counts after self.finalized()" 182 | msg += "has been called" 183 | assert not self._finalized, msg 184 | 185 | def filter_count(self, words_compact, min_count=15, max_count=0, 186 | max_replacement=None, min_replacement=None): 187 | """ Replace word indices below min_count with the pad index. 188 | 189 | Arguments 190 | --------- 191 | words_compact: int array 192 | Source array whose values will be replaced. This is assumed to 193 | already be converted into a compact array with `to_compact`. 194 | min_count : int 195 | Replace words less frequently occuring than this count. This 196 | defines the threshold for what words are very rare 197 | max_count : int 198 | Replace words occuring more frequently than this count. This 199 | defines the threshold for very frequent words 200 | min_replacement : int, default is out_of_vocabulary 201 | Replace words less than min_count with this. 202 | max_replacement : int, default is out_of_vocabulary 203 | Replace words greater than max_count with this. 204 | 205 | Examples 206 | -------- 207 | >>> corpus = Corpus() 208 | >>> # Make 1000 word indices with index < 100 and 209 | >>> # update the word counts. 210 | >>> word_indices = np.random.randint(100, size=1000) 211 | >>> corpus.update_word_count(word_indices) 212 | >>> corpus.finalize() # any word indices above 99 will be filtered 213 | >>> # Now create a new text, but with some indices above 100 214 | >>> word_indices = np.random.randint(200, size=1000) 215 | >>> word_indices.max() < 100 216 | False 217 | >>> # Remove words that have never appeared in the original corpus. 
218 | >>> filtered = corpus.filter_count(word_indices, min_count=1) 219 | >>> filtered.max() < 100 220 | True 221 | >>> # We can also remove highly frequent words. 222 | >>> filtered = corpus.filter_count(word_indices, max_count=2) 223 | >>> len(np.unique(word_indices)) > len(np.unique(filtered)) 224 | True 225 | """ 226 | self._check_finalized() 227 | ret = words_compact.copy() 228 | if min_replacement is None: 229 | min_replacement = self.specials_to_compact['out_of_vocabulary'] 230 | if max_replacement is None: 231 | max_replacement = self.specials_to_compact['out_of_vocabulary'] 232 | not_specials = np.ones(self.keys_counts.shape[0], dtype='bool') 233 | not_specials[:self.n_specials] = False 234 | if min_count: 235 | # Find first index with count less than min_count 236 | min_idx = np.argmax(not_specials & (self.keys_counts < min_count)) 237 | # Replace all indices greater than min_idx 238 | ret[ret > min_idx] = min_replacement 239 | if max_count: 240 | # Find first index with count less than max_count 241 | max_idx = np.argmax(not_specials & (self.keys_counts < max_count)) 242 | # Replace all indices less than max_idx 243 | ret[ret < max_idx] = max_replacement 244 | return ret 245 | 246 | def subsample_frequent(self, words_compact, threshold=1e-5): 247 | """ Subsample the most frequent words. This aggressively 248 | replaces words with frequencies higher than `threshold`. Words 249 | are replaced with the out_of_vocabulary token. 250 | 251 | Words will be replaced with probability as a function of their 252 | frequency in the training corpus: 253 | 254 | .. math:: 255 | p(w) = 1.0 - \sqrt{threshold\over f(w)} 256 | 257 | Arguments 258 | --------- 259 | words_compact: int array 260 | The input array to subsample. 261 | threshold: float in [0, 1] 262 | Words with frequencies higher than this will be increasingly 263 | subsampled. 264 | 265 | Examples 266 | -------- 267 | >>> corpus = Corpus() 268 | >>> word_indices = (np.random.power(5.0, size=1000) * 100).astype('i') 269 | >>> corpus.update_word_count(word_indices) 270 | >>> corpus.finalize() 271 | >>> compact = corpus.to_compact(word_indices) 272 | >>> sampled = corpus.subsample_frequent(compact, threshold=1e-2) 273 | >>> skip = corpus.specials_to_compact['skip'] 274 | >>> np.sum(compact == skip) # No skips in the compact tokens 275 | 0 276 | >>> np.sum(sampled == skip) > 0 # Many skips in the sampled tokens 277 | True 278 | 279 | .. [1] Distributed Representations of Words and Phrases and 280 | their Compositionality. Mikolov, Tomas and Sutskever, Ilya 281 | and Chen, Kai and Corrado, Greg S and Dean, Jeff 282 | Advances in Neural Information Processing Systems 26 283 | """ 284 | self._check_finalized() 285 | freq = self.keys_frequency + 1e-10 286 | pw = 1.0 - (np.sqrt(threshold / freq) + threshold / freq) 287 | prob = fast_replace(words_compact, self.keys_compact, pw) 288 | draw = np.random.uniform(size=prob.shape) 289 | ret = words_compact.copy() 290 | # If probability greater than draw, skip the word 291 | ret[prob > draw] = self.specials_to_compact['skip'] 292 | return ret 293 | 294 | def to_compact(self, word_loose): 295 | """ Convert a loose word index matrix to a compact array using 296 | a fixed loose to dense mapping. Out of vocabulary word indices 297 | will be replaced by the out of vocabulary index. The most common 298 | index will be mapped to 0, the next most common to 1, and so on. 299 | 300 | Arguments 301 | --------- 302 | word_loose : int array 303 | Input loose word array to be converted into a compact array. 
304 | 305 | 306 | Examples 307 | -------- 308 | >>> corpus = Corpus() 309 | >>> word_indices = np.random.randint(100, size=1000) 310 | >>> n_words = len(np.unique(word_indices)) 311 | >>> corpus.update_word_count(word_indices) 312 | >>> corpus.finalize() 313 | >>> word_compact = corpus.to_compact(word_indices) 314 | >>> # The most common word in the training set will be mapped to be 315 | >>> # right after all the special tokens, so 2 in this case. 316 | >>> np.argmax(np.bincount(word_compact)) == 2 317 | True 318 | >>> most_common = np.argmax(np.bincount(word_indices)) 319 | >>> corpus.loose_to_compact[most_common] == 2 320 | True 321 | >>> # Out of vocabulary indices will be mapped to 1 322 | >>> word_indices = np.random.randint(150, size=1000) 323 | >>> word_compact_oov = corpus.to_compact(word_indices) 324 | >>> oov = corpus.specials_to_compact['out_of_vocabulary'] 325 | >>> oov 326 | 1 327 | >>> oov in word_compact 328 | False 329 | >>> oov in word_compact_oov 330 | True 331 | """ 332 | self._check_finalized() 333 | keys = self.keys_loose 334 | reps = self.keys_compact 335 | uniques = np.unique(word_loose) 336 | # Find the out of vocab indices 337 | oov = np.setdiff1d(uniques, keys, assume_unique=True) 338 | oov_token = self.specials_to_compact['out_of_vocabulary'] 339 | keys = np.concatenate((keys, oov)) 340 | reps = np.concatenate((reps, np.zeros_like(oov) + oov_token)) 341 | compact = fast_replace(word_loose, keys, reps) 342 | msg = "Error: all compact indices should be non-negative" 343 | assert compact.min() >= 0, msg 344 | return compact 345 | 346 | def to_loose(self, word_compact): 347 | """ Convert a compacted array back into a loose array. 348 | 349 | Arguments 350 | --------- 351 | word_compact : int array 352 | Input compacted word array to be converted into a loose array. 353 | 354 | 355 | Examples 356 | -------- 357 | >>> corpus = Corpus() 358 | >>> word_indices = np.random.randint(100, size=1000) 359 | >>> corpus.update_word_count(word_indices) 360 | >>> corpus.finalize() 361 | >>> word_compact = corpus.to_compact(word_indices) 362 | >>> word_loose = corpus.to_loose(word_compact) 363 | >>> np.all(word_loose == word_indices) 364 | True 365 | """ 366 | self._check_finalized() 367 | uniques = np.unique(word_compact) 368 | # Find the out of vocab indices 369 | oov = np.setdiff1d(uniques, self.keys_compact, assume_unique=True) 370 | msg = "Found keys in `word_compact` not present in the" 371 | msg += "training corpus. Is this actually a compacted array?" 372 | assert np.all(oov < 0), msg 373 | loose = fast_replace(word_compact, self.keys_compact, self.keys_loose) 374 | return loose 375 | 376 | def compact_to_flat(self, word_compact, *components): 377 | """ Ravel a 2D compact array of documents (rows) and word 378 | positions (columns) into a 1D array of words. Leave out special 379 | tokens and ravel the component arrays in the same fashion. 380 | 381 | Arguments 382 | --------- 383 | word_compact : int array 384 | Array of word indices in documents. Has shape (n_docs, max_length) 385 | components : list of arrays 386 | A list of arrays detailing per-document properties. Each array 387 | must n_docs long. 
388 | 389 | Returns 390 | ------- 391 | flat : int array 392 | An array of all words unravelled into a 1D shape 393 | components : list of arrays 394 | Each array here is also unravelled into the same shape 395 | 396 | Examples 397 | -------- 398 | >>> corpus = Corpus() 399 | >>> word_indices = np.random.randint(100, size=1000) 400 | >>> corpus.update_word_count(word_indices) 401 | >>> corpus.finalize() 402 | >>> doc_texts = np.arange(8).reshape((2, 4)) 403 | >>> doc_texts[:, -1] = -2 # Mark as skips 404 | >>> doc_ids = np.arange(2) 405 | >>> compact = corpus.to_compact(doc_texts) 406 | >>> oov = corpus.specials_to_compact['out_of_vocabulary'] 407 | >>> compact[1, 3] = oov # Mark the last word as OOV 408 | >>> flat = corpus.compact_to_flat(compact) 409 | >>> flat.shape[0] == 6 # 2 special tokens were dropped from 8 words 410 | True 411 | >>> flat[-1] == corpus.loose_to_compact[doc_texts[1, 2]] 412 | True 413 | >>> flat, (flat_id,) = corpus.compact_to_flat(compact, doc_ids) 414 | >>> flat_id 415 | array([0, 0, 0, 1, 1, 1]) 416 | """ 417 | self._check_finalized() 418 | n_docs = word_compact.shape[0] 419 | max_length = word_compact.shape[1] 420 | idx = word_compact >= self.n_specials # keep only the non-special tokens 421 | components_raveled = [] 422 | msg = "Length of each component must match `word_compact` size" 423 | for component in components: 424 | raveled = np.tile(component[:, None], max_length)[idx] 425 | components_raveled.append(raveled) 426 | assert len(component) == n_docs, msg 427 | if len(components_raveled) == 0: 428 | return word_compact[idx] 429 | else: 430 | return word_compact[idx], components_raveled 431 | 432 | def word_list(self, vocab, max_compact_index=None, oov_token=''): 433 | """ Translate compact keys back into string representations for a word. 434 | 435 | Arguments 436 | --------- 437 | vocab : dict 438 | The vocab object has loose indices as keys and word strings as 439 | values. 440 | 441 | max_compact_index : int 442 | Only return words up to this index. If None, defaults to the number 443 | of compact indices available 444 | 445 | oov_token : str 446 | Returns this string if a compact index does not have a word in the 447 | vocab dictionary provided.
448 | 449 | Returns 450 | ------- 451 | word_list : list 452 | A list of string representations corresponding to word indices 453 | zero to `max_compact_index` 454 | 455 | Examples 456 | -------- 457 | 458 | >>> vocab = {0: 'But', 1: 'the', 2: 'night', 3: 'was', 4: 'warm'} 459 | >>> word_indices = np.zeros(50).astype('int32') 460 | >>> word_indices[:25] = 0 # 'But' shows 25 times 461 | >>> word_indices[25:35] = 1 # 'the' is in 10 times 462 | >>> word_indices[40:46] = 2 # 'night' is in 6 times 463 | >>> word_indices[46:49] = 3 # 'was' is in 3 times 464 | >>> word_indices[49:] = 4 # 'warm' is in 1 time 465 | >>> corpus = Corpus() 466 | >>> corpus.update_word_count(word_indices) 467 | >>> corpus.finalize() 468 | >>> # Build a vocabulary of word indices 469 | >>> corpus.word_list(vocab) 470 | ['skip', 'out_of_vocabulary', 'But', 'the', 'night', 'was', 'warm'] 471 | """ 472 | # Translate the compact keys into string words 473 | oov = self.specials['out_of_vocabulary'] 474 | words = [] 475 | if max_compact_index is None: 476 | max_compact_index = self.keys_compact.shape[0] 477 | index_to_special = {i: s for s, i in self.specials.items()} 478 | for compact_index in range(max_compact_index): 479 | loose_index = self.compact_to_loose.get(compact_index, oov) 480 | special = index_to_special.get(loose_index, oov_token) 481 | string = vocab.get(loose_index, special) 482 | words.append(string) 483 | return words 484 | 485 | def compact_word_vectors(self, vocab, filename=None, array=None, 486 | top=20000): 487 | """ Retrieve pretrained word vectors for our vocabulary. 488 | The returned word array has row indices corresponding to the 489 | compact index of a word, and columns corresponding to the word 490 | vector. 491 | 492 | Arguments 493 | --------- 494 | vocab : dict 495 | Dictionary where keys are the loose index, and values are 496 | the word string. 497 | 498 | filename : str 499 | Filename of binary word2vec-format word vectors, loaded via 500 | gensim's KeyedVectors. 501 | 502 | array : float array, optional 503 | If given, word vectors are written into this array rather than a newly allocated one.
504 | 505 | Returns 506 | ------- 507 | data : numpy float array 508 | Array such that data[compact_index, :] = word_vector 509 | 510 | Examples 511 | -------- 512 | >>> import numpy.linalg as nl 513 | >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'} 514 | >>> word_indices = np.zeros(50).astype('int32') 515 | >>> word_indices[:25] = 19 # 'Shuttle' shows 25 times 516 | >>> word_indices[25:35] = 5 # 'astronomy' is in 10 times 517 | >>> word_indices[40:46] = 7 # 'cold' is in 6 times 518 | >>> word_indices[46:] = 3 # 'hot' is in 3 times 519 | >>> corpus = Corpus() 520 | >>> corpus.update_word_count(word_indices) 521 | >>> corpus.finalize() 522 | >>> v, s, f = corpus.compact_word_vectors(vocab) 523 | >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y) 524 | >>> vocab[corpus.compact_to_loose[2]] 525 | 'shuttle' 526 | >>> vocab[corpus.compact_to_loose[3]] 527 | 'astronomy' 528 | >>> vocab[corpus.compact_to_loose[4]] 529 | 'cold' 530 | >>> sim_shuttle_astro = sim(v[2, :], v[3, :]) 531 | >>> sim_shuttle_cold = sim(v[2, :], v[4, :]) 532 | >>> sim_shuttle_astro > sim_shuttle_cold 533 | True 534 | """ 535 | n_words = len(self.compact_to_loose) 536 | from gensim.models.word2vec import Word2Vec 537 | from gensim.models import KeyedVectors 538 | model = KeyedVectors.load_word2vec_format(filename, binary=True) 539 | #model = Word2Vec.load_word2vec_format(filename, binary=True) 540 | n_dim = model.syn0.shape[1] 541 | data = np.random.normal(size=(n_words, n_dim)).astype('float32') 542 | data -= data.mean() 543 | data += model.syn0.mean() 544 | data /= data.std() 545 | data *= model.syn0.std() 546 | if array is not None: 547 | data = array 548 | n_words = data.shape[0] 549 | keys_raw = model.vocab.keys() 550 | keys = [s.encode('ascii', 'ignore') for s in keys_raw] 551 | lens = [len(s) for s in model.vocab.keys()] 552 | choices = np.array(keys, dtype='S') 553 | lengths = np.array(lens, dtype='int32') 554 | s, f = 0, 0 555 | 556 | def rep0(w): return w 557 | 558 | def rep1(w): return w.replace(' ', '_') 559 | 560 | def rep2(w): return w.title().replace(' ', '_') 561 | reps = [rep0, rep1, rep2] 562 | for compact in np.arange(top): 563 | loose = self.compact_to_loose.get(compact, None) 564 | if loose is None: 565 | continue 566 | word = vocab.get(loose, None) 567 | if word is None: 568 | continue 569 | word = word.strip() 570 | vector = None 571 | for rep in reps: 572 | clean = rep(word) 573 | if clean in model.vocab: 574 | vector = model[clean] 575 | break 576 | if vector is None: 577 | try: 578 | word = str(word) 579 | idx = lengths >= len(word) - 3 580 | idx &= lengths <= len(word) + 3 581 | sel = choices[idx] 582 | sel = str(sel.tolist()[0]) 583 | d = damerau_levenshtein_distance(word, sel) 584 | choice = np.array(keys_raw)[idx][np.argmin(d)] 585 | # choice = difflib.get_close_matches(word, choices)[0] 586 | vector = model[choice] 587 | print(compact, word, ' --> ', choice) 588 | except IndexError: 589 | pass 590 | if vector is None: 591 | f += 1 592 | continue 593 | s += 1 594 | data[compact, :] = vector[:] 595 | return data, s, f 596 | 597 | def compact_to_bow(self, word_compact, max_compact_index=None): 598 | """ Given a 2D array of compact indices, return the bag of words 599 | representation where the column is the word index, row is the document 600 | index, and the value is the number of times that word appears in that 601 | document. 
602 | 603 | >>> import numpy.linalg as nl 604 | >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'} 605 | >>> word_indices = np.zeros(50).astype('int32') 606 | >>> word_indices[:25] = 19 # 'Shuttle' shows 25 times 607 | >>> word_indices[25:35] = 5 # 'astronomy' is in 10 times 608 | >>> word_indices[40:46] = 7 # 'cold' is in 6 times 609 | >>> word_indices[46:] = 3 # 'hot' is in 3 times 610 | >>> corpus = Corpus() 611 | >>> corpus.update_word_count(word_indices) 612 | >>> corpus.finalize() 613 | >>> v = corpus.compact_to_bow(word_indices) 614 | >>> len(v) 615 | 20 616 | >>> v[:6] 617 | array([ 5, 0, 0, 4, 0, 10]) 618 | >>> v[19] 619 | 25 620 | >>> v.sum() 621 | 50 622 | >>> words = [[0, 0, 0, 3, 4], [1, 1, 1, 4, 5]] 623 | >>> words = np.array(words) 624 | >>> bow = corpus.compact_to_bow(words) 625 | >>> bow.shape 626 | (2, 6) 627 | """ 628 | if max_compact_index is None: 629 | max_compact_index = word_compact.max() 630 | 631 | def bincount(x): 632 | return np.bincount(x, minlength=max_compact_index + 1) 633 | axis = len(word_compact.shape) - 1 634 | bow = np.apply_along_axis(bincount, axis, word_compact) 635 | return bow 636 | 637 | def compact_to_coocurrence(self, word_compact, indices, window_size=10): 638 | """ From an array of compact tokens and aligned array of document indices 639 | compute (word, word, document) co-occurrences within a moving window. 640 | 641 | Arguments 642 | --------- 643 | word_compact: int array 644 | Sequence of tokens. 645 | 646 | indices: dict of int arrays 647 | Each array in this dictionary should represent the document index it 648 | came from. 649 | 650 | window_size: int 651 | Indicates the moving window size around which all co-occurrences will 652 | be computed. 653 | 654 | Returns 655 | ------- 656 | counts : DataFrame 657 | Returns a DataFrame with two columns for word index A and B, 658 | one extra column for each document index, and a final column for counts 659 | in that key. 
660 | 661 | >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0]) 662 | >>> doc_idx = np.array([0, 0, 0, 0, 1, 1, 1, 1]) 663 | >>> corpus = Corpus() 664 | >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx}) 665 | >>> counts.counts.sum() 666 | 24 667 | >>> counts.query('doc == 0').counts.values 668 | array([3, 3, 6]) 669 | >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0]) 670 | >>> doc_idx = np.array([0, 0, 0, 1, 1, 2, 2, 2]) 671 | >>> corpus = Corpus() 672 | >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx}) 673 | >>> counts.counts.sum() 674 | 14 675 | >>> counts.query('doc == 0').word_index_x.values 676 | array([0, 1, 1]) 677 | >>> counts.query('doc == 0').word_index_y.values 678 | array([1, 0, 1]) 679 | >>> counts.query('doc == 0').counts.values 680 | array([2, 2, 2]) 681 | >>> counts.query('doc == 1').counts.values 682 | array([1, 1]) 683 | """ 684 | tokens = pd.DataFrame(dict(word_index=word_compact)).reset_index() 685 | for name, index in indices.items(): 686 | tokens[name] = index 687 | a, b = tokens.copy(), tokens.copy() 688 | 689 | def mask(x): return np.prod([x[k + '_x'] == x[k + '_y'] 690 | for k in indices.keys()], axis=0) 691 | group_keys = ['word_index_x', 'word_index_y', ] 692 | group_keys += [k + '_x' for k in indices.keys()] 693 | total = [] 694 | a['frame'] = a['index'].copy() 695 | for frame in range(-window_size, window_size + 1): 696 | if frame == 0: 697 | continue 698 | b['frame'] = b['index'] + frame 699 | matches = (a.merge(b, on='frame') 700 | .assign(same_doc=mask) 701 | .pipe(lambda df: df[df['same_doc'] == 1]) 702 | .groupby(group_keys)['frame'] 703 | .count() 704 | .reset_index()) 705 | total.append(matches) 706 | counts = (pd.concat(total) 707 | .groupby(group_keys)['frame'] 708 | .sum() 709 | .reset_index() 710 | .rename(columns={k + '_x': k for k in indices.keys()}) 711 | .rename(columns=dict(frame='counts'))) 712 | return counts 713 | 714 | 715 | def fast_replace(data, keys, values, skip_checks=False): 716 | """ Do a search-and-replace in array `data`. 717 | 718 | Arguments 719 | --------- 720 | data : int array 721 | Array of integers 722 | keys : int array 723 | Array of keys inside of `data` to be replaced 724 | values : int array 725 | Array of values that replace the `keys` array 726 | skip_checks : bool, default=False 727 | Optionally skip sanity checking the input. 728 | 729 | Examples 730 | -------- 731 | >>> fast_replace(np.arange(5), np.arange(5), np.arange(5)[::-1]) 732 | array([4, 3, 2, 1, 0]) 733 | """ 734 | assert np.allclose(keys.shape, values.shape) 735 | if not skip_checks: 736 | msg = "data has elements not in keys" 737 | assert data.max() <= keys.max(), msg 738 | sdx = np.argsort(keys) 739 | keys, values = keys[sdx], values[sdx] 740 | idx = np.digitize(data, keys, right=True) 741 | new_data = values[idx] 742 | return new_data 743 | -------------------------------------------------------------------------------- /lda2vec/dirichlet_likelihood.py: -------------------------------------------------------------------------------- 1 | import chainer.functions as F 2 | from chainer import Variable 3 | 4 | 5 | def dirichlet_likelihood(weights, alpha=None): 6 | """ Calculate the log likelihood of the observed topic proportions. 7 | The return value is the negative of the (unnormalized) log likelihood, so lower values indicate more probable topic proportions. 8 | 9 | Args: 10 | weights (chainer.Variable): Unnormalized weight vector. The vector 11 | will be passed through a softmax function that will map the input 12 | onto a probability simplex.
13 | alpha (float): The Dirichlet concentration parameter. Alpha 14 | greater than 1.0 results in very dense topic weights such 15 | that each document belongs to many topics. Alpha < 1.0 results 16 | in sparser topic weights. The default is to set alpha to 17 | 1.0 / n_topics, effectively enforcing the prior belief that a 18 | document belongs to only a few topics at once. 19 | 20 | Returns: 21 | ~chainer.Variable: Output loss variable. 22 | """ 23 | if type(weights) is Variable: 24 | n_topics = weights.data.shape[1] 25 | else: 26 | n_topics = weights.W.data.shape[1] 27 | if alpha is None: 28 | alpha = 1.0 / n_topics 29 | if type(weights) is Variable: 30 | log_proportions = F.log_softmax(weights) 31 | else: 32 | log_proportions = F.log_softmax(weights.W) 33 | loss = (alpha - 1.0) * log_proportions 34 | return -F.sum(loss) 35 | -------------------------------------------------------------------------------- /lda2vec/embed_mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import chainer 4 | import chainer.links as L 5 | import chainer.functions as F 6 | from chainer import Variable 7 | 8 | 9 | def _orthogonal_matrix(shape): 10 | # Stolen from blocks: 11 | # github.com/mila-udem/blocks/blob/master/blocks/initialization.py 12 | M1 = np.random.randn(shape[0], shape[0]) 13 | M2 = np.random.randn(shape[1], shape[1]) 14 | 15 | # QR decomposition of matrix with entries in N(0, 1) is random 16 | Q1, R1 = np.linalg.qr(M1) 17 | Q2, R2 = np.linalg.qr(M2) 18 | # Correct that NumPy doesn't force diagonal of R to be non-negative 19 | Q1 = Q1 * np.sign(np.diag(R1)) 20 | Q2 = Q2 * np.sign(np.diag(R2)) 21 | 22 | n_min = min(shape[0], shape[1]) 23 | return np.dot(Q1[:, :n_min], Q2[:n_min, :]) 24 | 25 | 26 | class EmbedMixture(chainer.Chain): 27 | r""" A single document is encoded as a multinomial mixture of latent topics. 28 | The mixture is defined on the simplex, so that mixture weights always sum 29 | to 100%. The latent topic vectors resemble word vectors whose elements are 30 | defined over all real numbers. 31 | 32 | For example, a single document mix may be :math:`[0.9, 0.1]`, indicating 33 | that it is 90% in the first topic, 10% in the second. An example topic 34 | vector looks like :math:`[1.5e1, -1.3e0, +3.4e0, -0.2e0]`, which is 35 | largely uninterpretable until you measure the words most similar to this 36 | topic vector. 37 | 38 | A single document vector :math:`\vec{e}` is composed as weights :math:`c_j` 39 | over topic vectors :math:`\vec{T_j}`: 40 | 41 | .. math:: 42 | 43 | \vec{e}=\Sigma_{j=0}^{j=n\_topics}c_j\vec{T_j} 44 | 45 | This is usually paired with regularization on the weights :math:`c_j`. 46 | If using a Dirichlet prior with low alpha, these weights will be sparse. 47 | 48 | Args: 49 | n_documents (int): Total number of documents 50 | n_topics (int): Number of topics per document 51 | n_dim (int): Number of dimensions per topic vector (should match word 52 | vector size) 53 | 54 | Attributes: 55 | weights : chainer.links.EmbedID 56 | Unnormalized topic weights (:math:`c_j`). To normalize these 57 | weights, use `F.softmax(weights)`. 58 | factors : chainer.links.Parameter 59 | Topic vector matrix (:math:`T_j`) 60 | 61 | ..
seealso:: :func:`lda2vec.dirichlet_likelihood` 62 | """ 63 | 64 | def __init__(self, n_documents, n_topics, n_dim, dropout_ratio=0.2, 65 | temperature=1.0): 66 | self.n_documents = n_documents 67 | self.n_topics = n_topics 68 | self.n_dim = n_dim 69 | self.dropout_ratio = dropout_ratio 70 | factors = _orthogonal_matrix((n_topics, n_dim)).astype('float32') 71 | factors /= np.sqrt(n_topics + n_dim) 72 | super(EmbedMixture, self).__init__( 73 | weights=L.EmbedID(n_documents, n_topics), 74 | factors=L.Parameter(factors)) 75 | self.temperature = temperature 76 | self.weights.W.data[...] /= np.sqrt(n_documents + n_topics) 77 | 78 | def __call__(self, doc_ids, update_only_docs=False): 79 | """ Given an array of document integer indices, returns a vector 80 | for each document. The vector is composed of topic weights projected 81 | onto topic vectors. 82 | 83 | Args: 84 | doc_ids : chainer.Variable 85 | One-dimensional batch vectors of IDs 86 | 87 | Returns: 88 | doc_vector : chainer.Variable 89 | Batch of two-dimensional embeddings for every document. 90 | """ 91 | # (batchsize, ) --> (batchsize, multinomial) 92 | proportions = self.proportions(doc_ids, softmax=True) 93 | # (batchsize, n_factors) * (n_factors, n_dim) --> (batchsize, n_dim) 94 | factors = F.dropout(self.factors(), ratio=self.dropout_ratio) 95 | if update_only_docs: 96 | factors.unchain_backward() 97 | w_sum = F.matmul(proportions, factors) 98 | return w_sum 99 | 100 | def proportions(self, doc_ids, softmax=False): 101 | """ Given an array of document indices, return a vector 102 | for each document of just the unnormalized topic weights. 103 | 104 | Returns: 105 | doc_weights : chainer.Variable 106 | Two dimensional topic weights of each document. 107 | """ 108 | w = self.weights(doc_ids) 109 | if softmax: 110 | size = w.data.shape 111 | mask = self.xp.random.random_integers(0, 1, size=size) 112 | y = (F.softmax(w * self.temperature) * 113 | Variable(mask.astype('float32'))) 114 | norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y) 115 | return y / (norm + 1e-7) 116 | else: 117 | return w 118 | -------------------------------------------------------------------------------- /lda2vec/fake_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.random import random_sample 3 | 4 | 5 | def orthogonal_matrix(shape): 6 | # Stolen from blocks: 7 | # github.com/mila-udem/blocks/blob/master/blocks/initialization.py 8 | M1 = np.random.randn(shape[0], shape[0]) 9 | M2 = np.random.randn(shape[1], shape[1]) 10 | 11 | # QR decomposition of matrix with entries in N(0, 1) is random 12 | Q1, R1 = np.linalg.qr(M1) 13 | Q2, R2 = np.linalg.qr(M2) 14 | # Correct that NumPy doesn't force diagonal of R to be non-negative 15 | Q1 = Q1 * np.sign(np.diag(R1)) 16 | Q2 = Q2 * np.sign(np.diag(R2)) 17 | 18 | n_min = min(shape[0], shape[1]) 19 | return np.dot(Q1[:, :n_min], Q2[:n_min, :]) 20 | 21 | 22 | def softmax(w): 23 | # https://gist.github.com/stober/1946926 24 | w = np.array(w) 25 | maxes = np.amax(w, axis=1) 26 | maxes = maxes.reshape(maxes.shape[0], 1) 27 | e = np.exp(w - maxes) 28 | dist = e / np.sum(e, axis=1)[:, None] 29 | return dist 30 | 31 | 32 | def sample(values, probabilities, size): 33 | assert np.allclose(np.sum(probabilities, axis=-1), 1.0) 34 | bins = np.add.accumulate(probabilities) 35 | return values[np.digitize(random_sample(size), bins)] 36 | 37 | 38 | def fake_data(n_docs, n_words, n_sent_length, n_topics): 39 | """ Generate latent topic vectors for words 
and documents 40 | and then for each document, draw a sentence. Draw each word 41 | document with probability proportional to the dot product and 42 | normalized with a softmax. 43 | 44 | Arguments 45 | --------- 46 | n_docs : int 47 | Number of documents 48 | n_words : int 49 | Number of words in the vocabulary 50 | n_sent_length : int 51 | Number of words to draw for each document 52 | n_topics : int 53 | Number of topics that a single document can belong to. 54 | 55 | Returns 56 | ------- 57 | sentences : int array 58 | Array of word indices of shape (n_docs, n_sent_length). 59 | 60 | """ 61 | # These are log ratios for the doc & word topics 62 | doc_topics = orthogonal_matrix([n_docs, n_topics]) 63 | wrd_topics = orthogonal_matrix([n_topics, n_words]) 64 | # Multiply log ratios and softmax to get prob of word in doc 65 | doc_to_wrds = softmax(np.dot(doc_topics, wrd_topics)) 66 | # Now sample from doc_to_wrd to get realizations 67 | indices = np.arange(n_words).astype('int32') 68 | sentences = [] 69 | for doc_to_wrd in doc_to_wrds: 70 | words = sample(indices, doc_to_wrd, n_sent_length) 71 | sentences.append(words) 72 | sentences = np.array(sentences) 73 | return sentences.astype('int32') 74 | -------------------------------------------------------------------------------- /lda2vec/lda2vec.py: -------------------------------------------------------------------------------- 1 | from .embed_mixture import EmbedMixture 2 | from .dirichlet_likelihood import dirichlet_likelihood 3 | from .utils import move 4 | 5 | from chainer import Chain 6 | import chainer.links as L 7 | import chainer.functions as F 8 | 9 | import numpy as np 10 | 11 | 12 | class LDA2Vec(Chain): 13 | def __init__(self, n_documents=100, n_document_topics=10, 14 | n_units=256, n_vocab=1000, dropout_ratio=0.5, train=True, 15 | counts=None, n_samples=15, word_dropout_ratio=0.0, 16 | power=0.75, temperature=1.0): 17 | em = EmbedMixture(n_documents, n_document_topics, n_units, 18 | dropout_ratio=dropout_ratio, temperature=temperature) 19 | kwargs = {} 20 | kwargs['mixture'] = em 21 | #self.mixture = em 22 | kwargs['sampler'] = L.NegativeSampling(n_units, counts, n_samples, 23 | power=power) 24 | super(LDA2Vec, self).__init__(**kwargs) 25 | rand = np.random.random(self.sampler.W.data.shape) 26 | self.sampler.W.data[:, :] = rand[:, :] 27 | self.n_units = n_units 28 | self.train = train 29 | self.dropout_ratio = dropout_ratio 30 | self.word_dropout_ratio = word_dropout_ratio 31 | self.n_samples = n_samples 32 | 33 | def prior(self): 34 | dl1 = dirichlet_likelihood(self.mixture.weights) 35 | return dl1 36 | 37 | def fit_partial(self, rdoc_ids, rword_indices, window=5, 38 | update_only_docs=False): 39 | doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices) 40 | pivot_idx = next(move(self.xp, rword_indices[window: -window])) 41 | pivot = F.embed_id(pivot_idx, self.sampler.W) 42 | if update_only_docs: 43 | pivot.unchain_backward() 44 | doc_at_pivot = rdoc_ids[window: -window] 45 | doc = self.mixture(next(move(self.xp, doc_at_pivot)), 46 | update_only_docs=update_only_docs) 47 | loss = 0.0 48 | start, end = window, rword_indices.shape[0] - window 49 | context = (F.dropout(doc, self.dropout_ratio) + 50 | F.dropout(pivot, self.dropout_ratio)) 51 | for frame in range(-window, window + 1): 52 | # Skip predicting the current pivot 53 | if frame == 0: 54 | continue 55 | # Predict word given context and pivot word 56 | # The target starts before the pivot 57 | targetidx = rword_indices[start + frame: end + frame] 58 | doc_at_target = 
rdoc_ids[start + frame: end + frame] 59 | doc_is_same = doc_at_target == doc_at_pivot 60 | rand = np.random.uniform(0, 1, doc_is_same.shape[0]) 61 | mask = (rand > self.word_dropout_ratio).astype('bool') 62 | weight = np.logical_and(doc_is_same, mask).astype('int32') 63 | # If weight is 1.0 then targetidx 64 | # If weight is 0.0 then -1 65 | targetidx = targetidx * weight + -1 * (1 - weight) 66 | target, = move(self.xp, targetidx) 67 | loss = self.sampler(context, target) 68 | loss.backward() 69 | if update_only_docs: 70 | # Wipe out any gradient accumulation on word vectors 71 | self.sampler.W.grad *= 0.0 72 | return loss.data 73 | -------------------------------------------------------------------------------- /lda2vec/negative_sampling.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import six 3 | 4 | from chainer import cuda 5 | from chainer import function 6 | from chainer.utils import type_check 7 | 8 | 9 | class NegativeSamplingFunction(function.Function): 10 | 11 | ignore_label = -1 12 | 13 | def __init__(self, sampler, sample_size): 14 | self.sampler = sampler 15 | self.sample_size = sample_size 16 | 17 | def _make_samples(self, t): 18 | if hasattr(self, 'samples'): 19 | return self.samples # for testing 20 | 21 | size = int(t.shape[0]) 22 | # first one is the positive, and others are sampled negatives 23 | samples = self.sampler((size, self.sample_size + 1)) 24 | samples[:, 0] = t 25 | self.samples = samples 26 | 27 | def check_type_forward(self, in_types): 28 | type_check.expect(in_types.size() == 3) 29 | x_type, t_type, w_type = in_types 30 | 31 | type_check.expect( 32 | x_type.dtype == numpy.float32, 33 | x_type.ndim == 2, 34 | t_type.dtype == numpy.int32, 35 | t_type.ndim == 1, 36 | x_type.shape[0] == t_type.shape[0], 37 | w_type.dtype == numpy.float32, 38 | w_type.ndim == 2, 39 | ) 40 | 41 | def forward_cpu(self, inputs): 42 | x, t, W = inputs 43 | self.ignore_mask = (t != self.ignore_label) 44 | self._make_samples(t) 45 | 46 | loss = numpy.float32(0.0) 47 | for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask], 48 | self.samples[self.ignore_mask])): 49 | w = W[k] 50 | f = w.dot(ix) 51 | f[0] *= -1 # positive sample 52 | loss += numpy.sum(numpy.logaddexp(f, 0)) 53 | return numpy.array(loss, numpy.float32), 54 | 55 | def forward_gpu(self, inputs): 56 | x, t, W = inputs 57 | self.ignore_mask = (t != self.ignore_label) 58 | n_in = x.shape[1] 59 | self._make_samples(t) 60 | 61 | self.wx = cuda.elementwise( 62 | 'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx', 63 | ''' 64 | T f = 0; 65 | if (mask == 1){ 66 | for (int j = 0; j < c; ++j) { 67 | int x_ind[] = {(i / m), j}; 68 | int w_ind[] = {k, j}; 69 | f += x[x_ind] * W[w_ind]; 70 | } 71 | } 72 | wx = f; 73 | ''', 74 | 'negative_sampling_wx' 75 | )(W, x, self.ignore_mask[:, None], self.samples, n_in, 76 | self.sample_size + 1) 77 | 78 | y = cuda.elementwise( 79 | 'T wx, int32 c, int32 m', 'T y', 80 | ''' 81 | T f = wx; 82 | if (i % m == 0) { 83 | f = -f; 84 | } 85 | T loss; 86 | if (f < 0) { 87 | loss = __logf(1 + __expf(f)); 88 | } else { 89 | loss = f + __logf(1 + __expf(-f)); 90 | } 91 | y = loss; 92 | ''', 93 | 'negative_sampling_forward' 94 | )(self.wx, n_in, self.sample_size + 1) 95 | # TODO(okuta): merge elementwise 96 | loss = cuda.cupy.sum(y * self.ignore_mask[:, None].astype('float32')) 97 | return loss, 98 | 99 | def backward_cpu(self, inputs, grads): 100 | x, t, W = inputs 101 | gloss, = grads 102 | 103 | gx = numpy.zeros_like(x) 104 | gW = 
numpy.zeros_like(W) 105 | for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask], 106 | self.samples[self.ignore_mask])): 107 | w = W[k] 108 | f = w.dot(ix) 109 | 110 | # g == -y * gloss / (1 + exp(yf)) 111 | f[0] *= -1 112 | g = gloss / (1 + numpy.exp(-f)) 113 | g[0] *= -1 114 | 115 | gx[i] = g.dot(w) 116 | for ik, ig in six.moves.zip(k, g): 117 | gW[ik] += ig * ix 118 | return gx, None, gW 119 | 120 | def backward_gpu(self, inputs, grads): 121 | cupy = cuda.cupy 122 | x, t, W = inputs 123 | gloss, = grads 124 | 125 | n_in = x.shape[1] 126 | g = cuda.elementwise( 127 | 'T wx, raw T gloss, int32 m', 'T g', 128 | ''' 129 | T y; 130 | if (i % m == 0) { 131 | y = 1; 132 | } else { 133 | y = -1; 134 | } 135 | 136 | g = -y * gloss[0] / (1.0f + __expf(wx * y)); 137 | ''', 138 | 'negative_sampling_calculate_g' 139 | )(self.wx, gloss, self.sample_size + 1) 140 | gx = cupy.zeros_like(x) 141 | cuda.elementwise( 142 | 'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx', 143 | ''' 144 | int d = i / c; 145 | T w = 0; 146 | if (mask == 1){ 147 | for (int j = 0; j < m; ++j) { 148 | w += g[d * m + j] * W[k[d * m + j] * c + i % c]; 149 | } 150 | } 151 | gx = w; 152 | ''', 153 | 'negative_sampling_calculate_gx' 154 | )(g, W, self.ignore_mask[:, None], self.samples, n_in, 155 | self.sample_size + 1, gx) 156 | gW = cupy.zeros_like(W) 157 | cuda.elementwise( 158 | 'T g, raw T x, S k, bool mask, int32 c, int32 m', 159 | 'raw T gW', 160 | ''' 161 | T gi = g; 162 | if (mask == 1) { 163 | for (int j = 0; j < c; ++j) { 164 | atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]); 165 | } 166 | } 167 | ''', 168 | 'negative_sampling_calculate_gw' 169 | )(g, x, self.samples, self.ignore_mask[:, None], n_in, 170 | self.sample_size + 1, gW) 171 | return gx, None, gW 172 | 173 | 174 | def negative_sampling(x, t, W, sampler, sample_size): 175 | """Negative sampling loss function. 176 | 177 | In natural language processing, especially language modeling, the number of 178 | words in a vocabulary can be very large. 179 | Therefore, you need to spend a lot of time calculating the gradient of the 180 | embedding matrix. 181 | 182 | By using the negative sampling trick you only need to calculate the 183 | gradient for a few sampled negative examples. 184 | 185 | The objective function is below: 186 | 187 | .. math:: 188 | 189 | f(x, p) = \\log \\sigma(x^\\top w_p) + \\ 190 | k E_{i \\sim P(i)}[\\log \\sigma(- x^\\top w_i)], 191 | 192 | where :math:`\sigma(\cdot)` is a sigmoid function, :math:`w_i` is the 193 | weight vector for the word :math:`i`, and :math:`p` is a positive example. 194 | It is approximeted with :math:`k` examples :math:`N` sampled from 195 | probability :math:`P(i)`, like this: 196 | 197 | .. math:: 198 | 199 | f(x, p) \\approx \\log \\sigma(x^\\top w_p) + \\ 200 | \\sum_{n \\in N} \\log \\sigma(-x^\\top w_n). 201 | 202 | Each sample of :math:`N` is drawn from the word distribution :math:`P(w)`. 203 | This is calculated as :math:`P(w) = \\frac{1}{Z} c(w)^\\alpha`, where 204 | :math:`c(w)` is the unigram count of the word :math:`w`, :math:`\\alpha` is 205 | a hyper-parameter, and :math:`Z` is the normalization constant. 206 | 207 | Args: 208 | x (~chainer.Variable): Batch of input vectors. 209 | t (~chainer.Variable): Vector of groundtruth labels. 210 | W (~chainer.Variable): Weight matrix. 211 | sampler (function): Sampling function. It takes a shape and returns an 212 | integer array of the shape. Each element of this array is a sample 213 | from the word distribution. 
A :class:`~chainer.utils.WalkerAlias` 214 | object built with the power distribution of word frequency is 215 | recommended. 216 | sample_size (int): Number of samples. 217 | 218 | See: `Distributed Representations of Words and Phrases and their\ 219 | Compositionality `_ 220 | 221 | .. seealso:: :class:`~chainer.links.NegativeSampling`. 222 | 223 | """ 224 | return NegativeSamplingFunction(sampler, sample_size)(x, t, W) 225 | 226 | 227 | # Monkey-patch the chainer code to replace the negative sampling 228 | # with the one used here 229 | import chainer.links as L 230 | import chainer.functions as F 231 | negative_sampling.patched = True 232 | L.NegativeSampling.negative_sampling = negative_sampling 233 | F.negative_sampling = negative_sampling 234 | -------------------------------------------------------------------------------- /lda2vec/preprocess.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import numpy as np 3 | import en_core_web_md as en 4 | from tqdm import tqdm_notebook as progress 5 | from spacy.attrs import LOWER, LIKE_EMAIL, LIKE_URL 6 | import warnings 7 | 8 | warnings.simplefilter("ignore") 9 | 10 | 11 | def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None, 12 | **kwargs): 13 | """ Uses spaCy to quickly tokenize text and return an array 14 | of indices. 15 | 16 | Loading the spaCy model can take up to a minute the first time this 17 | is called; pass an existing `nlp` object to avoid reloading the model 18 | on later calls. 19 | 20 | Parameters 21 | ---------- 22 | texts : list of unicode strings 23 | These are the input documents. There can be multiple sentences per 24 | item in the list. 25 | max_length : int 26 | This is the maximum number of words per document. If the document is 27 | shorter than this number it will be padded to this length. 28 | skip : int, optional 29 | Short documents will be padded with this variable up until max_length. 30 | attr : int, from spacy.attrs 31 | What to transform the token to. Choice must be in spacy.attrs, and 32 | common choices are (LOWER, LEMMA) 33 | merge : bool, optional 34 | Merge noun phrases into a single token. Useful for turning 'New York' 35 | into a single token. 36 | nlp : spaCy NLP object, optional 37 | A spaCy NLP object. Useful for not reinstantiating the object multiple 38 | times. 39 | kwargs : dict, optional 40 | Any further arguments will be sent to the spaCy tokenizer. For extra 41 | speed consider setting tag=False, parse=False, entity=False, or 42 | n_threads=8. 43 | 44 | Returns 45 | ------- 46 | arr : 2D array of ints 47 | Has shape (len(texts), max_length). Each value represents 48 | the word index. 49 | vocab : dict 50 | Keys are the word index, and values are the string.
The pad index gets 51 | mapped to None 52 | 53 | >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"] 54 | >>> arr, vocab = tokenize(sents, 10, merge=True) 55 | >>> arr.shape[0] 56 | 2 57 | >>> arr.shape[1] 58 | 10 59 | >>> w2i = {w: i for i, w in vocab.iteritems()} 60 | >>> arr[0, 0] == w2i[u'do'] # First word and its index should match 61 | True 62 | >>> arr[0, 1] == w2i[u'you'] 63 | True 64 | >>> arr[0, -1] # last word in 0th document is a pad word 65 | -2 66 | >>> arr[0, 4] == w2i[u'class action lawsuit'] # noun phrase is tokenized 67 | True 68 | >>> arr[1, 1] # The URL token is thrown out 69 | -2 70 | """ 71 | if nlp is None: 72 | nlp = en.load() 73 | data = np.zeros((len(texts), max_length), dtype='int32') 74 | data[:] = skip 75 | bad_deps = ('amod', 'compound') 76 | token_list = [] 77 | vocab = {} 78 | index = 0 79 | for row, doc in progress(enumerate(nlp.pipe(texts, **kwargs))): 80 | if merge: 81 | for phrase in doc.noun_chunks: 82 | while len(phrase) > 1 and phrase[0].dep_ not in bad_deps: 83 | phrase = phrase[1:] 84 | if len(phrase) > 1: 85 | phrase.merge(phrase.root.tag_, phrase.text, 86 | phrase.root.ent_type_) 87 | for ent in doc.ents: 88 | if len(ent) > 1: 89 | ent.merge(ent.root.tag_, ent.text, ent.label_) 90 | 91 | dat = doc.to_array([LOWER, LIKE_EMAIL, LIKE_URL]).astype("int32") 92 | for i, token in enumerate(doc): 93 | text = token.text.lower() 94 | if text not in list(vocab.values()): 95 | dat[i][0] = index 96 | vocab[index] = text 97 | index += 1 98 | else: 99 | for k, v in vocab.items(): 100 | if v == text: 101 | value = k 102 | break 103 | dat[i][0] = value 104 | if len(dat) > 0: 105 | msg = "Negative indices reserved for special tokens" 106 | assert dat.min() >= 0, msg 107 | idx = (dat[:, 1] > 0) | (dat[:, 2] > 0) 108 | dat[idx] = skip 109 | length = min(len(dat), max_length) 110 | data[row, :length] = dat[:length, 0].ravel() 111 | 112 | vocab[skip] = '' 113 | return data, vocab 114 | 115 | 116 | if __name__ == "__main__": 117 | import doctest 118 | doctest.testmod() 119 | -------------------------------------------------------------------------------- /lda2vec/topics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import requests 3 | import multiprocessing 4 | 5 | 6 | def _softmax(x): 7 | e_x = np.exp(x - np.max(x)) 8 | out = e_x / e_x.sum() 9 | return out 10 | 11 | 12 | def _softmax_2d(x): 13 | y = x - x.max(axis=1, keepdims=True) 14 | np.exp(y, out=y) 15 | y /= y.sum(axis=1, keepdims=True) 16 | return y 17 | 18 | 19 | def prob_words(context, vocab, temperature=1.0): 20 | """ This calculates a softmax over the vocabulary as a function 21 | of the dot product of context and word. 22 | """ 23 | dot = np.dot(vocab, context) 24 | prob = _softmax(dot / temperature) 25 | return prob 26 | 27 | 28 | def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0, 29 | doc_lengths=None, term_frequency=None, normalize=False): 30 | """ Collects a dictionary of word, document and topic distributions. 31 | 32 | Arguments 33 | --------- 34 | weights : float array 35 | This must be an array of unnormalized log-odds of document-to-topic 36 | weights. Shape should be [n_documents, n_topics] 37 | factors : float array 38 | Should be an array of topic vectors. These topic vectors live in the 39 | same space as word vectors and will be used to find the most similar 40 | words to each topic. Shape should be [n_topics, n_dim]. 
41 | word_vectors : float array 42 | This must be a matrix of word vectors. Should be of shape 43 | [n_words, n_dim] 44 | vocab : list of str 45 | These must be the strings for words corresponding to 46 | indices [0, n_words] 47 | temperature : float 48 | Used to calculate the log probability of a word. Higher 49 | temperatures make more rare words more likely. 50 | doc_lengths : int array 51 | An array indicating the number of words in the nth document. 52 | Must be of shape [n_documents]. Required by pyLDAvis. 53 | term_frequency : int array 54 | An array indicating the overall number of times each token appears 55 | in the corpus. Must be of shape [n_words]. Required by pyLDAvis. 56 | 57 | Returns 58 | ------- 59 | data : dict 60 | This dictionary is readily consumed by pyLDAVis for topic 61 | visualization. 62 | """ 63 | # Map each factor vector to a word 64 | topic_to_word = [] 65 | msg = "Vocabulary size did not match size of word vectors" 66 | assert len(vocab) == word_vectors.shape[0], msg 67 | if normalize: 68 | word_vectors /= np.linalg.norm(word_vectors, axis=1)[:, None] 69 | # factors = factors / np.linalg.norm(factors, axis=1)[:, None] 70 | for factor_vector in factors: 71 | factor_to_word = prob_words(factor_vector, word_vectors, 72 | temperature=temperature) 73 | topic_to_word.append(np.ravel(factor_to_word)) 74 | topic_to_word = np.array(topic_to_word) 75 | msg = "Not all rows in topic_to_word sum to 1" 76 | assert np.allclose(np.sum(topic_to_word, axis=1), 1), msg 77 | # Collect document-to-topic distributions, e.g. theta 78 | doc_to_topic = _softmax_2d(weights) 79 | msg = "Not all rows in doc_to_topic sum to 1" 80 | assert np.allclose(np.sum(doc_to_topic, axis=1), 1), msg 81 | data = {'topic_term_dists': topic_to_word, 82 | 'doc_topic_dists': doc_to_topic, 83 | 'doc_lengths': doc_lengths, 84 | 'vocab': vocab, 85 | 'term_frequency': term_frequency} 86 | return data 87 | 88 | 89 | def print_top_words_per_topic(data, top_n=10, do_print=True): 90 | """ Given a pyLDAvis data array, print out the top words in every topic. 91 | 92 | Arguments 93 | --------- 94 | data : dict 95 | A dict object that summarizes topic data and has been made using 96 | `prepare_topics`. 97 | """ 98 | msgs = [] 99 | lists = [] 100 | for j, topic_to_word in enumerate(data['topic_term_dists']): 101 | top = np.argsort(topic_to_word)[::-1][:top_n] 102 | prefix = "Top words in topic %i " % j 103 | top_words = [data['vocab'][i].strip().replace(' ', '_') for i in top] 104 | msg = ' '.join(top_words) 105 | if do_print: 106 | print(prefix + msg) 107 | lists.append(top_words) 108 | return lists 109 | 110 | 111 | def get_request(url): 112 | for _ in range(5): 113 | try: 114 | return float(requests.get(url).text) 115 | except: 116 | pass 117 | return None 118 | 119 | 120 | def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci', 121 | 'umass']): 122 | """ Requests the topic coherence from AKSW Palmetto 123 | 124 | Arguments 125 | --------- 126 | lists : list of lists 127 | A list of lists with one list of top words for each topic. 
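services : list of str, optional Which Palmetto coherence measures to request; the default asks for all of 'ca', 'cp', 'cv', 'npmi', 'uci', and 'umass'. Returns ------- ans : dict Maps each (topic index, service) pair to the coherence value returned by the service, or None if the request repeatedly failed.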
128 | 129 | >>> topic_words = [['cake', 'apple', 'banana', 'cherry', 'chocolate']] 130 | >>> topic_coherence(topic_words, services=['cv']) 131 | {(0, 'cv'): 0.5678879445677241} 132 | """ 133 | url = u'http://palmetto.aksw.org/palmetto-webapp/service/{}?words={}' 134 | reqs = [url.format(s, '%20'.join(top[:10])) 135 | for s in services for top in lists] 136 | pool = multiprocessing.Pool() 137 | coherences = pool.map(get_request, reqs) 138 | pool.close() 139 | pool.terminate() 140 | pool.join() 141 | del pool 142 | args = [(j, s, top) for s in services for j, top in enumerate(lists)] 143 | ans = {} 144 | for ((j, s, t), tc) in zip(args, coherences): 145 | ans[(j, s)] = tc 146 | return ans 147 | -------------------------------------------------------------------------------- /lda2vec/tracking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | 4 | 5 | class Tracking: 6 | cache = {} 7 | calls = 0 8 | slope = 0.0 9 | 10 | def __init__(self, n=5000): 11 | """ The tracking class keeps a most recently used cache of values 12 | 13 | Parameters 14 | ---------- 15 | n: int 16 | Number of items to keep. 17 | """ 18 | self.n = n 19 | 20 | def add(self, key, item): 21 | """ Add an item with a particular key to the cache. 22 | 23 | >>> tracker = Tracking() 24 | >>> tracker.add('log_perplexity', 55.6) 25 | >>> tracker.cache['log_perplexity'] 26 | [55.6] 27 | >>> tracker.add('log_perplexity', 55.2) 28 | >>> tracker.add('loss', -12.1) 29 | >>> tracker.cache['log_perplexity'] 30 | [55.6, 55.2] 31 | >>> tracker.cache['loss'] 32 | [-12.1] 33 | """ 34 | if key not in self.cache: 35 | self.cache[key] = [] 36 | self.cache[key].append(item) 37 | if len(self.cache[key]) > self.n: 38 | self.cache[key] = self.cache[key][-self.n:] # keep only the most recent n items 39 | 40 | def stats(self, key): 41 | """ Get the statistics for items with a particular key 42 | 43 | >>> tracker = Tracking() 44 | >>> tracker.add('log_perplexity', 55.6) 45 | >>> tracker.add('log_perplexity', 55.2) 46 | >>> tracker.stats('log_perplexity') 47 | (55.400000000000006, 0.19999999999999929, 0.0) 48 | """ 49 | data = self.cache[key] 50 | mean = np.mean(data) 51 | std = np.std(data) 52 | slope = self.slope 53 | if self.calls % 100 == 0: 54 | lr = LinearRegression() 55 | x = np.arange(len(data)).astype('float32') 56 | lr.fit(x[:, None], np.array(data)) 57 | self.slope = lr.coef_[0] 58 | self.calls += 1 59 | return mean, std, slope 60 | 61 | 62 | if __name__ == "__main__": 63 | import doctest 64 | doctest.testmod() 65 | -------------------------------------------------------------------------------- /lda2vec/utils.py: -------------------------------------------------------------------------------- 1 | from chainer import Variable 2 | import random 3 | import numpy as np 4 | 5 | 6 | def move(xp, *args): 7 | for arg in args: 8 | if 'float' in str(arg.dtype): 9 | yield Variable(xp.asarray(arg, dtype='float32')) 10 | else: 11 | assert 'int' in str(arg.dtype) 12 | yield Variable(xp.asarray(arg, dtype='int32')) 13 | 14 | 15 | def most_similar(embeddings, word_index): 16 | input_vector = embeddings.W[word_index] 17 | similarities = embeddings.W.dot(input_vector) # dot product against every word vector 18 | return similarities 19 | 20 | 21 | def chunks(n, *args): 22 | """Yield successive n-sized chunks from the input arrays.""" 23 | # From stackoverflow question 312443 24 | keypoints = [] 25 | for i in range(0, len(args[0]), n): 26 | keypoints.append((i, i + n)) 27 | random.shuffle(keypoints) 28 | for a, b in keypoints: 29 | yield [arg[a:
b] for arg in args] 30 | 31 | 32 | class MovingAverage(): 33 | def __init__(self, lastn=100): 34 | self.points = np.array([]) 35 | self.lastn = lastn 36 | 37 | def add(self, x): 38 | self.points = np.append(self.points, x) 39 | 40 | def mean(self): 41 | return np.mean(self.points[-self.lastn:]) 42 | 43 | def std(self): 44 | return np.std(self.points[-self.lastn:]) 45 | 46 | def get_stats(self): 47 | return (np.mean(self.points[-self.lastn:]), 48 | np.std(self.points[-self.lastn:])) 49 | -------------------------------------------------------------------------------- /notebooks/dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import logging\n", 10 | "import pickle\n", 11 | "from sklearn.datasets import fetch_20newsgroups\n", 12 | "import numpy as np\n", 13 | "from lda2vec import preprocess, Corpus\n", 14 | "logging.basicConfig()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Fetch data\n", 24 | "remove = ('headers', 'footers', 'quotes')\n", 25 | "texts = fetch_20newsgroups(subset='train', remove=remove).data\n", 26 | "# Remove tokens with these substrings\n", 27 | "bad = set([\"ax>\", '`@(\"', '---', '===', '^^^'])\n", 28 | "\n", 29 | "\n", 30 | "def clean(line):\n", 31 | " return ' '.join(w for w in line.split() if not any(t in w for t in bad))\n", 32 | " " 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# Preprocess data\n", 42 | "max_length = 10000 # Limit of 10k words per document\n", 43 | "# Convert to unicode (spaCy only works with unicode)\n", 44 | "texts = [str(clean(d)) for d in texts if len(str(clean(d))) > 0]\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "#tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,\n", 54 | "# n_threads=4)\n", 55 | "\n", 56 | "tokens = np.load(\"tokens.npy\")\n", 57 | "vocab = np.load(\"vocab.npy\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "vocab = vocab.tolist()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "corpus = Corpus()\n", 76 | "# Make a ranked list of rare vs frequent words\n", 77 | "corpus.update_word_count(tokens)\n", 78 | "corpus.finalize()\n", 79 | "# The tokenization uses spaCy indices, and so may have gaps\n", 80 | "# between indices for words that aren't present in our dataset.\n", 81 | "# This builds a new compact index\n", 82 | "compact = corpus.to_compact(tokens)\n", 83 | "# Remove extremely rare words\n", 84 | "pruned = corpus.filter_count(compact, min_count=30)\n", 85 | "# Convert the compactified arrays into bag of words arrays\n", 86 | "bow = corpus.compact_to_bow(pruned)\n", 87 | "# Words tend to have power law frequency, so selectively\n", 88 | "# downsample the most prevalent words\n", 89 | "clean = corpus.subsample_frequent(pruned)\n", 90 | "# Now flatten a 2D array of document per row and word position\n", 91 | "# per column to a 1D array of words. 
This will also remove skips\n", 92 | "# and OoV words\n", 93 | "doc_ids = np.arange(pruned.shape[0])\n", 94 | "flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "assert flattened.min() >= 0\n", 104 | "# Fill in the pretrained word vectors\n", 105 | "n_dim = 300\n", 106 | "fn_wordvc = '../../../../Downloads/vectors/GoogleNews-vectors-negative300.bin'\n", 107 | "vectors, s, f = corpus.compact_word_vectors(vocab, filename=fn_wordvc)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Save all of the preprocessed files\n", 117 | "pickle.dump(vocab, open('vocab.pkl', 'wb'))\n", 118 | "pickle.dump(corpus, open('corpus.pkl', 'wb'))\n", 119 | "np.save(\"flattened\", flattened)\n", 120 | "np.save(\"doc_ids\", doc_ids)\n", 121 | "np.save(\"pruned\", pruned)\n", 122 | "np.save(\"bow\", bow)\n", 123 | "np.save(\"vectors\", vectors)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.7.2" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /notebooks/lda2vec_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import os.path\n", 11 | "import pickle\n", 12 | "import time\n", 13 | "import shelve\n", 14 | "\n", 15 | "import chainer\n", 16 | "from chainer import cuda\n", 17 | "from chainer import serializers\n", 18 | "import chainer.optimizers as O\n", 19 | "import numpy as np\n", 20 | "\n", 21 | "from lda2vec import utils\n", 22 | "from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence\n", 23 | "from lda2vec import LDA2Vec" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Using GPU:0\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "gpu_id = int(os.getenv('CUDA_GPU', 0))\n", 41 | "cuda.get_device(gpu_id).use()\n", 42 | "print(\"Using GPU:\" + str(gpu_id))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 |
"execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "#data_dir = os.getenv('data_dir', '../data/')\n", 52 | "fn_vocab = 'vocab.pkl'\n", 53 | "fn_corpus = 'corpus.pkl'\n", 54 | "fn_flatnd = 'flattened.npy'\n", 55 | "fn_docids = 'doc_ids.npy'\n", 56 | "fn_vectors = 'vectors.npy'\n", 57 | "vocab = pickle.load(open(fn_vocab, 'rb'))\n", 58 | "corpus = pickle.load(open(fn_corpus, 'rb'))\n", 59 | "flattened = np.load(fn_flatnd)\n", 60 | "doc_ids = np.load(fn_docids)\n", 61 | "vectors = np.load(fn_vectors)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Model Parameters\n", 71 | "# Number of documents\n", 72 | "n_docs = doc_ids.max() + 1\n", 73 | "# Number of unique words in the vocabulary\n", 74 | "n_vocab = flattened.max() + 1\n", 75 | "# 'Strength' of the dircihlet prior; 200.0 seems to work well\n", 76 | "clambda = 200.0\n", 77 | "# Number of topics to fit\n", 78 | "n_topics = int(os.getenv('n_topics', 20))\n", 79 | "batchsize = 4096\n", 80 | "# Power for neg sampling\n", 81 | "power = float(os.getenv('power', 0.75))\n", 82 | "# Intialize with pretrained word vectors\n", 83 | "pretrained = bool(int(os.getenv('pretrained', True)))\n", 84 | "# Sampling temperature\n", 85 | "temperature = float(os.getenv('temperature', 1.0))\n", 86 | "# Number of dimensions in a single word vector\n", 87 | "n_units = int(os.getenv('n_units', 300))\n", 88 | "# Get the string representation for every compact key\n", 89 | "words = corpus.word_list(vocab)[:n_vocab]\n", 90 | "# How many tokens are in each document\n", 91 | "doc_idx, lengths = np.unique(doc_ids, return_counts=True)\n", 92 | "doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')\n", 93 | "doc_lengths[doc_idx] = lengths\n", 94 | "# Count all token frequencies\n", 95 | "tok_idx, freq = np.unique(flattened, return_counts=True)\n", 96 | "term_frequency = np.zeros(n_vocab, dtype='int32')\n", 97 | "term_frequency[tok_idx] = freq" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 26, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "_ 11009\n", 110 | "_25 11009\n", 111 | "__doc__ Automatically created module for IPython interactive environment\n", 112 | "__loader__ None\n", 113 | "__name__ __main__\n", 114 | "__package__ None\n", 115 | "__spec__ None\n", 116 | "_dh ['/home/xenomorph/projects/onlps/lda2vec/notebooks']\n", 117 | "_i len(model.mixture.weights.W)\n", 118 | "_i13 serializers.load_npz('my.model', model)\n", 119 | "_i14 serializers.load_npz('lda2vec.hdf5', model)\n", 120 | "_i15 serializers.load_npz('lda2vec', model)\n", 121 | "_i16 serializers.load_hdf5(\"lda2vec.hdf5\")\n", 122 | "_i17 serializers.load_hdf5(\"lda2vec.hdf5\"), model)\n", 123 | "_i18 serializers.load_hdf5(\"lda2vec.hdf5\", model)\n", 124 | "_i19 model\n", 125 | "_i20 model\n", 126 | "_i21 import pickle\n", 127 | "_i22 with open(\"lda2vec.pkl\", \"w\"):\n", 128 | " pickle.dump(model)\n", 129 | "_i23 with open(\"lda2vec.pkl\", \"w\") as f:\n", 130 | " pickle.dump(model, f)\n", 131 | "_i24 with open(\"lda2vec.pkl\", \"wb\") as f:\n", 132 | " pickle.dump(model, f)\n", 133 | "_i25 len(model.mixture.weights.W)\n", 134 | "_ii with open(\"lda2vec.pkl\", \"wb\") as f:\n", 135 | " pickle.dump(model, f)\n", 136 | "_iii with open(\"lda2vec.pkl\", \"w\") as f:\n", 137 | " pickle.dump(model, f)\n", 138 | "batchsize 4096\n", 139 | "clambda 200.0\n", 140 | "d [6535 6535 
6535 ... 6535 6535 6535]\n", 141 | "doc_ids [ 0 0 0 ... 11008 11008 11008]\n", 142 | "doc_idx [ 0 1 2 ... 11006 11007 11008]\n", 143 | "doc_lengths [100 92 333 ... 115 63 50]\n", 144 | "dt 0.6835141181945801\n", 145 | "epoch 0\n", 146 | "flattened [ 10 38 1311 ... 49 49 49]\n", 147 | "fn_corpus corpus.pkl\n", 148 | "fn_docids doc_ids.npy\n", 149 | "fn_flatnd flattened.npy\n", 150 | "fn_vectors vectors.npy\n", 151 | "fn_vocab vocab.pkl\n", 152 | "fraction 0.0017746121285380678\n", 153 | "freq [105430 103758 100329 ... 30 30 29]\n", 154 | "gpu_id 0\n", 155 | "j 564\n", 156 | "key key\n", 157 | "l 15207.722\n", 158 | "lengths [100 92 333 ... 115 63 50]\n", 159 | "logs {'loss': 15207.7216796875, 'epoch': 0, 'j': 563, 'prior': -637025.4375, 'rate': 5992.560930298103}\n", 160 | "loss variable(-1130.473)\n", 161 | "msg J:{j:05d} E:{epoch:05d} L:{loss:1.3e} P:{prior:1.3e} R:{rate:1.3e}\n", 162 | "n_docs 11009\n", 163 | "n_topics 20\n", 164 | "n_units 300\n", 165 | "n_vocab 5838\n", 166 | "power 0.75\n", 167 | "pretrained True\n", 168 | "prior variable(-637025.44)\n", 169 | "rate 5992.560930298103\n", 170 | "remove ('headers', 'footers', 'quotes')\n", 171 | "t0 1549864137.5832853\n", 172 | "t1 1549864138.2667994\n", 173 | "temperature 1.0\n", 174 | "term_frequency [ 0 0 0 ... 30 30 29]\n", 175 | "tok_idx [ 3 4 5 ... 5835 5836 5837]\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "for key in sorted(locals().keys()):\n", 181 | " val = locals()[key]\n", 182 | " if len(str(val)) < 100 and '<' not in str(val):\n", 183 | " print(key, val)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "# training the model" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 7, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,\n", 200 | " n_units=n_units, n_vocab=n_vocab, counts=term_frequency,\n", 201 | " n_samples=15, power=power, temperature=temperature)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "if os.path.exists('lda2vec.hdf5'):\n", 211 | " print(\"Reloading from saved\")\n", 212 | " serializers.load_hdf5(\"lda2vec.hdf5\", model)\n", 213 | " \n", 214 | "if pretrained:\n", 215 | " model.sampler.W.data[:, :] = vectors[:n_vocab, :]" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 9, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "model.to_gpu()\n", 225 | "optimizer = O.Adam()\n", 226 | "optimizer.setup(model)\n", 227 | "clip = chainer.optimizer.GradientClipping(5.0)\n", 228 | "optimizer.add_hook(clip)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 10, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "j = 0\n", 238 | "epoch = 0\n", 239 | "fraction = batchsize * 1.0 / flattened.shape[0]\n", 240 | "progress = shelve.open('progress.shelve')" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "Top words in topic 0 galileo easier codes differences professor worse better van calculations complicated\n", 253 | "Top words in topic 1 gene dept subscribe nut puck altitude atlanta protein seed login\n", 254 | "Top words in topic 2 md exec languages consumer d.c. 
sf sensitive centris dir commands\n", 255 | "Top words in topic 3 wiretap politically 2d armenians politics di al conflicts differences political\n", 256 | "Top words in topic 4 puck shareware tyre maxtor header lens der responsibility visit ball\n", 257 | "Top words in topic 5 ss isaiah rf apologize mental v. skills arithmetic wolverine situation\n", 258 | "Top words in topic 6 criticism koresh intelligence demands replies skepticism theists spiritual teacher atheist\n", 259 | "Top words in topic 7 l. transmitted transmit widget expos pilot directory rider jim boost\n", 260 | "Top words in topic 8 hung england florida punishment lying california practice 96 baptism arizona\n", 261 | "Top words in topic 9 denning glory smokeless netters patches jim ban michael cubs alt\n", 262 | "Top words in topic 10 login documentary jury alomar murdered murder picture murders convicted cryptography\n", 263 | "Top words in topic 11 blank cartridge ahl ministry identification corpses goalie worthless authentication cooperation\n", 264 | "Top words in topic 12 hotel mileage seat apartment gas cabin sf jet pipe average\n", 265 | "Top words in topic 13 toll governor launches d.c. 76 zealand confusing route km launched\n", 266 | "Top words in topic 14 physics astronaut theology que ss v obey marriage des ahl\n", 267 | "Top words in topic 15 allergic dictionary myself homicides rape murders 'm filename symptoms differ\n", 268 | "Top words in topic 16 bullets rifle hitter accidents revolver anger crashes helmet dealer massacre\n", 269 | "Top words in topic 17 wounded protein injured voltage vitamin o'clock centaur fish blood damaged\n", 270 | "Top words in topic 18 mom pirates dream goalie priority stephanopoulos protecting 130 animation accomplish\n", 271 | "Top words in topic 19 login submit entry edu ftp edit interview password verify submitting\n", 272 | "0\n", 273 | "after partial fitting: 60025.6\n", 274 | "J:00000 E:00000 L:6.003e+04 P:-6.266e+05 R:2.576e+03\n", 275 | "after partial fitting: 59965.18\n", 276 | "J:00001 E:00000 L:5.997e+04 P:-6.266e+05 R:4.847e+03\n", 277 | "after partial fitting: 58814.625\n", 278 | "J:00002 E:00000 L:5.881e+04 P:-6.266e+05 R:4.773e+03\n", 279 | "after partial fitting: 58230.41\n", 280 | "J:00003 E:00000 L:5.823e+04 P:-6.266e+05 R:4.714e+03\n", 281 | "after partial fitting: 57572.15\n", 282 | "J:00004 E:00000 L:5.757e+04 P:-6.266e+05 R:6.034e+03\n", 283 | "after partial fitting: 53957.297\n", 284 | "J:00005 E:00000 L:5.396e+04 P:-6.266e+05 R:6.011e+03\n", 285 | "after partial fitting: 55610.78\n", 286 | "J:00006 E:00000 L:5.561e+04 P:-6.266e+05 R:5.859e+03\n", 287 | "after partial fitting: 55718.6\n", 288 | "J:00007 E:00000 L:5.572e+04 P:-6.266e+05 R:5.897e+03\n", 289 | "after partial fitting: 54909.97\n", 290 | "J:00008 E:00000 L:5.491e+04 P:-6.266e+05 R:5.979e+03\n", 291 | "after partial fitting: 53568.777\n", 292 | "J:00009 E:00000 L:5.357e+04 P:-6.266e+05 R:5.862e+03\n", 293 | "after partial fitting: 52668.152\n", 294 | "J:00010 E:00000 L:5.267e+04 P:-6.266e+05 R:5.952e+03\n", 295 | "after partial fitting: 54325.47\n", 296 | "J:00011 E:00000 L:5.433e+04 P:-6.266e+05 R:5.897e+03\n", 297 | "after partial fitting: 53106.15\n", 298 | "J:00012 E:00000 L:5.311e+04 P:-6.266e+05 R:5.384e+03\n", 299 | "after partial fitting: 52418.695\n", 300 | "J:00013 E:00000 L:5.242e+04 P:-6.266e+05 R:5.985e+03\n", 301 | "after partial fitting: 51428.49\n", 302 | "J:00014 E:00000 L:5.143e+04 P:-6.266e+05 R:6.027e+03\n", 303 | "after partial fitting: 52054.29\n", 304 | "J:00015 E:00000 
L:5.205e+04 P:-6.266e+05 R:5.959e+03\n", 305 | "after partial fitting: 50060.258\n", 306 | "J:00016 E:00000 L:5.006e+04 P:-6.266e+05 R:5.859e+03\n", 307 | "after partial fitting: 51417.117\n", 308 | "J:00017 E:00000 L:5.142e+04 P:-6.266e+05 R:5.870e+03\n", 309 | "after partial fitting: 51258.33\n", 310 | "J:00018 E:00000 L:5.126e+04 P:-6.266e+05 R:5.941e+03\n", 311 | "after partial fitting: 49484.758\n", 312 | "J:00019 E:00000 L:4.948e+04 P:-6.266e+05 R:5.990e+03\n", 313 | "after partial fitting: 51158.375\n", 314 | "J:00020 E:00000 L:5.116e+04 P:-6.266e+05 R:5.959e+03\n", 315 | "after partial fitting: 49449.215\n", 316 | "J:00021 E:00000 L:4.945e+04 P:-6.266e+05 R:4.884e+03\n", 317 | "after partial fitting: 48607.37\n", 318 | "J:00022 E:00000 L:4.861e+04 P:-6.266e+05 R:6.011e+03\n", 319 | "after partial fitting: 49262.7\n", 320 | "J:00023 E:00000 L:4.926e+04 P:-6.266e+05 R:5.805e+03\n", 321 | "after partial fitting: 48953.863\n", 322 | "J:00024 E:00000 L:4.895e+04 P:-6.266e+05 R:5.977e+03\n", 323 | "after partial fitting: 48829.59\n", 324 | "J:00025 E:00000 L:4.883e+04 P:-6.266e+05 R:5.944e+03\n", 325 | "after partial fitting: 48458.402\n", 326 | "J:00026 E:00000 L:4.846e+04 P:-6.266e+05 R:6.012e+03\n", 327 | "after partial fitting: 47735.047\n", 328 | "J:00027 E:00000 L:4.774e+04 P:-6.266e+05 R:6.013e+03\n", 329 | "after partial fitting: 46937.727\n", 330 | "J:00028 E:00000 L:4.694e+04 P:-6.266e+05 R:5.054e+03\n", 331 | "after partial fitting: 47446.117\n", 332 | "J:00029 E:00000 L:4.745e+04 P:-6.266e+05 R:5.018e+03\n", 333 | "after partial fitting: 46643.074\n", 334 | "J:00030 E:00000 L:4.664e+04 P:-6.266e+05 R:4.998e+03\n", 335 | "after partial fitting: 47368.3\n", 336 | "J:00031 E:00000 L:4.737e+04 P:-6.266e+05 R:5.440e+03\n", 337 | "after partial fitting: 46788.117\n", 338 | "J:00032 E:00000 L:4.679e+04 P:-6.266e+05 R:5.994e+03\n", 339 | "after partial fitting: 46492.664\n", 340 | "J:00033 E:00000 L:4.649e+04 P:-6.266e+05 R:6.045e+03\n", 341 | "after partial fitting: 46800.375\n", 342 | "J:00034 E:00000 L:4.680e+04 P:-6.266e+05 R:4.916e+03\n", 343 | "after partial fitting: 46502.49\n", 344 | "J:00035 E:00000 L:4.650e+04 P:-6.266e+05 R:5.881e+03\n", 345 | "after partial fitting: 46280.367\n", 346 | "J:00036 E:00000 L:4.628e+04 P:-6.266e+05 R:5.420e+03\n", 347 | "after partial fitting: 45915.508\n", 348 | "J:00037 E:00000 L:4.592e+04 P:-6.266e+05 R:5.019e+03\n", 349 | "after partial fitting: 44986.273\n", 350 | "J:00038 E:00000 L:4.499e+04 P:-6.266e+05 R:5.143e+03\n", 351 | "after partial fitting: 45867.562\n", 352 | "J:00039 E:00000 L:4.587e+04 P:-6.266e+05 R:5.123e+03\n", 353 | "after partial fitting: 45208.17\n", 354 | "J:00040 E:00000 L:4.521e+04 P:-6.266e+05 R:4.864e+03\n", 355 | "after partial fitting: 45020.992\n", 356 | "J:00041 E:00000 L:4.502e+04 P:-6.266e+05 R:5.882e+03\n", 357 | "after partial fitting: 44829.258\n", 358 | "J:00042 E:00000 L:4.483e+04 P:-6.266e+05 R:6.010e+03\n", 359 | "after partial fitting: 45038.305\n", 360 | "J:00043 E:00000 L:4.504e+04 P:-6.266e+05 R:5.870e+03\n", 361 | "after partial fitting: 44635.492\n", 362 | "J:00044 E:00000 L:4.464e+04 P:-6.266e+05 R:5.858e+03\n", 363 | "after partial fitting: 44424.434\n", 364 | "J:00045 E:00000 L:4.442e+04 P:-6.266e+05 R:6.008e+03\n", 365 | "after partial fitting: 43542.312\n", 366 | "J:00046 E:00000 L:4.354e+04 P:-6.266e+05 R:5.910e+03\n", 367 | "after partial fitting: 43630.508\n", 368 | "J:00047 E:00000 L:4.363e+04 P:-6.266e+05 R:6.064e+03\n", 369 | "after partial fitting: 43692.285\n", 370 | "J:00048 E:00000 
L:4.369e+04 P:-6.266e+05 R:5.880e+03\n", 371 | "after partial fitting: 44308.992\n", 372 | "J:00049 E:00000 L:4.431e+04 P:-6.266e+05 R:5.517e+03\n", 373 | "after partial fitting: 43372.08\n", 374 | "J:00050 E:00000 L:4.337e+04 P:-6.266e+05 R:5.322e+03\n", 375 | "after partial fitting: 43270.508\n", 376 | "J:00051 E:00000 L:4.327e+04 P:-6.266e+05 R:5.414e+03\n", 377 | "after partial fitting: 42826.88\n", 378 | "J:00052 E:00000 L:4.283e+04 P:-6.266e+05 R:5.849e+03\n", 379 | "after partial fitting: 42931.203\n", 380 | "J:00053 E:00000 L:4.293e+04 P:-6.266e+05 R:5.754e+03\n", 381 | "after partial fitting: 43169.156\n", 382 | "J:00054 E:00000 L:4.317e+04 P:-6.266e+05 R:5.793e+03\n", 383 | "after partial fitting: 41746.6\n", 384 | "J:00055 E:00000 L:4.175e+04 P:-6.266e+05 R:6.047e+03\n", 385 | "after partial fitting: 42317.406\n", 386 | "J:00056 E:00000 L:4.232e+04 P:-6.266e+05 R:5.987e+03\n", 387 | "after partial fitting: 41575.227\n", 388 | "J:00057 E:00000 L:4.158e+04 P:-6.266e+05 R:6.008e+03\n", 389 | "after partial fitting: 42544.33\n", 390 | "J:00058 E:00000 L:4.254e+04 P:-6.266e+05 R:5.794e+03\n", 391 | "after partial fitting: 41813.383\n", 392 | "J:00059 E:00000 L:4.181e+04 P:-6.266e+05 R:5.784e+03\n", 393 | "after partial fitting: 41465.195\n", 394 | "J:00060 E:00000 L:4.147e+04 P:-6.266e+05 R:5.856e+03\n", 395 | "after partial fitting: 41694.242\n", 396 | "J:00061 E:00000 L:4.169e+04 P:-6.266e+05 R:5.990e+03\n", 397 | "after partial fitting: 41176.25\n", 398 | "J:00062 E:00000 L:4.118e+04 P:-6.266e+05 R:5.985e+03\n", 399 | "after partial fitting: 40851.453\n", 400 | "J:00063 E:00000 L:4.085e+04 P:-6.266e+05 R:6.030e+03\n", 401 | "after partial fitting: 42134.867\n", 402 | "J:00064 E:00000 L:4.213e+04 P:-6.266e+05 R:5.333e+03\n", 403 | "after partial fitting: 40474.58\n", 404 | "J:00065 E:00000 L:4.047e+04 P:-6.266e+05 R:4.870e+03\n", 405 | "after partial fitting: 42136.484\n", 406 | "J:00066 E:00000 L:4.214e+04 P:-6.266e+05 R:4.597e+03\n", 407 | "after partial fitting: 40501.766\n", 408 | "J:00067 E:00000 L:4.050e+04 P:-6.266e+05 R:5.039e+03\n", 409 | "after partial fitting: 41102.46\n", 410 | "J:00068 E:00000 L:4.110e+04 P:-6.266e+05 R:5.076e+03\n", 411 | "after partial fitting: 41022.062\n", 412 | "J:00069 E:00000 L:4.102e+04 P:-6.266e+05 R:5.904e+03\n", 413 | "after partial fitting: 40279.215\n", 414 | "J:00070 E:00000 L:4.028e+04 P:-6.266e+05 R:5.971e+03\n", 415 | "after partial fitting: 39444.703\n", 416 | "J:00071 E:00000 L:3.944e+04 P:-6.266e+05 R:4.952e+03\n", 417 | "after partial fitting: 39023.19\n", 418 | "J:00072 E:00000 L:3.902e+04 P:-6.266e+05 R:5.509e+03\n", 419 | "after partial fitting: 38559.867\n", 420 | "J:00073 E:00000 L:3.856e+04 P:-6.266e+05 R:5.275e+03\n", 421 | "after partial fitting: 38806.773\n", 422 | "J:00074 E:00000 L:3.881e+04 P:-6.266e+05 R:5.970e+03\n", 423 | "after partial fitting: 39776.703\n", 424 | "J:00075 E:00000 L:3.978e+04 P:-6.266e+05 R:5.879e+03\n", 425 | "after partial fitting: 38228.57\n", 426 | "J:00076 E:00000 L:3.823e+04 P:-6.266e+05 R:5.946e+03\n", 427 | "after partial fitting: 38373.004\n", 428 | "J:00077 E:00000 L:3.837e+04 P:-6.266e+05 R:5.928e+03\n", 429 | "after partial fitting: 38043.43\n", 430 | "J:00078 E:00000 L:3.804e+04 P:-6.266e+05 R:6.069e+03\n", 431 | "after partial fitting: 37861.18\n", 432 | "J:00079 E:00000 L:3.786e+04 P:-6.266e+05 R:6.023e+03\n", 433 | "after partial fitting: 37864.953\n", 434 | "J:00080 E:00000 L:3.786e+04 P:-6.266e+05 R:5.961e+03\n", 435 | "after partial fitting: 38696.086\n", 436 | "J:00081 E:00000 
L:3.870e+04 P:-6.266e+05 R:5.974e+03\n", 437 | "after partial fitting: 37526.062\n", 438 | "J:00082 E:00000 L:3.753e+04 P:-6.266e+05 R:5.968e+03\n", 439 | "after partial fitting: 38310.363\n", 440 | "J:00083 E:00000 L:3.831e+04 P:-6.266e+05 R:5.950e+03\n", 441 | "after partial fitting: 37119.11\n", 442 | "J:00084 E:00000 L:3.712e+04 P:-6.266e+05 R:5.968e+03\n", 443 | "after partial fitting: 36629.86\n", 444 | "J:00085 E:00000 L:3.663e+04 P:-6.266e+05 R:5.961e+03\n", 445 | "after partial fitting: 37560.9\n", 446 | "J:00086 E:00000 L:3.756e+04 P:-6.266e+05 R:5.920e+03\n", 447 | "after partial fitting: 36843.906\n", 448 | "J:00087 E:00000 L:3.684e+04 P:-6.266e+05 R:5.975e+03\n", 449 | "after partial fitting: 36011.906\n", 450 | "J:00088 E:00000 L:3.601e+04 P:-6.266e+05 R:5.945e+03\n", 451 | "after partial fitting: 35597.51\n", 452 | "J:00089 E:00000 L:3.560e+04 P:-6.266e+05 R:5.917e+03\n", 453 | "after partial fitting: 36237.87\n", 454 | "J:00090 E:00000 L:3.624e+04 P:-6.266e+05 R:5.849e+03\n", 455 | "after partial fitting: 35933.492\n", 456 | "J:00091 E:00000 L:3.593e+04 P:-6.266e+05 R:5.356e+03\n", 457 | "after partial fitting: 35335.695\n", 458 | "J:00092 E:00000 L:3.534e+04 P:-6.266e+05 R:5.550e+03\n", 459 | "after partial fitting: 35161.688\n", 460 | "J:00093 E:00000 L:3.516e+04 P:-6.266e+05 R:5.813e+03\n", 461 | "after partial fitting: 34334.477\n", 462 | "J:00094 E:00000 L:3.433e+04 P:-6.266e+05 R:5.944e+03\n", 463 | "after partial fitting: 35093.33\n", 464 | "J:00095 E:00000 L:3.509e+04 P:-6.266e+05 R:5.857e+03\n", 465 | "after partial fitting: 35384.68\n", 466 | "J:00096 E:00000 L:3.538e+04 P:-6.266e+05 R:5.856e+03\n", 467 | "after partial fitting: 34563.676\n", 468 | "J:00097 E:00000 L:3.456e+04 P:-6.266e+05 R:5.873e+03\n", 469 | "after partial fitting: 34980.82\n", 470 | "J:00098 E:00000 L:3.498e+04 P:-6.266e+05 R:6.003e+03\n", 471 | "after partial fitting: 34145.688\n", 472 | "J:00099 E:00000 L:3.415e+04 P:-6.266e+05 R:6.062e+03\n", 473 | "after partial fitting: 34361.914\n", 474 | "J:00100 E:00000 L:3.436e+04 P:-6.266e+05 R:6.086e+03\n", 475 | "after partial fitting: 33269.227\n", 476 | "J:00101 E:00000 L:3.327e+04 P:-6.266e+05 R:6.014e+03\n", 477 | "after partial fitting: 34806.66\n", 478 | "J:00102 E:00000 L:3.481e+04 P:-6.266e+05 R:6.053e+03\n", 479 | "after partial fitting: 34571.63\n", 480 | "J:00103 E:00000 L:3.457e+04 P:-6.266e+05 R:6.010e+03\n", 481 | "after partial fitting: 34954.33\n", 482 | "J:00104 E:00000 L:3.495e+04 P:-6.266e+05 R:6.052e+03\n", 483 | "after partial fitting: 32349.73\n", 484 | "J:00105 E:00000 L:3.235e+04 P:-6.266e+05 R:6.013e+03\n", 485 | "after partial fitting: 32342.969\n", 486 | "J:00106 E:00000 L:3.234e+04 P:-6.266e+05 R:6.071e+03\n", 487 | "after partial fitting: 32015.27\n", 488 | "J:00107 E:00000 L:3.202e+04 P:-6.266e+05 R:6.093e+03\n", 489 | "after partial fitting: 31933.24\n", 490 | "J:00108 E:00000 L:3.193e+04 P:-6.266e+05 R:6.076e+03\n", 491 | "after partial fitting: 31818.71\n", 492 | "J:00109 E:00000 L:3.182e+04 P:-6.266e+05 R:6.074e+03\n", 493 | "after partial fitting: 31405.674\n", 494 | "J:00110 E:00000 L:3.141e+04 P:-6.266e+05 R:6.105e+03\n", 495 | "after partial fitting: 31727.777\n", 496 | "J:00111 E:00000 L:3.173e+04 P:-6.266e+05 R:6.075e+03\n", 497 | "after partial fitting: 31360.021\n", 498 | "J:00112 E:00000 L:3.136e+04 P:-6.266e+05 R:6.063e+03\n", 499 | "after partial fitting: 31144.55\n", 500 | "J:00113 E:00000 L:3.114e+04 P:-6.266e+05 R:6.088e+03\n", 501 | "after partial fitting: 30779.344\n", 502 | "J:00114 E:00000 
L:3.078e+04 P:-6.266e+05 R:6.112e+03\n", 503 | "after partial fitting: 30584.752\n", 504 | "J:00115 E:00000 L:3.058e+04 P:-6.266e+05 R:6.049e+03\n", 505 | "after partial fitting: 31395.637\n", 506 | "J:00116 E:00000 L:3.140e+04 P:-6.266e+05 R:6.067e+03\n", 507 | "after partial fitting: 30131.213\n", 508 | "J:00117 E:00000 L:3.013e+04 P:-6.266e+05 R:6.068e+03\n", 509 | "after partial fitting: 31391.371\n", 510 | "J:00118 E:00000 L:3.139e+04 P:-6.266e+05 R:6.051e+03\n", 511 | "after partial fitting: 29529.635\n", 512 | "J:00119 E:00000 L:2.953e+04 P:-6.266e+05 R:6.075e+03\n", 513 | "after partial fitting: 29273.645\n", 514 | "J:00120 E:00000 L:2.927e+04 P:-6.266e+05 R:6.021e+03\n", 515 | "after partial fitting: 29233.674\n", 516 | "J:00121 E:00000 L:2.923e+04 P:-6.266e+05 R:6.027e+03\n", 517 | "after partial fitting: 30145.396\n", 518 | "J:00122 E:00000 L:3.015e+04 P:-6.266e+05 R:6.048e+03\n", 519 | "after partial fitting: 29761.84\n", 520 | "J:00123 E:00000 L:2.976e+04 P:-6.266e+05 R:6.061e+03\n", 521 | "after partial fitting: 29238.902\n", 522 | "J:00124 E:00000 L:2.924e+04 P:-6.266e+05 R:6.069e+03\n", 523 | "after partial fitting: 29642.45\n", 524 | "J:00125 E:00000 L:2.964e+04 P:-6.266e+05 R:6.043e+03\n", 525 | "after partial fitting: 28716.219\n", 526 | "J:00126 E:00000 L:2.872e+04 P:-6.266e+05 R:6.053e+03\n", 527 | "after partial fitting: 27662.445\n", 528 | "J:00127 E:00000 L:2.766e+04 P:-6.266e+05 R:6.131e+03\n", 529 | "after partial fitting: 28901.814\n", 530 | "J:00128 E:00000 L:2.890e+04 P:-6.266e+05 R:6.043e+03\n", 531 | "after partial fitting: 25056.883\n", 532 | "J:00129 E:00000 L:2.506e+04 P:-6.266e+05 R:6.034e+03\n", 533 | "after partial fitting: 27731.514\n", 534 | "J:00130 E:00000 L:2.773e+04 P:-6.266e+05 R:6.025e+03\n", 535 | "after partial fitting: 27779.36\n", 536 | "J:00131 E:00000 L:2.778e+04 P:-6.267e+05 R:6.094e+03\n", 537 | "after partial fitting: 27414.254\n", 538 | "J:00132 E:00000 L:2.741e+04 P:-6.267e+05 R:6.056e+03\n", 539 | "after partial fitting: 27296.277\n", 540 | "J:00133 E:00000 L:2.730e+04 P:-6.267e+05 R:6.101e+03\n", 541 | "after partial fitting: 27481.258\n", 542 | "J:00134 E:00000 L:2.748e+04 P:-6.267e+05 R:6.020e+03\n", 543 | "after partial fitting: 28271.377\n", 544 | "J:00135 E:00000 L:2.827e+04 P:-6.267e+05 R:6.038e+03\n", 545 | "after partial fitting: 26254.014\n", 546 | "J:00136 E:00000 L:2.625e+04 P:-6.267e+05 R:6.078e+03\n", 547 | "after partial fitting: 26578.258\n", 548 | "J:00137 E:00000 L:2.658e+04 P:-6.267e+05 R:6.081e+03\n", 549 | "after partial fitting: 26785.209\n", 550 | "J:00138 E:00000 L:2.679e+04 P:-6.267e+05 R:6.080e+03\n", 551 | "after partial fitting: 27028.56\n", 552 | "J:00139 E:00000 L:2.703e+04 P:-6.267e+05 R:6.043e+03\n", 553 | "after partial fitting: 26331.686\n", 554 | "J:00140 E:00000 L:2.633e+04 P:-6.267e+05 R:6.063e+03\n", 555 | "after partial fitting: 25806.61\n", 556 | "J:00141 E:00000 L:2.581e+04 P:-6.267e+05 R:5.967e+03\n", 557 | "after partial fitting: 25997.297\n", 558 | "J:00142 E:00000 L:2.600e+04 P:-6.267e+05 R:6.098e+03\n", 559 | "after partial fitting: 25662.008\n", 560 | "J:00143 E:00000 L:2.566e+04 P:-6.267e+05 R:6.116e+03\n", 561 | "after partial fitting: 25373.52\n", 562 | "J:00144 E:00000 L:2.537e+04 P:-6.267e+05 R:6.127e+03\n", 563 | "after partial fitting: 24699.531\n", 564 | "J:00145 E:00000 L:2.470e+04 P:-6.267e+05 R:5.986e+03\n", 565 | "after partial fitting: 26028.555\n", 566 | "J:00146 E:00000 L:2.603e+04 P:-6.267e+05 R:6.073e+03\n", 567 | "after partial fitting: 24439.402\n", 568 | "J:00147 
E:00000 L:2.444e+04 P:-6.267e+05 R:6.073e+03\n", 569 | "after partial fitting: 24852.848\n", 570 | "J:00148 E:00000 L:2.485e+04 P:-6.267e+05 R:6.059e+03\n", 571 | "after partial fitting: 24875.38\n", 572 | "J:00149 E:00000 L:2.488e+04 P:-6.267e+05 R:6.064e+03\n", 573 | "after partial fitting: 23890.79\n", 574 | "J:00150 E:00000 L:2.389e+04 P:-6.267e+05 R:6.033e+03\n", 575 | "after partial fitting: 24489.16\n", 576 | "J:00151 E:00000 L:2.449e+04 P:-6.267e+05 R:6.116e+03\n", 577 | "after partial fitting: 23221.39\n", 578 | "J:00152 E:00000 L:2.322e+04 P:-6.267e+05 R:6.053e+03\n", 579 | "after partial fitting: 23643.238\n", 580 | "J:00153 E:00000 L:2.364e+04 P:-6.267e+05 R:6.057e+03\n", 581 | "after partial fitting: 23551.635\n", 582 | "J:00154 E:00000 L:2.355e+04 P:-6.267e+05 R:6.055e+03\n", 583 | "after partial fitting: 22977.184\n", 584 | "J:00155 E:00000 L:2.298e+04 P:-6.267e+05 R:6.073e+03\n", 585 | "after partial fitting: 23447.684\n", 586 | "J:00156 E:00000 L:2.345e+04 P:-6.267e+05 R:6.085e+03\n", 587 | "after partial fitting: 22477.803\n", 588 | "J:00157 E:00000 L:2.248e+04 P:-6.267e+05 R:6.150e+03\n", 589 | "after partial fitting: 22690.54\n", 590 | "J:00158 E:00000 L:2.269e+04 P:-6.267e+05 R:6.063e+03\n", 591 | "after partial fitting: 22940.223\n", 592 | "J:00159 E:00000 L:2.294e+04 P:-6.267e+05 R:6.046e+03\n", 593 | "after partial fitting: 22163.59\n", 594 | "J:00160 E:00000 L:2.216e+04 P:-6.267e+05 R:5.698e+03\n", 595 | "after partial fitting: 22588.23\n", 596 | "J:00161 E:00000 L:2.259e+04 P:-6.267e+05 R:5.782e+03\n", 597 | "after partial fitting: 22168.441\n", 598 | "J:00162 E:00000 L:2.217e+04 P:-6.267e+05 R:6.010e+03\n", 599 | "after partial fitting: 21736.072\n", 600 | "J:00163 E:00000 L:2.174e+04 P:-6.267e+05 R:5.721e+03\n", 601 | "after partial fitting: 23206.814\n", 602 | "J:00164 E:00000 L:2.321e+04 P:-6.267e+05 R:4.622e+03\n", 603 | "after partial fitting: 22202.191\n", 604 | "J:00165 E:00000 L:2.220e+04 P:-6.267e+05 R:4.764e+03\n", 605 | "after partial fitting: 21514.31\n", 606 | "J:00166 E:00000 L:2.151e+04 P:-6.267e+05 R:5.897e+03\n", 607 | "after partial fitting: 22966.6\n", 608 | "J:00167 E:00000 L:2.297e+04 P:-6.267e+05 R:5.875e+03\n", 609 | "after partial fitting: 21476.262\n", 610 | "J:00168 E:00000 L:2.148e+04 P:-6.267e+05 R:5.847e+03\n", 611 | "after partial fitting: 20888.7\n", 612 | "J:00169 E:00000 L:2.089e+04 P:-6.267e+05 R:5.988e+03\n", 613 | "after partial fitting: 21083.533\n", 614 | "J:00170 E:00000 L:2.108e+04 P:-6.267e+05 R:5.261e+03\n", 615 | "after partial fitting: 20684.258\n", 616 | "J:00171 E:00000 L:2.068e+04 P:-6.267e+05 R:5.008e+03\n", 617 | "after partial fitting: 20984.19\n", 618 | "J:00172 E:00000 L:2.098e+04 P:-6.267e+05 R:5.987e+03\n", 619 | "after partial fitting: 20521.496\n", 620 | "J:00173 E:00000 L:2.052e+04 P:-6.267e+05 R:5.928e+03\n", 621 | "after partial fitting: 20661.85\n", 622 | "J:00174 E:00000 L:2.066e+04 P:-6.267e+05 R:5.984e+03\n", 623 | "after partial fitting: 20266.916\n", 624 | "J:00175 E:00000 L:2.027e+04 P:-6.267e+05 R:6.010e+03\n", 625 | "after partial fitting: 20429.816\n", 626 | "J:00176 E:00000 L:2.043e+04 P:-6.267e+05 R:5.596e+03\n", 627 | "after partial fitting: 20474.457\n", 628 | "J:00177 E:00000 L:2.047e+04 P:-6.267e+05 R:6.005e+03\n", 629 | "after partial fitting: 19181.3\n", 630 | "J:00178 E:00000 L:1.918e+04 P:-6.267e+05 R:6.017e+03\n", 631 | "after partial fitting: 19949.555\n", 632 | "J:00179 E:00000 L:1.995e+04 P:-6.267e+05 R:5.903e+03\n", 633 | "after partial fitting: 19915.707\n", 634 | "J:00180 
E:00000 L:1.992e+04 P:-6.267e+05 R:5.939e+03\n", 635 | "after partial fitting: 19854.262\n", 636 | "J:00181 E:00000 L:1.985e+04 P:-6.267e+05 R:6.004e+03\n", 637 | "after partial fitting: 19265.129\n", 638 | "J:00182 E:00000 L:1.927e+04 P:-6.267e+05 R:6.076e+03\n", 639 | "after partial fitting: 19137.309\n", 640 | "J:00183 E:00000 L:1.914e+04 P:-6.267e+05 R:6.050e+03\n", 641 | "after partial fitting: 17785.469\n", 642 | "J:00184 E:00000 L:1.779e+04 P:-6.267e+05 R:5.906e+03\n", 643 | "after partial fitting: 19165.738\n", 644 | "J:00185 E:00000 L:1.917e+04 P:-6.267e+05 R:5.875e+03\n", 645 | "after partial fitting: 19463.63\n", 646 | "J:00186 E:00000 L:1.946e+04 P:-6.267e+05 R:5.850e+03\n", 647 | "after partial fitting: 18957.111\n", 648 | "J:00187 E:00000 L:1.896e+04 P:-6.267e+05 R:5.782e+03\n", 649 | "after partial fitting: 18527.273\n", 650 | "J:00188 E:00000 L:1.853e+04 P:-6.267e+05 R:5.902e+03\n", 651 | "after partial fitting: 18686.93\n", 652 | "J:00189 E:00000 L:1.869e+04 P:-6.267e+05 R:5.658e+03\n", 653 | "after partial fitting: 18578.914\n", 654 | "J:00190 E:00000 L:1.858e+04 P:-6.267e+05 R:5.706e+03\n", 655 | "after partial fitting: 18030.516\n", 656 | "J:00191 E:00000 L:1.803e+04 P:-6.267e+05 R:5.242e+03\n", 657 | "after partial fitting: 19121.955\n", 658 | "J:00192 E:00000 L:1.912e+04 P:-6.267e+05 R:5.421e+03\n", 659 | "after partial fitting: 17995.668\n", 660 | "J:00193 E:00000 L:1.800e+04 P:-6.267e+05 R:5.343e+03\n", 661 | "after partial fitting: 18015.19\n", 662 | "J:00194 E:00000 L:1.802e+04 P:-6.267e+05 R:5.277e+03\n", 663 | "after partial fitting: 18463.668\n", 664 | "J:00195 E:00000 L:1.846e+04 P:-6.267e+05 R:5.252e+03\n", 665 | "after partial fitting: 17576.13\n", 666 | "J:00196 E:00000 L:1.758e+04 P:-6.267e+05 R:5.327e+03\n", 667 | "after partial fitting: 17220.068\n", 668 | "J:00197 E:00000 L:1.722e+04 P:-6.267e+05 R:5.080e+03\n", 669 | "after partial fitting: 18059.652\n", 670 | "J:00198 E:00000 L:1.806e+04 P:-6.267e+05 R:5.574e+03\n", 671 | "after partial fitting: 17178.809\n", 672 | "J:00199 E:00000 L:1.718e+04 P:-6.267e+05 R:5.354e+03\n", 673 | "after partial fitting: 17971.396\n", 674 | "J:00200 E:00000 L:1.797e+04 P:-6.267e+05 R:5.501e+03\n", 675 | "after partial fitting: 17752.889\n", 676 | "J:00201 E:00000 L:1.775e+04 P:-6.267e+05 R:5.717e+03\n", 677 | "after partial fitting: 17587.137\n", 678 | "J:00202 E:00000 L:1.759e+04 P:-6.267e+05 R:5.184e+03\n", 679 | "after partial fitting: 17260.096\n", 680 | "J:00203 E:00000 L:1.726e+04 P:-6.267e+05 R:4.770e+03\n", 681 | "after partial fitting: 17327.832\n", 682 | "J:00204 E:00000 L:1.733e+04 P:-6.267e+05 R:5.954e+03\n", 683 | "after partial fitting: 18475.19\n", 684 | "J:00205 E:00000 L:1.848e+04 P:-6.267e+05 R:5.972e+03\n", 685 | "after partial fitting: 18049.121\n", 686 | "J:00206 E:00000 L:1.805e+04 P:-6.267e+05 R:5.678e+03\n", 687 | "after partial fitting: 17023.158\n", 688 | "J:00207 E:00000 L:1.702e+04 P:-6.267e+05 R:5.507e+03\n", 689 | "after partial fitting: 17343.344\n", 690 | "J:00208 E:00000 L:1.734e+04 P:-6.267e+05 R:5.861e+03\n", 691 | "after partial fitting: 16262.592\n", 692 | "J:00209 E:00000 L:1.626e+04 P:-6.267e+05 R:6.001e+03\n", 693 | "after partial fitting: 16617.215\n", 694 | "J:00210 E:00000 L:1.662e+04 P:-6.267e+05 R:5.201e+03\n", 695 | "after partial fitting: 17078.59\n", 696 | "J:00211 E:00000 L:1.708e+04 P:-6.267e+05 R:5.277e+03\n", 697 | "after partial fitting: 16457.357\n", 698 | "J:00212 E:00000 L:1.646e+04 P:-6.267e+05 R:5.232e+03\n", 699 | "after partial fitting: 15459.547\n", 700 | 
"J:00213 E:00000 L:1.546e+04 P:-6.267e+05 R:5.191e+03\n", 701 | "after partial fitting: 16364.579\n", 702 | "J:00214 E:00000 L:1.636e+04 P:-6.267e+05 R:4.901e+03\n", 703 | "after partial fitting: 16609.305\n", 704 | "J:00215 E:00000 L:1.661e+04 P:-6.267e+05 R:5.278e+03\n", 705 | "after partial fitting: 16419.035\n", 706 | "J:00216 E:00000 L:1.642e+04 P:-6.267e+05 R:5.017e+03\n", 707 | "after partial fitting: 15992.141\n", 708 | "J:00217 E:00000 L:1.599e+04 P:-6.267e+05 R:5.029e+03\n", 709 | "after partial fitting: 16366.637\n", 710 | "J:00218 E:00000 L:1.637e+04 P:-6.267e+05 R:4.978e+03\n", 711 | "after partial fitting: 16428.293\n", 712 | "J:00219 E:00000 L:1.643e+04 P:-6.267e+05 R:5.033e+03\n", 713 | "after partial fitting: 16153.443\n", 714 | "J:00220 E:00000 L:1.615e+04 P:-6.267e+05 R:5.130e+03\n", 715 | "after partial fitting: 15108.082\n", 716 | "J:00221 E:00000 L:1.511e+04 P:-6.267e+05 R:5.411e+03\n", 717 | "after partial fitting: 15839.101\n", 718 | "J:00222 E:00000 L:1.584e+04 P:-6.267e+05 R:5.250e+03\n", 719 | "after partial fitting: 16148.507\n", 720 | "J:00223 E:00000 L:1.615e+04 P:-6.268e+05 R:5.110e+03\n", 721 | "after partial fitting: 15898.63\n", 722 | "J:00224 E:00000 L:1.590e+04 P:-6.268e+05 R:5.200e+03\n", 723 | "after partial fitting: 16240.622\n", 724 | "J:00225 E:00000 L:1.624e+04 P:-6.268e+05 R:5.123e+03\n", 725 | "after partial fitting: 15974.372\n", 726 | "J:00226 E:00000 L:1.597e+04 P:-6.268e+05 R:5.015e+03\n", 727 | "after partial fitting: 15815.311\n", 728 | "J:00227 E:00000 L:1.582e+04 P:-6.268e+05 R:5.025e+03\n", 729 | "after partial fitting: 15577.67\n", 730 | "J:00228 E:00000 L:1.558e+04 P:-6.268e+05 R:5.140e+03\n", 731 | "after partial fitting: 15190.018\n", 732 | "J:00229 E:00000 L:1.519e+04 P:-6.268e+05 R:5.215e+03\n", 733 | "after partial fitting: 15393.206\n", 734 | "J:00230 E:00000 L:1.539e+04 P:-6.268e+05 R:5.399e+03\n", 735 | "after partial fitting: 15632.131\n", 736 | "J:00231 E:00000 L:1.563e+04 P:-6.268e+05 R:5.548e+03\n", 737 | "after partial fitting: 15489.15\n", 738 | "J:00232 E:00000 L:1.549e+04 P:-6.268e+05 R:5.711e+03\n", 739 | "after partial fitting: 15210.231\n", 740 | "J:00233 E:00000 L:1.521e+04 P:-6.268e+05 R:6.133e+03\n", 741 | "after partial fitting: 15511.427\n", 742 | "J:00234 E:00000 L:1.551e+04 P:-6.268e+05 R:5.687e+03\n", 743 | "after partial fitting: 15548.255\n", 744 | "J:00235 E:00000 L:1.555e+04 P:-6.268e+05 R:6.024e+03\n", 745 | "after partial fitting: 14978.25\n", 746 | "J:00236 E:00000 L:1.498e+04 P:-6.268e+05 R:5.734e+03\n", 747 | "after partial fitting: 15332.8955\n", 748 | "J:00237 E:00000 L:1.533e+04 P:-6.268e+05 R:6.029e+03\n", 749 | "after partial fitting: 15407.283\n", 750 | "J:00238 E:00000 L:1.541e+04 P:-6.268e+05 R:5.623e+03\n", 751 | "after partial fitting: 15660.328\n", 752 | "J:00239 E:00000 L:1.566e+04 P:-6.268e+05 R:5.024e+03\n", 753 | "after partial fitting: 15221.151\n", 754 | "J:00240 E:00000 L:1.522e+04 P:-6.268e+05 R:5.622e+03\n", 755 | "after partial fitting: 15117.535\n", 756 | "J:00241 E:00000 L:1.512e+04 P:-6.268e+05 R:4.781e+03\n", 757 | "after partial fitting: 15358.136\n", 758 | "J:00242 E:00000 L:1.536e+04 P:-6.268e+05 R:5.170e+03\n", 759 | "after partial fitting: 15403.156\n", 760 | "J:00243 E:00000 L:1.540e+04 P:-6.269e+05 R:5.762e+03\n", 761 | "after partial fitting: 15151.9\n", 762 | "J:00244 E:00000 L:1.515e+04 P:-6.269e+05 R:5.827e+03\n", 763 | "after partial fitting: 15325.089\n", 764 | "J:00245 E:00000 L:1.533e+04 P:-6.269e+05 R:5.751e+03\n", 765 | "after partial fitting: 14827.129\n", 
766 | "J:00246 E:00000 L:1.483e+04 P:-6.269e+05 R:5.825e+03\n", 767 | "after partial fitting: 14815.379\n", 768 | "J:00247 E:00000 L:1.482e+04 P:-6.269e+05 R:5.806e+03\n", 769 | "after partial fitting: 15042.434\n", 770 | "J:00248 E:00000 L:1.504e+04 P:-6.269e+05 R:5.833e+03\n", 771 | "after partial fitting: 15511.191\n", 772 | "J:00249 E:00000 L:1.551e+04 P:-6.269e+05 R:5.784e+03\n", 773 | "after partial fitting: 14819.643\n", 774 | "J:00250 E:00000 L:1.482e+04 P:-6.269e+05 R:5.798e+03\n", 775 | "after partial fitting: 14744.163\n", 776 | "J:00251 E:00000 L:1.474e+04 P:-6.269e+05 R:5.821e+03\n", 777 | "after partial fitting: 15016.177\n", 778 | "J:00252 E:00000 L:1.502e+04 P:-6.269e+05 R:5.816e+03\n", 779 | "after partial fitting: 14533.602\n", 780 | "J:00253 E:00000 L:1.453e+04 P:-6.270e+05 R:5.884e+03\n", 781 | "after partial fitting: 14643.593\n", 782 | "J:00254 E:00000 L:1.464e+04 P:-6.270e+05 R:5.749e+03\n", 783 | "after partial fitting: 15308.663\n", 784 | "J:00255 E:00000 L:1.531e+04 P:-6.270e+05 R:5.765e+03\n", 785 | "after partial fitting: 15204.26\n", 786 | "J:00256 E:00000 L:1.520e+04 P:-6.270e+05 R:5.781e+03\n", 787 | "after partial fitting: 14733.767\n", 788 | "J:00257 E:00000 L:1.473e+04 P:-6.270e+05 R:5.793e+03\n", 789 | "after partial fitting: 15499.161\n", 790 | "J:00258 E:00000 L:1.550e+04 P:-6.270e+05 R:5.198e+03\n", 791 | "after partial fitting: 14766.961\n", 792 | "J:00259 E:00000 L:1.477e+04 P:-6.270e+05 R:5.726e+03\n", 793 | "after partial fitting: 15104.436\n", 794 | "J:00260 E:00000 L:1.510e+04 P:-6.271e+05 R:5.907e+03\n", 795 | "after partial fitting: 14658.041\n", 796 | "J:00261 E:00000 L:1.466e+04 P:-6.271e+05 R:5.946e+03\n", 797 | "after partial fitting: 14428.885\n", 798 | "J:00262 E:00000 L:1.443e+04 P:-6.271e+05 R:5.165e+03\n", 799 | "after partial fitting: 14984.813\n", 800 | "J:00263 E:00000 L:1.498e+04 P:-6.271e+05 R:5.528e+03\n", 801 | "after partial fitting: 14678.289\n", 802 | "J:00264 E:00000 L:1.468e+04 P:-6.271e+05 R:5.668e+03\n", 803 | "after partial fitting: 15225.107\n", 804 | "J:00265 E:00000 L:1.523e+04 P:-6.271e+05 R:5.636e+03\n", 805 | "after partial fitting: 14805.553\n", 806 | "J:00266 E:00000 L:1.481e+04 P:-6.272e+05 R:5.392e+03\n", 807 | "after partial fitting: 14566.131\n", 808 | "J:00267 E:00000 L:1.457e+04 P:-6.272e+05 R:5.155e+03\n", 809 | "after partial fitting: 14621.453\n", 810 | "J:00268 E:00000 L:1.462e+04 P:-6.272e+05 R:5.358e+03\n", 811 | "after partial fitting: 15119.041\n", 812 | "J:00269 E:00000 L:1.512e+04 P:-6.272e+05 R:5.326e+03\n", 813 | "after partial fitting: 14595.709\n", 814 | "J:00270 E:00000 L:1.460e+04 P:-6.273e+05 R:5.749e+03\n", 815 | "after partial fitting: 14660.914\n", 816 | "J:00271 E:00000 L:1.466e+04 P:-6.273e+05 R:5.148e+03\n", 817 | "after partial fitting: 15116.81\n", 818 | "J:00272 E:00000 L:1.512e+04 P:-6.273e+05 R:5.643e+03\n", 819 | "after partial fitting: 14525.841\n", 820 | "J:00273 E:00000 L:1.453e+04 P:-6.273e+05 R:5.481e+03\n", 821 | "after partial fitting: 14896.59\n", 822 | "J:00274 E:00000 L:1.490e+04 P:-6.274e+05 R:5.442e+03\n", 823 | "after partial fitting: 14540.08\n", 824 | "J:00275 E:00000 L:1.454e+04 P:-6.274e+05 R:6.094e+03\n", 825 | "after partial fitting: 14998.677\n", 826 | "J:00276 E:00000 L:1.500e+04 P:-6.274e+05 R:5.728e+03\n", 827 | "after partial fitting: 14743.502\n", 828 | "J:00277 E:00000 L:1.474e+04 P:-6.274e+05 R:5.765e+03\n", 829 | "after partial fitting: 14476.426\n", 830 | "J:00278 E:00000 L:1.448e+04 P:-6.275e+05 R:5.630e+03\n", 831 | "after partial fitting: 
14808.815\n", 832 | "J:00279 E:00000 L:1.481e+04 P:-6.275e+05 R:5.127e+03\n", 833 | "after partial fitting: 15177.396\n", 834 | "J:00280 E:00000 L:1.518e+04 P:-6.275e+05 R:5.551e+03\n", 835 | "after partial fitting: 14562.357\n", 836 | "J:00281 E:00000 L:1.456e+04 P:-6.275e+05 R:5.510e+03\n", 837 | "after partial fitting: 14613.694\n", 838 | "J:00282 E:00000 L:1.461e+04 P:-6.276e+05 R:5.650e+03\n", 839 | "after partial fitting: 17188.234\n", 840 | "J:00283 E:00000 L:1.719e+04 P:-6.276e+05 R:5.751e+03\n", 841 | "after partial fitting: 15848.637\n", 842 | "J:00284 E:00000 L:1.585e+04 P:-6.276e+05 R:5.326e+03\n", 843 | "after partial fitting: 14628.442\n", 844 | "J:00285 E:00000 L:1.463e+04 P:-6.276e+05 R:5.874e+03\n", 845 | "after partial fitting: 14513.595\n", 846 | "J:00286 E:00000 L:1.451e+04 P:-6.277e+05 R:5.529e+03\n", 847 | "after partial fitting: 14446.684\n", 848 | "J:00287 E:00000 L:1.445e+04 P:-6.277e+05 R:6.113e+03\n", 849 | "after partial fitting: 14198.85\n", 850 | "J:00288 E:00000 L:1.420e+04 P:-6.277e+05 R:6.108e+03\n", 851 | "after partial fitting: 14470.988\n", 852 | "J:00289 E:00000 L:1.447e+04 P:-6.278e+05 R:5.689e+03\n", 853 | "after partial fitting: 14844.4\n", 854 | "J:00290 E:00000 L:1.484e+04 P:-6.278e+05 R:5.490e+03\n", 855 | "after partial fitting: 14505.133\n", 856 | "J:00291 E:00000 L:1.451e+04 P:-6.278e+05 R:5.616e+03\n", 857 | "after partial fitting: 15538.74\n", 858 | "J:00292 E:00000 L:1.554e+04 P:-6.278e+05 R:5.156e+03\n", 859 | "after partial fitting: 14582.016\n", 860 | "J:00293 E:00000 L:1.458e+04 P:-6.279e+05 R:5.860e+03\n", 861 | "after partial fitting: 14257.053\n", 862 | "J:00294 E:00000 L:1.426e+04 P:-6.279e+05 R:5.861e+03\n", 863 | "after partial fitting: 14271.592\n", 864 | "J:00295 E:00000 L:1.427e+04 P:-6.279e+05 R:5.740e+03\n", 865 | "after partial fitting: 14750.109\n", 866 | "J:00296 E:00000 L:1.475e+04 P:-6.280e+05 R:5.819e+03\n", 867 | "after partial fitting: 14552.396\n", 868 | "J:00297 E:00000 L:1.455e+04 P:-6.280e+05 R:5.811e+03\n", 869 | "after partial fitting: 14646.393\n", 870 | "J:00298 E:00000 L:1.465e+04 P:-6.280e+05 R:5.818e+03\n", 871 | "after partial fitting: 14718.272\n", 872 | "J:00299 E:00000 L:1.472e+04 P:-6.281e+05 R:5.916e+03\n", 873 | "after partial fitting: 14788.186\n", 874 | "J:00300 E:00000 L:1.479e+04 P:-6.281e+05 R:5.835e+03\n", 875 | "after partial fitting: 14407.2295\n", 876 | "J:00301 E:00000 L:1.441e+04 P:-6.281e+05 R:5.641e+03\n", 877 | "after partial fitting: 14949.893\n", 878 | "J:00302 E:00000 L:1.495e+04 P:-6.282e+05 R:5.548e+03\n", 879 | "after partial fitting: 14651.773\n", 880 | "J:00303 E:00000 L:1.465e+04 P:-6.282e+05 R:5.378e+03\n", 881 | "after partial fitting: 15536.415\n", 882 | "J:00304 E:00000 L:1.554e+04 P:-6.282e+05 R:5.713e+03\n", 883 | "after partial fitting: 14580.162\n", 884 | "J:00305 E:00000 L:1.458e+04 P:-6.283e+05 R:5.849e+03\n", 885 | "after partial fitting: 14813.968\n", 886 | "J:00306 E:00000 L:1.481e+04 P:-6.283e+05 R:5.701e+03\n", 887 | "after partial fitting: 14664.225\n", 888 | "J:00307 E:00000 L:1.466e+04 P:-6.283e+05 R:6.024e+03\n", 889 | "after partial fitting: 14385.124\n", 890 | "J:00308 E:00000 L:1.439e+04 P:-6.284e+05 R:5.570e+03\n", 891 | "after partial fitting: 16108.602\n", 892 | "J:00309 E:00000 L:1.611e+04 P:-6.284e+05 R:5.612e+03\n", 893 | "after partial fitting: 14166.184\n", 894 | "J:00310 E:00000 L:1.417e+04 P:-6.285e+05 R:5.694e+03\n", 895 | "after partial fitting: 14035.904\n", 896 | "J:00311 E:00000 L:1.404e+04 P:-6.285e+05 R:5.772e+03\n", 897 | "after partial 
fitting: 14465.242\n", 898 | "J:00312 E:00000 L:1.447e+04 P:-6.285e+05 R:5.794e+03\n", 899 | "after partial fitting: 14710.816\n", 900 | "J:00313 E:00000 L:1.471e+04 P:-6.286e+05 R:5.773e+03\n", 901 | "after partial fitting: 14611.902\n", 902 | "J:00314 E:00000 L:1.461e+04 P:-6.286e+05 R:5.723e+03\n", 903 | "after partial fitting: 14772.826\n", 904 | "J:00315 E:00000 L:1.477e+04 P:-6.286e+05 R:5.807e+03\n", 905 | "after partial fitting: 14725.696\n", 906 | "J:00316 E:00000 L:1.473e+04 P:-6.287e+05 R:5.908e+03\n", 907 | "after partial fitting: 14312.758\n", 908 | "J:00317 E:00000 L:1.431e+04 P:-6.287e+05 R:5.602e+03\n", 909 | "after partial fitting: 16062.393\n", 910 | "J:00318 E:00000 L:1.606e+04 P:-6.287e+05 R:5.779e+03\n", 911 | "after partial fitting: 14631.928\n", 912 | "J:00319 E:00000 L:1.463e+04 P:-6.288e+05 R:5.794e+03\n", 913 | "after partial fitting: 14921.791\n", 914 | "J:00320 E:00000 L:1.492e+04 P:-6.288e+05 R:5.858e+03\n", 915 | "after partial fitting: 14663.575\n", 916 | "J:00321 E:00000 L:1.466e+04 P:-6.289e+05 R:5.889e+03\n", 917 | "after partial fitting: 14448.781\n", 918 | "J:00322 E:00000 L:1.445e+04 P:-6.289e+05 R:5.630e+03\n", 919 | "after partial fitting: 15242.897\n", 920 | "J:00323 E:00000 L:1.524e+04 P:-6.290e+05 R:5.426e+03\n", 921 | "after partial fitting: 14516.9375\n", 922 | "J:00324 E:00000 L:1.452e+04 P:-6.290e+05 R:5.954e+03\n", 923 | "after partial fitting: 14548.303\n", 924 | "J:00325 E:00000 L:1.455e+04 P:-6.290e+05 R:5.958e+03\n", 925 | "after partial fitting: 15684.223\n", 926 | "J:00326 E:00000 L:1.568e+04 P:-6.291e+05 R:5.931e+03\n", 927 | "after partial fitting: 15556.242\n", 928 | "J:00327 E:00000 L:1.556e+04 P:-6.291e+05 R:5.931e+03\n", 929 | "after partial fitting: 12954.486\n", 930 | "J:00328 E:00000 L:1.295e+04 P:-6.292e+05 R:5.475e+03\n", 931 | "after partial fitting: 14847.99\n", 932 | "J:00329 E:00000 L:1.485e+04 P:-6.292e+05 R:5.816e+03\n", 933 | "after partial fitting: 14558.787\n", 934 | "J:00330 E:00000 L:1.456e+04 P:-6.292e+05 R:5.987e+03\n", 935 | "after partial fitting: 14741.229\n", 936 | "J:00331 E:00000 L:1.474e+04 P:-6.293e+05 R:6.007e+03\n", 937 | "after partial fitting: 14486.702\n", 938 | "J:00332 E:00000 L:1.449e+04 P:-6.293e+05 R:6.031e+03\n", 939 | "after partial fitting: 14335.401\n", 940 | "J:00333 E:00000 L:1.434e+04 P:-6.294e+05 R:5.435e+03\n", 941 | "after partial fitting: 14669.0\n", 942 | "J:00334 E:00000 L:1.467e+04 P:-6.294e+05 R:5.968e+03\n", 943 | "after partial fitting: 14545.955\n", 944 | "J:00335 E:00000 L:1.455e+04 P:-6.295e+05 R:6.064e+03\n", 945 | "after partial fitting: 13692.357\n", 946 | "J:00336 E:00000 L:1.369e+04 P:-6.295e+05 R:6.072e+03\n", 947 | "after partial fitting: 14744.129\n", 948 | "J:00337 E:00000 L:1.474e+04 P:-6.295e+05 R:6.014e+03\n", 949 | "after partial fitting: 14489.162\n", 950 | "J:00338 E:00000 L:1.449e+04 P:-6.296e+05 R:5.963e+03\n", 951 | "after partial fitting: 15319.191\n", 952 | "J:00339 E:00000 L:1.532e+04 P:-6.296e+05 R:5.996e+03\n", 953 | "after partial fitting: 15349.924\n", 954 | "J:00340 E:00000 L:1.535e+04 P:-6.297e+05 R:6.040e+03\n", 955 | "after partial fitting: 14864.773\n", 956 | "J:00341 E:00000 L:1.486e+04 P:-6.297e+05 R:6.082e+03\n", 957 | "after partial fitting: 15899.033\n", 958 | "J:00342 E:00000 L:1.590e+04 P:-6.297e+05 R:5.982e+03\n", 959 | "after partial fitting: 14391.564\n", 960 | "J:00343 E:00000 L:1.439e+04 P:-6.298e+05 R:6.083e+03\n", 961 | "after partial fitting: 15263.919\n", 962 | "J:00344 E:00000 L:1.526e+04 P:-6.298e+05 R:6.004e+03\n", 963 | "after 
partial fitting: 14539.586\n", 964 | "J:00345 E:00000 L:1.454e+04 P:-6.299e+05 R:6.075e+03\n", 965 | "after partial fitting: 14860.878\n", 966 | "J:00346 E:00000 L:1.486e+04 P:-6.299e+05 R:6.017e+03\n", 967 | "after partial fitting: 16857.904\n", 968 | "J:00347 E:00000 L:1.686e+04 P:-6.299e+05 R:6.067e+03\n", 969 | "after partial fitting: 14713.729\n", 970 | "J:00348 E:00000 L:1.471e+04 P:-6.300e+05 R:6.077e+03\n", 971 | "after partial fitting: 14849.369\n", 972 | "J:00349 E:00000 L:1.485e+04 P:-6.300e+05 R:5.964e+03\n", 973 | "after partial fitting: 14556.814\n", 974 | "J:00350 E:00000 L:1.456e+04 P:-6.301e+05 R:6.071e+03\n", 975 | "after partial fitting: 15130.792\n", 976 | "J:00351 E:00000 L:1.513e+04 P:-6.301e+05 R:6.030e+03\n", 977 | "after partial fitting: 14156.36\n", 978 | "J:00352 E:00000 L:1.416e+04 P:-6.301e+05 R:6.109e+03\n", 979 | "after partial fitting: 14585.213\n", 980 | "J:00353 E:00000 L:1.459e+04 P:-6.302e+05 R:6.036e+03\n", 981 | "after partial fitting: 14655.086\n", 982 | "J:00354 E:00000 L:1.466e+04 P:-6.302e+05 R:6.042e+03\n", 983 | "after partial fitting: 14604.042\n", 984 | "J:00355 E:00000 L:1.460e+04 P:-6.303e+05 R:6.104e+03\n", 985 | "after partial fitting: 14931.332\n", 986 | "J:00356 E:00000 L:1.493e+04 P:-6.303e+05 R:6.036e+03\n", 987 | "after partial fitting: 14726.496\n", 988 | "J:00357 E:00000 L:1.473e+04 P:-6.304e+05 R:6.038e+03\n", 989 | "after partial fitting: 14970.17\n", 990 | "J:00358 E:00000 L:1.497e+04 P:-6.304e+05 R:6.066e+03\n", 991 | "after partial fitting: 15602.671\n", 992 | "J:00359 E:00000 L:1.560e+04 P:-6.305e+05 R:5.973e+03\n", 993 | "after partial fitting: 14683.339\n", 994 | "J:00360 E:00000 L:1.468e+04 P:-6.305e+05 R:6.082e+03\n", 995 | "after partial fitting: 14489.512\n", 996 | "J:00361 E:00000 L:1.449e+04 P:-6.305e+05 R:6.080e+03\n", 997 | "after partial fitting: 14815.252\n", 998 | "J:00362 E:00000 L:1.482e+04 P:-6.306e+05 R:6.078e+03\n", 999 | "after partial fitting: 14727.34\n", 1000 | "J:00363 E:00000 L:1.473e+04 P:-6.306e+05 R:6.050e+03\n", 1001 | "after partial fitting: 14432.391\n", 1002 | "J:00364 E:00000 L:1.443e+04 P:-6.307e+05 R:6.078e+03\n", 1003 | "after partial fitting: 14706.677\n", 1004 | "J:00365 E:00000 L:1.471e+04 P:-6.307e+05 R:6.040e+03\n", 1005 | "after partial fitting: 14421.685\n", 1006 | "J:00366 E:00000 L:1.442e+04 P:-6.308e+05 R:6.070e+03\n", 1007 | "after partial fitting: 15381.402\n", 1008 | "J:00367 E:00000 L:1.538e+04 P:-6.308e+05 R:5.917e+03\n", 1009 | "after partial fitting: 14509.718\n", 1010 | "J:00368 E:00000 L:1.451e+04 P:-6.309e+05 R:6.083e+03\n", 1011 | "after partial fitting: 14324.279\n", 1012 | "J:00369 E:00000 L:1.432e+04 P:-6.309e+05 R:5.524e+03\n", 1013 | "after partial fitting: 15178.109\n", 1014 | "J:00370 E:00000 L:1.518e+04 P:-6.310e+05 R:5.098e+03\n", 1015 | "after partial fitting: 14355.745\n", 1016 | "J:00371 E:00000 L:1.436e+04 P:-6.310e+05 R:5.723e+03\n", 1017 | "after partial fitting: 14463.208\n", 1018 | "J:00372 E:00000 L:1.446e+04 P:-6.311e+05 R:5.402e+03\n", 1019 | "after partial fitting: 15461.666\n", 1020 | "J:00373 E:00000 L:1.546e+04 P:-6.311e+05 R:5.723e+03\n", 1021 | "after partial fitting: 15355.652\n", 1022 | "J:00374 E:00000 L:1.536e+04 P:-6.312e+05 R:6.055e+03\n", 1023 | "after partial fitting: 14635.645\n", 1024 | "J:00375 E:00000 L:1.464e+04 P:-6.312e+05 R:6.108e+03\n", 1025 | "after partial fitting: 14594.617\n", 1026 | "J:00376 E:00000 L:1.459e+04 P:-6.313e+05 R:6.129e+03\n", 1027 | "after partial fitting: 14975.333\n", 1028 | "J:00377 E:00000 L:1.498e+04 
P:-6.313e+05 R:6.018e+03\n", 1029 | "after partial fitting: 15256.993\n", 1030 | "J:00378 E:00000 L:1.526e+04 P:-6.314e+05 R:5.995e+03\n", 1031 | "after partial fitting: 14293.412\n", 1032 | "J:00379 E:00000 L:1.429e+04 P:-6.314e+05 R:5.984e+03\n", 1033 | "after partial fitting: 14688.223\n", 1034 | "J:00380 E:00000 L:1.469e+04 P:-6.315e+05 R:5.568e+03\n", 1035 | "after partial fitting: 14864.831\n", 1036 | "J:00381 E:00000 L:1.486e+04 P:-6.315e+05 R:5.273e+03\n", 1037 | "after partial fitting: 14482.523\n", 1038 | "J:00382 E:00000 L:1.448e+04 P:-6.316e+05 R:5.607e+03\n", 1039 | "after partial fitting: 14615.026\n", 1040 | "J:00383 E:00000 L:1.462e+04 P:-6.316e+05 R:5.597e+03\n", 1041 | "after partial fitting: 14640.195\n", 1042 | "J:00384 E:00000 L:1.464e+04 P:-6.317e+05 R:5.786e+03\n", 1043 | "after partial fitting: 14482.943\n", 1044 | "J:00385 E:00000 L:1.448e+04 P:-6.317e+05 R:5.912e+03\n", 1045 | "after partial fitting: 14468.982\n", 1046 | "J:00386 E:00000 L:1.447e+04 P:-6.318e+05 R:4.466e+03\n", 1047 | "after partial fitting: 14245.359\n", 1048 | "J:00387 E:00000 L:1.425e+04 P:-6.318e+05 R:4.640e+03\n", 1049 | "after partial fitting: 15537.297\n", 1050 | "J:00388 E:00000 L:1.554e+04 P:-6.319e+05 R:5.830e+03\n", 1051 | "after partial fitting: 13345.988\n", 1052 | "J:00389 E:00000 L:1.335e+04 P:-6.319e+05 R:5.994e+03\n", 1053 | "after partial fitting: 14001.105\n", 1054 | "J:00390 E:00000 L:1.400e+04 P:-6.320e+05 R:5.940e+03\n", 1055 | "after partial fitting: 14789.098\n", 1056 | "J:00391 E:00000 L:1.479e+04 P:-6.320e+05 R:5.954e+03\n", 1057 | "after partial fitting: 7241.8867\n", 1058 | "J:00392 E:00000 L:7.242e+03 P:-6.321e+05 R:1.046e+04\n", 1059 | "after partial fitting: 14878.034\n", 1060 | "J:00393 E:00000 L:1.488e+04 P:-6.321e+05 R:5.930e+03\n", 1061 | "after partial fitting: 14486.625\n", 1062 | "J:00394 E:00000 L:1.449e+04 P:-6.322e+05 R:5.781e+03\n", 1063 | "after partial fitting: 14453.366\n", 1064 | "J:00395 E:00000 L:1.445e+04 P:-6.322e+05 R:5.867e+03\n", 1065 | "after partial fitting: 14207.471\n", 1066 | "J:00396 E:00000 L:1.421e+04 P:-6.323e+05 R:5.970e+03\n", 1067 | "after partial fitting: 14913.216\n", 1068 | "J:00397 E:00000 L:1.491e+04 P:-6.323e+05 R:5.863e+03\n", 1069 | "after partial fitting: 14606.648\n", 1070 | "J:00398 E:00000 L:1.461e+04 P:-6.324e+05 R:5.862e+03\n", 1071 | "after partial fitting: 14520.863\n", 1072 | "J:00399 E:00000 L:1.452e+04 P:-6.324e+05 R:5.967e+03\n", 1073 | "after partial fitting: 15069.138\n", 1074 | "J:00400 E:00000 L:1.507e+04 P:-6.325e+05 R:5.985e+03\n", 1075 | "after partial fitting: 14020.991\n", 1076 | "J:00401 E:00000 L:1.402e+04 P:-6.325e+05 R:6.088e+03\n", 1077 | "after partial fitting: 14512.34\n", 1078 | "J:00402 E:00000 L:1.451e+04 P:-6.326e+05 R:5.994e+03\n", 1079 | "after partial fitting: 14999.486\n", 1080 | "J:00403 E:00000 L:1.500e+04 P:-6.326e+05 R:5.937e+03\n", 1081 | "after partial fitting: 14546.295\n", 1082 | "J:00404 E:00000 L:1.455e+04 P:-6.327e+05 R:5.861e+03\n", 1083 | "after partial fitting: 14527.749\n", 1084 | "J:00405 E:00000 L:1.453e+04 P:-6.327e+05 R:5.838e+03\n", 1085 | "after partial fitting: 14727.026\n", 1086 | "J:00406 E:00000 L:1.473e+04 P:-6.327e+05 R:5.785e+03\n", 1087 | "after partial fitting: 14214.577\n", 1088 | "J:00407 E:00000 L:1.421e+04 P:-6.328e+05 R:5.764e+03\n", 1089 | "after partial fitting: 14713.629\n", 1090 | "J:00408 E:00000 L:1.471e+04 P:-6.328e+05 R:5.814e+03\n", 1091 | "after partial fitting: 15407.915\n", 1092 | "J:00409 E:00000 L:1.541e+04 P:-6.329e+05 R:5.806e+03\n", 1093 | 
"after partial fitting: 14397.699\n", 1094 | "J:00410 E:00000 L:1.440e+04 P:-6.329e+05 R:5.798e+03\n", 1095 | "after partial fitting: 14702.228\n", 1096 | "J:00411 E:00000 L:1.470e+04 P:-6.330e+05 R:5.733e+03\n", 1097 | "after partial fitting: 13619.862\n", 1098 | "J:00412 E:00000 L:1.362e+04 P:-6.330e+05 R:5.873e+03\n", 1099 | "after partial fitting: 14371.762\n", 1100 | "J:00413 E:00000 L:1.437e+04 P:-6.330e+05 R:5.946e+03\n", 1101 | "after partial fitting: 14834.496\n", 1102 | "J:00414 E:00000 L:1.483e+04 P:-6.331e+05 R:5.771e+03\n", 1103 | "after partial fitting: 15064.877\n", 1104 | "J:00415 E:00000 L:1.506e+04 P:-6.331e+05 R:5.900e+03\n", 1105 | "after partial fitting: 15450.629\n", 1106 | "J:00416 E:00000 L:1.545e+04 P:-6.332e+05 R:5.931e+03\n", 1107 | "after partial fitting: 14237.912\n", 1108 | "J:00417 E:00000 L:1.424e+04 P:-6.332e+05 R:5.902e+03\n", 1109 | "after partial fitting: 15287.123\n", 1110 | "J:00418 E:00000 L:1.529e+04 P:-6.332e+05 R:5.943e+03\n", 1111 | "after partial fitting: 15018.633\n", 1112 | "J:00419 E:00000 L:1.502e+04 P:-6.333e+05 R:5.851e+03\n", 1113 | "after partial fitting: 14213.513\n", 1114 | "J:00420 E:00000 L:1.421e+04 P:-6.333e+05 R:5.959e+03\n", 1115 | "after partial fitting: 14366.404\n", 1116 | "J:00421 E:00000 L:1.437e+04 P:-6.334e+05 R:5.845e+03\n", 1117 | "after partial fitting: 14147.428\n", 1118 | "J:00422 E:00000 L:1.415e+04 P:-6.334e+05 R:5.648e+03\n", 1119 | "after partial fitting: 14529.006\n", 1120 | "J:00423 E:00000 L:1.453e+04 P:-6.334e+05 R:5.945e+03\n", 1121 | "after partial fitting: 14726.794\n", 1122 | "J:00424 E:00000 L:1.473e+04 P:-6.335e+05 R:5.790e+03\n", 1123 | "after partial fitting: 14704.387\n", 1124 | "J:00425 E:00000 L:1.470e+04 P:-6.335e+05 R:5.833e+03\n", 1125 | "after partial fitting: 14794.7\n", 1126 | "J:00426 E:00000 L:1.479e+04 P:-6.336e+05 R:5.974e+03\n", 1127 | "after partial fitting: 14855.431\n", 1128 | "J:00427 E:00000 L:1.486e+04 P:-6.336e+05 R:5.981e+03\n", 1129 | "after partial fitting: 14757.901\n", 1130 | "J:00428 E:00000 L:1.476e+04 P:-6.336e+05 R:5.773e+03\n", 1131 | "after partial fitting: 15629.122\n", 1132 | "J:00429 E:00000 L:1.563e+04 P:-6.337e+05 R:5.074e+03\n", 1133 | "after partial fitting: 14570.609\n", 1134 | "J:00430 E:00000 L:1.457e+04 P:-6.337e+05 R:6.071e+03\n", 1135 | "after partial fitting: 14573.698\n", 1136 | "J:00431 E:00000 L:1.457e+04 P:-6.338e+05 R:5.308e+03\n", 1137 | "after partial fitting: 14640.74\n", 1138 | "J:00432 E:00000 L:1.464e+04 P:-6.338e+05 R:4.669e+03\n", 1139 | "after partial fitting: 13987.582\n", 1140 | "J:00433 E:00000 L:1.399e+04 P:-6.339e+05 R:5.732e+03\n", 1141 | "after partial fitting: 14344.887\n", 1142 | "J:00434 E:00000 L:1.434e+04 P:-6.339e+05 R:5.636e+03\n", 1143 | "after partial fitting: 14234.611\n", 1144 | "J:00435 E:00000 L:1.423e+04 P:-6.339e+05 R:6.034e+03\n", 1145 | "after partial fitting: 14910.684\n", 1146 | "J:00436 E:00000 L:1.491e+04 P:-6.340e+05 R:5.985e+03\n", 1147 | "after partial fitting: 14540.426\n", 1148 | "J:00437 E:00000 L:1.454e+04 P:-6.340e+05 R:6.120e+03\n", 1149 | "after partial fitting: 15498.494\n", 1150 | "J:00438 E:00000 L:1.550e+04 P:-6.341e+05 R:5.903e+03\n", 1151 | "after partial fitting: 14287.078\n", 1152 | "J:00439 E:00000 L:1.429e+04 P:-6.341e+05 R:6.118e+03\n", 1153 | "after partial fitting: 15466.504\n", 1154 | "J:00440 E:00000 L:1.547e+04 P:-6.341e+05 R:6.042e+03\n", 1155 | "after partial fitting: 14994.009\n", 1156 | "J:00441 E:00000 L:1.499e+04 P:-6.342e+05 R:5.957e+03\n", 1157 | "after partial fitting: 14992.865\n", 
1158 | "J:00442 E:00000 L:1.499e+04 P:-6.342e+05 R:6.076e+03\n", 1159 | "after partial fitting: 13770.967\n", 1160 | "J:00443 E:00000 L:1.377e+04 P:-6.343e+05 R:6.147e+03\n", 1161 | "after partial fitting: 14730.11\n", 1162 | "J:00444 E:00000 L:1.473e+04 P:-6.343e+05 R:5.968e+03\n", 1163 | "after partial fitting: 14343.22\n", 1164 | "J:00445 E:00000 L:1.434e+04 P:-6.343e+05 R:6.099e+03\n", 1165 | "after partial fitting: 14472.494\n", 1166 | "J:00446 E:00000 L:1.447e+04 P:-6.344e+05 R:5.713e+03\n", 1167 | "after partial fitting: 15321.039\n", 1168 | "J:00447 E:00000 L:1.532e+04 P:-6.344e+05 R:5.249e+03\n", 1169 | "after partial fitting: 14637.301\n", 1170 | "J:00448 E:00000 L:1.464e+04 P:-6.344e+05 R:5.342e+03\n", 1171 | "after partial fitting: 14501.582\n", 1172 | "J:00449 E:00000 L:1.450e+04 P:-6.345e+05 R:6.040e+03\n", 1173 | "after partial fitting: 14394.538\n", 1174 | "J:00450 E:00000 L:1.439e+04 P:-6.345e+05 R:6.068e+03\n", 1175 | "after partial fitting: 14712.55\n", 1176 | "J:00451 E:00000 L:1.471e+04 P:-6.345e+05 R:6.008e+03\n", 1177 | "after partial fitting: 14628.376\n", 1178 | "J:00452 E:00000 L:1.463e+04 P:-6.346e+05 R:5.957e+03\n", 1179 | "after partial fitting: 14567.139\n", 1180 | "J:00453 E:00000 L:1.457e+04 P:-6.346e+05 R:6.052e+03\n", 1181 | "after partial fitting: 14481.777\n", 1182 | "J:00454 E:00000 L:1.448e+04 P:-6.347e+05 R:6.018e+03\n", 1183 | "after partial fitting: 14603.255\n", 1184 | "J:00455 E:00000 L:1.460e+04 P:-6.347e+05 R:5.983e+03\n", 1185 | "after partial fitting: 15486.94\n", 1186 | "J:00456 E:00000 L:1.549e+04 P:-6.347e+05 R:5.950e+03\n", 1187 | "after partial fitting: 14729.221\n", 1188 | "J:00457 E:00000 L:1.473e+04 P:-6.348e+05 R:6.001e+03\n", 1189 | "after partial fitting: 14569.208\n", 1190 | "J:00458 E:00000 L:1.457e+04 P:-6.348e+05 R:6.031e+03\n", 1191 | "after partial fitting: 14535.811\n", 1192 | "J:00459 E:00000 L:1.454e+04 P:-6.348e+05 R:5.998e+03\n", 1193 | "after partial fitting: 14438.721\n", 1194 | "J:00460 E:00000 L:1.444e+04 P:-6.349e+05 R:6.053e+03\n", 1195 | "after partial fitting: 14511.928\n", 1196 | "J:00461 E:00000 L:1.451e+04 P:-6.349e+05 R:6.066e+03\n", 1197 | "after partial fitting: 14477.723\n", 1198 | "J:00462 E:00000 L:1.448e+04 P:-6.350e+05 R:6.078e+03\n", 1199 | "after partial fitting: 14950.235\n", 1200 | "J:00463 E:00000 L:1.495e+04 P:-6.350e+05 R:5.917e+03\n", 1201 | "after partial fitting: 14582.895\n", 1202 | "J:00464 E:00000 L:1.458e+04 P:-6.350e+05 R:6.031e+03\n", 1203 | "after partial fitting: 14729.203\n", 1204 | "J:00465 E:00000 L:1.473e+04 P:-6.351e+05 R:6.025e+03\n", 1205 | "after partial fitting: 14402.289\n", 1206 | "J:00466 E:00000 L:1.440e+04 P:-6.351e+05 R:6.004e+03\n", 1207 | "after partial fitting: 14550.326\n", 1208 | "J:00467 E:00000 L:1.455e+04 P:-6.351e+05 R:5.950e+03\n", 1209 | "after partial fitting: 15661.102\n", 1210 | "J:00468 E:00000 L:1.566e+04 P:-6.352e+05 R:5.940e+03\n", 1211 | "after partial fitting: 14441.411\n", 1212 | "J:00469 E:00000 L:1.444e+04 P:-6.352e+05 R:6.002e+03\n", 1213 | "after partial fitting: 14841.039\n", 1214 | "J:00470 E:00000 L:1.484e+04 P:-6.352e+05 R:6.044e+03\n", 1215 | "after partial fitting: 14554.411\n", 1216 | "J:00471 E:00000 L:1.455e+04 P:-6.353e+05 R:5.950e+03\n", 1217 | "after partial fitting: 14392.907\n", 1218 | "J:00472 E:00000 L:1.439e+04 P:-6.353e+05 R:6.064e+03\n", 1219 | "after partial fitting: 14705.483\n", 1220 | "J:00473 E:00000 L:1.471e+04 P:-6.354e+05 R:5.972e+03\n", 1221 | "after partial fitting: 14510.408\n", 1222 | "J:00474 E:00000 L:1.451e+04 
P:-6.354e+05 R:6.076e+03\n", 1223 | "after partial fitting: 14634.635\n", 1224 | "J:00475 E:00000 L:1.463e+04 P:-6.354e+05 R:6.041e+03\n", 1225 | "after partial fitting: 13660.078\n", 1226 | "J:00476 E:00000 L:1.366e+04 P:-6.355e+05 R:6.010e+03\n", 1227 | "after partial fitting: 14219.305\n", 1228 | "J:00477 E:00000 L:1.422e+04 P:-6.355e+05 R:6.035e+03\n", 1229 | "after partial fitting: 14432.213\n", 1230 | "J:00478 E:00000 L:1.443e+04 P:-6.355e+05 R:6.058e+03\n", 1231 | "after partial fitting: 15724.705\n", 1232 | "J:00479 E:00000 L:1.572e+04 P:-6.356e+05 R:5.981e+03\n", 1233 | "after partial fitting: 14698.277\n", 1234 | "J:00480 E:00000 L:1.470e+04 P:-6.356e+05 R:6.041e+03\n", 1235 | "after partial fitting: 14409.623\n", 1236 | "J:00481 E:00000 L:1.441e+04 P:-6.356e+05 R:5.995e+03\n", 1237 | "after partial fitting: 15361.801\n", 1238 | "J:00482 E:00000 L:1.536e+04 P:-6.356e+05 R:5.895e+03\n", 1239 | "after partial fitting: 13763.598\n", 1240 | "J:00483 E:00000 L:1.376e+04 P:-6.357e+05 R:5.966e+03\n", 1241 | "after partial fitting: 14787.51\n", 1242 | "J:00484 E:00000 L:1.479e+04 P:-6.357e+05 R:5.978e+03\n", 1243 | "after partial fitting: 14301.448\n", 1244 | "J:00485 E:00000 L:1.430e+04 P:-6.357e+05 R:6.064e+03\n", 1245 | "after partial fitting: 14570.569\n", 1246 | "J:00486 E:00000 L:1.457e+04 P:-6.358e+05 R:6.005e+03\n", 1247 | "after partial fitting: 14483.9\n", 1248 | "J:00487 E:00000 L:1.448e+04 P:-6.358e+05 R:6.056e+03\n", 1249 | "after partial fitting: 14777.172\n", 1250 | "J:00488 E:00000 L:1.478e+04 P:-6.358e+05 R:6.005e+03\n", 1251 | "after partial fitting: 14138.645\n", 1252 | "J:00489 E:00000 L:1.414e+04 P:-6.358e+05 R:6.071e+03\n", 1253 | "after partial fitting: 14507.3\n", 1254 | "J:00490 E:00000 L:1.451e+04 P:-6.359e+05 R:5.999e+03\n", 1255 | "after partial fitting: 14240.59\n", 1256 | "J:00491 E:00000 L:1.424e+04 P:-6.359e+05 R:6.100e+03\n", 1257 | "after partial fitting: 14599.896\n", 1258 | "J:00492 E:00000 L:1.460e+04 P:-6.359e+05 R:6.029e+03\n", 1259 | "after partial fitting: 13480.795\n", 1260 | "J:00493 E:00000 L:1.348e+04 P:-6.360e+05 R:6.003e+03\n", 1261 | "after partial fitting: 14399.111\n", 1262 | "J:00494 E:00000 L:1.440e+04 P:-6.360e+05 R:5.972e+03\n", 1263 | "after partial fitting: 13600.765\n", 1264 | "J:00495 E:00000 L:1.360e+04 P:-6.360e+05 R:6.022e+03\n", 1265 | "after partial fitting: 14430.855\n", 1266 | "J:00496 E:00000 L:1.443e+04 P:-6.360e+05 R:5.999e+03\n", 1267 | "after partial fitting: 14232.558\n", 1268 | "J:00497 E:00000 L:1.423e+04 P:-6.361e+05 R:5.943e+03\n", 1269 | "after partial fitting: 14240.448\n", 1270 | "J:00498 E:00000 L:1.424e+04 P:-6.361e+05 R:6.010e+03\n", 1271 | "after partial fitting: 14434.785\n", 1272 | "J:00499 E:00000 L:1.443e+04 P:-6.361e+05 R:6.007e+03\n", 1273 | "after partial fitting: 14706.266\n", 1274 | "J:00500 E:00000 L:1.471e+04 P:-6.361e+05 R:5.978e+03\n", 1275 | "after partial fitting: 14607.213\n", 1276 | "J:00501 E:00000 L:1.461e+04 P:-6.362e+05 R:6.052e+03\n", 1277 | "after partial fitting: 15106.46\n", 1278 | "J:00502 E:00000 L:1.511e+04 P:-6.362e+05 R:5.975e+03\n", 1279 | "after partial fitting: 14668.539\n", 1280 | "J:00503 E:00000 L:1.467e+04 P:-6.362e+05 R:6.015e+03\n", 1281 | "after partial fitting: 14146.911\n", 1282 | "J:00504 E:00000 L:1.415e+04 P:-6.362e+05 R:5.965e+03\n", 1283 | "after partial fitting: 14280.128\n", 1284 | "J:00505 E:00000 L:1.428e+04 P:-6.362e+05 R:6.016e+03\n", 1285 | "after partial fitting: 14417.5\n", 1286 | "J:00506 E:00000 L:1.442e+04 P:-6.363e+05 R:6.028e+03\n", 1287 | "after 
partial fitting: 14364.486\n", 1288 | "J:00507 E:00000 L:1.436e+04 P:-6.363e+05 R:6.084e+03\n", 1289 | "after partial fitting: 14277.384\n", 1290 | "J:00508 E:00000 L:1.428e+04 P:-6.363e+05 R:5.962e+03\n", 1291 | "after partial fitting: 15637.449\n", 1292 | "J:00509 E:00000 L:1.564e+04 P:-6.363e+05 R:5.986e+03\n", 1293 | "after partial fitting: 15505.58\n", 1294 | "J:00510 E:00000 L:1.551e+04 P:-6.364e+05 R:5.971e+03\n", 1295 | "after partial fitting: 14423.724\n", 1296 | "J:00511 E:00000 L:1.442e+04 P:-6.364e+05 R:5.973e+03\n", 1297 | "after partial fitting: 14431.197\n", 1298 | "J:00512 E:00000 L:1.443e+04 P:-6.364e+05 R:6.024e+03\n", 1299 | "after partial fitting: 14257.615\n", 1300 | "J:00513 E:00000 L:1.426e+04 P:-6.364e+05 R:6.053e+03\n", 1301 | "after partial fitting: 14101.682\n", 1302 | "J:00514 E:00000 L:1.410e+04 P:-6.364e+05 R:6.025e+03\n", 1303 | "after partial fitting: 14489.803\n", 1304 | "J:00515 E:00000 L:1.449e+04 P:-6.365e+05 R:5.931e+03\n", 1305 | "after partial fitting: 14667.535\n", 1306 | "J:00516 E:00000 L:1.467e+04 P:-6.365e+05 R:6.070e+03\n", 1307 | "after partial fitting: 14588.743\n", 1308 | "J:00517 E:00000 L:1.459e+04 P:-6.365e+05 R:6.068e+03\n", 1309 | "after partial fitting: 14503.665\n", 1310 | "J:00518 E:00000 L:1.450e+04 P:-6.365e+05 R:5.975e+03\n", 1311 | "after partial fitting: 14442.247\n", 1312 | "J:00519 E:00000 L:1.444e+04 P:-6.365e+05 R:6.005e+03\n", 1313 | "after partial fitting: 14448.367\n", 1314 | "J:00520 E:00000 L:1.445e+04 P:-6.366e+05 R:5.996e+03\n", 1315 | "after partial fitting: 12936.318\n", 1316 | "J:00521 E:00000 L:1.294e+04 P:-6.366e+05 R:6.048e+03\n", 1317 | "after partial fitting: 14588.896\n", 1318 | "J:00522 E:00000 L:1.459e+04 P:-6.366e+05 R:6.042e+03\n", 1319 | "after partial fitting: 14579.913\n", 1320 | "J:00523 E:00000 L:1.458e+04 P:-6.366e+05 R:6.019e+03\n", 1321 | "after partial fitting: 14512.184\n", 1322 | "J:00524 E:00000 L:1.451e+04 P:-6.366e+05 R:6.022e+03\n", 1323 | "after partial fitting: 14489.904\n", 1324 | "J:00525 E:00000 L:1.449e+04 P:-6.366e+05 R:6.007e+03\n", 1325 | "after partial fitting: 14809.156\n", 1326 | "J:00526 E:00000 L:1.481e+04 P:-6.367e+05 R:5.937e+03\n", 1327 | "after partial fitting: 14390.97\n", 1328 | "J:00527 E:00000 L:1.439e+04 P:-6.367e+05 R:6.088e+03\n", 1329 | "after partial fitting: 13970.924\n", 1330 | "J:00528 E:00000 L:1.397e+04 P:-6.367e+05 R:6.064e+03\n", 1331 | "after partial fitting: 14603.634\n", 1332 | "J:00529 E:00000 L:1.460e+04 P:-6.367e+05 R:6.027e+03\n", 1333 | "after partial fitting: 14482.102\n", 1334 | "J:00530 E:00000 L:1.448e+04 P:-6.367e+05 R:5.963e+03\n", 1335 | "after partial fitting: 14398.1875\n", 1336 | "J:00531 E:00000 L:1.440e+04 P:-6.367e+05 R:6.008e+03\n", 1337 | "after partial fitting: 15500.604\n", 1338 | "J:00532 E:00000 L:1.550e+04 P:-6.368e+05 R:5.911e+03\n", 1339 | "after partial fitting: 14584.271\n", 1340 | "J:00533 E:00000 L:1.458e+04 P:-6.368e+05 R:6.082e+03\n", 1341 | "after partial fitting: 14435.316\n", 1342 | "J:00534 E:00000 L:1.444e+04 P:-6.368e+05 R:5.979e+03\n", 1343 | "after partial fitting: 14266.354\n", 1344 | "J:00535 E:00000 L:1.427e+04 P:-6.368e+05 R:6.051e+03\n", 1345 | "after partial fitting: 14484.787\n", 1346 | "J:00536 E:00000 L:1.448e+04 P:-6.368e+05 R:5.984e+03\n", 1347 | "after partial fitting: 13549.424\n", 1348 | "J:00537 E:00000 L:1.355e+04 P:-6.368e+05 R:6.020e+03\n", 1349 | "after partial fitting: 14277.084\n", 1350 | "J:00538 E:00000 L:1.428e+04 P:-6.368e+05 R:6.122e+03\n", 1351 | "after partial fitting: 14314.386\n", 1352 
| "J:00539 E:00000 L:1.431e+04 P:-6.368e+05 R:6.029e+03\n", 1353 | "after partial fitting: 14598.623\n", 1354 | "J:00540 E:00000 L:1.460e+04 P:-6.369e+05 R:6.029e+03\n", 1355 | "after partial fitting: 14271.439\n", 1356 | "J:00541 E:00000 L:1.427e+04 P:-6.369e+05 R:5.959e+03\n", 1357 | "after partial fitting: 14124.813\n", 1358 | "J:00542 E:00000 L:1.412e+04 P:-6.369e+05 R:6.079e+03\n", 1359 | "after partial fitting: 14820.553\n", 1360 | "J:00543 E:00000 L:1.482e+04 P:-6.369e+05 R:5.966e+03\n", 1361 | "after partial fitting: 14490.596\n", 1362 | "J:00544 E:00000 L:1.449e+04 P:-6.369e+05 R:6.023e+03\n", 1363 | "after partial fitting: 14951.023\n", 1364 | "J:00545 E:00000 L:1.495e+04 P:-6.369e+05 R:5.966e+03\n", 1365 | "after partial fitting: 15445.996\n", 1366 | "J:00546 E:00000 L:1.545e+04 P:-6.369e+05 R:5.961e+03\n", 1367 | "after partial fitting: 14599.424\n", 1368 | "J:00547 E:00000 L:1.460e+04 P:-6.369e+05 R:6.045e+03\n", 1369 | "after partial fitting: 14435.922\n", 1370 | "J:00548 E:00000 L:1.444e+04 P:-6.369e+05 R:6.013e+03\n", 1371 | "after partial fitting: 14287.492\n", 1372 | "J:00549 E:00000 L:1.429e+04 P:-6.370e+05 R:6.039e+03\n", 1373 | "after partial fitting: 14432.537\n", 1374 | "J:00550 E:00000 L:1.443e+04 P:-6.370e+05 R:6.061e+03\n", 1375 | "after partial fitting: 14526.366\n", 1376 | "J:00551 E:00000 L:1.453e+04 P:-6.370e+05 R:6.057e+03\n", 1377 | "after partial fitting: 13976.875\n", 1378 | "J:00552 E:00000 L:1.398e+04 P:-6.370e+05 R:6.075e+03\n", 1379 | "after partial fitting: 14478.694\n", 1380 | "J:00553 E:00000 L:1.448e+04 P:-6.370e+05 R:6.062e+03\n", 1381 | "after partial fitting: 14647.529\n", 1382 | "J:00554 E:00000 L:1.465e+04 P:-6.370e+05 R:5.915e+03\n", 1383 | "after partial fitting: 14420.941\n", 1384 | "J:00555 E:00000 L:1.442e+04 P:-6.370e+05 R:5.950e+03\n", 1385 | "after partial fitting: 14300.262\n", 1386 | "J:00556 E:00000 L:1.430e+04 P:-6.370e+05 R:6.088e+03\n", 1387 | "after partial fitting: 14331.677\n", 1388 | "J:00557 E:00000 L:1.433e+04 P:-6.370e+05 R:5.984e+03\n", 1389 | "after partial fitting: 14221.029\n", 1390 | "J:00558 E:00000 L:1.422e+04 P:-6.370e+05 R:5.981e+03\n", 1391 | "after partial fitting: 14602.904\n", 1392 | "J:00559 E:00000 L:1.460e+04 P:-6.370e+05 R:5.967e+03\n", 1393 | "after partial fitting: 14225.79\n", 1394 | "J:00560 E:00000 L:1.423e+04 P:-6.370e+05 R:5.998e+03\n", 1395 | "after partial fitting: 14353.901\n", 1396 | "J:00561 E:00000 L:1.435e+04 P:-6.370e+05 R:6.046e+03\n", 1397 | "after partial fitting: 14728.567\n", 1398 | "J:00562 E:00000 L:1.473e+04 P:-6.370e+05 R:5.953e+03\n", 1399 | "after partial fitting: 15207.722\n", 1400 | "J:00563 E:00000 L:1.521e+04 P:-6.370e+05 R:5.993e+03\n" 1401 | ] 1402 | } 1403 | ], 1404 | "source": [ 1405 | "for epoch in range(1):\n", 1406 | " data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),\n", 1407 | " cuda.to_cpu(model.mixture.factors.W.data).copy(),\n", 1408 | " cuda.to_cpu(model.sampler.W.data).copy(),\n", 1409 | " words)\n", 1410 | " top_words = print_top_words_per_topic(data)\n", 1411 | " if j % 100 == 0 and j > 100:\n", 1412 | " coherence = topic_coherence(top_words)\n", 1413 | " for j in range(n_topics):\n", 1414 | " print(j, coherence[(j, 'cv')])\n", 1415 | " kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)\n", 1416 | " progress[str(epoch)] = pickle.dumps(kw)\n", 1417 | " data['doc_lengths'] = doc_lengths\n", 1418 | " data['term_frequency'] = term_frequency\n", 1419 | " np.savez('topics.pyldavis', **data)\n", 1420 | " print(epoch)\n", 1421 | " 
for d, f in utils.chunks(batchsize, doc_ids, flattened):\n", 1422 | " t0 = time.time()\n", 1423 | " model.cleargrads()\n", 1424 | " #optimizer.use_cleargrads(use=False)\n", 1425 | " l = model.fit_partial(d.copy(), f.copy())\n", 1426 | " print(\"after partial fitting:\", l)\n", 1427 | " prior = model.prior()\n", 1428 | " loss = prior * fraction\n", 1429 | " loss.backward()\n", 1430 | " optimizer.update()\n", 1431 | " msg = (\"J:{j:05d} E:{epoch:05d} L:{loss:1.3e} \"\n", 1432 | " \"P:{prior:1.3e} R:{rate:1.3e}\")\n", 1433 | " prior.to_cpu()\n", 1434 | " loss.to_cpu()\n", 1435 | " t1 = time.time()\n", 1436 | " dt = t1 - t0\n", 1437 | " rate = batchsize / dt\n", 1438 | " logs = dict(loss=float(l), epoch=epoch, j=j,\n", 1439 | " prior=float(prior.data), rate=rate)\n", 1440 | " print(msg.format(**logs))\n", 1441 | " j += 1\n", 1442 | " serializers.save_hdf5(\"lda2vec.hdf5\", model)" 1443 | ] 1444 | } 1445 | ], 1446 | "metadata": { 1447 | "kernelspec": { 1448 | "display_name": "Python 3", 1449 | "language": "python", 1450 | "name": "python3" 1451 | }, 1452 | "language_info": { 1453 | "codemirror_mode": { 1454 | "name": "ipython", 1455 | "version": 3 1456 | }, 1457 | "file_extension": ".py", 1458 | "mimetype": "text/x-python", 1459 | "name": "python", 1460 | "nbconvert_exporter": "python", 1461 | "pygments_lexer": "ipython3", 1462 | "version": "3.7.2" 1463 | } 1464 | }, 1465 | "nbformat": 4, 1466 | "nbformat_minor": 2 1467 | } 1468 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = 'pylda2vec' 3 | version = '0.0.1' 4 | description = 'Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec.' 5 | author = 'ONLPS' 6 | author_email = 'royalkingpin@gmail.com' 7 | license = 'MIT' 8 | url = 'https://github.com/ONLPS/lda2vec' 9 | 10 | [requires] 11 | python_version = ['3.6'] 12 | 13 | [build-system] 14 | requires = ['setuptools', 'wheel'] 15 | 16 | [tool.hatch.commands] 17 | prerelease = 'hatch build' 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chainer 2 | cupy 3 | numpy 4 | jellyfish 5 | pandas 6 | en_core_web_md 7 | spacy 8 | scipy 9 | scikit-learn -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | 3 | from setuptools import find_packages, setup 4 | 5 | with open('lda2vec/__init__.py', 'r') as f: 6 | for line in f: 7 | if line.startswith('__version__'): 8 | version = line.strip().split('=')[1].strip(' \'"') 9 | break 10 | else: 11 | version = '0.0.1' 12 | 13 | with open('README.md', 'r', encoding='utf-8') as f: 14 | readme = f.read() 15 | 16 | REQUIRES = [] 17 | 18 | setup( 19 | name='pylda2vec', 20 | version=version, 21 | description='Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec', 22 | long_description=readme, 23 | author='ONLPS', 24 | author_email='royalkingpin@gmail.com', 25 | maintainer='ONLPS', 26 | maintainer_email='royalkingpin@gmail.com', 27 | url='https://github.com/ONLPS/lda2vec', 28 | license='MIT', 29 | 30 | keywords=[ 31 | 'lda', 'topic-models', 'text', 'text processing', 'nlp' 32 | ], 33 | 34 | classifiers=[ 35 | 'Development Status :: 4 - Beta', 36 | 'Intended Audience :: Developers', 37 | 'License :: OSI Approved 
:: MIT License', 38 | 'Natural Language :: English', 39 | 'Operating System :: OS Independent', 40 | 'Programming Language :: Python :: 3.6', 41 | 'Programming Language :: Python :: Implementation :: CPython', 42 | ], 43 | 44 | install_requires=REQUIRES, 45 | tests_require=['coverage', 'pytest'], 46 | 47 | packages=find_packages(), 48 | ) 49 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' 2 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py36, 4 | 5 | [testenv] 6 | passenv = * 7 | deps = 8 | coverage 9 | pytest 10 | commands = 11 | python setup.py --quiet clean develop 12 | coverage run --parallel-mode -m pytest 13 | coverage combine --append 14 | coverage report -m 15 | --------------------------------------------------------------------------------
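A note on reading the training output in `notebooks/lda2vec_model.ipynb` above: each `J:... E:... L:... P:... R:...` line is produced by the `msg` format string in the notebook's final cell. `J` is the running mini-batch counter `j`, `E` the epoch, `L` the loss value returned by `model.fit_partial`, `P` the Dirichlet prior term from `model.prior()`, and `R` the processing rate, i.e. `batchsize` divided by the wall-clock time taken by that batch. The snippet below is illustrative only (not part of the repository): it rebuilds the last logged line, `J:00563`, from the same format string, with the literal values copied from that line.

    # Illustrative only: reproduce one log line using the notebook's format string.
    msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
           "P:{prior:1.3e} R:{rate:1.3e}")
    print(msg.format(j=563, epoch=0, loss=1.521e+04, prior=-6.370e+05, rate=5.993e+03))
    # -> J:00563 E:00000 L:1.521e+04 P:-6.370e+05 R:5.993e+03

One detail to keep in mind when re-running that cell: inside the coherence-reporting block, `for j in range(n_topics)` reuses `j`, the same name as the global mini-batch counter, so the counter is reset to `n_topics - 1` each time coherence is printed. A hedged variant of just that block (assuming `j`, `n_topics`, `top_words`, and `topic_coherence` are defined as in the earlier cells of the notebook) that leaves the counter intact:

    # Sketch, not the original cell: use a separate name for the topic index
    # so the global mini-batch counter `j` is not overwritten.
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for topic_no in range(n_topics):
            print(topic_no, coherence[(topic_no, 'cv')])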