├── .coveragerc ├── .gitattributes ├── .gitignore ├── LICENSE-MIT ├── MANIFEST.in ├── README.md ├── lda2vec ├── __init__.py ├── corpus.py ├── dirichlet_likelihood.py ├── embed_mixture.py ├── fake_data.py ├── lda2vec.py ├── negative_sampling.py ├── preprocess.py ├── topics.py ├── tracking.py └── utils.py ├── notebooks ├── dataset.ipynb ├── lda2vec_model.ipynb └── viz.ipynb ├── pyproject.toml ├── requirements.txt ├── setup.py ├── tests └── __init__.py └── tox.ini /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | source = 3 | lda2vec 4 | tests 5 | branch = True 6 | omit = 7 | lda2vec/cli.py 8 | 9 | [report] 10 | exclude_lines = 11 | no cov 12 | no qa 13 | noqa 14 | pragma: no cover 15 | if __name__ == .__main__.: 16 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | 119 | #others 120 | nohup.out 121 | *.pkl 122 | *.bak 123 | *.dat 124 | *.npy 125 | *.dir 126 | *.npz 127 | *.hdf5 -------------------------------------------------------------------------------- /LICENSE-MIT: -------------------------------------------------------------------------------- 1 | Copyright (c) 2019 ONLPS 2 | 3 | MIT License 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
22 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include LICENSE-MIT 2 | include README.md 3 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # lda2vec 2 | 3 | ----- 4 | 5 | **Table of Contents** 6 | 7 | * [Installation](#installation) 8 | * [License](#license) 9 | 10 | [![Downloads](https://pepy.tech/badge/pylda2vec)](https://pepy.tech/project/pylda2vec) 11 | 12 | ## Installation 13 | 14 | lda2vec is distributed on [PyPI](https://pypi.org) as a universal 15 | wheel and is available on Linux/macOS and Windows and supports 16 | Python 3.6+. 17 | 18 | ```bash 19 | $ pip install pylda2vec 20 | ``` 21 | 22 | ## License 23 | 24 | lda2vec is distributed under the terms of the 25 | [MIT License](https://choosealicense.com/licenses/mit). 26 | -------------------------------------------------------------------------------- /lda2vec/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '1.0.0' 2 | 3 | from .dirichlet_likelihood import dirichlet_likelihood 4 | from .embed_mixture import EmbedMixture 5 | from .tracking import Tracking 6 | from .preprocess import tokenize 7 | from .corpus import Corpus 8 | from .topics import * 9 | from .negative_sampling import NegativeSamplingFunction 10 | from .lda2vec import LDA2Vec 11 | -------------------------------------------------------------------------------- /lda2vec/corpus.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | import numpy as np 3 | import difflib 4 | import pandas as pd 5 | from jellyfish import damerau_levenshtein_distance 6 | 7 | try: 8 | from pyxdameraulevenshtein import damerau_levenshtein_distance_withNPArray 9 | except ImportError: 10 | pass 11 | 12 | 13 | class Corpus(): 14 | _keys_frequency = None 15 | 16 | def __init__(self, out_of_vocabulary=-1, skip=-2): 17 | """ The Corpus helps with tasks involving integer representations of 18 | words. This object is used to filter, subsample, and convert loose 19 | word indices to compact word indices. 20 | 21 | 'Loose' word arrays are word indices given by a tokenizer. The word 22 | index is not necessarily representative of word's frequency rank, and 23 | so loose arrays tend to have 'gaps' of unused indices, which can make 24 | models less memory efficient. As a result, this class helps convert 25 | a loose array to a 'compact' one where the most common words have low 26 | indices, and the most infrequent have high indices. 27 | 28 | Corpus maintains a count of how many of each word it has seen so 29 | that it can later selectively filter frequent or rare words. However, 30 | since word popularity rank could change with incoming data the word 31 | index count must be updated fully and `self.finalize()` must be called 32 | before any filtering and subsampling operations can happen. 33 | 34 | Arguments 35 | --------- 36 | out_of_vocabulary : int, default=-1 37 | Token index to replace whenever we encounter a rare or unseen word. 38 | Instead of skipping the token, we mark as an out of vocabulary 39 | word. 40 | skip : int, default=-2 41 | Token index to replace whenever we want to skip the current frame. 42 | Particularly useful when subsampling words or when padding a 43 | sentence. 
44 | 45 | Examples 46 | -------- 47 | >>> corpus = Corpus() 48 | >>> words_raw = np.random.randint(100, size=25) 49 | >>> corpus.update_word_count(words_raw) 50 | >>> corpus.finalize() 51 | >>> words_compact = corpus.to_compact(words_raw) 52 | >>> words_pruned = corpus.filter_count(words_compact, min_count=2) 53 | >>> # words_sub = corpus.subsample_frequent(words_pruned, thresh=1e-5) 54 | >>> words_loose = corpus.to_loose(words_pruned) 55 | >>> not_oov = words_loose > -1 56 | >>> np.all(words_loose[not_oov] == words_raw[not_oov]) 57 | True 58 | """ 59 | self.counts_loose = defaultdict(int) 60 | self._finalized = False 61 | self.specials = dict(out_of_vocabulary=out_of_vocabulary, 62 | skip=skip) 63 | 64 | @property 65 | def n_specials(self): 66 | return len(self.specials) 67 | 68 | def update_word_count(self, loose_array): 69 | """ Update the corpus word counts given a loose array of word indices. 70 | Can be called multiple times, but once `finalize` is called the word 71 | counts cannot be updated. 72 | 73 | Arguments 74 | --------- 75 | loose_array : int array 76 | Array of word indices. 77 | 78 | Examples 79 | -------- 80 | >>> corpus = Corpus() 81 | >>> corpus.update_word_count(np.arange(10)) 82 | >>> corpus.update_word_count(np.arange(8)) 83 | >>> corpus.counts_loose[0] 84 | 2 85 | >>> corpus.counts_loose[9] 86 | 1 87 | """ 88 | self._check_unfinalized() 89 | uniques, counts = np.unique(np.ravel(loose_array), return_counts=True) 90 | msg = "Loose arrays cannot have elements below the values of special " 91 | msg += "tokens as these indices are reserved" 92 | assert uniques.min() >= min(self.specials.values()), msg 93 | for k, v in zip(uniques, counts): 94 | self.counts_loose[k] += v 95 | 96 | def _loose_keys_ordered(self): 97 | """ Get the loose keys in order of decreasing frequency""" 98 | loose_counts = sorted(self.counts_loose.items(), key=lambda x: x[1], 99 | reverse=True) 100 | keys = np.array(loose_counts)[:, 0] 101 | counts = np.array(loose_counts)[:, 1] 102 | order = np.argsort(counts)[::-1].astype('int32') 103 | keys, counts = keys[order], counts[order] 104 | # Add in the specials as a prefix to the other keys 105 | 106 | specials = np.sort(list(self.specials.values())) 107 | keys = np.concatenate((specials, keys)) 108 | empty = np.zeros(len(specials), dtype='int32') 109 | counts = np.concatenate((empty, counts)) 110 | n_keys = keys.shape[0] 111 | assert counts.min() >= 0 112 | return keys, counts, n_keys 113 | 114 | def finalize(self): 115 | """ Call `finalize` once done updating word counts. This means the 116 | object will no longer accept new word count data, but the loose 117 | to compact index mapping can be computed. This frees the object to 118 | filter, subsample, and compactify incoming word arrays. 119 | 120 | Examples 121 | -------- 122 | >>> corpus = Corpus() 123 | >>> # We'll update the word counts, making sure that word index 2 124 | >>> # is the most common word index. 125 | >>> corpus.update_word_count(np.arange(1) + 2) 126 | >>> corpus.update_word_count(np.arange(3) + 2) 127 | >>> corpus.update_word_count(np.arange(10) + 2) 128 | >>> corpus.update_word_count(np.arange(8) + 2) 129 | >>> corpus.counts_loose[2] 130 | 4 131 | >>> # The corpus has not been finalized yet, and so the compact mapping 132 | >>> # has not yet been computed. 133 | >>> corpus.keys_counts[0] 134 | Traceback (most recent call last): 135 | ... 
136 | AttributeError: Corpus instance has no attribute 'keys_counts' 137 | >>> corpus.finalize() 138 | >>> corpus.n_specials 139 | 2 140 | >>> # The special tokens are mapped to the first compact indices 141 | >>> corpus.compact_to_loose[0] 142 | -2 143 | >>> corpus.compact_to_loose[0] == corpus.specials['skip'] 144 | True 145 | >>> corpus.compact_to_loose[1] == corpus.specials['out_of_vocabulary'] 146 | True 147 | >>> corpus.compact_to_loose[2] # Most popular token is mapped next 148 | 2 149 | >>> corpus.loose_to_compact[3] # 2nd most popular token is mapped next 150 | 4 151 | >>> first_non_special = corpus.n_specials 152 | >>> corpus.keys_counts[first_non_special] # First normal token 153 | 4 154 | """ 155 | # Return the loose keys and counts in descending count order 156 | # so that the counts arrays is already in compact order 157 | self.keys_loose, self.keys_counts, n_keys = self._loose_keys_ordered() 158 | self.keys_compact = np.arange(n_keys).astype('int32') 159 | self.loose_to_compact = {l: c for l, c in 160 | zip(self.keys_loose, self.keys_compact)} 161 | self.compact_to_loose = {c: l for l, c in 162 | self.loose_to_compact.items()} 163 | self.specials_to_compact = {s: self.loose_to_compact[i] 164 | for s, i in self.specials.items()} 165 | self.compact_to_special = {c: s for c, s in 166 | self.specials_to_compact.items()} 167 | self._finalized = True 168 | 169 | @property 170 | def keys_frequency(self): 171 | if self._keys_frequency is None: 172 | f = self.keys_counts * 1.0 / np.sum(self.keys_counts) 173 | self._keys_frequency = f 174 | return self._keys_frequency 175 | 176 | def _check_finalized(self): 177 | msg = "self.finalized() must be called before any other array ops" 178 | assert self._finalized, msg 179 | 180 | def _check_unfinalized(self): 181 | msg = "Cannot update word counts after self.finalized()" 182 | msg += "has been called" 183 | assert not self._finalized, msg 184 | 185 | def filter_count(self, words_compact, min_count=15, max_count=0, 186 | max_replacement=None, min_replacement=None): 187 | """ Replace word indices below min_count with the pad index. 188 | 189 | Arguments 190 | --------- 191 | words_compact: int array 192 | Source array whose values will be replaced. This is assumed to 193 | already be converted into a compact array with `to_compact`. 194 | min_count : int 195 | Replace words less frequently occuring than this count. This 196 | defines the threshold for what words are very rare 197 | max_count : int 198 | Replace words occuring more frequently than this count. This 199 | defines the threshold for very frequent words 200 | min_replacement : int, default is out_of_vocabulary 201 | Replace words less than min_count with this. 202 | max_replacement : int, default is out_of_vocabulary 203 | Replace words greater than max_count with this. 204 | 205 | Examples 206 | -------- 207 | >>> corpus = Corpus() 208 | >>> # Make 1000 word indices with index < 100 and 209 | >>> # update the word counts. 210 | >>> word_indices = np.random.randint(100, size=1000) 211 | >>> corpus.update_word_count(word_indices) 212 | >>> corpus.finalize() # any word indices above 99 will be filtered 213 | >>> # Now create a new text, but with some indices above 100 214 | >>> word_indices = np.random.randint(200, size=1000) 215 | >>> word_indices.max() < 100 216 | False 217 | >>> # Remove words that have never appeared in the original corpus. 
218 | >>> filtered = corpus.filter_count(word_indices, min_count=1) 219 | >>> filtered.max() < 100 220 | True 221 | >>> # We can also remove highly frequent words. 222 | >>> filtered = corpus.filter_count(word_indices, max_count=2) 223 | >>> len(np.unique(word_indices)) > len(np.unique(filtered)) 224 | True 225 | """ 226 | self._check_finalized() 227 | ret = words_compact.copy() 228 | if min_replacement is None: 229 | min_replacement = self.specials_to_compact['out_of_vocabulary'] 230 | if max_replacement is None: 231 | max_replacement = self.specials_to_compact['out_of_vocabulary'] 232 | not_specials = np.ones(self.keys_counts.shape[0], dtype='bool') 233 | not_specials[:self.n_specials] = False 234 | if min_count: 235 | # Find first index with count less than min_count 236 | min_idx = np.argmax(not_specials & (self.keys_counts < min_count)) 237 | # Replace all indices greater than min_idx 238 | ret[ret > min_idx] = min_replacement 239 | if max_count: 240 | # Find first index with count less than max_count 241 | max_idx = np.argmax(not_specials & (self.keys_counts < max_count)) 242 | # Replace all indices less than max_idx 243 | ret[ret < max_idx] = max_replacement 244 | return ret 245 | 246 | def subsample_frequent(self, words_compact, threshold=1e-5): 247 | """ Subsample the most frequent words. This aggressively 248 | replaces words with frequencies higher than `threshold`. Words 249 | are replaced with the out_of_vocabulary token. 250 | 251 | Words will be replaced with probability as a function of their 252 | frequency in the training corpus: 253 | 254 | .. math:: 255 | p(w) = 1.0 - \sqrt{threshold\over f(w)} 256 | 257 | Arguments 258 | --------- 259 | words_compact: int array 260 | The input array to subsample. 261 | threshold: float in [0, 1] 262 | Words with frequencies higher than this will be increasingly 263 | subsampled. 264 | 265 | Examples 266 | -------- 267 | >>> corpus = Corpus() 268 | >>> word_indices = (np.random.power(5.0, size=1000) * 100).astype('i') 269 | >>> corpus.update_word_count(word_indices) 270 | >>> corpus.finalize() 271 | >>> compact = corpus.to_compact(word_indices) 272 | >>> sampled = corpus.subsample_frequent(compact, threshold=1e-2) 273 | >>> skip = corpus.specials_to_compact['skip'] 274 | >>> np.sum(compact == skip) # No skips in the compact tokens 275 | 0 276 | >>> np.sum(sampled == skip) > 0 # Many skips in the sampled tokens 277 | True 278 | 279 | .. [1] Distributed Representations of Words and Phrases and 280 | their Compositionality. Mikolov, Tomas and Sutskever, Ilya 281 | and Chen, Kai and Corrado, Greg S and Dean, Jeff 282 | Advances in Neural Information Processing Systems 26 283 | """ 284 | self._check_finalized() 285 | freq = self.keys_frequency + 1e-10 286 | pw = 1.0 - (np.sqrt(threshold / freq) + threshold / freq) 287 | prob = fast_replace(words_compact, self.keys_compact, pw) 288 | draw = np.random.uniform(size=prob.shape) 289 | ret = words_compact.copy() 290 | # If probability greater than draw, skip the word 291 | ret[prob > draw] = self.specials_to_compact['skip'] 292 | return ret 293 | 294 | def to_compact(self, word_loose): 295 | """ Convert a loose word index matrix to a compact array using 296 | a fixed loose to dense mapping. Out of vocabulary word indices 297 | will be replaced by the out of vocabulary index. The most common 298 | index will be mapped to 0, the next most common to 1, and so on. 299 | 300 | Arguments 301 | --------- 302 | word_loose : int array 303 | Input loose word array to be converted into a compact array. 
304 | 305 | 306 | Examples 307 | -------- 308 | >>> corpus = Corpus() 309 | >>> word_indices = np.random.randint(100, size=1000) 310 | >>> n_words = len(np.unique(word_indices)) 311 | >>> corpus.update_word_count(word_indices) 312 | >>> corpus.finalize() 313 | >>> word_compact = corpus.to_compact(word_indices) 314 | >>> # The most common word in the training set will be mapped to be 315 | >>> # right after all the special tokens, so 2 in this case. 316 | >>> np.argmax(np.bincount(word_compact)) == 2 317 | True 318 | >>> most_common = np.argmax(np.bincount(word_indices)) 319 | >>> corpus.loose_to_compact[most_common] == 2 320 | True 321 | >>> # Out of vocabulary indices will be mapped to 1 322 | >>> word_indices = np.random.randint(150, size=1000) 323 | >>> word_compact_oov = corpus.to_compact(word_indices) 324 | >>> oov = corpus.specials_to_compact['out_of_vocabulary'] 325 | >>> oov 326 | 1 327 | >>> oov in word_compact 328 | False 329 | >>> oov in word_compact_oov 330 | True 331 | """ 332 | self._check_finalized() 333 | keys = self.keys_loose 334 | reps = self.keys_compact 335 | uniques = np.unique(word_loose) 336 | # Find the out of vocab indices 337 | oov = np.setdiff1d(uniques, keys, assume_unique=True) 338 | oov_token = self.specials_to_compact['out_of_vocabulary'] 339 | keys = np.concatenate((keys, oov)) 340 | reps = np.concatenate((reps, np.zeros_like(oov) + oov_token)) 341 | compact = fast_replace(word_loose, keys, reps) 342 | msg = "Error: all compact indices should be non-negative" 343 | assert compact.min() >= 0, msg 344 | return compact 345 | 346 | def to_loose(self, word_compact): 347 | """ Convert a compacted array back into a loose array. 348 | 349 | Arguments 350 | --------- 351 | word_compact : int array 352 | Input compacted word array to be converted into a loose array. 353 | 354 | 355 | Examples 356 | -------- 357 | >>> corpus = Corpus() 358 | >>> word_indices = np.random.randint(100, size=1000) 359 | >>> corpus.update_word_count(word_indices) 360 | >>> corpus.finalize() 361 | >>> word_compact = corpus.to_compact(word_indices) 362 | >>> word_loose = corpus.to_loose(word_compact) 363 | >>> np.all(word_loose == word_indices) 364 | True 365 | """ 366 | self._check_finalized() 367 | uniques = np.unique(word_compact) 368 | # Find the out of vocab indices 369 | oov = np.setdiff1d(uniques, self.keys_compact, assume_unique=True) 370 | msg = "Found keys in `word_compact` not present in the" 371 | msg += "training corpus. Is this actually a compacted array?" 372 | assert np.all(oov < 0), msg 373 | loose = fast_replace(word_compact, self.keys_compact, self.keys_loose) 374 | return loose 375 | 376 | def compact_to_flat(self, word_compact, *components): 377 | """ Ravel a 2D compact array of documents (rows) and word 378 | positions (columns) into a 1D array of words. Leave out special 379 | tokens and ravel the component arrays in the same fashion. 380 | 381 | Arguments 382 | --------- 383 | word_compact : int array 384 | Array of word indices in documents. Has shape (n_docs, max_length) 385 | components : list of arrays 386 | A list of arrays detailing per-document properties. Each array 387 | must n_docs long. 
388 | 389 | Returns 390 | ------- 391 | flat : int array 392 | An array of all words unravelled into a 1D shape 393 | components : list of arrays 394 | Each array here is also unravelled into the same shape 395 | 396 | Examples 397 | -------- 398 | >>> corpus = Corpus() 399 | >>> word_indices = np.random.randint(100, size=1000) 400 | >>> corpus.update_word_count(word_indices) 401 | >>> corpus.finalize() 402 | >>> doc_texts = np.arange(8).reshape((2, 4)) 403 | >>> doc_texts[:, -1] = -2 # Mark as skips 404 | >>> doc_ids = np.arange(2) 405 | >>> compact = corpus.to_compact(doc_texts) 406 | >>> oov = corpus.specials_to_compact['out_of_vocabulary'] 407 | >>> compact[1, 3] = oov # Mark the last word as OOV 408 | >>> flat = corpus.compact_to_flat(compact) 409 | >>> flat.shape[0] == 6 # 2 special tokens were dropped from 8 words 410 | True 411 | >>> flat[-1] == corpus.loose_to_compact[doc_texts[1, 2]] 412 | True 413 | >>> flat, (flat_id,) = corpus.compact_to_flat(compact, doc_ids) 414 | >>> flat_id 415 | array([0, 0, 0, 1, 1, 1]) 416 | """ 417 | self._check_finalized() 418 | n_docs = word_compact.shape[0] 419 | max_length = word_compact.shape[1] 420 | idx = word_compact >= self.n_specials # keep only the non-special tokens 421 | components_raveled = [] 422 | msg = "Length of each component must match `word_compact` size" 423 | for component in components: 424 | raveled = np.tile(component[:, None], max_length)[idx] 425 | components_raveled.append(raveled) 426 | assert len(component) == n_docs, msg 427 | if len(components_raveled) == 0: 428 | return word_compact[idx] 429 | else: 430 | return word_compact[idx], components_raveled 431 | 432 | def word_list(self, vocab, max_compact_index=None, oov_token=''): 433 | """ Translate compact keys back into string representations for a word. 434 | 435 | Arguments 436 | --------- 437 | vocab : dict 438 | The vocab object has loose indices as keys and word strings as 439 | values. 440 | 441 | max_compact_index : int 442 | Only return words up to this index. If None, defaults to the number 443 | of compact indices available 444 | 445 | oov_token : str 446 | Returns this string if a compact index does not have a word in the 447 | vocab dictionary provided.
448 | 449 | Returns 450 | ------- 451 | word_list : list 452 | A list of string representations corresponding to word indices 453 | zero to `max_compact_index` 454 | 455 | Examples 456 | -------- 457 | 458 | >>> vocab = {0: 'But', 1: 'the', 2: 'night', 3: 'was', 4: 'warm'} 459 | >>> word_indices = np.zeros(50).astype('int32') 460 | >>> word_indices[:25] = 0 # 'But' shows 25 times 461 | >>> word_indices[25:35] = 1 # 'the' is in 10 times 462 | >>> word_indices[40:46] = 2 # 'night' is in 6 times 463 | >>> word_indices[46:49] = 3 # 'was' is in 3 times 464 | >>> word_indices[49:] = 4 # 'warm' is in 1 time 465 | >>> corpus = Corpus() 466 | >>> corpus.update_word_count(word_indices) 467 | >>> corpus.finalize() 468 | >>> # Build a vocabulary of word indices 469 | >>> corpus.word_list(vocab) 470 | ['skip', 'out_of_vocabulary', 'But', 'the', 'night', 'was', 'warm'] 471 | """ 472 | # Translate the compact keys into string words 473 | oov = self.specials['out_of_vocabulary'] 474 | words = [] 475 | if max_compact_index is None: 476 | max_compact_index = self.keys_compact.shape[0] 477 | index_to_special = {i: s for s, i in self.specials.items()} 478 | for compact_index in range(max_compact_index): 479 | loose_index = self.compact_to_loose.get(compact_index, oov) 480 | special = index_to_special.get(loose_index, oov_token) 481 | string = vocab.get(loose_index, special) 482 | words.append(string) 483 | return words 484 | 485 | def compact_word_vectors(self, vocab, filename=None, array=None, 486 | top=20000): 487 | """ Retrieve pretrained word vectors for our vocabulary. 488 | The returned word array has row indices corresponding to the 489 | compact index of a word, and columns corresponding to the word 490 | vector. 491 | 492 | Arguments 493 | --------- 494 | vocab : dict 495 | Dictionary where keys are the loose index, and values are 496 | the word string. 497 | 498 | filename : str 499 | Filename of binary word2vec-format word vectors, loaded via 500 | gensim's KeyedVectors. 501 | 502 | array : float array, optional 503 | If given, word vectors are written into this array rather than a newly allocated one.
504 | 505 | Returns 506 | ------- 507 | data : numpy float array 508 | Array such that data[compact_index, :] = word_vector 509 | 510 | Examples 511 | -------- 512 | >>> import numpy.linalg as nl 513 | >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'} 514 | >>> word_indices = np.zeros(50).astype('int32') 515 | >>> word_indices[:25] = 19 # 'Shuttle' shows 25 times 516 | >>> word_indices[25:35] = 5 # 'astronomy' is in 10 times 517 | >>> word_indices[40:46] = 7 # 'cold' is in 6 times 518 | >>> word_indices[46:] = 3 # 'hot' is in 3 times 519 | >>> corpus = Corpus() 520 | >>> corpus.update_word_count(word_indices) 521 | >>> corpus.finalize() 522 | >>> v, s, f = corpus.compact_word_vectors(vocab) 523 | >>> sim = lambda x, y: np.dot(x, y) / nl.norm(x) / nl.norm(y) 524 | >>> vocab[corpus.compact_to_loose[2]] 525 | 'shuttle' 526 | >>> vocab[corpus.compact_to_loose[3]] 527 | 'astronomy' 528 | >>> vocab[corpus.compact_to_loose[4]] 529 | 'cold' 530 | >>> sim_shuttle_astro = sim(v[2, :], v[3, :]) 531 | >>> sim_shuttle_cold = sim(v[2, :], v[4, :]) 532 | >>> sim_shuttle_astro > sim_shuttle_cold 533 | True 534 | """ 535 | n_words = len(self.compact_to_loose) 536 | from gensim.models.word2vec import Word2Vec 537 | from gensim.models import KeyedVectors 538 | model = KeyedVectors.load_word2vec_format(filename, binary=True) 539 | #model = Word2Vec.load_word2vec_format(filename, binary=True) 540 | n_dim = model.syn0.shape[1] 541 | data = np.random.normal(size=(n_words, n_dim)).astype('float32') 542 | data -= data.mean() 543 | data += model.syn0.mean() 544 | data /= data.std() 545 | data *= model.syn0.std() 546 | if array is not None: 547 | data = array 548 | n_words = data.shape[0] 549 | keys_raw = model.vocab.keys() 550 | keys = [s.encode('ascii', 'ignore') for s in keys_raw] 551 | lens = [len(s) for s in model.vocab.keys()] 552 | choices = np.array(keys, dtype='S') 553 | lengths = np.array(lens, dtype='int32') 554 | s, f = 0, 0 555 | 556 | def rep0(w): return w 557 | 558 | def rep1(w): return w.replace(' ', '_') 559 | 560 | def rep2(w): return w.title().replace(' ', '_') 561 | reps = [rep0, rep1, rep2] 562 | for compact in np.arange(top): 563 | loose = self.compact_to_loose.get(compact, None) 564 | if loose is None: 565 | continue 566 | word = vocab.get(loose, None) 567 | if word is None: 568 | continue 569 | word = word.strip() 570 | vector = None 571 | for rep in reps: 572 | clean = rep(word) 573 | if clean in model.vocab: 574 | vector = model[clean] 575 | break 576 | if vector is None: 577 | try: 578 | word = str(word) 579 | idx = lengths >= len(word) - 3 580 | idx &= lengths <= len(word) + 3 581 | sel = choices[idx] 582 | sel = str(sel.tolist()[0]) 583 | d = damerau_levenshtein_distance(word, sel) 584 | choice = np.array(keys_raw)[idx][np.argmin(d)] 585 | # choice = difflib.get_close_matches(word, choices)[0] 586 | vector = model[choice] 587 | print(compact, word, ' --> ', choice) 588 | except IndexError: 589 | pass 590 | if vector is None: 591 | f += 1 592 | continue 593 | s += 1 594 | data[compact, :] = vector[:] 595 | return data, s, f 596 | 597 | def compact_to_bow(self, word_compact, max_compact_index=None): 598 | """ Given a 2D array of compact indices, return the bag of words 599 | representation where the column is the word index, row is the document 600 | index, and the value is the number of times that word appears in that 601 | document. 
602 | 603 | >>> import numpy.linalg as nl 604 | >>> vocab = {19: 'shuttle', 5: 'astronomy', 7: 'cold', 3: 'hot'} 605 | >>> word_indices = np.zeros(50).astype('int32') 606 | >>> word_indices[:25] = 19 # 'Shuttle' shows 25 times 607 | >>> word_indices[25:35] = 5 # 'astronomy' is in 10 times 608 | >>> word_indices[40:46] = 7 # 'cold' is in 6 times 609 | >>> word_indices[46:] = 3 # 'hot' is in 3 times 610 | >>> corpus = Corpus() 611 | >>> corpus.update_word_count(word_indices) 612 | >>> corpus.finalize() 613 | >>> v = corpus.compact_to_bow(word_indices) 614 | >>> len(v) 615 | 20 616 | >>> v[:6] 617 | array([ 5, 0, 0, 4, 0, 10]) 618 | >>> v[19] 619 | 25 620 | >>> v.sum() 621 | 50 622 | >>> words = [[0, 0, 0, 3, 4], [1, 1, 1, 4, 5]] 623 | >>> words = np.array(words) 624 | >>> bow = corpus.compact_to_bow(words) 625 | >>> bow.shape 626 | (2, 6) 627 | """ 628 | if max_compact_index is None: 629 | max_compact_index = word_compact.max() 630 | 631 | def bincount(x): 632 | return np.bincount(x, minlength=max_compact_index + 1) 633 | axis = len(word_compact.shape) - 1 634 | bow = np.apply_along_axis(bincount, axis, word_compact) 635 | return bow 636 | 637 | def compact_to_coocurrence(self, word_compact, indices, window_size=10): 638 | """ From an array of compact tokens and aligned array of document indices 639 | compute (word, word, document) co-occurrences within a moving window. 640 | 641 | Arguments 642 | --------- 643 | word_compact: int array 644 | Sequence of tokens. 645 | 646 | indices: dict of int arrays 647 | Each array in this dictionary should represent the document index it 648 | came from. 649 | 650 | window_size: int 651 | Indicates the moving window size around which all co-occurrences will 652 | be computed. 653 | 654 | Returns 655 | ------- 656 | counts : DataFrame 657 | Returns a DataFrame with two columns for word index A and B, 658 | one extra column for each document index, and a final column for counts 659 | in that key. 
660 | 661 | >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0]) 662 | >>> doc_idx = np.array([0, 0, 0, 0, 1, 1, 1, 1]) 663 | >>> corpus = Corpus() 664 | >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx}) 665 | >>> counts.counts.sum() 666 | 24 667 | >>> counts.query('doc == 0').counts.values 668 | array([3, 3, 6]) 669 | >>> compact = np.array([0, 1, 1, 1, 2, 2, 3, 0]) 670 | >>> doc_idx = np.array([0, 0, 0, 1, 1, 2, 2, 2]) 671 | >>> corpus = Corpus() 672 | >>> counts = corpus.compact_to_coocurrence(compact, {'doc': doc_idx}) 673 | >>> counts.counts.sum() 674 | 14 675 | >>> counts.query('doc == 0').word_index_x.values 676 | array([0, 1, 1]) 677 | >>> counts.query('doc == 0').word_index_y.values 678 | array([1, 0, 1]) 679 | >>> counts.query('doc == 0').counts.values 680 | array([2, 2, 2]) 681 | >>> counts.query('doc == 1').counts.values 682 | array([1, 1]) 683 | """ 684 | tokens = pd.DataFrame(dict(word_index=word_compact)).reset_index() 685 | for name, index in indices.items(): 686 | tokens[name] = index 687 | a, b = tokens.copy(), tokens.copy() 688 | 689 | def mask(x): return np.prod([x[k + '_x'] == x[k + '_y'] 690 | for k in indices.keys()], axis=0) 691 | group_keys = ['word_index_x', 'word_index_y', ] 692 | group_keys += [k + '_x' for k in indices.keys()] 693 | total = [] 694 | a['frame'] = a['index'].copy() 695 | for frame in range(-window_size, window_size + 1): 696 | if frame == 0: 697 | continue 698 | b['frame'] = b['index'] + frame 699 | matches = (a.merge(b, on='frame') 700 | .assign(same_doc=mask) 701 | .pipe(lambda df: df[df['same_doc'] == 1]) 702 | .groupby(group_keys)['frame'] 703 | .count() 704 | .reset_index()) 705 | total.append(matches) 706 | counts = (pd.concat(total) 707 | .groupby(group_keys)['frame'] 708 | .sum() 709 | .reset_index() 710 | .rename(columns={k + '_x': k for k in indices.keys()}) 711 | .rename(columns=dict(frame='counts'))) 712 | return counts 713 | 714 | 715 | def fast_replace(data, keys, values, skip_checks=False): 716 | """ Do a search-and-replace in array `data`. 717 | 718 | Arguments 719 | --------- 720 | data : int array 721 | Array of integers 722 | keys : int array 723 | Array of keys inside of `data` to be replaced 724 | values : int array 725 | Array of values that replace the `keys` array 726 | skip_checks : bool, default=False 727 | Optionally skip sanity checking the input. 728 | 729 | Examples 730 | -------- 731 | >>> fast_replace(np.arange(5), np.arange(5), np.arange(5)[::-1]) 732 | array([4, 3, 2, 1, 0]) 733 | """ 734 | assert np.allclose(keys.shape, values.shape) 735 | if not skip_checks: 736 | msg = "data has elements not in keys" 737 | assert data.max() <= keys.max(), msg 738 | sdx = np.argsort(keys) 739 | keys, values = keys[sdx], values[sdx] 740 | idx = np.digitize(data, keys, right=True) 741 | new_data = values[idx] 742 | return new_data 743 | -------------------------------------------------------------------------------- /lda2vec/dirichlet_likelihood.py: -------------------------------------------------------------------------------- 1 | import chainer.functions as F 2 | from chainer import Variable 3 | 4 | 5 | def dirichlet_likelihood(weights, alpha=None): 6 | """ Calculate the log likelihood of the observed topic proportions. 7 | The return value is the negative of the (unnormalized) log likelihood, so lower values indicate more probable topic proportions. 8 | 9 | Args: 10 | weights (chainer.Variable): Unnormalized weight vector. The vector 11 | will be passed through a softmax function that will map the input 12 | onto a probability simplex.
13 | alpha (float): The Dirichlet concentration parameter. Alpha 14 | greater than 1.0 results in very dense topic weights such 15 | that each document belongs to many topics. Alpha < 1.0 results 16 | in sparser topic weights. The default is to set alpha to 17 | 1.0 / n_topics, effectively enforcing the prior belief that a 18 | document belongs to only a few topics at once. 19 | 20 | Returns: 21 | ~chainer.Variable: Output loss variable. 22 | """ 23 | if type(weights) is Variable: 24 | n_topics = weights.data.shape[1] 25 | else: 26 | n_topics = weights.W.data.shape[1] 27 | if alpha is None: 28 | alpha = 1.0 / n_topics 29 | if type(weights) is Variable: 30 | log_proportions = F.log_softmax(weights) 31 | else: 32 | log_proportions = F.log_softmax(weights.W) 33 | loss = (alpha - 1.0) * log_proportions 34 | return -F.sum(loss) 35 | -------------------------------------------------------------------------------- /lda2vec/embed_mixture.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | import chainer 4 | import chainer.links as L 5 | import chainer.functions as F 6 | from chainer import Variable 7 | 8 | 9 | def _orthogonal_matrix(shape): 10 | # Stolen from blocks: 11 | # github.com/mila-udem/blocks/blob/master/blocks/initialization.py 12 | M1 = np.random.randn(shape[0], shape[0]) 13 | M2 = np.random.randn(shape[1], shape[1]) 14 | 15 | # QR decomposition of matrix with entries in N(0, 1) is random 16 | Q1, R1 = np.linalg.qr(M1) 17 | Q2, R2 = np.linalg.qr(M2) 18 | # Correct that NumPy doesn't force diagonal of R to be non-negative 19 | Q1 = Q1 * np.sign(np.diag(R1)) 20 | Q2 = Q2 * np.sign(np.diag(R2)) 21 | 22 | n_min = min(shape[0], shape[1]) 23 | return np.dot(Q1[:, :n_min], Q2[:n_min, :]) 24 | 25 | 26 | class EmbedMixture(chainer.Chain): 27 | r""" A single document is encoded as a multinomial mixture of latent topics. 28 | The mixture is defined on the simplex, so that mixture weights always sum 29 | to 100%. The latent topic vectors resemble word vectors whose elements are 30 | defined over all real numbers. 31 | 32 | For example, a single document mix may be :math:`[0.9, 0.1]`, indicating 33 | that it is 90% in the first topic, 10% in the second. An example topic 34 | vector looks like :math:`[1.5e1, -1.3e0, +3.4e0, -0.2e0]`, which is 35 | largely uninterpretable until you measure the words most similar to this 36 | topic vector. 37 | 38 | A single document vector :math:`\vec{e}` is composed as weights :math:`c_j` 39 | over topic vectors :math:`\vec{T_j}`: 40 | 41 | .. math:: 42 | 43 | \vec{e}=\Sigma_{j=0}^{j=n\_topics}c_j\vec{T_j} 44 | 45 | This is usually paired with regularization on the weights :math:`c_j`. 46 | If using a Dirichlet prior with low alpha, these weights will be sparse. 47 | 48 | Args: 49 | n_documents (int): Total number of documents 50 | n_topics (int): Number of topics per document 51 | n_dim (int): Number of dimensions per topic vector (should match word 52 | vector size) 53 | 54 | Attributes: 55 | weights : chainer.links.EmbedID 56 | Unnormalized topic weights (:math:`c_j`). To normalize these 57 | weights, use `F.softmax(weights)`. 58 | factors : chainer.links.Parameter 59 | Topic vector matrix (:math:`T_j`) 60 | 61 | ..
seealso:: :func:`lda2vec.dirichlet_likelihood` 62 | """ 63 | 64 | def __init__(self, n_documents, n_topics, n_dim, dropout_ratio=0.2, 65 | temperature=1.0): 66 | self.n_documents = n_documents 67 | self.n_topics = n_topics 68 | self.n_dim = n_dim 69 | self.dropout_ratio = dropout_ratio 70 | factors = _orthogonal_matrix((n_topics, n_dim)).astype('float32') 71 | factors /= np.sqrt(n_topics + n_dim) 72 | super(EmbedMixture, self).__init__( 73 | weights=L.EmbedID(n_documents, n_topics), 74 | factors=L.Parameter(factors)) 75 | self.temperature = temperature 76 | self.weights.W.data[...] /= np.sqrt(n_documents + n_topics) 77 | 78 | def __call__(self, doc_ids, update_only_docs=False): 79 | """ Given an array of document integer indices, returns a vector 80 | for each document. The vector is composed of topic weights projected 81 | onto topic vectors. 82 | 83 | Args: 84 | doc_ids : chainer.Variable 85 | One-dimensional batch vectors of IDs 86 | 87 | Returns: 88 | doc_vector : chainer.Variable 89 | Batch of two-dimensional embeddings for every document. 90 | """ 91 | # (batchsize, ) --> (batchsize, multinomial) 92 | proportions = self.proportions(doc_ids, softmax=True) 93 | # (batchsize, n_factors) * (n_factors, n_dim) --> (batchsize, n_dim) 94 | factors = F.dropout(self.factors(), ratio=self.dropout_ratio) 95 | if update_only_docs: 96 | factors.unchain_backward() 97 | w_sum = F.matmul(proportions, factors) 98 | return w_sum 99 | 100 | def proportions(self, doc_ids, softmax=False): 101 | """ Given an array of document indices, return a vector 102 | for each document of just the unnormalized topic weights. 103 | 104 | Returns: 105 | doc_weights : chainer.Variable 106 | Two dimensional topic weights of each document. 107 | """ 108 | w = self.weights(doc_ids) 109 | if softmax: 110 | size = w.data.shape 111 | mask = self.xp.random.random_integers(0, 1, size=size) 112 | y = (F.softmax(w * self.temperature) * 113 | Variable(mask.astype('float32'))) 114 | norm, y = F.broadcast(F.expand_dims(F.sum(y, axis=1), 1), y) 115 | return y / (norm + 1e-7) 116 | else: 117 | return w 118 | -------------------------------------------------------------------------------- /lda2vec/fake_data.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from numpy.random import random_sample 3 | 4 | 5 | def orthogonal_matrix(shape): 6 | # Stolen from blocks: 7 | # github.com/mila-udem/blocks/blob/master/blocks/initialization.py 8 | M1 = np.random.randn(shape[0], shape[0]) 9 | M2 = np.random.randn(shape[1], shape[1]) 10 | 11 | # QR decomposition of matrix with entries in N(0, 1) is random 12 | Q1, R1 = np.linalg.qr(M1) 13 | Q2, R2 = np.linalg.qr(M2) 14 | # Correct that NumPy doesn't force diagonal of R to be non-negative 15 | Q1 = Q1 * np.sign(np.diag(R1)) 16 | Q2 = Q2 * np.sign(np.diag(R2)) 17 | 18 | n_min = min(shape[0], shape[1]) 19 | return np.dot(Q1[:, :n_min], Q2[:n_min, :]) 20 | 21 | 22 | def softmax(w): 23 | # https://gist.github.com/stober/1946926 24 | w = np.array(w) 25 | maxes = np.amax(w, axis=1) 26 | maxes = maxes.reshape(maxes.shape[0], 1) 27 | e = np.exp(w - maxes) 28 | dist = e / np.sum(e, axis=1)[:, None] 29 | return dist 30 | 31 | 32 | def sample(values, probabilities, size): 33 | assert np.allclose(np.sum(probabilities, axis=-1), 1.0) 34 | bins = np.add.accumulate(probabilities) 35 | return values[np.digitize(random_sample(size), bins)] 36 | 37 | 38 | def fake_data(n_docs, n_words, n_sent_length, n_topics): 39 | """ Generate latent topic vectors for words 
and documents 40 | and then for each document, draw a sentence. Draw each word 41 | document with probability proportional to the dot product and 42 | normalized with a softmax. 43 | 44 | Arguments 45 | --------- 46 | n_docs : int 47 | Number of documents 48 | n_words : int 49 | Number of words in the vocabulary 50 | n_sent_length : int 51 | Number of words to draw for each document 52 | n_topics : int 53 | Number of topics that a single document can belong to. 54 | 55 | Returns 56 | ------- 57 | sentences : int array 58 | Array of word indices of shape (n_docs, n_sent_length). 59 | 60 | """ 61 | # These are log ratios for the doc & word topics 62 | doc_topics = orthogonal_matrix([n_docs, n_topics]) 63 | wrd_topics = orthogonal_matrix([n_topics, n_words]) 64 | # Multiply log ratios and softmax to get prob of word in doc 65 | doc_to_wrds = softmax(np.dot(doc_topics, wrd_topics)) 66 | # Now sample from doc_to_wrd to get realizations 67 | indices = np.arange(n_words).astype('int32') 68 | sentences = [] 69 | for doc_to_wrd in doc_to_wrds: 70 | words = sample(indices, doc_to_wrd, n_sent_length) 71 | sentences.append(words) 72 | sentences = np.array(sentences) 73 | return sentences.astype('int32') 74 | -------------------------------------------------------------------------------- /lda2vec/lda2vec.py: -------------------------------------------------------------------------------- 1 | from .embed_mixture import EmbedMixture 2 | from .dirichlet_likelihood import dirichlet_likelihood 3 | from .utils import move 4 | 5 | from chainer import Chain 6 | import chainer.links as L 7 | import chainer.functions as F 8 | 9 | import numpy as np 10 | 11 | 12 | class LDA2Vec(Chain): 13 | def __init__(self, n_documents=100, n_document_topics=10, 14 | n_units=256, n_vocab=1000, dropout_ratio=0.5, train=True, 15 | counts=None, n_samples=15, word_dropout_ratio=0.0, 16 | power=0.75, temperature=1.0): 17 | em = EmbedMixture(n_documents, n_document_topics, n_units, 18 | dropout_ratio=dropout_ratio, temperature=temperature) 19 | kwargs = {} 20 | kwargs['mixture'] = em 21 | #self.mixture = em 22 | kwargs['sampler'] = L.NegativeSampling(n_units, counts, n_samples, 23 | power=power) 24 | super(LDA2Vec, self).__init__(**kwargs) 25 | rand = np.random.random(self.sampler.W.data.shape) 26 | self.sampler.W.data[:, :] = rand[:, :] 27 | self.n_units = n_units 28 | self.train = train 29 | self.dropout_ratio = dropout_ratio 30 | self.word_dropout_ratio = word_dropout_ratio 31 | self.n_samples = n_samples 32 | 33 | def prior(self): 34 | dl1 = dirichlet_likelihood(self.mixture.weights) 35 | return dl1 36 | 37 | def fit_partial(self, rdoc_ids, rword_indices, window=5, 38 | update_only_docs=False): 39 | doc_ids, word_indices = move(self.xp, rdoc_ids, rword_indices) 40 | pivot_idx = next(move(self.xp, rword_indices[window: -window])) 41 | pivot = F.embed_id(pivot_idx, self.sampler.W) 42 | if update_only_docs: 43 | pivot.unchain_backward() 44 | doc_at_pivot = rdoc_ids[window: -window] 45 | doc = self.mixture(next(move(self.xp, doc_at_pivot)), 46 | update_only_docs=update_only_docs) 47 | loss = 0.0 48 | start, end = window, rword_indices.shape[0] - window 49 | context = (F.dropout(doc, self.dropout_ratio) + 50 | F.dropout(pivot, self.dropout_ratio)) 51 | for frame in range(-window, window + 1): 52 | # Skip predicting the current pivot 53 | if frame == 0: 54 | continue 55 | # Predict word given context and pivot word 56 | # The target starts before the pivot 57 | targetidx = rword_indices[start + frame: end + frame] 58 | doc_at_target = 
rdoc_ids[start + frame: end + frame] 59 | doc_is_same = doc_at_target == doc_at_pivot 60 | rand = np.random.uniform(0, 1, doc_is_same.shape[0]) 61 | mask = (rand > self.word_dropout_ratio).astype('bool') 62 | weight = np.logical_and(doc_is_same, mask).astype('int32') 63 | # If weight is 1.0 then targetidx 64 | # If weight is 0.0 then -1 65 | targetidx = targetidx * weight + -1 * (1 - weight) 66 | target, = move(self.xp, targetidx) 67 | loss = self.sampler(context, target) 68 | loss.backward() 69 | if update_only_docs: 70 | # Wipe out any gradient accumulation on word vectors 71 | self.sampler.W.grad *= 0.0 72 | return loss.data 73 | -------------------------------------------------------------------------------- /lda2vec/negative_sampling.py: -------------------------------------------------------------------------------- 1 | import numpy 2 | import six 3 | 4 | from chainer import cuda 5 | from chainer import function 6 | from chainer.utils import type_check 7 | 8 | 9 | class NegativeSamplingFunction(function.Function): 10 | 11 | ignore_label = -1 12 | 13 | def __init__(self, sampler, sample_size): 14 | self.sampler = sampler 15 | self.sample_size = sample_size 16 | 17 | def _make_samples(self, t): 18 | if hasattr(self, 'samples'): 19 | return self.samples # for testing 20 | 21 | size = int(t.shape[0]) 22 | # first one is the positive, and others are sampled negatives 23 | samples = self.sampler((size, self.sample_size + 1)) 24 | samples[:, 0] = t 25 | self.samples = samples 26 | 27 | def check_type_forward(self, in_types): 28 | type_check.expect(in_types.size() == 3) 29 | x_type, t_type, w_type = in_types 30 | 31 | type_check.expect( 32 | x_type.dtype == numpy.float32, 33 | x_type.ndim == 2, 34 | t_type.dtype == numpy.int32, 35 | t_type.ndim == 1, 36 | x_type.shape[0] == t_type.shape[0], 37 | w_type.dtype == numpy.float32, 38 | w_type.ndim == 2, 39 | ) 40 | 41 | def forward_cpu(self, inputs): 42 | x, t, W = inputs 43 | self.ignore_mask = (t != self.ignore_label) 44 | self._make_samples(t) 45 | 46 | loss = numpy.float32(0.0) 47 | for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask], 48 | self.samples[self.ignore_mask])): 49 | w = W[k] 50 | f = w.dot(ix) 51 | f[0] *= -1 # positive sample 52 | loss += numpy.sum(numpy.logaddexp(f, 0)) 53 | return numpy.array(loss, numpy.float32), 54 | 55 | def forward_gpu(self, inputs): 56 | x, t, W = inputs 57 | self.ignore_mask = (t != self.ignore_label) 58 | n_in = x.shape[1] 59 | self._make_samples(t) 60 | 61 | self.wx = cuda.elementwise( 62 | 'raw T W, raw T x, bool mask, S k, int32 c, int32 m', 'T wx', 63 | ''' 64 | T f = 0; 65 | if (mask == 1){ 66 | for (int j = 0; j < c; ++j) { 67 | int x_ind[] = {(i / m), j}; 68 | int w_ind[] = {k, j}; 69 | f += x[x_ind] * W[w_ind]; 70 | } 71 | } 72 | wx = f; 73 | ''', 74 | 'negative_sampling_wx' 75 | )(W, x, self.ignore_mask[:, None], self.samples, n_in, 76 | self.sample_size + 1) 77 | 78 | y = cuda.elementwise( 79 | 'T wx, int32 c, int32 m', 'T y', 80 | ''' 81 | T f = wx; 82 | if (i % m == 0) { 83 | f = -f; 84 | } 85 | T loss; 86 | if (f < 0) { 87 | loss = __logf(1 + __expf(f)); 88 | } else { 89 | loss = f + __logf(1 + __expf(-f)); 90 | } 91 | y = loss; 92 | ''', 93 | 'negative_sampling_forward' 94 | )(self.wx, n_in, self.sample_size + 1) 95 | # TODO(okuta): merge elementwise 96 | loss = cuda.cupy.sum(y * self.ignore_mask[:, None].astype('float32')) 97 | return loss, 98 | 99 | def backward_cpu(self, inputs, grads): 100 | x, t, W = inputs 101 | gloss, = grads 102 | 103 | gx = numpy.zeros_like(x) 104 | gW = 
numpy.zeros_like(W) 105 | for i, (ix, k) in enumerate(six.moves.zip(x[self.ignore_mask], 106 | self.samples[self.ignore_mask])): 107 | w = W[k] 108 | f = w.dot(ix) 109 | 110 | # g == -y * gloss / (1 + exp(yf)) 111 | f[0] *= -1 112 | g = gloss / (1 + numpy.exp(-f)) 113 | g[0] *= -1 114 | 115 | gx[i] = g.dot(w) 116 | for ik, ig in six.moves.zip(k, g): 117 | gW[ik] += ig * ix 118 | return gx, None, gW 119 | 120 | def backward_gpu(self, inputs, grads): 121 | cupy = cuda.cupy 122 | x, t, W = inputs 123 | gloss, = grads 124 | 125 | n_in = x.shape[1] 126 | g = cuda.elementwise( 127 | 'T wx, raw T gloss, int32 m', 'T g', 128 | ''' 129 | T y; 130 | if (i % m == 0) { 131 | y = 1; 132 | } else { 133 | y = -1; 134 | } 135 | 136 | g = -y * gloss[0] / (1.0f + __expf(wx * y)); 137 | ''', 138 | 'negative_sampling_calculate_g' 139 | )(self.wx, gloss, self.sample_size + 1) 140 | gx = cupy.zeros_like(x) 141 | cuda.elementwise( 142 | 'raw T g, raw T W, bool mask, raw S k, int32 c, int32 m', 'T gx', 143 | ''' 144 | int d = i / c; 145 | T w = 0; 146 | if (mask == 1){ 147 | for (int j = 0; j < m; ++j) { 148 | w += g[d * m + j] * W[k[d * m + j] * c + i % c]; 149 | } 150 | } 151 | gx = w; 152 | ''', 153 | 'negative_sampling_calculate_gx' 154 | )(g, W, self.ignore_mask[:, None], self.samples, n_in, 155 | self.sample_size + 1, gx) 156 | gW = cupy.zeros_like(W) 157 | cuda.elementwise( 158 | 'T g, raw T x, S k, bool mask, int32 c, int32 m', 159 | 'raw T gW', 160 | ''' 161 | T gi = g; 162 | if (mask == 1) { 163 | for (int j = 0; j < c; ++j) { 164 | atomicAdd(&gW[k * c + j], gi * x[(i / m) * c + j]); 165 | } 166 | } 167 | ''', 168 | 'negative_sampling_calculate_gw' 169 | )(g, x, self.samples, self.ignore_mask[:, None], n_in, 170 | self.sample_size + 1, gW) 171 | return gx, None, gW 172 | 173 | 174 | def negative_sampling(x, t, W, sampler, sample_size): 175 | """Negative sampling loss function. 176 | 177 | In natural language processing, especially language modeling, the number of 178 | words in a vocabulary can be very large. 179 | Therefore, you need to spend a lot of time calculating the gradient of the 180 | embedding matrix. 181 | 182 | By using the negative sampling trick you only need to calculate the 183 | gradient for a few sampled negative examples. 184 | 185 | The objective function is below: 186 | 187 | .. math:: 188 | 189 | f(x, p) = \\log \\sigma(x^\\top w_p) + \\ 190 | k E_{i \\sim P(i)}[\\log \\sigma(- x^\\top w_i)], 191 | 192 | where :math:`\sigma(\cdot)` is a sigmoid function, :math:`w_i` is the 193 | weight vector for the word :math:`i`, and :math:`p` is a positive example. 194 | It is approximeted with :math:`k` examples :math:`N` sampled from 195 | probability :math:`P(i)`, like this: 196 | 197 | .. math:: 198 | 199 | f(x, p) \\approx \\log \\sigma(x^\\top w_p) + \\ 200 | \\sum_{n \\in N} \\log \\sigma(-x^\\top w_n). 201 | 202 | Each sample of :math:`N` is drawn from the word distribution :math:`P(w)`. 203 | This is calculated as :math:`P(w) = \\frac{1}{Z} c(w)^\\alpha`, where 204 | :math:`c(w)` is the unigram count of the word :math:`w`, :math:`\\alpha` is 205 | a hyper-parameter, and :math:`Z` is the normalization constant. 206 | 207 | Args: 208 | x (~chainer.Variable): Batch of input vectors. 209 | t (~chainer.Variable): Vector of groundtruth labels. 210 | W (~chainer.Variable): Weight matrix. 211 | sampler (function): Sampling function. It takes a shape and returns an 212 | integer array of the shape. Each element of this array is a sample 213 | from the word distribution. 
A :class:`~chainer.utils.WalkerAlias` 214 | object built with the power distribution of word frequency is 215 | recommended. 216 | sample_size (int): Number of samples. 217 | 218 | See: `Distributed Representations of Words and Phrases and their\ 219 | Compositionality `_ 220 | 221 | .. seealso:: :class:`~chainer.links.NegativeSampling`. 222 | 223 | """ 224 | return NegativeSamplingFunction(sampler, sample_size)(x, t, W) 225 | 226 | 227 | # Monkey-patch the chainer code to replace the negative sampling 228 | # with the one used here 229 | import chainer.links as L 230 | import chainer.functions as F 231 | negative_sampling.patched = True 232 | L.NegativeSampling.negative_sampling = negative_sampling 233 | F.negative_sampling = negative_sampling 234 | -------------------------------------------------------------------------------- /lda2vec/preprocess.py: -------------------------------------------------------------------------------- 1 | import spacy 2 | import numpy as np 3 | import en_core_web_md as en 4 | from tqdm import tqdm_notebook as progress 5 | from spacy.attrs import LOWER, LIKE_EMAIL, LIKE_URL 6 | import warnings 7 | 8 | warnings.simplefilter("ignore") 9 | 10 | 11 | def tokenize(texts, max_length, skip=-2, attr=LOWER, merge=False, nlp=None, 12 | **kwargs): 13 | """ Uses spaCy to quickly tokenize text and return an array 14 | of indices. 15 | 16 | Loading the spaCy model can take up to a minute the first time this 17 | is called; pass an existing `nlp` object to avoid reloading the model 18 | on later calls. 19 | 20 | Parameters 21 | ---------- 22 | texts : list of unicode strings 23 | These are the input documents. There can be multiple sentences per 24 | item in the list. 25 | max_length : int 26 | This is the maximum number of words per document. If the document is 27 | shorter than this number it will be padded to this length. 28 | skip : int, optional 29 | Short documents will be padded with this variable up until max_length. 30 | attr : int, from spacy.attrs 31 | What to transform the token to. Choice must be in spacy.attrs, and 32 | common choices are (LOWER, LEMMA) 33 | merge : bool, optional 34 | Merge noun phrases into a single token. Useful for turning 'New York' 35 | into a single token. 36 | nlp : spaCy NLP object, optional 37 | A spaCy NLP object. Useful for not reinstantiating the object multiple 38 | times. 39 | kwargs : dict, optional 40 | Any further arguments will be sent to the spaCy tokenizer. For extra 41 | speed consider setting tag=False, parse=False, entity=False, or 42 | n_threads=8. 43 | 44 | Returns 45 | ------- 46 | arr : 2D array of ints 47 | Has shape (len(texts), max_length). Each value represents 48 | the word index. 49 | vocab : dict 50 | Keys are the word index, and values are the string.
The pad index gets 51 | mapped to None 52 | 53 | >>> sents = [u"Do you recall a class action lawsuit", u"hello zombo.com"] 54 | >>> arr, vocab = tokenize(sents, 10, merge=True) 55 | >>> arr.shape[0] 56 | 2 57 | >>> arr.shape[1] 58 | 10 59 | >>> w2i = {w: i for i, w in vocab.iteritems()} 60 | >>> arr[0, 0] == w2i[u'do'] # First word and its index should match 61 | True 62 | >>> arr[0, 1] == w2i[u'you'] 63 | True 64 | >>> arr[0, -1] # last word in 0th document is a pad word 65 | -2 66 | >>> arr[0, 4] == w2i[u'class action lawsuit'] # noun phrase is tokenized 67 | True 68 | >>> arr[1, 1] # The URL token is thrown out 69 | -2 70 | """ 71 | if nlp is None: 72 | nlp = en.load() 73 | data = np.zeros((len(texts), max_length), dtype='int32') 74 | data[:] = skip 75 | bad_deps = ('amod', 'compound') 76 | token_list = [] 77 | vocab = {} 78 | index = 0 79 | for row, doc in progress(enumerate(nlp.pipe(texts, **kwargs))): 80 | if merge: 81 | for phrase in doc.noun_chunks: 82 | while len(phrase) > 1 and phrase[0].dep_ not in bad_deps: 83 | phrase = phrase[1:] 84 | if len(phrase) > 1: 85 | phrase.merge(phrase.root.tag_, phrase.text, 86 | phrase.root.ent_type_) 87 | for ent in doc.ents: 88 | if len(ent) > 1: 89 | ent.merge(ent.root.tag_, ent.text, ent.label_) 90 | 91 | dat = doc.to_array([LOWER, LIKE_EMAIL, LIKE_URL]).astype("int32") 92 | for i, token in enumerate(doc): 93 | text = token.text.lower() 94 | if text not in list(vocab.values()): 95 | dat[i][0] = index 96 | vocab[index] = text 97 | index += 1 98 | else: 99 | for k, v in vocab.items(): 100 | if v == text: 101 | value = k 102 | break 103 | dat[i][0] = value 104 | if len(dat) > 0: 105 | msg = "Negative indices reserved for special tokens" 106 | assert dat.min() >= 0, msg 107 | idx = (dat[:, 1] > 0) | (dat[:, 2] > 0) 108 | dat[idx] = skip 109 | length = min(len(dat), max_length) 110 | data[row, :length] = dat[:length, 0].ravel() 111 | 112 | vocab[skip] = '' 113 | return data, vocab 114 | 115 | 116 | if __name__ == "__main__": 117 | import doctest 118 | doctest.testmod() 119 | -------------------------------------------------------------------------------- /lda2vec/topics.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import requests 3 | import multiprocessing 4 | 5 | 6 | def _softmax(x): 7 | e_x = np.exp(x - np.max(x)) 8 | out = e_x / e_x.sum() 9 | return out 10 | 11 | 12 | def _softmax_2d(x): 13 | y = x - x.max(axis=1, keepdims=True) 14 | np.exp(y, out=y) 15 | y /= y.sum(axis=1, keepdims=True) 16 | return y 17 | 18 | 19 | def prob_words(context, vocab, temperature=1.0): 20 | """ This calculates a softmax over the vocabulary as a function 21 | of the dot product of context and word. 22 | """ 23 | dot = np.dot(vocab, context) 24 | prob = _softmax(dot / temperature) 25 | return prob 26 | 27 | 28 | def prepare_topics(weights, factors, word_vectors, vocab, temperature=1.0, 29 | doc_lengths=None, term_frequency=None, normalize=False): 30 | """ Collects a dictionary of word, document and topic distributions. 31 | 32 | Arguments 33 | --------- 34 | weights : float array 35 | This must be an array of unnormalized log-odds of document-to-topic 36 | weights. Shape should be [n_documents, n_topics] 37 | factors : float array 38 | Should be an array of topic vectors. These topic vectors live in the 39 | same space as word vectors and will be used to find the most similar 40 | words to each topic. Shape should be [n_topics, n_dim]. 
41 | word_vectors : float array 42 | This must be a matrix of word vectors. Should be of shape 43 | [n_words, n_dim] 44 | vocab : list of str 45 | These must be the strings for words corresponding to 46 | indices [0, n_words] 47 | temperature : float 48 | Used to calculate the log probability of a word. Higher 49 | temperatures make more rare words more likely. 50 | doc_lengths : int array 51 | An array indicating the number of words in the nth document. 52 | Must be of shape [n_documents]. Required by pyLDAvis. 53 | term_frequency : int array 54 | An array indicating the overall number of times each token appears 55 | in the corpus. Must be of shape [n_words]. Required by pyLDAvis. 56 | 57 | Returns 58 | ------- 59 | data : dict 60 | This dictionary is readily consumed by pyLDAVis for topic 61 | visualization. 62 | """ 63 | # Map each factor vector to a word 64 | topic_to_word = [] 65 | msg = "Vocabulary size did not match size of word vectors" 66 | assert len(vocab) == word_vectors.shape[0], msg 67 | if normalize: 68 | word_vectors /= np.linalg.norm(word_vectors, axis=1)[:, None] 69 | # factors = factors / np.linalg.norm(factors, axis=1)[:, None] 70 | for factor_vector in factors: 71 | factor_to_word = prob_words(factor_vector, word_vectors, 72 | temperature=temperature) 73 | topic_to_word.append(np.ravel(factor_to_word)) 74 | topic_to_word = np.array(topic_to_word) 75 | msg = "Not all rows in topic_to_word sum to 1" 76 | assert np.allclose(np.sum(topic_to_word, axis=1), 1), msg 77 | # Collect document-to-topic distributions, e.g. theta 78 | doc_to_topic = _softmax_2d(weights) 79 | msg = "Not all rows in doc_to_topic sum to 1" 80 | assert np.allclose(np.sum(doc_to_topic, axis=1), 1), msg 81 | data = {'topic_term_dists': topic_to_word, 82 | 'doc_topic_dists': doc_to_topic, 83 | 'doc_lengths': doc_lengths, 84 | 'vocab': vocab, 85 | 'term_frequency': term_frequency} 86 | return data 87 | 88 | 89 | def print_top_words_per_topic(data, top_n=10, do_print=True): 90 | """ Given a pyLDAvis data array, print out the top words in every topic. 91 | 92 | Arguments 93 | --------- 94 | data : dict 95 | A dict object that summarizes topic data and has been made using 96 | `prepare_topics`. 97 | """ 98 | msgs = [] 99 | lists = [] 100 | for j, topic_to_word in enumerate(data['topic_term_dists']): 101 | top = np.argsort(topic_to_word)[::-1][:top_n] 102 | prefix = "Top words in topic %i " % j 103 | top_words = [data['vocab'][i].strip().replace(' ', '_') for i in top] 104 | msg = ' '.join(top_words) 105 | if do_print: 106 | print(prefix + msg) 107 | lists.append(top_words) 108 | return lists 109 | 110 | 111 | def get_request(url): 112 | for _ in range(5): 113 | try: 114 | return float(requests.get(url).text) 115 | except: 116 | pass 117 | return None 118 | 119 | 120 | def topic_coherence(lists, services=['ca', 'cp', 'cv', 'npmi', 'uci', 121 | 'umass']): 122 | """ Requests the topic coherence from AKSW Palmetto 123 | 124 | Arguments 125 | --------- 126 | lists : list of lists 127 | A list of lists with one list of top words for each topic. 
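services : list of str, optional Which Palmetto coherence measures to request; the default asks for all of 'ca', 'cp', 'cv', 'npmi', 'uci', and 'umass'. Returns ------- ans : dict Maps each (topic index, service) pair to the coherence value returned by the service, or None if the request repeatedly failed.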
128 | 129 | >>> topic_words = [['cake', 'apple', 'banana', 'cherry', 'chocolate']] 130 | >>> topic_coherence(topic_words, services=['cv']) 131 | {(0, 'cv'): 0.5678879445677241} 132 | """ 133 | url = u'http://palmetto.aksw.org/palmetto-webapp/service/{}?words={}' 134 | reqs = [url.format(s, '%20'.join(top[:10])) 135 | for s in services for top in lists] 136 | pool = multiprocessing.Pool() 137 | coherences = pool.map(get_request, reqs) 138 | pool.close() 139 | pool.terminate() 140 | pool.join() 141 | del pool 142 | args = [(j, s, top) for s in services for j, top in enumerate(lists)] 143 | ans = {} 144 | for ((j, s, t), tc) in zip(args, coherences): 145 | ans[(j, s)] = tc 146 | return ans 147 | -------------------------------------------------------------------------------- /lda2vec/tracking.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from sklearn.linear_model import LinearRegression 3 | 4 | 5 | class Tracking: 6 | cache = {} 7 | calls = 0 8 | slope = 0.0 9 | 10 | def __init__(self, n=5000): 11 | """ The tracking class keeps a most recently used cache of values 12 | 13 | Parameters 14 | ---------- 15 | n: int 16 | Number of items to keep. 17 | """ 18 | self.n = n 19 | 20 | def add(self, key, item): 21 | """ Add an item with a particular key to the cache. 22 | 23 | >>> tracker = Tracking() 24 | >>> tracker.add('log_perplexity', 55.6) 25 | >>> tracker.cache['log_perplexity'] 26 | [55.6] 27 | >>> tracker.add('log_perplexity', 55.2) 28 | >>> tracker.add('loss', -12.1) 29 | >>> tracker.cache['log_perplexity'] 30 | [55.6, 55.2] 31 | >>> tracker.cache['loss'] 32 | [-12.1] 33 | """ 34 | if key not in self.cache: 35 | self.cache[key] = [] 36 | self.cache[key].append(item) 37 | if len(self.cache[key]) > self.n: 38 | self.cache[key] = self.cache[key][-self.n:] # keep only the most recent n items 39 | 40 | def stats(self, key): 41 | """ Get the statistics for items with a particular key 42 | 43 | >>> tracker = Tracking() 44 | >>> tracker.add('log_perplexity', 55.6) 45 | >>> tracker.add('log_perplexity', 55.2) 46 | >>> tracker.stats('log_perplexity') 47 | (55.400000000000006, 0.19999999999999929, 0.0) 48 | """ 49 | data = self.cache[key] 50 | mean = np.mean(data) 51 | std = np.std(data) 52 | slope = self.slope 53 | if self.calls % 100 == 0: 54 | lr = LinearRegression() 55 | x = np.arange(len(data)).astype('float32') 56 | lr.fit(x[:, None], np.array(data)) 57 | self.slope = lr.coef_[0] 58 | self.calls += 1 59 | return mean, std, slope 60 | 61 | 62 | if __name__ == "__main__": 63 | import doctest 64 | doctest.testmod() 65 | -------------------------------------------------------------------------------- /lda2vec/utils.py: -------------------------------------------------------------------------------- 1 | from chainer import Variable 2 | import random 3 | import numpy as np 4 | 5 | 6 | def move(xp, *args): 7 | for arg in args: 8 | if 'float' in str(arg.dtype): 9 | yield Variable(xp.asarray(arg, dtype='float32')) 10 | else: 11 | assert 'int' in str(arg.dtype) 12 | yield Variable(xp.asarray(arg, dtype='int32')) 13 | 14 | 15 | def most_similar(embeddings, word_index): 16 | input_vector = embeddings.W[word_index] 17 | similarities = embeddings.W.dot(input_vector) # dot product against every word vector 18 | return similarities 19 | 20 | 21 | def chunks(n, *args): 22 | """Yield successive n-sized chunks from the input arrays.""" 23 | # From stackoverflow question 312443 24 | keypoints = [] 25 | for i in range(0, len(args[0]), n): 26 | keypoints.append((i, i + n)) 27 | random.shuffle(keypoints) 28 | for a, b in keypoints: 29 | yield [arg[a:
b] for arg in args] 30 | 31 | 32 | class MovingAverage(): 33 | def __init__(self, lastn=100): 34 | self.points = np.array([]) 35 | self.lastn = lastn 36 | 37 | def add(self, x): 38 | self.points = np.append(self.points, x) 39 | 40 | def mean(self): 41 | return np.mean(self.points[-self.lastn:]) 42 | 43 | def std(self): 44 | return np.std(self.points[-self.lastn:]) 45 | 46 | def get_stats(self): 47 | return (np.mean(self.points[-self.lastn:]), 48 | np.std(self.points[-self.lastn:])) 49 | -------------------------------------------------------------------------------- /notebooks/dataset.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import logging\n", 10 | "import pickle\n", 11 | "from sklearn.datasets import fetch_20newsgroups\n", 12 | "import numpy as np\n", 13 | "from lda2vec import preprocess, Corpus\n", 14 | "logging.basicConfig()" 15 | ] 16 | }, 17 | { 18 | "cell_type": "code", 19 | "execution_count": 2, 20 | "metadata": {}, 21 | "outputs": [], 22 | "source": [ 23 | "# Fetch data\n", 24 | "remove = ('headers', 'footers', 'quotes')\n", 25 | "texts = fetch_20newsgroups(subset='train', remove=remove).data\n", 26 | "# Remove tokens with these substrings\n", 27 | "bad = set([\"ax>\", '`@(\"', '---', '===', '^^^'])\n", 28 | "\n", 29 | "\n", 30 | "def clean(line):\n", 31 | " return ' '.join(w for w in line.split() if not any(t in w for t in bad))\n", 32 | " " 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 3, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "# Preprocess data\n", 42 | "max_length = 10000 # Limit of 10k words per document\n", 43 | "# Convert to unicode (spaCy only works with unicode)\n", 44 | "texts = [str(clean(d)) for d in texts if len(str(clean(d))) > 0]\n" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 4, 50 | "metadata": {}, 51 | "outputs": [], 52 | "source": [ 53 | "#tokens, vocab = preprocess.tokenize(texts, max_length, merge=False,\n", 54 | "# n_threads=4)\n", 55 | "\n", 56 | "tokens = np.load(\"tokens.npy\")\n", 57 | "vocab = np.load(\"vocab.npy\")" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 5, 63 | "metadata": {}, 64 | "outputs": [], 65 | "source": [ 66 | "vocab = vocab.tolist()" 67 | ] 68 | }, 69 | { 70 | "cell_type": "code", 71 | "execution_count": 6, 72 | "metadata": {}, 73 | "outputs": [], 74 | "source": [ 75 | "corpus = Corpus()\n", 76 | "# Make a ranked list of rare vs frequent words\n", 77 | "corpus.update_word_count(tokens)\n", 78 | "corpus.finalize()\n", 79 | "# The tokenization uses spaCy indices, and so may have gaps\n", 80 | "# between indices for words that aren't present in our dataset.\n", 81 | "# This builds a new compact index\n", 82 | "compact = corpus.to_compact(tokens)\n", 83 | "# Remove extremely rare words\n", 84 | "pruned = corpus.filter_count(compact, min_count=30)\n", 85 | "# Convert the compactified arrays into bag of words arrays\n", 86 | "bow = corpus.compact_to_bow(pruned)\n", 87 | "# Words tend to have power law frequency, so selectively\n", 88 | "# downsample the most prevalent words\n", 89 | "clean = corpus.subsample_frequent(pruned)\n", 90 | "# Now flatten a 2D array of document per row and word position\n", 91 | "# per column to a 1D array of words. 
This will also remove skips\n", 92 | "# and OoV words\n", 93 | "doc_ids = np.arange(pruned.shape[0])\n", 94 | "flattened, (doc_ids,) = corpus.compact_to_flat(pruned, doc_ids)\n" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 7, 100 | "metadata": {}, 101 | "outputs": [], 102 | "source": [ 103 | "assert flattened.min() >= 0\n", 104 | "# Fill in the pretrained word vectors\n", 105 | "n_dim = 300\n", 106 | "fn_wordvc = '../../../../Downloads/vectors/GoogleNews-vectors-negative300.bin'\n", 107 | "vectors, s, f = corpus.compact_word_vectors(vocab, filename=fn_wordvc)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 9, 113 | "metadata": {}, 114 | "outputs": [], 115 | "source": [ 116 | "# Save all of the preprocessed files\n", 117 | "pickle.dump(vocab, open('vocab.pkl', 'wb'))\n", 118 | "pickle.dump(corpus, open('corpus.pkl', 'wb'))\n", 119 | "np.save(\"flattened\", flattened)\n", 120 | "np.save(\"doc_ids\", doc_ids)\n", 121 | "np.save(\"pruned\", pruned)\n", 122 | "np.save(\"bow\", bow)\n", 123 | "np.save(\"vectors\", vectors)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": null, 129 | "metadata": {}, 130 | "outputs": [], 131 | "source": [] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": null, 143 | "metadata": {}, 144 | "outputs": [], 145 | "source": [] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": null, 150 | "metadata": {}, 151 | "outputs": [], 152 | "source": [] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": null, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [] 160 | } 161 | ], 162 | "metadata": { 163 | "kernelspec": { 164 | "display_name": "Python 3", 165 | "language": "python", 166 | "name": "python3" 167 | }, 168 | "language_info": { 169 | "codemirror_mode": { 170 | "name": "ipython", 171 | "version": 3 172 | }, 173 | "file_extension": ".py", 174 | "mimetype": "text/x-python", 175 | "name": "python", 176 | "nbconvert_exporter": "python", 177 | "pygments_lexer": "ipython3", 178 | "version": "3.7.2" 179 | } 180 | }, 181 | "nbformat": 4, 182 | "nbformat_minor": 2 183 | } 184 | -------------------------------------------------------------------------------- /notebooks/lda2vec_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "import os.path\n", 11 | "import pickle\n", 12 | "import time\n", 13 | "import shelve\n", 14 | "\n", 15 | "import chainer\n", 16 | "from chainer import cuda\n", 17 | "from chainer import serializers\n", 18 | "import chainer.optimizers as O\n", 19 | "import numpy as np\n", 20 | "\n", 21 | "from lda2vec import utils\n", 22 | "from lda2vec import prepare_topics, print_top_words_per_topic, topic_coherence\n", 23 | "from lda2vec import LDA2Vec" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 2, 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stdout", 33 | "output_type": "stream", 34 | "text": [ 35 | "Using GPU:0\n" 36 | ] 37 | } 38 | ], 39 | "source": [ 40 | "gpu_id = int(os.getenv('CUDA_GPU', 0))\n", 41 | "cuda.get_device(gpu_id).use()\n", 42 | "print(\"Using GPU:\" + str(gpu_id))" 43 | ] 44 | }, 45 | { 46 | "cell_type": "code", 47 |
"execution_count": 3, 48 | "metadata": {}, 49 | "outputs": [], 50 | "source": [ 51 | "#data_dir = os.getenv('data_dir', '../data/')\n", 52 | "fn_vocab = 'vocab.pkl'\n", 53 | "fn_corpus = 'corpus.pkl'\n", 54 | "fn_flatnd = 'flattened.npy'\n", 55 | "fn_docids = 'doc_ids.npy'\n", 56 | "fn_vectors = 'vectors.npy'\n", 57 | "vocab = pickle.load(open(fn_vocab, 'rb'))\n", 58 | "corpus = pickle.load(open(fn_corpus, 'rb'))\n", 59 | "flattened = np.load(fn_flatnd)\n", 60 | "doc_ids = np.load(fn_docids)\n", 61 | "vectors = np.load(fn_vectors)" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "# Model Parameters\n", 71 | "# Number of documents\n", 72 | "n_docs = doc_ids.max() + 1\n", 73 | "# Number of unique words in the vocabulary\n", 74 | "n_vocab = flattened.max() + 1\n", 75 | "# 'Strength' of the dircihlet prior; 200.0 seems to work well\n", 76 | "clambda = 200.0\n", 77 | "# Number of topics to fit\n", 78 | "n_topics = int(os.getenv('n_topics', 20))\n", 79 | "batchsize = 4096\n", 80 | "# Power for neg sampling\n", 81 | "power = float(os.getenv('power', 0.75))\n", 82 | "# Intialize with pretrained word vectors\n", 83 | "pretrained = bool(int(os.getenv('pretrained', True)))\n", 84 | "# Sampling temperature\n", 85 | "temperature = float(os.getenv('temperature', 1.0))\n", 86 | "# Number of dimensions in a single word vector\n", 87 | "n_units = int(os.getenv('n_units', 300))\n", 88 | "# Get the string representation for every compact key\n", 89 | "words = corpus.word_list(vocab)[:n_vocab]\n", 90 | "# How many tokens are in each document\n", 91 | "doc_idx, lengths = np.unique(doc_ids, return_counts=True)\n", 92 | "doc_lengths = np.zeros(doc_ids.max() + 1, dtype='int32')\n", 93 | "doc_lengths[doc_idx] = lengths\n", 94 | "# Count all token frequencies\n", 95 | "tok_idx, freq = np.unique(flattened, return_counts=True)\n", 96 | "term_frequency = np.zeros(n_vocab, dtype='int32')\n", 97 | "term_frequency[tok_idx] = freq" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 26, 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "_ 11009\n", 110 | "_25 11009\n", 111 | "__doc__ Automatically created module for IPython interactive environment\n", 112 | "__loader__ None\n", 113 | "__name__ __main__\n", 114 | "__package__ None\n", 115 | "__spec__ None\n", 116 | "_dh ['/home/xenomorph/projects/onlps/lda2vec/notebooks']\n", 117 | "_i len(model.mixture.weights.W)\n", 118 | "_i13 serializers.load_npz('my.model', model)\n", 119 | "_i14 serializers.load_npz('lda2vec.hdf5', model)\n", 120 | "_i15 serializers.load_npz('lda2vec', model)\n", 121 | "_i16 serializers.load_hdf5(\"lda2vec.hdf5\")\n", 122 | "_i17 serializers.load_hdf5(\"lda2vec.hdf5\"), model)\n", 123 | "_i18 serializers.load_hdf5(\"lda2vec.hdf5\", model)\n", 124 | "_i19 model\n", 125 | "_i20 model\n", 126 | "_i21 import pickle\n", 127 | "_i22 with open(\"lda2vec.pkl\", \"w\"):\n", 128 | " pickle.dump(model)\n", 129 | "_i23 with open(\"lda2vec.pkl\", \"w\") as f:\n", 130 | " pickle.dump(model, f)\n", 131 | "_i24 with open(\"lda2vec.pkl\", \"wb\") as f:\n", 132 | " pickle.dump(model, f)\n", 133 | "_i25 len(model.mixture.weights.W)\n", 134 | "_ii with open(\"lda2vec.pkl\", \"wb\") as f:\n", 135 | " pickle.dump(model, f)\n", 136 | "_iii with open(\"lda2vec.pkl\", \"w\") as f:\n", 137 | " pickle.dump(model, f)\n", 138 | "batchsize 4096\n", 139 | "clambda 200.0\n", 140 | "d [6535 6535 
6535 ... 6535 6535 6535]\n", 141 | "doc_ids [ 0 0 0 ... 11008 11008 11008]\n", 142 | "doc_idx [ 0 1 2 ... 11006 11007 11008]\n", 143 | "doc_lengths [100 92 333 ... 115 63 50]\n", 144 | "dt 0.6835141181945801\n", 145 | "epoch 0\n", 146 | "flattened [ 10 38 1311 ... 49 49 49]\n", 147 | "fn_corpus corpus.pkl\n", 148 | "fn_docids doc_ids.npy\n", 149 | "fn_flatnd flattened.npy\n", 150 | "fn_vectors vectors.npy\n", 151 | "fn_vocab vocab.pkl\n", 152 | "fraction 0.0017746121285380678\n", 153 | "freq [105430 103758 100329 ... 30 30 29]\n", 154 | "gpu_id 0\n", 155 | "j 564\n", 156 | "key key\n", 157 | "l 15207.722\n", 158 | "lengths [100 92 333 ... 115 63 50]\n", 159 | "logs {'loss': 15207.7216796875, 'epoch': 0, 'j': 563, 'prior': -637025.4375, 'rate': 5992.560930298103}\n", 160 | "loss variable(-1130.473)\n", 161 | "msg J:{j:05d} E:{epoch:05d} L:{loss:1.3e} P:{prior:1.3e} R:{rate:1.3e}\n", 162 | "n_docs 11009\n", 163 | "n_topics 20\n", 164 | "n_units 300\n", 165 | "n_vocab 5838\n", 166 | "power 0.75\n", 167 | "pretrained True\n", 168 | "prior variable(-637025.44)\n", 169 | "rate 5992.560930298103\n", 170 | "remove ('headers', 'footers', 'quotes')\n", 171 | "t0 1549864137.5832853\n", 172 | "t1 1549864138.2667994\n", 173 | "temperature 1.0\n", 174 | "term_frequency [ 0 0 0 ... 30 30 29]\n", 175 | "tok_idx [ 3 4 5 ... 5835 5836 5837]\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "for key in sorted(locals().keys()):\n", 181 | " val = locals()[key]\n", 182 | " if len(str(val)) < 100 and '<' not in str(val):\n", 183 | " print(key, val)" 184 | ] 185 | }, 186 | { 187 | "cell_type": "markdown", 188 | "metadata": {}, 189 | "source": [ 190 | "# training the model" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 7, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "model = LDA2Vec(n_documents=n_docs, n_document_topics=n_topics,\n", 200 | " n_units=n_units, n_vocab=n_vocab, counts=term_frequency,\n", 201 | " n_samples=15, power=power, temperature=temperature)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 8, 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "if os.path.exists('lda2vec.hdf5'):\n", 211 | " print(\"Reloading from saved\")\n", 212 | " serializers.load_hdf5(\"lda2vec.hdf5\", model)\n", 213 | " \n", 214 | "if pretrained:\n", 215 | " model.sampler.W.data[:, :] = vectors[:n_vocab, :]" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 9, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "model.to_gpu()\n", 225 | "optimizer = O.Adam()\n", 226 | "optimizer.setup(model)\n", 227 | "clip = chainer.optimizer.GradientClipping(5.0)\n", 228 | "optimizer.add_hook(clip)" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 10, 234 | "metadata": {}, 235 | "outputs": [], 236 | "source": [ 237 | "j = 0\n", 238 | "epoch = 0\n", 239 | "fraction = batchsize * 1.0 / flattened.shape[0]\n", 240 | "progress = shelve.open('progress.shelve')" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 11, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "Top words in topic 0 galileo easier codes differences professor worse better van calculations complicated\n", 253 | "Top words in topic 1 gene dept subscribe nut puck altitude atlanta protein seed login\n", 254 | "Top words in topic 2 md exec languages consumer d.c. 
sf sensitive centris dir commands\n", 255 | "Top words in topic 3 wiretap politically 2d armenians politics di al conflicts differences political\n", 256 | "Top words in topic 4 puck shareware tyre maxtor header lens der responsibility visit ball\n", 257 | "Top words in topic 5 ss isaiah rf apologize mental v. skills arithmetic wolverine situation\n", 258 | "Top words in topic 6 criticism koresh intelligence demands replies skepticism theists spiritual teacher atheist\n", 259 | "Top words in topic 7 l. transmitted transmit widget expos pilot directory rider jim boost\n", 260 | "Top words in topic 8 hung england florida punishment lying california practice 96 baptism arizona\n", 261 | "Top words in topic 9 denning glory smokeless netters patches jim ban michael cubs alt\n", 262 | "Top words in topic 10 login documentary jury alomar murdered murder picture murders convicted cryptography\n", 263 | "Top words in topic 11 blank cartridge ahl ministry identification corpses goalie worthless authentication cooperation\n", 264 | "Top words in topic 12 hotel mileage seat apartment gas cabin sf jet pipe average\n", 265 | "Top words in topic 13 toll governor launches d.c. 76 zealand confusing route km launched\n", 266 | "Top words in topic 14 physics astronaut theology que ss v obey marriage des ahl\n", 267 | "Top words in topic 15 allergic dictionary myself homicides rape murders 'm filename symptoms differ\n", 268 | "Top words in topic 16 bullets rifle hitter accidents revolver anger crashes helmet dealer massacre\n", 269 | "Top words in topic 17 wounded protein injured voltage vitamin o'clock centaur fish blood damaged\n", 270 | "Top words in topic 18 mom pirates dream goalie priority stephanopoulos protecting 130 animation accomplish\n", 271 | "Top words in topic 19 login submit entry edu ftp edit interview password verify submitting\n", 272 | "0\n", 273 | "after partial fitting: 60025.6\n", 274 | "J:00000 E:00000 L:6.003e+04 P:-6.266e+05 R:2.576e+03\n", 275 | "after partial fitting: 59965.18\n", 276 | "J:00001 E:00000 L:5.997e+04 P:-6.266e+05 R:4.847e+03\n", 277 | "after partial fitting: 58814.625\n", 278 | "J:00002 E:00000 L:5.881e+04 P:-6.266e+05 R:4.773e+03\n", 279 | "after partial fitting: 58230.41\n", 280 | "J:00003 E:00000 L:5.823e+04 P:-6.266e+05 R:4.714e+03\n", 281 | "after partial fitting: 57572.15\n", 282 | "J:00004 E:00000 L:5.757e+04 P:-6.266e+05 R:6.034e+03\n", 283 | "after partial fitting: 53957.297\n", 284 | "J:00005 E:00000 L:5.396e+04 P:-6.266e+05 R:6.011e+03\n", 285 | "after partial fitting: 55610.78\n", 286 | "J:00006 E:00000 L:5.561e+04 P:-6.266e+05 R:5.859e+03\n", 287 | "after partial fitting: 55718.6\n", 288 | "J:00007 E:00000 L:5.572e+04 P:-6.266e+05 R:5.897e+03\n", 289 | "after partial fitting: 54909.97\n", 290 | "J:00008 E:00000 L:5.491e+04 P:-6.266e+05 R:5.979e+03\n", 291 | "after partial fitting: 53568.777\n", 292 | "J:00009 E:00000 L:5.357e+04 P:-6.266e+05 R:5.862e+03\n", 293 | "after partial fitting: 52668.152\n", 294 | "J:00010 E:00000 L:5.267e+04 P:-6.266e+05 R:5.952e+03\n", 295 | "after partial fitting: 54325.47\n", 296 | "J:00011 E:00000 L:5.433e+04 P:-6.266e+05 R:5.897e+03\n", 297 | "after partial fitting: 53106.15\n", 298 | "J:00012 E:00000 L:5.311e+04 P:-6.266e+05 R:5.384e+03\n", 299 | "after partial fitting: 52418.695\n", 300 | "J:00013 E:00000 L:5.242e+04 P:-6.266e+05 R:5.985e+03\n", 301 | "after partial fitting: 51428.49\n", 302 | "J:00014 E:00000 L:5.143e+04 P:-6.266e+05 R:6.027e+03\n", 303 | "after partial fitting: 52054.29\n", 304 | "J:00015 E:00000 
L:5.205e+04 P:-6.266e+05 R:5.959e+03\n", 305 | "after partial fitting: 50060.258\n", 306 | "J:00016 E:00000 L:5.006e+04 P:-6.266e+05 R:5.859e+03\n", 307 | "after partial fitting: 51417.117\n", 308 | "J:00017 E:00000 L:5.142e+04 P:-6.266e+05 R:5.870e+03\n", 309 | "after partial fitting: 51258.33\n", 310 | "J:00018 E:00000 L:5.126e+04 P:-6.266e+05 R:5.941e+03\n", 311 | "after partial fitting: 49484.758\n", 312 | "J:00019 E:00000 L:4.948e+04 P:-6.266e+05 R:5.990e+03\n", 313 | "after partial fitting: 51158.375\n", 314 | "J:00020 E:00000 L:5.116e+04 P:-6.266e+05 R:5.959e+03\n", 315 | "after partial fitting: 49449.215\n", 316 | "J:00021 E:00000 L:4.945e+04 P:-6.266e+05 R:4.884e+03\n", 317 | "after partial fitting: 48607.37\n", 318 | "J:00022 E:00000 L:4.861e+04 P:-6.266e+05 R:6.011e+03\n", 319 | "after partial fitting: 49262.7\n", 320 | "J:00023 E:00000 L:4.926e+04 P:-6.266e+05 R:5.805e+03\n", 321 | "after partial fitting: 48953.863\n", 322 | "J:00024 E:00000 L:4.895e+04 P:-6.266e+05 R:5.977e+03\n", 323 | "after partial fitting: 48829.59\n", 324 | "J:00025 E:00000 L:4.883e+04 P:-6.266e+05 R:5.944e+03\n", 325 | "after partial fitting: 48458.402\n", 326 | "J:00026 E:00000 L:4.846e+04 P:-6.266e+05 R:6.012e+03\n", 327 | "after partial fitting: 47735.047\n", 328 | "J:00027 E:00000 L:4.774e+04 P:-6.266e+05 R:6.013e+03\n", 329 | "after partial fitting: 46937.727\n", 330 | "J:00028 E:00000 L:4.694e+04 P:-6.266e+05 R:5.054e+03\n", 331 | "after partial fitting: 47446.117\n", 332 | "J:00029 E:00000 L:4.745e+04 P:-6.266e+05 R:5.018e+03\n", 333 | "after partial fitting: 46643.074\n", 334 | "J:00030 E:00000 L:4.664e+04 P:-6.266e+05 R:4.998e+03\n", 335 | "after partial fitting: 47368.3\n", 336 | "J:00031 E:00000 L:4.737e+04 P:-6.266e+05 R:5.440e+03\n", 337 | "after partial fitting: 46788.117\n", 338 | "J:00032 E:00000 L:4.679e+04 P:-6.266e+05 R:5.994e+03\n", 339 | "after partial fitting: 46492.664\n", 340 | "J:00033 E:00000 L:4.649e+04 P:-6.266e+05 R:6.045e+03\n", 341 | "after partial fitting: 46800.375\n", 342 | "J:00034 E:00000 L:4.680e+04 P:-6.266e+05 R:4.916e+03\n", 343 | "after partial fitting: 46502.49\n", 344 | "J:00035 E:00000 L:4.650e+04 P:-6.266e+05 R:5.881e+03\n", 345 | "after partial fitting: 46280.367\n", 346 | "J:00036 E:00000 L:4.628e+04 P:-6.266e+05 R:5.420e+03\n", 347 | "after partial fitting: 45915.508\n", 348 | "J:00037 E:00000 L:4.592e+04 P:-6.266e+05 R:5.019e+03\n", 349 | "after partial fitting: 44986.273\n", 350 | "J:00038 E:00000 L:4.499e+04 P:-6.266e+05 R:5.143e+03\n", 351 | "after partial fitting: 45867.562\n", 352 | "J:00039 E:00000 L:4.587e+04 P:-6.266e+05 R:5.123e+03\n", 353 | "after partial fitting: 45208.17\n", 354 | "J:00040 E:00000 L:4.521e+04 P:-6.266e+05 R:4.864e+03\n", 355 | "after partial fitting: 45020.992\n", 356 | "J:00041 E:00000 L:4.502e+04 P:-6.266e+05 R:5.882e+03\n", 357 | "after partial fitting: 44829.258\n", 358 | "J:00042 E:00000 L:4.483e+04 P:-6.266e+05 R:6.010e+03\n", 359 | "after partial fitting: 45038.305\n", 360 | "J:00043 E:00000 L:4.504e+04 P:-6.266e+05 R:5.870e+03\n", 361 | "after partial fitting: 44635.492\n", 362 | "J:00044 E:00000 L:4.464e+04 P:-6.266e+05 R:5.858e+03\n", 363 | "after partial fitting: 44424.434\n", 364 | "J:00045 E:00000 L:4.442e+04 P:-6.266e+05 R:6.008e+03\n", 365 | "after partial fitting: 43542.312\n", 366 | "J:00046 E:00000 L:4.354e+04 P:-6.266e+05 R:5.910e+03\n", 367 | "after partial fitting: 43630.508\n", 368 | "J:00047 E:00000 L:4.363e+04 P:-6.266e+05 R:6.064e+03\n", 369 | "after partial fitting: 43692.285\n", 370 | "J:00048 E:00000 
L:4.369e+04 P:-6.266e+05 R:5.880e+03\n", 371 | "after partial fitting: 44308.992\n", 372 | "J:00049 E:00000 L:4.431e+04 P:-6.266e+05 R:5.517e+03\n", 373 | "after partial fitting: 43372.08\n", 374 | "J:00050 E:00000 L:4.337e+04 P:-6.266e+05 R:5.322e+03\n", 375 | "after partial fitting: 43270.508\n", 376 | "J:00051 E:00000 L:4.327e+04 P:-6.266e+05 R:5.414e+03\n", 377 | "after partial fitting: 42826.88\n", 378 | "J:00052 E:00000 L:4.283e+04 P:-6.266e+05 R:5.849e+03\n", 379 | "after partial fitting: 42931.203\n", 380 | "J:00053 E:00000 L:4.293e+04 P:-6.266e+05 R:5.754e+03\n", 381 | "after partial fitting: 43169.156\n", 382 | "J:00054 E:00000 L:4.317e+04 P:-6.266e+05 R:5.793e+03\n", 383 | "after partial fitting: 41746.6\n", 384 | "J:00055 E:00000 L:4.175e+04 P:-6.266e+05 R:6.047e+03\n", 385 | "after partial fitting: 42317.406\n", 386 | "J:00056 E:00000 L:4.232e+04 P:-6.266e+05 R:5.987e+03\n", 387 | "after partial fitting: 41575.227\n", 388 | "J:00057 E:00000 L:4.158e+04 P:-6.266e+05 R:6.008e+03\n", 389 | "after partial fitting: 42544.33\n", 390 | "J:00058 E:00000 L:4.254e+04 P:-6.266e+05 R:5.794e+03\n", 391 | "after partial fitting: 41813.383\n", 392 | "J:00059 E:00000 L:4.181e+04 P:-6.266e+05 R:5.784e+03\n", 393 | "after partial fitting: 41465.195\n", 394 | "J:00060 E:00000 L:4.147e+04 P:-6.266e+05 R:5.856e+03\n", 395 | "after partial fitting: 41694.242\n", 396 | "J:00061 E:00000 L:4.169e+04 P:-6.266e+05 R:5.990e+03\n", 397 | "after partial fitting: 41176.25\n", 398 | "J:00062 E:00000 L:4.118e+04 P:-6.266e+05 R:5.985e+03\n", 399 | "after partial fitting: 40851.453\n", 400 | "J:00063 E:00000 L:4.085e+04 P:-6.266e+05 R:6.030e+03\n", 401 | "after partial fitting: 42134.867\n", 402 | "J:00064 E:00000 L:4.213e+04 P:-6.266e+05 R:5.333e+03\n", 403 | "after partial fitting: 40474.58\n", 404 | "J:00065 E:00000 L:4.047e+04 P:-6.266e+05 R:4.870e+03\n", 405 | "after partial fitting: 42136.484\n", 406 | "J:00066 E:00000 L:4.214e+04 P:-6.266e+05 R:4.597e+03\n", 407 | "after partial fitting: 40501.766\n", 408 | "J:00067 E:00000 L:4.050e+04 P:-6.266e+05 R:5.039e+03\n", 409 | "after partial fitting: 41102.46\n", 410 | "J:00068 E:00000 L:4.110e+04 P:-6.266e+05 R:5.076e+03\n", 411 | "after partial fitting: 41022.062\n", 412 | "J:00069 E:00000 L:4.102e+04 P:-6.266e+05 R:5.904e+03\n", 413 | "after partial fitting: 40279.215\n", 414 | "J:00070 E:00000 L:4.028e+04 P:-6.266e+05 R:5.971e+03\n", 415 | "after partial fitting: 39444.703\n", 416 | "J:00071 E:00000 L:3.944e+04 P:-6.266e+05 R:4.952e+03\n", 417 | "after partial fitting: 39023.19\n", 418 | "J:00072 E:00000 L:3.902e+04 P:-6.266e+05 R:5.509e+03\n", 419 | "after partial fitting: 38559.867\n", 420 | "J:00073 E:00000 L:3.856e+04 P:-6.266e+05 R:5.275e+03\n", 421 | "after partial fitting: 38806.773\n", 422 | "J:00074 E:00000 L:3.881e+04 P:-6.266e+05 R:5.970e+03\n", 423 | "after partial fitting: 39776.703\n", 424 | "J:00075 E:00000 L:3.978e+04 P:-6.266e+05 R:5.879e+03\n", 425 | "after partial fitting: 38228.57\n", 426 | "J:00076 E:00000 L:3.823e+04 P:-6.266e+05 R:5.946e+03\n", 427 | "after partial fitting: 38373.004\n", 428 | "J:00077 E:00000 L:3.837e+04 P:-6.266e+05 R:5.928e+03\n", 429 | "after partial fitting: 38043.43\n", 430 | "J:00078 E:00000 L:3.804e+04 P:-6.266e+05 R:6.069e+03\n", 431 | "after partial fitting: 37861.18\n", 432 | "J:00079 E:00000 L:3.786e+04 P:-6.266e+05 R:6.023e+03\n", 433 | "after partial fitting: 37864.953\n", 434 | "J:00080 E:00000 L:3.786e+04 P:-6.266e+05 R:5.961e+03\n", 435 | "after partial fitting: 38696.086\n", 436 | "J:00081 E:00000 
L:3.870e+04 P:-6.266e+05 R:5.974e+03\n", 437 | "after partial fitting: 37526.062\n", 438 | "J:00082 E:00000 L:3.753e+04 P:-6.266e+05 R:5.968e+03\n", 439 | "after partial fitting: 38310.363\n", 440 | "J:00083 E:00000 L:3.831e+04 P:-6.266e+05 R:5.950e+03\n", 441 | "after partial fitting: 37119.11\n", 442 | "J:00084 E:00000 L:3.712e+04 P:-6.266e+05 R:5.968e+03\n", 443 | "after partial fitting: 36629.86\n", 444 | "J:00085 E:00000 L:3.663e+04 P:-6.266e+05 R:5.961e+03\n", 445 | "after partial fitting: 37560.9\n", 446 | "J:00086 E:00000 L:3.756e+04 P:-6.266e+05 R:5.920e+03\n", 447 | "after partial fitting: 36843.906\n", 448 | "J:00087 E:00000 L:3.684e+04 P:-6.266e+05 R:5.975e+03\n", 449 | "after partial fitting: 36011.906\n", 450 | "J:00088 E:00000 L:3.601e+04 P:-6.266e+05 R:5.945e+03\n", 451 | "after partial fitting: 35597.51\n", 452 | "J:00089 E:00000 L:3.560e+04 P:-6.266e+05 R:5.917e+03\n", 453 | "after partial fitting: 36237.87\n", 454 | "J:00090 E:00000 L:3.624e+04 P:-6.266e+05 R:5.849e+03\n", 455 | "after partial fitting: 35933.492\n", 456 | "J:00091 E:00000 L:3.593e+04 P:-6.266e+05 R:5.356e+03\n", 457 | "after partial fitting: 35335.695\n", 458 | "J:00092 E:00000 L:3.534e+04 P:-6.266e+05 R:5.550e+03\n", 459 | "after partial fitting: 35161.688\n", 460 | "J:00093 E:00000 L:3.516e+04 P:-6.266e+05 R:5.813e+03\n", 461 | "after partial fitting: 34334.477\n", 462 | "J:00094 E:00000 L:3.433e+04 P:-6.266e+05 R:5.944e+03\n", 463 | "after partial fitting: 35093.33\n", 464 | "J:00095 E:00000 L:3.509e+04 P:-6.266e+05 R:5.857e+03\n", 465 | "after partial fitting: 35384.68\n", 466 | "J:00096 E:00000 L:3.538e+04 P:-6.266e+05 R:5.856e+03\n", 467 | "after partial fitting: 34563.676\n", 468 | "J:00097 E:00000 L:3.456e+04 P:-6.266e+05 R:5.873e+03\n", 469 | "after partial fitting: 34980.82\n", 470 | "J:00098 E:00000 L:3.498e+04 P:-6.266e+05 R:6.003e+03\n", 471 | "after partial fitting: 34145.688\n", 472 | "J:00099 E:00000 L:3.415e+04 P:-6.266e+05 R:6.062e+03\n", 473 | "after partial fitting: 34361.914\n", 474 | "J:00100 E:00000 L:3.436e+04 P:-6.266e+05 R:6.086e+03\n", 475 | "after partial fitting: 33269.227\n", 476 | "J:00101 E:00000 L:3.327e+04 P:-6.266e+05 R:6.014e+03\n", 477 | "after partial fitting: 34806.66\n", 478 | "J:00102 E:00000 L:3.481e+04 P:-6.266e+05 R:6.053e+03\n", 479 | "after partial fitting: 34571.63\n", 480 | "J:00103 E:00000 L:3.457e+04 P:-6.266e+05 R:6.010e+03\n", 481 | "after partial fitting: 34954.33\n", 482 | "J:00104 E:00000 L:3.495e+04 P:-6.266e+05 R:6.052e+03\n", 483 | "after partial fitting: 32349.73\n", 484 | "J:00105 E:00000 L:3.235e+04 P:-6.266e+05 R:6.013e+03\n", 485 | "after partial fitting: 32342.969\n", 486 | "J:00106 E:00000 L:3.234e+04 P:-6.266e+05 R:6.071e+03\n", 487 | "after partial fitting: 32015.27\n", 488 | "J:00107 E:00000 L:3.202e+04 P:-6.266e+05 R:6.093e+03\n", 489 | "after partial fitting: 31933.24\n", 490 | "J:00108 E:00000 L:3.193e+04 P:-6.266e+05 R:6.076e+03\n", 491 | "after partial fitting: 31818.71\n", 492 | "J:00109 E:00000 L:3.182e+04 P:-6.266e+05 R:6.074e+03\n", 493 | "after partial fitting: 31405.674\n", 494 | "J:00110 E:00000 L:3.141e+04 P:-6.266e+05 R:6.105e+03\n", 495 | "after partial fitting: 31727.777\n", 496 | "J:00111 E:00000 L:3.173e+04 P:-6.266e+05 R:6.075e+03\n", 497 | "after partial fitting: 31360.021\n", 498 | "J:00112 E:00000 L:3.136e+04 P:-6.266e+05 R:6.063e+03\n", 499 | "after partial fitting: 31144.55\n", 500 | "J:00113 E:00000 L:3.114e+04 P:-6.266e+05 R:6.088e+03\n", 501 | "after partial fitting: 30779.344\n", 502 | "J:00114 E:00000 
L:3.078e+04 P:-6.266e+05 R:6.112e+03\n", 503 | "after partial fitting: 30584.752\n", 504 | "J:00115 E:00000 L:3.058e+04 P:-6.266e+05 R:6.049e+03\n", 505 | "after partial fitting: 31395.637\n", 506 | "J:00116 E:00000 L:3.140e+04 P:-6.266e+05 R:6.067e+03\n", 507 | "after partial fitting: 30131.213\n", 508 | "J:00117 E:00000 L:3.013e+04 P:-6.266e+05 R:6.068e+03\n", 509 | "after partial fitting: 31391.371\n", 510 | "J:00118 E:00000 L:3.139e+04 P:-6.266e+05 R:6.051e+03\n", 511 | "after partial fitting: 29529.635\n", 512 | "J:00119 E:00000 L:2.953e+04 P:-6.266e+05 R:6.075e+03\n", 513 | "after partial fitting: 29273.645\n", 514 | "J:00120 E:00000 L:2.927e+04 P:-6.266e+05 R:6.021e+03\n", 515 | "after partial fitting: 29233.674\n", 516 | "J:00121 E:00000 L:2.923e+04 P:-6.266e+05 R:6.027e+03\n", 517 | "after partial fitting: 30145.396\n", 518 | "J:00122 E:00000 L:3.015e+04 P:-6.266e+05 R:6.048e+03\n", 519 | "after partial fitting: 29761.84\n", 520 | "J:00123 E:00000 L:2.976e+04 P:-6.266e+05 R:6.061e+03\n", 521 | "after partial fitting: 29238.902\n", 522 | "J:00124 E:00000 L:2.924e+04 P:-6.266e+05 R:6.069e+03\n", 523 | "after partial fitting: 29642.45\n", 524 | "J:00125 E:00000 L:2.964e+04 P:-6.266e+05 R:6.043e+03\n", 525 | "after partial fitting: 28716.219\n", 526 | "J:00126 E:00000 L:2.872e+04 P:-6.266e+05 R:6.053e+03\n", 527 | "after partial fitting: 27662.445\n", 528 | "J:00127 E:00000 L:2.766e+04 P:-6.266e+05 R:6.131e+03\n", 529 | "after partial fitting: 28901.814\n", 530 | "J:00128 E:00000 L:2.890e+04 P:-6.266e+05 R:6.043e+03\n", 531 | "after partial fitting: 25056.883\n", 532 | "J:00129 E:00000 L:2.506e+04 P:-6.266e+05 R:6.034e+03\n", 533 | "after partial fitting: 27731.514\n", 534 | "J:00130 E:00000 L:2.773e+04 P:-6.266e+05 R:6.025e+03\n", 535 | "after partial fitting: 27779.36\n", 536 | "J:00131 E:00000 L:2.778e+04 P:-6.267e+05 R:6.094e+03\n", 537 | "after partial fitting: 27414.254\n", 538 | "J:00132 E:00000 L:2.741e+04 P:-6.267e+05 R:6.056e+03\n", 539 | "after partial fitting: 27296.277\n", 540 | "J:00133 E:00000 L:2.730e+04 P:-6.267e+05 R:6.101e+03\n", 541 | "after partial fitting: 27481.258\n", 542 | "J:00134 E:00000 L:2.748e+04 P:-6.267e+05 R:6.020e+03\n", 543 | "after partial fitting: 28271.377\n", 544 | "J:00135 E:00000 L:2.827e+04 P:-6.267e+05 R:6.038e+03\n", 545 | "after partial fitting: 26254.014\n", 546 | "J:00136 E:00000 L:2.625e+04 P:-6.267e+05 R:6.078e+03\n", 547 | "after partial fitting: 26578.258\n", 548 | "J:00137 E:00000 L:2.658e+04 P:-6.267e+05 R:6.081e+03\n", 549 | "after partial fitting: 26785.209\n", 550 | "J:00138 E:00000 L:2.679e+04 P:-6.267e+05 R:6.080e+03\n", 551 | "after partial fitting: 27028.56\n", 552 | "J:00139 E:00000 L:2.703e+04 P:-6.267e+05 R:6.043e+03\n", 553 | "after partial fitting: 26331.686\n", 554 | "J:00140 E:00000 L:2.633e+04 P:-6.267e+05 R:6.063e+03\n", 555 | "after partial fitting: 25806.61\n", 556 | "J:00141 E:00000 L:2.581e+04 P:-6.267e+05 R:5.967e+03\n", 557 | "after partial fitting: 25997.297\n", 558 | "J:00142 E:00000 L:2.600e+04 P:-6.267e+05 R:6.098e+03\n", 559 | "after partial fitting: 25662.008\n", 560 | "J:00143 E:00000 L:2.566e+04 P:-6.267e+05 R:6.116e+03\n", 561 | "after partial fitting: 25373.52\n", 562 | "J:00144 E:00000 L:2.537e+04 P:-6.267e+05 R:6.127e+03\n", 563 | "after partial fitting: 24699.531\n", 564 | "J:00145 E:00000 L:2.470e+04 P:-6.267e+05 R:5.986e+03\n", 565 | "after partial fitting: 26028.555\n", 566 | "J:00146 E:00000 L:2.603e+04 P:-6.267e+05 R:6.073e+03\n", 567 | "after partial fitting: 24439.402\n", 568 | "J:00147 
E:00000 L:2.444e+04 P:-6.267e+05 R:6.073e+03\n", 569 | "after partial fitting: 24852.848\n", 570 | "J:00148 E:00000 L:2.485e+04 P:-6.267e+05 R:6.059e+03\n", 571 | "after partial fitting: 24875.38\n", 572 | "J:00149 E:00000 L:2.488e+04 P:-6.267e+05 R:6.064e+03\n", 573 | "after partial fitting: 23890.79\n", 574 | "J:00150 E:00000 L:2.389e+04 P:-6.267e+05 R:6.033e+03\n", 575 | "after partial fitting: 24489.16\n", 576 | "J:00151 E:00000 L:2.449e+04 P:-6.267e+05 R:6.116e+03\n", 577 | "after partial fitting: 23221.39\n", 578 | "J:00152 E:00000 L:2.322e+04 P:-6.267e+05 R:6.053e+03\n", 579 | "after partial fitting: 23643.238\n", 580 | "J:00153 E:00000 L:2.364e+04 P:-6.267e+05 R:6.057e+03\n", 581 | "after partial fitting: 23551.635\n", 582 | "J:00154 E:00000 L:2.355e+04 P:-6.267e+05 R:6.055e+03\n", 583 | "after partial fitting: 22977.184\n", 584 | "J:00155 E:00000 L:2.298e+04 P:-6.267e+05 R:6.073e+03\n", 585 | "after partial fitting: 23447.684\n", 586 | "J:00156 E:00000 L:2.345e+04 P:-6.267e+05 R:6.085e+03\n", 587 | "after partial fitting: 22477.803\n", 588 | "J:00157 E:00000 L:2.248e+04 P:-6.267e+05 R:6.150e+03\n", 589 | "after partial fitting: 22690.54\n", 590 | "J:00158 E:00000 L:2.269e+04 P:-6.267e+05 R:6.063e+03\n", 591 | "after partial fitting: 22940.223\n", 592 | "J:00159 E:00000 L:2.294e+04 P:-6.267e+05 R:6.046e+03\n", 593 | "after partial fitting: 22163.59\n", 594 | "J:00160 E:00000 L:2.216e+04 P:-6.267e+05 R:5.698e+03\n", 595 | "after partial fitting: 22588.23\n", 596 | "J:00161 E:00000 L:2.259e+04 P:-6.267e+05 R:5.782e+03\n", 597 | "after partial fitting: 22168.441\n", 598 | "J:00162 E:00000 L:2.217e+04 P:-6.267e+05 R:6.010e+03\n", 599 | "after partial fitting: 21736.072\n", 600 | "J:00163 E:00000 L:2.174e+04 P:-6.267e+05 R:5.721e+03\n", 601 | "after partial fitting: 23206.814\n", 602 | "J:00164 E:00000 L:2.321e+04 P:-6.267e+05 R:4.622e+03\n", 603 | "after partial fitting: 22202.191\n", 604 | "J:00165 E:00000 L:2.220e+04 P:-6.267e+05 R:4.764e+03\n", 605 | "after partial fitting: 21514.31\n", 606 | "J:00166 E:00000 L:2.151e+04 P:-6.267e+05 R:5.897e+03\n", 607 | "after partial fitting: 22966.6\n", 608 | "J:00167 E:00000 L:2.297e+04 P:-6.267e+05 R:5.875e+03\n", 609 | "after partial fitting: 21476.262\n", 610 | "J:00168 E:00000 L:2.148e+04 P:-6.267e+05 R:5.847e+03\n", 611 | "after partial fitting: 20888.7\n", 612 | "J:00169 E:00000 L:2.089e+04 P:-6.267e+05 R:5.988e+03\n", 613 | "after partial fitting: 21083.533\n", 614 | "J:00170 E:00000 L:2.108e+04 P:-6.267e+05 R:5.261e+03\n", 615 | "after partial fitting: 20684.258\n", 616 | "J:00171 E:00000 L:2.068e+04 P:-6.267e+05 R:5.008e+03\n", 617 | "after partial fitting: 20984.19\n", 618 | "J:00172 E:00000 L:2.098e+04 P:-6.267e+05 R:5.987e+03\n", 619 | "after partial fitting: 20521.496\n", 620 | "J:00173 E:00000 L:2.052e+04 P:-6.267e+05 R:5.928e+03\n", 621 | "after partial fitting: 20661.85\n", 622 | "J:00174 E:00000 L:2.066e+04 P:-6.267e+05 R:5.984e+03\n", 623 | "after partial fitting: 20266.916\n", 624 | "J:00175 E:00000 L:2.027e+04 P:-6.267e+05 R:6.010e+03\n", 625 | "after partial fitting: 20429.816\n", 626 | "J:00176 E:00000 L:2.043e+04 P:-6.267e+05 R:5.596e+03\n", 627 | "after partial fitting: 20474.457\n", 628 | "J:00177 E:00000 L:2.047e+04 P:-6.267e+05 R:6.005e+03\n", 629 | "after partial fitting: 19181.3\n", 630 | "J:00178 E:00000 L:1.918e+04 P:-6.267e+05 R:6.017e+03\n", 631 | "after partial fitting: 19949.555\n", 632 | "J:00179 E:00000 L:1.995e+04 P:-6.267e+05 R:5.903e+03\n", 633 | "after partial fitting: 19915.707\n", 634 | "J:00180 
E:00000 L:1.992e+04 P:-6.267e+05 R:5.939e+03\n", 635 | "after partial fitting: 19854.262\n", 636 | "J:00181 E:00000 L:1.985e+04 P:-6.267e+05 R:6.004e+03\n", 637 | "after partial fitting: 19265.129\n", 638 | "J:00182 E:00000 L:1.927e+04 P:-6.267e+05 R:6.076e+03\n", 639 | "after partial fitting: 19137.309\n", 640 | "J:00183 E:00000 L:1.914e+04 P:-6.267e+05 R:6.050e+03\n", 641 | "after partial fitting: 17785.469\n", 642 | "J:00184 E:00000 L:1.779e+04 P:-6.267e+05 R:5.906e+03\n", 643 | "after partial fitting: 19165.738\n", 644 | "J:00185 E:00000 L:1.917e+04 P:-6.267e+05 R:5.875e+03\n", 645 | "after partial fitting: 19463.63\n", 646 | "J:00186 E:00000 L:1.946e+04 P:-6.267e+05 R:5.850e+03\n", 647 | "after partial fitting: 18957.111\n", 648 | "J:00187 E:00000 L:1.896e+04 P:-6.267e+05 R:5.782e+03\n", 649 | "after partial fitting: 18527.273\n", 650 | "J:00188 E:00000 L:1.853e+04 P:-6.267e+05 R:5.902e+03\n", 651 | "after partial fitting: 18686.93\n", 652 | "J:00189 E:00000 L:1.869e+04 P:-6.267e+05 R:5.658e+03\n", 653 | "after partial fitting: 18578.914\n", 654 | "J:00190 E:00000 L:1.858e+04 P:-6.267e+05 R:5.706e+03\n", 655 | "after partial fitting: 18030.516\n", 656 | "J:00191 E:00000 L:1.803e+04 P:-6.267e+05 R:5.242e+03\n", 657 | "after partial fitting: 19121.955\n", 658 | "J:00192 E:00000 L:1.912e+04 P:-6.267e+05 R:5.421e+03\n", 659 | "after partial fitting: 17995.668\n", 660 | "J:00193 E:00000 L:1.800e+04 P:-6.267e+05 R:5.343e+03\n", 661 | "after partial fitting: 18015.19\n", 662 | "J:00194 E:00000 L:1.802e+04 P:-6.267e+05 R:5.277e+03\n", 663 | "after partial fitting: 18463.668\n", 664 | "J:00195 E:00000 L:1.846e+04 P:-6.267e+05 R:5.252e+03\n", 665 | "after partial fitting: 17576.13\n", 666 | "J:00196 E:00000 L:1.758e+04 P:-6.267e+05 R:5.327e+03\n", 667 | "after partial fitting: 17220.068\n", 668 | "J:00197 E:00000 L:1.722e+04 P:-6.267e+05 R:5.080e+03\n", 669 | "after partial fitting: 18059.652\n", 670 | "J:00198 E:00000 L:1.806e+04 P:-6.267e+05 R:5.574e+03\n", 671 | "after partial fitting: 17178.809\n", 672 | "J:00199 E:00000 L:1.718e+04 P:-6.267e+05 R:5.354e+03\n", 673 | "after partial fitting: 17971.396\n", 674 | "J:00200 E:00000 L:1.797e+04 P:-6.267e+05 R:5.501e+03\n", 675 | "after partial fitting: 17752.889\n", 676 | "J:00201 E:00000 L:1.775e+04 P:-6.267e+05 R:5.717e+03\n", 677 | "after partial fitting: 17587.137\n", 678 | "J:00202 E:00000 L:1.759e+04 P:-6.267e+05 R:5.184e+03\n", 679 | "after partial fitting: 17260.096\n", 680 | "J:00203 E:00000 L:1.726e+04 P:-6.267e+05 R:4.770e+03\n", 681 | "after partial fitting: 17327.832\n", 682 | "J:00204 E:00000 L:1.733e+04 P:-6.267e+05 R:5.954e+03\n", 683 | "after partial fitting: 18475.19\n", 684 | "J:00205 E:00000 L:1.848e+04 P:-6.267e+05 R:5.972e+03\n", 685 | "after partial fitting: 18049.121\n", 686 | "J:00206 E:00000 L:1.805e+04 P:-6.267e+05 R:5.678e+03\n", 687 | "after partial fitting: 17023.158\n", 688 | "J:00207 E:00000 L:1.702e+04 P:-6.267e+05 R:5.507e+03\n", 689 | "after partial fitting: 17343.344\n", 690 | "J:00208 E:00000 L:1.734e+04 P:-6.267e+05 R:5.861e+03\n", 691 | "after partial fitting: 16262.592\n", 692 | "J:00209 E:00000 L:1.626e+04 P:-6.267e+05 R:6.001e+03\n", 693 | "after partial fitting: 16617.215\n", 694 | "J:00210 E:00000 L:1.662e+04 P:-6.267e+05 R:5.201e+03\n", 695 | "after partial fitting: 17078.59\n", 696 | "J:00211 E:00000 L:1.708e+04 P:-6.267e+05 R:5.277e+03\n", 697 | "after partial fitting: 16457.357\n", 698 | "J:00212 E:00000 L:1.646e+04 P:-6.267e+05 R:5.232e+03\n", 699 | "after partial fitting: 15459.547\n", 700 | 
"J:00213 E:00000 L:1.546e+04 P:-6.267e+05 R:5.191e+03\n", 701 | "after partial fitting: 16364.579\n", 702 | "J:00214 E:00000 L:1.636e+04 P:-6.267e+05 R:4.901e+03\n", 703 | "after partial fitting: 16609.305\n", 704 | "J:00215 E:00000 L:1.661e+04 P:-6.267e+05 R:5.278e+03\n", 705 | "after partial fitting: 16419.035\n", 706 | "J:00216 E:00000 L:1.642e+04 P:-6.267e+05 R:5.017e+03\n", 707 | "after partial fitting: 15992.141\n", 708 | "J:00217 E:00000 L:1.599e+04 P:-6.267e+05 R:5.029e+03\n", 709 | "after partial fitting: 16366.637\n", 710 | "J:00218 E:00000 L:1.637e+04 P:-6.267e+05 R:4.978e+03\n", 711 | "after partial fitting: 16428.293\n", 712 | "J:00219 E:00000 L:1.643e+04 P:-6.267e+05 R:5.033e+03\n", 713 | "after partial fitting: 16153.443\n", 714 | "J:00220 E:00000 L:1.615e+04 P:-6.267e+05 R:5.130e+03\n", 715 | "after partial fitting: 15108.082\n", 716 | "J:00221 E:00000 L:1.511e+04 P:-6.267e+05 R:5.411e+03\n", 717 | "after partial fitting: 15839.101\n", 718 | "J:00222 E:00000 L:1.584e+04 P:-6.267e+05 R:5.250e+03\n", 719 | "after partial fitting: 16148.507\n", 720 | "J:00223 E:00000 L:1.615e+04 P:-6.268e+05 R:5.110e+03\n", 721 | "after partial fitting: 15898.63\n", 722 | "J:00224 E:00000 L:1.590e+04 P:-6.268e+05 R:5.200e+03\n", 723 | "after partial fitting: 16240.622\n", 724 | "J:00225 E:00000 L:1.624e+04 P:-6.268e+05 R:5.123e+03\n", 725 | "after partial fitting: 15974.372\n", 726 | "J:00226 E:00000 L:1.597e+04 P:-6.268e+05 R:5.015e+03\n", 727 | "after partial fitting: 15815.311\n", 728 | "J:00227 E:00000 L:1.582e+04 P:-6.268e+05 R:5.025e+03\n", 729 | "after partial fitting: 15577.67\n", 730 | "J:00228 E:00000 L:1.558e+04 P:-6.268e+05 R:5.140e+03\n", 731 | "after partial fitting: 15190.018\n", 732 | "J:00229 E:00000 L:1.519e+04 P:-6.268e+05 R:5.215e+03\n", 733 | "after partial fitting: 15393.206\n", 734 | "J:00230 E:00000 L:1.539e+04 P:-6.268e+05 R:5.399e+03\n", 735 | "after partial fitting: 15632.131\n", 736 | "J:00231 E:00000 L:1.563e+04 P:-6.268e+05 R:5.548e+03\n", 737 | "after partial fitting: 15489.15\n", 738 | "J:00232 E:00000 L:1.549e+04 P:-6.268e+05 R:5.711e+03\n", 739 | "after partial fitting: 15210.231\n", 740 | "J:00233 E:00000 L:1.521e+04 P:-6.268e+05 R:6.133e+03\n", 741 | "after partial fitting: 15511.427\n", 742 | "J:00234 E:00000 L:1.551e+04 P:-6.268e+05 R:5.687e+03\n", 743 | "after partial fitting: 15548.255\n", 744 | "J:00235 E:00000 L:1.555e+04 P:-6.268e+05 R:6.024e+03\n", 745 | "after partial fitting: 14978.25\n", 746 | "J:00236 E:00000 L:1.498e+04 P:-6.268e+05 R:5.734e+03\n", 747 | "after partial fitting: 15332.8955\n", 748 | "J:00237 E:00000 L:1.533e+04 P:-6.268e+05 R:6.029e+03\n", 749 | "after partial fitting: 15407.283\n", 750 | "J:00238 E:00000 L:1.541e+04 P:-6.268e+05 R:5.623e+03\n", 751 | "after partial fitting: 15660.328\n", 752 | "J:00239 E:00000 L:1.566e+04 P:-6.268e+05 R:5.024e+03\n", 753 | "after partial fitting: 15221.151\n", 754 | "J:00240 E:00000 L:1.522e+04 P:-6.268e+05 R:5.622e+03\n", 755 | "after partial fitting: 15117.535\n", 756 | "J:00241 E:00000 L:1.512e+04 P:-6.268e+05 R:4.781e+03\n", 757 | "after partial fitting: 15358.136\n", 758 | "J:00242 E:00000 L:1.536e+04 P:-6.268e+05 R:5.170e+03\n", 759 | "after partial fitting: 15403.156\n", 760 | "J:00243 E:00000 L:1.540e+04 P:-6.269e+05 R:5.762e+03\n", 761 | "after partial fitting: 15151.9\n", 762 | "J:00244 E:00000 L:1.515e+04 P:-6.269e+05 R:5.827e+03\n", 763 | "after partial fitting: 15325.089\n", 764 | "J:00245 E:00000 L:1.533e+04 P:-6.269e+05 R:5.751e+03\n", 765 | "after partial fitting: 14827.129\n", 
766 | "J:00246 E:00000 L:1.483e+04 P:-6.269e+05 R:5.825e+03\n", 767 | "after partial fitting: 14815.379\n", 768 | "J:00247 E:00000 L:1.482e+04 P:-6.269e+05 R:5.806e+03\n", 769 | "after partial fitting: 15042.434\n", 770 | "J:00248 E:00000 L:1.504e+04 P:-6.269e+05 R:5.833e+03\n", 771 | "after partial fitting: 15511.191\n", 772 | "J:00249 E:00000 L:1.551e+04 P:-6.269e+05 R:5.784e+03\n", 773 | "after partial fitting: 14819.643\n", 774 | "J:00250 E:00000 L:1.482e+04 P:-6.269e+05 R:5.798e+03\n", 775 | "after partial fitting: 14744.163\n", 776 | "J:00251 E:00000 L:1.474e+04 P:-6.269e+05 R:5.821e+03\n", 777 | "after partial fitting: 15016.177\n", 778 | "J:00252 E:00000 L:1.502e+04 P:-6.269e+05 R:5.816e+03\n", 779 | "after partial fitting: 14533.602\n", 780 | "J:00253 E:00000 L:1.453e+04 P:-6.270e+05 R:5.884e+03\n", 781 | "after partial fitting: 14643.593\n", 782 | "J:00254 E:00000 L:1.464e+04 P:-6.270e+05 R:5.749e+03\n", 783 | "after partial fitting: 15308.663\n", 784 | "J:00255 E:00000 L:1.531e+04 P:-6.270e+05 R:5.765e+03\n", 785 | "after partial fitting: 15204.26\n", 786 | "J:00256 E:00000 L:1.520e+04 P:-6.270e+05 R:5.781e+03\n", 787 | "after partial fitting: 14733.767\n", 788 | "J:00257 E:00000 L:1.473e+04 P:-6.270e+05 R:5.793e+03\n", 789 | "after partial fitting: 15499.161\n", 790 | "J:00258 E:00000 L:1.550e+04 P:-6.270e+05 R:5.198e+03\n", 791 | "after partial fitting: 14766.961\n", 792 | "J:00259 E:00000 L:1.477e+04 P:-6.270e+05 R:5.726e+03\n", 793 | "after partial fitting: 15104.436\n", 794 | "J:00260 E:00000 L:1.510e+04 P:-6.271e+05 R:5.907e+03\n", 795 | "after partial fitting: 14658.041\n", 796 | "J:00261 E:00000 L:1.466e+04 P:-6.271e+05 R:5.946e+03\n", 797 | "after partial fitting: 14428.885\n", 798 | "J:00262 E:00000 L:1.443e+04 P:-6.271e+05 R:5.165e+03\n", 799 | "after partial fitting: 14984.813\n", 800 | "J:00263 E:00000 L:1.498e+04 P:-6.271e+05 R:5.528e+03\n", 801 | "after partial fitting: 14678.289\n", 802 | "J:00264 E:00000 L:1.468e+04 P:-6.271e+05 R:5.668e+03\n", 803 | "after partial fitting: 15225.107\n", 804 | "J:00265 E:00000 L:1.523e+04 P:-6.271e+05 R:5.636e+03\n", 805 | "after partial fitting: 14805.553\n", 806 | "J:00266 E:00000 L:1.481e+04 P:-6.272e+05 R:5.392e+03\n", 807 | "after partial fitting: 14566.131\n", 808 | "J:00267 E:00000 L:1.457e+04 P:-6.272e+05 R:5.155e+03\n", 809 | "after partial fitting: 14621.453\n", 810 | "J:00268 E:00000 L:1.462e+04 P:-6.272e+05 R:5.358e+03\n", 811 | "after partial fitting: 15119.041\n", 812 | "J:00269 E:00000 L:1.512e+04 P:-6.272e+05 R:5.326e+03\n", 813 | "after partial fitting: 14595.709\n", 814 | "J:00270 E:00000 L:1.460e+04 P:-6.273e+05 R:5.749e+03\n", 815 | "after partial fitting: 14660.914\n", 816 | "J:00271 E:00000 L:1.466e+04 P:-6.273e+05 R:5.148e+03\n", 817 | "after partial fitting: 15116.81\n", 818 | "J:00272 E:00000 L:1.512e+04 P:-6.273e+05 R:5.643e+03\n", 819 | "after partial fitting: 14525.841\n", 820 | "J:00273 E:00000 L:1.453e+04 P:-6.273e+05 R:5.481e+03\n", 821 | "after partial fitting: 14896.59\n", 822 | "J:00274 E:00000 L:1.490e+04 P:-6.274e+05 R:5.442e+03\n", 823 | "after partial fitting: 14540.08\n", 824 | "J:00275 E:00000 L:1.454e+04 P:-6.274e+05 R:6.094e+03\n", 825 | "after partial fitting: 14998.677\n", 826 | "J:00276 E:00000 L:1.500e+04 P:-6.274e+05 R:5.728e+03\n", 827 | "after partial fitting: 14743.502\n", 828 | "J:00277 E:00000 L:1.474e+04 P:-6.274e+05 R:5.765e+03\n", 829 | "after partial fitting: 14476.426\n", 830 | "J:00278 E:00000 L:1.448e+04 P:-6.275e+05 R:5.630e+03\n", 831 | "after partial fitting: 
14808.815\n", 832 | "J:00279 E:00000 L:1.481e+04 P:-6.275e+05 R:5.127e+03\n", 833 | "after partial fitting: 15177.396\n", 834 | "J:00280 E:00000 L:1.518e+04 P:-6.275e+05 R:5.551e+03\n", 835 | "after partial fitting: 14562.357\n", 836 | "J:00281 E:00000 L:1.456e+04 P:-6.275e+05 R:5.510e+03\n", 837 | "after partial fitting: 14613.694\n", 838 | "J:00282 E:00000 L:1.461e+04 P:-6.276e+05 R:5.650e+03\n", 839 | "after partial fitting: 17188.234\n", 840 | "J:00283 E:00000 L:1.719e+04 P:-6.276e+05 R:5.751e+03\n", 841 | "after partial fitting: 15848.637\n", 842 | "J:00284 E:00000 L:1.585e+04 P:-6.276e+05 R:5.326e+03\n", 843 | "after partial fitting: 14628.442\n", 844 | "J:00285 E:00000 L:1.463e+04 P:-6.276e+05 R:5.874e+03\n", 845 | "after partial fitting: 14513.595\n", 846 | "J:00286 E:00000 L:1.451e+04 P:-6.277e+05 R:5.529e+03\n", 847 | "after partial fitting: 14446.684\n", 848 | "J:00287 E:00000 L:1.445e+04 P:-6.277e+05 R:6.113e+03\n", 849 | "after partial fitting: 14198.85\n", 850 | "J:00288 E:00000 L:1.420e+04 P:-6.277e+05 R:6.108e+03\n", 851 | "after partial fitting: 14470.988\n", 852 | "J:00289 E:00000 L:1.447e+04 P:-6.278e+05 R:5.689e+03\n", 853 | "after partial fitting: 14844.4\n", 854 | "J:00290 E:00000 L:1.484e+04 P:-6.278e+05 R:5.490e+03\n", 855 | "after partial fitting: 14505.133\n", 856 | "J:00291 E:00000 L:1.451e+04 P:-6.278e+05 R:5.616e+03\n", 857 | "after partial fitting: 15538.74\n", 858 | "J:00292 E:00000 L:1.554e+04 P:-6.278e+05 R:5.156e+03\n", 859 | "after partial fitting: 14582.016\n", 860 | "J:00293 E:00000 L:1.458e+04 P:-6.279e+05 R:5.860e+03\n", 861 | "after partial fitting: 14257.053\n", 862 | "J:00294 E:00000 L:1.426e+04 P:-6.279e+05 R:5.861e+03\n", 863 | "after partial fitting: 14271.592\n", 864 | "J:00295 E:00000 L:1.427e+04 P:-6.279e+05 R:5.740e+03\n", 865 | "after partial fitting: 14750.109\n", 866 | "J:00296 E:00000 L:1.475e+04 P:-6.280e+05 R:5.819e+03\n", 867 | "after partial fitting: 14552.396\n", 868 | "J:00297 E:00000 L:1.455e+04 P:-6.280e+05 R:5.811e+03\n", 869 | "after partial fitting: 14646.393\n", 870 | "J:00298 E:00000 L:1.465e+04 P:-6.280e+05 R:5.818e+03\n", 871 | "after partial fitting: 14718.272\n", 872 | "J:00299 E:00000 L:1.472e+04 P:-6.281e+05 R:5.916e+03\n", 873 | "after partial fitting: 14788.186\n", 874 | "J:00300 E:00000 L:1.479e+04 P:-6.281e+05 R:5.835e+03\n", 875 | "after partial fitting: 14407.2295\n", 876 | "J:00301 E:00000 L:1.441e+04 P:-6.281e+05 R:5.641e+03\n", 877 | "after partial fitting: 14949.893\n", 878 | "J:00302 E:00000 L:1.495e+04 P:-6.282e+05 R:5.548e+03\n", 879 | "after partial fitting: 14651.773\n", 880 | "J:00303 E:00000 L:1.465e+04 P:-6.282e+05 R:5.378e+03\n", 881 | "after partial fitting: 15536.415\n", 882 | "J:00304 E:00000 L:1.554e+04 P:-6.282e+05 R:5.713e+03\n", 883 | "after partial fitting: 14580.162\n", 884 | "J:00305 E:00000 L:1.458e+04 P:-6.283e+05 R:5.849e+03\n", 885 | "after partial fitting: 14813.968\n", 886 | "J:00306 E:00000 L:1.481e+04 P:-6.283e+05 R:5.701e+03\n", 887 | "after partial fitting: 14664.225\n", 888 | "J:00307 E:00000 L:1.466e+04 P:-6.283e+05 R:6.024e+03\n", 889 | "after partial fitting: 14385.124\n", 890 | "J:00308 E:00000 L:1.439e+04 P:-6.284e+05 R:5.570e+03\n", 891 | "after partial fitting: 16108.602\n", 892 | "J:00309 E:00000 L:1.611e+04 P:-6.284e+05 R:5.612e+03\n", 893 | "after partial fitting: 14166.184\n", 894 | "J:00310 E:00000 L:1.417e+04 P:-6.285e+05 R:5.694e+03\n", 895 | "after partial fitting: 14035.904\n", 896 | "J:00311 E:00000 L:1.404e+04 P:-6.285e+05 R:5.772e+03\n", 897 | "after partial 
fitting: 14465.242\n", 898 | "J:00312 E:00000 L:1.447e+04 P:-6.285e+05 R:5.794e+03\n", 899 | "after partial fitting: 14710.816\n", 900 | "J:00313 E:00000 L:1.471e+04 P:-6.286e+05 R:5.773e+03\n", 901 | "after partial fitting: 14611.902\n", 902 | "J:00314 E:00000 L:1.461e+04 P:-6.286e+05 R:5.723e+03\n", 903 | "after partial fitting: 14772.826\n", 904 | "J:00315 E:00000 L:1.477e+04 P:-6.286e+05 R:5.807e+03\n", 905 | "after partial fitting: 14725.696\n", 906 | "J:00316 E:00000 L:1.473e+04 P:-6.287e+05 R:5.908e+03\n", 907 | "after partial fitting: 14312.758\n", 908 | "J:00317 E:00000 L:1.431e+04 P:-6.287e+05 R:5.602e+03\n", 909 | "after partial fitting: 16062.393\n", 910 | "J:00318 E:00000 L:1.606e+04 P:-6.287e+05 R:5.779e+03\n", 911 | "after partial fitting: 14631.928\n", 912 | "J:00319 E:00000 L:1.463e+04 P:-6.288e+05 R:5.794e+03\n", 913 | "after partial fitting: 14921.791\n", 914 | "J:00320 E:00000 L:1.492e+04 P:-6.288e+05 R:5.858e+03\n", 915 | "after partial fitting: 14663.575\n", 916 | "J:00321 E:00000 L:1.466e+04 P:-6.289e+05 R:5.889e+03\n", 917 | "after partial fitting: 14448.781\n", 918 | "J:00322 E:00000 L:1.445e+04 P:-6.289e+05 R:5.630e+03\n", 919 | "after partial fitting: 15242.897\n", 920 | "J:00323 E:00000 L:1.524e+04 P:-6.290e+05 R:5.426e+03\n", 921 | "after partial fitting: 14516.9375\n", 922 | "J:00324 E:00000 L:1.452e+04 P:-6.290e+05 R:5.954e+03\n", 923 | "after partial fitting: 14548.303\n", 924 | "J:00325 E:00000 L:1.455e+04 P:-6.290e+05 R:5.958e+03\n", 925 | "after partial fitting: 15684.223\n", 926 | "J:00326 E:00000 L:1.568e+04 P:-6.291e+05 R:5.931e+03\n", 927 | "after partial fitting: 15556.242\n", 928 | "J:00327 E:00000 L:1.556e+04 P:-6.291e+05 R:5.931e+03\n", 929 | "after partial fitting: 12954.486\n", 930 | "J:00328 E:00000 L:1.295e+04 P:-6.292e+05 R:5.475e+03\n", 931 | "after partial fitting: 14847.99\n", 932 | "J:00329 E:00000 L:1.485e+04 P:-6.292e+05 R:5.816e+03\n", 933 | "after partial fitting: 14558.787\n", 934 | "J:00330 E:00000 L:1.456e+04 P:-6.292e+05 R:5.987e+03\n", 935 | "after partial fitting: 14741.229\n", 936 | "J:00331 E:00000 L:1.474e+04 P:-6.293e+05 R:6.007e+03\n", 937 | "after partial fitting: 14486.702\n", 938 | "J:00332 E:00000 L:1.449e+04 P:-6.293e+05 R:6.031e+03\n", 939 | "after partial fitting: 14335.401\n", 940 | "J:00333 E:00000 L:1.434e+04 P:-6.294e+05 R:5.435e+03\n", 941 | "after partial fitting: 14669.0\n", 942 | "J:00334 E:00000 L:1.467e+04 P:-6.294e+05 R:5.968e+03\n", 943 | "after partial fitting: 14545.955\n", 944 | "J:00335 E:00000 L:1.455e+04 P:-6.295e+05 R:6.064e+03\n", 945 | "after partial fitting: 13692.357\n", 946 | "J:00336 E:00000 L:1.369e+04 P:-6.295e+05 R:6.072e+03\n", 947 | "after partial fitting: 14744.129\n", 948 | "J:00337 E:00000 L:1.474e+04 P:-6.295e+05 R:6.014e+03\n", 949 | "after partial fitting: 14489.162\n", 950 | "J:00338 E:00000 L:1.449e+04 P:-6.296e+05 R:5.963e+03\n", 951 | "after partial fitting: 15319.191\n", 952 | "J:00339 E:00000 L:1.532e+04 P:-6.296e+05 R:5.996e+03\n", 953 | "after partial fitting: 15349.924\n", 954 | "J:00340 E:00000 L:1.535e+04 P:-6.297e+05 R:6.040e+03\n", 955 | "after partial fitting: 14864.773\n", 956 | "J:00341 E:00000 L:1.486e+04 P:-6.297e+05 R:6.082e+03\n", 957 | "after partial fitting: 15899.033\n", 958 | "J:00342 E:00000 L:1.590e+04 P:-6.297e+05 R:5.982e+03\n", 959 | "after partial fitting: 14391.564\n", 960 | "J:00343 E:00000 L:1.439e+04 P:-6.298e+05 R:6.083e+03\n", 961 | "after partial fitting: 15263.919\n", 962 | "J:00344 E:00000 L:1.526e+04 P:-6.298e+05 R:6.004e+03\n", 963 | "after 
partial fitting: 14539.586\n", 964 | "J:00345 E:00000 L:1.454e+04 P:-6.299e+05 R:6.075e+03\n", 965 | "after partial fitting: 14860.878\n", 966 | "J:00346 E:00000 L:1.486e+04 P:-6.299e+05 R:6.017e+03\n", 967 | "after partial fitting: 16857.904\n", 968 | "J:00347 E:00000 L:1.686e+04 P:-6.299e+05 R:6.067e+03\n", 969 | "after partial fitting: 14713.729\n", 970 | "J:00348 E:00000 L:1.471e+04 P:-6.300e+05 R:6.077e+03\n", 971 | "after partial fitting: 14849.369\n", 972 | "J:00349 E:00000 L:1.485e+04 P:-6.300e+05 R:5.964e+03\n", 973 | "after partial fitting: 14556.814\n", 974 | "J:00350 E:00000 L:1.456e+04 P:-6.301e+05 R:6.071e+03\n", 975 | "after partial fitting: 15130.792\n", 976 | "J:00351 E:00000 L:1.513e+04 P:-6.301e+05 R:6.030e+03\n", 977 | "after partial fitting: 14156.36\n", 978 | "J:00352 E:00000 L:1.416e+04 P:-6.301e+05 R:6.109e+03\n", 979 | "after partial fitting: 14585.213\n", 980 | "J:00353 E:00000 L:1.459e+04 P:-6.302e+05 R:6.036e+03\n", 981 | "after partial fitting: 14655.086\n", 982 | "J:00354 E:00000 L:1.466e+04 P:-6.302e+05 R:6.042e+03\n", 983 | "after partial fitting: 14604.042\n", 984 | "J:00355 E:00000 L:1.460e+04 P:-6.303e+05 R:6.104e+03\n", 985 | "after partial fitting: 14931.332\n", 986 | "J:00356 E:00000 L:1.493e+04 P:-6.303e+05 R:6.036e+03\n", 987 | "after partial fitting: 14726.496\n", 988 | "J:00357 E:00000 L:1.473e+04 P:-6.304e+05 R:6.038e+03\n", 989 | "after partial fitting: 14970.17\n", 990 | "J:00358 E:00000 L:1.497e+04 P:-6.304e+05 R:6.066e+03\n", 991 | "after partial fitting: 15602.671\n", 992 | "J:00359 E:00000 L:1.560e+04 P:-6.305e+05 R:5.973e+03\n", 993 | "after partial fitting: 14683.339\n", 994 | "J:00360 E:00000 L:1.468e+04 P:-6.305e+05 R:6.082e+03\n", 995 | "after partial fitting: 14489.512\n", 996 | "J:00361 E:00000 L:1.449e+04 P:-6.305e+05 R:6.080e+03\n", 997 | "after partial fitting: 14815.252\n", 998 | "J:00362 E:00000 L:1.482e+04 P:-6.306e+05 R:6.078e+03\n", 999 | "after partial fitting: 14727.34\n", 1000 | "J:00363 E:00000 L:1.473e+04 P:-6.306e+05 R:6.050e+03\n", 1001 | "after partial fitting: 14432.391\n", 1002 | "J:00364 E:00000 L:1.443e+04 P:-6.307e+05 R:6.078e+03\n", 1003 | "after partial fitting: 14706.677\n", 1004 | "J:00365 E:00000 L:1.471e+04 P:-6.307e+05 R:6.040e+03\n", 1005 | "after partial fitting: 14421.685\n", 1006 | "J:00366 E:00000 L:1.442e+04 P:-6.308e+05 R:6.070e+03\n", 1007 | "after partial fitting: 15381.402\n", 1008 | "J:00367 E:00000 L:1.538e+04 P:-6.308e+05 R:5.917e+03\n", 1009 | "after partial fitting: 14509.718\n", 1010 | "J:00368 E:00000 L:1.451e+04 P:-6.309e+05 R:6.083e+03\n", 1011 | "after partial fitting: 14324.279\n", 1012 | "J:00369 E:00000 L:1.432e+04 P:-6.309e+05 R:5.524e+03\n", 1013 | "after partial fitting: 15178.109\n", 1014 | "J:00370 E:00000 L:1.518e+04 P:-6.310e+05 R:5.098e+03\n", 1015 | "after partial fitting: 14355.745\n", 1016 | "J:00371 E:00000 L:1.436e+04 P:-6.310e+05 R:5.723e+03\n", 1017 | "after partial fitting: 14463.208\n", 1018 | "J:00372 E:00000 L:1.446e+04 P:-6.311e+05 R:5.402e+03\n", 1019 | "after partial fitting: 15461.666\n", 1020 | "J:00373 E:00000 L:1.546e+04 P:-6.311e+05 R:5.723e+03\n", 1021 | "after partial fitting: 15355.652\n", 1022 | "J:00374 E:00000 L:1.536e+04 P:-6.312e+05 R:6.055e+03\n", 1023 | "after partial fitting: 14635.645\n", 1024 | "J:00375 E:00000 L:1.464e+04 P:-6.312e+05 R:6.108e+03\n", 1025 | "after partial fitting: 14594.617\n", 1026 | "J:00376 E:00000 L:1.459e+04 P:-6.313e+05 R:6.129e+03\n", 1027 | "after partial fitting: 14975.333\n", 1028 | "J:00377 E:00000 L:1.498e+04 
P:-6.313e+05 R:6.018e+03\n", 1029 | "after partial fitting: 15256.993\n", 1030 | "J:00378 E:00000 L:1.526e+04 P:-6.314e+05 R:5.995e+03\n", 1031 | "after partial fitting: 14293.412\n", 1032 | "J:00379 E:00000 L:1.429e+04 P:-6.314e+05 R:5.984e+03\n", 1033 | "after partial fitting: 14688.223\n", 1034 | "J:00380 E:00000 L:1.469e+04 P:-6.315e+05 R:5.568e+03\n", 1035 | "after partial fitting: 14864.831\n", 1036 | "J:00381 E:00000 L:1.486e+04 P:-6.315e+05 R:5.273e+03\n", 1037 | "after partial fitting: 14482.523\n", 1038 | "J:00382 E:00000 L:1.448e+04 P:-6.316e+05 R:5.607e+03\n", 1039 | "after partial fitting: 14615.026\n", 1040 | "J:00383 E:00000 L:1.462e+04 P:-6.316e+05 R:5.597e+03\n", 1041 | "after partial fitting: 14640.195\n", 1042 | "J:00384 E:00000 L:1.464e+04 P:-6.317e+05 R:5.786e+03\n", 1043 | "after partial fitting: 14482.943\n", 1044 | "J:00385 E:00000 L:1.448e+04 P:-6.317e+05 R:5.912e+03\n", 1045 | "after partial fitting: 14468.982\n", 1046 | "J:00386 E:00000 L:1.447e+04 P:-6.318e+05 R:4.466e+03\n", 1047 | "after partial fitting: 14245.359\n", 1048 | "J:00387 E:00000 L:1.425e+04 P:-6.318e+05 R:4.640e+03\n", 1049 | "after partial fitting: 15537.297\n", 1050 | "J:00388 E:00000 L:1.554e+04 P:-6.319e+05 R:5.830e+03\n", 1051 | "after partial fitting: 13345.988\n", 1052 | "J:00389 E:00000 L:1.335e+04 P:-6.319e+05 R:5.994e+03\n", 1053 | "after partial fitting: 14001.105\n", 1054 | "J:00390 E:00000 L:1.400e+04 P:-6.320e+05 R:5.940e+03\n", 1055 | "after partial fitting: 14789.098\n", 1056 | "J:00391 E:00000 L:1.479e+04 P:-6.320e+05 R:5.954e+03\n", 1057 | "after partial fitting: 7241.8867\n", 1058 | "J:00392 E:00000 L:7.242e+03 P:-6.321e+05 R:1.046e+04\n", 1059 | "after partial fitting: 14878.034\n", 1060 | "J:00393 E:00000 L:1.488e+04 P:-6.321e+05 R:5.930e+03\n", 1061 | "after partial fitting: 14486.625\n", 1062 | "J:00394 E:00000 L:1.449e+04 P:-6.322e+05 R:5.781e+03\n", 1063 | "after partial fitting: 14453.366\n", 1064 | "J:00395 E:00000 L:1.445e+04 P:-6.322e+05 R:5.867e+03\n", 1065 | "after partial fitting: 14207.471\n", 1066 | "J:00396 E:00000 L:1.421e+04 P:-6.323e+05 R:5.970e+03\n", 1067 | "after partial fitting: 14913.216\n", 1068 | "J:00397 E:00000 L:1.491e+04 P:-6.323e+05 R:5.863e+03\n", 1069 | "after partial fitting: 14606.648\n", 1070 | "J:00398 E:00000 L:1.461e+04 P:-6.324e+05 R:5.862e+03\n", 1071 | "after partial fitting: 14520.863\n", 1072 | "J:00399 E:00000 L:1.452e+04 P:-6.324e+05 R:5.967e+03\n", 1073 | "after partial fitting: 15069.138\n", 1074 | "J:00400 E:00000 L:1.507e+04 P:-6.325e+05 R:5.985e+03\n", 1075 | "after partial fitting: 14020.991\n", 1076 | "J:00401 E:00000 L:1.402e+04 P:-6.325e+05 R:6.088e+03\n", 1077 | "after partial fitting: 14512.34\n", 1078 | "J:00402 E:00000 L:1.451e+04 P:-6.326e+05 R:5.994e+03\n", 1079 | "after partial fitting: 14999.486\n", 1080 | "J:00403 E:00000 L:1.500e+04 P:-6.326e+05 R:5.937e+03\n", 1081 | "after partial fitting: 14546.295\n", 1082 | "J:00404 E:00000 L:1.455e+04 P:-6.327e+05 R:5.861e+03\n", 1083 | "after partial fitting: 14527.749\n", 1084 | "J:00405 E:00000 L:1.453e+04 P:-6.327e+05 R:5.838e+03\n", 1085 | "after partial fitting: 14727.026\n", 1086 | "J:00406 E:00000 L:1.473e+04 P:-6.327e+05 R:5.785e+03\n", 1087 | "after partial fitting: 14214.577\n", 1088 | "J:00407 E:00000 L:1.421e+04 P:-6.328e+05 R:5.764e+03\n", 1089 | "after partial fitting: 14713.629\n", 1090 | "J:00408 E:00000 L:1.471e+04 P:-6.328e+05 R:5.814e+03\n", 1091 | "after partial fitting: 15407.915\n", 1092 | "J:00409 E:00000 L:1.541e+04 P:-6.329e+05 R:5.806e+03\n", 1093 | 
"after partial fitting: 14397.699\n", 1094 | "J:00410 E:00000 L:1.440e+04 P:-6.329e+05 R:5.798e+03\n", 1095 | "after partial fitting: 14702.228\n", 1096 | "J:00411 E:00000 L:1.470e+04 P:-6.330e+05 R:5.733e+03\n", 1097 | "after partial fitting: 13619.862\n", 1098 | "J:00412 E:00000 L:1.362e+04 P:-6.330e+05 R:5.873e+03\n", 1099 | "after partial fitting: 14371.762\n", 1100 | "J:00413 E:00000 L:1.437e+04 P:-6.330e+05 R:5.946e+03\n", 1101 | "after partial fitting: 14834.496\n", 1102 | "J:00414 E:00000 L:1.483e+04 P:-6.331e+05 R:5.771e+03\n", 1103 | "after partial fitting: 15064.877\n", 1104 | "J:00415 E:00000 L:1.506e+04 P:-6.331e+05 R:5.900e+03\n", 1105 | "after partial fitting: 15450.629\n", 1106 | "J:00416 E:00000 L:1.545e+04 P:-6.332e+05 R:5.931e+03\n", 1107 | "after partial fitting: 14237.912\n", 1108 | "J:00417 E:00000 L:1.424e+04 P:-6.332e+05 R:5.902e+03\n", 1109 | "after partial fitting: 15287.123\n", 1110 | "J:00418 E:00000 L:1.529e+04 P:-6.332e+05 R:5.943e+03\n", 1111 | "after partial fitting: 15018.633\n", 1112 | "J:00419 E:00000 L:1.502e+04 P:-6.333e+05 R:5.851e+03\n", 1113 | "after partial fitting: 14213.513\n", 1114 | "J:00420 E:00000 L:1.421e+04 P:-6.333e+05 R:5.959e+03\n", 1115 | "after partial fitting: 14366.404\n", 1116 | "J:00421 E:00000 L:1.437e+04 P:-6.334e+05 R:5.845e+03\n", 1117 | "after partial fitting: 14147.428\n", 1118 | "J:00422 E:00000 L:1.415e+04 P:-6.334e+05 R:5.648e+03\n", 1119 | "after partial fitting: 14529.006\n", 1120 | "J:00423 E:00000 L:1.453e+04 P:-6.334e+05 R:5.945e+03\n", 1121 | "after partial fitting: 14726.794\n", 1122 | "J:00424 E:00000 L:1.473e+04 P:-6.335e+05 R:5.790e+03\n", 1123 | "after partial fitting: 14704.387\n", 1124 | "J:00425 E:00000 L:1.470e+04 P:-6.335e+05 R:5.833e+03\n", 1125 | "after partial fitting: 14794.7\n", 1126 | "J:00426 E:00000 L:1.479e+04 P:-6.336e+05 R:5.974e+03\n", 1127 | "after partial fitting: 14855.431\n", 1128 | "J:00427 E:00000 L:1.486e+04 P:-6.336e+05 R:5.981e+03\n", 1129 | "after partial fitting: 14757.901\n", 1130 | "J:00428 E:00000 L:1.476e+04 P:-6.336e+05 R:5.773e+03\n", 1131 | "after partial fitting: 15629.122\n", 1132 | "J:00429 E:00000 L:1.563e+04 P:-6.337e+05 R:5.074e+03\n", 1133 | "after partial fitting: 14570.609\n", 1134 | "J:00430 E:00000 L:1.457e+04 P:-6.337e+05 R:6.071e+03\n", 1135 | "after partial fitting: 14573.698\n", 1136 | "J:00431 E:00000 L:1.457e+04 P:-6.338e+05 R:5.308e+03\n", 1137 | "after partial fitting: 14640.74\n", 1138 | "J:00432 E:00000 L:1.464e+04 P:-6.338e+05 R:4.669e+03\n", 1139 | "after partial fitting: 13987.582\n", 1140 | "J:00433 E:00000 L:1.399e+04 P:-6.339e+05 R:5.732e+03\n", 1141 | "after partial fitting: 14344.887\n", 1142 | "J:00434 E:00000 L:1.434e+04 P:-6.339e+05 R:5.636e+03\n", 1143 | "after partial fitting: 14234.611\n", 1144 | "J:00435 E:00000 L:1.423e+04 P:-6.339e+05 R:6.034e+03\n", 1145 | "after partial fitting: 14910.684\n", 1146 | "J:00436 E:00000 L:1.491e+04 P:-6.340e+05 R:5.985e+03\n", 1147 | "after partial fitting: 14540.426\n", 1148 | "J:00437 E:00000 L:1.454e+04 P:-6.340e+05 R:6.120e+03\n", 1149 | "after partial fitting: 15498.494\n", 1150 | "J:00438 E:00000 L:1.550e+04 P:-6.341e+05 R:5.903e+03\n", 1151 | "after partial fitting: 14287.078\n", 1152 | "J:00439 E:00000 L:1.429e+04 P:-6.341e+05 R:6.118e+03\n", 1153 | "after partial fitting: 15466.504\n", 1154 | "J:00440 E:00000 L:1.547e+04 P:-6.341e+05 R:6.042e+03\n", 1155 | "after partial fitting: 14994.009\n", 1156 | "J:00441 E:00000 L:1.499e+04 P:-6.342e+05 R:5.957e+03\n", 1157 | "after partial fitting: 14992.865\n", 
1158 | "J:00442 E:00000 L:1.499e+04 P:-6.342e+05 R:6.076e+03\n", 1159 | "after partial fitting: 13770.967\n", 1160 | "J:00443 E:00000 L:1.377e+04 P:-6.343e+05 R:6.147e+03\n", 1161 | "after partial fitting: 14730.11\n", 1162 | "J:00444 E:00000 L:1.473e+04 P:-6.343e+05 R:5.968e+03\n", 1163 | "after partial fitting: 14343.22\n", 1164 | "J:00445 E:00000 L:1.434e+04 P:-6.343e+05 R:6.099e+03\n", 1165 | "after partial fitting: 14472.494\n", 1166 | "J:00446 E:00000 L:1.447e+04 P:-6.344e+05 R:5.713e+03\n", 1167 | "after partial fitting: 15321.039\n", 1168 | "J:00447 E:00000 L:1.532e+04 P:-6.344e+05 R:5.249e+03\n", 1169 | "after partial fitting: 14637.301\n", 1170 | "J:00448 E:00000 L:1.464e+04 P:-6.344e+05 R:5.342e+03\n", 1171 | "after partial fitting: 14501.582\n", 1172 | "J:00449 E:00000 L:1.450e+04 P:-6.345e+05 R:6.040e+03\n", 1173 | "after partial fitting: 14394.538\n", 1174 | "J:00450 E:00000 L:1.439e+04 P:-6.345e+05 R:6.068e+03\n", 1175 | "after partial fitting: 14712.55\n", 1176 | "J:00451 E:00000 L:1.471e+04 P:-6.345e+05 R:6.008e+03\n", 1177 | "after partial fitting: 14628.376\n", 1178 | "J:00452 E:00000 L:1.463e+04 P:-6.346e+05 R:5.957e+03\n", 1179 | "after partial fitting: 14567.139\n", 1180 | "J:00453 E:00000 L:1.457e+04 P:-6.346e+05 R:6.052e+03\n", 1181 | "after partial fitting: 14481.777\n", 1182 | "J:00454 E:00000 L:1.448e+04 P:-6.347e+05 R:6.018e+03\n", 1183 | "after partial fitting: 14603.255\n", 1184 | "J:00455 E:00000 L:1.460e+04 P:-6.347e+05 R:5.983e+03\n", 1185 | "after partial fitting: 15486.94\n", 1186 | "J:00456 E:00000 L:1.549e+04 P:-6.347e+05 R:5.950e+03\n", 1187 | "after partial fitting: 14729.221\n", 1188 | "J:00457 E:00000 L:1.473e+04 P:-6.348e+05 R:6.001e+03\n", 1189 | "after partial fitting: 14569.208\n", 1190 | "J:00458 E:00000 L:1.457e+04 P:-6.348e+05 R:6.031e+03\n", 1191 | "after partial fitting: 14535.811\n", 1192 | "J:00459 E:00000 L:1.454e+04 P:-6.348e+05 R:5.998e+03\n", 1193 | "after partial fitting: 14438.721\n", 1194 | "J:00460 E:00000 L:1.444e+04 P:-6.349e+05 R:6.053e+03\n", 1195 | "after partial fitting: 14511.928\n", 1196 | "J:00461 E:00000 L:1.451e+04 P:-6.349e+05 R:6.066e+03\n", 1197 | "after partial fitting: 14477.723\n", 1198 | "J:00462 E:00000 L:1.448e+04 P:-6.350e+05 R:6.078e+03\n", 1199 | "after partial fitting: 14950.235\n", 1200 | "J:00463 E:00000 L:1.495e+04 P:-6.350e+05 R:5.917e+03\n", 1201 | "after partial fitting: 14582.895\n", 1202 | "J:00464 E:00000 L:1.458e+04 P:-6.350e+05 R:6.031e+03\n", 1203 | "after partial fitting: 14729.203\n", 1204 | "J:00465 E:00000 L:1.473e+04 P:-6.351e+05 R:6.025e+03\n", 1205 | "after partial fitting: 14402.289\n", 1206 | "J:00466 E:00000 L:1.440e+04 P:-6.351e+05 R:6.004e+03\n", 1207 | "after partial fitting: 14550.326\n", 1208 | "J:00467 E:00000 L:1.455e+04 P:-6.351e+05 R:5.950e+03\n", 1209 | "after partial fitting: 15661.102\n", 1210 | "J:00468 E:00000 L:1.566e+04 P:-6.352e+05 R:5.940e+03\n", 1211 | "after partial fitting: 14441.411\n", 1212 | "J:00469 E:00000 L:1.444e+04 P:-6.352e+05 R:6.002e+03\n", 1213 | "after partial fitting: 14841.039\n", 1214 | "J:00470 E:00000 L:1.484e+04 P:-6.352e+05 R:6.044e+03\n", 1215 | "after partial fitting: 14554.411\n", 1216 | "J:00471 E:00000 L:1.455e+04 P:-6.353e+05 R:5.950e+03\n", 1217 | "after partial fitting: 14392.907\n", 1218 | "J:00472 E:00000 L:1.439e+04 P:-6.353e+05 R:6.064e+03\n", 1219 | "after partial fitting: 14705.483\n", 1220 | "J:00473 E:00000 L:1.471e+04 P:-6.354e+05 R:5.972e+03\n", 1221 | "after partial fitting: 14510.408\n", 1222 | "J:00474 E:00000 L:1.451e+04 
P:-6.354e+05 R:6.076e+03\n", 1223 | "after partial fitting: 14634.635\n", 1224 | "J:00475 E:00000 L:1.463e+04 P:-6.354e+05 R:6.041e+03\n", 1225 | "after partial fitting: 13660.078\n", 1226 | "J:00476 E:00000 L:1.366e+04 P:-6.355e+05 R:6.010e+03\n", 1227 | "after partial fitting: 14219.305\n", 1228 | "J:00477 E:00000 L:1.422e+04 P:-6.355e+05 R:6.035e+03\n", 1229 | "after partial fitting: 14432.213\n", 1230 | "J:00478 E:00000 L:1.443e+04 P:-6.355e+05 R:6.058e+03\n", 1231 | "after partial fitting: 15724.705\n", 1232 | "J:00479 E:00000 L:1.572e+04 P:-6.356e+05 R:5.981e+03\n", 1233 | "after partial fitting: 14698.277\n", 1234 | "J:00480 E:00000 L:1.470e+04 P:-6.356e+05 R:6.041e+03\n", 1235 | "after partial fitting: 14409.623\n", 1236 | "J:00481 E:00000 L:1.441e+04 P:-6.356e+05 R:5.995e+03\n", 1237 | "after partial fitting: 15361.801\n", 1238 | "J:00482 E:00000 L:1.536e+04 P:-6.356e+05 R:5.895e+03\n", 1239 | "after partial fitting: 13763.598\n", 1240 | "J:00483 E:00000 L:1.376e+04 P:-6.357e+05 R:5.966e+03\n", 1241 | "after partial fitting: 14787.51\n", 1242 | "J:00484 E:00000 L:1.479e+04 P:-6.357e+05 R:5.978e+03\n", 1243 | "after partial fitting: 14301.448\n", 1244 | "J:00485 E:00000 L:1.430e+04 P:-6.357e+05 R:6.064e+03\n", 1245 | "after partial fitting: 14570.569\n", 1246 | "J:00486 E:00000 L:1.457e+04 P:-6.358e+05 R:6.005e+03\n", 1247 | "after partial fitting: 14483.9\n", 1248 | "J:00487 E:00000 L:1.448e+04 P:-6.358e+05 R:6.056e+03\n", 1249 | "after partial fitting: 14777.172\n", 1250 | "J:00488 E:00000 L:1.478e+04 P:-6.358e+05 R:6.005e+03\n", 1251 | "after partial fitting: 14138.645\n", 1252 | "J:00489 E:00000 L:1.414e+04 P:-6.358e+05 R:6.071e+03\n", 1253 | "after partial fitting: 14507.3\n", 1254 | "J:00490 E:00000 L:1.451e+04 P:-6.359e+05 R:5.999e+03\n", 1255 | "after partial fitting: 14240.59\n", 1256 | "J:00491 E:00000 L:1.424e+04 P:-6.359e+05 R:6.100e+03\n", 1257 | "after partial fitting: 14599.896\n", 1258 | "J:00492 E:00000 L:1.460e+04 P:-6.359e+05 R:6.029e+03\n", 1259 | "after partial fitting: 13480.795\n", 1260 | "J:00493 E:00000 L:1.348e+04 P:-6.360e+05 R:6.003e+03\n", 1261 | "after partial fitting: 14399.111\n", 1262 | "J:00494 E:00000 L:1.440e+04 P:-6.360e+05 R:5.972e+03\n", 1263 | "after partial fitting: 13600.765\n", 1264 | "J:00495 E:00000 L:1.360e+04 P:-6.360e+05 R:6.022e+03\n", 1265 | "after partial fitting: 14430.855\n", 1266 | "J:00496 E:00000 L:1.443e+04 P:-6.360e+05 R:5.999e+03\n", 1267 | "after partial fitting: 14232.558\n", 1268 | "J:00497 E:00000 L:1.423e+04 P:-6.361e+05 R:5.943e+03\n", 1269 | "after partial fitting: 14240.448\n", 1270 | "J:00498 E:00000 L:1.424e+04 P:-6.361e+05 R:6.010e+03\n", 1271 | "after partial fitting: 14434.785\n", 1272 | "J:00499 E:00000 L:1.443e+04 P:-6.361e+05 R:6.007e+03\n", 1273 | "after partial fitting: 14706.266\n", 1274 | "J:00500 E:00000 L:1.471e+04 P:-6.361e+05 R:5.978e+03\n", 1275 | "after partial fitting: 14607.213\n", 1276 | "J:00501 E:00000 L:1.461e+04 P:-6.362e+05 R:6.052e+03\n", 1277 | "after partial fitting: 15106.46\n", 1278 | "J:00502 E:00000 L:1.511e+04 P:-6.362e+05 R:5.975e+03\n", 1279 | "after partial fitting: 14668.539\n", 1280 | "J:00503 E:00000 L:1.467e+04 P:-6.362e+05 R:6.015e+03\n", 1281 | "after partial fitting: 14146.911\n", 1282 | "J:00504 E:00000 L:1.415e+04 P:-6.362e+05 R:5.965e+03\n", 1283 | "after partial fitting: 14280.128\n", 1284 | "J:00505 E:00000 L:1.428e+04 P:-6.362e+05 R:6.016e+03\n", 1285 | "after partial fitting: 14417.5\n", 1286 | "J:00506 E:00000 L:1.442e+04 P:-6.363e+05 R:6.028e+03\n", 1287 | "after 
partial fitting: 14364.486\n", 1288 | "J:00507 E:00000 L:1.436e+04 P:-6.363e+05 R:6.084e+03\n", 1289 | "after partial fitting: 14277.384\n", 1290 | "J:00508 E:00000 L:1.428e+04 P:-6.363e+05 R:5.962e+03\n", 1291 | "after partial fitting: 15637.449\n", 1292 | "J:00509 E:00000 L:1.564e+04 P:-6.363e+05 R:5.986e+03\n", 1293 | "after partial fitting: 15505.58\n", 1294 | "J:00510 E:00000 L:1.551e+04 P:-6.364e+05 R:5.971e+03\n", 1295 | "after partial fitting: 14423.724\n", 1296 | "J:00511 E:00000 L:1.442e+04 P:-6.364e+05 R:5.973e+03\n", 1297 | "after partial fitting: 14431.197\n", 1298 | "J:00512 E:00000 L:1.443e+04 P:-6.364e+05 R:6.024e+03\n", 1299 | "after partial fitting: 14257.615\n", 1300 | "J:00513 E:00000 L:1.426e+04 P:-6.364e+05 R:6.053e+03\n", 1301 | "after partial fitting: 14101.682\n", 1302 | "J:00514 E:00000 L:1.410e+04 P:-6.364e+05 R:6.025e+03\n", 1303 | "after partial fitting: 14489.803\n", 1304 | "J:00515 E:00000 L:1.449e+04 P:-6.365e+05 R:5.931e+03\n", 1305 | "after partial fitting: 14667.535\n", 1306 | "J:00516 E:00000 L:1.467e+04 P:-6.365e+05 R:6.070e+03\n", 1307 | "after partial fitting: 14588.743\n", 1308 | "J:00517 E:00000 L:1.459e+04 P:-6.365e+05 R:6.068e+03\n", 1309 | "after partial fitting: 14503.665\n", 1310 | "J:00518 E:00000 L:1.450e+04 P:-6.365e+05 R:5.975e+03\n", 1311 | "after partial fitting: 14442.247\n", 1312 | "J:00519 E:00000 L:1.444e+04 P:-6.365e+05 R:6.005e+03\n", 1313 | "after partial fitting: 14448.367\n", 1314 | "J:00520 E:00000 L:1.445e+04 P:-6.366e+05 R:5.996e+03\n", 1315 | "after partial fitting: 12936.318\n", 1316 | "J:00521 E:00000 L:1.294e+04 P:-6.366e+05 R:6.048e+03\n", 1317 | "after partial fitting: 14588.896\n", 1318 | "J:00522 E:00000 L:1.459e+04 P:-6.366e+05 R:6.042e+03\n", 1319 | "after partial fitting: 14579.913\n", 1320 | "J:00523 E:00000 L:1.458e+04 P:-6.366e+05 R:6.019e+03\n", 1321 | "after partial fitting: 14512.184\n", 1322 | "J:00524 E:00000 L:1.451e+04 P:-6.366e+05 R:6.022e+03\n", 1323 | "after partial fitting: 14489.904\n", 1324 | "J:00525 E:00000 L:1.449e+04 P:-6.366e+05 R:6.007e+03\n", 1325 | "after partial fitting: 14809.156\n", 1326 | "J:00526 E:00000 L:1.481e+04 P:-6.367e+05 R:5.937e+03\n", 1327 | "after partial fitting: 14390.97\n", 1328 | "J:00527 E:00000 L:1.439e+04 P:-6.367e+05 R:6.088e+03\n", 1329 | "after partial fitting: 13970.924\n", 1330 | "J:00528 E:00000 L:1.397e+04 P:-6.367e+05 R:6.064e+03\n", 1331 | "after partial fitting: 14603.634\n", 1332 | "J:00529 E:00000 L:1.460e+04 P:-6.367e+05 R:6.027e+03\n", 1333 | "after partial fitting: 14482.102\n", 1334 | "J:00530 E:00000 L:1.448e+04 P:-6.367e+05 R:5.963e+03\n", 1335 | "after partial fitting: 14398.1875\n", 1336 | "J:00531 E:00000 L:1.440e+04 P:-6.367e+05 R:6.008e+03\n", 1337 | "after partial fitting: 15500.604\n", 1338 | "J:00532 E:00000 L:1.550e+04 P:-6.368e+05 R:5.911e+03\n", 1339 | "after partial fitting: 14584.271\n", 1340 | "J:00533 E:00000 L:1.458e+04 P:-6.368e+05 R:6.082e+03\n", 1341 | "after partial fitting: 14435.316\n", 1342 | "J:00534 E:00000 L:1.444e+04 P:-6.368e+05 R:5.979e+03\n", 1343 | "after partial fitting: 14266.354\n", 1344 | "J:00535 E:00000 L:1.427e+04 P:-6.368e+05 R:6.051e+03\n", 1345 | "after partial fitting: 14484.787\n", 1346 | "J:00536 E:00000 L:1.448e+04 P:-6.368e+05 R:5.984e+03\n", 1347 | "after partial fitting: 13549.424\n", 1348 | "J:00537 E:00000 L:1.355e+04 P:-6.368e+05 R:6.020e+03\n", 1349 | "after partial fitting: 14277.084\n", 1350 | "J:00538 E:00000 L:1.428e+04 P:-6.368e+05 R:6.122e+03\n", 1351 | "after partial fitting: 14314.386\n", 1352 
| "J:00539 E:00000 L:1.431e+04 P:-6.368e+05 R:6.029e+03\n", 1353 | "after partial fitting: 14598.623\n", 1354 | "J:00540 E:00000 L:1.460e+04 P:-6.369e+05 R:6.029e+03\n", 1355 | "after partial fitting: 14271.439\n", 1356 | "J:00541 E:00000 L:1.427e+04 P:-6.369e+05 R:5.959e+03\n", 1357 | "after partial fitting: 14124.813\n", 1358 | "J:00542 E:00000 L:1.412e+04 P:-6.369e+05 R:6.079e+03\n", 1359 | "after partial fitting: 14820.553\n", 1360 | "J:00543 E:00000 L:1.482e+04 P:-6.369e+05 R:5.966e+03\n", 1361 | "after partial fitting: 14490.596\n", 1362 | "J:00544 E:00000 L:1.449e+04 P:-6.369e+05 R:6.023e+03\n", 1363 | "after partial fitting: 14951.023\n", 1364 | "J:00545 E:00000 L:1.495e+04 P:-6.369e+05 R:5.966e+03\n", 1365 | "after partial fitting: 15445.996\n", 1366 | "J:00546 E:00000 L:1.545e+04 P:-6.369e+05 R:5.961e+03\n", 1367 | "after partial fitting: 14599.424\n", 1368 | "J:00547 E:00000 L:1.460e+04 P:-6.369e+05 R:6.045e+03\n", 1369 | "after partial fitting: 14435.922\n", 1370 | "J:00548 E:00000 L:1.444e+04 P:-6.369e+05 R:6.013e+03\n", 1371 | "after partial fitting: 14287.492\n", 1372 | "J:00549 E:00000 L:1.429e+04 P:-6.370e+05 R:6.039e+03\n", 1373 | "after partial fitting: 14432.537\n", 1374 | "J:00550 E:00000 L:1.443e+04 P:-6.370e+05 R:6.061e+03\n", 1375 | "after partial fitting: 14526.366\n", 1376 | "J:00551 E:00000 L:1.453e+04 P:-6.370e+05 R:6.057e+03\n", 1377 | "after partial fitting: 13976.875\n", 1378 | "J:00552 E:00000 L:1.398e+04 P:-6.370e+05 R:6.075e+03\n", 1379 | "after partial fitting: 14478.694\n", 1380 | "J:00553 E:00000 L:1.448e+04 P:-6.370e+05 R:6.062e+03\n", 1381 | "after partial fitting: 14647.529\n", 1382 | "J:00554 E:00000 L:1.465e+04 P:-6.370e+05 R:5.915e+03\n", 1383 | "after partial fitting: 14420.941\n", 1384 | "J:00555 E:00000 L:1.442e+04 P:-6.370e+05 R:5.950e+03\n", 1385 | "after partial fitting: 14300.262\n", 1386 | "J:00556 E:00000 L:1.430e+04 P:-6.370e+05 R:6.088e+03\n", 1387 | "after partial fitting: 14331.677\n", 1388 | "J:00557 E:00000 L:1.433e+04 P:-6.370e+05 R:5.984e+03\n", 1389 | "after partial fitting: 14221.029\n", 1390 | "J:00558 E:00000 L:1.422e+04 P:-6.370e+05 R:5.981e+03\n", 1391 | "after partial fitting: 14602.904\n", 1392 | "J:00559 E:00000 L:1.460e+04 P:-6.370e+05 R:5.967e+03\n", 1393 | "after partial fitting: 14225.79\n", 1394 | "J:00560 E:00000 L:1.423e+04 P:-6.370e+05 R:5.998e+03\n", 1395 | "after partial fitting: 14353.901\n", 1396 | "J:00561 E:00000 L:1.435e+04 P:-6.370e+05 R:6.046e+03\n", 1397 | "after partial fitting: 14728.567\n", 1398 | "J:00562 E:00000 L:1.473e+04 P:-6.370e+05 R:5.953e+03\n", 1399 | "after partial fitting: 15207.722\n", 1400 | "J:00563 E:00000 L:1.521e+04 P:-6.370e+05 R:5.993e+03\n" 1401 | ] 1402 | } 1403 | ], 1404 | "source": [ 1405 | "for epoch in range(1):\n", 1406 | " data = prepare_topics(cuda.to_cpu(model.mixture.weights.W.data).copy(),\n", 1407 | " cuda.to_cpu(model.mixture.factors.W.data).copy(),\n", 1408 | " cuda.to_cpu(model.sampler.W.data).copy(),\n", 1409 | " words)\n", 1410 | " top_words = print_top_words_per_topic(data)\n", 1411 | " if j % 100 == 0 and j > 100:\n", 1412 | " coherence = topic_coherence(top_words)\n", 1413 | " for j in range(n_topics):\n", 1414 | " print(j, coherence[(j, 'cv')])\n", 1415 | " kw = dict(top_words=top_words, coherence=coherence, epoch=epoch)\n", 1416 | " progress[str(epoch)] = pickle.dumps(kw)\n", 1417 | " data['doc_lengths'] = doc_lengths\n", 1418 | " data['term_frequency'] = term_frequency\n", 1419 | " np.savez('topics.pyldavis', **data)\n", 1420 | " print(epoch)\n", 1421 | " 
for d, f in utils.chunks(batchsize, doc_ids, flattened):\n", 1422 | " t0 = time.time()\n", 1423 | " model.cleargrads()\n", 1424 | " #optimizer.use_cleargrads(use=False)\n", 1425 | " l = model.fit_partial(d.copy(), f.copy())\n", 1426 | " print(\"after partial fitting:\", l)\n", 1427 | " prior = model.prior()\n", 1428 | " loss = prior * fraction\n", 1429 | " loss.backward()\n", 1430 | " optimizer.update()\n", 1431 | " msg = (\"J:{j:05d} E:{epoch:05d} L:{loss:1.3e} \"\n", 1432 | " \"P:{prior:1.3e} R:{rate:1.3e}\")\n", 1433 | " prior.to_cpu()\n", 1434 | " loss.to_cpu()\n", 1435 | " t1 = time.time()\n", 1436 | " dt = t1 - t0\n", 1437 | " rate = batchsize / dt\n", 1438 | " logs = dict(loss=float(l), epoch=epoch, j=j,\n", 1439 | " prior=float(prior.data), rate=rate)\n", 1440 | " print(msg.format(**logs))\n", 1441 | " j += 1\n", 1442 | " serializers.save_hdf5(\"lda2vec.hdf5\", model)" 1443 | ] 1444 | } 1445 | ], 1446 | "metadata": { 1447 | "kernelspec": { 1448 | "display_name": "Python 3", 1449 | "language": "python", 1450 | "name": "python3" 1451 | }, 1452 | "language_info": { 1453 | "codemirror_mode": { 1454 | "name": "ipython", 1455 | "version": 3 1456 | }, 1457 | "file_extension": ".py", 1458 | "mimetype": "text/x-python", 1459 | "name": "python", 1460 | "nbconvert_exporter": "python", 1461 | "pygments_lexer": "ipython3", 1462 | "version": "3.7.2" 1463 | } 1464 | }, 1465 | "nbformat": 4, 1466 | "nbformat_minor": 2 1467 | } 1468 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = 'pylda2vec' 3 | version = '0.0.1' 4 | description = 'Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec.' 5 | author = 'ONLPS' 6 | author_email = 'royalkingpin@gmail.com' 7 | license = 'MIT' 8 | url = 'https://github.com/ONLPS/lda2vec' 9 | 10 | [requires] 11 | python_version = ['3.6'] 12 | 13 | [build-system] 14 | requires = ['setuptools', 'wheel'] 15 | 16 | [tool.hatch.commands] 17 | prerelease = 'hatch build' 18 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | chainer 2 | cupy 3 | numpy 4 | jellyfish 5 | pandas 6 | en_core_web_md 7 | spacy 8 | scipy 9 | scikit-learn -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from io import open 2 | 3 | from setuptools import find_packages, setup 4 | 5 | with open('lda2vec/__init__.py', 'r') as f: 6 | for line in f: 7 | if line.startswith('__version__'): 8 | version = line.strip().split('=')[1].strip(' \'"') 9 | break 10 | else: 11 | version = '0.0.1' 12 | 13 | with open('README.md', 'r', encoding='utf-8') as f: 14 | readme = f.read() 15 | 16 | REQUIRES = [] 17 | 18 | setup( 19 | name='pylda2vec', 20 | version=version, 21 | description='Mixing Dirichlet Topic Models and Word Embeddings to Make lda2vec', 22 | long_description=readme, 23 | author='ONLPS', 24 | author_email='royalkingpin@gmail.com', 25 | maintainer='ONLPS', 26 | maintainer_email='royalkingpin@gmail.com', 27 | url='https://github.com/ONLPS/lda2vec', 28 | license='MIT', 29 | 30 | keywords=[ 31 | 'lda', 'topic-models', 'text', 'text processing', 'nlp' 32 | ], 33 | 34 | classifiers=[ 35 | 'Development Status :: 4 - Beta', 36 | 'Intended Audience :: Developers', 37 | 'License :: OSI Approved 
:: MIT License', 38 | 'Natural Language :: English', 39 | 'Operating System :: OS Independent', 40 | 'Programming Language :: Python :: 3.6', 41 | 'Programming Language :: Python :: Implementation :: CPython', 42 | ], 43 | 44 | install_requires=REQUIRES, 45 | tests_require=['coverage', 'pytest'], 46 | 47 | packages=find_packages(), 48 | ) 49 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.0.1' 2 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | envlist = 3 | py36, 4 | 5 | [testenv] 6 | passenv = * 7 | deps = 8 | coverage 9 | pytest 10 | commands = 11 | python setup.py --quiet clean develop 12 | coverage run --parallel-mode -m pytest 13 | coverage combine --append 14 | coverage report -m 15 | --------------------------------------------------------------------------------
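A note on reading the training output in `notebooks/lda2vec_model.ipynb` above: each `J:... E:... L:... P:... R:...` line is produced by the `msg` format string in the notebook's final cell. `J` is the running mini-batch counter `j`, `E` the epoch, `L` the loss value returned by `model.fit_partial`, `P` the Dirichlet prior term from `model.prior()`, and `R` the processing rate, i.e. `batchsize` divided by the wall-clock time taken by that batch. The snippet below is illustrative only (not part of the repository): it rebuilds the last logged line, `J:00563`, from the same format string, with the literal values copied from that line.

    # Illustrative only: reproduce one log line using the notebook's format string.
    msg = ("J:{j:05d} E:{epoch:05d} L:{loss:1.3e} "
           "P:{prior:1.3e} R:{rate:1.3e}")
    print(msg.format(j=563, epoch=0, loss=1.521e+04, prior=-6.370e+05, rate=5.993e+03))
    # -> J:00563 E:00000 L:1.521e+04 P:-6.370e+05 R:5.993e+03

One detail to keep in mind when re-running that cell: inside the coherence-reporting block, `for j in range(n_topics)` reuses `j`, the same name as the global mini-batch counter, so the counter is reset to `n_topics - 1` each time coherence is printed. A hedged variant of just that block (assuming `j`, `n_topics`, `top_words`, and `topic_coherence` are defined as in the earlier cells of the notebook) that leaves the counter intact:

    # Sketch, not the original cell: use a separate name for the topic index
    # so the global mini-batch counter `j` is not overwritten.
    if j % 100 == 0 and j > 100:
        coherence = topic_coherence(top_words)
        for topic_no in range(n_topics):
            print(topic_no, coherence[(topic_no, 'cv')])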