├── requirements.txt ├── enstop ├── __init__.py ├── utils.py ├── distributed_plsa.py ├── cuda_plsa.py ├── block_parallel_plsa.py ├── enstop_.py ├── plsa.py └── streamed_plsa.py ├── LICENSE ├── .gitignore ├── setup.py ├── CODE_OF_CONDUCT.md ├── README.rst └── notebooks └── EnsTop with 20-Newsgroups.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.23 2 | scipy>=1.0 3 | numba>=0.48 4 | dask[delayed]>=1.2 5 | hdbscan>=0.8.10 6 | umap-learn>=0.3.8 7 | -------------------------------------------------------------------------------- /enstop/__init__.py: -------------------------------------------------------------------------------- 1 | from enstop.plsa import PLSA 2 | from enstop.streamed_plsa import StreamedPLSA 3 | from enstop.block_parallel_plsa import BlockParallelPLSA 4 | from enstop.distributed_plsa import DistributedPLSA 5 | from enstop.cuda_plsa import GPUPLSA 6 | from enstop.enstop_ import EnsembleTopics 7 | from enstop.utils import log_lift, mean_log_lift, coherence, mean_coherence 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019, Leland McInnes 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | def readme(): 5 | with open("README.rst") as readme_file: 6 | return readme_file.read() 7 | 8 | 9 | configuration = { 10 | "name": "enstop", 11 | "version": "0.2.6", 12 | "description": "Ensemble topic modelling with pLSA", 13 | "long_description": readme(), 14 | "classifiers": [ 15 | "Development Status :: 3 - Alpha", 16 | "Intended Audience :: Science/Research", 17 | "Intended Audience :: Developers", 18 | "License :: OSI Approved", 19 | "Programming Language :: C", 20 | "Programming Language :: Python", 21 | "Topic :: Software Development", 22 | "Topic :: Scientific/Engineering", 23 | "Operating System :: Microsoft :: Windows", 24 | "Operating System :: POSIX", 25 | "Operating System :: Unix", 26 | "Operating System :: MacOS", 27 | "Programming Language :: Python :: 3.6", 28 | "Programming Language :: Python :: 3.7", 29 | "Programming Language :: Python :: 3.8", 30 | ], 31 | "keywords": "topic model, LDA, pLSA, NMF", 32 | "url": "http://github.com/lmcinnes/enstop", 33 | "author": "Leland McInnes", 34 | "author_email": "leland.mcinnes@gmail.com", 35 | "maintainer": "Leland McInnes", 36 | "maintainer_email": "leland.mcinnes@gmail.com", 37 | "license": "BSD", 38 | "packages": ["enstop"], 39 | "install_requires": [ 40 | "scikit-learn >= 0.23", 41 | "scipy >= 1.0", 42 | "numba >= 0.48", 43 | "dask[delayed] >= 1.2", 44 | "hdbscan >= 0.8", 45 | "umap-learn >= 0.3.8", 46 | ], 47 | "ext_modules": [], 48 | "cmdclass": {}, 49 | "test_suite": "nose.collector", 50 | "tests_require": ["nose"], 51 | "data_files": (), 52 | "zip_safe": True, 53 | } 54 | 55 | setup(**configuration) 56 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of 
experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at leland.mcinnes@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | EnsTop 3 | ====== 4 | 5 | EnsTop provides an ensemble based approach to topic modelling using pLSA. It makes 6 | use of a high performance numba based pLSA implementation to run multiple 7 | bootstrapped topic models in parallel, and then clusters the resulting outputs to 8 | determine a set of stable topics. It can then refit the document vectors against 9 | these topics to embed documents into the stable topic space. 10 | 11 | --------------- 12 | Why use EnsTop? 13 | --------------- 14 | 15 | There are a number of advantages to using an ensemble approach to topic modelling. 16 | The most obvious is that it produces better, more stable topics. A close second, 17 | however, is that, by making use of HDBSCAN for clustering topics, it can learn a 18 | "natural" number of topics. That is, while the user needs to specify an estimated 19 | number of topics, the *actual* number of topics produced will be determined by how 20 | many stable topics are produced over many bootstrapped runs. In practice this can 21 | either be more, or less, than the estimated number of topics. 22 | 23 | Despite all of these extra features the ensemble topic approach is still very 24 | efficient, especially in multi-core environments (due to the embarrassingly parallel 25 | nature of the ensemble). A run with a reasonable size ensemble can be completed in 26 | around the same time it might take to fit an LDA model, and usually produces superior 27 | quality results. 28 | 29 | In addition to this EnsTop comes with a pLSA implementation that can be used 30 | standalone (and not as part of an ensemble). So if all you are looking for is a good, 31 | fast pLSA implementation (that can run considerably faster than many LDA 32 | implementations) then EnsTop is the library for you. 33 | 34 | ----------------- 35 | How to use EnsTop 36 | ----------------- 37 | 38 | EnsTop follows the sklearn API (and inherits from sklearn base classes), so if you 39 | use sklearn for LDA or NMF then you already know how to use EnsTop. General usage is 40 | very straightforward. The following example uses EnsTop to model topics from the 41 | classic 20-Newsgroups dataset, using sklearn's CountVectorizer to generate the 42 | required count matrix. 43 | 44 | .. code:: python 45 | 46 | from sklearn.datasets import fetch_20newsgroups 47 | from sklearn.feature_extraction.text import CountVectorizer 48 | from enstop import EnsembleTopics 49 | 50 | news = fetch_20newsgroups(subset='all') 51 | data = CountVectorizer().fit_transform(news.data) 52 | 53 | model = EnsembleTopics(n_components=20).fit(data) 54 | topics = model.components_ 55 | doc_vectors = model.embedding_ 56 | 57 | 58 | --------------- 59 | How to use pLSA 60 | --------------- 61 | 62 | EnsTop also provides a simple to use but fast and effective pLSA implementation out 63 | of the box. As with the ensemble topic modeller it follows the sklearn API, and usage 64 | is very similar. 65 | 66 | ..
code:: python 67 | 68 | from sklearn.datasets import fetch_20newsgroups 69 | from sklearn.feature_extraction.text import CountVectorizer 70 | from enstop import PLSA 71 | 72 | news = fetch_20newsgroups(subset='all') 73 | data = CountVectorizer().fit_transform(news.data) 74 | 75 | model = PLSA(n_components=20).fit(data) 76 | topics = model.components_ 77 | doc_vectors = model.embedding_ 78 | 79 | 80 | ------------ 81 | Installation 82 | ------------ 83 | 84 | The easiest way to install EnsTop is via pip 85 | 86 | .. code:: bash 87 | 88 | pip install enstop 89 | 90 | To manually install this package: 91 | 92 | .. code:: bash 93 | 94 | wget https://github.com/lmcinnes/enstop/archive/master.zip 95 | unzip master.zip 96 | rm master.zip 97 | cd enstop-master 98 | python setup.py install 99 | 100 | ---------------- 101 | Help and Support 102 | ---------------- 103 | 104 | Some basic example notebooks are available `here <./notebooks>`_. 105 | 106 | Documentation is coming. This project is still very young. If you need help, or have 107 | problems please `open an issue `_ 108 | and I will try to provide any help and guidance that I can. Please also check 109 | the docstrings on the code, which provide some descriptions of the parameters. 110 | 111 | 112 | ------- 113 | License 114 | ------- 115 | 116 | The EnsTop package is 2-clause BSD licensed. 117 | 118 | ------------ 119 | Contributing 120 | ------------ 121 | 122 | Contributions are more than welcome! There are lots of opportunities 123 | for potential projects, so please get in touch if you would like to 124 | help out. Everything from code to notebooks to 125 | examples and documentation are all *equally valuable* so please don't feel 126 | you can't contribute. To contribute please `fork the project `_ make your changes and 127 | submit a pull request. We will do our best to work through any issues with 128 | you and get your code merged into the main branch. 129 | -------------------------------------------------------------------------------- /enstop/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | from scipy.sparse import issparse, csc_matrix 4 | from sklearn.utils.validation import check_array 5 | from sklearn.preprocessing import normalize as sklearn_normalize 6 | import numbers 7 | 8 | @numba.njit(fastmath=True, nogil=True) 9 | def normalize(ndarray, axis=0): 10 | """Normalize an array with respect to the l1-norm along an axis. Note that this procedure 11 | modifies the array **in place**. 12 | 13 | Parameters 14 | ---------- 15 | ndarray: array of shape (n,m) 16 | The array to be normalized. Must be a 2D array. 17 | 18 | axis: int (optional, default=0) 19 | The axis to normalize with respect to. 0 means normalize columns, 1 means normalize rows. 
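    Examples
    --------
    A small sketch of the in-place behaviour (illustrative only; the input must be
    a 2D float array, and nothing is returned):

    >>> import numpy as np
    >>> arr = np.array([[1.0, 3.0], [2.0, 2.0]])
    >>> normalize(arr, axis=1)
    >>> arr
    array([[0.25, 0.75],
           [0.5 , 0.5 ]])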
20 | """ 21 | # Compute marginal sum along axis 22 | marginal = np.zeros(ndarray.shape[1 - axis]) 23 | for i in range(marginal.shape[0]): 24 | for j in range(ndarray.shape[axis]): 25 | if axis == 0: 26 | marginal[i] += ndarray[j, i] 27 | elif axis == 1: 28 | marginal[i] += ndarray[i, j] 29 | else: 30 | raise ValueError("axis must be 0 or 1") 31 | 32 | # Divide out by the marginal 33 | for i in range(marginal.shape[0]): 34 | for j in range(ndarray.shape[axis]): 35 | if marginal[i] > 0.0: 36 | if axis == 0: 37 | ndarray[j, i] /= marginal[i] 38 | elif axis == 1: 39 | ndarray[i, j] /= marginal[i] 40 | else: 41 | raise ValueError("axis must be 0 or 1") 42 | 43 | 44 | @numba.njit() 45 | def _log_lift(topics, z, empirical_probs, n=-1): 46 | """Internal method to compute the log lift given precomputed empirical probabilities. This 47 | routine is designed to be numba compilable for performance. 48 | 49 | Parameters 50 | ---------- 51 | topics: array of shape (n_topics, n_words) 52 | The topic vectors to evaluate. 53 | 54 | z: int 55 | Which topic vector to evaluate. Must be 56 | in range(0, n_topics). 57 | 58 | empirical_probs: array of shape (n_words,) 59 | The empirical probability of word occurrence. 60 | 61 | n: int (optional, default=-1) 62 | The number of words to average over. If less than 0 it will evaluate over the entire 63 | vocabulary, otherwise it will select the top ``n`` words of the chosen topic. 64 | 65 | Returns 66 | ------- 67 | log_lift: float 68 | The log lift of the ``z``th topic vector. 69 | """ 70 | total_lift = 0.0 71 | if n <= 0: 72 | for w in range(topics.shape[1]): 73 | if empirical_probs[w] > 0: 74 | total_lift += topics[z, w] * 1.0 / empirical_probs[w] 75 | return np.log(total_lift * 1.0 / topics.shape[1]) 76 | else: 77 | top_words = np.argsort(topics[z])[-n:] 78 | for i in range(n): 79 | w = top_words[i] 80 | if empirical_probs[w] > 0: 81 | total_lift += topics[z, w] * 1.0 / empirical_probs[w] 82 | return np.log(total_lift * 1.0 / n) 83 | 84 | 85 | def log_lift(topics, z, data, n_words=-1): 86 | """Compute the log lift of a single topic given empirical data from which empirical 87 | probabilities of word occurrence can be computed. 88 | 89 | Parameters 90 | ---------- 91 | topics: array of shape (n_topics, n_words) 92 | The topic vectors to evaluate. 93 | 94 | z: int 95 | Which topic vector to evaluate. Must be 96 | in range(0, n_topics). 97 | 98 | data: array or sparse matrix of shape (n_docs, n_words,) 99 | The empirical data of word occurrence in a corpus. 100 | 101 | n: int (optional, default=-1) 102 | The number of words to average over. If less than 0 it will evaluate over the entire 103 | vocabulary, otherwise it will select the top ``n`` words of the chosen topic. 104 | 105 | Returns 106 | ------- 107 | log_lift: float 108 | The log lift of the ``z``th topic vector. 109 | """ 110 | normalized_topics = topics.copy() 111 | normalize(normalized_topics, axis=1) 112 | empirical_probs = np.array(data.sum(axis=0)).squeeze().astype(np.float64) 113 | empirical_probs /= empirical_probs.sum() 114 | return _log_lift(normalized_topics, z, empirical_probs, n=n_words) 115 | 116 | 117 | def mean_log_lift(topics, data, n_words=-1): 118 | """Compute the average log lift over all topics given empirical data from which empirical 119 | probabilities of word occurrence can be computed. 120 | 121 | Parameters 122 | ---------- 123 | topics: array of shape (n_topics, n_words) 124 | The topic vectors to evaluate. 
125 | 126 | data: array or sparse matrix of shape (n_docs, n_words,) 127 | The empirical data of word occurrence in a corpus. 128 | 129 | n: int (optional, default=-1) 130 | The number of words to average over. If less than 0 it will evaluate over the entire 131 | vocabulary, otherwise it will select the top ``n`` words of the chosen topic. 132 | 133 | Returns 134 | ------- 135 | log_lift: float 136 | The average log lift over all topic vectors. 137 | """ 138 | normalized_topics = topics.copy() 139 | normalize(normalized_topics, axis=1) 140 | empirical_probs = np.array(data.sum(axis=0)).squeeze().astype(np.float64) 141 | empirical_probs /= empirical_probs.sum() 142 | return np.mean( 143 | [ 144 | _log_lift(topics, z, empirical_probs, n=n_words) 145 | for z in range(topics.shape[0]) 146 | ] 147 | ) 148 | 149 | 150 | @numba.njit() 151 | def arr_intersect(ar1, ar2): 152 | """Numba compilable equivalent of numpy's intersect1d""" 153 | aux = np.concatenate((ar1, ar2)) 154 | aux.sort() 155 | return aux[:-1][aux[1:] == aux[:-1]] 156 | 157 | 158 | @numba.njit() 159 | def _coherence(topics, z, n, indices, indptr, n_docs_per_word): 160 | """Internal routine for computing the coherence of a given topic given raw data and the 161 | number of documents per vocabulary word. This routine makes use of scipy sparse matrix 162 | formats, but to be numba compilable it must make use of internal arrays thereof. 163 | 164 | Parameters 165 | ---------- 166 | topics: array of shape (n_topics, n_words) 167 | The topic vectors for scoring 168 | 169 | z: int 170 | Which topic vector to score. 171 | 172 | n: int 173 | The number of topic words to score against. The top ``n`` words from the ``z``th topic 174 | will be used. 175 | 176 | indices: array of shape (nnz,) 177 | The indices array of a CSC format sparse matrix representation of the corpus data. 178 | 179 | indptr: array of shape(n_words - 1,) 180 | The indptr array of a CSC format sparse matrix representation of the corpus data. 181 | 182 | n_docs_per_word: array of shape (n_words,) 183 | The total number of documents for each vocabulary word (the column sum of the corpus data). 184 | 185 | 186 | Returns 187 | ------- 188 | topic_coherence: float 189 | The coherence score of the ``z``th topic. 190 | """ 191 | top_words = np.argsort(topics[z])[-n:] 192 | coherence = 0.0 193 | for i in range(n - 1): 194 | w = top_words[i] 195 | if n_docs_per_word[w] == 0: 196 | continue 197 | for j in range(i + 1, n): 198 | v = top_words[j] 199 | n_co_occur = arr_intersect( 200 | indices[indptr[w] : indptr[w + 1]], indices[indptr[v] : indptr[v + 1]] 201 | ).shape[0] 202 | coherence += np.log((n_co_occur + 1.0) / n_docs_per_word[w]) 203 | return coherence 204 | 205 | 206 | def coherence(topics, z, data, n_words=20): 207 | """Compute the coherence of a single topic given empirical data. 208 | 209 | Parameters 210 | ---------- 211 | topics: array of shape (n_topics, n_words) 212 | The topic vectors for scoring 213 | 214 | z: int 215 | Which topic vector to score. 216 | 217 | data: array or sparse matrix of shape (n_doc, n_words) 218 | The empirical data of word occurrence in a corpus. 219 | 220 | n_words: int (optional, default=20) 221 | The number of topic words to score against. The top ``n_words`` words from the ``z``th topic 222 | will be used. 223 | 224 | Returns 225 | ------- 226 | topic_coherence: float 227 | The coherence score of the ``z``th topic. 
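    Examples
    --------
    A toy sketch on a tiny hand-built corpus (the matrices here are purely
    illustrative and carry no particular meaning):

    >>> import numpy as np
    >>> data = np.array([[1, 0, 2, 0],
    ...                  [0, 1, 1, 0],
    ...                  [1, 1, 0, 1]])
    >>> topics = np.array([[0.4, 0.3, 0.2, 0.1],
    ...                    [0.1, 0.2, 0.3, 0.4]])
    >>> score = coherence(topics, 0, data, n_words=3)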
228 | """ 229 | if not issparse(data): 230 | csc_data = csc_matrix(data) 231 | else: 232 | csc_data = data.tocsc() 233 | 234 | n_docs_per_word = np.array((data > 0).sum(axis=0)).squeeze() 235 | return _coherence( 236 | topics, z, n_words, csc_data.indices, csc_data.indptr, n_docs_per_word 237 | ) 238 | 239 | 240 | def mean_coherence(topics, data, n_words=20): 241 | """Compute the average coherence of all topics given empirical data. 242 | 243 | Parameters 244 | ---------- 245 | topics: array of shape (n_topics, n_words) 246 | The topic vectors for scoring 247 | 248 | data: array or sparse matrix of shape (n_doc, n_words) 249 | The empirical data of word occurrence in a corpus. 250 | 251 | n_words: int (optional, default=20) 252 | The number of topic words to score against. The top ``n_words`` words of each topic 253 | will be used. 254 | 255 | Returns 256 | ------- 257 | topic_coherence: float 258 | The average coherence score of all the topics. 259 | """ 260 | if not issparse(data): 261 | csc_data = csc_matrix(data) 262 | else: 263 | csc_data = data.tocsc() 264 | 265 | n_docs_per_word = np.array((data > 0).sum(axis=0)).squeeze() 266 | return np.mean( 267 | [ 268 | _coherence( 269 | topics, z, n_words, csc_data.indices, csc_data.indptr, n_docs_per_word 270 | ) 271 | for z in range(topics.shape[0]) 272 | ] 273 | ) 274 | 275 | 276 | def standardize_input(input_matrix): 277 | if input_matrix.dtype in (np.float32, np.float64, np.float, np.double): 278 | return sklearn_normalize(input_matrix, norm="l1") 279 | else: 280 | return input_matrix 281 | 282 | #### 283 | # Taken from sklearn as a fallback option; by default we import their latest version 284 | #### 285 | def _check_sample_weight(sample_weight, X, dtype=None): 286 | """Validate sample weights. 287 | 288 | Note that passing sample_weight=None will output an array of ones. 289 | Therefore, in some cases, you may want to protect the call with: 290 | if sample_weight is not None: 291 | sample_weight = _check_sample_weight(...) 292 | 293 | Parameters 294 | ---------- 295 | sample_weight : {ndarray, Number or None}, shape (n_samples,) 296 | Input sample weights. 297 | 298 | X : nd-array, list or sparse matrix 299 | Input data. 300 | 301 | dtype: dtype 302 | dtype of the validated `sample_weight`. 303 | If None, and the input `sample_weight` is an array, the dtype of the 304 | input is preserved; otherwise an array with the default numpy dtype 305 | is be allocated. If `dtype` is not one of `float32`, `float64`, 306 | `None`, the output will be of dtype `float64`. 307 | 308 | Returns 309 | ------- 310 | sample_weight : ndarray, shape (n_samples,) 311 | Validated sample weight. It is guaranteed to be "C" contiguous. 312 | """ 313 | n_samples = X.shape[0] 314 | 315 | if dtype is not None and dtype not in [np.float32, np.float64]: 316 | dtype = np.float64 317 | 318 | if sample_weight is None: 319 | sample_weight = np.ones(n_samples, dtype=dtype) 320 | elif isinstance(sample_weight, numbers.Number): 321 | sample_weight = np.full(n_samples, sample_weight, dtype=dtype) 322 | else: 323 | if dtype is None: 324 | dtype = [np.float64, np.float32] 325 | sample_weight = check_array( 326 | sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, 327 | order="C" 328 | ) 329 | if sample_weight.ndim != 1: 330 | raise ValueError("Sample weights must be 1D array or scalar") 331 | 332 | if sample_weight.shape != (n_samples,): 333 | raise ValueError("sample_weight.shape == {}, expected {}!" 
334 | .format(sample_weight.shape, (n_samples,))) 335 | return sample_weight 336 | -------------------------------------------------------------------------------- /enstop/distributed_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | from scipy.sparse import issparse, csr_matrix, coo_matrix 7 | 8 | from enstop.utils import normalize, coherence, mean_coherence, log_lift, mean_log_lift 9 | from enstop.plsa import plsa_init 10 | from enstop.block_parallel_plsa import ( 11 | plsa_e_step_on_a_block, 12 | plsa_partial_m_step_on_a_block, 13 | ) 14 | 15 | from dask import delayed, compute, optimize, persist 16 | import dask.array as da 17 | 18 | 19 | @delayed 20 | @numba.njit(nogil=True, fastmath=True) 21 | def plsa_em_step_block_kernel( 22 | row_block, col_block, val_block, p_w_given_z, p_z_given_d, e_step_thresh=1e-32, 23 | ): 24 | result_p_w_given_z = np.zeros_like(p_w_given_z) 25 | result_p_z_given_d = np.zeros_like(p_z_given_d) 26 | result_norm_pwz = np.zeros(p_w_given_z.shape[0], dtype=np.float32) 27 | result_norm_pdz = np.zeros(p_z_given_d.shape[0], dtype=np.float32) 28 | p_z_given_wd_block = np.zeros( 29 | (row_block.shape[0], p_w_given_z.shape[0]), dtype=np.float32 30 | ) 31 | 32 | plsa_e_step_on_a_block( 33 | row_block, 34 | col_block, 35 | p_w_given_z, 36 | p_z_given_d, 37 | p_z_given_wd_block, 38 | e_step_thresh, 39 | ) 40 | plsa_partial_m_step_on_a_block( 41 | row_block, 42 | col_block, 43 | val_block, 44 | result_p_w_given_z, 45 | result_p_z_given_d, 46 | p_z_given_wd_block, 47 | result_norm_pwz, 48 | result_norm_pdz, 49 | ) 50 | 51 | return result_p_w_given_z, result_p_z_given_d, result_norm_pwz, result_norm_pdz 52 | 53 | 54 | def plsa_em_step_dask( 55 | block_rows_ndarray, 56 | block_cols_ndarray, 57 | block_vals_ndarray, 58 | p_w_given_z, 59 | p_z_given_d, 60 | block_row_size, 61 | block_col_size, 62 | e_step_thresh=1e-32, 63 | ): 64 | n_d_blocks = block_rows_ndarray.shape[0] 65 | n_w_blocks = block_rows_ndarray.shape[1] 66 | 67 | n = p_z_given_d.shape[0] 68 | m = p_w_given_z.shape[1] 69 | k = p_z_given_d.shape[1] 70 | 71 | result_p_w_given_z = [[] for i in range(n_w_blocks)] 72 | result_p_z_given_d = [[] for i in range(n_d_blocks)] 73 | result_norm_pwz = [] 74 | result_norm_pdz = [[] for i in range(n_d_blocks)] 75 | 76 | for i in range(n_d_blocks): 77 | 78 | row_start = block_row_size * i 79 | row_end = min(row_start + block_row_size, n) 80 | 81 | for j in range(n_w_blocks): 82 | col_start = block_col_size * j 83 | col_end = min(col_start + block_col_size, m) 84 | 85 | row_block = block_rows_ndarray[i, j] 86 | col_block = block_cols_ndarray[i, j] 87 | val_block = block_vals_ndarray[i, j] 88 | 89 | kernel_results = plsa_em_step_block_kernel( 90 | row_block, 91 | col_block, 92 | val_block, 93 | p_w_given_z[:, col_start:col_end], 94 | p_z_given_d[row_start:row_end, :], 95 | e_step_thresh=e_step_thresh, 96 | ) 97 | 98 | result_p_w_given_z[j].append( 99 | da.from_delayed( 100 | kernel_results[0], (k, block_col_size), dtype=np.float32 101 | ) 102 | ) 103 | result_p_z_given_d[i].append( 104 | da.from_delayed( 105 | kernel_results[1], (block_row_size, k), dtype=np.float32 106 | ) 107 | ) 108 | result_norm_pwz.append( 109 | da.from_delayed(kernel_results[2], (k,), dtype=np.float32) 110 | ) 111 | 112 | result_norm_pdz[i].append( 113 | da.from_delayed(kernel_results[3], 
(block_row_size,), dtype=np.float32) 114 | ) 115 | 116 | p_w_given_z_blocks = [ 117 | da.dstack(result_p_w_given_z[i]).sum(axis=-1) for i in range(n_w_blocks) 118 | ] 119 | p_z_given_d_blocks = [ 120 | da.dstack(result_p_z_given_d[i]).sum(axis=-1) for i in range(n_d_blocks) 121 | ] 122 | norm_pdz_blocks = [ 123 | da.dstack(result_norm_pdz[i]).sum(axis=-1) for i in range(n_d_blocks) 124 | ] 125 | 126 | p_w_given_z = ( 127 | da.hstack(p_w_given_z_blocks) / da.dstack(result_norm_pwz).sum(axis=-1).T 128 | ) 129 | p_z_given_d = da.vstack(p_z_given_d_blocks) / da.hstack(norm_pdz_blocks).T 130 | 131 | result = compute(p_w_given_z, p_z_given_d) 132 | 133 | return result 134 | 135 | 136 | @numba.njit( 137 | locals={ 138 | "i": numba.types.uint16, 139 | "j": numba.types.uint16, 140 | "k": numba.types.intp, 141 | "w": numba.types.uint32, 142 | "d": numba.types.uint32, 143 | "z": numba.types.uint16, 144 | "nz_idx": numba.types.uint32, 145 | "x": numba.types.float32, 146 | "result": numba.types.float32[:, :, ::1], 147 | "p_w_given_d": numba.types.float32, 148 | }, 149 | fastmath=True, 150 | nogil=True, 151 | parallel=True, 152 | ) 153 | def log_likelihood_by_blocks_kernel( 154 | block_rows, 155 | block_cols, 156 | block_vals, 157 | p_w_given_z, 158 | p_z_given_d, 159 | block_row_size, 160 | block_col_size, 161 | i, j, 162 | ): 163 | result = np.zeros((1, 1, 1), dtype=np.float32) 164 | k = p_w_given_z.shape[0] 165 | 166 | for nz_idx in range(block_rows.shape[2]): 167 | if block_rows[0, 0, nz_idx] < 0: 168 | break 169 | 170 | d = block_rows[0, 0, nz_idx] + i * block_row_size 171 | w = block_cols[0, 0, nz_idx] + j * block_col_size 172 | x = block_vals[0, 0, nz_idx] 173 | 174 | p_w_given_d = 0.0 175 | for z in range(k): 176 | p_w_given_d += p_w_given_z[z, w] * p_z_given_d[d, z] 177 | 178 | result[0, 0, 0] += x * np.log(p_w_given_d) 179 | 180 | return result 181 | 182 | def log_likelihood_by_blocks_kernel_wrapper( 183 | block_rows, 184 | block_cols, 185 | block_vals, 186 | p_w_given_z, 187 | p_z_given_d, 188 | block_row_size, 189 | block_col_size, 190 | block_info=None, 191 | ): 192 | i, j, _ = block_info[0]["chunk-location"] 193 | return log_likelihood_by_blocks_kernel( 194 | block_rows, 195 | block_cols, 196 | block_vals, 197 | p_w_given_z, 198 | p_z_given_d, 199 | block_row_size, 200 | block_col_size, 201 | i, j, 202 | ) 203 | 204 | def log_likelihood_by_blocks( 205 | block_rows_ndarray, 206 | block_cols_ndarray, 207 | block_vals_ndarray, 208 | p_w_given_z, 209 | p_z_given_d, 210 | block_row_size, 211 | block_col_size, 212 | ): 213 | 214 | log_likelihood_per_block = da.map_blocks( 215 | log_likelihood_by_blocks_kernel_wrapper, 216 | block_rows_ndarray, 217 | block_cols_ndarray, 218 | block_vals_ndarray, 219 | p_w_given_z, 220 | p_z_given_d, 221 | block_row_size, 222 | block_col_size, 223 | dtype=np.float32, 224 | ) 225 | result = log_likelihood_per_block.sum() 226 | return result.compute() 227 | 228 | 229 | def plsa_fit_inner_dask( 230 | block_rows_ndarray, 231 | block_cols_ndarray, 232 | block_vals_ndarray, 233 | p_w_given_z, 234 | p_z_given_d, 235 | block_row_size, 236 | block_col_size, 237 | n_iter=100, 238 | n_iter_per_test=10, 239 | tolerance=0.001, 240 | e_step_thresh=1e-32, 241 | ): 242 | previous_log_likelihood = log_likelihood_by_blocks( 243 | block_rows_ndarray, 244 | block_cols_ndarray, 245 | block_vals_ndarray, 246 | p_w_given_z, 247 | p_z_given_d, 248 | block_row_size, 249 | block_col_size, 250 | ) 251 | 252 | # block_rows_ndarray, block_cols_ndarray, block_vals_ndarray = persist( 253 | # 
block_rows_ndarray, block_cols_ndarray, block_vals_ndarray 254 | # ) 255 | 256 | for i in range(n_iter): 257 | p_w_given_z, p_z_given_d = plsa_em_step_dask( 258 | block_rows_ndarray, 259 | block_cols_ndarray, 260 | block_vals_ndarray, 261 | p_w_given_z, 262 | p_z_given_d, 263 | block_row_size, 264 | block_col_size, 265 | e_step_thresh=e_step_thresh, 266 | ) 267 | if i % n_iter_per_test == 0: 268 | current_log_likelihood = log_likelihood_by_blocks( 269 | block_rows_ndarray, 270 | block_cols_ndarray, 271 | block_vals_ndarray, 272 | p_w_given_z, 273 | p_z_given_d, 274 | block_row_size, 275 | block_col_size, 276 | ) 277 | change = np.abs(current_log_likelihood - previous_log_likelihood) 278 | if change / np.abs(current_log_likelihood) < tolerance: 279 | break 280 | else: 281 | previous_log_likelihood = current_log_likelihood 282 | 283 | return p_z_given_d, p_w_given_z 284 | 285 | 286 | def plsa_fit( 287 | X, 288 | k, 289 | n_row_blocks=8, 290 | n_col_blocks=8, 291 | init="random", 292 | n_iter=100, 293 | n_iter_per_test=10, 294 | tolerance=0.001, 295 | e_step_thresh=1e-32, 296 | random_state=None, 297 | ): 298 | rng = check_random_state(random_state) 299 | p_z_given_d, p_w_given_z = plsa_init(X, k, init=init, rng=rng) 300 | p_z_given_d = p_z_given_d.astype(np.float32, order="C") 301 | p_w_given_z = p_w_given_z.astype(np.float32, order="C") 302 | 303 | A = X.tocsr().astype(np.float32) 304 | 305 | n = A.shape[0] 306 | m = A.shape[1] 307 | 308 | block_row_size = np.uint32(np.ceil(A.shape[0] / n_row_blocks)) 309 | block_col_size = np.uint32(np.ceil(A.shape[1] / n_col_blocks)) 310 | 311 | A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)] 312 | max_nnz_per_block = 0 313 | for i in range(n_row_blocks): 314 | 315 | row_start = block_row_size * i 316 | row_end = min(row_start + block_row_size, n) 317 | 318 | for j in range(n_col_blocks): 319 | 320 | col_start = block_col_size * j 321 | col_end = min(col_start + block_col_size, m) 322 | 323 | A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo() 324 | if A_blocks[i][j].nnz > max_nnz_per_block: 325 | max_nnz_per_block = A_blocks[i][j].nnz 326 | 327 | del A 328 | 329 | block_rows_ndarray = np.full( 330 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32, 331 | ) 332 | block_cols_ndarray = np.full( 333 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32, 334 | ) 335 | block_vals_ndarray = np.zeros( 336 | (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32, 337 | ) 338 | for i in range(n_row_blocks): 339 | for j in range(n_col_blocks): 340 | nnz = A_blocks[i][j].nnz 341 | block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row 342 | block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col 343 | block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data 344 | 345 | del A_blocks 346 | 347 | block_rows_ndarray = da.from_array( 348 | block_rows_ndarray, chunks=(1, 1, max_nnz_per_block), 349 | ) 350 | block_cols_ndarray = da.from_array( 351 | block_cols_ndarray, chunks=(1, 1, max_nnz_per_block), 352 | ) 353 | block_vals_ndarray = da.from_array( 354 | block_vals_ndarray, chunks=(1, 1, max_nnz_per_block), 355 | ) 356 | 357 | p_z_given_d, p_w_given_z = plsa_fit_inner_dask( 358 | block_rows_ndarray, 359 | block_cols_ndarray, 360 | block_vals_ndarray, 361 | p_w_given_z, 362 | p_z_given_d, 363 | block_row_size, 364 | block_col_size, 365 | n_iter=n_iter, 366 | n_iter_per_test=n_iter_per_test, 367 | tolerance=tolerance, 368 | e_step_thresh=e_step_thresh, 369 | ) 370 | 371 | return p_z_given_d, p_w_given_z 372 | 
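# Usage sketch for the module-level ``plsa_fit`` helper above. This is purely
# illustrative: the toy matrix, block counts and topic count are hypothetical
# choices, and a working dask + numba environment is assumed. The guard keeps
# the example from running on import.
if __name__ == "__main__":
    rng = np.random.RandomState(42)

    # Small non-negative count matrix; bump one column so no document is empty.
    demo_counts = rng.poisson(0.5, size=(64, 40))
    demo_counts[:, 0] += 1
    X_demo = csr_matrix(demo_counts)

    # Two row blocks and two column blocks divide the 64 x 40 matrix evenly.
    doc_topic, topic_word = plsa_fit(
        X_demo,
        k=5,
        n_row_blocks=2,
        n_col_blocks=2,
        n_iter=20,
        random_state=42,
    )
    # doc_topic is p(z|d) with shape (64, 5); topic_word is p(w|z) with shape (5, 40).
    print(doc_topic.shape, topic_word.shape)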
373 | 374 | class DistributedPLSA(BaseEstimator, TransformerMixin): 375 | def __init__( 376 | self, 377 | n_components=10, 378 | init="random", 379 | n_row_blocks=8, 380 | n_col_blocks=8, 381 | n_iter=100, 382 | n_iter_per_test=10, 383 | tolerance=0.001, 384 | e_step_thresh=1e-32, 385 | transform_random_seed=42, 386 | random_state=None, 387 | ): 388 | 389 | self.n_components = n_components 390 | self.init = init 391 | self.n_row_blocks = n_row_blocks 392 | self.n_col_blocks = n_col_blocks 393 | self.n_iter = n_iter 394 | self.n_iter_per_test = n_iter_per_test 395 | self.tolerance = tolerance 396 | self.e_step_thresh = e_step_thresh 397 | self.transform_random_seed = transform_random_seed 398 | self.random_state = random_state 399 | 400 | def fit(self, X, y=None, sample_weight=None): 401 | """Learn the pLSA model for the data X and return the document vectors. 402 | 403 | This is more efficient than calling fit followed by transform. 404 | 405 | Parameters 406 | ---------- 407 | X: array or sparse matrix of shape (n_docs, n_words) 408 | The data matrix pLSA is attempting to fit to. 409 | 410 | y: Ignored 411 | 412 | sample_weight: array of shape (n_docs,) 413 | Input document weights. 414 | 415 | Returns 416 | ------- 417 | self 418 | """ 419 | self.fit_transform(X, sample_weight=sample_weight) 420 | return self 421 | 422 | def fit_transform(self, X, y=None, sample_weight=None): 423 | """Learn the pLSA model for the data X and return the document vectors. 424 | 425 | This is more efficient than calling fit followed by transform. 426 | 427 | Parameters 428 | ---------- 429 | X: array or sparse matrix of shape (n_docs, n_words) 430 | The data matrix pLSA is attempting to fit to. 431 | 432 | y: Ignored 433 | 434 | sample_weight: array of shape (n_docs,) 435 | Input document weights. 436 | 437 | Returns 438 | ------- 439 | embedding: array of shape (n_docs, n_topics) 440 | An embedding of the documents into a topic space. 
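        Examples
        --------
        A minimal, hypothetical sketch; ``X_counts`` stands in for any sparse
        document-word count matrix, and a working dask environment is assumed:

        >>> model = DistributedPLSA(n_components=10, n_row_blocks=4, n_col_blocks=4)
        >>> doc_vectors = model.fit_transform(X_counts)  # shape (n_docs, 10)
        >>> topic_vectors = model.components_            # shape (10, n_words)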
441 | """ 442 | 443 | X = check_array(X, accept_sparse="csr") 444 | 445 | if not issparse(X): 446 | X = csr_matrix(X) 447 | 448 | if sample_weight is not None: 449 | NotImplementedError("Sample weights not supported in distributed") 450 | # sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 451 | 452 | if np.any(X.data < 0): 453 | raise ValueError( 454 | "PLSA is only valid for matrices with non-negative " "entries" 455 | ) 456 | 457 | row_sums = np.array(X.sum(axis=1).T)[0] 458 | good_rows = row_sums != 0 459 | 460 | if not np.all(good_rows): 461 | zero_rows_found = True 462 | data_for_fitting = X[good_rows] 463 | else: 464 | zero_rows_found = False 465 | data_for_fitting = X 466 | 467 | U, V = plsa_fit( 468 | data_for_fitting, 469 | self.n_components, 470 | n_row_blocks=self.n_row_blocks, 471 | n_col_blocks=self.n_col_blocks, 472 | init=self.init, 473 | n_iter=self.n_iter, 474 | n_iter_per_test=self.n_iter_per_test, 475 | tolerance=self.tolerance, 476 | e_step_thresh=self.e_step_thresh, 477 | random_state=self.random_state, 478 | ) 479 | 480 | if zero_rows_found: 481 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 482 | self.embedding_[good_rows] = U 483 | else: 484 | self.embedding_ = U 485 | 486 | self.components_ = V 487 | self.training_data_ = X 488 | 489 | return self.embedding_ 490 | -------------------------------------------------------------------------------- /enstop/cuda_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | import numba.cuda as cuda 4 | 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.utils import check_array, check_random_state 7 | from sklearn.utils.validation import _check_sample_weight 8 | from scipy.sparse import issparse, csr_matrix, coo_matrix 9 | 10 | from enstop.utils import ( 11 | normalize, 12 | coherence, 13 | mean_coherence, 14 | log_lift, 15 | mean_log_lift, 16 | standardize_input, 17 | ) 18 | from enstop.plsa import plsa_init 19 | from enstop.block_parallel_plsa import log_likelihood_by_blocks 20 | 21 | 22 | @cuda.jit() 23 | def plsa_e_step( 24 | block_rows, 25 | block_cols, 26 | p_w_given_z_block, 27 | p_z_given_d_block, 28 | p_z_given_wd_block, 29 | e_step_thresh, 30 | ): 31 | i = cuda.blockIdx.x 32 | j = cuda.blockIdx.y 33 | nz_offset = cuda.threadIdx.x 34 | threads_per_blocks = cuda.blockDim.x 35 | k = p_z_given_d_block[i].shape[1] 36 | nnz = block_rows.shape[2] 37 | 38 | n_passes = (nnz // threads_per_blocks) + 1 39 | 40 | for n in range(n_passes): 41 | nz_idx = n * threads_per_blocks + nz_offset 42 | if nz_idx < nnz: 43 | if block_rows[i, j, nz_idx] < 0: 44 | break 45 | 46 | d = block_rows[i, j, nz_idx] 47 | w = block_cols[i, j, nz_idx] 48 | 49 | norm = 0.0 50 | for z in range(k): 51 | v = p_w_given_z_block[j, z, w] * p_z_given_d_block[i, d, z] 52 | if v > e_step_thresh: 53 | p_z_given_wd_block[i, j, nz_idx, z] = v 54 | norm += v 55 | else: 56 | p_z_given_wd_block[i, j, nz_idx, z] = 0.0 57 | 58 | for z in range(k): 59 | if norm > 0.0: 60 | p_z_given_wd_block[i, j, nz_idx, z] /= norm 61 | 62 | 63 | @cuda.jit() 64 | def plsa_partial_m_step( 65 | block_rows, 66 | block_cols, 67 | block_vals, 68 | p_w_given_z_block, 69 | p_z_given_d_block, 70 | result_p_w_given_z_block, 71 | result_p_z_given_d_block, 72 | p_z_given_wd_block, 73 | pwz_norms, 74 | ): 75 | z = cuda.threadIdx.x 76 | i = cuda.blockIdx.x 77 | j = cuda.blockIdx.y 78 | k = p_z_given_d_block[i].shape[1] 79 | nnz = block_rows.shape[2] 80 | 81 | 
if z < k: 82 | 83 | result_p_w_given_z_block[i, j, z, :] = 0.0 84 | result_p_z_given_d_block[j, i, :, z] = 0.0 85 | pwz_norms[i, j, z] = 0.0 86 | 87 | for nz_idx in range(block_rows[i, j].shape[0]): 88 | if block_rows[i, j, nz_idx] < 0: 89 | break 90 | 91 | d = block_rows[i, j, nz_idx] 92 | w = block_cols[i, j, nz_idx] 93 | x = block_vals[i, j, nz_idx] 94 | 95 | s = x * p_z_given_wd_block[i, j, nz_idx, z] 96 | 97 | result_p_w_given_z_block[i, j, z, w] += s 98 | result_p_z_given_d_block[j, i, d, z] += s 99 | 100 | pwz_norms[i, j, z] += s 101 | 102 | 103 | @cuda.jit() 104 | def normalize_m_step_p_z_given_d(blocked_next_p_z_given_d, p_z_given_d): 105 | d_offset = cuda.threadIdx.x 106 | i = cuda.blockIdx.x 107 | threads_per_block = cuda.blockDim.x 108 | k = p_z_given_d[i].shape[1] 109 | n_passes = ((p_z_given_d.shape[0] * p_z_given_d.shape[1]) // threads_per_block) + 1 110 | 111 | for n in range(n_passes): 112 | d = threads_per_block * n + d_offset 113 | if ( 114 | i < blocked_next_p_z_given_d.shape[1] 115 | and d < blocked_next_p_z_given_d.shape[2] 116 | ): 117 | norm = 0.0 118 | for z in range(k): 119 | p_z_given_d[i, d, z] = 0.0 120 | for j in range(blocked_next_p_z_given_d.shape[0]): 121 | p_z_given_d[i, d, z] += blocked_next_p_z_given_d[j, i, d, z] 122 | norm += blocked_next_p_z_given_d[j, i, d, z] 123 | for z in range(k): 124 | p_z_given_d[i, d, z] /= norm 125 | 126 | 127 | @cuda.jit() 128 | def normalize_m_step_p_w_given_z(blocked_next_p_w_given_z, p_w_given_z, pwz_norms): 129 | w_offset = cuda.threadIdx.x 130 | i = cuda.blockIdx.x 131 | threads_per_block = cuda.blockDim.x 132 | k = p_w_given_z[i].shape[0] 133 | n_passes = ((p_w_given_z.shape[0] * p_w_given_z.shape[2]) // threads_per_block) + 1 134 | 135 | norms = cuda.local.array(1024, numba.float64) 136 | for z in range(k): 137 | norms[z] = 0.0 138 | for p in range(pwz_norms.shape[0]): 139 | for q in range(pwz_norms.shape[1]): 140 | for z in range(k): 141 | norms[z] += pwz_norms[p, q, z] 142 | 143 | for n in range(n_passes): 144 | w = n * threads_per_block + w_offset 145 | if ( 146 | i < blocked_next_p_w_given_z.shape[1] 147 | and w < blocked_next_p_w_given_z.shape[3] 148 | ): 149 | for z in range(k): 150 | p_w_given_z[i, z, w] = 0.0 151 | for j in range(blocked_next_p_w_given_z.shape[0]): 152 | p_w_given_z[i, z, w] += blocked_next_p_w_given_z[j, i, z, w] 153 | for z in range(k): 154 | p_w_given_z[i, z, w] /= norms[z] 155 | 156 | 157 | def plsa_fit( 158 | data, 159 | k, 160 | n_row_blocks=8, 161 | n_col_blocks=8, 162 | init="random", 163 | n_iter=100, 164 | n_iter_per_test=10, 165 | tolerance=0.001, 166 | e_step_thresh=1e-32, 167 | random_state=None, 168 | ): 169 | rng = check_random_state(random_state) 170 | p_z_given_d_init, p_w_given_z_init = plsa_init(data, k, init=init, rng=rng) 171 | 172 | A = data.tocsr().astype(np.float32) 173 | 174 | n = A.shape[0] 175 | m = A.shape[1] 176 | 177 | block_row_size = np.uint16(np.ceil(A.shape[0] / n_row_blocks)) 178 | block_col_size = np.uint16(np.ceil(A.shape[1] / n_col_blocks)) 179 | 180 | p_z_given_d = np.zeros((block_row_size * n_row_blocks, k), dtype=np.float32) 181 | p_z_given_d[: p_z_given_d_init.shape[0]] = p_z_given_d_init 182 | p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k) 183 | 184 | p_w_given_z = np.zeros((k, block_col_size * n_col_blocks), dtype=np.float32) 185 | p_w_given_z[:, : p_w_given_z_init.shape[1]] = p_w_given_z_init 186 | p_w_given_z = np.transpose( 187 | p_w_given_z.T.reshape(n_col_blocks, block_col_size, k), axes=[0, 2, 1] 188 | ).astype(np.float32, 
order="C") 189 | 190 | A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)] 191 | max_nnz_per_block = 0 192 | for i in range(n_row_blocks): 193 | 194 | row_start = block_row_size * i 195 | row_end = min(row_start + block_row_size, n) 196 | 197 | for j in range(n_col_blocks): 198 | 199 | col_start = block_col_size * j 200 | col_end = min(col_start + block_col_size, m) 201 | 202 | A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo() 203 | if A_blocks[i][j].nnz > max_nnz_per_block: 204 | max_nnz_per_block = A_blocks[i][j].nnz 205 | 206 | block_rows_ndarray = np.full( 207 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 208 | ) 209 | block_cols_ndarray = np.full( 210 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 211 | ) 212 | block_vals_ndarray = np.zeros( 213 | (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32 214 | ) 215 | for i in range(n_row_blocks): 216 | for j in range(n_col_blocks): 217 | nnz = A_blocks[i][j].nnz 218 | block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row 219 | block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col 220 | block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data 221 | 222 | n_d_blocks = block_rows_ndarray.shape[0] 223 | n_w_blocks = block_rows_ndarray.shape[1] 224 | block_size = block_rows_ndarray.shape[2] 225 | 226 | p_z_given_wd_block = np.zeros( 227 | (n_d_blocks, n_w_blocks, block_size, k), dtype=np.float32 228 | ) 229 | 230 | blocked_next_p_w_given_z = np.zeros( 231 | ( 232 | np.int64(n_d_blocks), 233 | np.int64(n_w_blocks), 234 | np.int64(k), 235 | np.int64(block_col_size), 236 | ), 237 | dtype=np.float32, 238 | ) 239 | blocked_next_p_z_given_d = np.zeros( 240 | ( 241 | np.int64(n_w_blocks), 242 | np.int64(n_d_blocks), 243 | np.int64(block_row_size), 244 | np.int64(k), 245 | ), 246 | dtype=np.float32, 247 | ) 248 | norms_pwz = np.zeros((n_d_blocks, n_w_blocks, k), dtype=np.float64) 249 | 250 | previous_log_likelihood = log_likelihood_by_blocks( 251 | block_rows_ndarray, 252 | block_cols_ndarray, 253 | block_vals_ndarray, 254 | p_w_given_z, 255 | p_z_given_d, 256 | ) 257 | 258 | d_block_rows_ndarray = cuda.to_device(block_rows_ndarray) 259 | d_block_cols_ndarray = cuda.to_device(block_cols_ndarray) 260 | d_block_vals_ndarray = cuda.to_device(block_vals_ndarray) 261 | d_blocked_next_p_w_given_z = cuda.to_device(blocked_next_p_w_given_z) 262 | d_blocked_next_p_z_given_d = cuda.to_device(blocked_next_p_z_given_d) 263 | d_p_z_given_wd_block = cuda.to_device(p_z_given_wd_block) 264 | d_p_w_given_z = cuda.to_device(p_w_given_z) 265 | d_p_z_given_d = cuda.to_device(p_z_given_d) 266 | d_norms_pwz = cuda.to_device(norms_pwz) 267 | 268 | n_d = p_z_given_d.shape[1] 269 | n_w = p_w_given_z.shape[2] 270 | 271 | for i in range(n_iter // n_iter_per_test): 272 | for j in range(n_iter_per_test): 273 | plsa_e_step[(n_d_blocks, n_w_blocks), 256]( 274 | d_block_rows_ndarray, 275 | d_block_cols_ndarray, 276 | d_p_w_given_z, 277 | d_p_z_given_d, 278 | d_p_z_given_wd_block, 279 | e_step_thresh, 280 | ) 281 | cuda.synchronize() 282 | plsa_partial_m_step[(n_d_blocks, n_w_blocks), k]( 283 | d_block_rows_ndarray, 284 | d_block_cols_ndarray, 285 | d_block_vals_ndarray, 286 | d_p_w_given_z, 287 | d_p_z_given_d, 288 | d_blocked_next_p_w_given_z, 289 | d_blocked_next_p_z_given_d, 290 | d_p_z_given_wd_block, 291 | d_norms_pwz, 292 | ) 293 | cuda.synchronize() 294 | normalize_m_step_p_z_given_d[n_d_blocks, 256]( 295 | d_blocked_next_p_z_given_d, d_p_z_given_d 296 | ) 297 | 
normalize_m_step_p_w_given_z[n_w_blocks, 256]( 298 | d_blocked_next_p_w_given_z, d_p_w_given_z, d_norms_pwz 299 | ) 300 | cuda.synchronize() 301 | 302 | p_z_given_d = d_p_z_given_d.copy_to_host() 303 | p_w_given_z = d_p_w_given_z.copy_to_host() 304 | current_log_likelihood = log_likelihood_by_blocks( 305 | block_rows_ndarray, 306 | block_cols_ndarray, 307 | block_vals_ndarray, 308 | p_w_given_z, 309 | p_z_given_d, 310 | ) 311 | change = np.abs(current_log_likelihood - previous_log_likelihood) 312 | if change / np.abs(current_log_likelihood) < tolerance: 313 | break 314 | else: 315 | previous_log_likelihood = current_log_likelihood 316 | 317 | for i in range(n_iter % n_iter_per_test): 318 | plsa_e_step[(n_d_blocks, n_w_blocks), 256]( 319 | d_block_rows_ndarray, 320 | d_block_cols_ndarray, 321 | d_p_w_given_z, 322 | d_p_z_given_d, 323 | d_p_z_given_wd_block, 324 | e_step_thresh, 325 | ) 326 | cuda.synchronize() 327 | plsa_partial_m_step[(n_d_blocks, n_w_blocks), k]( 328 | d_block_rows_ndarray, 329 | d_block_cols_ndarray, 330 | d_block_vals_ndarray, 331 | d_p_w_given_z, 332 | d_p_z_given_d, 333 | d_blocked_next_p_w_given_z, 334 | d_blocked_next_p_z_given_d, 335 | d_p_z_given_wd_block, 336 | d_norms_pwz, 337 | ) 338 | cuda.synchronize() 339 | normalize_m_step_p_z_given_d[n_d_blocks, 256]( 340 | d_blocked_next_p_z_given_d, d_p_z_given_d 341 | ) 342 | normalize_m_step_p_w_given_z[n_w_blocks, 256]( 343 | d_blocked_next_p_w_given_z, d_p_w_given_z, d_norms_pwz 344 | ) 345 | cuda.synchronize() 346 | 347 | p_z_given_d = d_p_z_given_d.copy_to_host() 348 | p_w_given_z = d_p_w_given_z.copy_to_host() 349 | 350 | p_z_given_d = np.vstack(p_z_given_d)[:n, :] 351 | p_w_given_z = np.hstack(p_w_given_z)[:, :m] 352 | 353 | return p_z_given_d, p_w_given_z 354 | 355 | 356 | class GPUPLSA(BaseEstimator, TransformerMixin): 357 | def __init__( 358 | self, 359 | n_components=10, 360 | init="random", 361 | n_row_blocks=8, 362 | n_col_blocks=8, 363 | n_iter=100, 364 | n_iter_per_test=10, 365 | tolerance=0.001, 366 | e_step_thresh=1e-32, 367 | transform_random_seed=42, 368 | random_state=None, 369 | ): 370 | 371 | self.n_components = n_components 372 | self.init = init 373 | self.n_row_blocks = n_row_blocks 374 | self.n_col_blocks = n_col_blocks 375 | self.n_iter = n_iter 376 | self.n_iter_per_test = n_iter_per_test 377 | self.tolerance = tolerance 378 | self.e_step_thresh = e_step_thresh 379 | self.transform_random_seed = transform_random_seed 380 | self.random_state = random_state 381 | 382 | def fit(self, X, y=None, sample_weight=None): 383 | """Learn the pLSA model for the data X and return the document vectors. 384 | 385 | This is more efficient than calling fit followed by transform. 386 | 387 | Parameters 388 | ---------- 389 | X: array or sparse matrix of shape (n_docs, n_words) 390 | The data matrix pLSA is attempting to fit to. 391 | 392 | y: Ignored 393 | 394 | sample_weight: array of shape (n_docs,) 395 | Input document weights. 396 | 397 | Returns 398 | ------- 399 | self 400 | """ 401 | self.fit_transform(X, sample_weight=sample_weight) 402 | return self 403 | 404 | def fit_transform(self, X, y=None, sample_weight=None): 405 | """Learn the pLSA model for the data X and return the document vectors. 406 | 407 | This is more efficient than calling fit followed by transform. 408 | 409 | Parameters 410 | ---------- 411 | X: array or sparse matrix of shape (n_docs, n_words) 412 | The data matrix pLSA is attempting to fit to. 
413 | 414 | y: Ignored 415 | 416 | sample_weight: array of shape (n_docs,) 417 | Input document weights. 418 | 419 | Returns 420 | ------- 421 | embedding: array of shape (n_docs, n_topics) 422 | An embedding of the documents into a topic space. 423 | """ 424 | 425 | X = check_array(X, accept_sparse="csr") 426 | X = standardize_input(X) 427 | 428 | if not issparse(X): 429 | X = csr_matrix(X) 430 | 431 | sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 432 | 433 | if np.any(X.data < 0): 434 | raise ValueError( 435 | "PLSA is only valid for matrices with non-negative " "entries" 436 | ) 437 | 438 | row_sums = np.array(X.sum(axis=1).T)[0] 439 | good_rows = row_sums != 0 440 | 441 | if not np.all(good_rows): 442 | zero_rows_found = True 443 | data_for_fitting = X[good_rows] 444 | else: 445 | zero_rows_found = False 446 | data_for_fitting = X 447 | 448 | U, V = plsa_fit( 449 | data_for_fitting, 450 | self.n_components, 451 | n_row_blocks=self.n_row_blocks, 452 | n_col_blocks=self.n_col_blocks, 453 | init=self.init, 454 | n_iter=self.n_iter, 455 | n_iter_per_test=self.n_iter_per_test, 456 | tolerance=self.tolerance, 457 | e_step_thresh=self.e_step_thresh, 458 | random_state=self.random_state, 459 | ) 460 | 461 | if zero_rows_found: 462 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 463 | self.embedding_[good_rows] = U 464 | else: 465 | self.embedding_ = U 466 | 467 | self.components_ = V 468 | self.training_data_ = X 469 | 470 | return self.embedding_ 471 | -------------------------------------------------------------------------------- /notebooks/EnsTop with 20-Newsgroups.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Topic Modeling for 20-Newsgroups\n", 8 | "\n", 9 | "There are several approaches to topic modeling. The most popular options are Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF). In this notebook we will use the 20-newsgroups dataset to compare these methods with probabilistic Latent Semantic Analysis (pLSA) and ensemble topic modeling (EnsTop) from the enstop library. This is not meant to be a particularly complete or comprehensive comparison, but rather a means to show how the enstop library works, and provide a quick comparison to other popular approaches.\n", 10 | "\n", 11 | "First we'll need the requisite libraries. Fortunately sklearn has a function to get the 20-newsgroups dataset, a CountVectorizer which can convert the raw text data into bag-of-words based count matrix, and implementations of both LDA and NMF. We'll of course also need the PLSA and EnsembleTopics classes from the enstop library." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import warnings; warnings.simplefilter('ignore') # Suppress deprecation warnings" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn.datasets import fetch_20newsgroups\n", 30 | "from sklearn.feature_extraction.text import CountVectorizer\n", 31 | "from sklearn.decomposition import NMF, LatentDirichletAllocation\n", 32 | "from enstop import EnsembleTopics, PLSA" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "The next step is getting the data. For this we can just use sklearn. 
First the ``fetch_20newsgroups`` function will download the 20-newsgroups data. By specifying ``subset='all'`` we collect the full dataset rather than either a train or test set. The next step in the process is to convert this text data into a form that can be consumed by LDA, NMF, PLSA and EnsembleTopics. The required format, in this case, is a matrix where the (i,j)th entry is the count of the number of times the jth word in the vocabulary occurs in the ith document (in this case each document is a newsgroup post). This can be done extremely efficiently using sklearn's ``CountVectorizer``. We'll pass two extra parameters to the ``CountVectorizer``: a setting of ``min_df=5`` which will restrict the vocabulary to words that occur at least 5 times in the entire corpus; and ``stop_words='english'`` which will eliminate common words (like \"the\", \"and\", etc.) according to a dictionary of such words in English." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "news = fetch_20newsgroups(subset='all')\n", 49 | "data = CountVectorizer(min_df=5, stop_words='english').fit_transform(news.data)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Time to fit a model\n", 57 | "\n", 58 | "Now that we have the data in order, let's fit the various topic models and time them to see how long they take to fit. First up is LDA. The only parameter that requires tuning in this case is the number of topics we want to have. As a reasonable guess we'll choose 20 (the number of different newsgroups in the dataset)." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "CPU times: user 5min 16s, sys: 4.4 s, total: 5min 20s\n", 71 | "Wall time: 2min 54s\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "%%time\n", 77 | "lda_model = LatentDirichletAllocation(n_components=20).fit(data)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Just under three minutes on my laptop -- not bad at all. It is worth noting the total CPU time, which came in at over five minutes, demonstrating that the implementation is making good use of parallelism, especially considering this is running on a two core processor.\n", 85 | "\n", 86 | "Next up is NMF. In this case we need a few extra parameters for the sklearn implementation. By default the sklearn NMF uses Frobenius loss -- essentially the total squared error between the data matrix and the reconstruction from the product of two low rank matrices (with positive entries). While this is suitable for many uses it isn't the right loss for topic modeling. Instead we want to use the Kullback-Leibler loss, which models the data as a set of independent Poisson distributions -- essentially it views the data as counts (which they are), and seeks the reconstruction from the product of two low rank matrices to provide Poisson parameters that maximise the likelihood of seeing the data. Having changed the loss function we also need to change the solver from the classical coordinate descent to the multiplicative update based solver which can work with KL loss. All of this makes the NMF fitting process much slower, but it provides more accurate results for the purposes of topic modelling."
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "CPU times: user 3min 47s, sys: 1min 27s, total: 5min 15s\n", 99 | "Wall time: 3min 46s\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "%%time\n", 105 | "nmf_model = NMF(n_components=20, beta_loss='kullback-leibler', solver='mu').fit(data)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Around three and half minutes, so slower than LDA in this case. In general, especially for larger datasets than this, NMF will often tend to be as fast or sometimes even faster than LDA. In this case, however, it is a little slower. It's again worth noting the CPU time: over five minutes. Again, the implementation is making good use of parallelism on the two core processor.\n", 113 | "\n", 114 | "Next let's try EnsembleTopics. In this case we will specify ``n_components=20`` as with LDA and NMF, but this time that is more of a suggestion. EnsembleTopics will attempt to find a \"natural\" number of topics. Given that this is a small dataset we will also reduce the overall work to be done via the ``n_starts`` parameter, which specifies how many bootstrap runs of pLSA to try; for small data like this eight runs will likely suffice rather than the default 15. It is also beneficial to scale the parallelism a little -- since the processor only has two cores it is best not to overtax it with too many jobs at once." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "CPU times: user 7min 12s, sys: 4.28 s, total: 7min 17s\n", 127 | "Wall time: 3min 3s\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "%%time\n", 133 | "ens_model = EnsembleTopics(n_components=20, n_starts=8, n_jobs=2).fit(data)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Coming in at just over three minutes this among the slower of the options we've tried here. On the other hand, as with NMF, EnsembleTopics will scale up well, and would likely look better on larger datasets. It is, however, in the right ballpark, which is somewhat reassuring. When we look at how well the topic modeling performed on the data this extra time might seem more worthwhile.\n", 141 | "\n", 142 | "Lastly let's look at pLSA. Historically pLSA is a precursor to LDA which came out a couple of years later and added Bayesian priors and more robust statistical foundations. On the other hand the pLSA algorithm itself is surprisingly simple, and with a little care high performance implementations are not hard to write. Given an efficient Expectation-Maximization optimizer it can potentially even find better solutions than a somewhat more complex LDA optimization. Using pLSA from enstop is just as easy as LDA in sklearn -- tell it the number of topics you want and set it going." 
143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "CPU times: user 26.3 s, sys: 325 ms, total: 26.6 s\n", 155 | "Wall time: 14.7 s\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "%%time\n", 161 | "plsa_model = PLSA(n_components=20).fit(data)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "A mere fifteen seconds! Decidedly faster than LDA, and it will scale well in terms of dataset size (but may scale less well in the desired number of topics). At the very least, given its speed, pLSA is a contender in the topic modeling space. Also worth noting is that this performance was achieved despite being a completely serial implementation -- the CPU time is the same as the wall time in this case.\n", 169 | "\n", 170 | "Now, having looked at how long it takes the algorithms to run, the next question is: how good are they? A fast algorithm that does a poor job is not worth much." 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## Quality measures of topic models\n", 178 | "\n", 179 | "There are numerous ways to measure quality in topic models, including perplexity based approaches, lift, and coherence. Each technique has its pros and cons, as with any unsupervised task evaluation. We will attempt to sidestep some of these issues by evaluating the topic modeling approaches at a downstream task -- how well does the topic space categorise the different documents. Since the documents have defined labels (which newsgroup they were posted to) we have ground-truth to compare to. Since we can express the documents in terms of the learned topic space we can \"classify\" a document as the strongest topic associated to that document. Given two classifications we can then score how well these match via [adjusted Rand score](https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index) or [adjusted mutual information](https://en.wikipedia.org/wiki/Adjusted_mutual_information). Fortunately sklearn has implementations for both metrics. We'll also load numpy so we can extract the index of the most likely topic for each document." 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 8, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score\n", 189 | "import numpy as np" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "Next we need to place the documents in topic space. For NMF and LDA we can use the transform function. We could do the same for EnsembleTopics and pLSA, but since they store the document embedding in topic space of the training set as the ``embedding_`` attribute we can save work and just use that. Next we need to determine which topic is the most likely for each document -- this is just a matter of computing the argmax for each row of the embedded document matrix." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 9, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "nmf_doc_vectors = nmf_model.transform(data)\n", 206 | "nmf_clusters = np.argmax(nmf_doc_vectors, axis=1)\n", 207 | "lda_doc_vectors = lda_model.transform(data)\n", 208 | "lda_clusters = np.argmax(lda_doc_vectors, axis=1)\n", 209 | "ens_doc_vectors = ens_model.embedding_\n", 210 | "ens_clusters = np.argmax(ens_doc_vectors, axis=1)\n", 211 | "plsa_doc_vectors = plsa_model.embedding_\n", 212 | "plsa_clusters = np.argmax(plsa_doc_vectors, axis=1)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Now that we have classified the documents according to the topic space we can consider how well that classification compares to the actual ground-truth classification. Both adjusted Rand score and adjusted mutual information provide scores between 0 and 1 such that 0 represents an essentially random assignment (in comparison to the ground truth) and 1 represents a perfect matching with the ground truth. Obviosuly higher scores are better.\n", 220 | "\n", 221 | "We'll start with NMF." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "NMF Adjusted Rand: 0.151495442288548\n", 234 | "NMF Adjusted Mutual Information: 0.322145856972107\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "print(\"NMF Adjusted Rand: \", adjusted_rand_score(news.target, nmf_clusters))\n", 240 | "print(\"NMF Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, nmf_clusters))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "These scores are not great -- but the 20-newsgroups dataset is somewhat nontrivial (with several closely related newsgroups). Still, we can hope that some of the other techniques may have fared better.\n", 248 | "\n", 249 | "Next let's look at how the LDA model performed." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "LDA Adjusted Rand: 0.22301812908887647\n", 262 | "LDA Adjusted Mutual Information: 0.3660410130009368\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "print(\"LDA Adjusted Rand: \", adjusted_rand_score(news.target, lda_clusters))\n", 268 | "print(\"LDA Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, lda_clusters))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "A definite improvement over NMF, but still below what we might ideally like. On the other hand LDA is considered the go-to state-of-the-art technique for topic modeling, so perhaps this is the best we can hope to do with this corpus and the (rather limited) amount of text-preprocessing we have done.\n", 276 | "\n", 277 | "Let's try pLSA next and see how it managed to do, given that it ran so very quickly." 
278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 12, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "pLSA Adjusted Rand: 0.2764225648913671\n", 290 | "pLSA Adjusted Mutual Information: 0.43413462309828155\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "print(\"pLSA Adjusted Rand: \", adjusted_rand_score(news.target, plsa_clusters))\n", 296 | "print(\"pLSA Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, plsa_clusters))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "It seems that pLSA outperformed LDA on this particular task, and by a reasonable amount. While the rand score is still fairly low the mutual information indicates that we are almost getting into a range that might be considered reasonable.\n", 304 | "\n", 305 | "Finally let's see what the extra work of ensembling can buy us." 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 13, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "EnsTop Adjusted Rand: 0.33676056267373145\n", 318 | "EnsTop Adjusted Mutual Information: 0.47842663849608985\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "print(\"EnsTop Adjusted Rand: \", adjusted_rand_score(news.target, ens_clusters))\n", 324 | "print(\"EnsTop Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, ens_clusters))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "It seems that running several pLSA runs and looking for stable topics actually manages to produce much better topics, at least for classifying the 20-newsgroups posts. The even better news is that, despite getting the benefits of the pLSA approach, since the ensemble is built from bootstrap samples of the corpus we actually expect this to generalise better than the pure pLSA approach.\n", 332 | "\n", 333 | "We claim that, at least for this small example, EnsembleTopics is clearly the best approach for topic modeling." 
334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.7.5" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 2 358 | } 359 | -------------------------------------------------------------------------------- /enstop/block_parallel_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | from sklearn.utils.validation import _check_sample_weight 7 | from scipy.sparse import issparse, csr_matrix, coo_matrix 8 | 9 | from enstop.utils import ( 10 | normalize, 11 | coherence, 12 | mean_coherence, 13 | log_lift, 14 | mean_log_lift, 15 | standardize_input, 16 | ) 17 | from enstop.plsa import plsa_init 18 | 19 | 20 | @numba.njit( 21 | [ 22 | "f4[:,::1](i4[::1],i4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4)", 23 | "f4[:,::1](i4[::1],i4[::1],f4[:,:],f4[:,::1],f4[:,::1],f4)", 24 | ], 25 | locals={ 26 | "k": numba.types.intp, 27 | "w": numba.types.uint32, 28 | "d": numba.types.uint32, 29 | "z": numba.types.uint16, 30 | "v": numba.types.float32, 31 | "nz_idx": numba.types.uint32, 32 | "norm": numba.types.float32, 33 | }, 34 | fastmath=True, 35 | nogil=True, 36 | ) 37 | def plsa_e_step_on_a_block( 38 | block_rows, 39 | block_cols, 40 | p_w_given_z_block, 41 | p_z_given_d_block, 42 | p_z_given_wd_block, 43 | probability_threshold=1e-32, 44 | ): 45 | k = p_w_given_z_block.shape[0] 46 | 47 | for nz_idx in range(block_rows.shape[0]): 48 | if block_rows[nz_idx] < 0: 49 | break 50 | 51 | d = block_rows[nz_idx] 52 | w = block_cols[nz_idx] 53 | 54 | norm = 0.0 55 | for z in range(k): 56 | v = p_w_given_z_block[z, w] * p_z_given_d_block[d, z] 57 | if v > probability_threshold: 58 | p_z_given_wd_block[nz_idx, z] = v 59 | norm += v 60 | else: 61 | p_z_given_wd_block[nz_idx, z] = 0.0 62 | for z in range(k): 63 | if norm > 0: 64 | p_z_given_wd_block[nz_idx, z] /= norm 65 | 66 | return p_z_given_wd_block 67 | 68 | 69 | @numba.njit( 70 | [ 71 | "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1])", 72 | "void(i4[::1],i4[::1],f4[::1],f4[:,:],f4[:,:],f4[:,::1],f4[::1],f4[::1])", 73 | ], 74 | locals={ 75 | "k": numba.types.intp, 76 | "w": numba.types.uint32, 77 | "d": numba.types.uint32, 78 | "x": numba.types.float32, 79 | "z": numba.types.uint16, 80 | "nz_idx": numba.types.uint32, 81 | "s": numba.types.float32, 82 | }, 83 | fastmath=True, 84 | nogil=True, 85 | ) 86 | def plsa_partial_m_step_on_a_block( 87 | block_rows, 88 | block_cols, 89 | block_vals, 90 | p_w_given_z_block, 91 | p_z_given_d_block, 92 | p_z_given_wd_block, 93 | norm_pwz, 94 | norm_pdz_block, 95 | ): 96 | k = p_w_given_z_block.shape[0] 97 | 98 | for nz_idx in range(block_rows.shape[0]): 99 | if block_rows[nz_idx] < 0: 100 | break 101 | 102 | d = block_rows[nz_idx] 103 | w = block_cols[nz_idx] 104 | x = block_vals[nz_idx] 105 | 106 | for z in range(k): 107 | s = x * p_z_given_wd_block[nz_idx, z] 108 | 109 | p_w_given_z_block[z, w] += s 110 | p_z_given_d_block[d, z] += s 111 | 112 | norm_pwz[z] += s 113 | norm_pdz_block[d] += s 114 | 115 
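# The two kernels above each operate on a single (document-block, word-block) tile
# of the sparse matrix; entries padded with a -1 row index mark the end of a tile.
#   * plsa_e_step_on_a_block: for every stored non-zero (d, w) it computes
#     P(z|w,d) proportional to P(w|z) * P(z|d), writing exact zeros for values
#     below the threshold and normalising over the topics z.
#   * plsa_partial_m_step_on_a_block: accumulates X_{d,w} * P(z|w,d) into the
#     blocked copies of P(w|z) and P(z|d), together with the normalising sums
#     norm_pwz and norm_pdz, which are combined and applied once every tile has
#     been processed (see plsa_em_step_by_blocks below).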
| 116 | @numba.njit( 117 | "void(i4[:,:,::1],i4[:,:,::1],f4[:,:,::1],f4[:,:,::1],f4[:,:,::1],f4[:,:,:,::1]," 118 | "f4[:,:,:,::1],f4[:,:,:,::1],f4[:,::1],f4[:,:,::1],f4)", 119 | locals={ 120 | "k": numba.types.intp, 121 | "z": numba.types.uint16, 122 | "d": numba.types.uint32, 123 | "i": numba.types.uint16, 124 | "j": numba.types.uint16, 125 | "n_w_blocks": numba.types.intp, 126 | "n_d_blocks": numba.types.intp, 127 | }, 128 | parallel=True, 129 | fastmath=True, 130 | nogil=True, 131 | ) 132 | def plsa_em_step_by_blocks( 133 | block_rows_ndarray, 134 | block_cols_ndarray, 135 | block_vals_ndarray, 136 | prev_p_w_given_z, 137 | prev_p_z_given_d, 138 | blocked_next_p_w_given_z, 139 | blocked_next_p_z_given_d, 140 | p_z_given_wd_block, 141 | blocked_norm_pwz, 142 | blocked_norm_pdz, 143 | e_step_thresh=1e-32, 144 | ): 145 | n_d_blocks = block_rows_ndarray.shape[0] 146 | n_w_blocks = block_rows_ndarray.shape[1] 147 | 148 | # n = prev_p_z_given_d.shape[0] 149 | # m = prev_p_w_given_z.shape[1] 150 | k = prev_p_z_given_d.shape[2] 151 | 152 | # zero out the norms for recomputation 153 | blocked_norm_pdz[:] = 0.0 154 | blocked_norm_pwz[:] = 0.0 155 | 156 | for i in numba.prange(n_d_blocks): 157 | 158 | for j in numba.prange(n_w_blocks): 159 | block_rows = block_rows_ndarray[i, j] 160 | block_cols = block_cols_ndarray[i, j] 161 | block_vals = block_vals_ndarray[i, j] 162 | 163 | plsa_e_step_on_a_block( 164 | block_rows, 165 | block_cols, 166 | prev_p_w_given_z[j], 167 | prev_p_z_given_d[i], 168 | p_z_given_wd_block[i, j], 169 | np.float32(e_step_thresh), 170 | ) 171 | plsa_partial_m_step_on_a_block( 172 | block_rows, 173 | block_cols, 174 | block_vals, 175 | blocked_next_p_w_given_z[i, j], 176 | blocked_next_p_z_given_d[j, i], 177 | p_z_given_wd_block[i, j], 178 | blocked_norm_pwz[i], 179 | blocked_norm_pdz[j, i], 180 | ) 181 | 182 | prev_p_z_given_d[:] = blocked_next_p_z_given_d.sum(axis=0) 183 | norm_pdz = blocked_norm_pdz.sum(axis=0) 184 | prev_p_w_given_z[:] = blocked_next_p_w_given_z.sum(axis=0) 185 | norm_pwz = blocked_norm_pwz.sum(axis=0) 186 | 187 | # Once complete we can normalize to complete the M step 188 | for z in numba.prange(k): 189 | if norm_pwz[z] > 0: 190 | for w_block in range(prev_p_w_given_z.shape[0]): 191 | for w_offset in range(prev_p_w_given_z.shape[2]): 192 | prev_p_w_given_z[w_block, z, w_offset] /= norm_pwz[z] 193 | for d_block in range(prev_p_z_given_d.shape[0]): 194 | for d_offset in range(prev_p_z_given_d.shape[1]): 195 | if norm_pdz[d_block, d_offset] > 0: 196 | prev_p_z_given_d[d_block, d_offset, z] /= norm_pdz[ 197 | d_block, d_offset 198 | ] 199 | 200 | # Zero out the old matrices these matrices for next time 201 | blocked_next_p_z_given_d[:] = 0.0 202 | blocked_next_p_w_given_z[:] = 0.0 203 | 204 | 205 | @numba.njit( 206 | locals={ 207 | "i": numba.types.uint16, 208 | "j": numba.types.uint16, 209 | "k": numba.types.intp, 210 | "w": numba.types.uint32, 211 | "d": numba.types.uint32, 212 | "z": numba.types.uint16, 213 | "nz_idx": numba.types.uint32, 214 | "x": numba.types.float32, 215 | "result": numba.types.float32, 216 | "p_w_given_d": numba.types.float32, 217 | }, 218 | fastmath=True, 219 | nogil=True, 220 | parallel=True, 221 | ) 222 | def log_likelihood_by_blocks( 223 | block_rows_ndarray, 224 | block_cols_ndarray, 225 | block_vals_ndarray, 226 | p_w_given_z, 227 | p_z_given_d, 228 | ): 229 | result = 0.0 230 | k = p_z_given_d.shape[2] 231 | 232 | for i in numba.prange(block_rows_ndarray.shape[0]): 233 | for j in range(block_rows_ndarray.shape[1]): 234 | for 
nz_idx in range(block_rows_ndarray.shape[2]): 235 | if block_rows_ndarray[i, j, nz_idx] < 0: 236 | break 237 | 238 | d = block_rows_ndarray[i, j, nz_idx] 239 | w = block_cols_ndarray[i, j, nz_idx] 240 | x = block_vals_ndarray[i, j, nz_idx] 241 | 242 | p_w_given_d = 0.0 243 | for z in range(k): 244 | p_w_given_d += p_w_given_z[j, z, w] * p_z_given_d[i, d, z] 245 | 246 | result += x * np.log(p_w_given_d) 247 | 248 | return result 249 | 250 | 251 | @numba.njit(fastmath=True, nogil=True) 252 | def plsa_fit_inner_blockwise( 253 | block_rows_ndarray, 254 | block_cols_ndarray, 255 | block_vals_ndarray, 256 | p_w_given_z, 257 | p_z_given_d, 258 | block_row_size, 259 | block_col_size, 260 | n_iter=100, 261 | n_iter_per_test=10, 262 | tolerance=0.001, 263 | e_step_thresh=1e-32, 264 | ): 265 | k = p_z_given_d.shape[2] 266 | 267 | n_d_blocks = block_rows_ndarray.shape[0] 268 | n_w_blocks = block_rows_ndarray.shape[1] 269 | block_size = block_rows_ndarray.shape[2] 270 | 271 | p_z_given_wd_block = np.zeros( 272 | (n_d_blocks, n_w_blocks, block_size, k), dtype=np.float32 273 | ) 274 | 275 | blocked_next_p_w_given_z = np.zeros( 276 | ( 277 | np.int64(n_d_blocks), 278 | np.int64(n_w_blocks), 279 | np.int64(k), 280 | np.int64(block_col_size), 281 | ), 282 | dtype=np.float32, 283 | ) 284 | blocked_norm_pwz = np.zeros((n_d_blocks, k), dtype=np.float32) 285 | blocked_next_p_z_given_d = np.zeros( 286 | ( 287 | np.int64(n_w_blocks), 288 | np.int64(n_d_blocks), 289 | np.int64(block_row_size), 290 | np.int64(k), 291 | ), 292 | dtype=np.float32, 293 | ) 294 | blocked_norm_pdz = np.zeros( 295 | (np.int64(n_w_blocks), np.int64(n_d_blocks), np.int64(block_row_size)), 296 | dtype=np.float32, 297 | ) 298 | 299 | previous_log_likelihood = log_likelihood_by_blocks( 300 | block_rows_ndarray, 301 | block_cols_ndarray, 302 | block_vals_ndarray, 303 | p_w_given_z, 304 | p_z_given_d, 305 | ) 306 | 307 | for i in range(n_iter): 308 | plsa_em_step_by_blocks( 309 | block_rows_ndarray, 310 | block_cols_ndarray, 311 | block_vals_ndarray, 312 | p_w_given_z, 313 | p_z_given_d, 314 | blocked_next_p_w_given_z, 315 | blocked_next_p_z_given_d, 316 | p_z_given_wd_block, 317 | blocked_norm_pwz, 318 | blocked_norm_pdz, 319 | e_step_thresh, 320 | ) 321 | 322 | if i % n_iter_per_test == 0: 323 | current_log_likelihood = log_likelihood_by_blocks( 324 | block_rows_ndarray, 325 | block_cols_ndarray, 326 | block_vals_ndarray, 327 | p_w_given_z, 328 | p_z_given_d, 329 | ) 330 | change = np.abs(current_log_likelihood - previous_log_likelihood) 331 | if change / np.abs(current_log_likelihood) < tolerance: 332 | break 333 | else: 334 | previous_log_likelihood = current_log_likelihood 335 | 336 | return p_z_given_d, p_w_given_z 337 | 338 | 339 | def plsa_fit( 340 | X, 341 | k, 342 | n_row_blocks=8, 343 | n_col_blocks=8, 344 | init="random", 345 | n_iter=100, 346 | n_iter_per_test=10, 347 | tolerance=0.001, 348 | e_step_thresh=1e-32, 349 | random_state=None, 350 | ): 351 | rng = check_random_state(random_state) 352 | p_z_given_d_init, p_w_given_z_init = plsa_init(X, k, init=init, rng=rng) 353 | 354 | A = X.tocsr().astype(np.float32) 355 | 356 | n = A.shape[0] 357 | m = A.shape[1] 358 | 359 | block_row_size = np.uint16(np.ceil(A.shape[0] / n_row_blocks)) 360 | block_col_size = np.uint16(np.ceil(A.shape[1] / n_col_blocks)) 361 | 362 | p_z_given_d = np.zeros((block_row_size * n_row_blocks, k), dtype=np.float32) 363 | p_z_given_d[: p_z_given_d_init.shape[0]] = p_z_given_d_init 364 | p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k) 365 | 
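    # The initial P(z|d) has now been zero-padded to a multiple of the block size and
    # reshaped so that document-block i lives in p_z_given_d[i]. The same padding and
    # blocking is applied column-wise to P(w|z) below, and each COO block is padded
    # with -1 row indices so the numba kernels know where its non-zeros end.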
366 | p_w_given_z = np.zeros((k, block_col_size * n_col_blocks), dtype=np.float32) 367 | p_w_given_z[:, : p_w_given_z_init.shape[1]] = p_w_given_z_init 368 | # p_w_given_z = np.transpose( 369 | # p_w_given_z.T.reshape(n_col_blocks, block_col_size, k), axes=[0, 2, 1] 370 | # ).astype(np.float32, order="C") 371 | p_w_given_z = np.stack(np.hsplit(p_w_given_z, n_col_blocks)) 372 | 373 | A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)] 374 | max_nnz_per_block = 0 375 | for i in range(n_row_blocks): 376 | 377 | row_start = block_row_size * i 378 | row_end = min(row_start + block_row_size, n) 379 | 380 | for j in range(n_col_blocks): 381 | 382 | col_start = block_col_size * j 383 | col_end = min(col_start + block_col_size, m) 384 | 385 | A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo() 386 | if A_blocks[i][j].nnz > max_nnz_per_block: 387 | max_nnz_per_block = A_blocks[i][j].nnz 388 | 389 | block_rows_ndarray = np.full( 390 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 391 | ) 392 | block_cols_ndarray = np.full( 393 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 394 | ) 395 | block_vals_ndarray = np.zeros( 396 | (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32 397 | ) 398 | for i in range(n_row_blocks): 399 | for j in range(n_col_blocks): 400 | nnz = A_blocks[i][j].nnz 401 | block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row 402 | block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col 403 | block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data 404 | 405 | p_z_given_d, p_w_given_z = plsa_fit_inner_blockwise( 406 | block_rows_ndarray, 407 | block_cols_ndarray, 408 | block_vals_ndarray, 409 | p_w_given_z, 410 | p_z_given_d, 411 | block_row_size, 412 | block_col_size, 413 | n_iter=n_iter, 414 | n_iter_per_test=n_iter_per_test, 415 | tolerance=tolerance, 416 | e_step_thresh=e_step_thresh, 417 | ) 418 | p_z_given_d = np.vstack(p_z_given_d)[:n, :] 419 | p_w_given_z = np.hstack(p_w_given_z)[:, :m] 420 | 421 | return p_z_given_d, p_w_given_z 422 | 423 | 424 | class BlockParallelPLSA(BaseEstimator, TransformerMixin): 425 | def __init__( 426 | self, 427 | n_components=10, 428 | init="random", 429 | n_row_blocks=8, 430 | n_col_blocks=8, 431 | n_iter=100, 432 | n_iter_per_test=10, 433 | tolerance=0.001, 434 | e_step_thresh=1e-32, 435 | transform_random_seed=42, 436 | random_state=None, 437 | ): 438 | 439 | self.n_components = n_components 440 | self.init = init 441 | self.n_row_blocks = n_row_blocks 442 | self.n_col_blocks = n_col_blocks 443 | self.n_iter = n_iter 444 | self.n_iter_per_test = n_iter_per_test 445 | self.tolerance = tolerance 446 | self.e_step_thresh = e_step_thresh 447 | self.transform_random_seed = transform_random_seed 448 | self.random_state = random_state 449 | 450 | def fit(self, X, y=None, sample_weight=None): 451 | """Learn the pLSA model for the data X and return the document vectors. 452 | 453 | This is more efficient than calling fit followed by transform. 454 | 455 | Parameters 456 | ---------- 457 | X: array or sparse matrix of shape (n_docs, n_words) 458 | The data matrix pLSA is attempting to fit to. 459 | 460 | y: Ignored 461 | 462 | sample_weight: array of shape (n_docs,) 463 | Input document weights. 464 | 465 | Returns 466 | ------- 467 | self 468 | """ 469 | self.fit_transform(X, sample_weight=sample_weight) 470 | return self 471 | 472 | def fit_transform(self, X, y=None, sample_weight=None): 473 | """Learn the pLSA model for the data X and return the document vectors. 
474 | 475 | This is more efficient than calling fit followed by transform. 476 | 477 | Parameters 478 | ---------- 479 | X: array or sparse matrix of shape (n_docs, n_words) 480 | The data matrix pLSA is attempting to fit to. 481 | 482 | y: Ignored 483 | 484 | sample_weight: array of shape (n_docs,) 485 | Input document weights. 486 | 487 | Returns 488 | ------- 489 | embedding: array of shape (n_docs, n_topics) 490 | An embedding of the documents into a topic space. 491 | """ 492 | 493 | X = check_array(X, accept_sparse="csr") 494 | X = standardize_input(X) 495 | 496 | if not issparse(X): 497 | X = csr_matrix(X) 498 | 499 | sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 500 | 501 | if np.any(X.data < 0): 502 | raise ValueError( 503 | "PLSA is only valid for matrices with non-negative " "entries" 504 | ) 505 | 506 | row_sums = np.array(X.sum(axis=1).T)[0] 507 | good_rows = row_sums != 0 508 | 509 | if not np.all(good_rows): 510 | zero_rows_found = True 511 | data_for_fitting = X[good_rows] 512 | else: 513 | zero_rows_found = False 514 | data_for_fitting = X 515 | 516 | U, V = plsa_fit( 517 | data_for_fitting, 518 | self.n_components, 519 | n_row_blocks=self.n_row_blocks, 520 | n_col_blocks=self.n_col_blocks, 521 | init=self.init, 522 | n_iter=self.n_iter, 523 | n_iter_per_test=self.n_iter_per_test, 524 | tolerance=self.tolerance, 525 | e_step_thresh=self.e_step_thresh, 526 | random_state=self.random_state, 527 | ) 528 | 529 | if zero_rows_found: 530 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 531 | self.embedding_[good_rows] = U 532 | else: 533 | self.embedding_ = U 534 | 535 | self.components_ = V 536 | self.training_data_ = X 537 | 538 | return self.embedding_ 539 | -------------------------------------------------------------------------------- /enstop/enstop_.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | import numba.cuda 4 | from warnings import warn 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.utils import check_array, check_random_state 7 | try: 8 | from sklearn.utils.validation import _check_sample_weight 9 | except ImportError: 10 | from enstop.utils import _check_sample_weight 11 | from sklearn.decomposition import NMF, non_negative_factorization 12 | from scipy.sparse import issparse, csr_matrix, coo_matrix 13 | import dask 14 | 15 | try: 16 | import joblib 17 | _HAVE_JOBLIB = True 18 | except ImportError: 19 | warn("Joblib could not be loaded; joblib parallelism will not be available") 20 | _HAVE_JOBLIB = False 21 | from hdbscan._hdbscan_linkage import mst_linkage_core, label 22 | from hdbscan.hdbscan_ import _tree_to_labels 23 | import hdbscan 24 | import umap 25 | 26 | # TODO: Once umap 0.4 is released enable this... 
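# As the TODO above notes, umap.distances.hellinger requires umap-learn >= 0.4;
# the commented-out numba implementation below can serve as a fallback for older
# versions.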
27 | from umap.distances import hellinger 28 | 29 | 30 | # @numba.njit() 31 | # def hellinger(x, y): 32 | # result = 0.0 33 | # l1_norm_x = 0.0 34 | # l1_norm_y = 0.0 35 | # 36 | # for i in range(x.shape[0]): 37 | # result += np.sqrt(x[i] * y[i]) 38 | # l1_norm_x += x[i] 39 | # l1_norm_y += y[i] 40 | # 41 | # if l1_norm_x == 0 and l1_norm_y == 0: 42 | # return 0.0 43 | # elif l1_norm_x == 0 or l1_norm_y == 0: 44 | # return 1.0 45 | # else: 46 | # return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y)) 47 | 48 | 49 | from enstop.utils import normalize, coherence, mean_coherence, log_lift, mean_log_lift 50 | from enstop.plsa import plsa_fit, plsa_refit 51 | 52 | if numba.cuda.is_available(): 53 | from enstop.cuda_plsa import plsa_fit as gpu_plsa_fit 54 | 55 | 56 | def plsa_topics(X, k, **kwargs): 57 | """Perform a boostrap sample from a corpus of documents and fit the sample using 58 | pLSA to give a set of topic vectors such that the (z,w) entry of the returned 59 | array is the probability P(w|z) of word w occuring given the zth topic. 60 | 61 | Parameters 62 | ---------- 63 | X: sparse matrix of shape (n_docs, n_words) 64 | The bag of words representation of the corpus of documents. 65 | 66 | k: int 67 | The number of topics to generate. 68 | 69 | kwargs: 70 | Further keyword arguments that can be passed on th the ``plsa_fit`` function. 71 | Possibilities include: 72 | * ``init`` 73 | * ``n_iter`` 74 | * ``n_iter_per_test`` 75 | * ``tolerance`` 76 | * ``e_step_threshold`` 77 | * ``random_state`` 78 | 79 | Returns 80 | ------- 81 | topics: array of shape (k, n_words) 82 | The topics generated from the bootstrap sample. 83 | """ 84 | A = X.tocsr() 85 | if kwargs.get("bootstrap", True): 86 | rng = check_random_state(kwargs.get("random_state", None)) 87 | bootstrap_sample_indices = rng.randint(0, A.shape[0], size=A.shape[0]) 88 | B = A[bootstrap_sample_indices] 89 | else: 90 | B = A 91 | sample_weight = _check_sample_weight(None, B, dtype=np.float32) 92 | if numba.cuda.is_available(): 93 | doc_topic, topic_vocab = gpu_plsa_fit( 94 | B, 95 | k, 96 | init=kwargs.get("init", "random"), 97 | n_iter=kwargs.get("n_iter", 100), 98 | n_iter_per_test=kwargs.get("n_iter_per_test", 10), 99 | tolerance=kwargs.get("tolerance", 0.001), 100 | e_step_thresh=kwargs.get("e_step_thresh", 1e-16), 101 | random_state=kwargs.get("random_state", None), 102 | ) 103 | else: 104 | doc_topic, topic_vocab = plsa_fit( 105 | B, 106 | k, 107 | sample_weight, 108 | init=kwargs.get("init", "random"), 109 | n_iter=kwargs.get("n_iter", 100), 110 | n_iter_per_test=kwargs.get("n_iter_per_test", 10), 111 | tolerance=kwargs.get("tolerance", 0.001), 112 | e_step_thresh=kwargs.get("e_step_thresh", 1e-16), 113 | random_state=kwargs.get("random_state", None), 114 | ) 115 | return topic_vocab 116 | 117 | 118 | def nmf_topics(X, k, **kwargs): 119 | """Perform a boostrap sample from a corpus of documents and fit the sample using 120 | NMF to give a set of topic vectors, normalized such that the(z,w) entry of the 121 | returned array is the probability P(w|z) of word w occuring given the zth topic. 122 | 123 | Parameters 124 | ---------- 125 | X: sparse matrix of shape (n_docs, n_words) 126 | The bag of words representation of the corpus of documents. 127 | 128 | k: int 129 | The number of topics to generate. 130 | 131 | kwargs: 132 | Further keyword arguments that can be passed on th the ``NMF`` class. 
133 | Possibilities include: 134 | * ``init`` 135 | * ``beta_loss`` 136 | * ``alpha`` 137 | * ``solver`` 138 | 139 | Returns 140 | ------- 141 | topics: array of shape (k, n_words) 142 | The topics generated from the bootstrap sample. 143 | """ 144 | A = X.tocsr() 145 | if kwargs.get("bootstrap", True): 146 | rng = check_random_state(kwargs.get("random_state", None)) 147 | bootstrap_sample_indices = rng.randint(0, A.shape[0], size=A.shape[0]) 148 | B = A[bootstrap_sample_indices] 149 | else: 150 | B = A 151 | nmf = NMF( 152 | n_components=k, 153 | init=kwargs.get("init", "nndsvd"), 154 | beta_loss=kwargs.get("beta_loss", 1), 155 | alpha=kwargs.get("alpha", 0.0), 156 | solver=kwargs.get("solver", "mu"), 157 | random_state=kwargs.get("random_state", None), 158 | ).fit(B) 159 | topics = nmf.components_.copy() 160 | normalize(topics, axis=1) 161 | return topics 162 | 163 | 164 | def ensemble_of_topics( 165 | X, k, model="plsa", n_jobs=4, n_runs=16, parallelism="dask", **kwargs 166 | ): 167 | """Generate a large number of topic vectors by running an ensemble of 168 | bootstrap samples of a given corpus. Exploit the embarrassingly parallel nature of the problem 169 | using wither joblib or dask. Support for both pLSA and NMF approaches to topic generation are 170 | available. The sklearn implementation of NMF is used for NMF modeling. 171 | 172 | Parameters 173 | ---------- 174 | X: sparse matrix of shape (n_docs, n_words) 175 | The bag-of-words matrix for the corpus to train on 176 | 177 | k: int 178 | The number of topics to generate per bootstrap sampled run. 179 | 180 | model: string (optional, default="plsa") 181 | The topic modeling method to use (either "plsa" or "nmf") 182 | 183 | n_jobs: int (optional, default=4) 184 | The number of jobs to run in parallel. 185 | 186 | n_runs: int (optional, default=16) 187 | The number of bootstrapped sampled runs to use for topic generation. 188 | 189 | parallelism: string (optional, default="dask") 190 | The parallelism model to use. Should be one of "dask" or "joblib". 191 | 192 | kwargs: 193 | Extra keyword based arguments to pass on to the pLSA or NMF models. 194 | 195 | Returns 196 | ------- 197 | topics: array of shape (n_runs * k, n_words) 198 | The full set of all topics generated by all the topic modeling runs. 
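
    Examples
    --------
    A minimal usage sketch; the 20-newsgroups fetch is purely illustrative (it
    mirrors the example notebook shipped with this repository), and in most
    cases ``ensemble_fit`` or ``EnsembleTopics`` will call this function for you:

    >>> from sklearn.datasets import fetch_20newsgroups
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> news = fetch_20newsgroups(subset='all')
    >>> X = CountVectorizer(min_df=5, stop_words='english').fit_transform(news.data)
    >>> all_topics = ensemble_of_topics(X.tocoo(), 20, model="plsa",
    ...                                 n_jobs=2, n_runs=8, parallelism="dask")
    >>> all_topics.shape[0]  # n_runs * k topic vectors, one block of k per run
    160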
199 | 200 | """ 201 | 202 | if model == "plsa": 203 | create_topics = plsa_topics 204 | elif model == "nmf": 205 | create_topics = nmf_topics 206 | else: 207 | raise ValueError('Model must be one of "plsa" or "nmf"') 208 | 209 | if parallelism == "dask": 210 | dask_topics = dask.delayed(create_topics) 211 | staged_topics = [dask_topics(X, k, **kwargs) for i in range(n_runs)] 212 | topics = dask.compute(*staged_topics, scheduler="threads", num_workers=n_jobs) 213 | elif parallelism == "joblib" and _HAVE_JOBLIB: 214 | joblib_topics = joblib.delayed(create_topics) 215 | topics = joblib.Parallel(n_jobs=n_jobs, prefer="threads")( 216 | joblib_topics(X, k, **kwargs) for i in range(n_runs) 217 | ) 218 | elif parallelism == "joblib" and not _HAVE_JOBLIB: 219 | raise ValueError("Joblib was not correctly imported and is unavailable") 220 | elif parallelism == "none": 221 | topics = [] 222 | for i in range(n_runs): 223 | topics.append(create_topics(X, k, **kwargs)) 224 | else: 225 | raise ValueError( 226 | "Unrecognized parallelism {}; should be one of {}".format( 227 | parallelism, ("dask", "joblib") 228 | ) 229 | ) 230 | 231 | return np.vstack(topics) 232 | 233 | 234 | @numba.njit(fastmath=True, nogil=True) 235 | def kl_divergence(a, b): 236 | """Compute the KL-divergence between two multinomial distributions.""" 237 | result = 0.0 238 | for i in range(a.shape[0]): 239 | if a[i] > 0.0 and b[i] > 0.0: 240 | result += a[i] * (np.log2(a[i]) - np.log2(b[i])) 241 | return result 242 | 243 | 244 | @numba.njit(fastmath=True, parallel=True) 245 | def all_pairs_kl_divergence(distributions): 246 | """Compute all pairwise KL-divergences between a set of multinomial distributions.""" 247 | n = distributions.shape[0] 248 | result = np.zeros((n, n)) 249 | for i in range(n): 250 | for j in range(n): 251 | result[i, j] = kl_divergence(distributions[i], distributions[j]) 252 | return result 253 | 254 | 255 | @numba.njit(fastmath=True, parallel=True) 256 | def all_pairs_hellinger_distance(distributions): 257 | """Compute all pairwise Hellinger distances between a set of multinomial distributions.""" 258 | n = distributions.shape[0] 259 | result = np.zeros((n, n)) 260 | for i in range(n): 261 | for j in range(n): 262 | result[i, j] = hellinger(distributions[i], distributions[j]) 263 | return result 264 | 265 | 266 | def generate_combined_topics_kl(all_topics, min_samples=5, min_cluster_size=5): 267 | """Given a large list of topics select out a small list of stable topics 268 | by clustering the topics with HDBSCAN using KL-divergence as a distance 269 | measure between topics. 270 | 271 | 272 | Parameters 273 | ---------- 274 | all_topics: array of shape (N, n_words) 275 | The set of topics to be clustered. 276 | 277 | min_samples: int (optional, default=5) 278 | The min_samples parameter to use for HDBSCAN clustering. 279 | 280 | min_cluster_size: int (optional, default=5) 281 | The min_cluster_size parameter to use for HDBSCAN clustering 282 | 283 | Returns 284 | ------- 285 | stable_topics: array of shape (M, n_words) 286 | A set of M topics, one for each cluster found by HDBSCAN. 
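
    Notes
    -----
    Because KL-divergence is neither symmetric nor a metric, HDBSCAN is not
    applied to it directly. Instead a mutual-reachability matrix is built by
    taking, for each pair of topics, the maximum of the two directed divergences
    and of each topic's core divergence (its divergence to its
    ``min_samples``-th nearest neighbour), and clusters are extracted from the
    single-linkage tree of that matrix using leaf cluster selection. Each
    cluster is then summarised by averaging its member topics in square-root
    space, squaring, and renormalising.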
287 | """ 288 | divergence_matrix = all_pairs_kl_divergence(all_topics) 289 | core_divergences = np.sort(divergence_matrix, axis=1)[:, min_samples] 290 | tiled_core_divergences = np.tile(core_divergences, (core_divergences.shape[0], 1)) 291 | mutual_reachability = np.dstack( 292 | [ 293 | divergence_matrix, 294 | divergence_matrix.T, 295 | tiled_core_divergences, 296 | tiled_core_divergences.T, 297 | ] 298 | ).max(axis=-1) 299 | mst_data = mst_linkage_core(mutual_reachability) 300 | mst_order = np.argsort(mst_data.T[2]) 301 | mst_data = mst_data[mst_order] 302 | single_linkage_tree = label(mst_data) 303 | labels, probs, stabs, ctree, stree = _tree_to_labels( 304 | all_topics, 305 | single_linkage_tree, 306 | min_cluster_size=min_cluster_size, 307 | cluster_selection_method="leaf", 308 | ) 309 | result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32) 310 | for i in range(labels.max() + 1): 311 | result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2 312 | result[i] /= result[i].sum() 313 | 314 | return result 315 | 316 | 317 | def generate_combined_topics_hellinger(all_topics, min_samples=5, min_cluster_size=5): 318 | """Given a large list of topics select out a small list of stable topics 319 | by clustering the topics with HDBSCAN using Hellinger as a distance 320 | measure between topics. 321 | 322 | 323 | Parameters 324 | ---------- 325 | all_topics: array of shape (N, n_words) 326 | The set of topics to be clustered. 327 | 328 | min_samples: int (optional, default=5) 329 | The min_samples parameter to use for HDBSCAN clustering. 330 | 331 | min_cluster_size: int (optional, default=5) 332 | The min_cluster_size parameter to use for HDBSCAN clustering 333 | 334 | Returns 335 | ------- 336 | stable_topics: array of shape (M, n_words) 337 | A set of M topics, one for each cluster found by HDBSCAN. 338 | """ 339 | distance_matrix = all_pairs_hellinger_distance(all_topics) 340 | labels = hdbscan.HDBSCAN( 341 | min_samples=min_samples, 342 | min_cluster_size=min_cluster_size, 343 | metric="precomputed", 344 | cluster_selection_method="leaf", 345 | ).fit_predict(distance_matrix) 346 | result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32) 347 | for i in range(labels.max() + 1): 348 | result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2 349 | result[i] /= result[i].sum() 350 | 351 | return result 352 | 353 | 354 | def generate_combined_topics_hellinger_umap( 355 | all_topics, min_samples=5, min_cluster_size=5, n_neighbors=15, reduced_dim=5 356 | ): 357 | """Given a large list of topics select out a small list of stable topics 358 | by mapping the topics to a low dimensional space with UMAP (using 359 | Hellinger distance) and then clustering the topics with HDBSCAN using 360 | Euclidean distance in the embedding space to measure distance between topics. 361 | 362 | 363 | Parameters 364 | ---------- 365 | all_topics: array of shape (N, n_words) 366 | The set of topics to be clustered. 367 | 368 | min_samples: int (optional, default=5) 369 | The min_samples parameter to use for HDBSCAN clustering. 370 | 371 | min_cluster_size: int (optional, default=5) 372 | The min_cluster_size parameter to use for HDBSCAN clustering 373 | 374 | n_neighbors: int (optional, default=15) 375 | The n_neighbors value to use with UMAP. 376 | 377 | reduced_dim: int (optional, default=5) 378 | The dimension of the embedding space to use. 
379 | 380 | Returns 381 | ------- 382 | stable_topics: array of shape (M, n_words) 383 | A set of M topics, one for each cluster found by HDBSCAN. 384 | """ 385 | embedding = umap.UMAP( 386 | n_neighbors=n_neighbors, n_components=reduced_dim, metric=hellinger 387 | ).fit_transform(all_topics) 388 | clusterer = hdbscan.HDBSCAN( 389 | min_samples=min_samples, 390 | min_cluster_size=min_cluster_size, 391 | cluster_selection_method="leaf", 392 | allow_single_cluster=True, 393 | ).fit(embedding) 394 | labels = clusterer.labels_ 395 | membership_strengths = clusterer.probabilities_ 396 | result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32) 397 | for i in range(labels.max() + 1): 398 | mask = labels == i 399 | result[i] = ( 400 | np.average( 401 | np.sqrt(all_topics[mask]), axis=0, weights=membership_strengths[mask] 402 | ) 403 | ** 2 404 | ) 405 | result[i] /= result[i].sum() 406 | 407 | return result 408 | 409 | 410 | _topic_combiner = { 411 | "kl_divergence": generate_combined_topics_kl, 412 | "hellinger": generate_combined_topics_hellinger, 413 | "hellinger_umap": generate_combined_topics_hellinger_umap, 414 | } 415 | 416 | 417 | def ensemble_fit( 418 | X, 419 | estimated_n_topics=10, 420 | model="plsa", 421 | init="random", 422 | min_samples=3, 423 | min_cluster_size=4, 424 | n_starts=16, 425 | n_jobs=1, 426 | parallelism="dask", 427 | topic_combination="hellinger_umap", 428 | bootstrap=True, 429 | n_iter=100, 430 | n_iter_per_test=10, 431 | tolerance=0.001, 432 | e_step_thresh=1e-16, 433 | lift_factor=1, 434 | beta_loss=1, 435 | alpha=0.0, 436 | solver="mu", 437 | random_state=None, 438 | ): 439 | """Generate a set of stable topics by using an ensemble of topic models and then clustering 440 | the results and generating representative topics for each cluster. The generate a set of 441 | document vectors based on the selected stable topics. 442 | 443 | Parameters 444 | ---------- 445 | X: array or sparse matrix of shape (n_docs, n_words) 446 | The bag-of-words matrix for the corpus to train on. 447 | 448 | estimated_n_topics: int (optional, default=10) 449 | The estimated number of topics. Note that the final number of topics produced can differ 450 | from this value, and may be more or less than the provided value. Instead this value 451 | provides the algorithm with a suggestion of the approximate number of topics to use. 452 | 453 | model: string (optional, default="plsa") 454 | The topic modeling method to use (either "plsa" or "nmf") 455 | 456 | init: string or tuple (optional, default="random") 457 | The intialization method to use. This should be one of: 458 | * ``"random"`` 459 | * ``"nndsvd"`` 460 | * ``"nmf"`` 461 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 462 | 463 | int (optional, default=3) 464 | The min_samples parameter to use for HDBSCAN clustering. 465 | 466 | min_cluster_size: int (optional, default=4) 467 | The min_cluster_size parameter to use for HDBSCAN clustering 468 | 469 | n_starts: int (optional, default=16) 470 | The number of bootstrap sampled topic models to run -- the size of the ensemble. 471 | 472 | n_jobs: int (optional, default=8) 473 | The number of parallel jobs to run at a time. 474 | 475 | parallelism: string (optional, default="dask") 476 | The parallelism model to use. Should be one of "dask" or "joblib" or "none". 477 | 478 | topic_combination: string (optional, default="hellinger_umap") 479 | The method of comnining ensemble topics into a set of stable topics. 
Should be one of: 480 | * ``"hellinger_umap"`` 481 | * ``"hellinger"`` 482 | * ``"kl_divergence"`` 483 | 484 | n_iter: int 485 | The maximum number iterations of EM to perform 486 | 487 | n_iter_per_test: int 488 | The number of iterations between tests for 489 | relative improvement in log-likelihood. 490 | 491 | tolerance: float 492 | The threshold of relative improvement in 493 | log-likelihood required to continue iterations. 494 | 495 | e_step_thresh: float (optional, default=1e-32) 496 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 497 | below threshold then write a zero for P(z|w,d). 498 | 499 | lift_factor: int (optional, default=1) 500 | Importance factor to apply to lift -- if high lift value are important to 501 | you then larger lift factors will be beneficial. 502 | 503 | beta_loss: float or string, (optional, default 'kullback-leibler') 504 | The beta loss to use if using NMF for topic modeling. 505 | 506 | alpha: float (optional, default=0.0) 507 | The alpha parameter defining regularization if using NMF for topic modeling. 508 | 509 | solver: string, (optional, default="mu") 510 | The choice of solver if using NMF for topic modeling. Should be either "cd" or "mu". 511 | 512 | random_state int, RandomState instance or None, (optional, default: None) 513 | If int, random_state is the seed used by the random number generator; 514 | If RandomState instance, random_state is the random number generator; 515 | If None, the random number generator is the RandomState instance used 516 | by `np.random`. Used in in initialization. 517 | 518 | Returns 519 | ------- 520 | doc_vectors, stable_topics: arrays of shape (n_docs, M) and (M, n_words) 521 | The vectors giving the probability of topics for each document, and the stable topics 522 | produced by the ensemble. 
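
    Examples
    --------
    A minimal sketch (the ``EnsembleTopics`` estimator defined below wraps this
    function and is usually the more convenient entry point); the 20-newsgroups
    data is just an illustrative choice:

    >>> from sklearn.datasets import fetch_20newsgroups
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> news = fetch_20newsgroups(subset='all')
    >>> X = CountVectorizer(min_df=5, stop_words='english').fit_transform(news.data)
    >>> doc_vectors, stable_topics = ensemble_fit(
    ...     X, estimated_n_topics=20, n_starts=8, n_jobs=2
    ... )
    >>> # doc_vectors has shape (n_docs, n_stable_topics) and stable_topics has
    >>> # shape (n_stable_topics, n_words); n_stable_topics need not equal 20.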
523 | """ 524 | 525 | X = check_array(X, accept_sparse="csr", dtype=np.float32) 526 | 527 | if issparse(X): 528 | X_coo = X.tocoo() 529 | else: 530 | X_coo = coo_matrix(X, dtype=np.float32) 531 | 532 | all_topics = ensemble_of_topics( 533 | X_coo, 534 | estimated_n_topics, 535 | model, 536 | n_jobs, 537 | n_starts, 538 | parallelism, 539 | init=init, 540 | n_iter=n_iter, 541 | n_iter_per_test=n_iter_per_test, 542 | tolerance=tolerance, 543 | e_step_thresh=e_step_thresh, 544 | bootstrap=bootstrap, 545 | lift_factor=1, 546 | beta_loss=beta_loss, 547 | alpha=alpha, 548 | solver=solver, 549 | random_state=random_state, 550 | ) 551 | 552 | if topic_combination in _topic_combiner: 553 | cluster_topics = _topic_combiner[topic_combination] 554 | else: 555 | raise ValueError( 556 | "topic_combination must be one of {}".format(tuple(_topic_combiner.keys())) 557 | ) 558 | 559 | stable_topics = cluster_topics(all_topics, min_samples, min_cluster_size) 560 | 561 | if lift_factor != 1: 562 | stable_topics **= lift_factor 563 | normalize(stable_topics, axis=1) 564 | 565 | if model == "plsa": 566 | sample_weight = _check_sample_weight(None, X, dtype=np.float32) 567 | doc_vectors = plsa_refit( 568 | X, stable_topics, sample_weight, e_step_thresh=e_step_thresh, 569 | random_state=random_state, 570 | ) 571 | elif model == "nmf": 572 | doc_vectors, _, _ = non_negative_factorization( 573 | X, 574 | H=stable_topics, 575 | n_components=stable_topics.shape[0], 576 | update_H=False, 577 | beta_loss=beta_loss, 578 | alpha=alpha, 579 | solver=solver, 580 | ) 581 | else: 582 | raise ValueError('Model must be one of "plsa" or "nmf"') 583 | 584 | return doc_vectors, stable_topics 585 | 586 | 587 | class EnsembleTopics(BaseEstimator, TransformerMixin): 588 | """Ensemble Topic Modelling (EnsTop) 589 | 590 | Given a bag-of-words matrix representation of a corpus of documents, where each row of the 591 | matrix represents a document, and the jth element of the ith row is the count of the number of 592 | times the jth vocabulary word occurs in the ith document, build an ensemble of different 593 | topic models from bootstrap samples of the corpus, and then select a set of representative 594 | stable topics by clustering the topic produced. 595 | 596 | By default this will use pLSA for topic modelling. In that case the result will be matrices 597 | of conditional probabilities P(z|d) and P(w|z) such that the product matrix of probabilities 598 | P(w|d) maximises the likelihood of seeing the observed corpus data. Here P(z|d) represents 599 | the probability of topic z given document d, P(w|z) represents the probability of word w 600 | given topic z, and P(w|d) represents the probability of word w given document d. 601 | 602 | Parameters 603 | ---------- 604 | n_components: int (optional, default=10) 605 | The estimated number of topics. Note that the final number of topics produced can differ 606 | from this value, and may be more or less than the provided value. Instead this value 607 | provides the algorithm with a suggestion of the approximate number of topics to use. 608 | 609 | model: string (optional, default="plsa") 610 | The topic modeling method to use (either "plsa" or "nmf") 611 | 612 | init: string or tuple (optional, default="random") 613 | The intialization method to use. This should be one of: 614 | * ``"random"`` 615 | * ``"nndsvd"`` 616 | * ``"nmf"`` 617 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 
618 | 619 | int (optional, default=3) 620 | The min_samples parameter to use for HDBSCAN clustering. 621 | 622 | min_cluster_size: int (optional, default=4) 623 | The min_cluster_size parameter to use for HDBSCAN clustering 624 | 625 | n_starts: int (optional, default=16) 626 | The number of bootstrap sampled topic models to run -- the size of the ensemble. 627 | 628 | n_jobs: int (optional, default=8) 629 | The number of parallel jobs to run at a time. 630 | 631 | parallelism: string (optional, default="dask") 632 | The parallelism model to use. Should be one of "dask" or "joblib". 633 | 634 | topic_combination: string (optional, default="hellinger_umap") 635 | The method of comnining ensemble topics into a set of stable topics. Should be one of: 636 | * ``"hellinger_umap"`` 637 | * ``"hellinger"`` 638 | * ``"kl_divergence"`` 639 | 640 | bootstrap: bool (optional, default=True) 641 | Whether to use bootstrap resampling of documents for greater randomization. In general 642 | this is a good idea that helps to prevent overfitting, however for small document 643 | collections, or for other reasons, this might not be desireable. 644 | 645 | n_iter: int 646 | The maximum number iterations of EM to perform 647 | 648 | n_iter_per_test: int 649 | The number of iterations between tests for 650 | relative improvement in log-likelihood. 651 | 652 | tolerance: float 653 | The threshold of relative improvement in 654 | log-likelihood required to continue iterations. 655 | 656 | e_step_thresh: float (optional, default=1e-32) 657 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 658 | below threshold then write a zero for P(z|w,d). 659 | 660 | lift_factor: int (optional, default=1) 661 | Importance factor to apply to lift -- if high lift value are important to 662 | you then larger lift factors will be beneficial. 663 | 664 | beta_loss: float or string, (optional, default 'kullback-leibler') 665 | The beta loss to use if using NMF for topic modeling. 666 | 667 | alpha: float (optional, default=0.0) 668 | The alpha parameter defining regularization if using NMF for topic modeling. 669 | 670 | solver: string, (optional, default="mu") 671 | The choice of solver if using NMF for topic modeling. Should be either "cd" or "mu". 672 | 673 | random_state int, RandomState instance or None, (optional, default: None) 674 | If int, random_state is the seed used by the random number generator; 675 | If RandomState instance, random_state is the random number generator; 676 | If None, the random number generator is the RandomState instance used 677 | by `np.random`. Used in in initialization. 678 | 679 | Attributes 680 | ---------- 681 | 682 | n_components_: int 683 | The actual number of stable topics generated by the ensemble. 684 | 685 | components_: array of shape (n_topics, n_words) 686 | The topic vectors produced by pLSA. Each row is a topic, which is a probability 687 | distribution, over the vocabulary, giving the probability of each word given the topic ( 688 | P(w|z)). 689 | 690 | embedding_: array of shape (n_docs, n_topics) 691 | The document vectors produced by pLSA. Each row corresponds to a document, giving a 692 | probability distribution, over the topic space, specifying the probability of each topic 693 | occuring in the document (P(z|d)). 694 | 695 | training_data_: sparse matrix of shape (n_docs, n_words) 696 | The original training data saved in sparse matrix format. 697 | 698 | References 699 | ---------- 700 | 701 | Hofmann, Thomas. 
"Probabilistic latent semantic analysis." Proceedings of the Fifteenth 702 | conference on Uncertainty in artificial intelligence. Morgan Kaufmann Publishers Inc., 1999. 703 | 704 | Hofmann, Thomas. "Unsupervised learning by probabilistic latent semantic analysis." 705 | Machine learning 42.1-2 (2001): 177-196. 706 | 707 | """ 708 | 709 | def __init__( 710 | self, 711 | n_components=10, 712 | model="plsa", 713 | init="random", 714 | n_starts=16, 715 | min_samples=3, 716 | min_cluster_size=5, 717 | n_jobs=8, 718 | parallelism="dask", 719 | topic_combination="hellinger_umap", 720 | bootstrap=True, 721 | n_iter=80, 722 | n_iter_per_test=10, 723 | tolerance=0.001, 724 | e_step_thresh=1e-32, 725 | lift_factor=1, 726 | beta_loss=1, 727 | alpha=0.0, 728 | solver="mu", 729 | transform_random_seed=42, 730 | random_state=None, 731 | ): 732 | self.n_components = n_components 733 | self.model = model 734 | self.init = init 735 | self.n_starts = n_starts 736 | self.min_samples = min_samples 737 | self.min_cluster_size = min_cluster_size 738 | self.n_jobs = n_jobs 739 | self.parallelism = parallelism 740 | self.topic_combination = topic_combination 741 | self.bootstrap = bootstrap 742 | self.n_iter = n_iter 743 | self.n_iter_per_test = n_iter_per_test 744 | self.tolerance = tolerance 745 | self.e_step_thresh = e_step_thresh 746 | self.lift_factor = lift_factor 747 | self.beta_loss = beta_loss 748 | self.alpha = alpha 749 | self.solver = solver 750 | self.transform_random_seed = transform_random_seed 751 | self.random_state = random_state 752 | 753 | def fit(self, X, y=None): 754 | """Learn the ensemble model for the data X and return the document vectors. 755 | 756 | This is more efficient than calling fit followed by transform. 757 | 758 | Parameters 759 | ---------- 760 | X: array or sparse matrix of shape (n_docs, n_words) 761 | The data matrix pLSA is attempting to fit to. 762 | 763 | y: Ignored 764 | 765 | Returns 766 | ------- 767 | self 768 | """ 769 | self.fit_transform(X) 770 | return self 771 | 772 | def fit_transform(self, X, y=None, **fit_params): 773 | """Learn the ensemble model for the data X and return the document vectors. 774 | 775 | This is more efficient than calling fit followed by transform. 776 | 777 | Parameters 778 | ---------- 779 | X: array or sparse matrix of shape (n_docs, n_words) 780 | The data matrix pLSA is attempting to fit to. 781 | 782 | y: Ignored 783 | 784 | Returns 785 | ------- 786 | embedding: array of shape (n_docs, n_topics) 787 | An embedding of the documents into a topic space. 788 | """ 789 | X = check_array(X, accept_sparse="csr") 790 | 791 | if not issparse(X): 792 | X = csr_matrix(X) 793 | 794 | U, V = ensemble_fit( 795 | X, 796 | self.n_components, 797 | self.model, 798 | self.init, 799 | self.min_samples, 800 | self.min_cluster_size, 801 | self.n_starts, 802 | self.n_jobs, 803 | self.parallelism, 804 | self.topic_combination, 805 | self.bootstrap, 806 | self.n_iter, 807 | self.n_iter_per_test, 808 | self.tolerance, 809 | self.e_step_thresh, 810 | self.lift_factor, 811 | self.beta_loss, 812 | self.alpha, 813 | self.solver, 814 | self.random_state, 815 | ) 816 | self.components_ = V 817 | self.embedding_ = U 818 | self.training_data_ = X 819 | self.n_components_ = self.components_.shape[0] 820 | 821 | return U 822 | 823 | def transform(self, X, y=None): 824 | """Transform the data X into the topic space of the fitted ensemble model. 
825 | 826 | Parameters 827 | ---------- 828 | X: array or sparse matrix of shape (n_docs, n_words) 829 | Corpus to be embedded into topic space 830 | 831 | y: Ignored 832 | 833 | Returns 834 | ------- 835 | embedding: array of shape (n_docs, n_topics) 836 | An embedding of the documents X into the topic space. 837 | """ 838 | 839 | X = check_array(X, accept_sparse="csr") 840 | random_state = check_random_state(self.transform_random_seed) 841 | 842 | if not issparse(X): 843 | X = coo_matrix(X) 844 | else: 845 | X = X.tocoo() 846 | 847 | result = plsa_refit( 848 | X, 849 | self.components_, 850 | n_iter=50, 851 | n_iter_per_test=5, 852 | tolerance=0.001, 853 | random_state=random_state, 854 | ) 855 | 856 | return result 857 | 858 | def coherence(self, topic_num=None, n_words=20): 859 | """Compute the average coherence of fitted topics, or of a single individual topic. 860 | 861 | Parameters 862 | ---------- 863 | topic_num: int (optional, default=None) 864 | The topic number to compute coherence for. If ``topic_num`` is None then the average 865 | coherence over all topics will be computed. 866 | 867 | n_words int (optional, default=20) 868 | The number of topic words to score against. The top ``n_words`` words from the selected 869 | topic will be used. 870 | 871 | Returns 872 | ------- 873 | topic_coherence: float 874 | The requested coherence score. 875 | """ 876 | 877 | # Test for errors 878 | if not isinstance(topic_num, int) and topic_num is not None: 879 | raise ValueError("Topic number must be an integer or None.") 880 | 881 | if topic_num is None: 882 | return mean_coherence( 883 | self.components_, self.training_data_, n_words=n_words 884 | ) 885 | elif topic_num >= 0 and topic_num < self.n_components: 886 | return coherence( 887 | self.components_, topic_num, self.training_data_, n_words=n_words 888 | ) 889 | else: 890 | raise ValueError( 891 | "Topic number must be in range 0 to {}".format(self.n_components) 892 | ) 893 | 894 | def log_lift(self, topic_num=None, n_words=20): 895 | """Compute the average log lift of fitted topics, or of a single individual topic. 896 | 897 | Parameters 898 | ---------- 899 | topic_num: int (optional, default=None) 900 | The topic number to compute log lift for. If ``topic_num`` is None then the average 901 | log lift over all topics will be computed. 902 | 903 | n_words int (optional, default=20) 904 | The number of topic words to score against. The top ``n_words`` words from the selected 905 | topic will be used. 906 | 907 | 908 | Returns 909 | ------- 910 | log_lift: float 911 | The requested log lift score. 
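
        Examples
        --------
        A small sketch, assuming ``model`` is an ``EnsembleTopics`` instance
        that has already been fitted to a bag-of-words matrix ``X``:

        >>> model = EnsembleTopics(n_components=20).fit(X)
        >>> mean_lift = model.log_lift()                    # averaged over all topics
        >>> first_topic_lift = model.log_lift(topic_num=0, n_words=10)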
912 | """ 913 | 914 | # Test for errors 915 | if not isinstance(topic_num, int) and topic_num is not None: 916 | raise ValueError("Topic number must be an integer or None.") 917 | 918 | if topic_num is None: 919 | return mean_log_lift(self.components_, self.training_data_, n_words=n_words) 920 | elif topic_num >= 0 and topic_num < self.n_components: 921 | return log_lift( 922 | self.components_, topic_num, self.training_data_, n_words=n_words 923 | ) 924 | else: 925 | raise ValueError( 926 | "Topic number must be in range 0 to {}".format(self.n_components) 927 | ) 928 | -------------------------------------------------------------------------------- /enstop/plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | from sklearn.utils.extmath import randomized_svd 7 | 8 | try: 9 | from sklearn.utils.validation import _check_sample_weight 10 | except ImportError: 11 | from enstop.utils import _check_sample_weight 12 | from sklearn.decomposition import non_negative_factorization 13 | from scipy.sparse import issparse, csr_matrix, coo_matrix 14 | 15 | from enstop.utils import ( 16 | normalize, 17 | coherence, 18 | mean_coherence, 19 | log_lift, 20 | mean_log_lift, 21 | standardize_input, 22 | ) 23 | 24 | 25 | @numba.njit( 26 | "f4[:,::1](i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4)", 27 | locals={ 28 | "k": numba.types.intp, 29 | "w": numba.types.uint32, 30 | "d": numba.types.uint32, 31 | "z": numba.types.uint16, 32 | "nz_idx": numba.types.uint32, 33 | "norm": numba.types.float32, 34 | }, 35 | fastmath=True, 36 | nogil=True, 37 | parallel=True, 38 | ) 39 | def plsa_e_step( 40 | X_rows, 41 | X_cols, 42 | X_vals, 43 | p_w_given_z, 44 | p_z_given_d, 45 | p_z_given_wd, 46 | probability_threshold=1e-32, 47 | ): 48 | """Perform the E-step of pLSA optimization. This amounts to computing the 49 | probability of each topic given each word document pair. The computation 50 | implements 51 | 52 | P(z|w,d) = \frac{P(z|w)P(d|z)}{\sum_{z=1}^k P(z|w)P(d|z)}. 53 | 54 | This routine is optimized to work with sparse matrices such that P(z|w,d) 55 | is only computed for w, d such that X_{w,d} is non-zero, where X is the 56 | data matrix. 57 | 58 | To make this numba compilable the raw arrays defining the COO format sparse 59 | matrix must be passed separately. 60 | 61 | 62 | Parameters 63 | ---------- 64 | X_rows: array of shape (nnz,) 65 | For each non-zero entry of X, the row of the entry. 66 | 67 | X_cols: array of shape (nnz,) 68 | For each non-zero entry of X, the column of the 69 | entry. 70 | 71 | X_vals: array of shape (nnz,) 72 | For each non-zero entry of X, the value of entry. 73 | 74 | p_w_given_z: array of shape (n_topics, n_words) 75 | The current estimates of values for P(w|z) 76 | 77 | p_z_given_d: array of shape (n_docs, n_topics) 78 | The current estimates of values for P(z|d) 79 | 80 | p_z_given_wd: array of shape (nnz, n_topics) 81 | The result array to write new estimates of P(z|w,d) to. 82 | 83 | probability_threshold: float (optional, default=1e-32) 84 | Option to promote sparsity. If the value of P(w|z)P(z|d) falls below 85 | threshold then write a zero for P(z|w,d). 
86 | 87 | """ 88 | 89 | k = p_w_given_z.shape[0] 90 | 91 | for nz_idx in numba.prange(X_vals.shape[0]): 92 | d = X_rows[nz_idx] 93 | w = X_cols[nz_idx] 94 | 95 | norm = 0.0 96 | for z in range(k): 97 | v = p_w_given_z[z, w] * p_z_given_d[d, z] 98 | if v > probability_threshold: 99 | p_z_given_wd[nz_idx, z] = v 100 | norm += p_z_given_wd[nz_idx, z] 101 | else: 102 | p_z_given_wd[nz_idx, z] = 0.0 103 | for z in range(k): 104 | if norm > 0: 105 | p_z_given_wd[nz_idx, z] /= norm 106 | 107 | return p_z_given_wd 108 | 109 | 110 | @numba.njit( 111 | "UniTuple(f4[:,::1],2)(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1])", 112 | locals={ 113 | "k": numba.types.intp, 114 | "w": numba.types.uint32, 115 | "d": numba.types.uint32, 116 | "z": numba.types.uint16, 117 | "nz_idx": numba.types.uint32, 118 | "s": numba.types.float32, 119 | }, 120 | fastmath=True, 121 | nogil=True, 122 | parallel=True, 123 | ) 124 | def plsa_m_step( 125 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, p_z_given_wd, norm_pwz, norm_pdz 126 | ): 127 | """Perform the M-step of pLSA optimization. This amounts to using the estimates 128 | of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation implements 129 | 130 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 131 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 132 | 133 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 134 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 135 | 136 | To make this numba compilable the raw arrays defining the COO format sparse 137 | matrix must be passed separately. 138 | 139 | Parameters 140 | ---------- 141 | X_rows: array of shape (nnz,) 142 | For each non-zero entry of X, the row of the entry. 143 | 144 | X_cols: array of shape (nnz,) 145 | For each non-zero entry of X, the column of the 146 | entry. 147 | 148 | X_vals: array of shape (nnz,) 149 | For each non-zero entry of X, the value of entry. 150 | 151 | p_w_given_z: array of shape (n_topics, n_words) 152 | The result array to write new estimates of P(w|z) to. 153 | 154 | p_z_given_d: array of shape (n_docs, n_topics) 155 | The result array to write new estimates of P(z|d) to. 156 | 157 | p_z_given_wd: array of shape (nnz, n_topics) 158 | The current estimates for P(z|w,d) 159 | 160 | sample_weight: array of shape (n_docs,) 161 | Input document weights. 162 | 163 | norm_pwz: array of shape (n_topics,) 164 | Auxilliary array used for storing row norms; this is passed in to save 165 | reallocations. 166 | 167 | norm_pdz: array of shape (n_docs,) 168 | Auxilliary array used for storing row norms; this is passed in to save 169 | reallocations. 
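    Returns
    -------
    p_w_given_z, p_z_given_d: arrays of shapes (n_topics, n_words) and (n_docs, n_topics)
        The updated estimates of P(w|z) and P(z|d); both arrays are modified
        in place and also returned for convenience.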
170 | """ 171 | 172 | k = p_z_given_wd.shape[1] 173 | n = p_z_given_d.shape[0] 174 | m = p_w_given_z.shape[1] 175 | 176 | p_w_given_z[:] = 0.0 177 | p_z_given_d[:] = 0.0 178 | 179 | norm_pwz[:] = 0.0 180 | norm_pdz[:] = 0.0 181 | 182 | for nz_idx in range(X_vals.shape[0]): 183 | d = X_rows[nz_idx] 184 | w = X_cols[nz_idx] 185 | x = X_vals[nz_idx] 186 | 187 | for z in range(k): 188 | s = x * p_z_given_wd[nz_idx, z] 189 | 190 | p_w_given_z[z, w] += s 191 | p_z_given_d[d, z] += s 192 | 193 | norm_pwz[z] += s 194 | norm_pdz[d] += s 195 | 196 | for z in numba.prange(k): 197 | if norm_pwz[z] > 0: 198 | for w in range(m): 199 | p_w_given_z[z, w] /= norm_pwz[z] 200 | for d in range(n): 201 | if norm_pdz[d] > 0: 202 | p_z_given_d[d, z] /= norm_pdz[d] 203 | 204 | return p_w_given_z, p_z_given_d 205 | 206 | 207 | @numba.njit( 208 | "UniTuple(f4[:,::1],2)(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1],f4[::1])", 209 | locals={ 210 | "k": numba.types.intp, 211 | "w": numba.types.uint32, 212 | "d": numba.types.uint32, 213 | "z": numba.types.uint16, 214 | "nz_idx": numba.types.uint32, 215 | "s": numba.types.float32, 216 | }, 217 | fastmath=True, 218 | nogil=True, 219 | parallel=True, 220 | ) 221 | def plsa_m_step_w_sample_weight( 222 | X_rows, 223 | X_cols, 224 | X_vals, 225 | p_w_given_z, 226 | p_z_given_d, 227 | p_z_given_wd, 228 | sample_weight, 229 | norm_pwz, 230 | norm_pdz, 231 | ): 232 | """Perform the M-step of pLSA optimization. This amounts to using the estimates 233 | of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation implements 234 | 235 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 236 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 237 | 238 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 239 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 240 | 241 | To make this numba compilable the raw arrays defining the COO format sparse 242 | matrix must be passed separately. 243 | 244 | Parameters 245 | ---------- 246 | X_rows: array of shape (nnz,) 247 | For each non-zero entry of X, the row of the entry. 248 | 249 | X_cols: array of shape (nnz,) 250 | For each non-zero entry of X, the column of the 251 | entry. 252 | 253 | X_vals: array of shape (nnz,) 254 | For each non-zero entry of X, the value of entry. 255 | 256 | p_w_given_z: array of shape (n_topics, n_words) 257 | The result array to write new estimates of P(w|z) to. 258 | 259 | p_z_given_d: array of shape (n_docs, n_topics) 260 | The result array to write new estimates of P(z|d) to. 261 | 262 | p_z_given_wd: array of shape (nnz, n_topics) 263 | The current estimates for P(z|w,d) 264 | 265 | sample_weight: array of shape (n_docs,) 266 | Input document weights. 267 | 268 | norm_pwz: array of shape (n_topics,) 269 | Auxilliary array used for storing row norms; this is passed in to save 270 | reallocations. 271 | 272 | norm_pdz: array of shape (n_docs,) 273 | Auxilliary array used for storing row norms; this is passed in to save 274 | reallocations. 
275 | """ 276 | 277 | k = p_z_given_wd.shape[1] 278 | n = p_z_given_d.shape[0] 279 | m = p_w_given_z.shape[1] 280 | 281 | p_w_given_z[:] = 0.0 282 | p_z_given_d[:] = 0.0 283 | 284 | norm_pwz[:] = 0.0 285 | norm_pdz[:] = 0.0 286 | 287 | for nz_idx in range(X_vals.shape[0]): 288 | d = X_rows[nz_idx] 289 | w = X_cols[nz_idx] 290 | x = X_vals[nz_idx] 291 | 292 | for z in range(k): 293 | s = x * p_z_given_wd[nz_idx, z] 294 | t = s * sample_weight[d] 295 | 296 | p_w_given_z[z, w] += t 297 | p_z_given_d[d, z] += s 298 | 299 | norm_pwz[z] += t 300 | norm_pdz[d] += s 301 | 302 | for z in numba.prange(k): 303 | if norm_pwz[z] > 0: 304 | for w in range(m): 305 | p_w_given_z[z, w] /= norm_pwz[z] 306 | for d in range(n): 307 | if norm_pdz[d] > 0: 308 | p_z_given_d[d, z] /= norm_pdz[d] 309 | 310 | return p_w_given_z, p_z_given_d 311 | 312 | 313 | @numba.njit( 314 | "f4(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[::1])", 315 | locals={ 316 | "k": numba.types.intp, 317 | "w": numba.types.uint32, 318 | "d": numba.types.uint32, 319 | "z": numba.types.uint16, 320 | "nz_idx": numba.types.uint32, 321 | "x": numba.types.float32, 322 | "result": numba.types.float32, 323 | "p_w_given_d": numba.types.float32, 324 | }, 325 | fastmath=True, 326 | nogil=True, 327 | parallel=True, 328 | ) 329 | def log_likelihood(X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight): 330 | """Compute the log-likelihood of observing the data X given estimates for P(w|z) 331 | and P(z|d). The likelihood of X_{w,d} under the model is given by X_{w,d} P(w|d) 332 | = X_{w,d} P(w|z) P(z|d). This function returns 333 | 334 | \log\left(\prod_{w,d} X_{w,d} P(w|d)\right) 335 | 336 | This routine is optimized to work with sparse matrices and only compute values 337 | for w, d such that X_{w,d} is non-zero. 338 | 339 | To make this numba compilable the raw arrays defining the COO format sparse 340 | matrix must be passed separately. 341 | 342 | Parameters 343 | ---------- 344 | X_rows: array of shape (nnz,) 345 | For each non-zero entry of X, the row of the entry. 346 | 347 | X_cols: array of shape (nnz,) 348 | For each non-zero entry of X, the column of the 349 | entry. 350 | 351 | X_vals: array of shape (nnz,) 352 | For each non-zero entry of X, the value of entry. 353 | 354 | p_w_given_z: array of shape (n_topics, n_words) 355 | The current estimates of values for P(w|z) 356 | 357 | p_z_given_d: array of shape (n_docs, n_topics) 358 | The current estimates of values for P(z|d) 359 | 360 | sample_weight: array of shape (n_docs,) 361 | Input document weights. 362 | 363 | Returns 364 | ------- 365 | 366 | log_likelihood: float 367 | The log of the likelihood of observing X under the 368 | model given by the P(z|d) and P(z|w). 369 | 370 | """ 371 | 372 | result = 0.0 373 | k = p_w_given_z.shape[0] 374 | 375 | for nz_idx in numba.prange(X_vals.shape[0]): 376 | d = X_rows[nz_idx] 377 | w = X_cols[nz_idx] 378 | x = X_vals[nz_idx] 379 | 380 | p_w_given_d = 0.0 381 | for z in range(k): 382 | p_w_given_d += p_w_given_z[z, w] * p_z_given_d[d, z] 383 | 384 | result += x * np.log(p_w_given_d) * sample_weight[d] 385 | 386 | return result 387 | 388 | 389 | @numba.njit(fastmath=True, nogil=True) 390 | def norm(x): 391 | """Numba compilable routine for computing the l2-norm 392 | of a given vector x. 393 | 394 | Parameters 395 | ---------- 396 | x: array of shape (n,) 397 | The array to compute the l2-norm of. 398 | 399 | Returns 400 | ------- 401 | n: float 402 | The l2-norm of the input array x. 
403 | """ 404 | result = 0.0 405 | 406 | for i in range(x.shape[0]): 407 | result += x[i] ** 2 408 | 409 | return np.sqrt(result) 410 | 411 | 412 | def plsa_init(X, k, init="random", rng=np.random): 413 | """Initialize matrices for pLSA. Specifically, given data X, a number of topics 414 | k, and an initialization method, compute matrices for P(z|d) and P(w|z) that can 415 | be used to begin an EM optimization of pLSA. 416 | 417 | Various initialization approaches are available. The most straightforward is 418 | "random", which randomly initializes values for P(z|d) and P(w|z) and normalizes 419 | to make them probabilities. A second approach, borrowing from sklearn's NMF 420 | implementation, is to use a non-negative SVD approach ("nndsvd"). A third option 421 | is the use the fast coordinate descent under Frobenius loss version of NMF and 422 | then normalize to make probabilities ("nmf"). Finally if the ``init`` parameter 423 | is a tuple of ndarrays then these will be used, allowing for custom user defined 424 | initializations. 425 | 426 | Parameters 427 | ---------- 428 | X: sparse matrix of shape (n_docs, n_words) 429 | The data matrix pLSA is attempting to fit to. 430 | 431 | k: int 432 | The number of topics for pLSA to fit with. 433 | 434 | init: string or tuple (optional, default="random") 435 | The intialization method to use. This should be one of: 436 | * ``"random"`` 437 | * ``"nndsvd"`` 438 | * ``"nmf"`` 439 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 440 | 441 | rng: RandomState instance (optional, default=np.random) 442 | Seeded randomness generator. Used for random intialization. 443 | 444 | Returns 445 | ------- 446 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 447 | Initialized arrays suitable to passing to 448 | pLSA optimization methods. 449 | """ 450 | 451 | n = X.shape[0] 452 | m = X.shape[1] 453 | 454 | if init == "random": 455 | p_w_given_z = rng.rand(k, m) 456 | p_z_given_d = rng.rand(n, k) 457 | 458 | elif init == "nndsvd": 459 | # Taken from sklearn NMF implementation 460 | U, S, V = randomized_svd(X, k) 461 | p_z_given_d, p_w_given_z = np.zeros(U.shape), np.zeros(V.shape) 462 | 463 | # The leading singular triplet is non-negative 464 | # so it can be used as is for initialization. 
465 | p_z_given_d[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) 466 | p_w_given_z[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) 467 | 468 | for j in range(1, k): 469 | x, y = U[:, j], V[j, :] 470 | 471 | # extract positive and negative parts of column vectors 472 | x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) 473 | x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) 474 | 475 | # and their norms 476 | x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) 477 | x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) 478 | 479 | m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm 480 | 481 | # choose update 482 | if m_p > m_n: 483 | u = x_p / x_p_nrm 484 | v = y_p / y_p_nrm 485 | sigma = m_p 486 | else: 487 | u = x_n / x_n_nrm 488 | v = y_n / y_n_nrm 489 | sigma = m_n 490 | 491 | lbd = np.sqrt(S[j] * sigma) 492 | p_z_given_d[:, j] = lbd * u 493 | p_w_given_z[j, :] = lbd * v 494 | 495 | elif init == "nmf": 496 | p_z_given_d, p_w_given_z, _ = non_negative_factorization( 497 | X, 498 | n_components=k, 499 | init="nndsvd", 500 | solver="cd", 501 | beta_loss=2, 502 | tol=1e-2, 503 | max_iter=100, 504 | ) 505 | elif isinstance(init, tuple) or isinstance(init, list): 506 | p_z_given_d, p_w_given_z = init 507 | else: 508 | raise ValueError("Unrecognized init {}".format(init)) 509 | 510 | normalize(p_w_given_z, axis=1) 511 | normalize(p_z_given_d, axis=1) 512 | 513 | return p_z_given_d, p_w_given_z 514 | 515 | 516 | @numba.njit(fastmath=True, nogil=True) 517 | def plsa_fit_inner( 518 | X_rows, 519 | X_cols, 520 | X_vals, 521 | p_w_given_z, 522 | p_z_given_d, 523 | sample_weight, 524 | n_iter=100, 525 | n_iter_per_test=10, 526 | tolerance=0.001, 527 | e_step_thresh=1e-32, 528 | use_sample_weights=False, 529 | ): 530 | """Internal loop of EM steps required to optimize pLSA, along with relative 531 | convergence tests with respect to the log-likelihood of observing the data under 532 | the model. 533 | 534 | The EM looping will stop when either ``n_iter`` iterations have been reached, 535 | or if the relative improvement in log-likelihood over the last 536 | ``n_iter_per_test`` steps is under ``threshold``. 537 | 538 | This function is designed to wrap the internals of the EM process in a numba 539 | compilable loop, and is not the preferred entry point for fitting a plsa model. 540 | 541 | Parameters 542 | ---------- 543 | X_rows: array of shape (nnz,) 544 | For each non-zero entry of X, the row of the entry. 545 | 546 | X_cols: array of shape (nnz,) 547 | For each non-zero entry of X, the column of the 548 | entry. 549 | 550 | X_vals: array of shape (nnz,) 551 | For each non-zero entry of X, the value of entry. 552 | 553 | p_w_given_z: array of shape (n_topics, n_words) 554 | The current estimates of values for P(w|z) 555 | 556 | p_z_given_d: array of shape (n_docs, n_topics) 557 | The current estimates of values for P(z|d) 558 | 559 | sample_weight: array of shape (n_docs,) 560 | Input document weights. 561 | 562 | n_iter: int 563 | The maximum number iterations of EM to perform 564 | 565 | n_iter_per_test: int 566 | The number of iterations between tests for 567 | relative improvement in log-likelihood. 568 | 569 | tolerance: float 570 | The threshold of relative improvement in 571 | log-likelihood required to continue iterations. 572 | 573 | e_step_thresh: float (optional, default=1e-32) 574 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 575 | below threshold then write a zero for P(z|w,d). 
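    use_sample_weights: bool (optional, default=False)
        Whether to apply the per-document ``sample_weight`` values during the
        M step; when False the unweighted M step is used.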
576 | 577 | Returns 578 | ------- 579 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 580 | The resulting model values of P(z|d) and P(w|z) 581 | 582 | """ 583 | k = p_z_given_d.shape[1] 584 | n = p_z_given_d.shape[0] 585 | 586 | p_z_given_wd = np.zeros((X_vals.shape[0], k), dtype=np.float32) 587 | 588 | norm_pwz = np.zeros(k, dtype=np.float32) 589 | norm_pdz = np.zeros(n, dtype=np.float32) 590 | 591 | previous_log_likelihood = log_likelihood( 592 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight 593 | ) 594 | 595 | for i in range(n_iter): 596 | 597 | plsa_e_step( 598 | X_rows, 599 | X_cols, 600 | X_vals, 601 | p_w_given_z, 602 | p_z_given_d, 603 | p_z_given_wd, 604 | e_step_thresh, 605 | ) 606 | if use_sample_weights: 607 | plsa_m_step_w_sample_weight( 608 | X_rows, 609 | X_cols, 610 | X_vals, 611 | p_w_given_z, 612 | p_z_given_d, 613 | p_z_given_wd, 614 | sample_weight, 615 | norm_pwz, 616 | norm_pdz, 617 | ) 618 | else: 619 | plsa_m_step( 620 | X_rows, 621 | X_cols, 622 | X_vals, 623 | p_w_given_z, 624 | p_z_given_d, 625 | p_z_given_wd, 626 | norm_pwz, 627 | norm_pdz, 628 | ) 629 | 630 | if i % n_iter_per_test == 0: 631 | current_log_likelihood = log_likelihood( 632 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight 633 | ) 634 | change = np.abs(current_log_likelihood - previous_log_likelihood) 635 | if change == 0 or change / np.abs(current_log_likelihood) < tolerance: 636 | break 637 | else: 638 | previous_log_likelihood = current_log_likelihood 639 | 640 | return p_z_given_d, p_w_given_z 641 | 642 | 643 | def plsa_fit( 644 | X, 645 | k, 646 | sample_weight, 647 | init="random", 648 | n_iter=100, 649 | n_iter_per_test=10, 650 | tolerance=0.001, 651 | e_step_thresh=1e-32, 652 | random_state=None, 653 | ): 654 | """Fit a pLSA model to a data matrix ``X`` with ``k`` topics, an initialized 655 | according to ``init``. This will run an EM method to optimize estimates of P(z|d) 656 | and P(w|z). The will perform at most ``n_iter`` EM step iterations, 657 | while checking for relative improvement of the log-likelihood of the data under 658 | the model every ``n_iter_per_test`` iterations, and stops early if that is under 659 | ``tolerance``. 660 | 661 | Parameters 662 | ---------- 663 | X: sparse matrix of shape (n_docs, n_words) 664 | The data matrix pLSA is attempting to fit to. 665 | 666 | k: int 667 | The number of topics for pLSA to fit with. 668 | 669 | sample_weight: array of shape (n_docs,) 670 | Input document weights. 671 | 672 | init: string or tuple (optional, default="random") 673 | The intialization method to use. This should be one of: 674 | * ``"random"`` 675 | * ``"nndsvd"`` 676 | * ``"nmf"`` 677 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 678 | 679 | n_iter: int 680 | The maximum number iterations of EM to perform 681 | 682 | n_iter_per_test: int 683 | The number of iterations between tests for 684 | relative improvement in log-likelihood. 685 | 686 | tolerance: float 687 | The threshold of relative improvement in 688 | log-likelihood required to continue iterations. 689 | 690 | e_step_thresh: float (optional, default=1e-32) 691 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 692 | below threshold then write a zero for P(z|w,d). 
693 | 694 | random_state: int, RandomState instance or None, (optional, default: None) 695 | If int, random_state is the seed used by the random number generator; 696 | If RandomState instance, random_state is the random number generator; 697 | If None, the random number generator is the RandomState instance used 698 | by `np.random`. Used in in initialization. 699 | 700 | Returns 701 | ------- 702 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 703 | The resulting model values of P(z|d) and P(w|z) 704 | 705 | """ 706 | 707 | rng = check_random_state(random_state) 708 | p_z_given_d, p_w_given_z = plsa_init(X, k, init=init, rng=rng) 709 | p_z_given_d = p_z_given_d.astype(np.float32, order="C") 710 | p_w_given_z = p_w_given_z.astype(np.float32, order="C") 711 | 712 | use_sample_weights = np.any(sample_weight != 1.0) 713 | 714 | A = X.tocoo().astype(np.float32) 715 | 716 | p_z_given_d, p_w_given_z = plsa_fit_inner( 717 | A.row, 718 | A.col, 719 | A.data, 720 | p_w_given_z, 721 | p_z_given_d, 722 | sample_weight, 723 | n_iter, 724 | n_iter_per_test, 725 | tolerance, 726 | e_step_thresh, 727 | use_sample_weights, 728 | ) 729 | 730 | return p_z_given_d, p_w_given_z 731 | 732 | 733 | @numba.njit( 734 | "UniTuple(f4[:,::1],2)(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1])", 735 | locals={ 736 | "k": numba.types.intp, 737 | "w": numba.types.uint32, 738 | "d": numba.types.uint32, 739 | "z": numba.types.uint16, 740 | "nz_idx": numba.types.uint32, 741 | "s": numba.types.float32, 742 | }, 743 | fastmath=True, 744 | nogil=True, 745 | ) 746 | def plsa_refit_m_step( 747 | X_rows, 748 | X_cols, 749 | X_vals, 750 | p_w_given_z, 751 | p_z_given_d, 752 | p_z_given_wd, 753 | sample_weight, 754 | norm_pdz, 755 | ): 756 | """Optimized routine for the M step fitting values of P(z|d) given a fixed set of 757 | topics (i.e. P(w|z)). 758 | 759 | This routine is optimized to work with sparse matrices and only compute values 760 | for w, d such that X_{w,d} is non-zero. 761 | 762 | To make this numba compilable the raw arrays defining the COO format sparse 763 | matrix must be passed separately. 764 | 765 | Parameters 766 | ---------- 767 | X_rows: array of shape (nnz,) 768 | For each non-zero entry of X, the row of the entry. 769 | 770 | X_cols: array of shape (nnz,) 771 | For each non-zero entry of X, the column of the 772 | entry. 773 | 774 | X_vals: array of shape (nnz,) 775 | For each non-zero entry of X, the value of entry. 776 | 777 | p_w_given_z: array of shape (n_topics, n_words) 778 | The fixed topics P(w|z) to fit P(z|d) against. 779 | 780 | p_z_given_d: array of shape (n_docs, n_topics) 781 | The result array to write new estimates of P(z|d) to. 782 | 783 | p_z_given_wd: array of shape (nnz, n_topics) 784 | The current estimates for P(z|w,d) 785 | 786 | sample_weight: array of shape (n_docs,) 787 | Input document weights. 788 | 789 | norm_pdz: array of shape (n_docs,) 790 | Auxilliary array used for storing row norms; this is passed in to save 791 | reallocations. 
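    Returns
    -------
    p_w_given_z, p_z_given_d: arrays of shapes (n_topics, n_words) and (n_docs, n_topics)
        The fixed topic matrix (returned unchanged) and the updated estimates
        of P(z|d).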
792 | 793 | """ 794 | 795 | k = p_z_given_wd.shape[1] 796 | n = p_z_given_d.shape[0] 797 | 798 | p_z_given_d[:] = 0.0 799 | norm_pdz[:] = 0.0 800 | 801 | for nz_idx in range(X_vals.shape[0]): 802 | d = X_rows[nz_idx] 803 | w = X_cols[nz_idx] 804 | x = X_vals[nz_idx] 805 | 806 | for z in range(k): 807 | s = x * p_z_given_wd[nz_idx, z] 808 | p_z_given_d[d, z] += s 809 | norm_pdz[d] += s 810 | 811 | for z in range(k): 812 | for d in range(n): 813 | if norm_pdz[d] > 0: 814 | p_z_given_d[d, z] /= norm_pdz[d] 815 | 816 | return p_w_given_z, p_z_given_d 817 | 818 | 819 | @numba.njit(locals={"e_step_thresh": numba.types.float32,}, fastmath=True, nogil=True) 820 | def plsa_refit_inner( 821 | X_rows, 822 | X_cols, 823 | X_vals, 824 | topics, 825 | p_z_given_d, 826 | sample_weight, 827 | n_iter=50, 828 | n_iter_per_test=10, 829 | tolerance=0.005, 830 | e_step_thresh=1e-32, 831 | ): 832 | """Optimized routine for refitting values of P(z|d) given a fixed set of topics ( 833 | i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics 834 | (given, for example, by an ensemble result). 835 | 836 | This routine is optimized to work with sparse matrices and only compute values 837 | for w, d such that X_{w,d} is non-zero. 838 | 839 | To make this numba compilable the raw arrays defining the COO format sparse 840 | matrix must be passed separately. 841 | 842 | Parameters 843 | ---------- 844 | X_rows: array of shape (nnz,) 845 | For each non-zero entry of X, the row of the entry. 846 | 847 | X_cols: array of shape (nnz,) 848 | For each non-zero entry of X, the column of the 849 | entry. 850 | 851 | X_vals: array of shape (nnz,) 852 | For each non-zero entry of X, the value of entry. 853 | 854 | topics: array of shape (n_topics, n_words) 855 | The fixed topics against which to fit the values of P(z|d). 856 | 857 | p_z_given_d: array of shape (n_docs, n_topics) 858 | The current estimates of values for P(z|d) 859 | 860 | sample_weight: array of shape (n_docs,) 861 | Input document weights. 862 | 863 | n_iter: int 864 | The maximum number iterations of EM to perform 865 | 866 | n_iter_per_test: int 867 | The number of iterations between tests for relative improvement in 868 | log-likelihood. 869 | 870 | tolerance: float 871 | The threshold of relative improvement in log-likelihood required to continue 872 | iterations. 873 | 874 | e_step_thresh: float (optional, default=1e-32) 875 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 876 | below threshold then write a zero for P(z|w,d). 
877 | 878 | Returns 879 | ------- 880 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 881 | The resulting model values of P(z|d) and P(w|z) 882 | 883 | """ 884 | k = topics.shape[0] 885 | p_z_given_wd = np.zeros((X_rows.shape[0], k), dtype=np.float32) 886 | 887 | norm_pdz = np.zeros(p_z_given_d.shape[0], dtype=np.float32) 888 | 889 | previous_log_likelihood = log_likelihood( 890 | X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight 891 | ) 892 | 893 | for i in range(n_iter): 894 | 895 | plsa_e_step( 896 | X_rows, X_cols, X_vals, topics, p_z_given_d, p_z_given_wd, e_step_thresh 897 | ) 898 | plsa_refit_m_step( 899 | X_rows, 900 | X_cols, 901 | X_vals, 902 | topics, 903 | p_z_given_d, 904 | p_z_given_wd, 905 | sample_weight, 906 | norm_pdz, 907 | ) 908 | 909 | if i % n_iter_per_test == 0: 910 | current_log_likelihood = log_likelihood( 911 | X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight 912 | ) 913 | if current_log_likelihood > 0: 914 | change = np.abs(current_log_likelihood - previous_log_likelihood) 915 | if change / np.abs(current_log_likelihood) < tolerance: 916 | break 917 | else: 918 | previous_log_likelihood = current_log_likelihood 919 | 920 | return p_z_given_d 921 | 922 | 923 | def plsa_refit( 924 | X, 925 | topics, 926 | sample_weight, 927 | n_iter=50, 928 | n_iter_per_test=10, 929 | tolerance=0.005, 930 | e_step_thresh=1e-32, 931 | random_state=None, 932 | ): 933 | """Routine for refitting values of P(z|d) given a fixed set of topics ( 934 | i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics 935 | (given, for example, by an ensemble result). 936 | 937 | Parameters 938 | ---------- 939 | X: sparse matrix of shape (n_docs, n_words) 940 | The data matrix pLSA is attempting to fit to. 941 | 942 | topics: array of shape (n_topics, n_words) 943 | The fixed topics against which to fit the values of P(z|d). 944 | 945 | sample_weight: array of shape (n_docs,) 946 | Input document weights. 947 | 948 | n_iter: int 949 | The maximum number iterations of EM to perform 950 | 951 | n_iter_per_test: int 952 | The number of iterations between tests for relative improvement in 953 | log-likelihood. 954 | 955 | tolerance: float 956 | The threshold of relative improvement in log-likelihood required to continue 957 | iterations. 958 | 959 | e_step_thresh: float (optional, default=1e-32) 960 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 961 | below threshold then write a zero for P(z|w,d). 962 | 963 | random_state: int, RandomState instance or None, (optional, default: None) 964 | If int, random_state is the seed used by the random number generator; 965 | If RandomState instance, random_state is the random number generator; 966 | If None, the random number generator is the RandomState instance used 967 | by `np.random`. Used in in initialization. 
968 | 969 | Returns 970 | ------- 971 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 972 | The resulting model values of P(z|d) and P(w|z) 973 | 974 | """ 975 | A = X.tocoo().astype(np.float32) 976 | k = topics.shape[0] 977 | 978 | rng = check_random_state(random_state) 979 | p_z_given_d = rng.rand(A.shape[0], k) 980 | normalize(p_z_given_d, axis=1) 981 | p_z_given_d = p_z_given_d.astype(np.float32) 982 | topics = topics.astype(np.float32) 983 | 984 | p_z_given_d = plsa_refit_inner( 985 | A.row, 986 | A.col, 987 | A.data, 988 | topics, 989 | p_z_given_d, 990 | sample_weight, 991 | n_iter=n_iter, 992 | n_iter_per_test=n_iter_per_test, 993 | tolerance=tolerance, 994 | e_step_thresh=e_step_thresh, 995 | ) 996 | 997 | return p_z_given_d 998 | 999 | 1000 | class PLSA(BaseEstimator, TransformerMixin): 1001 | """Probabilistic Latent Semantic Analysis (pLSA) 1002 | 1003 | Given a bag-of-words matrix representation of a corpus of documents, where each row of the 1004 | matrix represents a document, and the jth element of the ith row is the count of the number of 1005 | times the jth vocabulary word occurs in the ith document, estimate matrices of conditional 1006 | probabilities P(z|d) and P(w|z) such that the product matrix of probabilities P(w|d) 1007 | maximises the likelihood of seeing the observed corpus data. Here P(z|d) represents the 1008 | probability of topic z given document d, P(w|z) represents the probability of word w given 1009 | topic z, and P(w|d) represents the probability of word w given document d. 1010 | 1011 | The algorithm proceeds using an Expectation-Maximization (EM) approach to attempt to maximise 1012 | the likelihood of the observed data under the estimated model. 1013 | 1014 | Parameters 1015 | ---------- 1016 | n_components: int (optional, default=10) 1017 | The number of topics to use in the matrix factorization. 1018 | 1019 | init: string or tuple (optional, default="random") 1020 | The intialization method to use. This should be one of: 1021 | * ``"random"`` 1022 | * ``"nndsvd"`` 1023 | * ``"nmf"`` 1024 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 1025 | 1026 | n_iter: int 1027 | The maximum number iterations of EM to perform 1028 | 1029 | n_iter_per_test: int 1030 | The number of iterations between tests for relative improvement in 1031 | log-likelihood. 1032 | 1033 | tolerance: float 1034 | The threshold of relative improvement in log-likelihood required to continue 1035 | iterations. 1036 | 1037 | e_step_thresh: float (optional, default=1e-32) 1038 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 1039 | below threshold then write a zero for P(z|w,d). 1040 | 1041 | random_state: int, RandomState instance or None, (optional, default: None) 1042 | If int, random_state is the seed used by the random number generator; 1043 | If RandomState instance, random_state is the random number generator; 1044 | If None, the random number generator is the RandomState instance used 1045 | by `np.random`. Used in in initialization. 1046 | 1047 | Attributes 1048 | ---------- 1049 | 1050 | components_: array of shape (n_topics, n_words) 1051 | The topic vectors produced by pLSA. Each row is a topic, which is a probability 1052 | distribution, over the vocabulary, giving the probability of each word given the topic ( 1053 | P(w|z)). 1054 | 1055 | embedding_: array of shape (n_docs, n_topics) 1056 | The document vectors produced by pLSA. 
Each row corresponds to a document, giving a 1057 | probability distribution, over the topic space, specifying the probability of each topic 1058 | occuring in the document (P(z|d)). 1059 | 1060 | training_data_: sparse matrix of shape (n_docs, n_words) 1061 | The original training data saved in sparse matrix format. 1062 | 1063 | References 1064 | ---------- 1065 | 1066 | Hofmann, Thomas. "Probabilistic latent semantic analysis." Proceedings of the Fifteenth 1067 | conference on Uncertainty in artificial intelligence. Morgan Kaufmann Publishers Inc., 1999. 1068 | 1069 | Hofmann, Thomas. "Unsupervised learning by probabilistic latent semantic analysis." 1070 | Machine learning 42.1-2 (2001): 177-196. 1071 | 1072 | """ 1073 | 1074 | def __init__( 1075 | self, 1076 | n_components=10, 1077 | init="random", 1078 | n_iter=100, 1079 | n_iter_per_test=10, 1080 | tolerance=0.001, 1081 | e_step_thresh=1e-32, 1082 | transform_random_seed=42, 1083 | random_state=None, 1084 | ): 1085 | 1086 | self.n_components = n_components 1087 | self.init = init 1088 | self.n_iter = n_iter 1089 | self.n_iter_per_test = n_iter_per_test 1090 | self.tolerance = tolerance 1091 | self.e_step_thresh = e_step_thresh 1092 | self.transform_random_seed = transform_random_seed 1093 | self.random_state = random_state 1094 | 1095 | def fit(self, X, y=None, sample_weight=None): 1096 | """Learn the pLSA model for the data X and return the document vectors. 1097 | 1098 | This is more efficient than calling fit followed by transform. 1099 | 1100 | Parameters 1101 | ---------- 1102 | X: array or sparse matrix of shape (n_docs, n_words) 1103 | The data matrix pLSA is attempting to fit to. 1104 | 1105 | y: Ignored 1106 | 1107 | sample_weight: array of shape (n_docs,) 1108 | Input document weights. 1109 | 1110 | Returns 1111 | ------- 1112 | self 1113 | """ 1114 | self.fit_transform(X, sample_weight=sample_weight) 1115 | return self 1116 | 1117 | def fit_transform(self, X, y=None, sample_weight=None): 1118 | """Learn the pLSA model for the data X and return the document vectors. 1119 | 1120 | This is more efficient than calling fit followed by transform. 1121 | 1122 | Parameters 1123 | ---------- 1124 | X: array or sparse matrix of shape (n_docs, n_words) 1125 | The data matrix pLSA is attempting to fit to. 1126 | 1127 | y: Ignored 1128 | 1129 | sample_weight: array of shape (n_docs,) 1130 | Input document weights. 1131 | 1132 | Returns 1133 | ------- 1134 | embedding: array of shape (n_docs, n_topics) 1135 | An embedding of the documents into a topic space. 
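        Examples
        --------
        A minimal usage sketch; ``docs`` stands in for any list of raw text
        documents and is not defined in this module:

        >>> from sklearn.feature_extraction.text import CountVectorizer
        >>> X = CountVectorizer().fit_transform(docs)  # docs: list of strings
        >>> doc_topics = PLSA(n_components=10).fit_transform(X)
        >>> # doc_topics has shape (n_docs, 10); each row is the distribution P(z|d)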
1136 | """ 1137 | 1138 | X = check_array(X, accept_sparse="csr") 1139 | X = standardize_input(X) 1140 | 1141 | if not issparse(X): 1142 | X = csr_matrix(X) 1143 | 1144 | sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 1145 | 1146 | if np.any(X.data < 0): 1147 | raise ValueError( 1148 | "PLSA is only valid for matrices with non-negative " "entries" 1149 | ) 1150 | 1151 | row_sums = np.array(X.sum(axis=1).T)[0] 1152 | good_rows = row_sums != 0 1153 | 1154 | if not np.all(good_rows): 1155 | zero_rows_found = True 1156 | data_for_fitting = X[good_rows] 1157 | else: 1158 | zero_rows_found = False 1159 | data_for_fitting = X 1160 | 1161 | U, V = plsa_fit( 1162 | data_for_fitting, 1163 | self.n_components, 1164 | sample_weight, 1165 | self.init, 1166 | self.n_iter, 1167 | self.n_iter_per_test, 1168 | self.tolerance, 1169 | self.e_step_thresh, 1170 | self.random_state, 1171 | ) 1172 | 1173 | if zero_rows_found: 1174 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 1175 | self.embedding_[good_rows] = U 1176 | else: 1177 | self.embedding_ = U 1178 | 1179 | self.components_ = V 1180 | self.training_data_ = X 1181 | 1182 | return self.embedding_ 1183 | 1184 | def transform(self, X, y=None): 1185 | """Transform the data X into the topic space of the fitted pLSA model. 1186 | 1187 | Parameters 1188 | ---------- 1189 | X: array or sparse matrix of shape (n_docs, n_words) 1190 | Corpus to be embedded into topic space 1191 | 1192 | y: Ignored 1193 | 1194 | Returns 1195 | ------- 1196 | embedding: array of shape (n_docs, n_topics) 1197 | An embedding of the documents X into the topic space. 1198 | """ 1199 | X = check_array(X, accept_sparse="csr") 1200 | random_state = check_random_state(self.transform_random_seed) 1201 | 1202 | # Set weights to 1 for all examples 1203 | sample_weight = _check_sample_weight(None, X, dtype=np.float32) 1204 | 1205 | if not issparse(X): 1206 | X = coo_matrix(X) 1207 | else: 1208 | X = X.tocoo() 1209 | 1210 | result = plsa_refit( 1211 | X, 1212 | self.components_, 1213 | sample_weight, 1214 | n_iter=50, 1215 | n_iter_per_test=5, 1216 | tolerance=0.001, 1217 | random_state=random_state, 1218 | ) 1219 | 1220 | return result 1221 | 1222 | def coherence(self, topic_num=None, n_words=20): 1223 | """Compute the average coherence of fitted topics, or of a single individual topic. 1224 | 1225 | Parameters 1226 | ---------- 1227 | topic_num: int (optional, default=None) 1228 | The topic number to compute coherence for. If ``topic_num`` is None then the average 1229 | coherence over all topics will be computed. 1230 | 1231 | n_words int (optional, default=20) 1232 | The number of topic words to score against. The top ``n_words`` words from the selected 1233 | topic will be used. 1234 | 1235 | Returns 1236 | ------- 1237 | topic_coherence: float 1238 | The requested coherence score. 
1239 | """ 1240 | 1241 | # Test for errors 1242 | if not isinstance(topic_num, int) and topic_num is not None: 1243 | raise ValueError("Topic number must be an integer or None.") 1244 | 1245 | if topic_num is None: 1246 | return mean_coherence(self.components_, self.training_data_, n_words) 1247 | elif topic_num >= 0 and topic_num < self.n_components: 1248 | return coherence(self.components_, topic_num, self.training_data_, n_words) 1249 | else: 1250 | raise ValueError( 1251 | "Topic number must be in range 0 to {}".format(self.n_components) 1252 | ) 1253 | 1254 | def log_lift(self, topic_num=None, n_words=20): 1255 | """Compute the average log lift of fitted topics, or of a single individual topic. 1256 | 1257 | Parameters 1258 | ---------- 1259 | topic_num: int (optional, default=None) 1260 | The topic number to compute log lift for. If ``topic_num`` is None then the average 1261 | log lift over all topics will be computed. 1262 | 1263 | n_words int (optional, default=20) 1264 | The number of topic words to score against. The top ``n_words`` words from the selected 1265 | topic will be used. 1266 | 1267 | 1268 | Returns 1269 | ------- 1270 | log_lift: float 1271 | The requested log lift score. 1272 | """ 1273 | 1274 | # Test for errors 1275 | if not isinstance(topic_num, int) and topic_num is not None: 1276 | raise ValueError("Topic number must be an integer or None.") 1277 | 1278 | if topic_num is None: 1279 | return mean_log_lift(self.components_, self.training_data_, n_words) 1280 | elif topic_num >= 0 and topic_num < self.n_components: 1281 | return log_lift(self.components_, topic_num, self.training_data_, n_words) 1282 | else: 1283 | raise ValueError( 1284 | "Topic number must be in range 0 to {}".format(self.n_components) 1285 | ) 1286 | -------------------------------------------------------------------------------- /enstop/streamed_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | 7 | try: 8 | from sklearn.utils.validation import _check_sample_weight 9 | except ImportError: 10 | from enstop.utils import _check_sample_weight 11 | from scipy.sparse import issparse, csr_matrix, coo_matrix 12 | 13 | from enstop.utils import ( 14 | normalize, 15 | coherence, 16 | mean_coherence, 17 | log_lift, 18 | mean_log_lift, 19 | standardize_input, 20 | ) 21 | from enstop.plsa import log_likelihood, plsa_init 22 | 23 | 24 | @numba.njit( 25 | "f4[:,::1](i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],i8,i8,f4)", 26 | locals={ 27 | "k": numba.types.intp, 28 | "w": numba.types.uint32, 29 | "d": numba.types.uint32, 30 | "z": numba.types.uint16, 31 | "nz_idx": numba.types.uint32, 32 | "norm": numba.types.float32, 33 | }, 34 | fastmath=True, 35 | nogil=True, 36 | parallel=True, 37 | ) 38 | def plsa_e_step_on_a_block( 39 | X_rows, 40 | X_cols, 41 | X_vals, 42 | p_w_given_z, 43 | p_z_given_d, 44 | p_z_given_wd_block, 45 | block_start, 46 | block_end, 47 | probability_threshold=1e-32, 48 | ): 49 | """Perform the E-step of pLSA optimization. This amounts to computing the 50 | probability of each topic given each word document pair. The computation 51 | implements 52 | 53 | P(z|w,d) = \frac{P(z|w)P(d|z)}{\sum_{z=1}^k P(z|w)P(d|z)}. 
54 | 55 | This routine is optimized to work with sparse matrices such that P(z|w,d) 56 | is only computed for w, d such that X_{w,d} is non-zero, where X is the 57 | data matrix. 58 | 59 | To make this numba compilable the raw arrays defining the COO format sparse 60 | matrix must be passed separately. 61 | 62 | To keep memory use lower we only compute a block of P(z|w,d) -- specifically 63 | we compute it for all topics and a block of non-zeros of X. We can then use 64 | this block to complete a partial M step before computing the E step for 65 | the next block. 66 | 67 | 68 | Parameters 69 | ---------- 70 | X_rows: array of shape (nnz,) 71 | For each non-zero entry of X, the row of the entry. 72 | 73 | X_cols: array of shape (nnz,) 74 | For each non-zero entry of X, the column of the 75 | entry. 76 | 77 | X_vals: array of shape (nnz,) 78 | For each non-zero entry of X, the value of entry. 79 | 80 | p_w_given_z: array of shape (n_topics, n_words) 81 | The current estimates of values for P(w|z) 82 | 83 | p_z_given_d: array of shape (n_docs, n_topics) 84 | The current estimates of values for P(z|d) 85 | 86 | p_z_given_wd_block: array of shape (block_size, n_topics) 87 | The result array to write new estimates of P(z|w,d) to. 88 | 89 | block_start: int 90 | The index into nen-zeros of X where this block starts 91 | 92 | block_end: int 93 | The index into nen-zeros of X where this block ends 94 | 95 | probability_threshold: float (optional, default=1e-32) 96 | Option to promote sparsity. If the value of P(w|z)P(z|d) falls below 97 | threshold then write a zero for P(z|w,d). 98 | 99 | """ 100 | 101 | k = p_w_given_z.shape[0] 102 | 103 | for nz_idx in numba.prange(block_start, block_end): 104 | d = X_rows[nz_idx] 105 | w = X_cols[nz_idx] 106 | 107 | norm = 0.0 108 | for z in range(k): 109 | v = p_w_given_z[z, w] * p_z_given_d[d, z] 110 | if v > probability_threshold: 111 | p_z_given_wd_block[nz_idx - block_start, z] = v 112 | norm += v 113 | else: 114 | p_z_given_wd_block[nz_idx - block_start, z] = 0.0 115 | for z in range(k): 116 | if norm > 0: 117 | p_z_given_wd_block[nz_idx - block_start, z] /= norm 118 | 119 | return p_z_given_wd_block 120 | 121 | 122 | @numba.njit( 123 | "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1],i8,i8)", 124 | locals={ 125 | "k": numba.types.intp, 126 | "w": numba.types.uint32, 127 | "d": numba.types.uint32, 128 | "z": numba.types.uint16, 129 | "nz_idx": numba.types.uint32, 130 | "s": numba.types.float32, 131 | }, 132 | fastmath=True, 133 | nogil=True, 134 | ) 135 | def plsa_partial_m_step_on_a_block( 136 | X_rows, 137 | X_cols, 138 | X_vals, 139 | p_w_given_z, 140 | p_z_given_d, 141 | p_z_given_wd_block, 142 | norm_pwz, 143 | norm_pdz, 144 | block_start, 145 | block_end, 146 | ): 147 | """Perform a partial M-step of pLSA optimization. This amounts to using the 148 | estimates of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation 149 | implements 150 | 151 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 152 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 153 | 154 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 155 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 156 | 157 | To make this numba compilable the raw arrays defining the COO format sparse 158 | matrix must be passed separately. 
159 | 160 | Note that in order to not store the entire P(z|w,d) matrix in memory at once 161 | we only process a block of it here. The normalization in the above formulas 162 | will actually be computed after all blocks have been completed. 163 | 164 | Parameters 165 | ---------- 166 | X_rows: array of shape (nnz,) 167 | For each non-zero entry of X, the row of the entry. 168 | 169 | X_cols: array of shape (nnz,) 170 | For each non-zero entry of X, the column of the 171 | entry. 172 | 173 | X_vals: array of shape (nnz,) 174 | For each non-zero entry of X, the value of entry. 175 | 176 | p_w_given_z: array of shape (n_topics, n_words) 177 | The result array to write new estimates of P(w|z) to. 178 | 179 | p_z_given_d: array of shape (n_docs, n_topics) 180 | The result array to write new estimates of P(z|d) to. 181 | 182 | p_z_given_wd_block: array of shape (block_size, n_topics) 183 | The current estimates for P(z|w,d) for a block 184 | 185 | norm_pwz: array of shape (n_topics,) 186 | Auxilliary array used for storing row norms; this is passed in to save 187 | reallocations. 188 | 189 | norm_pdz: array of shape (n_docs,) 190 | Auxilliary array used for storing row norms; this is passed in to save 191 | reallocations. 192 | 193 | sample_weight: array of shape (n_docs,) 194 | Input document weights. 195 | 196 | block_start: int 197 | The index into nen-zeros of X where this block starts 198 | 199 | block_end: int 200 | The index into nen-zeros of X where this block ends 201 | 202 | """ 203 | 204 | k = p_z_given_wd_block.shape[1] 205 | 206 | for nz_idx in range(block_start, block_end): 207 | d = X_rows[nz_idx] 208 | w = X_cols[nz_idx] 209 | x = X_vals[nz_idx] 210 | 211 | for z in range(k): 212 | s = x * p_z_given_wd_block[nz_idx - block_start, z] 213 | 214 | p_w_given_z[z, w] += s 215 | p_z_given_d[d, z] += s 216 | 217 | norm_pwz[z] += s 218 | norm_pdz[d] += s 219 | 220 | 221 | @numba.njit( 222 | "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1],f4[::1],i8,i8)", 223 | locals={ 224 | "k": numba.types.intp, 225 | "w": numba.types.uint32, 226 | "d": numba.types.uint32, 227 | "z": numba.types.uint16, 228 | "nz_idx": numba.types.uint32, 229 | "s": numba.types.float32, 230 | }, 231 | fastmath=True, 232 | nogil=True, 233 | ) 234 | def plsa_partial_m_step_on_a_block_w_sample_weight( 235 | X_rows, 236 | X_cols, 237 | X_vals, 238 | p_w_given_z, 239 | p_z_given_d, 240 | p_z_given_wd_block, 241 | norm_pwz, 242 | norm_pdz, 243 | sample_weight, 244 | block_start, 245 | block_end, 246 | ): 247 | """Perform a partial M-step of pLSA optimization. This amounts to using the 248 | estimates of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation 249 | implements 250 | 251 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 252 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 253 | 254 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 255 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 256 | 257 | To make this numba compilable the raw arrays defining the COO format sparse 258 | matrix must be passed separately. 259 | 260 | Note that in order to not store the entire P(z|w,d) matrix in memory at once 261 | we only process a block of it here. The normalization in the above formulas 262 | will actually be computed after all blocks have been completed. 
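    Unlike the unweighted variant, each word-topic contribution here is
    additionally scaled by the ``sample_weight`` of the document it came from.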
263 | 264 | Parameters 265 | ---------- 266 | X_rows: array of shape (nnz,) 267 | For each non-zero entry of X, the row of the entry. 268 | 269 | X_cols: array of shape (nnz,) 270 | For each non-zero entry of X, the column of the 271 | entry. 272 | 273 | X_vals: array of shape (nnz,) 274 | For each non-zero entry of X, the value of entry. 275 | 276 | p_w_given_z: array of shape (n_topics, n_words) 277 | The result array to write new estimates of P(w|z) to. 278 | 279 | p_z_given_d: array of shape (n_docs, n_topics) 280 | The result array to write new estimates of P(z|d) to. 281 | 282 | p_z_given_wd_block: array of shape (block_size, n_topics) 283 | The current estimates for P(z|w,d) for a block 284 | 285 | norm_pwz: array of shape (n_topics,) 286 | Auxilliary array used for storing row norms; this is passed in to save 287 | reallocations. 288 | 289 | norm_pdz: array of shape (n_docs,) 290 | Auxilliary array used for storing row norms; this is passed in to save 291 | reallocations. 292 | 293 | sample_weight: array of shape (n_docs,) 294 | Input document weights. 295 | 296 | block_start: int 297 | The index into nen-zeros of X where this block starts 298 | 299 | block_end: int 300 | The index into nen-zeros of X where this block ends 301 | 302 | """ 303 | 304 | k = p_z_given_wd_block.shape[1] 305 | 306 | for nz_idx in range(block_start, block_end): 307 | d = X_rows[nz_idx] 308 | w = X_cols[nz_idx] 309 | x = X_vals[nz_idx] 310 | 311 | for z in range(k): 312 | s = x * p_z_given_wd_block[nz_idx - block_start, z] 313 | t = s * sample_weight[d] 314 | 315 | p_w_given_z[z, w] += t 316 | p_z_given_d[d, z] += s 317 | 318 | norm_pwz[z] += t 319 | norm_pdz[d] += s 320 | 321 | 322 | @numba.njit(parallel=True, fastmath=True, nogil=True) 323 | def plsa_em_step( 324 | X_rows, 325 | X_cols, 326 | X_vals, 327 | prev_p_w_given_z, 328 | prev_p_z_given_d, 329 | next_p_w_given_z, 330 | next_p_z_given_d, 331 | p_z_given_wd_block, 332 | norm_pwz, 333 | norm_pdz, 334 | e_step_thresh=1e-32, 335 | ): 336 | 337 | k = p_z_given_wd_block.shape[1] 338 | n = prev_p_z_given_d.shape[0] 339 | m = prev_p_w_given_z.shape[1] 340 | 341 | block_size = p_z_given_wd_block.shape[0] 342 | n_blocks = (X_vals.shape[0] // block_size) + 1 343 | 344 | # zero out the norms for recomputation 345 | norm_pdz[:] = 0.0 346 | norm_pwz[:] = 0.0 347 | 348 | # Loop over blocks doing E step on a block and a partial M step 349 | for block_index in range(n_blocks): 350 | block_start = block_index * block_size 351 | block_end = min(X_vals.shape[0], block_start + block_size) 352 | 353 | plsa_e_step_on_a_block( 354 | X_rows, 355 | X_cols, 356 | X_vals, 357 | prev_p_w_given_z, 358 | prev_p_z_given_d, 359 | p_z_given_wd_block, 360 | block_start, 361 | block_end, 362 | e_step_thresh, 363 | ) 364 | plsa_partial_m_step_on_a_block( 365 | X_rows, 366 | X_cols, 367 | X_vals, 368 | next_p_w_given_z, 369 | next_p_z_given_d, 370 | p_z_given_wd_block, 371 | norm_pwz, 372 | norm_pdz, 373 | block_start, 374 | block_end, 375 | ) 376 | 377 | # Once complete we can normalize to complete the M step 378 | for z in numba.prange(k): 379 | if norm_pwz[z] > 0: 380 | for w in range(m): 381 | next_p_w_given_z[z, w] /= norm_pwz[z] 382 | for d in range(n): 383 | if norm_pdz[d] > 0: 384 | next_p_z_given_d[d, z] /= norm_pdz[d] 385 | 386 | # Zero out the old matrices, we'll swap them on return and 387 | # these will become the new "next" 388 | prev_p_w_given_z[:] = 0.0 389 | prev_p_z_given_d[:] = 0.0 390 | 391 | return next_p_w_given_z, next_p_z_given_d, prev_p_w_given_z, 
prev_p_z_given_d 392 | 393 | 394 | @numba.njit(parallel=True, fastmath=True, nogil=True) 395 | def plsa_em_step_w_sample_weights( 396 | X_rows, 397 | X_cols, 398 | X_vals, 399 | prev_p_w_given_z, 400 | prev_p_z_given_d, 401 | next_p_w_given_z, 402 | next_p_z_given_d, 403 | p_z_given_wd_block, 404 | norm_pwz, 405 | norm_pdz, 406 | sample_weight, 407 | e_step_thresh=1e-32, 408 | ): 409 | 410 | k = p_z_given_wd_block.shape[1] 411 | n = prev_p_z_given_d.shape[0] 412 | m = prev_p_w_given_z.shape[1] 413 | 414 | block_size = p_z_given_wd_block.shape[0] 415 | n_blocks = (X_vals.shape[0] // block_size) + 1 416 | 417 | # zero out the norms for recomputation 418 | norm_pdz[:] = 0.0 419 | norm_pwz[:] = 0.0 420 | 421 | # Loop over blocks doing E step on a block and a partial M step 422 | for block_index in range(n_blocks): 423 | block_start = block_index * block_size 424 | block_end = min(X_vals.shape[0], block_start + block_size) 425 | 426 | plsa_e_step_on_a_block( 427 | X_rows, 428 | X_cols, 429 | X_vals, 430 | prev_p_w_given_z, 431 | prev_p_z_given_d, 432 | p_z_given_wd_block, 433 | block_start, 434 | block_end, 435 | e_step_thresh, 436 | ) 437 | plsa_partial_m_step_on_a_block_w_sample_weight( 438 | X_rows, 439 | X_cols, 440 | X_vals, 441 | next_p_w_given_z, 442 | next_p_z_given_d, 443 | p_z_given_wd_block, 444 | norm_pwz, 445 | norm_pdz, 446 | sample_weight, 447 | block_start, 448 | block_end, 449 | ) 450 | 451 | # Once complete we can normalize to complete the M step 452 | for z in numba.prange(k): 453 | if norm_pwz[z] > 0: 454 | for w in range(m): 455 | next_p_w_given_z[z, w] /= norm_pwz[z] 456 | for d in range(n): 457 | if norm_pdz[d] > 0: 458 | next_p_z_given_d[d, z] /= norm_pdz[d] 459 | 460 | # Zero out the old matrices, we'll swap them on return and 461 | # these will become the new "next" 462 | prev_p_w_given_z[:] = 0.0 463 | prev_p_z_given_d[:] = 0.0 464 | 465 | return next_p_w_given_z, next_p_z_given_d, prev_p_w_given_z, prev_p_z_given_d 466 | 467 | 468 | @numba.njit(fastmath=True, nogil=True) 469 | def plsa_fit_inner_blockwise( 470 | X_rows, 471 | X_cols, 472 | X_vals, 473 | p_w_given_z, 474 | p_z_given_d, 475 | sample_weight, 476 | block_size=65536, 477 | n_iter=100, 478 | n_iter_per_test=10, 479 | tolerance=0.001, 480 | e_step_thresh=1e-32, 481 | use_sample_weights=False, 482 | ): 483 | """Internal loop of EM steps required to optimize pLSA, along with relative 484 | convergence tests with respect to the log-likelihood of observing the data under 485 | the model. 486 | 487 | The EM looping will stop when either ``n_iter`` iterations have been reached, 488 | or if the relative improvement in log-likelihood over the last 489 | ``n_iter_per_test`` steps is under ``threshold``. 490 | 491 | This function is designed to wrap the internals of the EM process in a numba 492 | compilable loop, and is not the preferred entry point for fitting a plsa model. 493 | 494 | Parameters 495 | ---------- 496 | X_rows: array of shape (nnz,) 497 | For each non-zero entry of X, the row of the entry. 498 | 499 | X_cols: array of shape (nnz,) 500 | For each non-zero entry of X, the column of the 501 | entry. 502 | 503 | X_vals: array of shape (nnz,) 504 | For each non-zero entry of X, the value of entry. 505 | 506 | p_w_given_z: array of shape (n_topics, n_words) 507 | The current estimates of values for P(w|z) 508 | 509 | p_z_given_d: array of shape (n_docs, n_topics) 510 | The current estimates of values for P(z|d) 511 | 512 | sample_weight: array of shape (n_docs,) 513 | Input document weights. 
514 | 515 | block_size: int (optional, default=65536) 516 | The number of nonzero entries of X to process in a block. The larger this 517 | value the faster the compute may go, but at higher memory cost. 518 | 519 | n_iter: int 520 | The maximum number iterations of EM to perform 521 | 522 | n_iter_per_test: int 523 | The number of iterations between tests for 524 | relative improvement in log-likelihood. 525 | 526 | tolerance: float 527 | The threshold of relative improvement in 528 | log-likelihood required to continue iterations. 529 | 530 | e_step_thresh: float (optional, default=1e-32) 531 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 532 | below threshold then write a zero for P(z|w,d). 533 | 534 | Returns 535 | ------- 536 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 537 | The resulting model values of P(z|d) and P(w|z) 538 | 539 | """ 540 | k = p_z_given_d.shape[1] 541 | n = p_z_given_d.shape[0] 542 | 543 | p_z_given_wd_block = np.zeros((block_size, k), dtype=np.float32) 544 | 545 | norm_pwz = np.zeros(k, dtype=np.float32) 546 | norm_pdz = np.zeros(n, dtype=np.float32) 547 | 548 | previous_log_likelihood = log_likelihood( 549 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight, 550 | ) 551 | 552 | next_p_w_given_z = np.zeros_like(p_w_given_z) 553 | next_p_z_given_d = np.zeros_like(p_z_given_d) 554 | 555 | for i in range(n_iter): 556 | 557 | if use_sample_weights: 558 | ( 559 | p_w_given_z, 560 | p_z_given_d, 561 | next_p_w_given_z, 562 | next_p_z_given_d, 563 | ) = plsa_em_step_w_sample_weights( 564 | X_rows, 565 | X_cols, 566 | X_vals, 567 | p_w_given_z, 568 | p_z_given_d, 569 | next_p_w_given_z, 570 | next_p_z_given_d, 571 | p_z_given_wd_block, 572 | norm_pwz, 573 | norm_pdz, 574 | sample_weight, 575 | e_step_thresh, 576 | ) 577 | else: 578 | p_w_given_z, p_z_given_d, next_p_w_given_z, next_p_z_given_d = plsa_em_step( 579 | X_rows, 580 | X_cols, 581 | X_vals, 582 | p_w_given_z, 583 | p_z_given_d, 584 | next_p_w_given_z, 585 | next_p_z_given_d, 586 | p_z_given_wd_block, 587 | norm_pwz, 588 | norm_pdz, 589 | e_step_thresh, 590 | ) 591 | 592 | if i % n_iter_per_test == 0: 593 | current_log_likelihood = log_likelihood( 594 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight, 595 | ) 596 | change = np.abs(current_log_likelihood - previous_log_likelihood) 597 | if change / np.abs(current_log_likelihood) < tolerance: 598 | break 599 | else: 600 | previous_log_likelihood = current_log_likelihood 601 | 602 | return p_z_given_d, p_w_given_z 603 | 604 | 605 | def plsa_fit( 606 | X, 607 | k, 608 | sample_weight, 609 | init="random", 610 | block_size=65536, 611 | n_iter=100, 612 | n_iter_per_test=10, 613 | tolerance=0.001, 614 | e_step_thresh=1e-32, 615 | random_state=None, 616 | ): 617 | """Fit a pLSA model to a data matrix ``X`` with ``k`` topics, an initialized 618 | according to ``init``. This will run an EM method to optimize estimates of P(z|d) 619 | and P(w|z). The will perform at most ``n_iter`` EM step iterations, 620 | while checking for relative improvement of the log-likelihood of the data under 621 | the model every ``n_iter_per_test`` iterations, and stops early if that is under 622 | ``tolerance``. 623 | 624 | Parameters 625 | ---------- 626 | X: sparse matrix of shape (n_docs, n_words) 627 | The data matrix pLSA is attempting to fit to. 628 | 629 | k: int 630 | The number of topics for pLSA to fit with. 

    sample_weight: array of shape (n_docs,)
        Input document weights.

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words).

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for
        relative improvement in log-likelihood.

    tolerance: float
        The threshold of relative improvement in
        log-likelihood required to continue iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        The resulting model values of P(z|d) and P(w|z)

    """

    rng = check_random_state(random_state)
    p_z_given_d, p_w_given_z = plsa_init(X, k, init=init, rng=rng)
    p_z_given_d = p_z_given_d.astype(np.float32, order="C")
    p_w_given_z = p_w_given_z.astype(np.float32, order="C")

    use_sample_weights = np.any(sample_weight != 1.0)

    A = X.tocoo().astype(np.float32)

    p_z_given_d, p_w_given_z = plsa_fit_inner_blockwise(
        A.row,
        A.col,
        A.data,
        p_w_given_z,
        p_z_given_d,
        sample_weight,
        block_size=block_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
        use_sample_weights=use_sample_weights,
    )

    return p_z_given_d, p_w_given_z


@numba.njit(
    "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[::1],i8,i8)",
    locals={
        "k": numba.types.intp,
        "w": numba.types.uint32,
        "d": numba.types.uint32,
        "z": numba.types.uint16,
        "nz_idx": numba.types.uint32,
        "s": numba.types.float32,
    },
    fastmath=True,
    nogil=True,
)
def plsa_partial_refit_m_step_on_a_block(
    X_rows,
    X_cols,
    X_vals,
    p_z_given_d,
    p_z_given_wd_block,
    norm_pdz,
    block_start,
    block_end,
):
    """Perform a partial M-step of pLSA refitting. This amounts to using the
    estimates of P(z|w,d) to update the values of P(z|d), with the topics P(w|z)
    held fixed. The computation implements

    P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w\in V}\sum_{z} X_{w,d}P(z|w,d)}

    This routine is optimized to work with sparse matrices such that P(z|w,d) is only
    computed for w, d such that X_{w,d} is non-zero, where X is the data matrix.

    To make this numba compilable the raw arrays defining the COO format sparse
    matrix must be passed separately.

    Note that in order to not store the entire P(z|w,d) matrix in memory at once
    we only process a block of it here. The normalization in the formula above
    will actually be computed after all blocks have been completed.

    Parameters
    ----------
    X_rows: array of shape (nnz,)
        For each non-zero entry of X, the row of the entry.

    X_cols: array of shape (nnz,)
        For each non-zero entry of X, the column of the entry.

    X_vals: array of shape (nnz,)
        For each non-zero entry of X, the value of the entry.

    p_z_given_d: array of shape (n_docs, n_topics)
        The result array to write new estimates of P(z|d) to.

    p_z_given_wd_block: array of shape (block_size, n_topics)
        The current estimates for P(z|w,d) for a block

    norm_pdz: array of shape (n_docs,)
        Auxiliary array used for storing row norms; this is passed in to save
        reallocations.

    block_start: int
        The index into the non-zeros of X where this block starts

    block_end: int
        The index into the non-zeros of X where this block ends

    """

    k = p_z_given_wd_block.shape[1]

    for nz_idx in range(block_start, block_end):
        d = X_rows[nz_idx]
        w = X_cols[nz_idx]
        x = X_vals[nz_idx]

        for z in range(k):
            s = x * p_z_given_wd_block[nz_idx - block_start, z]
            p_z_given_d[d, z] += s
            norm_pdz[d] += s


@numba.njit()
def plsa_refit_em_step(
    X_rows,
    X_cols,
    X_vals,
    p_w_given_z,
    prev_p_z_given_d,
    next_p_z_given_d,
    p_z_given_wd_block,
    sample_weight,
    norm_pdz,
    e_step_thresh=1e-32,
):

    k = p_z_given_wd_block.shape[1]
    n = prev_p_z_given_d.shape[0]

    block_size = p_z_given_wd_block.shape[0]
    n_blocks = (X_vals.shape[0] // block_size) + 1

    # zero out the norms for recomputation
    norm_pdz[:] = 0.0

    # Loop over blocks doing E step on a block and a partial M step
    for block_index in range(n_blocks):
        block_start = block_index * block_size
        block_end = min(X_vals.shape[0], block_start + block_size)

        plsa_e_step_on_a_block(
            X_rows,
            X_cols,
            X_vals,
            p_w_given_z,
            prev_p_z_given_d,
            p_z_given_wd_block,
            block_start,
            block_end,
            e_step_thresh,
        )
        plsa_partial_refit_m_step_on_a_block(
            X_rows,
            X_cols,
            X_vals,
            next_p_z_given_d,
            p_z_given_wd_block,
            norm_pdz,
            block_start,
            block_end,
        )

    # Once complete we can normalize to complete the M step
    for z in numba.prange(k):
        for d in range(n):
            if norm_pdz[d] > 0:
                next_p_z_given_d[d, z] /= norm_pdz[d]

    # Zero out the old matrix; we'll swap them on return and
    # this will become the new "next"
    prev_p_z_given_d[:] = 0.0

    return next_p_z_given_d, prev_p_z_given_d


@numba.njit(locals={"e_step_thresh": numba.types.float32,}, fastmath=True, nogil=True)
def plsa_refit_inner_blockwise(
    X_rows,
    X_cols,
    X_vals,
    topics,
    p_z_given_d,
    sample_weight,
    block_size=65536,
    n_iter=50,
    n_iter_per_test=10,
    tolerance=0.005,
    e_step_thresh=1e-32,
):
    """Optimized routine for refitting values of P(z|d) given a fixed set of topics (
    i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics
    (given, for example, by an ensemble result).

    This routine is optimized to work with sparse matrices and only compute values
    for w, d such that X_{w,d} is non-zero.

    To make this numba compilable the raw arrays defining the COO format sparse
    matrix must be passed separately.

    Parameters
    ----------
    X_rows: array of shape (nnz,)
        For each non-zero entry of X, the row of the entry.

    X_cols: array of shape (nnz,)
        For each non-zero entry of X, the column of the entry.

    X_vals: array of shape (nnz,)
        For each non-zero entry of X, the value of the entry.

    topics: array of shape (n_topics, n_words)
        The fixed topics against which to fit the values of P(z|d).

    p_z_given_d: array of shape (n_docs, n_topics)
        The current estimates of values for P(z|d)

    sample_weight: array of shape (n_docs,)
        Input document weights.

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to continue
        iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    Returns
    -------
    p_z_given_d: array of shape (n_docs, n_topics)
        The resulting model values of P(z|d) fit against the fixed topics.

    """
    k = topics.shape[0]
    p_z_given_wd_block = np.zeros((block_size, k), dtype=np.float32)

    norm_pdz = np.zeros(p_z_given_d.shape[0], dtype=np.float32)

    previous_log_likelihood = log_likelihood(
        X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight,
    )

    next_p_z_given_d = np.zeros_like(p_z_given_d)

    for i in range(n_iter):

        # Pass e_step_thresh through so the sparsity threshold is honoured
        # during refitting.
        p_z_given_d, next_p_z_given_d = plsa_refit_em_step(
            X_rows,
            X_cols,
            X_vals,
            topics,
            p_z_given_d,
            next_p_z_given_d,
            p_z_given_wd_block,
            sample_weight,
            norm_pdz,
            e_step_thresh,
        )

        if i % n_iter_per_test == 0:
            current_log_likelihood = log_likelihood(
                X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight,
            )
            if current_log_likelihood > 0:
                change = np.abs(current_log_likelihood - previous_log_likelihood)
                if change / np.abs(current_log_likelihood) < tolerance:
                    break
                else:
                    previous_log_likelihood = current_log_likelihood

    return p_z_given_d


def plsa_refit(
    X,
    topics,
    sample_weight,
    block_size=65536,
    n_iter=50,
    n_iter_per_test=10,
    tolerance=0.005,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Routine for refitting values of P(z|d) given a fixed set of topics (
    i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics
    (given, for example, by an ensemble result).

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.

    topics: array of shape (n_topics, n_words)
        The fixed topics against which to fit the values of P(z|d).

    sample_weight: array of shape (n_docs,)
        Input document weights.

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to continue
        iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    p_z_given_d: array of shape (n_docs, n_topics)
        The resulting document vectors P(z|d) fit against the fixed topics.

    """
    A = X.tocoo().astype(np.float32)
    k = topics.shape[0]

    rng = check_random_state(random_state)
    p_z_given_d = rng.rand(A.shape[0], k)
    normalize(p_z_given_d, axis=1)
    p_z_given_d = p_z_given_d.astype(np.float32)
    topics = topics.astype(np.float32)

    p_z_given_d = plsa_refit_inner_blockwise(
        A.row,
        A.col,
        A.data,
        topics,
        p_z_given_d,
        sample_weight,
        block_size=block_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    return p_z_given_d


class StreamedPLSA(BaseEstimator, TransformerMixin):
    """Probabilistic Latent Semantic Analysis (pLSA)

    Given a bag-of-words matrix representation of a corpus of documents, where each row of the
    matrix represents a document, and the jth element of the ith row is the count of the number of
    times the jth vocabulary word occurs in the ith document, estimate matrices of conditional
    probabilities P(z|d) and P(w|z) such that the product matrix of probabilities P(w|d)
    maximises the likelihood of seeing the observed corpus data. Here P(z|d) represents the
    probability of topic z given document d, P(w|z) represents the probability of word w given
    topic z, and P(w|d) represents the probability of word w given document d.

    The algorithm proceeds using an Expectation-Maximization (EM) approach to attempt to maximise
    the likelihood of the observed data under the estimated model.

    The StreamedPLSA uses a block-based approach to compute partial E-step/M-step
    pairs to lower overall memory usage. This is particularly useful for very large
    training data and/or large numbers of topics.

    Parameters
    ----------
    n_components: int (optional, default=10)
        The number of topics to use in the matrix factorization.

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words).

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to continue
        iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    transform_random_seed: int (optional, default=42)
        The random seed used to initialize P(z|d) when refitting new documents in
        ``transform``.

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Attributes
    ----------

    components_: array of shape (n_topics, n_words)
        The topic vectors produced by pLSA. Each row is a topic, which is a probability
        distribution over the vocabulary, giving the probability of each word given the
        topic (P(w|z)).

    embedding_: array of shape (n_docs, n_topics)
        The document vectors produced by pLSA. Each row corresponds to a document, giving a
        probability distribution over the topic space, specifying the probability of each
        topic occurring in the document (P(z|d)).

    training_data_: sparse matrix of shape (n_docs, n_words)
        The original training data saved in sparse matrix format.

    References
    ----------

    Hofmann, Thomas. "Probabilistic latent semantic analysis." Proceedings of the Fifteenth
    conference on Uncertainty in artificial intelligence. Morgan Kaufmann Publishers Inc., 1999.

    Hofmann, Thomas. "Unsupervised learning by probabilistic latent semantic analysis."
    Machine Learning 42.1-2 (2001): 177-196.

    """

    def __init__(
        self,
        n_components=10,
        init="random",
        block_size=65536,
        n_iter=100,
        n_iter_per_test=10,
        tolerance=0.001,
        e_step_thresh=1e-32,
        transform_random_seed=42,
        random_state=None,
    ):

        self.n_components = n_components
        self.init = init
        self.block_size = block_size
        self.n_iter = n_iter
        self.n_iter_per_test = n_iter_per_test
        self.tolerance = tolerance
        self.e_step_thresh = e_step_thresh
        self.transform_random_seed = transform_random_seed
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Learn the pLSA model for the data X.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            The data matrix pLSA is attempting to fit to.

        y: Ignored

        sample_weight: array of shape (n_docs,)
            Input document weights.

        Returns
        -------
        self
        """
        self.fit_transform(X, sample_weight=sample_weight)
        return self

    def fit_transform(self, X, y=None, sample_weight=None):
        """Learn the pLSA model for the data X and return the document vectors.

        This is more efficient than calling fit followed by transform.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            The data matrix pLSA is attempting to fit to.

        y: Ignored

        sample_weight: array of shape (n_docs,)
            Input document weights.

        Returns
        -------
        embedding: array of shape (n_docs, n_topics)
            An embedding of the documents into a topic space.
        """

        X = check_array(X, accept_sparse="csr")
        X = standardize_input(X)

        if not issparse(X):
            X = csr_matrix(X)

        sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)

        if np.any(X.data < 0):
            raise ValueError(
                "PLSA is only valid for matrices with non-negative entries"
            )

        row_sums = np.array(X.sum(axis=1).T)[0]
        good_rows = row_sums != 0

        if not np.all(good_rows):
            zero_rows_found = True
            data_for_fitting = X[good_rows]
            # Only weight the documents that are actually being fit
            sample_weight_for_fitting = sample_weight[good_rows]
        else:
            zero_rows_found = False
            data_for_fitting = X
            sample_weight_for_fitting = sample_weight

        U, V = plsa_fit(
            data_for_fitting,
            self.n_components,
            sample_weight_for_fitting,
            init=self.init,
            block_size=self.block_size,
            n_iter=self.n_iter,
            n_iter_per_test=self.n_iter_per_test,
            tolerance=self.tolerance,
            e_step_thresh=self.e_step_thresh,
            random_state=self.random_state,
        )

        if zero_rows_found:
            self.embedding_ = np.zeros((X.shape[0], self.n_components))
            self.embedding_[good_rows] = U
        else:
            self.embedding_ = U

        self.components_ = V
        self.training_data_ = X

        return self.embedding_

    def transform(self, X, y=None, sample_weight=None):
        """Transform the data X into the topic space of the fitted pLSA model.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            Corpus to be embedded into topic space

        y: Ignored

        sample_weight: array of shape (n_docs,)
            Input document weights.

        Returns
        -------
        embedding: array of shape (n_docs, n_topics)
            An embedding of the documents X into the topic space.
        """
        X = check_array(X, accept_sparse="csr")
        sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
        random_state = check_random_state(self.transform_random_seed)

        if not issparse(X):
            X = coo_matrix(X)
        else:
            X = X.tocoo()

        result = plsa_refit(
            X,
            self.components_,
            sample_weight,
            block_size=self.block_size,
            n_iter=50,
            n_iter_per_test=5,
            tolerance=0.001,
            random_state=random_state,
        )

        return result

    def coherence(self, topic_num=None, n_words=20):
        """Compute the average coherence of fitted topics, or of a single individual topic.

        Parameters
        ----------
        topic_num: int (optional, default=None)
            The topic number to compute coherence for. If ``topic_num`` is None then the average
            coherence over all topics will be computed.

        n_words: int (optional, default=20)
            The number of topic words to score against. The top ``n_words`` words from the selected
            topic will be used.

        Returns
        -------
        topic_coherence: float
            The requested coherence score.
        """

        # Test for errors
        if not isinstance(topic_num, int) and topic_num is not None:
            raise ValueError("Topic number must be an integer or None.")

        if topic_num is None:
            return mean_coherence(self.components_, self.training_data_, n_words)
        elif topic_num >= 0 and topic_num < self.n_components:
            return coherence(self.components_, topic_num, self.training_data_, n_words)
        else:
            raise ValueError(
                "Topic number must be in range 0 to {}".format(self.n_components - 1)
            )

    def log_lift(self, topic_num=None, n_words=20):
        """Compute the average log lift of fitted topics, or of a single individual topic.

        Parameters
        ----------
        topic_num: int (optional, default=None)
            The topic number to compute log lift for. If ``topic_num`` is None then the average
            log lift over all topics will be computed.

        n_words: int (optional, default=20)
            The number of topic words to score against. The top ``n_words`` words from the selected
            topic will be used.

        Returns
        -------
        log_lift: float
            The requested log lift score.
        """

        # Test for errors
        if not isinstance(topic_num, int) and topic_num is not None:
            raise ValueError("Topic number must be an integer or None.")

        if topic_num is None:
            return mean_log_lift(self.components_, self.training_data_, n_words)
        elif topic_num >= 0 and topic_num < self.n_components:
            return log_lift(self.components_, topic_num, self.training_data_, n_words)
        else:
            raise ValueError(
                "Topic number must be in range 0 to {}".format(self.n_components - 1)
            )
--------------------------------------------------------------------------------
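A minimal usage sketch of the ``StreamedPLSA`` estimator defined above. The toy corpus and the parameter choices here are illustrative only (they are not part of the library), and the snippet assumes ``enstop`` and scikit-learn are installed::

    from sklearn.feature_extraction.text import CountVectorizer

    from enstop import StreamedPLSA

    # Toy corpus, made up purely for illustration.
    documents = [
        "sparse matrix factorization for topic models",
        "probabilistic latent semantic analysis of text",
        "expectation maximization fits the topic model",
        "documents are rows and vocabulary words are columns",
    ]

    # Bag-of-words count matrix: one row per document, one column per word.
    X = CountVectorizer().fit_transform(documents)

    model = StreamedPLSA(n_components=2, block_size=4096, random_state=0)

    # P(z|d): document-topic probabilities, shape (n_docs, n_topics).
    doc_vectors = model.fit_transform(X)

    # P(w|z): topic-word probabilities, shape (n_topics, n_words).
    topic_vectors = model.components_

    # Refit held-out documents against the already-learned, fixed topics.
    held_out_vectors = model.transform(X[:2])

    print(doc_vectors.shape, topic_vectors.shape, held_out_vectors.shape)

After fitting, ``model.coherence()`` and ``model.log_lift()`` score the stored topics against ``training_data_`` exactly as in the methods above; ``block_size`` trades memory for speed, since only one (block_size, n_topics) slice of P(z|w,d) is held in memory at a time.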