├── requirements.txt ├── enstop ├── __init__.py ├── utils.py ├── distributed_plsa.py ├── cuda_plsa.py ├── block_parallel_plsa.py ├── enstop_.py ├── plsa.py └── streamed_plsa.py ├── LICENSE ├── .gitignore ├── setup.py ├── CODE_OF_CONDUCT.md ├── README.rst └── notebooks └── EnsTop with 20-Newsgroups.ipynb /requirements.txt: -------------------------------------------------------------------------------- 1 | scikit-learn>=0.23 2 | scipy>=1.0 3 | numba>=0.48 4 | dask[delayed]>=1.2 5 | hdbscan>=0.8.10 6 | umap-learn>=0.3.8 7 | -------------------------------------------------------------------------------- /enstop/__init__.py: -------------------------------------------------------------------------------- 1 | from enstop.plsa import PLSA 2 | from enstop.streamed_plsa import StreamedPLSA 3 | from enstop.block_parallel_plsa import BlockParallelPLSA 4 | from enstop.distributed_plsa import DistributedPLSA 5 | from enstop.cuda_plsa import GPUPLSA 6 | from enstop.enstop_ import EnsembleTopics 7 | from enstop.utils import log_lift, mean_log_lift, coherence, mean_coherence 8 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD 2-Clause License 2 | 3 | Copyright (c) 2019, Leland McInnes 4 | All rights reserved. 5 | 6 | Redistribution and use in source and binary forms, with or without 7 | modification, are permitted provided that the following conditions are met: 8 | 9 | 1. Redistributions of source code must retain the above copyright notice, this 10 | list of conditions and the following disclaimer. 11 | 12 | 2. Redistributions in binary form must reproduce the above copyright notice, 13 | this list of conditions and the following disclaimer in the documentation 14 | and/or other materials provided with the distribution. 15 | 16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17 | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18 | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE 20 | FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21 | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22 | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23 | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24 | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE 25 | OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | 4 | def readme(): 5 | with open("README.rst") as readme_file: 6 | return readme_file.read() 7 | 8 | 9 | configuration = { 10 | "name": "enstop", 11 | "version": "0.2.6", 12 | "description": "Ensemble topic modelling with pLSA", 13 | "long_description": readme(), 14 | "classifiers": [ 15 | "Development Status :: 3 - Alpha", 16 | "Intended Audience :: Science/Research", 17 | "Intended Audience :: Developers", 18 | "License :: OSI Approved", 19 | "Programming Language :: C", 20 | "Programming Language :: Python", 21 | "Topic :: Software Development", 22 | "Topic :: Scientific/Engineering", 23 | "Operating System :: Microsoft :: Windows", 24 | "Operating System :: POSIX", 25 | "Operating System :: Unix", 26 | "Operating System :: MacOS", 27 | "Programming Language :: Python :: 3.6", 28 | "Programming Language :: Python :: 3.7", 29 | "Programming Language :: Python :: 3.8", 30 | ], 31 | "keywords": "topic model, LDA, pLSA, NMF", 32 | "url": "http://github.com/lmcinnes/enstop", 33 | "author": "Leland McInnes", 34 | "author_email": "leland.mcinnes@gmail.com", 35 | "maintainer": "Leland McInnes", 36 | "maintainer_email": "leland.mcinnes@gmail.com", 37 | "license": "BSD", 38 | "packages": ["enstop"], 39 | "install_requires": [ 40 | "scikit-learn >= 0.23", 41 | "scipy >= 1.0", 42 | "numba >= 0.48", 43 | "dask[delayed] >= 1.2", 44 | "hdbscan >= 0.8", 45 | "umap-learn >= 0.3.8", 46 | ], 47 | "ext_modules": [], 48 | "cmdclass": {}, 49 | "test_suite": "nose.collector", 50 | "tests_require": ["nose"], 51 | "data_files": (), 52 | "zip_safe": True, 53 | } 54 | 55 | setup(**configuration) 56 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Contributor Covenant Code of Conduct 2 | 3 | ## Our Pledge 4 | 5 | In the interest of fostering an open and welcoming environment, we as 6 | contributors and maintainers pledge to making participation in our project and 7 | our community a harassment-free experience for everyone, regardless of age, body 8 | size, disability, ethnicity, sex characteristics, gender identity and expression, 9 | level of 
experience, education, socio-economic status, nationality, personal 10 | appearance, race, religion, or sexual identity and orientation. 11 | 12 | ## Our Standards 13 | 14 | Examples of behavior that contributes to creating a positive environment 15 | include: 16 | 17 | * Using welcoming and inclusive language 18 | * Being respectful of differing viewpoints and experiences 19 | * Gracefully accepting constructive criticism 20 | * Focusing on what is best for the community 21 | * Showing empathy towards other community members 22 | 23 | Examples of unacceptable behavior by participants include: 24 | 25 | * The use of sexualized language or imagery and unwelcome sexual attention or 26 | advances 27 | * Trolling, insulting/derogatory comments, and personal or political attacks 28 | * Public or private harassment 29 | * Publishing others' private information, such as a physical or electronic 30 | address, without explicit permission 31 | * Other conduct which could reasonably be considered inappropriate in a 32 | professional setting 33 | 34 | ## Our Responsibilities 35 | 36 | Project maintainers are responsible for clarifying the standards of acceptable 37 | behavior and are expected to take appropriate and fair corrective action in 38 | response to any instances of unacceptable behavior. 39 | 40 | Project maintainers have the right and responsibility to remove, edit, or 41 | reject comments, commits, code, wiki edits, issues, and other contributions 42 | that are not aligned to this Code of Conduct, or to ban temporarily or 43 | permanently any contributor for other behaviors that they deem inappropriate, 44 | threatening, offensive, or harmful. 45 | 46 | ## Scope 47 | 48 | This Code of Conduct applies both within project spaces and in public spaces 49 | when an individual is representing the project or its community. Examples of 50 | representing a project or community include using an official project e-mail 51 | address, posting via an official social media account, or acting as an appointed 52 | representative at an online or offline event. Representation of a project may be 53 | further defined and clarified by project maintainers. 54 | 55 | ## Enforcement 56 | 57 | Instances of abusive, harassing, or otherwise unacceptable behavior may be 58 | reported by contacting the project team at leland.mcinnes@gmail.com. All 59 | complaints will be reviewed and investigated and will result in a response that 60 | is deemed necessary and appropriate to the circumstances. The project team is 61 | obligated to maintain confidentiality with regard to the reporter of an incident. 62 | Further details of specific enforcement policies may be posted separately. 63 | 64 | Project maintainers who do not follow or enforce the Code of Conduct in good 65 | faith may face temporary or permanent repercussions as determined by other 66 | members of the project's leadership. 
67 | 68 | ## Attribution 69 | 70 | This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, 71 | available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html 72 | 73 | [homepage]: https://www.contributor-covenant.org 74 | 75 | For answers to common questions about this code of conduct, see 76 | https://www.contributor-covenant.org/faq 77 | -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | ====== 2 | EnsTop 3 | ====== 4 | 5 | EnsTop provides an ensemble based approach to topic modelling using pLSA. It makes 6 | use of a high performance numba based pLSA implementation to run multiple 7 | bootstrapped topic models in parallel, and then clusters the resulting outputs to 8 | determine a set of stable topics. It can then refit the document vectors against 9 | these topics to embed documents into the stable topic space. 10 | 11 | --------------- 12 | Why use EnsTop? 13 | --------------- 14 | 15 | There are a number of advantages to using an ensemble approach to topic modelling. 16 | The most obvious is that it produces better, more stable topics. A close second, 17 | however, is that, by making use of HDBSCAN for clustering topics, it can learn a 18 | "natural" number of topics. That is, while the user needs to specify an estimated 19 | number of topics, the *actual* number of topics produced will be determined by how 20 | many stable topics are produced over many bootstrapped runs. In practice this can 21 | either be more, or less, than the estimated number of topics. 22 | 23 | Despite all of these extra features the ensemble topic approach is still very 24 | efficient, especially in multi-core environments (due to the embarrassingly parallel 25 | nature of the ensemble). A run with a reasonable size ensemble can be completed in 26 | around the same time it might take to fit an LDA model, and usually produces superior 27 | quality results. 28 | 29 | In addition to this EnsTop comes with a pLSA implementation that can be used 30 | standalone (and not as part of an ensemble). So if all you are looking for is a good, 31 | fast pLSA implementation (that can run considerably faster than many LDA 32 | implementations) then EnsTop is the library for you. 33 | 34 | ----------------- 35 | How to use EnsTop 36 | ----------------- 37 | 38 | EnsTop follows the sklearn API (and inherits from sklearn base classes), so if you 39 | use sklearn for LDA or NMF then you already know how to use EnsTop. General usage is 40 | very straightforward. The following example uses EnsTop to model topics from the 41 | classic 20-Newsgroups dataset, using sklearn's CountVectorizer to generate the 42 | required count matrix. 43 | 44 | .. code:: python 45 | 46 | from sklearn.datasets import fetch_20newsgroups 47 | from sklearn.feature_extraction.text import CountVectorizer 48 | from enstop import EnsembleTopics 49 | 50 | news = fetch_20newsgroups(subset='all') 51 | data = CountVectorizer().fit_transform(news.data) 52 | 53 | model = EnsembleTopics(n_components=20).fit(data) 54 | topics = model.components_ 55 | doc_vectors = model.embedding_ 56 | 57 | 58 | --------------- 59 | How to use pLSA 60 | --------------- 61 | 62 | EnsTop also provides a simple to use but fast and effective pLSA implementation out 63 | of the box. As with the ensemble topic modeller it follows the sklearn API, and usage 64 | is very similar. 65 | 66 | ..
code:: python 67 | 68 | from sklearn.datasets import fetch_20newsgroups 69 | from sklearn.feature_extraction.text import CountVectorizer 70 | from enstop import PLSA 71 | 72 | news = fetch_20newsgroups(subset='all') 73 | data = CountVectorizer().fit_transform(news.data) 74 | 75 | model = PLSA(n_components=20).fit(data) 76 | topics = model.components_ 77 | doc_vectors = model.embedding_ 78 | 79 | 80 | ------------ 81 | Installation 82 | ------------ 83 | 84 | The easiest way to install EnsTop is via pip 85 | 86 | .. code:: bash 87 | 88 | pip install enstop 89 | 90 | To manually install this package: 91 | 92 | .. code:: bash 93 | 94 | wget https://github.com/lmcinnes/enstop/archive/master.zip 95 | unzip master.zip 96 | rm master.zip 97 | cd enstop-master 98 | python setup.py install 99 | 100 | ---------------- 101 | Help and Support 102 | ---------------- 103 | 104 | Some basic example notebooks are available `here <./notebooks>`_. 105 | 106 | Documentation is coming. This project is still very young. If you need help, or have 107 | problems please `open an issue `_ 108 | and I will try to provide any help and guidance that I can. Please also check 109 | the docstrings on the code, which provide some descriptions of the parameters. 110 | 111 | 112 | ------- 113 | License 114 | ------- 115 | 116 | The EnsTop package is 2-clause BSD licensed. 117 | 118 | ------------ 119 | Contributing 120 | ------------ 121 | 122 | Contributions are more than welcome! There are lots of opportunities 123 | for potential projects, so please get in touch if you would like to 124 | help out. Everything from code to notebooks to 125 | examples and documentation are all *equally valuable* so please don't feel 126 | you can't contribute. To contribute please `fork the project `_ make your changes and 127 | submit a pull request. We will do our best to work through any issues with 128 | you and get your code merged into the main branch. 129 | -------------------------------------------------------------------------------- /enstop/utils.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | from scipy.sparse import issparse, csc_matrix 4 | from sklearn.utils.validation import check_array 5 | from sklearn.preprocessing import normalize as sklearn_normalize 6 | import numbers 7 | 8 | @numba.njit(fastmath=True, nogil=True) 9 | def normalize(ndarray, axis=0): 10 | """Normalize an array with respect to the l1-norm along an axis. Note that this procedure 11 | modifies the array **in place**. 12 | 13 | Parameters 14 | ---------- 15 | ndarray: array of shape (n,m) 16 | The array to be normalized. Must be a 2D array. 17 | 18 | axis: int (optional, default=0) 19 | The axis to normalize with respect to. 0 means normalize columns, 1 means normalize rows. 
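    Examples
    --------
    A small sketch of the in-place behaviour (illustrative only; the input must be
    a 2D float array, and nothing is returned):

    >>> import numpy as np
    >>> arr = np.array([[1.0, 3.0], [2.0, 2.0]])
    >>> normalize(arr, axis=1)
    >>> arr
    array([[0.25, 0.75],
           [0.5 , 0.5 ]])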
20 | """ 21 | # Compute marginal sum along axis 22 | marginal = np.zeros(ndarray.shape[1 - axis]) 23 | for i in range(marginal.shape[0]): 24 | for j in range(ndarray.shape[axis]): 25 | if axis == 0: 26 | marginal[i] += ndarray[j, i] 27 | elif axis == 1: 28 | marginal[i] += ndarray[i, j] 29 | else: 30 | raise ValueError("axis must be 0 or 1") 31 | 32 | # Divide out by the marginal 33 | for i in range(marginal.shape[0]): 34 | for j in range(ndarray.shape[axis]): 35 | if marginal[i] > 0.0: 36 | if axis == 0: 37 | ndarray[j, i] /= marginal[i] 38 | elif axis == 1: 39 | ndarray[i, j] /= marginal[i] 40 | else: 41 | raise ValueError("axis must be 0 or 1") 42 | 43 | 44 | @numba.njit() 45 | def _log_lift(topics, z, empirical_probs, n=-1): 46 | """Internal method to compute the log lift given precomputed empirical probabilities. This 47 | routine is designed to be numba compilable for performance. 48 | 49 | Parameters 50 | ---------- 51 | topics: array of shape (n_topics, n_words) 52 | The topic vectors to evaluate. 53 | 54 | z: int 55 | Which topic vector to evaluate. Must be 56 | in range(0, n_topics). 57 | 58 | empirical_probs: array of shape (n_words,) 59 | The empirical probability of word occurrence. 60 | 61 | n: int (optional, default=-1) 62 | The number of words to average over. If less than 0 it will evaluate over the entire 63 | vocabulary, otherwise it will select the top ``n`` words of the chosen topic. 64 | 65 | Returns 66 | ------- 67 | log_lift: float 68 | The log lift of the ``z``th topic vector. 69 | """ 70 | total_lift = 0.0 71 | if n <= 0: 72 | for w in range(topics.shape[1]): 73 | if empirical_probs[w] > 0: 74 | total_lift += topics[z, w] * 1.0 / empirical_probs[w] 75 | return np.log(total_lift * 1.0 / topics.shape[1]) 76 | else: 77 | top_words = np.argsort(topics[z])[-n:] 78 | for i in range(n): 79 | w = top_words[i] 80 | if empirical_probs[w] > 0: 81 | total_lift += topics[z, w] * 1.0 / empirical_probs[w] 82 | return np.log(total_lift * 1.0 / n) 83 | 84 | 85 | def log_lift(topics, z, data, n_words=-1): 86 | """Compute the log lift of a single topic given empirical data from which empirical 87 | probabilities of word occurrence can be computed. 88 | 89 | Parameters 90 | ---------- 91 | topics: array of shape (n_topics, n_words) 92 | The topic vectors to evaluate. 93 | 94 | z: int 95 | Which topic vector to evaluate. Must be 96 | in range(0, n_topics). 97 | 98 | data: array or sparse matrix of shape (n_docs, n_words,) 99 | The empirical data of word occurrence in a corpus. 100 | 101 | n: int (optional, default=-1) 102 | The number of words to average over. If less than 0 it will evaluate over the entire 103 | vocabulary, otherwise it will select the top ``n`` words of the chosen topic. 104 | 105 | Returns 106 | ------- 107 | log_lift: float 108 | The log lift of the ``z``th topic vector. 109 | """ 110 | normalized_topics = topics.copy() 111 | normalize(normalized_topics, axis=1) 112 | empirical_probs = np.array(data.sum(axis=0)).squeeze().astype(np.float64) 113 | empirical_probs /= empirical_probs.sum() 114 | return _log_lift(normalized_topics, z, empirical_probs, n=n_words) 115 | 116 | 117 | def mean_log_lift(topics, data, n_words=-1): 118 | """Compute the average log lift over all topics given empirical data from which empirical 119 | probabilities of word occurrence can be computed. 120 | 121 | Parameters 122 | ---------- 123 | topics: array of shape (n_topics, n_words) 124 | The topic vectors to evaluate. 
125 | 126 | data: array or sparse matrix of shape (n_docs, n_words,) 127 | The empirical data of word occurrence in a corpus. 128 | 129 | n: int (optional, default=-1) 130 | The number of words to average over. If less than 0 it will evaluate over the entire 131 | vocabulary, otherwise it will select the top ``n`` words of the chosen topic. 132 | 133 | Returns 134 | ------- 135 | log_lift: float 136 | The average log lift over all topic vectors. 137 | """ 138 | normalized_topics = topics.copy() 139 | normalize(normalized_topics, axis=1) 140 | empirical_probs = np.array(data.sum(axis=0)).squeeze().astype(np.float64) 141 | empirical_probs /= empirical_probs.sum() 142 | return np.mean( 143 | [ 144 | _log_lift(topics, z, empirical_probs, n=n_words) 145 | for z in range(topics.shape[0]) 146 | ] 147 | ) 148 | 149 | 150 | @numba.njit() 151 | def arr_intersect(ar1, ar2): 152 | """Numba compilable equivalent of numpy's intersect1d""" 153 | aux = np.concatenate((ar1, ar2)) 154 | aux.sort() 155 | return aux[:-1][aux[1:] == aux[:-1]] 156 | 157 | 158 | @numba.njit() 159 | def _coherence(topics, z, n, indices, indptr, n_docs_per_word): 160 | """Internal routine for computing the coherence of a given topic given raw data and the 161 | number of documents per vocabulary word. This routine makes use of scipy sparse matrix 162 | formats, but to be numba compilable it must make use of internal arrays thereof. 163 | 164 | Parameters 165 | ---------- 166 | topics: array of shape (n_topics, n_words) 167 | The topic vectors for scoring 168 | 169 | z: int 170 | Which topic vector to score. 171 | 172 | n: int 173 | The number of topic words to score against. The top ``n`` words from the ``z``th topic 174 | will be used. 175 | 176 | indices: array of shape (nnz,) 177 | The indices array of a CSC format sparse matrix representation of the corpus data. 178 | 179 | indptr: array of shape(n_words - 1,) 180 | The indptr array of a CSC format sparse matrix representation of the corpus data. 181 | 182 | n_docs_per_word: array of shape (n_words,) 183 | The total number of documents for each vocabulary word (the column sum of the corpus data). 184 | 185 | 186 | Returns 187 | ------- 188 | topic_coherence: float 189 | The coherence score of the ``z``th topic. 190 | """ 191 | top_words = np.argsort(topics[z])[-n:] 192 | coherence = 0.0 193 | for i in range(n - 1): 194 | w = top_words[i] 195 | if n_docs_per_word[w] == 0: 196 | continue 197 | for j in range(i + 1, n): 198 | v = top_words[j] 199 | n_co_occur = arr_intersect( 200 | indices[indptr[w] : indptr[w + 1]], indices[indptr[v] : indptr[v + 1]] 201 | ).shape[0] 202 | coherence += np.log((n_co_occur + 1.0) / n_docs_per_word[w]) 203 | return coherence 204 | 205 | 206 | def coherence(topics, z, data, n_words=20): 207 | """Compute the coherence of a single topic given empirical data. 208 | 209 | Parameters 210 | ---------- 211 | topics: array of shape (n_topics, n_words) 212 | The topic vectors for scoring 213 | 214 | z: int 215 | Which topic vector to score. 216 | 217 | data: array or sparse matrix of shape (n_doc, n_words) 218 | The empirical data of word occurrence in a corpus. 219 | 220 | n_words: int (optional, default=20) 221 | The number of topic words to score against. The top ``n_words`` words from the ``z``th topic 222 | will be used. 223 | 224 | Returns 225 | ------- 226 | topic_coherence: float 227 | The coherence score of the ``z``th topic. 
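    Examples
    --------
    A toy sketch on a tiny hand-built corpus (the matrices here are purely
    illustrative and carry no particular meaning):

    >>> import numpy as np
    >>> data = np.array([[1, 0, 2, 0],
    ...                  [0, 1, 1, 0],
    ...                  [1, 1, 0, 1]])
    >>> topics = np.array([[0.4, 0.3, 0.2, 0.1],
    ...                    [0.1, 0.2, 0.3, 0.4]])
    >>> score = coherence(topics, 0, data, n_words=3)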
228 | """ 229 | if not issparse(data): 230 | csc_data = csc_matrix(data) 231 | else: 232 | csc_data = data.tocsc() 233 | 234 | n_docs_per_word = np.array((data > 0).sum(axis=0)).squeeze() 235 | return _coherence( 236 | topics, z, n_words, csc_data.indices, csc_data.indptr, n_docs_per_word 237 | ) 238 | 239 | 240 | def mean_coherence(topics, data, n_words=20): 241 | """Compute the average coherence of all topics given empirical data. 242 | 243 | Parameters 244 | ---------- 245 | topics: array of shape (n_topics, n_words) 246 | The topic vectors for scoring 247 | 248 | data: array or sparse matrix of shape (n_doc, n_words) 249 | The empirical data of word occurrence in a corpus. 250 | 251 | n_words: int (optional, default=20) 252 | The number of topic words to score against. The top ``n_words`` words of each topic 253 | will be used. 254 | 255 | Returns 256 | ------- 257 | topic_coherence: float 258 | The average coherence score of all the topics. 259 | """ 260 | if not issparse(data): 261 | csc_data = csc_matrix(data) 262 | else: 263 | csc_data = data.tocsc() 264 | 265 | n_docs_per_word = np.array((data > 0).sum(axis=0)).squeeze() 266 | return np.mean( 267 | [ 268 | _coherence( 269 | topics, z, n_words, csc_data.indices, csc_data.indptr, n_docs_per_word 270 | ) 271 | for z in range(topics.shape[0]) 272 | ] 273 | ) 274 | 275 | 276 | def standardize_input(input_matrix): 277 | if input_matrix.dtype in (np.float32, np.float64, np.float, np.double): 278 | return sklearn_normalize(input_matrix, norm="l1") 279 | else: 280 | return input_matrix 281 | 282 | #### 283 | # Taken from sklearn as a fallback option; by default we import their latest version 284 | #### 285 | def _check_sample_weight(sample_weight, X, dtype=None): 286 | """Validate sample weights. 287 | 288 | Note that passing sample_weight=None will output an array of ones. 289 | Therefore, in some cases, you may want to protect the call with: 290 | if sample_weight is not None: 291 | sample_weight = _check_sample_weight(...) 292 | 293 | Parameters 294 | ---------- 295 | sample_weight : {ndarray, Number or None}, shape (n_samples,) 296 | Input sample weights. 297 | 298 | X : nd-array, list or sparse matrix 299 | Input data. 300 | 301 | dtype: dtype 302 | dtype of the validated `sample_weight`. 303 | If None, and the input `sample_weight` is an array, the dtype of the 304 | input is preserved; otherwise an array with the default numpy dtype 305 | is be allocated. If `dtype` is not one of `float32`, `float64`, 306 | `None`, the output will be of dtype `float64`. 307 | 308 | Returns 309 | ------- 310 | sample_weight : ndarray, shape (n_samples,) 311 | Validated sample weight. It is guaranteed to be "C" contiguous. 312 | """ 313 | n_samples = X.shape[0] 314 | 315 | if dtype is not None and dtype not in [np.float32, np.float64]: 316 | dtype = np.float64 317 | 318 | if sample_weight is None: 319 | sample_weight = np.ones(n_samples, dtype=dtype) 320 | elif isinstance(sample_weight, numbers.Number): 321 | sample_weight = np.full(n_samples, sample_weight, dtype=dtype) 322 | else: 323 | if dtype is None: 324 | dtype = [np.float64, np.float32] 325 | sample_weight = check_array( 326 | sample_weight, accept_sparse=False, ensure_2d=False, dtype=dtype, 327 | order="C" 328 | ) 329 | if sample_weight.ndim != 1: 330 | raise ValueError("Sample weights must be 1D array or scalar") 331 | 332 | if sample_weight.shape != (n_samples,): 333 | raise ValueError("sample_weight.shape == {}, expected {}!" 
334 | .format(sample_weight.shape, (n_samples,))) 335 | return sample_weight 336 | -------------------------------------------------------------------------------- /enstop/distributed_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | from scipy.sparse import issparse, csr_matrix, coo_matrix 7 | 8 | from enstop.utils import normalize, coherence, mean_coherence, log_lift, mean_log_lift 9 | from enstop.plsa import plsa_init 10 | from enstop.block_parallel_plsa import ( 11 | plsa_e_step_on_a_block, 12 | plsa_partial_m_step_on_a_block, 13 | ) 14 | 15 | from dask import delayed, compute, optimize, persist 16 | import dask.array as da 17 | 18 | 19 | @delayed 20 | @numba.njit(nogil=True, fastmath=True) 21 | def plsa_em_step_block_kernel( 22 | row_block, col_block, val_block, p_w_given_z, p_z_given_d, e_step_thresh=1e-32, 23 | ): 24 | result_p_w_given_z = np.zeros_like(p_w_given_z) 25 | result_p_z_given_d = np.zeros_like(p_z_given_d) 26 | result_norm_pwz = np.zeros(p_w_given_z.shape[0], dtype=np.float32) 27 | result_norm_pdz = np.zeros(p_z_given_d.shape[0], dtype=np.float32) 28 | p_z_given_wd_block = np.zeros( 29 | (row_block.shape[0], p_w_given_z.shape[0]), dtype=np.float32 30 | ) 31 | 32 | plsa_e_step_on_a_block( 33 | row_block, 34 | col_block, 35 | p_w_given_z, 36 | p_z_given_d, 37 | p_z_given_wd_block, 38 | e_step_thresh, 39 | ) 40 | plsa_partial_m_step_on_a_block( 41 | row_block, 42 | col_block, 43 | val_block, 44 | result_p_w_given_z, 45 | result_p_z_given_d, 46 | p_z_given_wd_block, 47 | result_norm_pwz, 48 | result_norm_pdz, 49 | ) 50 | 51 | return result_p_w_given_z, result_p_z_given_d, result_norm_pwz, result_norm_pdz 52 | 53 | 54 | def plsa_em_step_dask( 55 | block_rows_ndarray, 56 | block_cols_ndarray, 57 | block_vals_ndarray, 58 | p_w_given_z, 59 | p_z_given_d, 60 | block_row_size, 61 | block_col_size, 62 | e_step_thresh=1e-32, 63 | ): 64 | n_d_blocks = block_rows_ndarray.shape[0] 65 | n_w_blocks = block_rows_ndarray.shape[1] 66 | 67 | n = p_z_given_d.shape[0] 68 | m = p_w_given_z.shape[1] 69 | k = p_z_given_d.shape[1] 70 | 71 | result_p_w_given_z = [[] for i in range(n_w_blocks)] 72 | result_p_z_given_d = [[] for i in range(n_d_blocks)] 73 | result_norm_pwz = [] 74 | result_norm_pdz = [[] for i in range(n_d_blocks)] 75 | 76 | for i in range(n_d_blocks): 77 | 78 | row_start = block_row_size * i 79 | row_end = min(row_start + block_row_size, n) 80 | 81 | for j in range(n_w_blocks): 82 | col_start = block_col_size * j 83 | col_end = min(col_start + block_col_size, m) 84 | 85 | row_block = block_rows_ndarray[i, j] 86 | col_block = block_cols_ndarray[i, j] 87 | val_block = block_vals_ndarray[i, j] 88 | 89 | kernel_results = plsa_em_step_block_kernel( 90 | row_block, 91 | col_block, 92 | val_block, 93 | p_w_given_z[:, col_start:col_end], 94 | p_z_given_d[row_start:row_end, :], 95 | e_step_thresh=e_step_thresh, 96 | ) 97 | 98 | result_p_w_given_z[j].append( 99 | da.from_delayed( 100 | kernel_results[0], (k, block_col_size), dtype=np.float32 101 | ) 102 | ) 103 | result_p_z_given_d[i].append( 104 | da.from_delayed( 105 | kernel_results[1], (block_row_size, k), dtype=np.float32 106 | ) 107 | ) 108 | result_norm_pwz.append( 109 | da.from_delayed(kernel_results[2], (k,), dtype=np.float32) 110 | ) 111 | 112 | result_norm_pdz[i].append( 113 | da.from_delayed(kernel_results[3], 
(block_row_size,), dtype=np.float32) 114 | ) 115 | 116 | p_w_given_z_blocks = [ 117 | da.dstack(result_p_w_given_z[i]).sum(axis=-1) for i in range(n_w_blocks) 118 | ] 119 | p_z_given_d_blocks = [ 120 | da.dstack(result_p_z_given_d[i]).sum(axis=-1) for i in range(n_d_blocks) 121 | ] 122 | norm_pdz_blocks = [ 123 | da.dstack(result_norm_pdz[i]).sum(axis=-1) for i in range(n_d_blocks) 124 | ] 125 | 126 | p_w_given_z = ( 127 | da.hstack(p_w_given_z_blocks) / da.dstack(result_norm_pwz).sum(axis=-1).T 128 | ) 129 | p_z_given_d = da.vstack(p_z_given_d_blocks) / da.hstack(norm_pdz_blocks).T 130 | 131 | result = compute(p_w_given_z, p_z_given_d) 132 | 133 | return result 134 | 135 | 136 | @numba.njit( 137 | locals={ 138 | "i": numba.types.uint16, 139 | "j": numba.types.uint16, 140 | "k": numba.types.intp, 141 | "w": numba.types.uint32, 142 | "d": numba.types.uint32, 143 | "z": numba.types.uint16, 144 | "nz_idx": numba.types.uint32, 145 | "x": numba.types.float32, 146 | "result": numba.types.float32[:, :, ::1], 147 | "p_w_given_d": numba.types.float32, 148 | }, 149 | fastmath=True, 150 | nogil=True, 151 | parallel=True, 152 | ) 153 | def log_likelihood_by_blocks_kernel( 154 | block_rows, 155 | block_cols, 156 | block_vals, 157 | p_w_given_z, 158 | p_z_given_d, 159 | block_row_size, 160 | block_col_size, 161 | i, j, 162 | ): 163 | result = np.zeros((1, 1, 1), dtype=np.float32) 164 | k = p_w_given_z.shape[0] 165 | 166 | for nz_idx in range(block_rows.shape[2]): 167 | if block_rows[0, 0, nz_idx] < 0: 168 | break 169 | 170 | d = block_rows[0, 0, nz_idx] + i * block_row_size 171 | w = block_cols[0, 0, nz_idx] + j * block_col_size 172 | x = block_vals[0, 0, nz_idx] 173 | 174 | p_w_given_d = 0.0 175 | for z in range(k): 176 | p_w_given_d += p_w_given_z[z, w] * p_z_given_d[d, z] 177 | 178 | result[0, 0, 0] += x * np.log(p_w_given_d) 179 | 180 | return result 181 | 182 | def log_likelihood_by_blocks_kernel_wrapper( 183 | block_rows, 184 | block_cols, 185 | block_vals, 186 | p_w_given_z, 187 | p_z_given_d, 188 | block_row_size, 189 | block_col_size, 190 | block_info=None, 191 | ): 192 | i, j, _ = block_info[0]["chunk-location"] 193 | return log_likelihood_by_blocks_kernel( 194 | block_rows, 195 | block_cols, 196 | block_vals, 197 | p_w_given_z, 198 | p_z_given_d, 199 | block_row_size, 200 | block_col_size, 201 | i, j, 202 | ) 203 | 204 | def log_likelihood_by_blocks( 205 | block_rows_ndarray, 206 | block_cols_ndarray, 207 | block_vals_ndarray, 208 | p_w_given_z, 209 | p_z_given_d, 210 | block_row_size, 211 | block_col_size, 212 | ): 213 | 214 | log_likelihood_per_block = da.map_blocks( 215 | log_likelihood_by_blocks_kernel_wrapper, 216 | block_rows_ndarray, 217 | block_cols_ndarray, 218 | block_vals_ndarray, 219 | p_w_given_z, 220 | p_z_given_d, 221 | block_row_size, 222 | block_col_size, 223 | dtype=np.float32, 224 | ) 225 | result = log_likelihood_per_block.sum() 226 | return result.compute() 227 | 228 | 229 | def plsa_fit_inner_dask( 230 | block_rows_ndarray, 231 | block_cols_ndarray, 232 | block_vals_ndarray, 233 | p_w_given_z, 234 | p_z_given_d, 235 | block_row_size, 236 | block_col_size, 237 | n_iter=100, 238 | n_iter_per_test=10, 239 | tolerance=0.001, 240 | e_step_thresh=1e-32, 241 | ): 242 | previous_log_likelihood = log_likelihood_by_blocks( 243 | block_rows_ndarray, 244 | block_cols_ndarray, 245 | block_vals_ndarray, 246 | p_w_given_z, 247 | p_z_given_d, 248 | block_row_size, 249 | block_col_size, 250 | ) 251 | 252 | # block_rows_ndarray, block_cols_ndarray, block_vals_ndarray = persist( 253 | # 
block_rows_ndarray, block_cols_ndarray, block_vals_ndarray 254 | # ) 255 | 256 | for i in range(n_iter): 257 | p_w_given_z, p_z_given_d = plsa_em_step_dask( 258 | block_rows_ndarray, 259 | block_cols_ndarray, 260 | block_vals_ndarray, 261 | p_w_given_z, 262 | p_z_given_d, 263 | block_row_size, 264 | block_col_size, 265 | e_step_thresh=e_step_thresh, 266 | ) 267 | if i % n_iter_per_test == 0: 268 | current_log_likelihood = log_likelihood_by_blocks( 269 | block_rows_ndarray, 270 | block_cols_ndarray, 271 | block_vals_ndarray, 272 | p_w_given_z, 273 | p_z_given_d, 274 | block_row_size, 275 | block_col_size, 276 | ) 277 | change = np.abs(current_log_likelihood - previous_log_likelihood) 278 | if change / np.abs(current_log_likelihood) < tolerance: 279 | break 280 | else: 281 | previous_log_likelihood = current_log_likelihood 282 | 283 | return p_z_given_d, p_w_given_z 284 | 285 | 286 | def plsa_fit( 287 | X, 288 | k, 289 | n_row_blocks=8, 290 | n_col_blocks=8, 291 | init="random", 292 | n_iter=100, 293 | n_iter_per_test=10, 294 | tolerance=0.001, 295 | e_step_thresh=1e-32, 296 | random_state=None, 297 | ): 298 | rng = check_random_state(random_state) 299 | p_z_given_d, p_w_given_z = plsa_init(X, k, init=init, rng=rng) 300 | p_z_given_d = p_z_given_d.astype(np.float32, order="C") 301 | p_w_given_z = p_w_given_z.astype(np.float32, order="C") 302 | 303 | A = X.tocsr().astype(np.float32) 304 | 305 | n = A.shape[0] 306 | m = A.shape[1] 307 | 308 | block_row_size = np.uint32(np.ceil(A.shape[0] / n_row_blocks)) 309 | block_col_size = np.uint32(np.ceil(A.shape[1] / n_col_blocks)) 310 | 311 | A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)] 312 | max_nnz_per_block = 0 313 | for i in range(n_row_blocks): 314 | 315 | row_start = block_row_size * i 316 | row_end = min(row_start + block_row_size, n) 317 | 318 | for j in range(n_col_blocks): 319 | 320 | col_start = block_col_size * j 321 | col_end = min(col_start + block_col_size, m) 322 | 323 | A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo() 324 | if A_blocks[i][j].nnz > max_nnz_per_block: 325 | max_nnz_per_block = A_blocks[i][j].nnz 326 | 327 | del A 328 | 329 | block_rows_ndarray = np.full( 330 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32, 331 | ) 332 | block_cols_ndarray = np.full( 333 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32, 334 | ) 335 | block_vals_ndarray = np.zeros( 336 | (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32, 337 | ) 338 | for i in range(n_row_blocks): 339 | for j in range(n_col_blocks): 340 | nnz = A_blocks[i][j].nnz 341 | block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row 342 | block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col 343 | block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data 344 | 345 | del A_blocks 346 | 347 | block_rows_ndarray = da.from_array( 348 | block_rows_ndarray, chunks=(1, 1, max_nnz_per_block), 349 | ) 350 | block_cols_ndarray = da.from_array( 351 | block_cols_ndarray, chunks=(1, 1, max_nnz_per_block), 352 | ) 353 | block_vals_ndarray = da.from_array( 354 | block_vals_ndarray, chunks=(1, 1, max_nnz_per_block), 355 | ) 356 | 357 | p_z_given_d, p_w_given_z = plsa_fit_inner_dask( 358 | block_rows_ndarray, 359 | block_cols_ndarray, 360 | block_vals_ndarray, 361 | p_w_given_z, 362 | p_z_given_d, 363 | block_row_size, 364 | block_col_size, 365 | n_iter=n_iter, 366 | n_iter_per_test=n_iter_per_test, 367 | tolerance=tolerance, 368 | e_step_thresh=e_step_thresh, 369 | ) 370 | 371 | return p_z_given_d, p_w_given_z 372 | 
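# Usage sketch for the module-level ``plsa_fit`` helper above. This is purely
# illustrative: the toy matrix, block counts and topic count are hypothetical
# choices, and a working dask + numba environment is assumed. The guard keeps
# the example from running on import.
if __name__ == "__main__":
    rng = np.random.RandomState(42)

    # Small non-negative count matrix; bump one column so no document is empty.
    demo_counts = rng.poisson(0.5, size=(64, 40))
    demo_counts[:, 0] += 1
    X_demo = csr_matrix(demo_counts)

    # Two row blocks and two column blocks divide the 64 x 40 matrix evenly.
    doc_topic, topic_word = plsa_fit(
        X_demo,
        k=5,
        n_row_blocks=2,
        n_col_blocks=2,
        n_iter=20,
        random_state=42,
    )
    # doc_topic is p(z|d) with shape (64, 5); topic_word is p(w|z) with shape (5, 40).
    print(doc_topic.shape, topic_word.shape)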
373 | 374 | class DistributedPLSA(BaseEstimator, TransformerMixin): 375 | def __init__( 376 | self, 377 | n_components=10, 378 | init="random", 379 | n_row_blocks=8, 380 | n_col_blocks=8, 381 | n_iter=100, 382 | n_iter_per_test=10, 383 | tolerance=0.001, 384 | e_step_thresh=1e-32, 385 | transform_random_seed=42, 386 | random_state=None, 387 | ): 388 | 389 | self.n_components = n_components 390 | self.init = init 391 | self.n_row_blocks = n_row_blocks 392 | self.n_col_blocks = n_col_blocks 393 | self.n_iter = n_iter 394 | self.n_iter_per_test = n_iter_per_test 395 | self.tolerance = tolerance 396 | self.e_step_thresh = e_step_thresh 397 | self.transform_random_seed = transform_random_seed 398 | self.random_state = random_state 399 | 400 | def fit(self, X, y=None, sample_weight=None): 401 | """Learn the pLSA model for the data X and return the document vectors. 402 | 403 | This is more efficient than calling fit followed by transform. 404 | 405 | Parameters 406 | ---------- 407 | X: array or sparse matrix of shape (n_docs, n_words) 408 | The data matrix pLSA is attempting to fit to. 409 | 410 | y: Ignored 411 | 412 | sample_weight: array of shape (n_docs,) 413 | Input document weights. 414 | 415 | Returns 416 | ------- 417 | self 418 | """ 419 | self.fit_transform(X, sample_weight=sample_weight) 420 | return self 421 | 422 | def fit_transform(self, X, y=None, sample_weight=None): 423 | """Learn the pLSA model for the data X and return the document vectors. 424 | 425 | This is more efficient than calling fit followed by transform. 426 | 427 | Parameters 428 | ---------- 429 | X: array or sparse matrix of shape (n_docs, n_words) 430 | The data matrix pLSA is attempting to fit to. 431 | 432 | y: Ignored 433 | 434 | sample_weight: array of shape (n_docs,) 435 | Input document weights. 436 | 437 | Returns 438 | ------- 439 | embedding: array of shape (n_docs, n_topics) 440 | An embedding of the documents into a topic space. 
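        Examples
        --------
        A minimal, hypothetical sketch; ``X_counts`` stands in for any sparse
        document-word count matrix, and a working dask environment is assumed:

        >>> model = DistributedPLSA(n_components=10, n_row_blocks=4, n_col_blocks=4)
        >>> doc_vectors = model.fit_transform(X_counts)  # shape (n_docs, 10)
        >>> topic_vectors = model.components_            # shape (10, n_words)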
441 | """ 442 | 443 | X = check_array(X, accept_sparse="csr") 444 | 445 | if not issparse(X): 446 | X = csr_matrix(X) 447 | 448 | if sample_weight is not None: 449 | NotImplementedError("Sample weights not supported in distributed") 450 | # sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 451 | 452 | if np.any(X.data < 0): 453 | raise ValueError( 454 | "PLSA is only valid for matrices with non-negative " "entries" 455 | ) 456 | 457 | row_sums = np.array(X.sum(axis=1).T)[0] 458 | good_rows = row_sums != 0 459 | 460 | if not np.all(good_rows): 461 | zero_rows_found = True 462 | data_for_fitting = X[good_rows] 463 | else: 464 | zero_rows_found = False 465 | data_for_fitting = X 466 | 467 | U, V = plsa_fit( 468 | data_for_fitting, 469 | self.n_components, 470 | n_row_blocks=self.n_row_blocks, 471 | n_col_blocks=self.n_col_blocks, 472 | init=self.init, 473 | n_iter=self.n_iter, 474 | n_iter_per_test=self.n_iter_per_test, 475 | tolerance=self.tolerance, 476 | e_step_thresh=self.e_step_thresh, 477 | random_state=self.random_state, 478 | ) 479 | 480 | if zero_rows_found: 481 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 482 | self.embedding_[good_rows] = U 483 | else: 484 | self.embedding_ = U 485 | 486 | self.components_ = V 487 | self.training_data_ = X 488 | 489 | return self.embedding_ 490 | -------------------------------------------------------------------------------- /enstop/cuda_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | import numba.cuda as cuda 4 | 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.utils import check_array, check_random_state 7 | from sklearn.utils.validation import _check_sample_weight 8 | from scipy.sparse import issparse, csr_matrix, coo_matrix 9 | 10 | from enstop.utils import ( 11 | normalize, 12 | coherence, 13 | mean_coherence, 14 | log_lift, 15 | mean_log_lift, 16 | standardize_input, 17 | ) 18 | from enstop.plsa import plsa_init 19 | from enstop.block_parallel_plsa import log_likelihood_by_blocks 20 | 21 | 22 | @cuda.jit() 23 | def plsa_e_step( 24 | block_rows, 25 | block_cols, 26 | p_w_given_z_block, 27 | p_z_given_d_block, 28 | p_z_given_wd_block, 29 | e_step_thresh, 30 | ): 31 | i = cuda.blockIdx.x 32 | j = cuda.blockIdx.y 33 | nz_offset = cuda.threadIdx.x 34 | threads_per_blocks = cuda.blockDim.x 35 | k = p_z_given_d_block[i].shape[1] 36 | nnz = block_rows.shape[2] 37 | 38 | n_passes = (nnz // threads_per_blocks) + 1 39 | 40 | for n in range(n_passes): 41 | nz_idx = n * threads_per_blocks + nz_offset 42 | if nz_idx < nnz: 43 | if block_rows[i, j, nz_idx] < 0: 44 | break 45 | 46 | d = block_rows[i, j, nz_idx] 47 | w = block_cols[i, j, nz_idx] 48 | 49 | norm = 0.0 50 | for z in range(k): 51 | v = p_w_given_z_block[j, z, w] * p_z_given_d_block[i, d, z] 52 | if v > e_step_thresh: 53 | p_z_given_wd_block[i, j, nz_idx, z] = v 54 | norm += v 55 | else: 56 | p_z_given_wd_block[i, j, nz_idx, z] = 0.0 57 | 58 | for z in range(k): 59 | if norm > 0.0: 60 | p_z_given_wd_block[i, j, nz_idx, z] /= norm 61 | 62 | 63 | @cuda.jit() 64 | def plsa_partial_m_step( 65 | block_rows, 66 | block_cols, 67 | block_vals, 68 | p_w_given_z_block, 69 | p_z_given_d_block, 70 | result_p_w_given_z_block, 71 | result_p_z_given_d_block, 72 | p_z_given_wd_block, 73 | pwz_norms, 74 | ): 75 | z = cuda.threadIdx.x 76 | i = cuda.blockIdx.x 77 | j = cuda.blockIdx.y 78 | k = p_z_given_d_block[i].shape[1] 79 | nnz = block_rows.shape[2] 80 | 81 | 
if z < k: 82 | 83 | result_p_w_given_z_block[i, j, z, :] = 0.0 84 | result_p_z_given_d_block[j, i, :, z] = 0.0 85 | pwz_norms[i, j, z] = 0.0 86 | 87 | for nz_idx in range(block_rows[i, j].shape[0]): 88 | if block_rows[i, j, nz_idx] < 0: 89 | break 90 | 91 | d = block_rows[i, j, nz_idx] 92 | w = block_cols[i, j, nz_idx] 93 | x = block_vals[i, j, nz_idx] 94 | 95 | s = x * p_z_given_wd_block[i, j, nz_idx, z] 96 | 97 | result_p_w_given_z_block[i, j, z, w] += s 98 | result_p_z_given_d_block[j, i, d, z] += s 99 | 100 | pwz_norms[i, j, z] += s 101 | 102 | 103 | @cuda.jit() 104 | def normalize_m_step_p_z_given_d(blocked_next_p_z_given_d, p_z_given_d): 105 | d_offset = cuda.threadIdx.x 106 | i = cuda.blockIdx.x 107 | threads_per_block = cuda.blockDim.x 108 | k = p_z_given_d[i].shape[1] 109 | n_passes = ((p_z_given_d.shape[0] * p_z_given_d.shape[1]) // threads_per_block) + 1 110 | 111 | for n in range(n_passes): 112 | d = threads_per_block * n + d_offset 113 | if ( 114 | i < blocked_next_p_z_given_d.shape[1] 115 | and d < blocked_next_p_z_given_d.shape[2] 116 | ): 117 | norm = 0.0 118 | for z in range(k): 119 | p_z_given_d[i, d, z] = 0.0 120 | for j in range(blocked_next_p_z_given_d.shape[0]): 121 | p_z_given_d[i, d, z] += blocked_next_p_z_given_d[j, i, d, z] 122 | norm += blocked_next_p_z_given_d[j, i, d, z] 123 | for z in range(k): 124 | p_z_given_d[i, d, z] /= norm 125 | 126 | 127 | @cuda.jit() 128 | def normalize_m_step_p_w_given_z(blocked_next_p_w_given_z, p_w_given_z, pwz_norms): 129 | w_offset = cuda.threadIdx.x 130 | i = cuda.blockIdx.x 131 | threads_per_block = cuda.blockDim.x 132 | k = p_w_given_z[i].shape[0] 133 | n_passes = ((p_w_given_z.shape[0] * p_w_given_z.shape[2]) // threads_per_block) + 1 134 | 135 | norms = cuda.local.array(1024, numba.float64) 136 | for z in range(k): 137 | norms[z] = 0.0 138 | for p in range(pwz_norms.shape[0]): 139 | for q in range(pwz_norms.shape[1]): 140 | for z in range(k): 141 | norms[z] += pwz_norms[p, q, z] 142 | 143 | for n in range(n_passes): 144 | w = n * threads_per_block + w_offset 145 | if ( 146 | i < blocked_next_p_w_given_z.shape[1] 147 | and w < blocked_next_p_w_given_z.shape[3] 148 | ): 149 | for z in range(k): 150 | p_w_given_z[i, z, w] = 0.0 151 | for j in range(blocked_next_p_w_given_z.shape[0]): 152 | p_w_given_z[i, z, w] += blocked_next_p_w_given_z[j, i, z, w] 153 | for z in range(k): 154 | p_w_given_z[i, z, w] /= norms[z] 155 | 156 | 157 | def plsa_fit( 158 | data, 159 | k, 160 | n_row_blocks=8, 161 | n_col_blocks=8, 162 | init="random", 163 | n_iter=100, 164 | n_iter_per_test=10, 165 | tolerance=0.001, 166 | e_step_thresh=1e-32, 167 | random_state=None, 168 | ): 169 | rng = check_random_state(random_state) 170 | p_z_given_d_init, p_w_given_z_init = plsa_init(data, k, init=init, rng=rng) 171 | 172 | A = data.tocsr().astype(np.float32) 173 | 174 | n = A.shape[0] 175 | m = A.shape[1] 176 | 177 | block_row_size = np.uint16(np.ceil(A.shape[0] / n_row_blocks)) 178 | block_col_size = np.uint16(np.ceil(A.shape[1] / n_col_blocks)) 179 | 180 | p_z_given_d = np.zeros((block_row_size * n_row_blocks, k), dtype=np.float32) 181 | p_z_given_d[: p_z_given_d_init.shape[0]] = p_z_given_d_init 182 | p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k) 183 | 184 | p_w_given_z = np.zeros((k, block_col_size * n_col_blocks), dtype=np.float32) 185 | p_w_given_z[:, : p_w_given_z_init.shape[1]] = p_w_given_z_init 186 | p_w_given_z = np.transpose( 187 | p_w_given_z.T.reshape(n_col_blocks, block_col_size, k), axes=[0, 2, 1] 188 | ).astype(np.float32, 
order="C") 189 | 190 | A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)] 191 | max_nnz_per_block = 0 192 | for i in range(n_row_blocks): 193 | 194 | row_start = block_row_size * i 195 | row_end = min(row_start + block_row_size, n) 196 | 197 | for j in range(n_col_blocks): 198 | 199 | col_start = block_col_size * j 200 | col_end = min(col_start + block_col_size, m) 201 | 202 | A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo() 203 | if A_blocks[i][j].nnz > max_nnz_per_block: 204 | max_nnz_per_block = A_blocks[i][j].nnz 205 | 206 | block_rows_ndarray = np.full( 207 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 208 | ) 209 | block_cols_ndarray = np.full( 210 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 211 | ) 212 | block_vals_ndarray = np.zeros( 213 | (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32 214 | ) 215 | for i in range(n_row_blocks): 216 | for j in range(n_col_blocks): 217 | nnz = A_blocks[i][j].nnz 218 | block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row 219 | block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col 220 | block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data 221 | 222 | n_d_blocks = block_rows_ndarray.shape[0] 223 | n_w_blocks = block_rows_ndarray.shape[1] 224 | block_size = block_rows_ndarray.shape[2] 225 | 226 | p_z_given_wd_block = np.zeros( 227 | (n_d_blocks, n_w_blocks, block_size, k), dtype=np.float32 228 | ) 229 | 230 | blocked_next_p_w_given_z = np.zeros( 231 | ( 232 | np.int64(n_d_blocks), 233 | np.int64(n_w_blocks), 234 | np.int64(k), 235 | np.int64(block_col_size), 236 | ), 237 | dtype=np.float32, 238 | ) 239 | blocked_next_p_z_given_d = np.zeros( 240 | ( 241 | np.int64(n_w_blocks), 242 | np.int64(n_d_blocks), 243 | np.int64(block_row_size), 244 | np.int64(k), 245 | ), 246 | dtype=np.float32, 247 | ) 248 | norms_pwz = np.zeros((n_d_blocks, n_w_blocks, k), dtype=np.float64) 249 | 250 | previous_log_likelihood = log_likelihood_by_blocks( 251 | block_rows_ndarray, 252 | block_cols_ndarray, 253 | block_vals_ndarray, 254 | p_w_given_z, 255 | p_z_given_d, 256 | ) 257 | 258 | d_block_rows_ndarray = cuda.to_device(block_rows_ndarray) 259 | d_block_cols_ndarray = cuda.to_device(block_cols_ndarray) 260 | d_block_vals_ndarray = cuda.to_device(block_vals_ndarray) 261 | d_blocked_next_p_w_given_z = cuda.to_device(blocked_next_p_w_given_z) 262 | d_blocked_next_p_z_given_d = cuda.to_device(blocked_next_p_z_given_d) 263 | d_p_z_given_wd_block = cuda.to_device(p_z_given_wd_block) 264 | d_p_w_given_z = cuda.to_device(p_w_given_z) 265 | d_p_z_given_d = cuda.to_device(p_z_given_d) 266 | d_norms_pwz = cuda.to_device(norms_pwz) 267 | 268 | n_d = p_z_given_d.shape[1] 269 | n_w = p_w_given_z.shape[2] 270 | 271 | for i in range(n_iter // n_iter_per_test): 272 | for j in range(n_iter_per_test): 273 | plsa_e_step[(n_d_blocks, n_w_blocks), 256]( 274 | d_block_rows_ndarray, 275 | d_block_cols_ndarray, 276 | d_p_w_given_z, 277 | d_p_z_given_d, 278 | d_p_z_given_wd_block, 279 | e_step_thresh, 280 | ) 281 | cuda.synchronize() 282 | plsa_partial_m_step[(n_d_blocks, n_w_blocks), k]( 283 | d_block_rows_ndarray, 284 | d_block_cols_ndarray, 285 | d_block_vals_ndarray, 286 | d_p_w_given_z, 287 | d_p_z_given_d, 288 | d_blocked_next_p_w_given_z, 289 | d_blocked_next_p_z_given_d, 290 | d_p_z_given_wd_block, 291 | d_norms_pwz, 292 | ) 293 | cuda.synchronize() 294 | normalize_m_step_p_z_given_d[n_d_blocks, 256]( 295 | d_blocked_next_p_z_given_d, d_p_z_given_d 296 | ) 297 | 
normalize_m_step_p_w_given_z[n_w_blocks, 256]( 298 | d_blocked_next_p_w_given_z, d_p_w_given_z, d_norms_pwz 299 | ) 300 | cuda.synchronize() 301 | 302 | p_z_given_d = d_p_z_given_d.copy_to_host() 303 | p_w_given_z = d_p_w_given_z.copy_to_host() 304 | current_log_likelihood = log_likelihood_by_blocks( 305 | block_rows_ndarray, 306 | block_cols_ndarray, 307 | block_vals_ndarray, 308 | p_w_given_z, 309 | p_z_given_d, 310 | ) 311 | change = np.abs(current_log_likelihood - previous_log_likelihood) 312 | if change / np.abs(current_log_likelihood) < tolerance: 313 | break 314 | else: 315 | previous_log_likelihood = current_log_likelihood 316 | 317 | for i in range(n_iter % n_iter_per_test): 318 | plsa_e_step[(n_d_blocks, n_w_blocks), 256]( 319 | d_block_rows_ndarray, 320 | d_block_cols_ndarray, 321 | d_p_w_given_z, 322 | d_p_z_given_d, 323 | d_p_z_given_wd_block, 324 | e_step_thresh, 325 | ) 326 | cuda.synchronize() 327 | plsa_partial_m_step[(n_d_blocks, n_w_blocks), k]( 328 | d_block_rows_ndarray, 329 | d_block_cols_ndarray, 330 | d_block_vals_ndarray, 331 | d_p_w_given_z, 332 | d_p_z_given_d, 333 | d_blocked_next_p_w_given_z, 334 | d_blocked_next_p_z_given_d, 335 | d_p_z_given_wd_block, 336 | d_norms_pwz, 337 | ) 338 | cuda.synchronize() 339 | normalize_m_step_p_z_given_d[n_d_blocks, 256]( 340 | d_blocked_next_p_z_given_d, d_p_z_given_d 341 | ) 342 | normalize_m_step_p_w_given_z[n_w_blocks, 256]( 343 | d_blocked_next_p_w_given_z, d_p_w_given_z, d_norms_pwz 344 | ) 345 | cuda.synchronize() 346 | 347 | p_z_given_d = d_p_z_given_d.copy_to_host() 348 | p_w_given_z = d_p_w_given_z.copy_to_host() 349 | 350 | p_z_given_d = np.vstack(p_z_given_d)[:n, :] 351 | p_w_given_z = np.hstack(p_w_given_z)[:, :m] 352 | 353 | return p_z_given_d, p_w_given_z 354 | 355 | 356 | class GPUPLSA(BaseEstimator, TransformerMixin): 357 | def __init__( 358 | self, 359 | n_components=10, 360 | init="random", 361 | n_row_blocks=8, 362 | n_col_blocks=8, 363 | n_iter=100, 364 | n_iter_per_test=10, 365 | tolerance=0.001, 366 | e_step_thresh=1e-32, 367 | transform_random_seed=42, 368 | random_state=None, 369 | ): 370 | 371 | self.n_components = n_components 372 | self.init = init 373 | self.n_row_blocks = n_row_blocks 374 | self.n_col_blocks = n_col_blocks 375 | self.n_iter = n_iter 376 | self.n_iter_per_test = n_iter_per_test 377 | self.tolerance = tolerance 378 | self.e_step_thresh = e_step_thresh 379 | self.transform_random_seed = transform_random_seed 380 | self.random_state = random_state 381 | 382 | def fit(self, X, y=None, sample_weight=None): 383 | """Learn the pLSA model for the data X and return the document vectors. 384 | 385 | This is more efficient than calling fit followed by transform. 386 | 387 | Parameters 388 | ---------- 389 | X: array or sparse matrix of shape (n_docs, n_words) 390 | The data matrix pLSA is attempting to fit to. 391 | 392 | y: Ignored 393 | 394 | sample_weight: array of shape (n_docs,) 395 | Input document weights. 396 | 397 | Returns 398 | ------- 399 | self 400 | """ 401 | self.fit_transform(X, sample_weight=sample_weight) 402 | return self 403 | 404 | def fit_transform(self, X, y=None, sample_weight=None): 405 | """Learn the pLSA model for the data X and return the document vectors. 406 | 407 | This is more efficient than calling fit followed by transform. 408 | 409 | Parameters 410 | ---------- 411 | X: array or sparse matrix of shape (n_docs, n_words) 412 | The data matrix pLSA is attempting to fit to. 
413 | 414 | y: Ignored 415 | 416 | sample_weight: array of shape (n_docs,) 417 | Input document weights. 418 | 419 | Returns 420 | ------- 421 | embedding: array of shape (n_docs, n_topics) 422 | An embedding of the documents into a topic space. 423 | """ 424 | 425 | X = check_array(X, accept_sparse="csr") 426 | X = standardize_input(X) 427 | 428 | if not issparse(X): 429 | X = csr_matrix(X) 430 | 431 | sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 432 | 433 | if np.any(X.data < 0): 434 | raise ValueError( 435 | "PLSA is only valid for matrices with non-negative " "entries" 436 | ) 437 | 438 | row_sums = np.array(X.sum(axis=1).T)[0] 439 | good_rows = row_sums != 0 440 | 441 | if not np.all(good_rows): 442 | zero_rows_found = True 443 | data_for_fitting = X[good_rows] 444 | else: 445 | zero_rows_found = False 446 | data_for_fitting = X 447 | 448 | U, V = plsa_fit( 449 | data_for_fitting, 450 | self.n_components, 451 | n_row_blocks=self.n_row_blocks, 452 | n_col_blocks=self.n_col_blocks, 453 | init=self.init, 454 | n_iter=self.n_iter, 455 | n_iter_per_test=self.n_iter_per_test, 456 | tolerance=self.tolerance, 457 | e_step_thresh=self.e_step_thresh, 458 | random_state=self.random_state, 459 | ) 460 | 461 | if zero_rows_found: 462 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 463 | self.embedding_[good_rows] = U 464 | else: 465 | self.embedding_ = U 466 | 467 | self.components_ = V 468 | self.training_data_ = X 469 | 470 | return self.embedding_ 471 | -------------------------------------------------------------------------------- /notebooks/EnsTop with 20-Newsgroups.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Topic Modeling for 20-Newsgroups\n", 8 | "\n", 9 | "There are several approaches to topic modeling. The most popular options are Latent Dirichlet Allocation (LDA) and Non-negative Matrix Factorization (NMF). In this notebook we will use the 20-newsgroups dataset to compare these methods with probabilistic Latent Semantic Analysis (pLSA) and ensemble topic modeling (EnsTop) from the enstop library. This is not meant to be a particularly complete or comprehensive comparison, but rather a means to show how the enstop library works, and provide a quick comparison to other popular approaches.\n", 10 | "\n", 11 | "First we'll need the requisite libraries. Fortunately sklearn has a function to get the 20-newsgroups dataset, a CountVectorizer which can convert the raw text data into bag-of-words based count matrix, and implementations of both LDA and NMF. We'll of course also need the PLSA and EnsembleTopics classes from the enstop library." 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 1, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import warnings; warnings.simplefilter('ignore') # Suppress deprecation warnings" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from sklearn.datasets import fetch_20newsgroups\n", 30 | "from sklearn.feature_extraction.text import CountVectorizer\n", 31 | "from sklearn.decomposition import NMF, LatentDirichletAllocation\n", 32 | "from enstop import EnsembleTopics, PLSA" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "The next step is getting the data. For this we can just use sklearn. 
First the ``fetch_20newsgroups`` function will download the 20-newsgroups data. By specifying ``subset='all'`` we collect the full dataset rather than either a train or test set. The next step in the process is to convert this text data into a form that can be consumed by LDA, NMF, PLSA and EnsembleTopics. The required format, in this case, is a matrix where the (i,j)th entry is the count of the number of times the jth word in the vocabulary occurs in the ith document (in this case each document is a newsgroup post). This can be done extremely efficiently using sklearn's ``CountVectorizer``. We'll pass two extra parameters to the ``CountVectorizer``: a setting of ``min_df=5`` which will restrict the vocabulary to words that occur at least 5 times in the entire corpus; and ``stop_words='english'`` which will eliminate common words (like \"the\", \"and\", etc.) according to a dictionary of such words in English." 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": {}, 46 | "outputs": [], 47 | "source": [ 48 | "news = fetch_20newsgroups(subset='all')\n", 49 | "data = CountVectorizer(min_df=5, stop_words='english').fit_transform(news.data)" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": {}, 55 | "source": [ 56 | "## Time to fit a model\n", 57 | "\n", 58 | "Now that we have the data in order, let's fit the various topic models and time them to see how long they take to fit. First up is LDA. The only parameter that requires tuning in this case is the number of topics we want to have. As a reasonable guess we'll choose 20 (the number of different newsgroups in the dataset)." 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "CPU times: user 5min 16s, sys: 4.4 s, total: 5min 20s\n", 71 | "Wall time: 2min 54s\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "%%time\n", 77 | "lda_model = LatentDirichletAllocation(n_components=20).fit(data)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "markdown", 82 | "metadata": {}, 83 | "source": [ 84 | "Just under three minutes on my laptop -- not bad at all. It is worth noting the total CPU time, which came in at over five minutes, demonstrating that the implementation is making good use of parallelism, especially considering this is running on a two core processor.\n", 85 | "\n", 86 | "Next up is NMF. In this case we need a few extra parameters for the sklearn implementation. By default the sklearn NMF uses Frobenius loss -- essentially the total squared error between the data matrix and the reconstruction from the product of two low rank matrices (with positive entries). While this is suitable for many uses it isn't the right loss for topic modeling. Instead we want to use the Kullback-Leibler loss, which models the data as a set of independent Poisson distributions -- essentially it views the data as counts (which they are), and seeks the reconstruction from the product of two low rank matrices to provide Poisson parameters that maximise the likelihood of seeing the data. Having changed the loss function we also need to change the solver from the classical coordinate descent to the multiplicative update based solver which can work with KL loss. All of this makes the NMF fitting process much slower, but it provides more accurate results for the purposes of topic modelling."
87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 5, 92 | "metadata": {}, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "CPU times: user 3min 47s, sys: 1min 27s, total: 5min 15s\n", 99 | "Wall time: 3min 46s\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "%%time\n", 105 | "nmf_model = NMF(n_components=20, beta_loss='kullback-leibler', solver='mu').fit(data)" 106 | ] 107 | }, 108 | { 109 | "cell_type": "markdown", 110 | "metadata": {}, 111 | "source": [ 112 | "Around three and half minutes, so slower than LDA in this case. In general, especially for larger datasets than this, NMF will often tend to be as fast or sometimes even faster than LDA. In this case, however, it is a little slower. It's again worth noting the CPU time: over five minutes. Again, the implementation is making good use of parallelism on the two core processor.\n", 113 | "\n", 114 | "Next let's try EnsembleTopics. In this case we will specify ``n_components=20`` as with LDA and NMF, but this time that is more of a suggestion. EnsembleTopics will attempt to find a \"natural\" number of topics. Given that this is a small dataset we will also reduce the overall work to be done via the ``n_starts`` parameter, which specifies how many bootstrap runs of pLSA to try; for small data like this eight runs will likely suffice rather than the default 15. It is also beneficial to scale the parallelism a little -- since the processor only has two cores it is best not to overtax it with too many jobs at once." 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 6, 120 | "metadata": {}, 121 | "outputs": [ 122 | { 123 | "name": "stdout", 124 | "output_type": "stream", 125 | "text": [ 126 | "CPU times: user 7min 12s, sys: 4.28 s, total: 7min 17s\n", 127 | "Wall time: 3min 3s\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "%%time\n", 133 | "ens_model = EnsembleTopics(n_components=20, n_starts=8, n_jobs=2).fit(data)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Coming in at just over three minutes this among the slower of the options we've tried here. On the other hand, as with NMF, EnsembleTopics will scale up well, and would likely look better on larger datasets. It is, however, in the right ballpark, which is somewhat reassuring. When we look at how well the topic modeling performed on the data this extra time might seem more worthwhile.\n", 141 | "\n", 142 | "Lastly let's look at pLSA. Historically pLSA is a precursor to LDA which came out a couple of years later and added Bayesian priors and more robust statistical foundations. On the other hand the pLSA algorithm itself is surprisingly simple, and with a little care high performance implementations are not hard to write. Given an efficient Expectation-Maximization optimizer it can potentially even find better solutions than a somewhat more complex LDA optimization. Using pLSA from enstop is just as easy as LDA in sklearn -- tell it the number of topics you want and set it going." 
143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 7, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "CPU times: user 26.3 s, sys: 325 ms, total: 26.6 s\n", 155 | "Wall time: 14.7 s\n" 156 | ] 157 | } 158 | ], 159 | "source": [ 160 | "%%time\n", 161 | "plsa_model = PLSA(n_components=20).fit(data)" 162 | ] 163 | }, 164 | { 165 | "cell_type": "markdown", 166 | "metadata": {}, 167 | "source": [ 168 | "A mere fifteen seconds! Decidedly faster than LDA, and it will scale well in terms of dataset size (but may scale less well in the desired number of topics). At the very least, given its speed, pLSA is a contender in the topic modeling space. Also worth noting is that this performance was achieved despite being a completely serial implementation -- the CPU time is the same as the wall time in this case.\n", 169 | "\n", 170 | "Now, having looked at how long it takes the algorithms to run, the next question is: how good are they? A fast algorithm that does a poor job is not worth much." 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "## Quality measures of topic models\n", 178 | "\n", 179 | "There are numerous ways to measure quality in topic models, including perplexity based approaches, lift, and coherence. Each technique has its pros and cons, as with any unsupervised task evaluation. We will attempt to sidestep some of these issues by evaluating the topic modeling approaches at a downstream task -- how well does the topic space categorise the different documents. Since the documents have defined labels (which newsgroup they were posted to) we have ground-truth to compare to. Since we can express the documents in terms of the learned topic space we can \"classify\" a document as the strongest topic associated to that document. Given two classifications we can then score how well these match via [adjusted Rand score](https://en.wikipedia.org/wiki/Rand_index#Adjusted_Rand_index) or [adjusted mutual information](https://en.wikipedia.org/wiki/Adjusted_mutual_information). Fortunately sklearn has implementations for both metrics. We'll also load numpy so we can extract the index of the most likely topic for each document." 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 8, 185 | "metadata": {}, 186 | "outputs": [], 187 | "source": [ 188 | "from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score\n", 189 | "import numpy as np" 190 | ] 191 | }, 192 | { 193 | "cell_type": "markdown", 194 | "metadata": {}, 195 | "source": [ 196 | "Next we need to place the documents in topic space. For NMF and LDA we can use the transform function. We could do the same for EnsembleTopics and pLSA, but since they store the document embedding in topic space of the training set as the ``embedding_`` attribute we can save work and just use that. Next we need to determine which topic is the most likely for each document -- this is just a matter of computing the argmax for each row of the embedded document matrix." 
197 | ] 198 | }, 199 | { 200 | "cell_type": "code", 201 | "execution_count": 9, 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "nmf_doc_vectors = nmf_model.transform(data)\n", 206 | "nmf_clusters = np.argmax(nmf_doc_vectors, axis=1)\n", 207 | "lda_doc_vectors = lda_model.transform(data)\n", 208 | "lda_clusters = np.argmax(lda_doc_vectors, axis=1)\n", 209 | "ens_doc_vectors = ens_model.embedding_\n", 210 | "ens_clusters = np.argmax(ens_doc_vectors, axis=1)\n", 211 | "plsa_doc_vectors = plsa_model.embedding_\n", 212 | "plsa_clusters = np.argmax(plsa_doc_vectors, axis=1)" 213 | ] 214 | }, 215 | { 216 | "cell_type": "markdown", 217 | "metadata": {}, 218 | "source": [ 219 | "Now that we have classified the documents according to the topic space we can consider how well that classification compares to the actual ground-truth classification. Both adjusted Rand score and adjusted mutual information provide scores between 0 and 1 such that 0 represents an essentially random assignment (in comparison to the ground truth) and 1 represents a perfect matching with the ground truth. Obviosuly higher scores are better.\n", 220 | "\n", 221 | "We'll start with NMF." 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 10, 227 | "metadata": {}, 228 | "outputs": [ 229 | { 230 | "name": "stdout", 231 | "output_type": "stream", 232 | "text": [ 233 | "NMF Adjusted Rand: 0.151495442288548\n", 234 | "NMF Adjusted Mutual Information: 0.322145856972107\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "print(\"NMF Adjusted Rand: \", adjusted_rand_score(news.target, nmf_clusters))\n", 240 | "print(\"NMF Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, nmf_clusters))" 241 | ] 242 | }, 243 | { 244 | "cell_type": "markdown", 245 | "metadata": {}, 246 | "source": [ 247 | "These scores are not great -- but the 20-newsgroups dataset is somewhat nontrivial (with several closely related newsgroups). Still, we can hope that some of the other techniques may have fared better.\n", 248 | "\n", 249 | "Next let's look at how the LDA model performed." 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 11, 255 | "metadata": {}, 256 | "outputs": [ 257 | { 258 | "name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "LDA Adjusted Rand: 0.22301812908887647\n", 262 | "LDA Adjusted Mutual Information: 0.3660410130009368\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "print(\"LDA Adjusted Rand: \", adjusted_rand_score(news.target, lda_clusters))\n", 268 | "print(\"LDA Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, lda_clusters))" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "A definite improvement over NMF, but still below what we might ideally like. On the other hand LDA is considered the go-to state-of-the-art technique for topic modeling, so perhaps this is the best we can hope to do with this corpus and the (rather limited) amount of text-preprocessing we have done.\n", 276 | "\n", 277 | "Let's try pLSA next and see how it managed to do, given that it ran so very quickly." 
278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 12, 283 | "metadata": {}, 284 | "outputs": [ 285 | { 286 | "name": "stdout", 287 | "output_type": "stream", 288 | "text": [ 289 | "pLSA Adjusted Rand: 0.2764225648913671\n", 290 | "pLSA Adjusted Mutual Information: 0.43413462309828155\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "print(\"pLSA Adjusted Rand: \", adjusted_rand_score(news.target, plsa_clusters))\n", 296 | "print(\"pLSA Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, plsa_clusters))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "It seems that pLSA outperformed LDA on this particular task, and by a reasonable amount. While the rand score is still fairly low the mutual information indicates that we are almost getting into a range that might be considered reasonable.\n", 304 | "\n", 305 | "Finally let's see what the extra work of ensembling can buy us." 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": 13, 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "EnsTop Adjusted Rand: 0.33676056267373145\n", 318 | "EnsTop Adjusted Mutual Information: 0.47842663849608985\n" 319 | ] 320 | } 321 | ], 322 | "source": [ 323 | "print(\"EnsTop Adjusted Rand: \", adjusted_rand_score(news.target, ens_clusters))\n", 324 | "print(\"EnsTop Adjusted Mutual Information: \", adjusted_mutual_info_score(news.target, ens_clusters))" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": {}, 330 | "source": [ 331 | "It seems that running several pLSA runs and looking for stable topics actually manages to produce much better topics, at least for classifying the 20-newsgroups posts. The even better news is that, despite getting the benefits of the pLSA approach, since the ensemble is built from bootstrap samples of the corpus we actually expect this to generalise better than the pure pLSA approach.\n", 332 | "\n", 333 | "We claim that, at least for this small example, EnsembleTopics is clearly the best approach for topic modeling." 
334 | ] 335 | } 336 | ], 337 | "metadata": { 338 | "kernelspec": { 339 | "display_name": "Python 3", 340 | "language": "python", 341 | "name": "python3" 342 | }, 343 | "language_info": { 344 | "codemirror_mode": { 345 | "name": "ipython", 346 | "version": 3 347 | }, 348 | "file_extension": ".py", 349 | "mimetype": "text/x-python", 350 | "name": "python", 351 | "nbconvert_exporter": "python", 352 | "pygments_lexer": "ipython3", 353 | "version": "3.7.5" 354 | } 355 | }, 356 | "nbformat": 4, 357 | "nbformat_minor": 2 358 | } 359 | -------------------------------------------------------------------------------- /enstop/block_parallel_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | from sklearn.utils.validation import _check_sample_weight 7 | from scipy.sparse import issparse, csr_matrix, coo_matrix 8 | 9 | from enstop.utils import ( 10 | normalize, 11 | coherence, 12 | mean_coherence, 13 | log_lift, 14 | mean_log_lift, 15 | standardize_input, 16 | ) 17 | from enstop.plsa import plsa_init 18 | 19 | 20 | @numba.njit( 21 | [ 22 | "f4[:,::1](i4[::1],i4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4)", 23 | "f4[:,::1](i4[::1],i4[::1],f4[:,:],f4[:,::1],f4[:,::1],f4)", 24 | ], 25 | locals={ 26 | "k": numba.types.intp, 27 | "w": numba.types.uint32, 28 | "d": numba.types.uint32, 29 | "z": numba.types.uint16, 30 | "v": numba.types.float32, 31 | "nz_idx": numba.types.uint32, 32 | "norm": numba.types.float32, 33 | }, 34 | fastmath=True, 35 | nogil=True, 36 | ) 37 | def plsa_e_step_on_a_block( 38 | block_rows, 39 | block_cols, 40 | p_w_given_z_block, 41 | p_z_given_d_block, 42 | p_z_given_wd_block, 43 | probability_threshold=1e-32, 44 | ): 45 | k = p_w_given_z_block.shape[0] 46 | 47 | for nz_idx in range(block_rows.shape[0]): 48 | if block_rows[nz_idx] < 0: 49 | break 50 | 51 | d = block_rows[nz_idx] 52 | w = block_cols[nz_idx] 53 | 54 | norm = 0.0 55 | for z in range(k): 56 | v = p_w_given_z_block[z, w] * p_z_given_d_block[d, z] 57 | if v > probability_threshold: 58 | p_z_given_wd_block[nz_idx, z] = v 59 | norm += v 60 | else: 61 | p_z_given_wd_block[nz_idx, z] = 0.0 62 | for z in range(k): 63 | if norm > 0: 64 | p_z_given_wd_block[nz_idx, z] /= norm 65 | 66 | return p_z_given_wd_block 67 | 68 | 69 | @numba.njit( 70 | [ 71 | "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1])", 72 | "void(i4[::1],i4[::1],f4[::1],f4[:,:],f4[:,:],f4[:,::1],f4[::1],f4[::1])", 73 | ], 74 | locals={ 75 | "k": numba.types.intp, 76 | "w": numba.types.uint32, 77 | "d": numba.types.uint32, 78 | "x": numba.types.float32, 79 | "z": numba.types.uint16, 80 | "nz_idx": numba.types.uint32, 81 | "s": numba.types.float32, 82 | }, 83 | fastmath=True, 84 | nogil=True, 85 | ) 86 | def plsa_partial_m_step_on_a_block( 87 | block_rows, 88 | block_cols, 89 | block_vals, 90 | p_w_given_z_block, 91 | p_z_given_d_block, 92 | p_z_given_wd_block, 93 | norm_pwz, 94 | norm_pdz_block, 95 | ): 96 | k = p_w_given_z_block.shape[0] 97 | 98 | for nz_idx in range(block_rows.shape[0]): 99 | if block_rows[nz_idx] < 0: 100 | break 101 | 102 | d = block_rows[nz_idx] 103 | w = block_cols[nz_idx] 104 | x = block_vals[nz_idx] 105 | 106 | for z in range(k): 107 | s = x * p_z_given_wd_block[nz_idx, z] 108 | 109 | p_w_given_z_block[z, w] += s 110 | p_z_given_d_block[d, z] += s 111 | 112 | norm_pwz[z] += s 113 | norm_pdz_block[d] += s 114 | 115 
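# The two kernels above each operate on a single (document-block, word-block) tile
# of the sparse matrix; entries padded with a -1 row index mark the end of a tile.
#   * plsa_e_step_on_a_block: for every stored non-zero (d, w) it computes
#     P(z|w,d) proportional to P(w|z) * P(z|d), writing exact zeros for values
#     below the threshold and normalising over the topics z.
#   * plsa_partial_m_step_on_a_block: accumulates X_{d,w} * P(z|w,d) into the
#     blocked copies of P(w|z) and P(z|d), together with the normalising sums
#     norm_pwz and norm_pdz, which are combined and applied once every tile has
#     been processed (see plsa_em_step_by_blocks below).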
| 116 | @numba.njit( 117 | "void(i4[:,:,::1],i4[:,:,::1],f4[:,:,::1],f4[:,:,::1],f4[:,:,::1],f4[:,:,:,::1]," 118 | "f4[:,:,:,::1],f4[:,:,:,::1],f4[:,::1],f4[:,:,::1],f4)", 119 | locals={ 120 | "k": numba.types.intp, 121 | "z": numba.types.uint16, 122 | "d": numba.types.uint32, 123 | "i": numba.types.uint16, 124 | "j": numba.types.uint16, 125 | "n_w_blocks": numba.types.intp, 126 | "n_d_blocks": numba.types.intp, 127 | }, 128 | parallel=True, 129 | fastmath=True, 130 | nogil=True, 131 | ) 132 | def plsa_em_step_by_blocks( 133 | block_rows_ndarray, 134 | block_cols_ndarray, 135 | block_vals_ndarray, 136 | prev_p_w_given_z, 137 | prev_p_z_given_d, 138 | blocked_next_p_w_given_z, 139 | blocked_next_p_z_given_d, 140 | p_z_given_wd_block, 141 | blocked_norm_pwz, 142 | blocked_norm_pdz, 143 | e_step_thresh=1e-32, 144 | ): 145 | n_d_blocks = block_rows_ndarray.shape[0] 146 | n_w_blocks = block_rows_ndarray.shape[1] 147 | 148 | # n = prev_p_z_given_d.shape[0] 149 | # m = prev_p_w_given_z.shape[1] 150 | k = prev_p_z_given_d.shape[2] 151 | 152 | # zero out the norms for recomputation 153 | blocked_norm_pdz[:] = 0.0 154 | blocked_norm_pwz[:] = 0.0 155 | 156 | for i in numba.prange(n_d_blocks): 157 | 158 | for j in numba.prange(n_w_blocks): 159 | block_rows = block_rows_ndarray[i, j] 160 | block_cols = block_cols_ndarray[i, j] 161 | block_vals = block_vals_ndarray[i, j] 162 | 163 | plsa_e_step_on_a_block( 164 | block_rows, 165 | block_cols, 166 | prev_p_w_given_z[j], 167 | prev_p_z_given_d[i], 168 | p_z_given_wd_block[i, j], 169 | np.float32(e_step_thresh), 170 | ) 171 | plsa_partial_m_step_on_a_block( 172 | block_rows, 173 | block_cols, 174 | block_vals, 175 | blocked_next_p_w_given_z[i, j], 176 | blocked_next_p_z_given_d[j, i], 177 | p_z_given_wd_block[i, j], 178 | blocked_norm_pwz[i], 179 | blocked_norm_pdz[j, i], 180 | ) 181 | 182 | prev_p_z_given_d[:] = blocked_next_p_z_given_d.sum(axis=0) 183 | norm_pdz = blocked_norm_pdz.sum(axis=0) 184 | prev_p_w_given_z[:] = blocked_next_p_w_given_z.sum(axis=0) 185 | norm_pwz = blocked_norm_pwz.sum(axis=0) 186 | 187 | # Once complete we can normalize to complete the M step 188 | for z in numba.prange(k): 189 | if norm_pwz[z] > 0: 190 | for w_block in range(prev_p_w_given_z.shape[0]): 191 | for w_offset in range(prev_p_w_given_z.shape[2]): 192 | prev_p_w_given_z[w_block, z, w_offset] /= norm_pwz[z] 193 | for d_block in range(prev_p_z_given_d.shape[0]): 194 | for d_offset in range(prev_p_z_given_d.shape[1]): 195 | if norm_pdz[d_block, d_offset] > 0: 196 | prev_p_z_given_d[d_block, d_offset, z] /= norm_pdz[ 197 | d_block, d_offset 198 | ] 199 | 200 | # Zero out the old matrices these matrices for next time 201 | blocked_next_p_z_given_d[:] = 0.0 202 | blocked_next_p_w_given_z[:] = 0.0 203 | 204 | 205 | @numba.njit( 206 | locals={ 207 | "i": numba.types.uint16, 208 | "j": numba.types.uint16, 209 | "k": numba.types.intp, 210 | "w": numba.types.uint32, 211 | "d": numba.types.uint32, 212 | "z": numba.types.uint16, 213 | "nz_idx": numba.types.uint32, 214 | "x": numba.types.float32, 215 | "result": numba.types.float32, 216 | "p_w_given_d": numba.types.float32, 217 | }, 218 | fastmath=True, 219 | nogil=True, 220 | parallel=True, 221 | ) 222 | def log_likelihood_by_blocks( 223 | block_rows_ndarray, 224 | block_cols_ndarray, 225 | block_vals_ndarray, 226 | p_w_given_z, 227 | p_z_given_d, 228 | ): 229 | result = 0.0 230 | k = p_z_given_d.shape[2] 231 | 232 | for i in numba.prange(block_rows_ndarray.shape[0]): 233 | for j in range(block_rows_ndarray.shape[1]): 234 | for 
nz_idx in range(block_rows_ndarray.shape[2]): 235 | if block_rows_ndarray[i, j, nz_idx] < 0: 236 | break 237 | 238 | d = block_rows_ndarray[i, j, nz_idx] 239 | w = block_cols_ndarray[i, j, nz_idx] 240 | x = block_vals_ndarray[i, j, nz_idx] 241 | 242 | p_w_given_d = 0.0 243 | for z in range(k): 244 | p_w_given_d += p_w_given_z[j, z, w] * p_z_given_d[i, d, z] 245 | 246 | result += x * np.log(p_w_given_d) 247 | 248 | return result 249 | 250 | 251 | @numba.njit(fastmath=True, nogil=True) 252 | def plsa_fit_inner_blockwise( 253 | block_rows_ndarray, 254 | block_cols_ndarray, 255 | block_vals_ndarray, 256 | p_w_given_z, 257 | p_z_given_d, 258 | block_row_size, 259 | block_col_size, 260 | n_iter=100, 261 | n_iter_per_test=10, 262 | tolerance=0.001, 263 | e_step_thresh=1e-32, 264 | ): 265 | k = p_z_given_d.shape[2] 266 | 267 | n_d_blocks = block_rows_ndarray.shape[0] 268 | n_w_blocks = block_rows_ndarray.shape[1] 269 | block_size = block_rows_ndarray.shape[2] 270 | 271 | p_z_given_wd_block = np.zeros( 272 | (n_d_blocks, n_w_blocks, block_size, k), dtype=np.float32 273 | ) 274 | 275 | blocked_next_p_w_given_z = np.zeros( 276 | ( 277 | np.int64(n_d_blocks), 278 | np.int64(n_w_blocks), 279 | np.int64(k), 280 | np.int64(block_col_size), 281 | ), 282 | dtype=np.float32, 283 | ) 284 | blocked_norm_pwz = np.zeros((n_d_blocks, k), dtype=np.float32) 285 | blocked_next_p_z_given_d = np.zeros( 286 | ( 287 | np.int64(n_w_blocks), 288 | np.int64(n_d_blocks), 289 | np.int64(block_row_size), 290 | np.int64(k), 291 | ), 292 | dtype=np.float32, 293 | ) 294 | blocked_norm_pdz = np.zeros( 295 | (np.int64(n_w_blocks), np.int64(n_d_blocks), np.int64(block_row_size)), 296 | dtype=np.float32, 297 | ) 298 | 299 | previous_log_likelihood = log_likelihood_by_blocks( 300 | block_rows_ndarray, 301 | block_cols_ndarray, 302 | block_vals_ndarray, 303 | p_w_given_z, 304 | p_z_given_d, 305 | ) 306 | 307 | for i in range(n_iter): 308 | plsa_em_step_by_blocks( 309 | block_rows_ndarray, 310 | block_cols_ndarray, 311 | block_vals_ndarray, 312 | p_w_given_z, 313 | p_z_given_d, 314 | blocked_next_p_w_given_z, 315 | blocked_next_p_z_given_d, 316 | p_z_given_wd_block, 317 | blocked_norm_pwz, 318 | blocked_norm_pdz, 319 | e_step_thresh, 320 | ) 321 | 322 | if i % n_iter_per_test == 0: 323 | current_log_likelihood = log_likelihood_by_blocks( 324 | block_rows_ndarray, 325 | block_cols_ndarray, 326 | block_vals_ndarray, 327 | p_w_given_z, 328 | p_z_given_d, 329 | ) 330 | change = np.abs(current_log_likelihood - previous_log_likelihood) 331 | if change / np.abs(current_log_likelihood) < tolerance: 332 | break 333 | else: 334 | previous_log_likelihood = current_log_likelihood 335 | 336 | return p_z_given_d, p_w_given_z 337 | 338 | 339 | def plsa_fit( 340 | X, 341 | k, 342 | n_row_blocks=8, 343 | n_col_blocks=8, 344 | init="random", 345 | n_iter=100, 346 | n_iter_per_test=10, 347 | tolerance=0.001, 348 | e_step_thresh=1e-32, 349 | random_state=None, 350 | ): 351 | rng = check_random_state(random_state) 352 | p_z_given_d_init, p_w_given_z_init = plsa_init(X, k, init=init, rng=rng) 353 | 354 | A = X.tocsr().astype(np.float32) 355 | 356 | n = A.shape[0] 357 | m = A.shape[1] 358 | 359 | block_row_size = np.uint16(np.ceil(A.shape[0] / n_row_blocks)) 360 | block_col_size = np.uint16(np.ceil(A.shape[1] / n_col_blocks)) 361 | 362 | p_z_given_d = np.zeros((block_row_size * n_row_blocks, k), dtype=np.float32) 363 | p_z_given_d[: p_z_given_d_init.shape[0]] = p_z_given_d_init 364 | p_z_given_d = p_z_given_d.reshape(n_row_blocks, block_row_size, k) 365 | 
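    # The initial P(z|d) has now been zero-padded to a multiple of the block size and
    # reshaped so that document-block i lives in p_z_given_d[i]. The same padding and
    # blocking is applied column-wise to P(w|z) below, and each COO block is padded
    # with -1 row indices so the numba kernels know where its non-zeros end.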
366 | p_w_given_z = np.zeros((k, block_col_size * n_col_blocks), dtype=np.float32) 367 | p_w_given_z[:, : p_w_given_z_init.shape[1]] = p_w_given_z_init 368 | # p_w_given_z = np.transpose( 369 | # p_w_given_z.T.reshape(n_col_blocks, block_col_size, k), axes=[0, 2, 1] 370 | # ).astype(np.float32, order="C") 371 | p_w_given_z = np.stack(np.hsplit(p_w_given_z, n_col_blocks)) 372 | 373 | A_blocks = [[0] * n_col_blocks for i in range(n_row_blocks)] 374 | max_nnz_per_block = 0 375 | for i in range(n_row_blocks): 376 | 377 | row_start = block_row_size * i 378 | row_end = min(row_start + block_row_size, n) 379 | 380 | for j in range(n_col_blocks): 381 | 382 | col_start = block_col_size * j 383 | col_end = min(col_start + block_col_size, m) 384 | 385 | A_blocks[i][j] = A[row_start:row_end, col_start:col_end].tocoo() 386 | if A_blocks[i][j].nnz > max_nnz_per_block: 387 | max_nnz_per_block = A_blocks[i][j].nnz 388 | 389 | block_rows_ndarray = np.full( 390 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 391 | ) 392 | block_cols_ndarray = np.full( 393 | (n_row_blocks, n_col_blocks, max_nnz_per_block), -1, dtype=np.int32 394 | ) 395 | block_vals_ndarray = np.zeros( 396 | (n_row_blocks, n_col_blocks, max_nnz_per_block), dtype=np.float32 397 | ) 398 | for i in range(n_row_blocks): 399 | for j in range(n_col_blocks): 400 | nnz = A_blocks[i][j].nnz 401 | block_rows_ndarray[i, j, :nnz] = A_blocks[i][j].row 402 | block_cols_ndarray[i, j, :nnz] = A_blocks[i][j].col 403 | block_vals_ndarray[i, j, :nnz] = A_blocks[i][j].data 404 | 405 | p_z_given_d, p_w_given_z = plsa_fit_inner_blockwise( 406 | block_rows_ndarray, 407 | block_cols_ndarray, 408 | block_vals_ndarray, 409 | p_w_given_z, 410 | p_z_given_d, 411 | block_row_size, 412 | block_col_size, 413 | n_iter=n_iter, 414 | n_iter_per_test=n_iter_per_test, 415 | tolerance=tolerance, 416 | e_step_thresh=e_step_thresh, 417 | ) 418 | p_z_given_d = np.vstack(p_z_given_d)[:n, :] 419 | p_w_given_z = np.hstack(p_w_given_z)[:, :m] 420 | 421 | return p_z_given_d, p_w_given_z 422 | 423 | 424 | class BlockParallelPLSA(BaseEstimator, TransformerMixin): 425 | def __init__( 426 | self, 427 | n_components=10, 428 | init="random", 429 | n_row_blocks=8, 430 | n_col_blocks=8, 431 | n_iter=100, 432 | n_iter_per_test=10, 433 | tolerance=0.001, 434 | e_step_thresh=1e-32, 435 | transform_random_seed=42, 436 | random_state=None, 437 | ): 438 | 439 | self.n_components = n_components 440 | self.init = init 441 | self.n_row_blocks = n_row_blocks 442 | self.n_col_blocks = n_col_blocks 443 | self.n_iter = n_iter 444 | self.n_iter_per_test = n_iter_per_test 445 | self.tolerance = tolerance 446 | self.e_step_thresh = e_step_thresh 447 | self.transform_random_seed = transform_random_seed 448 | self.random_state = random_state 449 | 450 | def fit(self, X, y=None, sample_weight=None): 451 | """Learn the pLSA model for the data X and return the document vectors. 452 | 453 | This is more efficient than calling fit followed by transform. 454 | 455 | Parameters 456 | ---------- 457 | X: array or sparse matrix of shape (n_docs, n_words) 458 | The data matrix pLSA is attempting to fit to. 459 | 460 | y: Ignored 461 | 462 | sample_weight: array of shape (n_docs,) 463 | Input document weights. 464 | 465 | Returns 466 | ------- 467 | self 468 | """ 469 | self.fit_transform(X, sample_weight=sample_weight) 470 | return self 471 | 472 | def fit_transform(self, X, y=None, sample_weight=None): 473 | """Learn the pLSA model for the data X and return the document vectors. 
474 | 475 | This is more efficient than calling fit followed by transform. 476 | 477 | Parameters 478 | ---------- 479 | X: array or sparse matrix of shape (n_docs, n_words) 480 | The data matrix pLSA is attempting to fit to. 481 | 482 | y: Ignored 483 | 484 | sample_weight: array of shape (n_docs,) 485 | Input document weights. 486 | 487 | Returns 488 | ------- 489 | embedding: array of shape (n_docs, n_topics) 490 | An embedding of the documents into a topic space. 491 | """ 492 | 493 | X = check_array(X, accept_sparse="csr") 494 | X = standardize_input(X) 495 | 496 | if not issparse(X): 497 | X = csr_matrix(X) 498 | 499 | sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 500 | 501 | if np.any(X.data < 0): 502 | raise ValueError( 503 | "PLSA is only valid for matrices with non-negative " "entries" 504 | ) 505 | 506 | row_sums = np.array(X.sum(axis=1).T)[0] 507 | good_rows = row_sums != 0 508 | 509 | if not np.all(good_rows): 510 | zero_rows_found = True 511 | data_for_fitting = X[good_rows] 512 | else: 513 | zero_rows_found = False 514 | data_for_fitting = X 515 | 516 | U, V = plsa_fit( 517 | data_for_fitting, 518 | self.n_components, 519 | n_row_blocks=self.n_row_blocks, 520 | n_col_blocks=self.n_col_blocks, 521 | init=self.init, 522 | n_iter=self.n_iter, 523 | n_iter_per_test=self.n_iter_per_test, 524 | tolerance=self.tolerance, 525 | e_step_thresh=self.e_step_thresh, 526 | random_state=self.random_state, 527 | ) 528 | 529 | if zero_rows_found: 530 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 531 | self.embedding_[good_rows] = U 532 | else: 533 | self.embedding_ = U 534 | 535 | self.components_ = V 536 | self.training_data_ = X 537 | 538 | return self.embedding_ 539 | -------------------------------------------------------------------------------- /enstop/enstop_.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | import numba.cuda 4 | from warnings import warn 5 | from sklearn.base import BaseEstimator, TransformerMixin 6 | from sklearn.utils import check_array, check_random_state 7 | try: 8 | from sklearn.utils.validation import _check_sample_weight 9 | except ImportError: 10 | from enstop.utils import _check_sample_weight 11 | from sklearn.decomposition import NMF, non_negative_factorization 12 | from scipy.sparse import issparse, csr_matrix, coo_matrix 13 | import dask 14 | 15 | try: 16 | import joblib 17 | _HAVE_JOBLIB = True 18 | except ImportError: 19 | warn("Joblib could not be loaded; joblib parallelism will not be available") 20 | _HAVE_JOBLIB = False 21 | from hdbscan._hdbscan_linkage import mst_linkage_core, label 22 | from hdbscan.hdbscan_ import _tree_to_labels 23 | import hdbscan 24 | import umap 25 | 26 | # TODO: Once umap 0.4 is released enable this... 
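# As the TODO above notes, umap.distances.hellinger requires umap-learn >= 0.4;
# the commented-out numba implementation below can serve as a fallback for older
# versions.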
27 | from umap.distances import hellinger 28 | 29 | 30 | # @numba.njit() 31 | # def hellinger(x, y): 32 | # result = 0.0 33 | # l1_norm_x = 0.0 34 | # l1_norm_y = 0.0 35 | # 36 | # for i in range(x.shape[0]): 37 | # result += np.sqrt(x[i] * y[i]) 38 | # l1_norm_x += x[i] 39 | # l1_norm_y += y[i] 40 | # 41 | # if l1_norm_x == 0 and l1_norm_y == 0: 42 | # return 0.0 43 | # elif l1_norm_x == 0 or l1_norm_y == 0: 44 | # return 1.0 45 | # else: 46 | # return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y)) 47 | 48 | 49 | from enstop.utils import normalize, coherence, mean_coherence, log_lift, mean_log_lift 50 | from enstop.plsa import plsa_fit, plsa_refit 51 | 52 | if numba.cuda.is_available(): 53 | from enstop.cuda_plsa import plsa_fit as gpu_plsa_fit 54 | 55 | 56 | def plsa_topics(X, k, **kwargs): 57 | """Perform a boostrap sample from a corpus of documents and fit the sample using 58 | pLSA to give a set of topic vectors such that the (z,w) entry of the returned 59 | array is the probability P(w|z) of word w occuring given the zth topic. 60 | 61 | Parameters 62 | ---------- 63 | X: sparse matrix of shape (n_docs, n_words) 64 | The bag of words representation of the corpus of documents. 65 | 66 | k: int 67 | The number of topics to generate. 68 | 69 | kwargs: 70 | Further keyword arguments that can be passed on th the ``plsa_fit`` function. 71 | Possibilities include: 72 | * ``init`` 73 | * ``n_iter`` 74 | * ``n_iter_per_test`` 75 | * ``tolerance`` 76 | * ``e_step_threshold`` 77 | * ``random_state`` 78 | 79 | Returns 80 | ------- 81 | topics: array of shape (k, n_words) 82 | The topics generated from the bootstrap sample. 83 | """ 84 | A = X.tocsr() 85 | if kwargs.get("bootstrap", True): 86 | rng = check_random_state(kwargs.get("random_state", None)) 87 | bootstrap_sample_indices = rng.randint(0, A.shape[0], size=A.shape[0]) 88 | B = A[bootstrap_sample_indices] 89 | else: 90 | B = A 91 | sample_weight = _check_sample_weight(None, B, dtype=np.float32) 92 | if numba.cuda.is_available(): 93 | doc_topic, topic_vocab = gpu_plsa_fit( 94 | B, 95 | k, 96 | init=kwargs.get("init", "random"), 97 | n_iter=kwargs.get("n_iter", 100), 98 | n_iter_per_test=kwargs.get("n_iter_per_test", 10), 99 | tolerance=kwargs.get("tolerance", 0.001), 100 | e_step_thresh=kwargs.get("e_step_thresh", 1e-16), 101 | random_state=kwargs.get("random_state", None), 102 | ) 103 | else: 104 | doc_topic, topic_vocab = plsa_fit( 105 | B, 106 | k, 107 | sample_weight, 108 | init=kwargs.get("init", "random"), 109 | n_iter=kwargs.get("n_iter", 100), 110 | n_iter_per_test=kwargs.get("n_iter_per_test", 10), 111 | tolerance=kwargs.get("tolerance", 0.001), 112 | e_step_thresh=kwargs.get("e_step_thresh", 1e-16), 113 | random_state=kwargs.get("random_state", None), 114 | ) 115 | return topic_vocab 116 | 117 | 118 | def nmf_topics(X, k, **kwargs): 119 | """Perform a boostrap sample from a corpus of documents and fit the sample using 120 | NMF to give a set of topic vectors, normalized such that the(z,w) entry of the 121 | returned array is the probability P(w|z) of word w occuring given the zth topic. 122 | 123 | Parameters 124 | ---------- 125 | X: sparse matrix of shape (n_docs, n_words) 126 | The bag of words representation of the corpus of documents. 127 | 128 | k: int 129 | The number of topics to generate. 130 | 131 | kwargs: 132 | Further keyword arguments that can be passed on th the ``NMF`` class. 
133 | Possibilities include: 134 | * ``init`` 135 | * ``beta_loss`` 136 | * ``alpha`` 137 | * ``solver`` 138 | 139 | Returns 140 | ------- 141 | topics: array of shape (k, n_words) 142 | The topics generated from the bootstrap sample. 143 | """ 144 | A = X.tocsr() 145 | if kwargs.get("bootstrap", True): 146 | rng = check_random_state(kwargs.get("random_state", None)) 147 | bootstrap_sample_indices = rng.randint(0, A.shape[0], size=A.shape[0]) 148 | B = A[bootstrap_sample_indices] 149 | else: 150 | B = A 151 | nmf = NMF( 152 | n_components=k, 153 | init=kwargs.get("init", "nndsvd"), 154 | beta_loss=kwargs.get("beta_loss", 1), 155 | alpha=kwargs.get("alpha", 0.0), 156 | solver=kwargs.get("solver", "mu"), 157 | random_state=kwargs.get("random_state", None), 158 | ).fit(B) 159 | topics = nmf.components_.copy() 160 | normalize(topics, axis=1) 161 | return topics 162 | 163 | 164 | def ensemble_of_topics( 165 | X, k, model="plsa", n_jobs=4, n_runs=16, parallelism="dask", **kwargs 166 | ): 167 | """Generate a large number of topic vectors by running an ensemble of 168 | bootstrap samples of a given corpus. Exploit the embarrassingly parallel nature of the problem 169 | using wither joblib or dask. Support for both pLSA and NMF approaches to topic generation are 170 | available. The sklearn implementation of NMF is used for NMF modeling. 171 | 172 | Parameters 173 | ---------- 174 | X: sparse matrix of shape (n_docs, n_words) 175 | The bag-of-words matrix for the corpus to train on 176 | 177 | k: int 178 | The number of topics to generate per bootstrap sampled run. 179 | 180 | model: string (optional, default="plsa") 181 | The topic modeling method to use (either "plsa" or "nmf") 182 | 183 | n_jobs: int (optional, default=4) 184 | The number of jobs to run in parallel. 185 | 186 | n_runs: int (optional, default=16) 187 | The number of bootstrapped sampled runs to use for topic generation. 188 | 189 | parallelism: string (optional, default="dask") 190 | The parallelism model to use. Should be one of "dask" or "joblib". 191 | 192 | kwargs: 193 | Extra keyword based arguments to pass on to the pLSA or NMF models. 194 | 195 | Returns 196 | ------- 197 | topics: array of shape (n_runs * k, n_words) 198 | The full set of all topics generated by all the topic modeling runs. 
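
    Examples
    --------
    A minimal usage sketch; the 20-newsgroups fetch is purely illustrative (it
    mirrors the example notebook shipped with this repository), and in most
    cases ``ensemble_fit`` or ``EnsembleTopics`` will call this function for you:

    >>> from sklearn.datasets import fetch_20newsgroups
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> news = fetch_20newsgroups(subset='all')
    >>> X = CountVectorizer(min_df=5, stop_words='english').fit_transform(news.data)
    >>> all_topics = ensemble_of_topics(X.tocoo(), 20, model="plsa",
    ...                                 n_jobs=2, n_runs=8, parallelism="dask")
    >>> all_topics.shape[0]  # n_runs * k topic vectors, one block of k per run
    160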
199 | 200 | """ 201 | 202 | if model == "plsa": 203 | create_topics = plsa_topics 204 | elif model == "nmf": 205 | create_topics = nmf_topics 206 | else: 207 | raise ValueError('Model must be one of "plsa" or "nmf"') 208 | 209 | if parallelism == "dask": 210 | dask_topics = dask.delayed(create_topics) 211 | staged_topics = [dask_topics(X, k, **kwargs) for i in range(n_runs)] 212 | topics = dask.compute(*staged_topics, scheduler="threads", num_workers=n_jobs) 213 | elif parallelism == "joblib" and _HAVE_JOBLIB: 214 | joblib_topics = joblib.delayed(create_topics) 215 | topics = joblib.Parallel(n_jobs=n_jobs, prefer="threads")( 216 | joblib_topics(X, k, **kwargs) for i in range(n_runs) 217 | ) 218 | elif parallelism == "joblib" and not _HAVE_JOBLIB: 219 | raise ValueError("Joblib was not correctly imported and is unavailable") 220 | elif parallelism == "none": 221 | topics = [] 222 | for i in range(n_runs): 223 | topics.append(create_topics(X, k, **kwargs)) 224 | else: 225 | raise ValueError( 226 | "Unrecognized parallelism {}; should be one of {}".format( 227 | parallelism, ("dask", "joblib") 228 | ) 229 | ) 230 | 231 | return np.vstack(topics) 232 | 233 | 234 | @numba.njit(fastmath=True, nogil=True) 235 | def kl_divergence(a, b): 236 | """Compute the KL-divergence between two multinomial distributions.""" 237 | result = 0.0 238 | for i in range(a.shape[0]): 239 | if a[i] > 0.0 and b[i] > 0.0: 240 | result += a[i] * (np.log2(a[i]) - np.log2(b[i])) 241 | return result 242 | 243 | 244 | @numba.njit(fastmath=True, parallel=True) 245 | def all_pairs_kl_divergence(distributions): 246 | """Compute all pairwise KL-divergences between a set of multinomial distributions.""" 247 | n = distributions.shape[0] 248 | result = np.zeros((n, n)) 249 | for i in range(n): 250 | for j in range(n): 251 | result[i, j] = kl_divergence(distributions[i], distributions[j]) 252 | return result 253 | 254 | 255 | @numba.njit(fastmath=True, parallel=True) 256 | def all_pairs_hellinger_distance(distributions): 257 | """Compute all pairwise Hellinger distances between a set of multinomial distributions.""" 258 | n = distributions.shape[0] 259 | result = np.zeros((n, n)) 260 | for i in range(n): 261 | for j in range(n): 262 | result[i, j] = hellinger(distributions[i], distributions[j]) 263 | return result 264 | 265 | 266 | def generate_combined_topics_kl(all_topics, min_samples=5, min_cluster_size=5): 267 | """Given a large list of topics select out a small list of stable topics 268 | by clustering the topics with HDBSCAN using KL-divergence as a distance 269 | measure between topics. 270 | 271 | 272 | Parameters 273 | ---------- 274 | all_topics: array of shape (N, n_words) 275 | The set of topics to be clustered. 276 | 277 | min_samples: int (optional, default=5) 278 | The min_samples parameter to use for HDBSCAN clustering. 279 | 280 | min_cluster_size: int (optional, default=5) 281 | The min_cluster_size parameter to use for HDBSCAN clustering 282 | 283 | Returns 284 | ------- 285 | stable_topics: array of shape (M, n_words) 286 | A set of M topics, one for each cluster found by HDBSCAN. 
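
    Notes
    -----
    Because KL-divergence is neither symmetric nor a metric, HDBSCAN is not
    applied to it directly. Instead a mutual-reachability matrix is built by
    taking, for each pair of topics, the maximum of the two directed divergences
    and of each topic's core divergence (its divergence to its
    ``min_samples``-th nearest neighbour), and clusters are extracted from the
    single-linkage tree of that matrix using leaf cluster selection. Each
    cluster is then summarised by averaging its member topics in square-root
    space, squaring, and renormalising.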
287 | """ 288 | divergence_matrix = all_pairs_kl_divergence(all_topics) 289 | core_divergences = np.sort(divergence_matrix, axis=1)[:, min_samples] 290 | tiled_core_divergences = np.tile(core_divergences, (core_divergences.shape[0], 1)) 291 | mutual_reachability = np.dstack( 292 | [ 293 | divergence_matrix, 294 | divergence_matrix.T, 295 | tiled_core_divergences, 296 | tiled_core_divergences.T, 297 | ] 298 | ).max(axis=-1) 299 | mst_data = mst_linkage_core(mutual_reachability) 300 | mst_order = np.argsort(mst_data.T[2]) 301 | mst_data = mst_data[mst_order] 302 | single_linkage_tree = label(mst_data) 303 | labels, probs, stabs, ctree, stree = _tree_to_labels( 304 | all_topics, 305 | single_linkage_tree, 306 | min_cluster_size=min_cluster_size, 307 | cluster_selection_method="leaf", 308 | ) 309 | result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32) 310 | for i in range(labels.max() + 1): 311 | result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2 312 | result[i] /= result[i].sum() 313 | 314 | return result 315 | 316 | 317 | def generate_combined_topics_hellinger(all_topics, min_samples=5, min_cluster_size=5): 318 | """Given a large list of topics select out a small list of stable topics 319 | by clustering the topics with HDBSCAN using Hellinger as a distance 320 | measure between topics. 321 | 322 | 323 | Parameters 324 | ---------- 325 | all_topics: array of shape (N, n_words) 326 | The set of topics to be clustered. 327 | 328 | min_samples: int (optional, default=5) 329 | The min_samples parameter to use for HDBSCAN clustering. 330 | 331 | min_cluster_size: int (optional, default=5) 332 | The min_cluster_size parameter to use for HDBSCAN clustering 333 | 334 | Returns 335 | ------- 336 | stable_topics: array of shape (M, n_words) 337 | A set of M topics, one for each cluster found by HDBSCAN. 338 | """ 339 | distance_matrix = all_pairs_hellinger_distance(all_topics) 340 | labels = hdbscan.HDBSCAN( 341 | min_samples=min_samples, 342 | min_cluster_size=min_cluster_size, 343 | metric="precomputed", 344 | cluster_selection_method="leaf", 345 | ).fit_predict(distance_matrix) 346 | result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32) 347 | for i in range(labels.max() + 1): 348 | result[i] = np.mean(np.sqrt(all_topics[labels == i]), axis=0) ** 2 349 | result[i] /= result[i].sum() 350 | 351 | return result 352 | 353 | 354 | def generate_combined_topics_hellinger_umap( 355 | all_topics, min_samples=5, min_cluster_size=5, n_neighbors=15, reduced_dim=5 356 | ): 357 | """Given a large list of topics select out a small list of stable topics 358 | by mapping the topics to a low dimensional space with UMAP (using 359 | Hellinger distance) and then clustering the topics with HDBSCAN using 360 | Euclidean distance in the embedding space to measure distance between topics. 361 | 362 | 363 | Parameters 364 | ---------- 365 | all_topics: array of shape (N, n_words) 366 | The set of topics to be clustered. 367 | 368 | min_samples: int (optional, default=5) 369 | The min_samples parameter to use for HDBSCAN clustering. 370 | 371 | min_cluster_size: int (optional, default=5) 372 | The min_cluster_size parameter to use for HDBSCAN clustering 373 | 374 | n_neighbors: int (optional, default=15) 375 | The n_neighbors value to use with UMAP. 376 | 377 | reduced_dim: int (optional, default=5) 378 | The dimension of the embedding space to use. 
379 | 380 | Returns 381 | ------- 382 | stable_topics: array of shape (M, n_words) 383 | A set of M topics, one for each cluster found by HDBSCAN. 384 | """ 385 | embedding = umap.UMAP( 386 | n_neighbors=n_neighbors, n_components=reduced_dim, metric=hellinger 387 | ).fit_transform(all_topics) 388 | clusterer = hdbscan.HDBSCAN( 389 | min_samples=min_samples, 390 | min_cluster_size=min_cluster_size, 391 | cluster_selection_method="leaf", 392 | allow_single_cluster=True, 393 | ).fit(embedding) 394 | labels = clusterer.labels_ 395 | membership_strengths = clusterer.probabilities_ 396 | result = np.empty((labels.max() + 1, all_topics.shape[1]), dtype=np.float32) 397 | for i in range(labels.max() + 1): 398 | mask = labels == i 399 | result[i] = ( 400 | np.average( 401 | np.sqrt(all_topics[mask]), axis=0, weights=membership_strengths[mask] 402 | ) 403 | ** 2 404 | ) 405 | result[i] /= result[i].sum() 406 | 407 | return result 408 | 409 | 410 | _topic_combiner = { 411 | "kl_divergence": generate_combined_topics_kl, 412 | "hellinger": generate_combined_topics_hellinger, 413 | "hellinger_umap": generate_combined_topics_hellinger_umap, 414 | } 415 | 416 | 417 | def ensemble_fit( 418 | X, 419 | estimated_n_topics=10, 420 | model="plsa", 421 | init="random", 422 | min_samples=3, 423 | min_cluster_size=4, 424 | n_starts=16, 425 | n_jobs=1, 426 | parallelism="dask", 427 | topic_combination="hellinger_umap", 428 | bootstrap=True, 429 | n_iter=100, 430 | n_iter_per_test=10, 431 | tolerance=0.001, 432 | e_step_thresh=1e-16, 433 | lift_factor=1, 434 | beta_loss=1, 435 | alpha=0.0, 436 | solver="mu", 437 | random_state=None, 438 | ): 439 | """Generate a set of stable topics by using an ensemble of topic models and then clustering 440 | the results and generating representative topics for each cluster. The generate a set of 441 | document vectors based on the selected stable topics. 442 | 443 | Parameters 444 | ---------- 445 | X: array or sparse matrix of shape (n_docs, n_words) 446 | The bag-of-words matrix for the corpus to train on. 447 | 448 | estimated_n_topics: int (optional, default=10) 449 | The estimated number of topics. Note that the final number of topics produced can differ 450 | from this value, and may be more or less than the provided value. Instead this value 451 | provides the algorithm with a suggestion of the approximate number of topics to use. 452 | 453 | model: string (optional, default="plsa") 454 | The topic modeling method to use (either "plsa" or "nmf") 455 | 456 | init: string or tuple (optional, default="random") 457 | The intialization method to use. This should be one of: 458 | * ``"random"`` 459 | * ``"nndsvd"`` 460 | * ``"nmf"`` 461 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 462 | 463 | int (optional, default=3) 464 | The min_samples parameter to use for HDBSCAN clustering. 465 | 466 | min_cluster_size: int (optional, default=4) 467 | The min_cluster_size parameter to use for HDBSCAN clustering 468 | 469 | n_starts: int (optional, default=16) 470 | The number of bootstrap sampled topic models to run -- the size of the ensemble. 471 | 472 | n_jobs: int (optional, default=8) 473 | The number of parallel jobs to run at a time. 474 | 475 | parallelism: string (optional, default="dask") 476 | The parallelism model to use. Should be one of "dask" or "joblib" or "none". 477 | 478 | topic_combination: string (optional, default="hellinger_umap") 479 | The method of comnining ensemble topics into a set of stable topics. 
Should be one of: 480 | * ``"hellinger_umap"`` 481 | * ``"hellinger"`` 482 | * ``"kl_divergence"`` 483 | 484 | n_iter: int 485 | The maximum number iterations of EM to perform 486 | 487 | n_iter_per_test: int 488 | The number of iterations between tests for 489 | relative improvement in log-likelihood. 490 | 491 | tolerance: float 492 | The threshold of relative improvement in 493 | log-likelihood required to continue iterations. 494 | 495 | e_step_thresh: float (optional, default=1e-32) 496 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 497 | below threshold then write a zero for P(z|w,d). 498 | 499 | lift_factor: int (optional, default=1) 500 | Importance factor to apply to lift -- if high lift value are important to 501 | you then larger lift factors will be beneficial. 502 | 503 | beta_loss: float or string, (optional, default 'kullback-leibler') 504 | The beta loss to use if using NMF for topic modeling. 505 | 506 | alpha: float (optional, default=0.0) 507 | The alpha parameter defining regularization if using NMF for topic modeling. 508 | 509 | solver: string, (optional, default="mu") 510 | The choice of solver if using NMF for topic modeling. Should be either "cd" or "mu". 511 | 512 | random_state int, RandomState instance or None, (optional, default: None) 513 | If int, random_state is the seed used by the random number generator; 514 | If RandomState instance, random_state is the random number generator; 515 | If None, the random number generator is the RandomState instance used 516 | by `np.random`. Used in in initialization. 517 | 518 | Returns 519 | ------- 520 | doc_vectors, stable_topics: arrays of shape (n_docs, M) and (M, n_words) 521 | The vectors giving the probability of topics for each document, and the stable topics 522 | produced by the ensemble. 
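
    Examples
    --------
    A minimal sketch (the ``EnsembleTopics`` estimator defined below wraps this
    function and is usually the more convenient entry point); the 20-newsgroups
    data is just an illustrative choice:

    >>> from sklearn.datasets import fetch_20newsgroups
    >>> from sklearn.feature_extraction.text import CountVectorizer
    >>> news = fetch_20newsgroups(subset='all')
    >>> X = CountVectorizer(min_df=5, stop_words='english').fit_transform(news.data)
    >>> doc_vectors, stable_topics = ensemble_fit(
    ...     X, estimated_n_topics=20, n_starts=8, n_jobs=2
    ... )
    >>> # doc_vectors has shape (n_docs, n_stable_topics) and stable_topics has
    >>> # shape (n_stable_topics, n_words); n_stable_topics need not equal 20.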
523 | """ 524 | 525 | X = check_array(X, accept_sparse="csr", dtype=np.float32) 526 | 527 | if issparse(X): 528 | X_coo = X.tocoo() 529 | else: 530 | X_coo = coo_matrix(X, dtype=np.float32) 531 | 532 | all_topics = ensemble_of_topics( 533 | X_coo, 534 | estimated_n_topics, 535 | model, 536 | n_jobs, 537 | n_starts, 538 | parallelism, 539 | init=init, 540 | n_iter=n_iter, 541 | n_iter_per_test=n_iter_per_test, 542 | tolerance=tolerance, 543 | e_step_thresh=e_step_thresh, 544 | bootstrap=bootstrap, 545 | lift_factor=1, 546 | beta_loss=beta_loss, 547 | alpha=alpha, 548 | solver=solver, 549 | random_state=random_state, 550 | ) 551 | 552 | if topic_combination in _topic_combiner: 553 | cluster_topics = _topic_combiner[topic_combination] 554 | else: 555 | raise ValueError( 556 | "topic_combination must be one of {}".format(tuple(_topic_combiner.keys())) 557 | ) 558 | 559 | stable_topics = cluster_topics(all_topics, min_samples, min_cluster_size) 560 | 561 | if lift_factor != 1: 562 | stable_topics **= lift_factor 563 | normalize(stable_topics, axis=1) 564 | 565 | if model == "plsa": 566 | sample_weight = _check_sample_weight(None, X, dtype=np.float32) 567 | doc_vectors = plsa_refit( 568 | X, stable_topics, sample_weight, e_step_thresh=e_step_thresh, 569 | random_state=random_state, 570 | ) 571 | elif model == "nmf": 572 | doc_vectors, _, _ = non_negative_factorization( 573 | X, 574 | H=stable_topics, 575 | n_components=stable_topics.shape[0], 576 | update_H=False, 577 | beta_loss=beta_loss, 578 | alpha=alpha, 579 | solver=solver, 580 | ) 581 | else: 582 | raise ValueError('Model must be one of "plsa" or "nmf"') 583 | 584 | return doc_vectors, stable_topics 585 | 586 | 587 | class EnsembleTopics(BaseEstimator, TransformerMixin): 588 | """Ensemble Topic Modelling (EnsTop) 589 | 590 | Given a bag-of-words matrix representation of a corpus of documents, where each row of the 591 | matrix represents a document, and the jth element of the ith row is the count of the number of 592 | times the jth vocabulary word occurs in the ith document, build an ensemble of different 593 | topic models from bootstrap samples of the corpus, and then select a set of representative 594 | stable topics by clustering the topic produced. 595 | 596 | By default this will use pLSA for topic modelling. In that case the result will be matrices 597 | of conditional probabilities P(z|d) and P(w|z) such that the product matrix of probabilities 598 | P(w|d) maximises the likelihood of seeing the observed corpus data. Here P(z|d) represents 599 | the probability of topic z given document d, P(w|z) represents the probability of word w 600 | given topic z, and P(w|d) represents the probability of word w given document d. 601 | 602 | Parameters 603 | ---------- 604 | n_components: int (optional, default=10) 605 | The estimated number of topics. Note that the final number of topics produced can differ 606 | from this value, and may be more or less than the provided value. Instead this value 607 | provides the algorithm with a suggestion of the approximate number of topics to use. 608 | 609 | model: string (optional, default="plsa") 610 | The topic modeling method to use (either "plsa" or "nmf") 611 | 612 | init: string or tuple (optional, default="random") 613 | The intialization method to use. This should be one of: 614 | * ``"random"`` 615 | * ``"nndsvd"`` 616 | * ``"nmf"`` 617 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 
618 | 619 | int (optional, default=3) 620 | The min_samples parameter to use for HDBSCAN clustering. 621 | 622 | min_cluster_size: int (optional, default=4) 623 | The min_cluster_size parameter to use for HDBSCAN clustering 624 | 625 | n_starts: int (optional, default=16) 626 | The number of bootstrap sampled topic models to run -- the size of the ensemble. 627 | 628 | n_jobs: int (optional, default=8) 629 | The number of parallel jobs to run at a time. 630 | 631 | parallelism: string (optional, default="dask") 632 | The parallelism model to use. Should be one of "dask" or "joblib". 633 | 634 | topic_combination: string (optional, default="hellinger_umap") 635 | The method of comnining ensemble topics into a set of stable topics. Should be one of: 636 | * ``"hellinger_umap"`` 637 | * ``"hellinger"`` 638 | * ``"kl_divergence"`` 639 | 640 | bootstrap: bool (optional, default=True) 641 | Whether to use bootstrap resampling of documents for greater randomization. In general 642 | this is a good idea that helps to prevent overfitting, however for small document 643 | collections, or for other reasons, this might not be desireable. 644 | 645 | n_iter: int 646 | The maximum number iterations of EM to perform 647 | 648 | n_iter_per_test: int 649 | The number of iterations between tests for 650 | relative improvement in log-likelihood. 651 | 652 | tolerance: float 653 | The threshold of relative improvement in 654 | log-likelihood required to continue iterations. 655 | 656 | e_step_thresh: float (optional, default=1e-32) 657 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 658 | below threshold then write a zero for P(z|w,d). 659 | 660 | lift_factor: int (optional, default=1) 661 | Importance factor to apply to lift -- if high lift value are important to 662 | you then larger lift factors will be beneficial. 663 | 664 | beta_loss: float or string, (optional, default 'kullback-leibler') 665 | The beta loss to use if using NMF for topic modeling. 666 | 667 | alpha: float (optional, default=0.0) 668 | The alpha parameter defining regularization if using NMF for topic modeling. 669 | 670 | solver: string, (optional, default="mu") 671 | The choice of solver if using NMF for topic modeling. Should be either "cd" or "mu". 672 | 673 | random_state int, RandomState instance or None, (optional, default: None) 674 | If int, random_state is the seed used by the random number generator; 675 | If RandomState instance, random_state is the random number generator; 676 | If None, the random number generator is the RandomState instance used 677 | by `np.random`. Used in in initialization. 678 | 679 | Attributes 680 | ---------- 681 | 682 | n_components_: int 683 | The actual number of stable topics generated by the ensemble. 684 | 685 | components_: array of shape (n_topics, n_words) 686 | The topic vectors produced by pLSA. Each row is a topic, which is a probability 687 | distribution, over the vocabulary, giving the probability of each word given the topic ( 688 | P(w|z)). 689 | 690 | embedding_: array of shape (n_docs, n_topics) 691 | The document vectors produced by pLSA. Each row corresponds to a document, giving a 692 | probability distribution, over the topic space, specifying the probability of each topic 693 | occuring in the document (P(z|d)). 694 | 695 | training_data_: sparse matrix of shape (n_docs, n_words) 696 | The original training data saved in sparse matrix format. 697 | 698 | References 699 | ---------- 700 | 701 | Hofmann, Thomas. 
"Probabilistic latent semantic analysis." Proceedings of the Fifteenth 702 | conference on Uncertainty in artificial intelligence. Morgan Kaufmann Publishers Inc., 1999. 703 | 704 | Hofmann, Thomas. "Unsupervised learning by probabilistic latent semantic analysis." 705 | Machine learning 42.1-2 (2001): 177-196. 706 | 707 | """ 708 | 709 | def __init__( 710 | self, 711 | n_components=10, 712 | model="plsa", 713 | init="random", 714 | n_starts=16, 715 | min_samples=3, 716 | min_cluster_size=5, 717 | n_jobs=8, 718 | parallelism="dask", 719 | topic_combination="hellinger_umap", 720 | bootstrap=True, 721 | n_iter=80, 722 | n_iter_per_test=10, 723 | tolerance=0.001, 724 | e_step_thresh=1e-32, 725 | lift_factor=1, 726 | beta_loss=1, 727 | alpha=0.0, 728 | solver="mu", 729 | transform_random_seed=42, 730 | random_state=None, 731 | ): 732 | self.n_components = n_components 733 | self.model = model 734 | self.init = init 735 | self.n_starts = n_starts 736 | self.min_samples = min_samples 737 | self.min_cluster_size = min_cluster_size 738 | self.n_jobs = n_jobs 739 | self.parallelism = parallelism 740 | self.topic_combination = topic_combination 741 | self.bootstrap = bootstrap 742 | self.n_iter = n_iter 743 | self.n_iter_per_test = n_iter_per_test 744 | self.tolerance = tolerance 745 | self.e_step_thresh = e_step_thresh 746 | self.lift_factor = lift_factor 747 | self.beta_loss = beta_loss 748 | self.alpha = alpha 749 | self.solver = solver 750 | self.transform_random_seed = transform_random_seed 751 | self.random_state = random_state 752 | 753 | def fit(self, X, y=None): 754 | """Learn the ensemble model for the data X and return the document vectors. 755 | 756 | This is more efficient than calling fit followed by transform. 757 | 758 | Parameters 759 | ---------- 760 | X: array or sparse matrix of shape (n_docs, n_words) 761 | The data matrix pLSA is attempting to fit to. 762 | 763 | y: Ignored 764 | 765 | Returns 766 | ------- 767 | self 768 | """ 769 | self.fit_transform(X) 770 | return self 771 | 772 | def fit_transform(self, X, y=None, **fit_params): 773 | """Learn the ensemble model for the data X and return the document vectors. 774 | 775 | This is more efficient than calling fit followed by transform. 776 | 777 | Parameters 778 | ---------- 779 | X: array or sparse matrix of shape (n_docs, n_words) 780 | The data matrix pLSA is attempting to fit to. 781 | 782 | y: Ignored 783 | 784 | Returns 785 | ------- 786 | embedding: array of shape (n_docs, n_topics) 787 | An embedding of the documents into a topic space. 788 | """ 789 | X = check_array(X, accept_sparse="csr") 790 | 791 | if not issparse(X): 792 | X = csr_matrix(X) 793 | 794 | U, V = ensemble_fit( 795 | X, 796 | self.n_components, 797 | self.model, 798 | self.init, 799 | self.min_samples, 800 | self.min_cluster_size, 801 | self.n_starts, 802 | self.n_jobs, 803 | self.parallelism, 804 | self.topic_combination, 805 | self.bootstrap, 806 | self.n_iter, 807 | self.n_iter_per_test, 808 | self.tolerance, 809 | self.e_step_thresh, 810 | self.lift_factor, 811 | self.beta_loss, 812 | self.alpha, 813 | self.solver, 814 | self.random_state, 815 | ) 816 | self.components_ = V 817 | self.embedding_ = U 818 | self.training_data_ = X 819 | self.n_components_ = self.components_.shape[0] 820 | 821 | return U 822 | 823 | def transform(self, X, y=None): 824 | """Transform the data X into the topic space of the fitted ensemble model. 
825 | 826 | Parameters 827 | ---------- 828 | X: array or sparse matrix of shape (n_docs, n_words) 829 | Corpus to be embedded into topic space 830 | 831 | y: Ignored 832 | 833 | Returns 834 | ------- 835 | embedding: array of shape (n_docs, n_topics) 836 | An embedding of the documents X into the topic space. 837 | """ 838 | 839 | X = check_array(X, accept_sparse="csr") 840 | random_state = check_random_state(self.transform_random_seed) 841 | 842 | if not issparse(X): 843 | X = coo_matrix(X) 844 | else: 845 | X = X.tocoo() 846 | 847 | result = plsa_refit( 848 | X, 849 | self.components_, 850 | n_iter=50, 851 | n_iter_per_test=5, 852 | tolerance=0.001, 853 | random_state=random_state, 854 | ) 855 | 856 | return result 857 | 858 | def coherence(self, topic_num=None, n_words=20): 859 | """Compute the average coherence of fitted topics, or of a single individual topic. 860 | 861 | Parameters 862 | ---------- 863 | topic_num: int (optional, default=None) 864 | The topic number to compute coherence for. If ``topic_num`` is None then the average 865 | coherence over all topics will be computed. 866 | 867 | n_words int (optional, default=20) 868 | The number of topic words to score against. The top ``n_words`` words from the selected 869 | topic will be used. 870 | 871 | Returns 872 | ------- 873 | topic_coherence: float 874 | The requested coherence score. 875 | """ 876 | 877 | # Test for errors 878 | if not isinstance(topic_num, int) and topic_num is not None: 879 | raise ValueError("Topic number must be an integer or None.") 880 | 881 | if topic_num is None: 882 | return mean_coherence( 883 | self.components_, self.training_data_, n_words=n_words 884 | ) 885 | elif topic_num >= 0 and topic_num < self.n_components: 886 | return coherence( 887 | self.components_, topic_num, self.training_data_, n_words=n_words 888 | ) 889 | else: 890 | raise ValueError( 891 | "Topic number must be in range 0 to {}".format(self.n_components) 892 | ) 893 | 894 | def log_lift(self, topic_num=None, n_words=20): 895 | """Compute the average log lift of fitted topics, or of a single individual topic. 896 | 897 | Parameters 898 | ---------- 899 | topic_num: int (optional, default=None) 900 | The topic number to compute log lift for. If ``topic_num`` is None then the average 901 | log lift over all topics will be computed. 902 | 903 | n_words int (optional, default=20) 904 | The number of topic words to score against. The top ``n_words`` words from the selected 905 | topic will be used. 906 | 907 | 908 | Returns 909 | ------- 910 | log_lift: float 911 | The requested log lift score. 
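
        Examples
        --------
        A small sketch, assuming ``model`` is an ``EnsembleTopics`` instance
        that has already been fitted to a bag-of-words matrix ``X``:

        >>> model = EnsembleTopics(n_components=20).fit(X)
        >>> mean_lift = model.log_lift()                    # averaged over all topics
        >>> first_topic_lift = model.log_lift(topic_num=0, n_words=10)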
912 | """ 913 | 914 | # Test for errors 915 | if not isinstance(topic_num, int) and topic_num is not None: 916 | raise ValueError("Topic number must be an integer or None.") 917 | 918 | if topic_num is None: 919 | return mean_log_lift(self.components_, self.training_data_, n_words=n_words) 920 | elif topic_num >= 0 and topic_num < self.n_components: 921 | return log_lift( 922 | self.components_, topic_num, self.training_data_, n_words=n_words 923 | ) 924 | else: 925 | raise ValueError( 926 | "Topic number must be in range 0 to {}".format(self.n_components) 927 | ) 928 | -------------------------------------------------------------------------------- /enstop/plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | from sklearn.utils.extmath import randomized_svd 7 | 8 | try: 9 | from sklearn.utils.validation import _check_sample_weight 10 | except ImportError: 11 | from enstop.utils import _check_sample_weight 12 | from sklearn.decomposition import non_negative_factorization 13 | from scipy.sparse import issparse, csr_matrix, coo_matrix 14 | 15 | from enstop.utils import ( 16 | normalize, 17 | coherence, 18 | mean_coherence, 19 | log_lift, 20 | mean_log_lift, 21 | standardize_input, 22 | ) 23 | 24 | 25 | @numba.njit( 26 | "f4[:,::1](i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4)", 27 | locals={ 28 | "k": numba.types.intp, 29 | "w": numba.types.uint32, 30 | "d": numba.types.uint32, 31 | "z": numba.types.uint16, 32 | "nz_idx": numba.types.uint32, 33 | "norm": numba.types.float32, 34 | }, 35 | fastmath=True, 36 | nogil=True, 37 | parallel=True, 38 | ) 39 | def plsa_e_step( 40 | X_rows, 41 | X_cols, 42 | X_vals, 43 | p_w_given_z, 44 | p_z_given_d, 45 | p_z_given_wd, 46 | probability_threshold=1e-32, 47 | ): 48 | """Perform the E-step of pLSA optimization. This amounts to computing the 49 | probability of each topic given each word document pair. The computation 50 | implements 51 | 52 | P(z|w,d) = \frac{P(z|w)P(d|z)}{\sum_{z=1}^k P(z|w)P(d|z)}. 53 | 54 | This routine is optimized to work with sparse matrices such that P(z|w,d) 55 | is only computed for w, d such that X_{w,d} is non-zero, where X is the 56 | data matrix. 57 | 58 | To make this numba compilable the raw arrays defining the COO format sparse 59 | matrix must be passed separately. 60 | 61 | 62 | Parameters 63 | ---------- 64 | X_rows: array of shape (nnz,) 65 | For each non-zero entry of X, the row of the entry. 66 | 67 | X_cols: array of shape (nnz,) 68 | For each non-zero entry of X, the column of the 69 | entry. 70 | 71 | X_vals: array of shape (nnz,) 72 | For each non-zero entry of X, the value of entry. 73 | 74 | p_w_given_z: array of shape (n_topics, n_words) 75 | The current estimates of values for P(w|z) 76 | 77 | p_z_given_d: array of shape (n_docs, n_topics) 78 | The current estimates of values for P(z|d) 79 | 80 | p_z_given_wd: array of shape (nnz, n_topics) 81 | The result array to write new estimates of P(z|w,d) to. 82 | 83 | probability_threshold: float (optional, default=1e-32) 84 | Option to promote sparsity. If the value of P(w|z)P(z|d) falls below 85 | threshold then write a zero for P(z|w,d). 
86 | 87 | """ 88 | 89 | k = p_w_given_z.shape[0] 90 | 91 | for nz_idx in numba.prange(X_vals.shape[0]): 92 | d = X_rows[nz_idx] 93 | w = X_cols[nz_idx] 94 | 95 | norm = 0.0 96 | for z in range(k): 97 | v = p_w_given_z[z, w] * p_z_given_d[d, z] 98 | if v > probability_threshold: 99 | p_z_given_wd[nz_idx, z] = v 100 | norm += p_z_given_wd[nz_idx, z] 101 | else: 102 | p_z_given_wd[nz_idx, z] = 0.0 103 | for z in range(k): 104 | if norm > 0: 105 | p_z_given_wd[nz_idx, z] /= norm 106 | 107 | return p_z_given_wd 108 | 109 | 110 | @numba.njit( 111 | "UniTuple(f4[:,::1],2)(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1])", 112 | locals={ 113 | "k": numba.types.intp, 114 | "w": numba.types.uint32, 115 | "d": numba.types.uint32, 116 | "z": numba.types.uint16, 117 | "nz_idx": numba.types.uint32, 118 | "s": numba.types.float32, 119 | }, 120 | fastmath=True, 121 | nogil=True, 122 | parallel=True, 123 | ) 124 | def plsa_m_step( 125 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, p_z_given_wd, norm_pwz, norm_pdz 126 | ): 127 | """Perform the M-step of pLSA optimization. This amounts to using the estimates 128 | of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation implements 129 | 130 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 131 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 132 | 133 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 134 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 135 | 136 | To make this numba compilable the raw arrays defining the COO format sparse 137 | matrix must be passed separately. 138 | 139 | Parameters 140 | ---------- 141 | X_rows: array of shape (nnz,) 142 | For each non-zero entry of X, the row of the entry. 143 | 144 | X_cols: array of shape (nnz,) 145 | For each non-zero entry of X, the column of the 146 | entry. 147 | 148 | X_vals: array of shape (nnz,) 149 | For each non-zero entry of X, the value of entry. 150 | 151 | p_w_given_z: array of shape (n_topics, n_words) 152 | The result array to write new estimates of P(w|z) to. 153 | 154 | p_z_given_d: array of shape (n_docs, n_topics) 155 | The result array to write new estimates of P(z|d) to. 156 | 157 | p_z_given_wd: array of shape (nnz, n_topics) 158 | The current estimates for P(z|w,d) 159 | 160 | sample_weight: array of shape (n_docs,) 161 | Input document weights. 162 | 163 | norm_pwz: array of shape (n_topics,) 164 | Auxilliary array used for storing row norms; this is passed in to save 165 | reallocations. 166 | 167 | norm_pdz: array of shape (n_docs,) 168 | Auxilliary array used for storing row norms; this is passed in to save 169 | reallocations. 
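    Returns
    -------
    p_w_given_z, p_z_given_d: arrays of shapes (n_topics, n_words) and (n_docs, n_topics)
        The updated estimates of P(w|z) and P(z|d); both arrays are modified
        in place and also returned for convenience.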
170 | """ 171 | 172 | k = p_z_given_wd.shape[1] 173 | n = p_z_given_d.shape[0] 174 | m = p_w_given_z.shape[1] 175 | 176 | p_w_given_z[:] = 0.0 177 | p_z_given_d[:] = 0.0 178 | 179 | norm_pwz[:] = 0.0 180 | norm_pdz[:] = 0.0 181 | 182 | for nz_idx in range(X_vals.shape[0]): 183 | d = X_rows[nz_idx] 184 | w = X_cols[nz_idx] 185 | x = X_vals[nz_idx] 186 | 187 | for z in range(k): 188 | s = x * p_z_given_wd[nz_idx, z] 189 | 190 | p_w_given_z[z, w] += s 191 | p_z_given_d[d, z] += s 192 | 193 | norm_pwz[z] += s 194 | norm_pdz[d] += s 195 | 196 | for z in numba.prange(k): 197 | if norm_pwz[z] > 0: 198 | for w in range(m): 199 | p_w_given_z[z, w] /= norm_pwz[z] 200 | for d in range(n): 201 | if norm_pdz[d] > 0: 202 | p_z_given_d[d, z] /= norm_pdz[d] 203 | 204 | return p_w_given_z, p_z_given_d 205 | 206 | 207 | @numba.njit( 208 | "UniTuple(f4[:,::1],2)(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1],f4[::1])", 209 | locals={ 210 | "k": numba.types.intp, 211 | "w": numba.types.uint32, 212 | "d": numba.types.uint32, 213 | "z": numba.types.uint16, 214 | "nz_idx": numba.types.uint32, 215 | "s": numba.types.float32, 216 | }, 217 | fastmath=True, 218 | nogil=True, 219 | parallel=True, 220 | ) 221 | def plsa_m_step_w_sample_weight( 222 | X_rows, 223 | X_cols, 224 | X_vals, 225 | p_w_given_z, 226 | p_z_given_d, 227 | p_z_given_wd, 228 | sample_weight, 229 | norm_pwz, 230 | norm_pdz, 231 | ): 232 | """Perform the M-step of pLSA optimization. This amounts to using the estimates 233 | of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation implements 234 | 235 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 236 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 237 | 238 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 239 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 240 | 241 | To make this numba compilable the raw arrays defining the COO format sparse 242 | matrix must be passed separately. 243 | 244 | Parameters 245 | ---------- 246 | X_rows: array of shape (nnz,) 247 | For each non-zero entry of X, the row of the entry. 248 | 249 | X_cols: array of shape (nnz,) 250 | For each non-zero entry of X, the column of the 251 | entry. 252 | 253 | X_vals: array of shape (nnz,) 254 | For each non-zero entry of X, the value of entry. 255 | 256 | p_w_given_z: array of shape (n_topics, n_words) 257 | The result array to write new estimates of P(w|z) to. 258 | 259 | p_z_given_d: array of shape (n_docs, n_topics) 260 | The result array to write new estimates of P(z|d) to. 261 | 262 | p_z_given_wd: array of shape (nnz, n_topics) 263 | The current estimates for P(z|w,d) 264 | 265 | sample_weight: array of shape (n_docs,) 266 | Input document weights. 267 | 268 | norm_pwz: array of shape (n_topics,) 269 | Auxilliary array used for storing row norms; this is passed in to save 270 | reallocations. 271 | 272 | norm_pdz: array of shape (n_docs,) 273 | Auxilliary array used for storing row norms; this is passed in to save 274 | reallocations. 
275 | """ 276 | 277 | k = p_z_given_wd.shape[1] 278 | n = p_z_given_d.shape[0] 279 | m = p_w_given_z.shape[1] 280 | 281 | p_w_given_z[:] = 0.0 282 | p_z_given_d[:] = 0.0 283 | 284 | norm_pwz[:] = 0.0 285 | norm_pdz[:] = 0.0 286 | 287 | for nz_idx in range(X_vals.shape[0]): 288 | d = X_rows[nz_idx] 289 | w = X_cols[nz_idx] 290 | x = X_vals[nz_idx] 291 | 292 | for z in range(k): 293 | s = x * p_z_given_wd[nz_idx, z] 294 | t = s * sample_weight[d] 295 | 296 | p_w_given_z[z, w] += t 297 | p_z_given_d[d, z] += s 298 | 299 | norm_pwz[z] += t 300 | norm_pdz[d] += s 301 | 302 | for z in numba.prange(k): 303 | if norm_pwz[z] > 0: 304 | for w in range(m): 305 | p_w_given_z[z, w] /= norm_pwz[z] 306 | for d in range(n): 307 | if norm_pdz[d] > 0: 308 | p_z_given_d[d, z] /= norm_pdz[d] 309 | 310 | return p_w_given_z, p_z_given_d 311 | 312 | 313 | @numba.njit( 314 | "f4(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[::1])", 315 | locals={ 316 | "k": numba.types.intp, 317 | "w": numba.types.uint32, 318 | "d": numba.types.uint32, 319 | "z": numba.types.uint16, 320 | "nz_idx": numba.types.uint32, 321 | "x": numba.types.float32, 322 | "result": numba.types.float32, 323 | "p_w_given_d": numba.types.float32, 324 | }, 325 | fastmath=True, 326 | nogil=True, 327 | parallel=True, 328 | ) 329 | def log_likelihood(X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight): 330 | """Compute the log-likelihood of observing the data X given estimates for P(w|z) 331 | and P(z|d). The likelihood of X_{w,d} under the model is given by X_{w,d} P(w|d) 332 | = X_{w,d} P(w|z) P(z|d). This function returns 333 | 334 | \log\left(\prod_{w,d} X_{w,d} P(w|d)\right) 335 | 336 | This routine is optimized to work with sparse matrices and only compute values 337 | for w, d such that X_{w,d} is non-zero. 338 | 339 | To make this numba compilable the raw arrays defining the COO format sparse 340 | matrix must be passed separately. 341 | 342 | Parameters 343 | ---------- 344 | X_rows: array of shape (nnz,) 345 | For each non-zero entry of X, the row of the entry. 346 | 347 | X_cols: array of shape (nnz,) 348 | For each non-zero entry of X, the column of the 349 | entry. 350 | 351 | X_vals: array of shape (nnz,) 352 | For each non-zero entry of X, the value of entry. 353 | 354 | p_w_given_z: array of shape (n_topics, n_words) 355 | The current estimates of values for P(w|z) 356 | 357 | p_z_given_d: array of shape (n_docs, n_topics) 358 | The current estimates of values for P(z|d) 359 | 360 | sample_weight: array of shape (n_docs,) 361 | Input document weights. 362 | 363 | Returns 364 | ------- 365 | 366 | log_likelihood: float 367 | The log of the likelihood of observing X under the 368 | model given by the P(z|d) and P(z|w). 369 | 370 | """ 371 | 372 | result = 0.0 373 | k = p_w_given_z.shape[0] 374 | 375 | for nz_idx in numba.prange(X_vals.shape[0]): 376 | d = X_rows[nz_idx] 377 | w = X_cols[nz_idx] 378 | x = X_vals[nz_idx] 379 | 380 | p_w_given_d = 0.0 381 | for z in range(k): 382 | p_w_given_d += p_w_given_z[z, w] * p_z_given_d[d, z] 383 | 384 | result += x * np.log(p_w_given_d) * sample_weight[d] 385 | 386 | return result 387 | 388 | 389 | @numba.njit(fastmath=True, nogil=True) 390 | def norm(x): 391 | """Numba compilable routine for computing the l2-norm 392 | of a given vector x. 393 | 394 | Parameters 395 | ---------- 396 | x: array of shape (n,) 397 | The array to compute the l2-norm of. 398 | 399 | Returns 400 | ------- 401 | n: float 402 | The l2-norm of the input array x. 
403 | """ 404 | result = 0.0 405 | 406 | for i in range(x.shape[0]): 407 | result += x[i] ** 2 408 | 409 | return np.sqrt(result) 410 | 411 | 412 | def plsa_init(X, k, init="random", rng=np.random): 413 | """Initialize matrices for pLSA. Specifically, given data X, a number of topics 414 | k, and an initialization method, compute matrices for P(z|d) and P(w|z) that can 415 | be used to begin an EM optimization of pLSA. 416 | 417 | Various initialization approaches are available. The most straightforward is 418 | "random", which randomly initializes values for P(z|d) and P(w|z) and normalizes 419 | to make them probabilities. A second approach, borrowing from sklearn's NMF 420 | implementation, is to use a non-negative SVD approach ("nndsvd"). A third option 421 | is the use the fast coordinate descent under Frobenius loss version of NMF and 422 | then normalize to make probabilities ("nmf"). Finally if the ``init`` parameter 423 | is a tuple of ndarrays then these will be used, allowing for custom user defined 424 | initializations. 425 | 426 | Parameters 427 | ---------- 428 | X: sparse matrix of shape (n_docs, n_words) 429 | The data matrix pLSA is attempting to fit to. 430 | 431 | k: int 432 | The number of topics for pLSA to fit with. 433 | 434 | init: string or tuple (optional, default="random") 435 | The intialization method to use. This should be one of: 436 | * ``"random"`` 437 | * ``"nndsvd"`` 438 | * ``"nmf"`` 439 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 440 | 441 | rng: RandomState instance (optional, default=np.random) 442 | Seeded randomness generator. Used for random intialization. 443 | 444 | Returns 445 | ------- 446 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 447 | Initialized arrays suitable to passing to 448 | pLSA optimization methods. 449 | """ 450 | 451 | n = X.shape[0] 452 | m = X.shape[1] 453 | 454 | if init == "random": 455 | p_w_given_z = rng.rand(k, m) 456 | p_z_given_d = rng.rand(n, k) 457 | 458 | elif init == "nndsvd": 459 | # Taken from sklearn NMF implementation 460 | U, S, V = randomized_svd(X, k) 461 | p_z_given_d, p_w_given_z = np.zeros(U.shape), np.zeros(V.shape) 462 | 463 | # The leading singular triplet is non-negative 464 | # so it can be used as is for initialization. 
465 | p_z_given_d[:, 0] = np.sqrt(S[0]) * np.abs(U[:, 0]) 466 | p_w_given_z[0, :] = np.sqrt(S[0]) * np.abs(V[0, :]) 467 | 468 | for j in range(1, k): 469 | x, y = U[:, j], V[j, :] 470 | 471 | # extract positive and negative parts of column vectors 472 | x_p, y_p = np.maximum(x, 0), np.maximum(y, 0) 473 | x_n, y_n = np.abs(np.minimum(x, 0)), np.abs(np.minimum(y, 0)) 474 | 475 | # and their norms 476 | x_p_nrm, y_p_nrm = norm(x_p), norm(y_p) 477 | x_n_nrm, y_n_nrm = norm(x_n), norm(y_n) 478 | 479 | m_p, m_n = x_p_nrm * y_p_nrm, x_n_nrm * y_n_nrm 480 | 481 | # choose update 482 | if m_p > m_n: 483 | u = x_p / x_p_nrm 484 | v = y_p / y_p_nrm 485 | sigma = m_p 486 | else: 487 | u = x_n / x_n_nrm 488 | v = y_n / y_n_nrm 489 | sigma = m_n 490 | 491 | lbd = np.sqrt(S[j] * sigma) 492 | p_z_given_d[:, j] = lbd * u 493 | p_w_given_z[j, :] = lbd * v 494 | 495 | elif init == "nmf": 496 | p_z_given_d, p_w_given_z, _ = non_negative_factorization( 497 | X, 498 | n_components=k, 499 | init="nndsvd", 500 | solver="cd", 501 | beta_loss=2, 502 | tol=1e-2, 503 | max_iter=100, 504 | ) 505 | elif isinstance(init, tuple) or isinstance(init, list): 506 | p_z_given_d, p_w_given_z = init 507 | else: 508 | raise ValueError("Unrecognized init {}".format(init)) 509 | 510 | normalize(p_w_given_z, axis=1) 511 | normalize(p_z_given_d, axis=1) 512 | 513 | return p_z_given_d, p_w_given_z 514 | 515 | 516 | @numba.njit(fastmath=True, nogil=True) 517 | def plsa_fit_inner( 518 | X_rows, 519 | X_cols, 520 | X_vals, 521 | p_w_given_z, 522 | p_z_given_d, 523 | sample_weight, 524 | n_iter=100, 525 | n_iter_per_test=10, 526 | tolerance=0.001, 527 | e_step_thresh=1e-32, 528 | use_sample_weights=False, 529 | ): 530 | """Internal loop of EM steps required to optimize pLSA, along with relative 531 | convergence tests with respect to the log-likelihood of observing the data under 532 | the model. 533 | 534 | The EM looping will stop when either ``n_iter`` iterations have been reached, 535 | or if the relative improvement in log-likelihood over the last 536 | ``n_iter_per_test`` steps is under ``threshold``. 537 | 538 | This function is designed to wrap the internals of the EM process in a numba 539 | compilable loop, and is not the preferred entry point for fitting a plsa model. 540 | 541 | Parameters 542 | ---------- 543 | X_rows: array of shape (nnz,) 544 | For each non-zero entry of X, the row of the entry. 545 | 546 | X_cols: array of shape (nnz,) 547 | For each non-zero entry of X, the column of the 548 | entry. 549 | 550 | X_vals: array of shape (nnz,) 551 | For each non-zero entry of X, the value of entry. 552 | 553 | p_w_given_z: array of shape (n_topics, n_words) 554 | The current estimates of values for P(w|z) 555 | 556 | p_z_given_d: array of shape (n_docs, n_topics) 557 | The current estimates of values for P(z|d) 558 | 559 | sample_weight: array of shape (n_docs,) 560 | Input document weights. 561 | 562 | n_iter: int 563 | The maximum number iterations of EM to perform 564 | 565 | n_iter_per_test: int 566 | The number of iterations between tests for 567 | relative improvement in log-likelihood. 568 | 569 | tolerance: float 570 | The threshold of relative improvement in 571 | log-likelihood required to continue iterations. 572 | 573 | e_step_thresh: float (optional, default=1e-32) 574 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 575 | below threshold then write a zero for P(z|w,d). 
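    use_sample_weights: bool (optional, default=False)
        Whether to apply the per-document ``sample_weight`` values during the
        M step; when False the unweighted M step is used.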
576 | 577 | Returns 578 | ------- 579 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 580 | The resulting model values of P(z|d) and P(w|z) 581 | 582 | """ 583 | k = p_z_given_d.shape[1] 584 | n = p_z_given_d.shape[0] 585 | 586 | p_z_given_wd = np.zeros((X_vals.shape[0], k), dtype=np.float32) 587 | 588 | norm_pwz = np.zeros(k, dtype=np.float32) 589 | norm_pdz = np.zeros(n, dtype=np.float32) 590 | 591 | previous_log_likelihood = log_likelihood( 592 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight 593 | ) 594 | 595 | for i in range(n_iter): 596 | 597 | plsa_e_step( 598 | X_rows, 599 | X_cols, 600 | X_vals, 601 | p_w_given_z, 602 | p_z_given_d, 603 | p_z_given_wd, 604 | e_step_thresh, 605 | ) 606 | if use_sample_weights: 607 | plsa_m_step_w_sample_weight( 608 | X_rows, 609 | X_cols, 610 | X_vals, 611 | p_w_given_z, 612 | p_z_given_d, 613 | p_z_given_wd, 614 | sample_weight, 615 | norm_pwz, 616 | norm_pdz, 617 | ) 618 | else: 619 | plsa_m_step( 620 | X_rows, 621 | X_cols, 622 | X_vals, 623 | p_w_given_z, 624 | p_z_given_d, 625 | p_z_given_wd, 626 | norm_pwz, 627 | norm_pdz, 628 | ) 629 | 630 | if i % n_iter_per_test == 0: 631 | current_log_likelihood = log_likelihood( 632 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight 633 | ) 634 | change = np.abs(current_log_likelihood - previous_log_likelihood) 635 | if change == 0 or change / np.abs(current_log_likelihood) < tolerance: 636 | break 637 | else: 638 | previous_log_likelihood = current_log_likelihood 639 | 640 | return p_z_given_d, p_w_given_z 641 | 642 | 643 | def plsa_fit( 644 | X, 645 | k, 646 | sample_weight, 647 | init="random", 648 | n_iter=100, 649 | n_iter_per_test=10, 650 | tolerance=0.001, 651 | e_step_thresh=1e-32, 652 | random_state=None, 653 | ): 654 | """Fit a pLSA model to a data matrix ``X`` with ``k`` topics, an initialized 655 | according to ``init``. This will run an EM method to optimize estimates of P(z|d) 656 | and P(w|z). The will perform at most ``n_iter`` EM step iterations, 657 | while checking for relative improvement of the log-likelihood of the data under 658 | the model every ``n_iter_per_test`` iterations, and stops early if that is under 659 | ``tolerance``. 660 | 661 | Parameters 662 | ---------- 663 | X: sparse matrix of shape (n_docs, n_words) 664 | The data matrix pLSA is attempting to fit to. 665 | 666 | k: int 667 | The number of topics for pLSA to fit with. 668 | 669 | sample_weight: array of shape (n_docs,) 670 | Input document weights. 671 | 672 | init: string or tuple (optional, default="random") 673 | The intialization method to use. This should be one of: 674 | * ``"random"`` 675 | * ``"nndsvd"`` 676 | * ``"nmf"`` 677 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 678 | 679 | n_iter: int 680 | The maximum number iterations of EM to perform 681 | 682 | n_iter_per_test: int 683 | The number of iterations between tests for 684 | relative improvement in log-likelihood. 685 | 686 | tolerance: float 687 | The threshold of relative improvement in 688 | log-likelihood required to continue iterations. 689 | 690 | e_step_thresh: float (optional, default=1e-32) 691 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 692 | below threshold then write a zero for P(z|w,d). 
693 | 694 | random_state: int, RandomState instance or None, (optional, default: None) 695 | If int, random_state is the seed used by the random number generator; 696 | If RandomState instance, random_state is the random number generator; 697 | If None, the random number generator is the RandomState instance used 698 | by `np.random`. Used in in initialization. 699 | 700 | Returns 701 | ------- 702 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 703 | The resulting model values of P(z|d) and P(w|z) 704 | 705 | """ 706 | 707 | rng = check_random_state(random_state) 708 | p_z_given_d, p_w_given_z = plsa_init(X, k, init=init, rng=rng) 709 | p_z_given_d = p_z_given_d.astype(np.float32, order="C") 710 | p_w_given_z = p_w_given_z.astype(np.float32, order="C") 711 | 712 | use_sample_weights = np.any(sample_weight != 1.0) 713 | 714 | A = X.tocoo().astype(np.float32) 715 | 716 | p_z_given_d, p_w_given_z = plsa_fit_inner( 717 | A.row, 718 | A.col, 719 | A.data, 720 | p_w_given_z, 721 | p_z_given_d, 722 | sample_weight, 723 | n_iter, 724 | n_iter_per_test, 725 | tolerance, 726 | e_step_thresh, 727 | use_sample_weights, 728 | ) 729 | 730 | return p_z_given_d, p_w_given_z 731 | 732 | 733 | @numba.njit( 734 | "UniTuple(f4[:,::1],2)(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1])", 735 | locals={ 736 | "k": numba.types.intp, 737 | "w": numba.types.uint32, 738 | "d": numba.types.uint32, 739 | "z": numba.types.uint16, 740 | "nz_idx": numba.types.uint32, 741 | "s": numba.types.float32, 742 | }, 743 | fastmath=True, 744 | nogil=True, 745 | ) 746 | def plsa_refit_m_step( 747 | X_rows, 748 | X_cols, 749 | X_vals, 750 | p_w_given_z, 751 | p_z_given_d, 752 | p_z_given_wd, 753 | sample_weight, 754 | norm_pdz, 755 | ): 756 | """Optimized routine for the M step fitting values of P(z|d) given a fixed set of 757 | topics (i.e. P(w|z)). 758 | 759 | This routine is optimized to work with sparse matrices and only compute values 760 | for w, d such that X_{w,d} is non-zero. 761 | 762 | To make this numba compilable the raw arrays defining the COO format sparse 763 | matrix must be passed separately. 764 | 765 | Parameters 766 | ---------- 767 | X_rows: array of shape (nnz,) 768 | For each non-zero entry of X, the row of the entry. 769 | 770 | X_cols: array of shape (nnz,) 771 | For each non-zero entry of X, the column of the 772 | entry. 773 | 774 | X_vals: array of shape (nnz,) 775 | For each non-zero entry of X, the value of entry. 776 | 777 | p_w_given_z: array of shape (n_topics, n_words) 778 | The fixed topics P(w|z) to fit P(z|d) against. 779 | 780 | p_z_given_d: array of shape (n_docs, n_topics) 781 | The result array to write new estimates of P(z|d) to. 782 | 783 | p_z_given_wd: array of shape (nnz, n_topics) 784 | The current estimates for P(z|w,d) 785 | 786 | sample_weight: array of shape (n_docs,) 787 | Input document weights. 788 | 789 | norm_pdz: array of shape (n_docs,) 790 | Auxilliary array used for storing row norms; this is passed in to save 791 | reallocations. 
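    Returns
    -------
    p_w_given_z, p_z_given_d: arrays of shapes (n_topics, n_words) and (n_docs, n_topics)
        The fixed topic matrix (returned unchanged) and the updated estimates
        of P(z|d).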
792 | 793 | """ 794 | 795 | k = p_z_given_wd.shape[1] 796 | n = p_z_given_d.shape[0] 797 | 798 | p_z_given_d[:] = 0.0 799 | norm_pdz[:] = 0.0 800 | 801 | for nz_idx in range(X_vals.shape[0]): 802 | d = X_rows[nz_idx] 803 | w = X_cols[nz_idx] 804 | x = X_vals[nz_idx] 805 | 806 | for z in range(k): 807 | s = x * p_z_given_wd[nz_idx, z] 808 | p_z_given_d[d, z] += s 809 | norm_pdz[d] += s 810 | 811 | for z in range(k): 812 | for d in range(n): 813 | if norm_pdz[d] > 0: 814 | p_z_given_d[d, z] /= norm_pdz[d] 815 | 816 | return p_w_given_z, p_z_given_d 817 | 818 | 819 | @numba.njit(locals={"e_step_thresh": numba.types.float32,}, fastmath=True, nogil=True) 820 | def plsa_refit_inner( 821 | X_rows, 822 | X_cols, 823 | X_vals, 824 | topics, 825 | p_z_given_d, 826 | sample_weight, 827 | n_iter=50, 828 | n_iter_per_test=10, 829 | tolerance=0.005, 830 | e_step_thresh=1e-32, 831 | ): 832 | """Optimized routine for refitting values of P(z|d) given a fixed set of topics ( 833 | i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics 834 | (given, for example, by an ensemble result). 835 | 836 | This routine is optimized to work with sparse matrices and only compute values 837 | for w, d such that X_{w,d} is non-zero. 838 | 839 | To make this numba compilable the raw arrays defining the COO format sparse 840 | matrix must be passed separately. 841 | 842 | Parameters 843 | ---------- 844 | X_rows: array of shape (nnz,) 845 | For each non-zero entry of X, the row of the entry. 846 | 847 | X_cols: array of shape (nnz,) 848 | For each non-zero entry of X, the column of the 849 | entry. 850 | 851 | X_vals: array of shape (nnz,) 852 | For each non-zero entry of X, the value of entry. 853 | 854 | topics: array of shape (n_topics, n_words) 855 | The fixed topics against which to fit the values of P(z|d). 856 | 857 | p_z_given_d: array of shape (n_docs, n_topics) 858 | The current estimates of values for P(z|d) 859 | 860 | sample_weight: array of shape (n_docs,) 861 | Input document weights. 862 | 863 | n_iter: int 864 | The maximum number iterations of EM to perform 865 | 866 | n_iter_per_test: int 867 | The number of iterations between tests for relative improvement in 868 | log-likelihood. 869 | 870 | tolerance: float 871 | The threshold of relative improvement in log-likelihood required to continue 872 | iterations. 873 | 874 | e_step_thresh: float (optional, default=1e-32) 875 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 876 | below threshold then write a zero for P(z|w,d). 
877 | 878 | Returns 879 | ------- 880 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 881 | The resulting model values of P(z|d) and P(w|z) 882 | 883 | """ 884 | k = topics.shape[0] 885 | p_z_given_wd = np.zeros((X_rows.shape[0], k), dtype=np.float32) 886 | 887 | norm_pdz = np.zeros(p_z_given_d.shape[0], dtype=np.float32) 888 | 889 | previous_log_likelihood = log_likelihood( 890 | X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight 891 | ) 892 | 893 | for i in range(n_iter): 894 | 895 | plsa_e_step( 896 | X_rows, X_cols, X_vals, topics, p_z_given_d, p_z_given_wd, e_step_thresh 897 | ) 898 | plsa_refit_m_step( 899 | X_rows, 900 | X_cols, 901 | X_vals, 902 | topics, 903 | p_z_given_d, 904 | p_z_given_wd, 905 | sample_weight, 906 | norm_pdz, 907 | ) 908 | 909 | if i % n_iter_per_test == 0: 910 | current_log_likelihood = log_likelihood( 911 | X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight 912 | ) 913 | if current_log_likelihood > 0: 914 | change = np.abs(current_log_likelihood - previous_log_likelihood) 915 | if change / np.abs(current_log_likelihood) < tolerance: 916 | break 917 | else: 918 | previous_log_likelihood = current_log_likelihood 919 | 920 | return p_z_given_d 921 | 922 | 923 | def plsa_refit( 924 | X, 925 | topics, 926 | sample_weight, 927 | n_iter=50, 928 | n_iter_per_test=10, 929 | tolerance=0.005, 930 | e_step_thresh=1e-32, 931 | random_state=None, 932 | ): 933 | """Routine for refitting values of P(z|d) given a fixed set of topics ( 934 | i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics 935 | (given, for example, by an ensemble result). 936 | 937 | Parameters 938 | ---------- 939 | X: sparse matrix of shape (n_docs, n_words) 940 | The data matrix pLSA is attempting to fit to. 941 | 942 | topics: array of shape (n_topics, n_words) 943 | The fixed topics against which to fit the values of P(z|d). 944 | 945 | sample_weight: array of shape (n_docs,) 946 | Input document weights. 947 | 948 | n_iter: int 949 | The maximum number iterations of EM to perform 950 | 951 | n_iter_per_test: int 952 | The number of iterations between tests for relative improvement in 953 | log-likelihood. 954 | 955 | tolerance: float 956 | The threshold of relative improvement in log-likelihood required to continue 957 | iterations. 958 | 959 | e_step_thresh: float (optional, default=1e-32) 960 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 961 | below threshold then write a zero for P(z|w,d). 962 | 963 | random_state: int, RandomState instance or None, (optional, default: None) 964 | If int, random_state is the seed used by the random number generator; 965 | If RandomState instance, random_state is the random number generator; 966 | If None, the random number generator is the RandomState instance used 967 | by `np.random`. Used in in initialization. 
968 | 969 | Returns 970 | ------- 971 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 972 | The resulting model values of P(z|d) and P(w|z) 973 | 974 | """ 975 | A = X.tocoo().astype(np.float32) 976 | k = topics.shape[0] 977 | 978 | rng = check_random_state(random_state) 979 | p_z_given_d = rng.rand(A.shape[0], k) 980 | normalize(p_z_given_d, axis=1) 981 | p_z_given_d = p_z_given_d.astype(np.float32) 982 | topics = topics.astype(np.float32) 983 | 984 | p_z_given_d = plsa_refit_inner( 985 | A.row, 986 | A.col, 987 | A.data, 988 | topics, 989 | p_z_given_d, 990 | sample_weight, 991 | n_iter=n_iter, 992 | n_iter_per_test=n_iter_per_test, 993 | tolerance=tolerance, 994 | e_step_thresh=e_step_thresh, 995 | ) 996 | 997 | return p_z_given_d 998 | 999 | 1000 | class PLSA(BaseEstimator, TransformerMixin): 1001 | """Probabilistic Latent Semantic Analysis (pLSA) 1002 | 1003 | Given a bag-of-words matrix representation of a corpus of documents, where each row of the 1004 | matrix represents a document, and the jth element of the ith row is the count of the number of 1005 | times the jth vocabulary word occurs in the ith document, estimate matrices of conditional 1006 | probabilities P(z|d) and P(w|z) such that the product matrix of probabilities P(w|d) 1007 | maximises the likelihood of seeing the observed corpus data. Here P(z|d) represents the 1008 | probability of topic z given document d, P(w|z) represents the probability of word w given 1009 | topic z, and P(w|d) represents the probability of word w given document d. 1010 | 1011 | The algorithm proceeds using an Expectation-Maximization (EM) approach to attempt to maximise 1012 | the likelihood of the observed data under the estimated model. 1013 | 1014 | Parameters 1015 | ---------- 1016 | n_components: int (optional, default=10) 1017 | The number of topics to use in the matrix factorization. 1018 | 1019 | init: string or tuple (optional, default="random") 1020 | The intialization method to use. This should be one of: 1021 | * ``"random"`` 1022 | * ``"nndsvd"`` 1023 | * ``"nmf"`` 1024 | or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words). 1025 | 1026 | n_iter: int 1027 | The maximum number iterations of EM to perform 1028 | 1029 | n_iter_per_test: int 1030 | The number of iterations between tests for relative improvement in 1031 | log-likelihood. 1032 | 1033 | tolerance: float 1034 | The threshold of relative improvement in log-likelihood required to continue 1035 | iterations. 1036 | 1037 | e_step_thresh: float (optional, default=1e-32) 1038 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 1039 | below threshold then write a zero for P(z|w,d). 1040 | 1041 | random_state: int, RandomState instance or None, (optional, default: None) 1042 | If int, random_state is the seed used by the random number generator; 1043 | If RandomState instance, random_state is the random number generator; 1044 | If None, the random number generator is the RandomState instance used 1045 | by `np.random`. Used in in initialization. 1046 | 1047 | Attributes 1048 | ---------- 1049 | 1050 | components_: array of shape (n_topics, n_words) 1051 | The topic vectors produced by pLSA. Each row is a topic, which is a probability 1052 | distribution, over the vocabulary, giving the probability of each word given the topic ( 1053 | P(w|z)). 1054 | 1055 | embedding_: array of shape (n_docs, n_topics) 1056 | The document vectors produced by pLSA. 
Each row corresponds to a document, giving a 1057 | probability distribution, over the topic space, specifying the probability of each topic 1058 | occuring in the document (P(z|d)). 1059 | 1060 | training_data_: sparse matrix of shape (n_docs, n_words) 1061 | The original training data saved in sparse matrix format. 1062 | 1063 | References 1064 | ---------- 1065 | 1066 | Hofmann, Thomas. "Probabilistic latent semantic analysis." Proceedings of the Fifteenth 1067 | conference on Uncertainty in artificial intelligence. Morgan Kaufmann Publishers Inc., 1999. 1068 | 1069 | Hofmann, Thomas. "Unsupervised learning by probabilistic latent semantic analysis." 1070 | Machine learning 42.1-2 (2001): 177-196. 1071 | 1072 | """ 1073 | 1074 | def __init__( 1075 | self, 1076 | n_components=10, 1077 | init="random", 1078 | n_iter=100, 1079 | n_iter_per_test=10, 1080 | tolerance=0.001, 1081 | e_step_thresh=1e-32, 1082 | transform_random_seed=42, 1083 | random_state=None, 1084 | ): 1085 | 1086 | self.n_components = n_components 1087 | self.init = init 1088 | self.n_iter = n_iter 1089 | self.n_iter_per_test = n_iter_per_test 1090 | self.tolerance = tolerance 1091 | self.e_step_thresh = e_step_thresh 1092 | self.transform_random_seed = transform_random_seed 1093 | self.random_state = random_state 1094 | 1095 | def fit(self, X, y=None, sample_weight=None): 1096 | """Learn the pLSA model for the data X and return the document vectors. 1097 | 1098 | This is more efficient than calling fit followed by transform. 1099 | 1100 | Parameters 1101 | ---------- 1102 | X: array or sparse matrix of shape (n_docs, n_words) 1103 | The data matrix pLSA is attempting to fit to. 1104 | 1105 | y: Ignored 1106 | 1107 | sample_weight: array of shape (n_docs,) 1108 | Input document weights. 1109 | 1110 | Returns 1111 | ------- 1112 | self 1113 | """ 1114 | self.fit_transform(X, sample_weight=sample_weight) 1115 | return self 1116 | 1117 | def fit_transform(self, X, y=None, sample_weight=None): 1118 | """Learn the pLSA model for the data X and return the document vectors. 1119 | 1120 | This is more efficient than calling fit followed by transform. 1121 | 1122 | Parameters 1123 | ---------- 1124 | X: array or sparse matrix of shape (n_docs, n_words) 1125 | The data matrix pLSA is attempting to fit to. 1126 | 1127 | y: Ignored 1128 | 1129 | sample_weight: array of shape (n_docs,) 1130 | Input document weights. 1131 | 1132 | Returns 1133 | ------- 1134 | embedding: array of shape (n_docs, n_topics) 1135 | An embedding of the documents into a topic space. 
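        Examples
        --------
        A minimal usage sketch; ``docs`` stands in for any list of raw text
        documents and is not defined in this module:

        >>> from sklearn.feature_extraction.text import CountVectorizer
        >>> X = CountVectorizer().fit_transform(docs)  # docs: list of strings
        >>> doc_topics = PLSA(n_components=10).fit_transform(X)
        >>> # doc_topics has shape (n_docs, 10); each row is the distribution P(z|d)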
1136 | """ 1137 | 1138 | X = check_array(X, accept_sparse="csr") 1139 | X = standardize_input(X) 1140 | 1141 | if not issparse(X): 1142 | X = csr_matrix(X) 1143 | 1144 | sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32) 1145 | 1146 | if np.any(X.data < 0): 1147 | raise ValueError( 1148 | "PLSA is only valid for matrices with non-negative " "entries" 1149 | ) 1150 | 1151 | row_sums = np.array(X.sum(axis=1).T)[0] 1152 | good_rows = row_sums != 0 1153 | 1154 | if not np.all(good_rows): 1155 | zero_rows_found = True 1156 | data_for_fitting = X[good_rows] 1157 | else: 1158 | zero_rows_found = False 1159 | data_for_fitting = X 1160 | 1161 | U, V = plsa_fit( 1162 | data_for_fitting, 1163 | self.n_components, 1164 | sample_weight, 1165 | self.init, 1166 | self.n_iter, 1167 | self.n_iter_per_test, 1168 | self.tolerance, 1169 | self.e_step_thresh, 1170 | self.random_state, 1171 | ) 1172 | 1173 | if zero_rows_found: 1174 | self.embedding_ = np.zeros((X.shape[0], self.n_components)) 1175 | self.embedding_[good_rows] = U 1176 | else: 1177 | self.embedding_ = U 1178 | 1179 | self.components_ = V 1180 | self.training_data_ = X 1181 | 1182 | return self.embedding_ 1183 | 1184 | def transform(self, X, y=None): 1185 | """Transform the data X into the topic space of the fitted pLSA model. 1186 | 1187 | Parameters 1188 | ---------- 1189 | X: array or sparse matrix of shape (n_docs, n_words) 1190 | Corpus to be embedded into topic space 1191 | 1192 | y: Ignored 1193 | 1194 | Returns 1195 | ------- 1196 | embedding: array of shape (n_docs, n_topics) 1197 | An embedding of the documents X into the topic space. 1198 | """ 1199 | X = check_array(X, accept_sparse="csr") 1200 | random_state = check_random_state(self.transform_random_seed) 1201 | 1202 | # Set weights to 1 for all examples 1203 | sample_weight = _check_sample_weight(None, X, dtype=np.float32) 1204 | 1205 | if not issparse(X): 1206 | X = coo_matrix(X) 1207 | else: 1208 | X = X.tocoo() 1209 | 1210 | result = plsa_refit( 1211 | X, 1212 | self.components_, 1213 | sample_weight, 1214 | n_iter=50, 1215 | n_iter_per_test=5, 1216 | tolerance=0.001, 1217 | random_state=random_state, 1218 | ) 1219 | 1220 | return result 1221 | 1222 | def coherence(self, topic_num=None, n_words=20): 1223 | """Compute the average coherence of fitted topics, or of a single individual topic. 1224 | 1225 | Parameters 1226 | ---------- 1227 | topic_num: int (optional, default=None) 1228 | The topic number to compute coherence for. If ``topic_num`` is None then the average 1229 | coherence over all topics will be computed. 1230 | 1231 | n_words int (optional, default=20) 1232 | The number of topic words to score against. The top ``n_words`` words from the selected 1233 | topic will be used. 1234 | 1235 | Returns 1236 | ------- 1237 | topic_coherence: float 1238 | The requested coherence score. 
1239 | """ 1240 | 1241 | # Test for errors 1242 | if not isinstance(topic_num, int) and topic_num is not None: 1243 | raise ValueError("Topic number must be an integer or None.") 1244 | 1245 | if topic_num is None: 1246 | return mean_coherence(self.components_, self.training_data_, n_words) 1247 | elif topic_num >= 0 and topic_num < self.n_components: 1248 | return coherence(self.components_, topic_num, self.training_data_, n_words) 1249 | else: 1250 | raise ValueError( 1251 | "Topic number must be in range 0 to {}".format(self.n_components) 1252 | ) 1253 | 1254 | def log_lift(self, topic_num=None, n_words=20): 1255 | """Compute the average log lift of fitted topics, or of a single individual topic. 1256 | 1257 | Parameters 1258 | ---------- 1259 | topic_num: int (optional, default=None) 1260 | The topic number to compute log lift for. If ``topic_num`` is None then the average 1261 | log lift over all topics will be computed. 1262 | 1263 | n_words int (optional, default=20) 1264 | The number of topic words to score against. The top ``n_words`` words from the selected 1265 | topic will be used. 1266 | 1267 | 1268 | Returns 1269 | ------- 1270 | log_lift: float 1271 | The requested log lift score. 1272 | """ 1273 | 1274 | # Test for errors 1275 | if not isinstance(topic_num, int) and topic_num is not None: 1276 | raise ValueError("Topic number must be an integer or None.") 1277 | 1278 | if topic_num is None: 1279 | return mean_log_lift(self.components_, self.training_data_, n_words) 1280 | elif topic_num >= 0 and topic_num < self.n_components: 1281 | return log_lift(self.components_, topic_num, self.training_data_, n_words) 1282 | else: 1283 | raise ValueError( 1284 | "Topic number must be in range 0 to {}".format(self.n_components) 1285 | ) 1286 | -------------------------------------------------------------------------------- /enstop/streamed_plsa.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import numba 3 | 4 | from sklearn.base import BaseEstimator, TransformerMixin 5 | from sklearn.utils import check_array, check_random_state 6 | 7 | try: 8 | from sklearn.utils.validation import _check_sample_weight 9 | except ImportError: 10 | from enstop.utils import _check_sample_weight 11 | from scipy.sparse import issparse, csr_matrix, coo_matrix 12 | 13 | from enstop.utils import ( 14 | normalize, 15 | coherence, 16 | mean_coherence, 17 | log_lift, 18 | mean_log_lift, 19 | standardize_input, 20 | ) 21 | from enstop.plsa import log_likelihood, plsa_init 22 | 23 | 24 | @numba.njit( 25 | "f4[:,::1](i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],i8,i8,f4)", 26 | locals={ 27 | "k": numba.types.intp, 28 | "w": numba.types.uint32, 29 | "d": numba.types.uint32, 30 | "z": numba.types.uint16, 31 | "nz_idx": numba.types.uint32, 32 | "norm": numba.types.float32, 33 | }, 34 | fastmath=True, 35 | nogil=True, 36 | parallel=True, 37 | ) 38 | def plsa_e_step_on_a_block( 39 | X_rows, 40 | X_cols, 41 | X_vals, 42 | p_w_given_z, 43 | p_z_given_d, 44 | p_z_given_wd_block, 45 | block_start, 46 | block_end, 47 | probability_threshold=1e-32, 48 | ): 49 | """Perform the E-step of pLSA optimization. This amounts to computing the 50 | probability of each topic given each word document pair. The computation 51 | implements 52 | 53 | P(z|w,d) = \frac{P(z|w)P(d|z)}{\sum_{z=1}^k P(z|w)P(d|z)}. 
54 | 55 | This routine is optimized to work with sparse matrices such that P(z|w,d) 56 | is only computed for w, d such that X_{w,d} is non-zero, where X is the 57 | data matrix. 58 | 59 | To make this numba compilable the raw arrays defining the COO format sparse 60 | matrix must be passed separately. 61 | 62 | To keep memory use lower we only compute a block of P(z|w,d) -- specifically 63 | we compute it for all topics and a block of non-zeros of X. We can then use 64 | this block to complete a partial M step before computing the E step for 65 | the next block. 66 | 67 | 68 | Parameters 69 | ---------- 70 | X_rows: array of shape (nnz,) 71 | For each non-zero entry of X, the row of the entry. 72 | 73 | X_cols: array of shape (nnz,) 74 | For each non-zero entry of X, the column of the 75 | entry. 76 | 77 | X_vals: array of shape (nnz,) 78 | For each non-zero entry of X, the value of entry. 79 | 80 | p_w_given_z: array of shape (n_topics, n_words) 81 | The current estimates of values for P(w|z) 82 | 83 | p_z_given_d: array of shape (n_docs, n_topics) 84 | The current estimates of values for P(z|d) 85 | 86 | p_z_given_wd_block: array of shape (block_size, n_topics) 87 | The result array to write new estimates of P(z|w,d) to. 88 | 89 | block_start: int 90 | The index into nen-zeros of X where this block starts 91 | 92 | block_end: int 93 | The index into nen-zeros of X where this block ends 94 | 95 | probability_threshold: float (optional, default=1e-32) 96 | Option to promote sparsity. If the value of P(w|z)P(z|d) falls below 97 | threshold then write a zero for P(z|w,d). 98 | 99 | """ 100 | 101 | k = p_w_given_z.shape[0] 102 | 103 | for nz_idx in numba.prange(block_start, block_end): 104 | d = X_rows[nz_idx] 105 | w = X_cols[nz_idx] 106 | 107 | norm = 0.0 108 | for z in range(k): 109 | v = p_w_given_z[z, w] * p_z_given_d[d, z] 110 | if v > probability_threshold: 111 | p_z_given_wd_block[nz_idx - block_start, z] = v 112 | norm += v 113 | else: 114 | p_z_given_wd_block[nz_idx - block_start, z] = 0.0 115 | for z in range(k): 116 | if norm > 0: 117 | p_z_given_wd_block[nz_idx - block_start, z] /= norm 118 | 119 | return p_z_given_wd_block 120 | 121 | 122 | @numba.njit( 123 | "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1],i8,i8)", 124 | locals={ 125 | "k": numba.types.intp, 126 | "w": numba.types.uint32, 127 | "d": numba.types.uint32, 128 | "z": numba.types.uint16, 129 | "nz_idx": numba.types.uint32, 130 | "s": numba.types.float32, 131 | }, 132 | fastmath=True, 133 | nogil=True, 134 | ) 135 | def plsa_partial_m_step_on_a_block( 136 | X_rows, 137 | X_cols, 138 | X_vals, 139 | p_w_given_z, 140 | p_z_given_d, 141 | p_z_given_wd_block, 142 | norm_pwz, 143 | norm_pdz, 144 | block_start, 145 | block_end, 146 | ): 147 | """Perform a partial M-step of pLSA optimization. This amounts to using the 148 | estimates of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation 149 | implements 150 | 151 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 152 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 153 | 154 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 155 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 156 | 157 | To make this numba compilable the raw arrays defining the COO format sparse 158 | matrix must be passed separately. 
159 | 160 | Note that in order to not store the entire P(z|w,d) matrix in memory at once 161 | we only process a block of it here. The normalization in the above formulas 162 | will actually be computed after all blocks have been completed. 163 | 164 | Parameters 165 | ---------- 166 | X_rows: array of shape (nnz,) 167 | For each non-zero entry of X, the row of the entry. 168 | 169 | X_cols: array of shape (nnz,) 170 | For each non-zero entry of X, the column of the 171 | entry. 172 | 173 | X_vals: array of shape (nnz,) 174 | For each non-zero entry of X, the value of entry. 175 | 176 | p_w_given_z: array of shape (n_topics, n_words) 177 | The result array to write new estimates of P(w|z) to. 178 | 179 | p_z_given_d: array of shape (n_docs, n_topics) 180 | The result array to write new estimates of P(z|d) to. 181 | 182 | p_z_given_wd_block: array of shape (block_size, n_topics) 183 | The current estimates for P(z|w,d) for a block 184 | 185 | norm_pwz: array of shape (n_topics,) 186 | Auxilliary array used for storing row norms; this is passed in to save 187 | reallocations. 188 | 189 | norm_pdz: array of shape (n_docs,) 190 | Auxilliary array used for storing row norms; this is passed in to save 191 | reallocations. 192 | 193 | sample_weight: array of shape (n_docs,) 194 | Input document weights. 195 | 196 | block_start: int 197 | The index into nen-zeros of X where this block starts 198 | 199 | block_end: int 200 | The index into nen-zeros of X where this block ends 201 | 202 | """ 203 | 204 | k = p_z_given_wd_block.shape[1] 205 | 206 | for nz_idx in range(block_start, block_end): 207 | d = X_rows[nz_idx] 208 | w = X_cols[nz_idx] 209 | x = X_vals[nz_idx] 210 | 211 | for z in range(k): 212 | s = x * p_z_given_wd_block[nz_idx - block_start, z] 213 | 214 | p_w_given_z[z, w] += s 215 | p_z_given_d[d, z] += s 216 | 217 | norm_pwz[z] += s 218 | norm_pdz[d] += s 219 | 220 | 221 | @numba.njit( 222 | "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[:,::1],f4[::1],f4[::1],f4[::1],i8,i8)", 223 | locals={ 224 | "k": numba.types.intp, 225 | "w": numba.types.uint32, 226 | "d": numba.types.uint32, 227 | "z": numba.types.uint16, 228 | "nz_idx": numba.types.uint32, 229 | "s": numba.types.float32, 230 | }, 231 | fastmath=True, 232 | nogil=True, 233 | ) 234 | def plsa_partial_m_step_on_a_block_w_sample_weight( 235 | X_rows, 236 | X_cols, 237 | X_vals, 238 | p_w_given_z, 239 | p_z_given_d, 240 | p_z_given_wd_block, 241 | norm_pwz, 242 | norm_pdz, 243 | sample_weight, 244 | block_start, 245 | block_end, 246 | ): 247 | """Perform a partial M-step of pLSA optimization. This amounts to using the 248 | estimates of P(z|w,d) to estimate the values P(w|z) and P(z|d). The computation 249 | implements 250 | 251 | P(w|z) = \frac{\sum_{d\in D} X_{w,d}P(z|w,d)}{\sum_{d,z} X_{w,d}P(z|w,d)} 252 | P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w,d} X_{w,d}P(z|w,d)} 253 | 254 | This routine is optimized to work with sparse matrices such that P(z|w,d) is only 255 | computed for w, d such that X_{w,d} is non-zero, where X is the data matrix. 256 | 257 | To make this numba compilable the raw arrays defining the COO format sparse 258 | matrix must be passed separately. 259 | 260 | Note that in order to not store the entire P(z|w,d) matrix in memory at once 261 | we only process a block of it here. The normalization in the above formulas 262 | will actually be computed after all blocks have been completed. 
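    Unlike the unweighted variant, each word-topic contribution here is
    additionally scaled by the ``sample_weight`` of the document it came from.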
263 | 264 | Parameters 265 | ---------- 266 | X_rows: array of shape (nnz,) 267 | For each non-zero entry of X, the row of the entry. 268 | 269 | X_cols: array of shape (nnz,) 270 | For each non-zero entry of X, the column of the 271 | entry. 272 | 273 | X_vals: array of shape (nnz,) 274 | For each non-zero entry of X, the value of entry. 275 | 276 | p_w_given_z: array of shape (n_topics, n_words) 277 | The result array to write new estimates of P(w|z) to. 278 | 279 | p_z_given_d: array of shape (n_docs, n_topics) 280 | The result array to write new estimates of P(z|d) to. 281 | 282 | p_z_given_wd_block: array of shape (block_size, n_topics) 283 | The current estimates for P(z|w,d) for a block 284 | 285 | norm_pwz: array of shape (n_topics,) 286 | Auxilliary array used for storing row norms; this is passed in to save 287 | reallocations. 288 | 289 | norm_pdz: array of shape (n_docs,) 290 | Auxilliary array used for storing row norms; this is passed in to save 291 | reallocations. 292 | 293 | sample_weight: array of shape (n_docs,) 294 | Input document weights. 295 | 296 | block_start: int 297 | The index into nen-zeros of X where this block starts 298 | 299 | block_end: int 300 | The index into nen-zeros of X where this block ends 301 | 302 | """ 303 | 304 | k = p_z_given_wd_block.shape[1] 305 | 306 | for nz_idx in range(block_start, block_end): 307 | d = X_rows[nz_idx] 308 | w = X_cols[nz_idx] 309 | x = X_vals[nz_idx] 310 | 311 | for z in range(k): 312 | s = x * p_z_given_wd_block[nz_idx - block_start, z] 313 | t = s * sample_weight[d] 314 | 315 | p_w_given_z[z, w] += t 316 | p_z_given_d[d, z] += s 317 | 318 | norm_pwz[z] += t 319 | norm_pdz[d] += s 320 | 321 | 322 | @numba.njit(parallel=True, fastmath=True, nogil=True) 323 | def plsa_em_step( 324 | X_rows, 325 | X_cols, 326 | X_vals, 327 | prev_p_w_given_z, 328 | prev_p_z_given_d, 329 | next_p_w_given_z, 330 | next_p_z_given_d, 331 | p_z_given_wd_block, 332 | norm_pwz, 333 | norm_pdz, 334 | e_step_thresh=1e-32, 335 | ): 336 | 337 | k = p_z_given_wd_block.shape[1] 338 | n = prev_p_z_given_d.shape[0] 339 | m = prev_p_w_given_z.shape[1] 340 | 341 | block_size = p_z_given_wd_block.shape[0] 342 | n_blocks = (X_vals.shape[0] // block_size) + 1 343 | 344 | # zero out the norms for recomputation 345 | norm_pdz[:] = 0.0 346 | norm_pwz[:] = 0.0 347 | 348 | # Loop over blocks doing E step on a block and a partial M step 349 | for block_index in range(n_blocks): 350 | block_start = block_index * block_size 351 | block_end = min(X_vals.shape[0], block_start + block_size) 352 | 353 | plsa_e_step_on_a_block( 354 | X_rows, 355 | X_cols, 356 | X_vals, 357 | prev_p_w_given_z, 358 | prev_p_z_given_d, 359 | p_z_given_wd_block, 360 | block_start, 361 | block_end, 362 | e_step_thresh, 363 | ) 364 | plsa_partial_m_step_on_a_block( 365 | X_rows, 366 | X_cols, 367 | X_vals, 368 | next_p_w_given_z, 369 | next_p_z_given_d, 370 | p_z_given_wd_block, 371 | norm_pwz, 372 | norm_pdz, 373 | block_start, 374 | block_end, 375 | ) 376 | 377 | # Once complete we can normalize to complete the M step 378 | for z in numba.prange(k): 379 | if norm_pwz[z] > 0: 380 | for w in range(m): 381 | next_p_w_given_z[z, w] /= norm_pwz[z] 382 | for d in range(n): 383 | if norm_pdz[d] > 0: 384 | next_p_z_given_d[d, z] /= norm_pdz[d] 385 | 386 | # Zero out the old matrices, we'll swap them on return and 387 | # these will become the new "next" 388 | prev_p_w_given_z[:] = 0.0 389 | prev_p_z_given_d[:] = 0.0 390 | 391 | return next_p_w_given_z, next_p_z_given_d, prev_p_w_given_z, 
prev_p_z_given_d 392 | 393 | 394 | @numba.njit(parallel=True, fastmath=True, nogil=True) 395 | def plsa_em_step_w_sample_weights( 396 | X_rows, 397 | X_cols, 398 | X_vals, 399 | prev_p_w_given_z, 400 | prev_p_z_given_d, 401 | next_p_w_given_z, 402 | next_p_z_given_d, 403 | p_z_given_wd_block, 404 | norm_pwz, 405 | norm_pdz, 406 | sample_weight, 407 | e_step_thresh=1e-32, 408 | ): 409 | 410 | k = p_z_given_wd_block.shape[1] 411 | n = prev_p_z_given_d.shape[0] 412 | m = prev_p_w_given_z.shape[1] 413 | 414 | block_size = p_z_given_wd_block.shape[0] 415 | n_blocks = (X_vals.shape[0] // block_size) + 1 416 | 417 | # zero out the norms for recomputation 418 | norm_pdz[:] = 0.0 419 | norm_pwz[:] = 0.0 420 | 421 | # Loop over blocks doing E step on a block and a partial M step 422 | for block_index in range(n_blocks): 423 | block_start = block_index * block_size 424 | block_end = min(X_vals.shape[0], block_start + block_size) 425 | 426 | plsa_e_step_on_a_block( 427 | X_rows, 428 | X_cols, 429 | X_vals, 430 | prev_p_w_given_z, 431 | prev_p_z_given_d, 432 | p_z_given_wd_block, 433 | block_start, 434 | block_end, 435 | e_step_thresh, 436 | ) 437 | plsa_partial_m_step_on_a_block_w_sample_weight( 438 | X_rows, 439 | X_cols, 440 | X_vals, 441 | next_p_w_given_z, 442 | next_p_z_given_d, 443 | p_z_given_wd_block, 444 | norm_pwz, 445 | norm_pdz, 446 | sample_weight, 447 | block_start, 448 | block_end, 449 | ) 450 | 451 | # Once complete we can normalize to complete the M step 452 | for z in numba.prange(k): 453 | if norm_pwz[z] > 0: 454 | for w in range(m): 455 | next_p_w_given_z[z, w] /= norm_pwz[z] 456 | for d in range(n): 457 | if norm_pdz[d] > 0: 458 | next_p_z_given_d[d, z] /= norm_pdz[d] 459 | 460 | # Zero out the old matrices, we'll swap them on return and 461 | # these will become the new "next" 462 | prev_p_w_given_z[:] = 0.0 463 | prev_p_z_given_d[:] = 0.0 464 | 465 | return next_p_w_given_z, next_p_z_given_d, prev_p_w_given_z, prev_p_z_given_d 466 | 467 | 468 | @numba.njit(fastmath=True, nogil=True) 469 | def plsa_fit_inner_blockwise( 470 | X_rows, 471 | X_cols, 472 | X_vals, 473 | p_w_given_z, 474 | p_z_given_d, 475 | sample_weight, 476 | block_size=65536, 477 | n_iter=100, 478 | n_iter_per_test=10, 479 | tolerance=0.001, 480 | e_step_thresh=1e-32, 481 | use_sample_weights=False, 482 | ): 483 | """Internal loop of EM steps required to optimize pLSA, along with relative 484 | convergence tests with respect to the log-likelihood of observing the data under 485 | the model. 486 | 487 | The EM looping will stop when either ``n_iter`` iterations have been reached, 488 | or if the relative improvement in log-likelihood over the last 489 | ``n_iter_per_test`` steps is under ``threshold``. 490 | 491 | This function is designed to wrap the internals of the EM process in a numba 492 | compilable loop, and is not the preferred entry point for fitting a plsa model. 493 | 494 | Parameters 495 | ---------- 496 | X_rows: array of shape (nnz,) 497 | For each non-zero entry of X, the row of the entry. 498 | 499 | X_cols: array of shape (nnz,) 500 | For each non-zero entry of X, the column of the 501 | entry. 502 | 503 | X_vals: array of shape (nnz,) 504 | For each non-zero entry of X, the value of entry. 505 | 506 | p_w_given_z: array of shape (n_topics, n_words) 507 | The current estimates of values for P(w|z) 508 | 509 | p_z_given_d: array of shape (n_docs, n_topics) 510 | The current estimates of values for P(z|d) 511 | 512 | sample_weight: array of shape (n_docs,) 513 | Input document weights. 
514 | 515 | block_size: int (optional, default=65536) 516 | The number of nonzero entries of X to process in a block. The larger this 517 | value the faster the compute may go, but at higher memory cost. 518 | 519 | n_iter: int 520 | The maximum number iterations of EM to perform 521 | 522 | n_iter_per_test: int 523 | The number of iterations between tests for 524 | relative improvement in log-likelihood. 525 | 526 | tolerance: float 527 | The threshold of relative improvement in 528 | log-likelihood required to continue iterations. 529 | 530 | e_step_thresh: float (optional, default=1e-32) 531 | Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls 532 | below threshold then write a zero for P(z|w,d). 533 | 534 | Returns 535 | ------- 536 | p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words) 537 | The resulting model values of P(z|d) and P(w|z) 538 | 539 | """ 540 | k = p_z_given_d.shape[1] 541 | n = p_z_given_d.shape[0] 542 | 543 | p_z_given_wd_block = np.zeros((block_size, k), dtype=np.float32) 544 | 545 | norm_pwz = np.zeros(k, dtype=np.float32) 546 | norm_pdz = np.zeros(n, dtype=np.float32) 547 | 548 | previous_log_likelihood = log_likelihood( 549 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight, 550 | ) 551 | 552 | next_p_w_given_z = np.zeros_like(p_w_given_z) 553 | next_p_z_given_d = np.zeros_like(p_z_given_d) 554 | 555 | for i in range(n_iter): 556 | 557 | if use_sample_weights: 558 | ( 559 | p_w_given_z, 560 | p_z_given_d, 561 | next_p_w_given_z, 562 | next_p_z_given_d, 563 | ) = plsa_em_step_w_sample_weights( 564 | X_rows, 565 | X_cols, 566 | X_vals, 567 | p_w_given_z, 568 | p_z_given_d, 569 | next_p_w_given_z, 570 | next_p_z_given_d, 571 | p_z_given_wd_block, 572 | norm_pwz, 573 | norm_pdz, 574 | sample_weight, 575 | e_step_thresh, 576 | ) 577 | else: 578 | p_w_given_z, p_z_given_d, next_p_w_given_z, next_p_z_given_d = plsa_em_step( 579 | X_rows, 580 | X_cols, 581 | X_vals, 582 | p_w_given_z, 583 | p_z_given_d, 584 | next_p_w_given_z, 585 | next_p_z_given_d, 586 | p_z_given_wd_block, 587 | norm_pwz, 588 | norm_pdz, 589 | e_step_thresh, 590 | ) 591 | 592 | if i % n_iter_per_test == 0: 593 | current_log_likelihood = log_likelihood( 594 | X_rows, X_cols, X_vals, p_w_given_z, p_z_given_d, sample_weight, 595 | ) 596 | change = np.abs(current_log_likelihood - previous_log_likelihood) 597 | if change / np.abs(current_log_likelihood) < tolerance: 598 | break 599 | else: 600 | previous_log_likelihood = current_log_likelihood 601 | 602 | return p_z_given_d, p_w_given_z 603 | 604 | 605 | def plsa_fit( 606 | X, 607 | k, 608 | sample_weight, 609 | init="random", 610 | block_size=65536, 611 | n_iter=100, 612 | n_iter_per_test=10, 613 | tolerance=0.001, 614 | e_step_thresh=1e-32, 615 | random_state=None, 616 | ): 617 | """Fit a pLSA model to a data matrix ``X`` with ``k`` topics, an initialized 618 | according to ``init``. This will run an EM method to optimize estimates of P(z|d) 619 | and P(w|z). The will perform at most ``n_iter`` EM step iterations, 620 | while checking for relative improvement of the log-likelihood of the data under 621 | the model every ``n_iter_per_test`` iterations, and stops early if that is under 622 | ``tolerance``. 623 | 624 | Parameters 625 | ---------- 626 | X: sparse matrix of shape (n_docs, n_words) 627 | The data matrix pLSA is attempting to fit to. 628 | 629 | k: int 630 | The number of topics for pLSA to fit with. 

    sample_weight: array of shape (n_docs,)
        Input document weights.

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words).

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for
        relative improvement in log-likelihood.

    tolerance: float
        The threshold of relative improvement in
        log-likelihood required to continue iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    p_z_given_d, p_w_given_z: arrays of shapes (n_docs, n_topics) and (n_topics, n_words)
        The resulting model values of P(z|d) and P(w|z)

    """

    rng = check_random_state(random_state)
    p_z_given_d, p_w_given_z = plsa_init(X, k, init=init, rng=rng)
    p_z_given_d = p_z_given_d.astype(np.float32, order="C")
    p_w_given_z = p_w_given_z.astype(np.float32, order="C")

    use_sample_weights = np.any(sample_weight != 1.0)

    A = X.tocoo().astype(np.float32)

    p_z_given_d, p_w_given_z = plsa_fit_inner_blockwise(
        A.row,
        A.col,
        A.data,
        p_w_given_z,
        p_z_given_d,
        sample_weight,
        block_size=block_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
        use_sample_weights=use_sample_weights,
    )

    return p_z_given_d, p_w_given_z


@numba.njit(
    "void(i4[::1],i4[::1],f4[::1],f4[:,::1],f4[:,::1],f4[::1],i8,i8)",
    locals={
        "k": numba.types.intp,
        "w": numba.types.uint32,
        "d": numba.types.uint32,
        "z": numba.types.uint16,
        "nz_idx": numba.types.uint32,
        "s": numba.types.float32,
    },
    fastmath=True,
    nogil=True,
)
def plsa_partial_refit_m_step_on_a_block(
    X_rows,
    X_cols,
    X_vals,
    p_z_given_d,
    p_z_given_wd_block,
    norm_pdz,
    block_start,
    block_end,
):
    """Perform a partial M-step of pLSA refitting. This amounts to using the
    estimates of P(z|w,d) to update the values of P(z|d), with the topics P(w|z)
    held fixed. The computation implements

    P(z|d) = \frac{\sum_{w\in V} X_{w,d}P(z|w,d)}{\sum_{w\in V}\sum_{z} X_{w,d}P(z|w,d)}

    This routine is optimized to work with sparse matrices such that P(z|w,d) is only
    computed for w, d such that X_{w,d} is non-zero, where X is the data matrix.

    To make this numba compilable the raw arrays defining the COO format sparse
    matrix must be passed separately.

    Note that in order to not store the entire P(z|w,d) matrix in memory at once
    we only process a block of it here. The normalization in the formula above
    will actually be computed after all blocks have been completed.

    Parameters
    ----------
    X_rows: array of shape (nnz,)
        For each non-zero entry of X, the row of the entry.

    X_cols: array of shape (nnz,)
        For each non-zero entry of X, the column of the entry.

    X_vals: array of shape (nnz,)
        For each non-zero entry of X, the value of the entry.

    p_z_given_d: array of shape (n_docs, n_topics)
        The result array to write new estimates of P(z|d) to.

    p_z_given_wd_block: array of shape (block_size, n_topics)
        The current estimates for P(z|w,d) for a block

    norm_pdz: array of shape (n_docs,)
        Auxiliary array used for storing row norms; this is passed in to save
        reallocations.

    block_start: int
        The index into the non-zeros of X where this block starts

    block_end: int
        The index into the non-zeros of X where this block ends

    """

    k = p_z_given_wd_block.shape[1]

    for nz_idx in range(block_start, block_end):
        d = X_rows[nz_idx]
        w = X_cols[nz_idx]
        x = X_vals[nz_idx]

        for z in range(k):
            s = x * p_z_given_wd_block[nz_idx - block_start, z]
            p_z_given_d[d, z] += s
            norm_pdz[d] += s


@numba.njit()
def plsa_refit_em_step(
    X_rows,
    X_cols,
    X_vals,
    p_w_given_z,
    prev_p_z_given_d,
    next_p_z_given_d,
    p_z_given_wd_block,
    sample_weight,
    norm_pdz,
    e_step_thresh=1e-32,
):

    k = p_z_given_wd_block.shape[1]
    n = prev_p_z_given_d.shape[0]

    block_size = p_z_given_wd_block.shape[0]
    n_blocks = (X_vals.shape[0] // block_size) + 1

    # zero out the norms for recomputation
    norm_pdz[:] = 0.0

    # Loop over blocks doing E step on a block and a partial M step
    for block_index in range(n_blocks):
        block_start = block_index * block_size
        block_end = min(X_vals.shape[0], block_start + block_size)

        plsa_e_step_on_a_block(
            X_rows,
            X_cols,
            X_vals,
            p_w_given_z,
            prev_p_z_given_d,
            p_z_given_wd_block,
            block_start,
            block_end,
            e_step_thresh,
        )
        plsa_partial_refit_m_step_on_a_block(
            X_rows,
            X_cols,
            X_vals,
            next_p_z_given_d,
            p_z_given_wd_block,
            norm_pdz,
            block_start,
            block_end,
        )

    # Once complete we can normalize to complete the M step
    for z in numba.prange(k):
        for d in range(n):
            if norm_pdz[d] > 0:
                next_p_z_given_d[d, z] /= norm_pdz[d]

    # Zero out the old matrix; we'll swap them on return and
    # this will become the new "next"
    prev_p_z_given_d[:] = 0.0

    return next_p_z_given_d, prev_p_z_given_d


@numba.njit(locals={"e_step_thresh": numba.types.float32,}, fastmath=True, nogil=True)
def plsa_refit_inner_blockwise(
    X_rows,
    X_cols,
    X_vals,
    topics,
    p_z_given_d,
    sample_weight,
    block_size=65536,
    n_iter=50,
    n_iter_per_test=10,
    tolerance=0.005,
    e_step_thresh=1e-32,
):
    """Optimized routine for refitting values of P(z|d) given a fixed set of topics (
    i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics
    (given, for example, by an ensemble result).

    This routine is optimized to work with sparse matrices and only compute values
    for w, d such that X_{w,d} is non-zero.

    To make this numba compilable the raw arrays defining the COO format sparse
    matrix must be passed separately.

    Parameters
    ----------
    X_rows: array of shape (nnz,)
        For each non-zero entry of X, the row of the entry.

    X_cols: array of shape (nnz,)
        For each non-zero entry of X, the column of the entry.

    X_vals: array of shape (nnz,)
        For each non-zero entry of X, the value of the entry.

    topics: array of shape (n_topics, n_words)
        The fixed topics against which to fit the values of P(z|d).

    p_z_given_d: array of shape (n_docs, n_topics)
        The current estimates of values for P(z|d)

    sample_weight: array of shape (n_docs,)
        Input document weights.

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to continue
        iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    Returns
    -------
    p_z_given_d: array of shape (n_docs, n_topics)
        The resulting model values of P(z|d) fit against the fixed topics.

    """
    k = topics.shape[0]
    p_z_given_wd_block = np.zeros((block_size, k), dtype=np.float32)

    norm_pdz = np.zeros(p_z_given_d.shape[0], dtype=np.float32)

    previous_log_likelihood = log_likelihood(
        X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight,
    )

    next_p_z_given_d = np.zeros_like(p_z_given_d)

    for i in range(n_iter):

        # Pass e_step_thresh through so the sparsity threshold is honoured
        # during refitting.
        p_z_given_d, next_p_z_given_d = plsa_refit_em_step(
            X_rows,
            X_cols,
            X_vals,
            topics,
            p_z_given_d,
            next_p_z_given_d,
            p_z_given_wd_block,
            sample_weight,
            norm_pdz,
            e_step_thresh,
        )

        if i % n_iter_per_test == 0:
            current_log_likelihood = log_likelihood(
                X_rows, X_cols, X_vals, topics, p_z_given_d, sample_weight,
            )
            if current_log_likelihood > 0:
                change = np.abs(current_log_likelihood - previous_log_likelihood)
                if change / np.abs(current_log_likelihood) < tolerance:
                    break
                else:
                    previous_log_likelihood = current_log_likelihood

    return p_z_given_d


def plsa_refit(
    X,
    topics,
    sample_weight,
    block_size=65536,
    n_iter=50,
    n_iter_per_test=10,
    tolerance=0.005,
    e_step_thresh=1e-32,
    random_state=None,
):
    """Routine for refitting values of P(z|d) given a fixed set of topics (
    i.e. P(w|z)). This allows fitting document vectors to a predefined set of topics
    (given, for example, by an ensemble result).

    Parameters
    ----------
    X: sparse matrix of shape (n_docs, n_words)
        The data matrix pLSA is attempting to fit to.

    topics: array of shape (n_topics, n_words)
        The fixed topics against which to fit the values of P(z|d).

    sample_weight: array of shape (n_docs,)
        Input document weights.

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to continue
        iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Returns
    -------
    p_z_given_d: array of shape (n_docs, n_topics)
        The resulting document vectors P(z|d) fit against the fixed topics.

    """
    A = X.tocoo().astype(np.float32)
    k = topics.shape[0]

    rng = check_random_state(random_state)
    p_z_given_d = rng.rand(A.shape[0], k)
    normalize(p_z_given_d, axis=1)
    p_z_given_d = p_z_given_d.astype(np.float32)
    topics = topics.astype(np.float32)

    p_z_given_d = plsa_refit_inner_blockwise(
        A.row,
        A.col,
        A.data,
        topics,
        p_z_given_d,
        sample_weight,
        block_size=block_size,
        n_iter=n_iter,
        n_iter_per_test=n_iter_per_test,
        tolerance=tolerance,
        e_step_thresh=e_step_thresh,
    )

    return p_z_given_d


class StreamedPLSA(BaseEstimator, TransformerMixin):
    """Probabilistic Latent Semantic Analysis (pLSA)

    Given a bag-of-words matrix representation of a corpus of documents, where each row of the
    matrix represents a document, and the jth element of the ith row is the count of the number of
    times the jth vocabulary word occurs in the ith document, estimate matrices of conditional
    probabilities P(z|d) and P(w|z) such that the product matrix of probabilities P(w|d)
    maximises the likelihood of seeing the observed corpus data. Here P(z|d) represents the
    probability of topic z given document d, P(w|z) represents the probability of word w given
    topic z, and P(w|d) represents the probability of word w given document d.

    The algorithm proceeds using an Expectation-Maximization (EM) approach to attempt to maximise
    the likelihood of the observed data under the estimated model.

    The StreamedPLSA uses a block-based approach to compute partial E-step/M-step
    pairs to lower overall memory usage. This is particularly useful for very large
    training data and/or large numbers of topics.

    Parameters
    ----------
    n_components: int (optional, default=10)
        The number of topics to use in the matrix factorization.

    init: string or tuple (optional, default="random")
        The initialization method to use. This should be one of:
            * ``"random"``
            * ``"nndsvd"``
            * ``"nmf"``
        or a tuple of two ndarrays of shape (n_docs, n_topics) and (n_topics, n_words).

    block_size: int (optional, default=65536)
        The number of nonzero entries of X to process in a block. The larger this
        value the faster the compute may go, but at higher memory cost.

    n_iter: int
        The maximum number of iterations of EM to perform

    n_iter_per_test: int
        The number of iterations between tests for relative improvement in
        log-likelihood.

    tolerance: float
        The threshold of relative improvement in log-likelihood required to continue
        iterations.

    e_step_thresh: float (optional, default=1e-32)
        Option to promote sparsity. If the value of P(w|z)P(z|d) in the E step falls
        below threshold then write a zero for P(z|w,d).

    transform_random_seed: int (optional, default=42)
        The random seed used to initialize P(z|d) when refitting new documents in
        ``transform``.

    random_state: int, RandomState instance or None, (optional, default: None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`. Used in initialization.

    Attributes
    ----------

    components_: array of shape (n_topics, n_words)
        The topic vectors produced by pLSA. Each row is a topic, which is a probability
        distribution over the vocabulary, giving the probability of each word given the
        topic (P(w|z)).

    embedding_: array of shape (n_docs, n_topics)
        The document vectors produced by pLSA. Each row corresponds to a document, giving a
        probability distribution over the topic space, specifying the probability of each
        topic occurring in the document (P(z|d)).

    training_data_: sparse matrix of shape (n_docs, n_words)
        The original training data saved in sparse matrix format.

    References
    ----------

    Hofmann, Thomas. "Probabilistic latent semantic analysis." Proceedings of the Fifteenth
    conference on Uncertainty in artificial intelligence. Morgan Kaufmann Publishers Inc., 1999.

    Hofmann, Thomas. "Unsupervised learning by probabilistic latent semantic analysis."
    Machine Learning 42.1-2 (2001): 177-196.

    """

    def __init__(
        self,
        n_components=10,
        init="random",
        block_size=65536,
        n_iter=100,
        n_iter_per_test=10,
        tolerance=0.001,
        e_step_thresh=1e-32,
        transform_random_seed=42,
        random_state=None,
    ):

        self.n_components = n_components
        self.init = init
        self.block_size = block_size
        self.n_iter = n_iter
        self.n_iter_per_test = n_iter_per_test
        self.tolerance = tolerance
        self.e_step_thresh = e_step_thresh
        self.transform_random_seed = transform_random_seed
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Learn the pLSA model for the data X.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            The data matrix pLSA is attempting to fit to.

        y: Ignored

        sample_weight: array of shape (n_docs,)
            Input document weights.

        Returns
        -------
        self
        """
        self.fit_transform(X, sample_weight=sample_weight)
        return self

    def fit_transform(self, X, y=None, sample_weight=None):
        """Learn the pLSA model for the data X and return the document vectors.

        This is more efficient than calling fit followed by transform.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            The data matrix pLSA is attempting to fit to.

        y: Ignored

        sample_weight: array of shape (n_docs,)
            Input document weights.

        Returns
        -------
        embedding: array of shape (n_docs, n_topics)
            An embedding of the documents into a topic space.
        """

        X = check_array(X, accept_sparse="csr")
        X = standardize_input(X)

        if not issparse(X):
            X = csr_matrix(X)

        sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)

        if np.any(X.data < 0):
            raise ValueError(
                "PLSA is only valid for matrices with non-negative entries"
            )

        row_sums = np.array(X.sum(axis=1).T)[0]
        good_rows = row_sums != 0

        if not np.all(good_rows):
            zero_rows_found = True
            data_for_fitting = X[good_rows]
            # Only weight the documents that are actually being fit
            sample_weight_for_fitting = sample_weight[good_rows]
        else:
            zero_rows_found = False
            data_for_fitting = X
            sample_weight_for_fitting = sample_weight

        U, V = plsa_fit(
            data_for_fitting,
            self.n_components,
            sample_weight_for_fitting,
            init=self.init,
            block_size=self.block_size,
            n_iter=self.n_iter,
            n_iter_per_test=self.n_iter_per_test,
            tolerance=self.tolerance,
            e_step_thresh=self.e_step_thresh,
            random_state=self.random_state,
        )

        if zero_rows_found:
            self.embedding_ = np.zeros((X.shape[0], self.n_components))
            self.embedding_[good_rows] = U
        else:
            self.embedding_ = U

        self.components_ = V
        self.training_data_ = X

        return self.embedding_

    def transform(self, X, y=None, sample_weight=None):
        """Transform the data X into the topic space of the fitted pLSA model.

        Parameters
        ----------
        X: array or sparse matrix of shape (n_docs, n_words)
            Corpus to be embedded into topic space

        y: Ignored

        sample_weight: array of shape (n_docs,)
            Input document weights.

        Returns
        -------
        embedding: array of shape (n_docs, n_topics)
            An embedding of the documents X into the topic space.
        """
        X = check_array(X, accept_sparse="csr")
        sample_weight = _check_sample_weight(sample_weight, X, dtype=np.float32)
        random_state = check_random_state(self.transform_random_seed)

        if not issparse(X):
            X = coo_matrix(X)
        else:
            X = X.tocoo()

        result = plsa_refit(
            X,
            self.components_,
            sample_weight,
            block_size=self.block_size,
            n_iter=50,
            n_iter_per_test=5,
            tolerance=0.001,
            random_state=random_state,
        )

        return result

    def coherence(self, topic_num=None, n_words=20):
        """Compute the average coherence of fitted topics, or of a single individual topic.

        Parameters
        ----------
        topic_num: int (optional, default=None)
            The topic number to compute coherence for. If ``topic_num`` is None then the average
            coherence over all topics will be computed.

        n_words: int (optional, default=20)
            The number of topic words to score against. The top ``n_words`` words from the selected
            topic will be used.

        Returns
        -------
        topic_coherence: float
            The requested coherence score.
        """

        # Test for errors
        if not isinstance(topic_num, int) and topic_num is not None:
            raise ValueError("Topic number must be an integer or None.")

        if topic_num is None:
            return mean_coherence(self.components_, self.training_data_, n_words)
        elif topic_num >= 0 and topic_num < self.n_components:
            return coherence(self.components_, topic_num, self.training_data_, n_words)
        else:
            raise ValueError(
                "Topic number must be in range 0 to {}".format(self.n_components - 1)
            )

    def log_lift(self, topic_num=None, n_words=20):
        """Compute the average log lift of fitted topics, or of a single individual topic.

        Parameters
        ----------
        topic_num: int (optional, default=None)
            The topic number to compute log lift for. If ``topic_num`` is None then the average
            log lift over all topics will be computed.

        n_words: int (optional, default=20)
            The number of topic words to score against. The top ``n_words`` words from the selected
            topic will be used.

        Returns
        -------
        log_lift: float
            The requested log lift score.
        """

        # Test for errors
        if not isinstance(topic_num, int) and topic_num is not None:
            raise ValueError("Topic number must be an integer or None.")

        if topic_num is None:
            return mean_log_lift(self.components_, self.training_data_, n_words)
        elif topic_num >= 0 and topic_num < self.n_components:
            return log_lift(self.components_, topic_num, self.training_data_, n_words)
        else:
            raise ValueError(
                "Topic number must be in range 0 to {}".format(self.n_components - 1)
            )
--------------------------------------------------------------------------------
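A minimal usage sketch of the ``StreamedPLSA`` estimator defined above. The toy corpus and the parameter choices here are illustrative only (they are not part of the library), and the snippet assumes ``enstop`` and scikit-learn are installed::

    from sklearn.feature_extraction.text import CountVectorizer

    from enstop import StreamedPLSA

    # Toy corpus, made up purely for illustration.
    documents = [
        "sparse matrix factorization for topic models",
        "probabilistic latent semantic analysis of text",
        "expectation maximization fits the topic model",
        "documents are rows and vocabulary words are columns",
    ]

    # Bag-of-words count matrix: one row per document, one column per word.
    X = CountVectorizer().fit_transform(documents)

    model = StreamedPLSA(n_components=2, block_size=4096, random_state=0)

    # P(z|d): document-topic probabilities, shape (n_docs, n_topics).
    doc_vectors = model.fit_transform(X)

    # P(w|z): topic-word probabilities, shape (n_topics, n_words).
    topic_vectors = model.components_

    # Refit held-out documents against the already-learned, fixed topics.
    held_out_vectors = model.transform(X[:2])

    print(doc_vectors.shape, topic_vectors.shape, held_out_vectors.shape)

After fitting, ``model.coherence()`` and ``model.log_lift()`` score the stored topics against ``training_data_`` exactly as in the methods above; ``block_size`` trades memory for speed, since only one (block_size, n_topics) slice of P(z|w,d) is held in memory at a time.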