├── .coveragerc
├── .github
│   └── workflows
│       ├── package-publish.yml
│       └── package-test.yml
├── .gitignore
├── .readthedocs.yml
├── LICENSE
├── MANIFEST.in
├── README.md
├── dataset
│   └── SearchSnippets.txt.gz
├── docs
│   ├── Makefile
│   ├── make.bat
│   ├── requirements.txt
│   └── source
│       ├── _static
│       │   ├── coherence.svg
│       │   └── perplexity.svg
│       ├── benchmarks.rst
│       ├── bitermplus.metrics.rst
│       ├── bitermplus.rst
│       ├── bitermplus.util.rst
│       ├── conf.py
│       ├── index.rst
│       ├── install.rst
│       └── tutorial.rst
├── images
│   └── topics_terms_plots.png
├── pyproject.toml
├── setup.py
├── src
│   └── bitermplus
│       ├── __init__.py
│       ├── _btm.pyx
│       ├── _metrics.pyx
│       └── _util.py
└── tests
    └── test_btm.py
/.coveragerc:
--------------------------------------------------------------------------------
1 | [run]
2 | plugins = Cython.Coverage
3 | source = src/bitermplus
--------------------------------------------------------------------------------
/.github/workflows/package-publish.yml:
--------------------------------------------------------------------------------
 1 | name: Package Upload to PyPI
2 |
3 | on:
4 | release:
5 | types: [created]
6 |
7 | jobs:
8 | deploy:
9 |
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - uses: actions/checkout@v2
14 | - name: Set up Python
15 | uses: actions/setup-python@v2
16 | with:
17 | python-version: '3.x'
18 | - name: Install dependencies
19 | run: |
20 | python -m pip install --upgrade pip
21 | pip install setuptools wheel cython build twine
22 | - name: Build and publish
23 | env:
24 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
26 | run: |
27 | python -m build -s
28 | twine upload dist/*
29 |
--------------------------------------------------------------------------------
/.github/workflows/package-test.yml:
--------------------------------------------------------------------------------
1 | name: Package Test
2 |
3 | on: [push]
4 |
5 | jobs:
6 | build:
7 | runs-on: ubuntu-latest
8 | strategy:
9 | matrix:
10 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"]
11 | steps:
12 | - uses: actions/checkout@v4
13 | - name: Set up Python ${{ matrix.python-version }}
14 | uses: actions/setup-python@v5
15 | with:
16 | python-version: ${{ matrix.python-version }}
17 | - name: Install dependencies
18 | run: |
19 | python -m pip install --upgrade pip
20 | pip install .
21 | pip install .[test]
22 | - name: Testing package with pytest
23 | run: |
24 | cythonize -i src/bitermplus/*.pyx
25 | pytest -s
26 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 | *.c
9 |
10 | # Distribution / packaging
11 | .Python
12 | build/
13 | docs/build/
14 | docs/source/_build/
15 | develop-eggs/
16 | dist/
17 | downloads/
18 | eggs/
19 | .eggs/
20 | lib/
21 | lib64/
22 | parts/
23 | sdist/
24 | var/
25 | wheels/
26 | pip-wheel-metadata/
27 | share/python-wheels/
28 | *.egg-info/
29 | .installed.cfg
30 | *.egg
31 | MANIFEST
32 |
33 | # PyInstaller
34 | # Usually these files are written by a python script from a template
35 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
36 | *.manifest
37 | *.spec
38 |
39 | # Installer logs
40 | pip-log.txt
41 | pip-delete-this-directory.txt
42 |
43 | # Unit test / coverage reports
44 | htmlcov/
45 | .tox/
46 | .nox/
47 | .coverage
48 | .coverage.*
49 | .cache
50 | nosetests.xml
51 | coverage.xml
52 | *.cover
53 | *.py,cover
54 | .hypothesis/
55 | .pytest_cache/
56 | src/**/*.html
57 |
58 | # Translations
59 | *.mo
60 | *.pot
61 |
62 | # Django stuff:
63 | *.log
64 | local_settings.py
65 | db.sqlite3
66 | db.sqlite3-journal
67 |
68 | # Flask stuff:
69 | instance/
70 | .webassets-cache
71 |
72 | # Scrapy stuff:
73 | .scrapy
74 |
75 | # Sphinx documentation
76 | docs/_build/
77 |
78 | # PyBuilder
79 | target/
80 |
81 | # Jupyter Notebook
82 | .ipynb_checkpoints
83 |
84 | # IPython
85 | profile_default/
86 | ipython_config.py
87 |
88 | # pyenv
89 | .python-version
90 |
91 | # pipenv
92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
95 | # install all needed dependencies.
96 | #Pipfile.lock
97 |
98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
99 | __pypackages__/
100 |
101 | # Celery stuff
102 | celerybeat-schedule
103 | celerybeat.pid
104 |
105 | # SageMath parsed files
106 | *.sage.py
107 |
108 | # Environments
109 | .env
110 | .venv
111 | env/
112 | venv/
113 | ENV/
114 | env.bak/
115 | venv.bak/
116 |
117 | # Spyder project settings
118 | .spyderproject
119 | .spyproject
120 |
121 | # Rope project settings
122 | .ropeproject
123 |
124 | # mkdocs documentation
125 | /site
126 |
127 | # mypy
128 | .mypy_cache/
129 | .dmypy.json
130 | dmypy.json
131 |
132 | # Pyre type checker
133 | .pyre/
134 |
135 | # vscode
136 | .vscode
137 |
138 | # pickles
139 | *.pickle
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | build:
4 | os: ubuntu-22.04
5 | tools:
6 | python: "3.11"
7 |
8 | sphinx:
9 | configuration: docs/source/conf.py
10 |
11 | formats:
12 | - pdf
13 |
14 | python:
15 | install:
16 | - requirements: docs/requirements.txt
17 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2021 Maksim Terpilowski
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include src/bitermplus *.pyx
2 | include LICENSE
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Biterm Topic Model
2 |
3 | 
 4 | [Documentation Status](https://bitermplus.readthedocs.io/en/latest/?badge=latest)
 5 | [Codacy Badge](https://www.codacy.com/gh/maximtrp/bitermplus/dashboard?utm_source=github.com&utm_medium=referral&utm_content=maximtrp/bitermplus&utm_campaign=Badge_Grade)
 6 | [Issues](https://github.com/maximtrp/bitermplus/issues)
 7 | [Downloads](https://pepy.tech/project/bitermplus)
8 | 
9 |
 10 | *Bitermplus* implements the [biterm topic model](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.402.4032&rep=rep1&type=pdf) for short texts introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi Cheng. It is a cythonized version of the original [BTM](https://github.com/xiaohuiyan/BTM) implementation. This package can also compute *perplexity*, *semantic coherence*, and *entropy* metrics.
11 |
12 | ## Donate
13 |
14 | If you find this package useful, please consider donating any amount of money. This will help me spend more time on supporting open-source software.
15 |
16 |
17 |
18 | ## Requirements
19 |
20 | * cython
21 | * numpy
22 | * pandas
23 | * scipy
24 | * scikit-learn
25 | * tqdm
26 |
27 | ## Setup
28 |
29 | ### Linux and Windows
30 |
 31 | Be sure to install Python headers if they are not included in your Python installation. For example, on Ubuntu it can be done using this command (where `x` is the Python minor version number):
32 |
33 | ```bash
34 | sudo apt-get install python3.x-dev
35 | ```
36 |
 37 | Apart from that, there should be no issues with installing *bitermplus* under these OSes. You can install the package directly from PyPI:
38 |
39 | ```bash
40 | pip install bitermplus
41 | ```
42 |
43 | Or from this repo:
44 |
45 | ```bash
46 | pip install git+https://github.com/maximtrp/bitermplus.git
47 | ```
48 |
49 | ### Mac OS
50 |
 51 | First, you need to install the Xcode Command Line Tools and [Homebrew](https://brew.sh).
52 | Then, install `libomp` using `brew`:
53 |
54 | ```bash
55 | xcode-select --install
56 | brew install libomp
57 | pip3 install bitermplus
58 | ```
59 |
 60 | If you encounter the following libomp error (`fatal error: 'omp.h' file not found`), run `brew info libomp` in the console:
61 |
62 | ```bash
63 | brew info libomp
64 | ```
65 |
 66 | You should see output similar to the following:
67 |
68 | ```
69 | libomp: stable 15.0.5 (bottled) [keg-only]
70 | LLVM's OpenMP runtime library
71 | https://openmp.llvm.org/
72 | /opt/homebrew/Cellar/libomp/15.0.5 (7 files, 1.6MB)
73 | Poured from bottle on 2022-11-19 at 12:16:49
74 | From: https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/libomp.rb
75 | License: MIT
76 | ==> Dependencies
77 | Build: cmake ✘, lit ✘
78 | ==> Caveats
79 | libomp is keg-only, which means it was not symlinked into /opt/homebrew,
80 | because it can override GCC headers and result in broken builds.
81 |
82 | For compilers to find libomp you may need to set:
83 | export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
84 | export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"
85 |
86 | ==> Analytics
87 | install: 192,197 (30 days), 373,389 (90 days), 1,285,192 (365 days)
88 | install-on-request: 24,388 (30 days), 48,013 (90 days), 164,666 (365 days)
89 | build-error: 0 (30 days)
90 | ```
91 |
 92 | Export `LDFLAGS` and `CPPFLAGS` as suggested in the brew output and rerun `pip3 install bitermplus`:
93 |
94 | ```bash
95 | export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
96 | export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"
97 | ```
98 |
99 | ## Example
100 |
101 | ### Model fitting
102 |
103 | ```python
104 | import bitermplus as btm
105 | import numpy as np
106 | import pandas as pd
107 |
108 | # IMPORTING DATA
109 | df = pd.read_csv(
110 | 'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
111 | texts = df['texts'].str.strip().tolist()
112 |
113 | # PREPROCESSING
114 | # Obtaining terms frequency in a sparse matrix and corpus vocabulary
115 | X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
116 | tf = np.array(X.sum(axis=0)).ravel()
117 | # Vectorizing documents
118 | docs_vec = btm.get_vectorized_docs(texts, vocabulary)
119 | docs_lens = list(map(len, docs_vec))
120 | # Generating biterms
121 | biterms = btm.get_biterms(docs_vec)
122 |
123 | # INITIALIZING AND RUNNING MODEL
124 | model = btm.BTM(
125 | X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
126 | model.fit(biterms, iterations=20)
127 | p_zd = model.transform(docs_vec)
128 |
129 | # METRICS
130 | perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
131 | coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
132 | # or
133 | perplexity = model.perplexity_
134 | coherence = model.coherence_
135 |
136 | # LABELS
137 | model.labels_
138 | # or
139 | btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
140 | ```
141 |
142 | ### Results visualization
143 |
144 | You need to install [tmplot](https://github.com/maximtrp/tmplot) first.
145 |
146 | ```python
147 | import tmplot as tmp
148 | tmp.report(model=model, docs=texts)
149 | ```
150 |
151 | ![Topics terms plots](images/topics_terms_plots.png)
152 |
153 | ## Tutorial
154 |
155 | There is a [tutorial](https://bitermplus.readthedocs.io/en/latest/tutorial.html)
156 | in the documentation that covers the important steps of topic modeling (including
157 | stability measures and results visualization).
158 |
--------------------------------------------------------------------------------
/dataset/SearchSnippets.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/bitermplus/20fd0d1601e007aa1567e6ed97a9c906fd869a7f/dataset/SearchSnippets.txt.gz
--------------------------------------------------------------------------------
/docs/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = source
9 | BUILDDIR = build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | # sphinx-autogen -o source/generated source/*.rst
21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
22 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
1 | @ECHO OFF
2 |
3 | pushd %~dp0
4 |
5 | REM Command file for Sphinx documentation
6 |
7 | if "%SPHINXBUILD%" == "" (
8 | set SPHINXBUILD=sphinx-build
9 | )
10 | set SOURCEDIR=source
11 | set BUILDDIR=build
12 |
13 | if "%1" == "" goto help
14 |
15 | %SPHINXBUILD% >NUL 2>NUL
16 | if errorlevel 9009 (
17 | echo.
18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
19 | echo.installed, then set the SPHINXBUILD environment variable to point
20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you
21 | echo.may add the Sphinx directory to PATH.
22 | echo.
23 | echo.If you don't have Sphinx installed, grab it from
24 | echo.http://sphinx-doc.org/
25 | exit /b 1
26 | )
27 |
28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
29 | goto end
30 |
31 | :help
32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
33 |
34 | :end
35 | popd
36 |
--------------------------------------------------------------------------------
/docs/requirements.txt:
--------------------------------------------------------------------------------
1 | sphinx_rtd_theme
2 | git+https://github.com/maximtrp/bitermplus
3 |
--------------------------------------------------------------------------------
/docs/source/_static/coherence.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/_static/perplexity.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/docs/source/benchmarks.rst:
--------------------------------------------------------------------------------
1 | Benchmarks
2 | ----------
3 |
 4 | This section presents the results of a series of benchmarks run on the *SearchSnippets*
 5 | dataset. Sixteen models were trained with different numbers of iterations
 6 | (from 10 to 2000) and default model parameters. The number of topics was set to 8.
7 | Semantic topic coherence (``u_mass``) and perplexity were
8 | calculated for each model.
9 |
10 | .. image:: _static/perplexity.svg
11 | :alt: Perplexity
12 |
13 | .. image:: _static/coherence.svg
14 | :alt: Semantic topic coherence
15 |
16 |
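17 | A minimal sketch of such a benchmark run (assuming ``X``, ``vocabulary``,
18 | ``docs_vec``, and ``biterms`` have been prepared as shown in the tutorial):
19 |
20 | .. code-block:: python
21 |
22 |     import bitermplus as btm
23 |
24 |     results = {}
25 |     for iters in (10, 50, 100, 500, 1000, 2000):
26 |         # Default alpha and beta; T=8 topics, as in the benchmarks above
27 |         model = btm.BTM(X, vocabulary, seed=12321, T=8, M=20)
28 |         model.fit(biterms, iterations=iters)
29 |         model.transform(docs_vec)  # required before reading perplexity_
30 |         results[iters] = (model.perplexity_, model.coherence_)
31 |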
--------------------------------------------------------------------------------
/docs/source/bitermplus.metrics.rst:
--------------------------------------------------------------------------------
1 | Metrics
2 | =======
3 |
4 | .. currentmodule:: bitermplus
5 |
6 | .. autofunction:: coherence
7 | .. autofunction:: perplexity
8 | .. autofunction:: entropy
--------------------------------------------------------------------------------
/docs/source/bitermplus.rst:
--------------------------------------------------------------------------------
1 | Model
2 | =====
3 |
4 | .. currentmodule:: bitermplus
5 |
6 | .. autoclass:: BTM
7 | :members:
8 |
--------------------------------------------------------------------------------
/docs/source/bitermplus.util.rst:
--------------------------------------------------------------------------------
1 | Utility functions
2 | =================
3 |
4 | .. currentmodule:: bitermplus
5 |
6 | .. autofunction:: get_words_freqs
7 | .. autofunction:: get_vectorized_docs
8 | .. autofunction:: get_biterms
9 | .. autofunction:: get_top_topic_words
10 | .. autofunction:: get_top_topic_docs
11 | .. autofunction:: get_docs_top_topic
--------------------------------------------------------------------------------
/docs/source/conf.py:
--------------------------------------------------------------------------------
1 | # Configuration file for the Sphinx documentation builder.
2 | #
3 | # This file only contains a selection of the most common options. For a full
4 | # list see the documentation:
5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html
6 |
7 | # -- Path setup --------------------------------------------------------------
8 |
9 | # If extensions (or modules to document with autodoc) are in another directory,
10 | # add these directories to sys.path here. If the directory is relative to the
11 | # documentation root, use os.path.abspath to make it absolute, like shown here.
12 | #
13 | # import os
14 | # import sys
15 | # sys.path.insert(0, os.path.abspath('.'))
16 |
17 |
18 | # -- Project information -----------------------------------------------------
19 |
20 | project = 'bitermplus'
21 | copyright = '2021, Maksim Terpilowski'
22 | author = 'Maksim Terpilowski'
23 |
24 |
25 | # -- General configuration ---------------------------------------------------
26 |
27 | # Add any Sphinx extension module names here, as strings. They can be
28 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
29 | # ones.
30 | extensions = [
31 | 'sphinx.ext.autosummary',
32 | 'sphinx.ext.napoleon',
33 | ]
34 |
35 | # Add any paths that contain templates here, relative to this directory.
36 | templates_path = ['_templates']
37 |
38 | # List of patterns, relative to source directory, that match files and
39 | # directories to ignore when looking for source files.
40 | # This pattern also affects html_static_path and html_extra_path.
41 | exclude_patterns = []
42 |
43 |
44 | # -- Options for HTML output -------------------------------------------------
45 |
46 | # The theme to use for HTML and HTML Help pages. See the documentation for
47 | # a list of builtin themes.
48 | #
49 | html_theme = 'sphinx_rtd_theme'
50 |
51 | # Add any paths that contain custom static files (such as style sheets) here,
52 | # relative to this directory. They are copied after the builtin static files,
53 | # so a file named "default.css" will overwrite the builtin "default.css".
54 | html_static_path = ['_static']
55 |
--------------------------------------------------------------------------------
/docs/source/index.rst:
--------------------------------------------------------------------------------
1 | bitermplus
2 | ==========
3 |
 4 | *Bitermplus* implements the `biterm topic model
 5 | <https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.402.4032&rep=rep1&type=pdf>`_
 6 | for short texts introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi
 7 | Cheng. It is a cythonized version of `BTM
 8 | <https://github.com/xiaohuiyan/BTM>`_. This package is also capable of computing
 9 | *perplexity* and *semantic coherence* metrics.
10 |
11 | .. toctree::
12 | :maxdepth: 2
13 | :caption: Usage
14 | :hidden:
15 |
16 |    Installation <install>
17 |    Tutorial <tutorial>
18 |    Benchmarks <benchmarks>
19 |
20 | .. toctree::
21 | :maxdepth: 2
22 | :caption: API
23 | :hidden:
24 |
25 |    Model <bitermplus>
26 |    Metrics <bitermplus.metrics>
27 |    Utility functions <bitermplus.util>
28 |
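29 | Quick example
30 | -------------
31 |
32 | A minimal usage sketch (assuming ``texts`` is a list of preprocessed
33 | documents; see the tutorial for the full pipeline):
34 |
35 | .. code-block:: python
36 |
37 |     import bitermplus as btm
38 |
39 |     # Vectorize documents, get vocabulary and biterms
40 |     X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
41 |     docs_vec = btm.get_vectorized_docs(texts, vocabulary)
42 |     biterms = btm.get_biterms(docs_vec)
43 |
44 |     # Fit the model and infer documents vs topics probabilities
45 |     model = btm.BTM(
46 |         X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
47 |     model.fit(biterms, iterations=20)
48 |     p_zd = model.transform(docs_vec)
49 |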
--------------------------------------------------------------------------------
/docs/source/install.rst:
--------------------------------------------------------------------------------
1 | Setup
2 | -----
3 |
4 | Linux and Windows
5 | ~~~~~~~~~~~~~~~~~
6 |
7 | There should be no issues with installing *bitermplus* under these OSes.
 8 | You can install the package directly from PyPI:
9 |
10 | .. code-block:: bash
11 |
12 | pip install bitermplus
13 |
14 | Or from this repo:
15 |
16 | .. code-block:: bash
17 |
18 | pip install git+https://github.com/maximtrp/bitermplus.git
19 |
20 | Mac OS
21 | ~~~~~~
22 |
23 | First, you need to install the Xcode Command Line Tools and `Homebrew <https://brew.sh>`_.
24 | Then, install ``libomp`` using ``brew``:
25 |
26 | .. code-block:: bash
27 |
28 | xcode-select --install
29 | brew install libomp
30 | pip3 install bitermplus
31 |
32 | Requirements
33 | ~~~~~~~~~~~~
34 |
35 | * cython
36 | * numpy
37 | * pandas
38 | * scipy
39 | * scikit-learn
40 | * tqdm
41 |
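42 | Verifying installation
43 | ~~~~~~~~~~~~~~~~~~~~~~
44 |
45 | To check that *bitermplus* was installed correctly, import it and print the
46 | package version (defined in ``bitermplus/__init__.py``):
47 |
48 | .. code-block:: python
49 |
50 |     import bitermplus as btm
51 |
52 |     print(btm.__version__)  # e.g. 0.7.0
53 |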
--------------------------------------------------------------------------------
/docs/source/tutorial.rst:
--------------------------------------------------------------------------------
1 | Tutorial
2 | ========
3 |
4 | Model fitting
5 | -------------
6 |
7 | Here is a simple example of model fitting.
 8 | It is assumed that you have already gone through the preprocessing
9 | stage: cleaned, lemmatized or stemmed your documents, and removed stop words.
10 |
11 | .. code-block:: python
12 |
13 | import bitermplus as btm
14 | import numpy as np
15 | import pandas as pd
16 |
17 | # Importing data
18 | df = pd.read_csv(
19 | 'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
20 | texts = df['texts'].str.strip().tolist()
21 |
22 | # Vectorizing documents, obtaining full vocabulary and biterms
23 | # Internally, btm.get_words_freqs uses CountVectorizer from sklearn
24 | # You can pass any of its arguments to btm.get_words_freqs
25 | # For example, you can remove stop words:
26 | stop_words = ["word1", "word2", "word3"]
27 | X, vocabulary, vocab_dict = btm.get_words_freqs(texts, stop_words=stop_words)
28 | docs_vec = btm.get_vectorized_docs(texts, vocabulary)
29 | biterms = btm.get_biterms(docs_vec)
30 |
31 | # Initializing and running model
32 | model = btm.BTM(
33 | X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
34 | model.fit(biterms, iterations=20)
35 |
36 |
37 | Inference
38 | ---------
39 |
40 | Now, we will calculate the documents vs topics probability matrix (i.e., make an inference).
41 |
42 | .. code-block:: python
43 |
44 | p_zd = model.transform(docs_vec)
45 |
46 | If you need to run inference on a new dataset, you should
47 | vectorize it using the vocabulary from the training set:
48 |
49 | .. code-block:: python
50 |
51 | new_docs_vec = btm.get_vectorized_docs(new_texts, vocabulary)
52 | p_zd = model.transform(new_docs_vec)
53 |
54 |
55 | Calculating metrics
56 | -------------------
57 |
58 | To calculate perplexity, we must provide the documents vs topics probability
59 | matrix (``p_zd``) that we calculated in the previous step.
60 |
61 | .. code-block:: python
62 |
63 | perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
64 | coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
65 | # or
66 | perplexity = model.perplexity_
67 | coherence = model.coherence_
68 |
69 |
70 | Visualizing results
71 | -------------------
72 |
73 | For results visualization, we will use the `tmplot
74 | <https://github.com/maximtrp/tmplot>`_ package.
75 |
76 | .. code-block:: python
77 |
78 | import tmplot as tmp
79 |
80 | # Run the interactive report interface
81 | tmp.report(model=model, docs=texts)
82 |
83 | Filtering stable topics
84 | -----------------------
85 |
86 | Unsupervised topic models (such as LDA) are subject to topic instability [1]_
87 | [2]_ [3]_. The ``tmplot`` package provides a special method for selecting
88 | stable topics. It uses various distance metrics, such as Kullback-Leibler
89 | divergence (symmetric and non-symmetric), Hellinger distance, Jeffrey's
90 | divergence, Jensen-Shannon divergence, Jaccard index, Bhattacharyya distance,
91 | and total variation distance.
92 |
93 | .. code-block:: python
94 |
 95 |     import pickle as pkl
 96 |     import tmplot as tmp
 97 |     import numpy as np
 98 |     import glob
 99 |
100 |     # Loading saved models
101 |     models_files = sorted(glob.glob(r'results/model[0-9].pkl'))
102 |     models = []
103 |     for fn in models_files:
104 |         with open(fn, 'rb') as file:
105 |             models.append(pkl.load(file))
106 |
107 | # Choosing reference model
108 | np.random.seed(122334)
109 | reference_model = np.random.randint(1, 6)
110 |
111 | # Getting close topics
112 | close_topics, close_kl = tmp.get_closest_topics(
113 | models, method="sklb", ref=reference_model)
114 |
115 | # Getting stable topics
116 | stable_topics, stable_kl = tmp.get_stable_topics(
117 | close_topics, close_kl, ref=reference_model, thres=0.7)
118 |
119 | # Stable topics indices list
120 | print(stable_topics[:, reference_model])
121 |
122 |
123 | Model loading and saving
124 | ------------------------
125 |
126 | Support for model serialization with `pickle
127 | <https://docs.python.org/3/library/pickle.html>`_ was implemented in v0.5.3.
128 | Here is how you can save and load a model:
129 |
130 | .. code-block:: python
131 |
132 | import pickle as pkl
133 | # Saving
134 | with open("model.pkl", "wb") as file:
135 | pkl.dump(model, file)
136 |
137 | # Loading
138 | with open("model.pkl", "rb") as file:
139 | model = pkl.load(file)
140 |
141 |
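142 | Document labels
143 | ---------------
144 |
145 | After running the ``transform`` method, the most probable topic for each
146 | document is available as the ``labels_`` model attribute or through the
147 | ``btm.get_docs_top_topic`` utility function:
148 |
149 | .. code-block:: python
150 |
151 |     labels = model.labels_
152 |     # or
153 |     labels = btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
154 |
155 |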
156 | References
157 | ----------
158 |
159 | .. [1] Koltcov, S., Koltsova, O., & Nikolenko, S. (2014, June).
160 |    Latent Dirichlet allocation: stability and applications to studies of
161 |    user-generated content. In Proceedings of the 2014 ACM Conference on Web
162 |    Science (pp. 161-165).
163 |
164 | .. [2] Mantyla, M. V., Claes, M., & Farooq, U. (2018, October).
165 |    Measuring LDA topic stability from clusters of replicated runs. In
166 |    Proceedings of the 12th ACM/IEEE International Symposium on Empirical
167 |    Software Engineering and Measurement (pp. 1-4).
168 |
169 | .. [3] Greene, D., O’Callaghan, D., & Cunningham, P. (2014, September).
170 |    How many topics? Stability analysis for topic models. In Joint European
171 |    Conference on Machine Learning and Knowledge Discovery in Databases
172 |    (pp. 498-513). Springer, Berlin, Heidelberg.
173 |
--------------------------------------------------------------------------------
/images/topics_terms_plots.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/maximtrp/bitermplus/20fd0d1601e007aa1567e6ed97a9c906fd869a7f/images/topics_terms_plots.png
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0", "wheel", "cython"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "bitermplus"
7 | dynamic = ["version"]
8 | description = "Biterm Topic Model"
9 | readme = "README.md"
10 | requires-python = ">=3.7"
11 | license.file = "LICENSE"
12 | authors = [
13 | { name = "Maksim Terpilovskii", email = "maximtrp@gmail.com" },
14 | ]
15 | keywords = [
16 | "topic model",
17 | "machine learning",
18 | "nlp"
19 | ]
20 | classifiers = [
21 | "License :: OSI Approved :: MIT License",
22 | "Operating System :: OS Independent",
23 | "Programming Language :: Python :: 3.8",
24 | "Programming Language :: Python :: 3.9",
25 | "Programming Language :: Python :: 3.10",
26 | "Programming Language :: Python :: 3.11",
27 | "Topic :: Scientific/Engineering :: Information Analysis",
28 | "Topic :: Text Processing :: General",
29 | ]
30 | urls.homepage = "https://github.com/maximtrp/bitermplus"
31 | urls.documentation = "https://bitermplus.readthedocs.io/"
32 |
33 | dependencies = [
34 | "numpy",
35 | "cython",
36 | "pandas",
37 | "scipy",
38 | "scikit-learn>=1.0.0",
39 | "tqdm",
40 | ]
41 |
42 | [tool.setuptools]
43 | include-package-data = false
44 |
45 | [tool.setuptools.dynamic]
46 | version = {attr = "bitermplus.__version__"}
47 |
48 | [tool.setuptools.packages.find]
49 | where = ["src"]
50 | include = ["bitermplus"]
51 | exclude = ["tests"]
52 |
53 | [project.optional-dependencies]
54 | test = ["pytest"]
55 |
56 | [tool.pytest.ini_options]
57 | pythonpath = [
58 | ".", "src",
59 | ]
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | from platform import system
2 | from setuptools import setup, Extension
3 | from Cython.Build import cythonize
4 | # from numpy import get_include
5 |
6 | extra_link_args = ['-lomp'] if system() == 'Darwin' else ['-fopenmp']
7 | extra_compile_args = ['-Xpreprocessor', '-fopenmp']\
8 | if system() == 'Darwin'\
9 | else ['-fopenmp']
10 |
11 | ext_modules = [
12 | Extension(
13 | "bitermplus._btm",
14 | sources=["src/bitermplus/_btm.pyx"],
15 | extra_compile_args=extra_compile_args,
16 | extra_link_args=extra_link_args),
17 | Extension(
18 | "bitermplus._metrics",
19 | # include_dirs=[get_include()],
20 | # library_dirs=[get_include()],
21 | sources=["src/bitermplus/_metrics.pyx"],
22 | extra_compile_args=extra_compile_args,
23 | extra_link_args=extra_link_args),
24 | ]
25 |
26 | setup(
27 | ext_modules=cythonize(
28 | ext_modules,
29 | compiler_directives={
30 | 'embedsignature': True,
31 | 'language_level': 3})
32 | )
33 |
--------------------------------------------------------------------------------
/src/bitermplus/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = '0.7.0'
2 |
3 | from ._btm import BTM
4 | from ._util import *
5 | from ._metrics import *
6 |
--------------------------------------------------------------------------------
/src/bitermplus/_btm.pyx:
--------------------------------------------------------------------------------
1 | __all__ = ['BTM']
2 |
3 | # from cython.parallel import prange
4 | from libc.time cimport time
5 | from cython.view cimport array
6 | from itertools import chain
7 | from cython import cdivision, wraparound, boundscheck, initializedcheck,\
8 | auto_pickle, nonecheck
9 | import numpy as np
10 | import tqdm
11 | from pandas import DataFrame
12 | from ._metrics import coherence, perplexity
13 |
14 |
15 | @cdivision(True)
16 | @wraparound(False)
17 | @boundscheck(False)
18 | cdef int sample_mult(double[:] p, double random_factor):
19 | cdef int K = p.shape[0]
20 | cdef int i, k
21 |
22 | for i in range(1, K):
23 | p[i] += p[i - 1]
24 |
25 | for k in range(0, K):
26 | if p[k] >= random_factor * p[K - 1]:
27 | break
28 |
29 | return k
30 |
31 |
32 | @auto_pickle(False)
33 | cdef class BTM:
34 | """Biterm Topic Model.
35 |
36 | Parameters
37 | ----------
38 | n_dw : csr.csr_matrix
39 | Documents vs words frequency matrix. Typically, it should be the output
40 | of `CountVectorizer` from sklearn package.
41 | vocabulary : list
42 | Vocabulary (a list of words).
43 | T : int
44 | Number of topics.
45 | M : int = 20
46 | Number of top words for coherence calculation.
 47 |     alpha : float = 1
 48 |         Model parameter (Dirichlet prior on the topics distribution).
 49 |     beta : float = 0.01
 50 |         Model parameter (Dirichlet prior on the topics vs words distribution).
51 | seed : int = 0
52 | Random state seed. If seed is equal to 0 (default),
53 | use ``time(NULL)``.
54 | win : int = 15
55 | Biterms generation window.
56 | has_background : bool = False
57 | Use a background topic to accumulate highly frequent words.
58 | """
59 | cdef:
60 | n_dw
61 | vocabulary
62 | int T
63 | int W
64 | int M
65 | double alpha
66 | double beta
67 | int win
68 | bint has_background
69 | double[:] n_bz # T x 1
70 | double[:] p_z # T x 1
71 | double[:, :] p_wz # T x W
72 | double[:, :] n_wz # T x W
73 | double[:, :] p_zd # D x T
74 | double[:] p_wb
75 | int[:, :] B
76 | int iters
77 | unsigned int seed
78 |
79 | # cdef dict __dict__
80 |
81 | def __init__(
82 | self, n_dw, vocabulary, int T, int M=20,
83 | double alpha=1., double beta=0.01, unsigned int seed=0,
84 | int win=15, bint has_background=False):
85 | self.n_dw = n_dw
86 | self.vocabulary = vocabulary
87 | self.T = T
88 | self.W = len(vocabulary)
89 | self.M = M
90 | self.alpha = alpha
91 | self.beta = beta
92 | self.win = win
93 | self.seed = seed
94 | self.p_wb = np.asarray(n_dw.sum(axis=0) / n_dw.sum())[0]
95 | self.p_z = array(
96 | shape=(self.T, ), itemsize=sizeof(double), format="d",
97 | allocate_buffer=True)
98 | self.n_bz = array(
99 | shape=(self.T, ), itemsize=sizeof(double), format="d",
100 | allocate_buffer=True)
101 | self.n_wz = array(
102 | shape=(self.T, self.W), itemsize=sizeof(double), format="d",
103 | allocate_buffer=True)
104 | self.p_wz = array(
105 | shape=(self.T, self.W), itemsize=sizeof(double), format="d",
106 | allocate_buffer=True)
107 | self.p_zd = array(
108 | shape=(self.n_dw.shape[0], self.T), itemsize=sizeof(double),
109 | format="d", allocate_buffer=True)
110 | self.p_z[...] = 0.
111 | self.p_wz[...] = 0.
112 | self.p_zd[...] = 0.
113 | self.n_wz[...] = 0.
114 | self.n_bz[...] = 0.
115 | self.has_background = has_background
116 | self.iters = 0
117 |
118 | def __getstate__(self):
119 | return {
120 | 'alpha': self.alpha,
121 | 'beta': self.beta,
122 | 'B': np.asarray(self.B),
123 | 'T': self.T,
124 | 'W': self.W,
125 | 'M': self.M,
126 | 'win': self.win,
127 | 'n_dw': self.n_dw,
128 | 'vocabulary': self.vocabulary,
129 | 'has_background': self.has_background,
130 | 'iters': self.iters,
131 |             'seed': self.seed,
132 | 'n_bz': np.asarray(self.n_bz),
133 | 'n_wz': np.asarray(self.n_wz),
134 | 'p_zd': np.asarray(self.p_zd),
135 | 'p_wz': np.asarray(self.p_wz),
136 | 'p_wb': np.asarray(self.p_wb),
137 | 'p_z': np.asarray(self.p_z)
138 | }
139 |
140 | def __setstate__(self, state):
141 |         self.alpha, self.seed = state.get('alpha'), state.get('seed', 0)
142 | self.beta = state.get('beta')
143 | self.B = state.get('B', np.zeros((0, 0))).astype(np.int32)
144 | self.T = state.get('T')
145 | self.W = state.get('W')
146 | self.M = state.get('M')
147 | self.win = state.get('win')
148 | self.n_dw = state.get('n_dw')
149 | self.vocabulary = state.get('vocabulary')
150 | self.has_background = state.get('has_background')
151 | self.iters = state.get('iters', 0)
152 | self.n_bz = state.get('n_bz')
153 | self.n_wz = state.get('n_wz')
154 | self.p_zd = state.get('p_zd')
155 | self.p_wz = state.get('p_wz')
156 | self.p_wb = state.get('p_wb')
157 | self.p_z = state.get('p_z')
158 |
159 | cdef int[:, :] _biterms_to_array(self, list B):
160 | rng = np.random.default_rng(self.seed if self.seed else time(NULL))
161 | arr = np.asarray(list(chain(*B)), dtype=np.int32)
162 | random_topics = rng.integers(
163 | low=0, high=self.T, size=(arr.shape[0], 1), dtype=np.int32)
164 | arr = np.append(arr, random_topics, axis=1)
165 | return arr
166 |
167 | @initializedcheck(False)
168 | @boundscheck(False)
169 | @wraparound(False)
170 | @cdivision(True)
171 | cdef void _compute_p_wz(self):
172 | cdef int k, w
173 | for k in range(self.T):
174 | for w in range(self.W):
175 | self.p_wz[k][w] = (self.n_wz[k][w] + self.beta) / \
176 | (self.n_bz[k] * 2. + self.W * self.beta)
177 |
178 | @boundscheck(False)
179 | @cdivision(True)
180 | @wraparound(False)
181 | @initializedcheck(False)
182 | cdef void _compute_p_zb(self, long i, double[:] p_z):
183 | cdef double pw1k, pw2k, pk, p_z_sum
184 | cdef int w1 = self.B[i, 0]
185 | cdef int w2 = self.B[i, 1]
186 | cdef int k
187 |
188 | for k in range(self.T):
189 | if self.has_background is True and k == 0:
190 | pw1k = self.p_wb[w1]
191 | pw2k = self.p_wb[w2]
192 | else:
193 | pw1k = (self.n_wz[k][w1] + self.beta) / \
194 | (2. * self.n_bz[k] + self.W * self.beta)
195 | pw2k = (self.n_wz[k][w2] + self.beta) / \
196 | (2. * self.n_bz[k] + 1. + self.W * self.beta)
197 | pk = (self.n_bz[k] + self.alpha) / \
198 | (self.B.shape[0] + self.T * self.alpha)
199 | p_z[k] = pk * pw1k * pw2k
200 |
201 | # return p_z # self._normalize(p_z)
202 |
203 | @boundscheck(False)
204 | @cdivision(True)
205 | @wraparound(False)
206 | @initializedcheck(False)
207 | cdef void _normalize(self, double[:] p, double smoother=0.0):
208 | """Normalize values in place."""
209 | cdef:
210 | int i = 0
211 | int num = p.shape[0]
212 |
213 | cdef double p_sum = 0.
214 | for i in range(num):
215 | p_sum += p[i]
216 |
217 | for i in range(num):
218 | p[i] = (p[i] + smoother) / (p_sum + num * smoother)
219 |
220 | @initializedcheck(False)
221 | @boundscheck(False)
222 | @wraparound(False)
223 | cpdef fit(self, list Bs, int iterations=600, bint verbose=True):
224 | """Biterm topic model fitting method.
225 |
226 | Parameters
227 | ----------
228 | Bs : list
229 | Biterms list.
230 | iterations : int = 600
231 | Iterations number.
232 | verbose : bool = True
233 | Show progress bar.
234 | """
235 | self.B = self._biterms_to_array(Bs)
236 | # rng = np.random.default_rng(self.seed if self.seed else time(NULL))
237 | # random_factors = rng.random(
238 | # low=0, high=self.T, size=(arr.shape[0], 1))
239 |
240 | cdef:
241 | long i
242 | int j, w1, w2, topic
243 | long B_len = self.B.shape[0]
244 | double[:] p_z = array(
245 | shape=(self.T, ), itemsize=sizeof(double), format="d",
246 | allocate_buffer=True)
247 | double[:] rnd_uniform = array(
248 | shape=(B_len, ), itemsize=sizeof(double), format="d",
249 | allocate_buffer=True)
250 |
251 | rng = np.random.default_rng(self.seed if self.seed else time(NULL))
252 | trange = tqdm.trange if verbose else range
253 |
254 | for i in range(B_len):
255 | w1 = self.B[i, 0]
256 | w2 = self.B[i, 1]
257 | topic = self.B[i, 2]
258 | self.n_bz[topic] += 1
259 | self.n_wz[topic][w1] += 1
260 | self.n_wz[topic][w2] += 1
261 |
262 | for j in trange(iterations):
263 | rnd_uniform = rng.uniform(0, 1, B_len)
264 | for i in range(B_len):
265 | w1 = self.B[i, 0]
266 | w2 = self.B[i, 1]
267 | topic = self.B[i, 2]
268 |
269 | self.n_bz[topic] -= 1
270 | self.n_wz[topic][w1] -= 1
271 | self.n_wz[topic][w2] -= 1
272 |
273 | # Topic reset
274 | # self.B[i, 2] = -1
275 |
276 | # Topic sample
277 | self._compute_p_zb(i, p_z)
278 | topic = sample_mult(p_z, rnd_uniform[i])
279 | self.B[i, 2] = topic
280 |
281 | self.n_bz[topic] += 1
282 | self.n_wz[topic][w1] += 1
283 | self.n_wz[topic][w2] += 1
284 |
285 | self.iters = iterations
286 | self.p_z[:] = self.n_bz
287 | self._normalize(self.p_z, self.alpha)
288 | self._compute_p_wz()
289 |
290 | @cdivision(True)
291 | cdef long _count_biterms(self, int n, int win=15):
292 | cdef:
293 | int i, j
294 | long btn = 0
295 | for i in range(n-1):
296 | for j in range(i+1, min(i + win, n)): # range(i+1, n):
297 | btn += 1
298 | return btn
299 |
300 | @initializedcheck(False)
301 | @boundscheck(False)
302 | @wraparound(False)
303 | cdef int[:, :] _generate_biterms(
304 | self,
305 | int[:, :] biterms,
306 | int[:] words,
307 | int win=15):
308 | cdef int i, j, words_len = words.shape[0]
309 | cdef long n = 0
310 |
311 | for i in range(words_len-1):
312 | # for j in range(i+1, words_len): # min(i + win, words_len)):
313 | for j in range(i+1, min(i + win, words_len)):
314 | biterms[n, 0] = min(words[i], words[j])
315 | biterms[n, 1] = max(words[i], words[j])
316 | n += 1
317 | return biterms
318 |
319 | @initializedcheck(False)
320 | @boundscheck(False)
321 | @wraparound(False)
322 | cdef double[:] _infer_doc(self, int[:] doc, str infer_type, int doc_len):
323 | cdef double[:] p_zd = array(
324 | shape=(self.T, ), itemsize=sizeof(double), format="d",
325 | allocate_buffer=True)
326 |
327 | if (infer_type == "sum_b"):
328 | p_zd = self._infer_doc_sum_b(doc, doc_len)
329 | elif (infer_type == "sum_w"):
330 | p_zd = self._infer_doc_sum_w(doc, doc_len)
331 | elif (infer_type == "mix"):
332 | p_zd = self._infer_doc_mix(doc, doc_len)
333 | else:
334 | return None
335 |
336 | return p_zd
337 |
338 | @initializedcheck(False)
339 | @boundscheck(False)
340 | @wraparound(False)
341 | cdef double[:] _infer_doc_sum_b(self, int[:] doc, int doc_len):
342 | cdef double[:] p_zd = array(
343 | shape=(self.T, ), itemsize=sizeof(double), format="d",
344 | allocate_buffer=True)
345 |
346 | cdef double[:] p_zb = array(
347 | shape=(self.T, ), itemsize=sizeof(double), format="d",
348 | allocate_buffer=True)
349 |
350 | p_zd[...] = 0.
351 | p_zb[...] = 0.
352 | cdef long b, combs_num
353 | cdef int w1, w2
354 | cdef int[:, :] biterms
355 |
356 | if doc_len == 1:
357 | for t in range(self.T):
358 | p_zd[t] = self.p_z[t] * self.p_wz[t][doc[0]]
359 | else:
360 | combs_num = self._count_biterms(doc_len, self.win)
361 | biterms = array(
362 | shape=(combs_num, 2), itemsize=sizeof(int), format="i",
363 | allocate_buffer=True)
364 | biterms = self._generate_biterms(biterms, doc, self.win)
365 |
366 | for b in range(combs_num):
367 | w1 = biterms[b, 0]
368 | w2 = biterms[b, 1]
369 |
370 | if w2 >= self.W:
371 | continue
372 |
373 | for t in range(self.T):
374 | p_zb[t] = self.p_z[t] * self.p_wz[t][w1] * self.p_wz[t][w2]
375 | self._normalize(p_zb)
376 |
377 | for t in range(self.T):
378 | p_zd[t] += p_zb[t]
379 | self._normalize(p_zd)
380 | return p_zd
381 |
382 | @initializedcheck(False)
383 | @boundscheck(False)
384 | @wraparound(False)
385 | cdef double[:] _infer_doc_sum_w(self, int[:] doc, int doc_len):
386 | cdef int i
387 | cdef int w
388 | cdef double[:] p_zd = array(
389 | shape=(self.T, ), itemsize=sizeof(double), format="d",
390 | allocate_buffer=True)
391 | cdef double[:] p_zw = array(
392 | shape=(self.T, ), itemsize=sizeof(double), format="d",
393 | allocate_buffer=True)
394 | p_zd[...] = 0.
395 | p_zw[...] = 0.
396 |
397 | for i in range(doc_len):
398 | w = doc[i]
399 | if (w >= self.W):
400 | continue
401 |
402 | for t in range(self.T):
403 | p_zw[t] = self.p_z[t] * self.p_wz[t][w]
404 |
405 | self._normalize(p_zw)
406 |
407 | for t in range(self.T):
408 | p_zd[t] += p_zw[t]
409 |
410 | self._normalize(p_zd)
411 | return p_zd
412 |
413 | @initializedcheck(False)
414 | @boundscheck(False)
415 | @wraparound(False)
416 | cdef double[:] _infer_doc_mix(self, int[:] doc, int doc_len):
417 | cdef double[:] p_zd = array(
418 | shape=(self.T, ), itemsize=sizeof(double), format="d")
419 | p_zd[...] = 0.
420 | cdef int i, w, t
421 |
422 | for t in range(self.T):
423 | p_zd[t] = self.p_z[t]
424 |
425 | for i in range(doc_len):
426 | w = doc[i]
427 | if (w >= self.W):
428 | continue
429 |
430 | for t in range(self.T):
431 | p_zd[t] *= (self.p_wz[t][w] * self.W)
432 |
433 | self._normalize(p_zd)
434 | return p_zd
435 |
436 | @initializedcheck(False)
437 | @boundscheck(False)
438 | @wraparound(False)
439 | @nonecheck(False)
440 | cpdef transform(
441 | self, list docs, str infer_type='sum_b', bint verbose=True):
442 | """Return documents vs topics probability matrix.
443 |
444 | Parameters
445 | ----------
446 | docs : list
447 | Documents list. Each document must be presented as
448 | a list of words ids. Typically, it can be the output of
449 | :meth:`bitermplus.get_vectorized_docs`.
450 | infer_type : str
451 | Inference type. The following options are available:
452 |
453 | 1) ``sum_b`` (default).
454 | 2) ``sum_w``.
455 | 3) ``mix``.
456 | verbose : bool = True
457 | Be verbose (show progress bar).
458 |
459 | Returns
460 | -------
461 | p_zd : np.ndarray
462 | Documents vs topics probability matrix (D vs T).
463 | """
464 | cdef int d
465 | cdef int doc_len
466 | cdef int docs_len = len(docs)
467 | cdef double[:, :] p_zd = array(
468 | shape=(docs_len, self.T), itemsize=sizeof(double), format="d",
469 | allocate_buffer=True)
470 | p_zd[...] = 0.
471 | cdef int[:] doc
472 |
473 | trange = tqdm.trange if verbose else range
474 |
475 | for d in trange(docs_len):
476 | doc = docs[d]
477 | doc_len = doc.shape[0]
478 | if doc_len > 0:
479 | p_zd[d, :] = self._infer_doc(doc, infer_type, doc_len)
480 | else:
481 | p_zd[d, :] = 0.
482 |
483 | self.p_zd = p_zd
484 | np_p_zd = np.asarray(self.p_zd)
485 | np_p_zd[np.isnan(np_p_zd)] = 0.
486 | return np_p_zd
487 |
488 | cpdef fit_transform(
489 | self, docs, list biterms,
490 | str infer_type='sum_b', int iterations=600, bint verbose=True):
491 | """Run model fitting and return documents vs topics matrix.
492 |
493 | Parameters
494 | ----------
495 | docs : list
496 | Documents list. Each document must be presented as
497 | a list of words ids. Typically, it can be the output of
498 | :meth:`bitermplus.get_vectorized_docs`.
499 | biterms : list
500 | List of biterms.
501 | infer_type : str
502 | Inference type. The following options are available:
503 |
504 | 1) ``sum_b`` (default).
505 | 2) ``sum_w``.
506 | 3) ``mix``.
507 | iterations : int = 600
508 | Iterations number.
509 | verbose : bool = True
510 | Be verbose (show progress bars).
511 |
512 | Returns
513 | -------
514 | p_zd : np.ndarray
515 | Documents vs topics matrix (D x T).
516 | """
517 | self.fit(biterms, iterations=iterations, verbose=verbose)
518 | p_zd = self.transform(
519 | docs, infer_type=infer_type, verbose=verbose)
520 | return p_zd
521 |
522 | @property
523 | def matrix_topics_words_(self) -> np.ndarray:
524 | """Topics vs words probabilities matrix."""
525 | return np.asarray(self.p_wz)
526 |
527 | @property
528 | def matrix_words_topics_(self) -> np.ndarray:
529 | """Words vs topics probabilities matrix."""
530 | return np.asarray(self.p_wz).T
531 |
532 | @property
533 | def df_words_topics_(self) -> DataFrame:
534 | """Words vs topics probabilities in a DataFrame."""
535 | return DataFrame(np.asarray(self.p_wz).T, index=self.vocabulary)
536 |
537 | @property
538 | def matrix_docs_topics_(self) -> np.ndarray:
539 | """Documents vs topics probabilities matrix."""
540 | return np.asarray(self.p_zd)
541 |
542 | @property
543 | def matrix_topics_docs_(self) -> np.ndarray:
544 | """Topics vs documents probabilities matrix."""
545 | return np.asarray(self.p_zd).T
546 |
547 | @property
548 | def coherence_(self) -> np.ndarray:
549 | """Semantic topics coherence."""
550 | return coherence(self.p_wz, self.n_dw, M=self.M)
551 |
552 | @property
553 | def perplexity_(self) -> float:
554 | """Perplexity.
555 |
556 |         Run the ``transform`` method before calculating perplexity."""
557 | return perplexity(self.p_wz, self.p_zd, self.n_dw, self.T)
558 |
559 | @property
560 | def vocabulary_(self) -> np.ndarray:
561 | """Vocabulary (list of words)."""
562 | return np.asarray(self.vocabulary)
563 |
564 | @property
565 | def alpha_(self) -> float:
566 | """Model parameter."""
567 | return self.alpha
568 |
569 | @property
570 | def beta_(self) -> float:
571 | """Model parameter."""
572 | return self.beta
573 |
574 | @property
575 | def window_(self) -> int:
576 | """Biterms generation window size."""
577 | return self.win
578 |
579 | @property
580 | def has_background_(self) -> bool:
581 | """Specifies whether the model has a background topic
582 | to accumulate highly frequent words."""
583 | return self.has_background
584 |
585 | @property
586 | def topics_num_(self) -> int:
587 | """Number of topics."""
588 | return self.T
589 |
590 | @property
591 | def vocabulary_size_(self) -> int:
592 | """Vocabulary size (number of words)."""
593 | return len(self.vocabulary)
594 |
595 | @property
596 | def coherence_window_(self) -> int:
597 | """Number of top words for coherence calculation."""
598 | return self.M
599 |
600 | @property
601 | def iterations_(self) -> int:
602 | """Number of iterations the model fitting process has
603 | gone through."""
604 | return self.iters
605 |
606 | @property
607 | def theta_(self) -> np.ndarray:
608 | """Topics probabilities vector."""
609 | return np.array(self.p_z)
610 |
611 | @property
612 | def biterms_(self) -> np.ndarray:
613 | """Model biterms. Terms are coded with the corresponding ids."""
614 | return np.asarray(self.B)
615 |
616 | @property
617 | def labels_(self) -> np.ndarray:
618 | """Model document labels (most probable topic for each document)."""
619 | return np.asarray(self.p_zd).argmax(axis=1)
620 |
--------------------------------------------------------------------------------
/src/bitermplus/_metrics.pyx:
--------------------------------------------------------------------------------
1 | __all__ = ['perplexity', 'coherence', 'entropy']
2 |
3 | from cython.view cimport array
4 | from libc.math cimport exp, log
5 | from typing import Union
6 | from pandas import DataFrame
7 | from scipy.sparse import csr
8 | from cython.parallel import prange
9 | from cython import boundscheck, wraparound, cdivision
10 | import numpy as np
11 |
12 |
13 | @boundscheck(False)
14 | # @wraparound(False)
15 | cpdef double perplexity(
16 | double[:, :] p_wz,
17 | double[:, :] p_zd,
18 | n_dw,
19 | long T):
20 | """Perplexity calculation [1]_.
21 |
22 | Parameters
23 | ----------
24 | p_wz : np.ndarray
25 | Topics vs words probabilities matrix (T x W).
26 |
27 | p_zd : np.ndarray
28 | Documents vs topics probabilities matrix (D x T).
29 |
30 | n_dw : scipy.sparse.csr_matrix
31 | Words frequency matrix for all documents (D x W).
32 |
33 | T : int
34 | Number of topics.
35 |
36 | Returns
37 | -------
38 | perplexity : float
39 | Perplexity estimate.
40 |
41 | References
42 | ----------
43 | .. [1] Heinrich, G. (2005). Parameter estimation for text analysis (pp.
44 | 1-32). Technical report.
45 |
46 | Example
47 | -------
48 | >>> import bitermplus as btm
49 | >>> # Preprocessing step
50 | >>> # ...
51 | >>> # X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
52 | >>> # Model fitting step
53 | >>> # model = ...
54 | >>> # Inference step
55 | >>> # p_zd = model.transform(docs_vec_subset)
 56 |     >>> # Perplexity calculation
57 | >>> perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
58 | """
59 | cdef double pwz_pzd_sum = 0.
60 | cdef double exp_num = 0.
61 | cdef double perplexity = 0.
62 | cdef double n = 0
63 | cdef long d, w, t, w_i, w_ri, w_rj
64 | cdef long D = p_zd.shape[0]
65 | cdef long W = p_wz.shape[1]
66 | cdef long[:] n_dw_indptr = n_dw.indptr.astype(int)
67 | cdef long[:] n_dw_indices = n_dw.indices.astype(int)
68 | cdef double n_dw_sum = n_dw.sum()
69 | cdef double[:] n_dw_data = n_dw.data.astype(float)
70 |
71 | # Iterating over all documents
72 | for d in prange(D, nogil=True):
73 |
74 | w_ri = n_dw_indptr[d]
75 | # if d + 1 == D:
76 | # w_rj = W
77 | # else:
78 | w_rj = n_dw_indptr[d+1]
79 |
80 | for w_i in range(w_ri, w_rj):
81 | w = n_dw_indices[w_i]
82 | n = n_dw_data[w_i]
83 |
84 | pwz_pzd_sum = 0.
85 | for t in range(T):
86 | pwz_pzd_sum = pwz_pzd_sum + p_zd[d, t] * p_wz[t, w]
87 | if pwz_pzd_sum > 0:
88 | exp_num += n * log(pwz_pzd_sum)
89 |
90 | perplexity = exp(-exp_num / n_dw_sum)
91 | return perplexity
92 |
93 |
94 | @boundscheck(False)
95 | @wraparound(False)
96 | @cdivision(True)
97 | cpdef coherence(
98 | double[:, :] p_wz,
99 | n_dw,
100 | double eps=1.,
101 | int M=20):
102 | """Semantic topic coherence calculation [1]_.
103 |
104 | Parameters
105 | ----------
106 | p_wz : np.ndarray
107 | Topics vs words probabilities matrix (T x W).
108 |
109 | n_dw : scipy.sparse.csr_matrix
110 | Words frequency matrix for all documents (D x W).
111 |
112 | eps : float
113 | Calculation parameter. It is summed with a word pair
114 | conditional probability.
115 |
116 | M : int
117 | Number of top words in a topic to take.
118 |
119 | Returns
120 | -------
121 | coherence : np.ndarray
122 | Semantic coherence estimates for all topics.
123 |
124 | References
125 | ----------
126 | .. [1] Mimno, D., Wallach, H., Talley, E., Leenders, M., & McCallum, A.
127 | (2011, July). Optimizing semantic coherence in topic models. In
128 | Proceedings of the 2011 conference on empirical methods in natural
129 | language processing (pp. 262-272).
130 |
131 | Example
132 | -------
133 | >>> import bitermplus as btm
134 | >>> # Preprocessing step
135 | >>> # ...
136 | >>> # X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
137 | >>> # Model fitting step
138 | >>> # model = ...
139 | >>> # Coherence calculation
140 | >>> coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
141 | """
142 | cdef int d, i, j, k, t, tw, w_i, w_ri, w_rj, w
143 | cdef double logSum = 0.
144 | cdef long T = p_wz.shape[0]
145 | cdef long W = p_wz.shape[1]
146 | cdef long D = n_dw.shape[0]
147 | cdef long n
148 | cdef long[:] n_dw_indices = n_dw.indices.astype(int)
149 | cdef long[:] n_dw_indptr = n_dw.indptr.astype(int)
150 | cdef long n_dw_len = n_dw_indices.shape[0]
151 | cdef long[:] n_dw_data = n_dw.data.astype(int)
152 | cdef long[:, :] top_words = np.zeros((M, T), dtype=int)
153 | cdef double[:] coherence = np.zeros(T, dtype=float)
154 | cdef int w1 = 0
155 | cdef int w2 = 0
156 | cdef double D_ij = 0.
157 | cdef double D_j = 0.
158 |
159 | for t in range(T):
160 | words_idx_sorted = np.argsort(p_wz[t, :])[:-M-1:-1]
161 | for i in range(M):
162 | top_words[i, t] = words_idx_sorted[i]
163 |
164 | for t in range(T):
165 | logSum = 0.
166 | for i in prange(1, M, nogil=True):
167 | for j in range(0, i):
168 | D_ij = 0.
169 | D_j = 0.
170 |
171 | for d in range(D):
172 | w1 = 0
173 | w2 = 0
174 | w_ri = n_dw_indptr[d]
175 |                     # n_dw_indptr has D + 1 elements, so indexing
176 |                     # d + 1 is valid for the last document as well:
177 |                     # no special case is needed here
178 |                     w_rj = n_dw_indptr[d + 1]
179 |
180 | for w_i in range(w_ri, w_rj):
181 | w = n_dw_indices[w_i]
182 | n = n_dw_data[w_i]
183 | for tw in range(M):
184 | if (top_words[i, t] == w and n > 0):
185 | w1 = 1
186 | elif (top_words[j, t] == w and n > 0):
187 | w2 = 1
188 | D_ij += float(w1 & w2)
189 | D_j += float(w2)
190 | logSum += log((D_ij + eps) / D_j)
191 | coherence[t] = logSum
192 |
193 | return np.array(coherence)
194 |
195 |
196 | @boundscheck(False)
197 | @wraparound(False)
198 | @cdivision(True)
199 | cpdef entropy(
200 | double[:, :] p_wz,
201 | bint max_probs=True):
202 | """Renyi entropy calculation routine [1]_.
203 |
204 |     Renyi entropy can be used to estimate the optimal number of topics: fit
205 |     several models with different numbers of topics and choose the one for
206 |     which the Renyi entropy is minimal.
207 |
208 | Parameters
209 | ----------
210 | p_wz : np.ndarray
211 | Topics vs words probabilities matrix (T x W).
212 |     max_probs : bool
213 |         Use maximum probabilities of terms per topic instead of all probability values.
214 |
215 |     Returns
216 |     -------
217 |     renyi : double
218 |         Renyi entropy value.
219 |
220 | References
221 | ----------
222 | .. [1] Koltcov, S. (2018). Application of Rényi and Tsallis entropies to
223 | topic modeling optimization. Physica A: Statistical Mechanics and its
224 | Applications, 512, 1192-1204.
225 |
226 | Example
227 | -------
228 | >>> import bitermplus as btm
229 | >>> # Preprocessing step
230 | >>> # ...
231 | >>> # Model fitting step
232 | >>> # model = ...
233 | >>> # Entropy calculation
234 | >>> entropy = btm.entropy(model.matrix_topics_words_)
235 | """
236 | # Words number
237 | cdef int W = p_wz.shape[1]
238 | # Topics number
239 | cdef int T = p_wz.shape[0]
240 |
241 | # Initializing variables
242 | cdef double word_ratio = 0.
243 | cdef double sum_prob = 0.
244 | cdef double shannon = 0.
245 | cdef double energy = 0.
246 | cdef double int_energy = 0.
247 | cdef double free_energy = 0.
248 | cdef double renyi = 0.
249 | cdef int t = 0
250 | cdef int w = 0
251 |
252 | # Setting threshold
253 | cdef double thresh = 1. / W
254 |
255 | for w in range(W):
256 | for t in range(T):
257 | if not max_probs or (max_probs and p_wz[t, w] > thresh):
258 | sum_prob += p_wz[t, w]
259 | word_ratio += 1
260 |
261 | # Shannon entropy
262 | shannon = log(word_ratio / (W * T))
263 |
264 | # Internal energy
265 | int_energy = -log(sum_prob / T)
266 |
267 | # Free energy
268 | free_energy = int_energy - shannon * T
269 |
270 | # Renyi entropy
271 | if T == 1:
272 | renyi = free_energy / T
273 | else:
274 | renyi = free_energy / (T-1)
275 |
276 | return renyi
277 |
--------------------------------------------------------------------------------
/src/bitermplus/_util.py:
--------------------------------------------------------------------------------
1 | __all__ = [
2 | 'get_words_freqs', 'get_vectorized_docs',
3 | 'get_biterms', 'get_top_topic_words',
4 | 'get_top_topic_docs', 'get_docs_top_topic']
5 |
6 | from typing import List, Union, Tuple, Dict, Sequence, Any
7 | from scipy.sparse import csr_matrix
8 | from pandas import DataFrame, Series, concat
9 | from sklearn.feature_extraction.text import CountVectorizer
10 | import numpy as np
11 | from ._btm import BTM
12 |
13 |
14 | def get_words_freqs(
15 | docs: Union[List[str], np.ndarray, Series],
16 | **kwargs: dict) -> Tuple[csr_matrix, np.ndarray, Dict]:
17 | """Compute the documents vs words frequency matrix.
18 |
19 | Parameters
20 | ----------
21 | docs : Union[List[str], np.ndarray, Series]
22 | Documents in any format that can be passed to the
23 | :class:`sklearn.feature_extraction.text.CountVectorizer` constructor.
24 | kwargs : dict
25 | Keyword arguments for the
26 | :class:`sklearn.feature_extraction.text.CountVectorizer` constructor.
27 |
28 | Returns
29 | -------
30 | Tuple[scipy.sparse.csr_matrix, np.ndarray, Dict]
31 | Documents vs words matrix in CSR format,
32 | vocabulary as a numpy.ndarray of terms,
33 | and vocabulary as a dictionary of {term: id} pairs.
34 |
35 | Example
36 | -------
37 | >>> import pandas as pd
38 | >>> import bitermplus as btm
39 |
40 | >>> # Loading data
41 | >>> df = pd.read_csv(
42 | ... 'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
43 | >>> texts = df['texts'].str.strip().tolist()
44 |
45 | >>> # Vectorizing documents, obtaining full vocabulary and biterms
46 | >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
47 | """
48 | vec = CountVectorizer(**kwargs)
49 | X = vec.fit_transform(docs)
50 | words = np.array(vec.get_feature_names_out())
51 | return X, words, vec.vocabulary_
52 |
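Because the keyword arguments are forwarded verbatim to CountVectorizer, vocabulary pruning and tokenization can be tuned at this step; a short sketch (the parameter values are illustrative):

X, vocabulary, vocab_dict = btm.get_words_freqs(
    texts, stop_words='english', min_df=5, ngram_range=(1, 1))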
53 |
54 | def get_vectorized_docs(
55 | docs: Union[List[str], np.ndarray],
56 | vocab: Union[List[str], np.ndarray]) -> List[np.ndarray]:
57 | """Replace words with their ids in each document.
58 |
59 | Parameters
60 | ----------
61 | docs : Union[List[str], np.ndarray]
62 | Documents (iterable of strings).
63 | vocab : Union[List[str], np.ndarray]
64 | Vocabulary (iterable of terms).
65 |
66 | Returns
67 | -------
68 | docs : List[np.ndarray]
69 | Vectorised documents (list of ``numpy.ndarray``
70 | objects with terms ids).
71 |
72 | Example
73 | -------
74 | >>> import pandas as pd
75 | >>> import bitermplus as btm
76 |
77 | >>> # Loading data
78 | >>> df = pd.read_csv(
79 | ... 'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
80 | >>> texts = df['texts'].str.strip().tolist()
81 |
82 | >>> # Vectorizing documents, obtaining full vocabulary and biterms
83 | >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
84 | >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
85 | """
86 | vocab_idx = dict(zip(vocab, range(len(vocab))))
87 |
88 | def _parse_words(w):
89 | return vocab_idx.get(w)
90 |
91 | # The explicit `is not None` check keeps term id 0, which is falsy
92 | return list(
93 | map(
94 | lambda doc: np.array(
95 | [i for i in map(_parse_words, doc.split()) if i is not None],
96 | dtype=np.int32),
97 | docs))
98 |
99 |
100 | def get_biterms(
101 | docs: List[np.ndarray],
102 | win: int = 15) -> List[List[List[int]]]:
103 | """Biterms creation routine.
104 |
105 | Parameters
106 | ----------
107 | docs : List[np.ndarray]
108 | List of numpy.ndarray objects containing word indices.
109 | win : int = 15
110 | Biterms generation window.
111 |
112 | Returns
113 | -------
114 | List[List[List[int]]]
115 | List of biterms for each document; documents with fewer than two words are skipped.
116 |
117 | Example
118 | -------
119 | >>> import pandas as pd
120 | >>> import bitermplus as btm
121 |
122 | >>> # Loading data
123 | >>> df = pd.read_csv(
124 | ... 'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
125 | >>> texts = df['texts'].str.strip().tolist()
126 |
127 | >>> # Vectorizing documents, obtaining full vocabulary and biterms
128 | >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
129 | >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
130 | >>> biterms = btm.get_biterms(docs_vec)
131 | """
132 | biterms = []
133 | for doc in docs:
134 | doc_biterms = []
135 | doc_len = len(doc)
136 | if doc_len < 2:
137 | continue
138 | for i in range(doc_len-1):
139 | for j in range(i+1, min(i + win, doc_len)):
140 | wi = min(doc[i], doc[j])
141 | wj = max(doc[i], doc[j])
142 | doc_biterms.append([wi, wj])
143 | biterms.append(doc_biterms)
144 | return biterms
145 |
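A worked toy example of the pairing logic (values are illustrative and assume `import numpy as np`): every pair of word ids that fall within the window becomes a biterm, stored with the smaller id first.

doc = np.array([3, 1, 2], dtype=np.int32)
biterms = btm.get_biterms([doc])
# biterms[0] pairs, smaller id first: [1, 3], [2, 3], [1, 2]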
146 |
147 | def get_top_topic_words(
148 | model: BTM,
149 | words_num: int = 20,
150 | topics_idx: Sequence[Any] = None) -> DataFrame:
151 | """Select top topic words from a fitted model.
152 |
153 | Parameters
154 | ----------
155 | model : bitermplus._btm.BTM
156 | Fitted BTM model.
157 | words_num : int = 20
158 | The number of words to select.
159 | topics_idx : Sequence[Any] = None
160 | Topics indices. Meant to be used to select only stable
161 | topics.
162 |
163 | Returns
164 | -------
165 | DataFrame
166 | Words with the highest probabilities for each selected topic.
167 |
168 | Example
169 | -------
170 | >>> stable_topics = [0, 3, 10, 12, 18, 21]
171 | >>> top_words = btm.get_top_topic_words(
172 | ... model,
173 | ... words_num=100,
174 | ... topics_idx=stable_topics)
175 | """
176 | def _select_words(model, topic_id: int):
177 | probs = model.matrix_topics_words_[topic_id, :]
178 | idx = np.argsort(probs)[:-words_num-1:-1]
179 | result = Series(model.vocabulary_[idx])
180 | result.name = 'topic{}'.format(topic_id)
181 | return result
182 |
183 | topics_num = model.topics_num_
184 | topics_idx = np.arange(topics_num) if topics_idx is None else topics_idx
185 | return concat(
186 | map(lambda x: _select_words(model, x), topics_idx), axis=1)
187 |
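The slice np.argsort(probs)[:-words_num-1:-1] in _select_words picks the indices of the words_num largest probabilities in descending order; a small worked example (the values are illustrative):

probs = np.array([0.1, 0.5, 0.3, 0.2])
np.argsort(probs)            # array([0, 3, 2, 1]), ascending by probability
np.argsort(probs)[:-3-1:-1]  # array([1, 2, 3]), the top 3 in descending order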
188 |
189 | def get_top_topic_docs(
190 | docs: Sequence[Any],
191 | p_zd: np.ndarray,
192 | docs_num: int = 20,
193 | topics_idx: Sequence[Any] = None) -> DataFrame:
194 | """Select top topic docs from a fitted model.
195 |
196 | Parameters
197 | ----------
198 | docs : Sequence[Any]
199 | Iterable of documents (e.g. list of strings).
200 | p_zd : np.ndarray
201 | Documents vs topics probabilities matrix.
202 | docs_num : int = 20
203 | The number of documents to select.
204 | topics_idx : Sequence[Any] = None
205 | Topics indices. Meant to be used to select only stable
206 | topics.
207 |
208 | Returns
209 | -------
210 | DataFrame
211 | Documents with the highest probabilities for each selected topic.
212 |
213 | Example
214 | -------
215 | >>> top_docs = btm.get_top_topic_docs(
216 | ... texts,
217 | ... p_zd,
218 | ... docs_num=100,
219 | ... topics_idx=[1,2,3,4])
220 | """
221 | def _select_docs(docs, p_zd, topic_id: int):
222 | probs = p_zd[:, topic_id]
223 | idx = np.argsort(probs)[:-docs_num-1:-1]
224 | result = Series(np.asarray(docs)[idx])
225 | result.name = 'topic{}'.format(topic_id)
226 | return result
227 |
228 | topics_num = p_zd.shape[1]
229 | topics_idx = np.arange(topics_num) if topics_idx is None else topics_idx
230 | return concat(
231 | map(lambda x: _select_docs(docs, p_zd, x), topics_idx), axis=1)
232 |
233 |
234 | def get_docs_top_topic(
235 | docs: Sequence[Any],
236 | p_zd: np.ndarray) -> DataFrame:
237 | """Select most probable topic for each document.
238 |
239 | Parameters
240 | ----------
241 | docs : Sequence[Any]
242 | Iterable of documents (e.g. list of strings).
243 | p_zd : np.ndarray
244 | Documents vs topics probabilities matrix.
245 |
246 | Returns
247 | -------
248 | DataFrame
249 | Documents and the most probable topic for each of them.
250 |
251 | Example
252 | -------
253 | >>> import bitermplus as btm
254 | >>> # Read documents from file
255 | >>> # texts = ...
256 | >>> # Build and train a model
257 | >>> # model = ...
258 | >>> # model.fit(...)
259 | >>> btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
260 | """
261 | return DataFrame({'documents': docs, 'label': p_zd.argmax(axis=1)})
262 |
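A hedged end-to-end sketch tying these helpers together, assuming a fitted model and the preprocessing objects from the examples above:

p_zd = model.transform(docs_vec)
doc_labels = btm.get_docs_top_topic(texts, p_zd)
top_words = btm.get_top_topic_words(model, words_num=10)
top_docs = btm.get_top_topic_docs(texts, p_zd, docs_num=5)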
--------------------------------------------------------------------------------
/tests/test_btm.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import pickle as pkl
3 | import logging
4 | import numpy as np
5 | import pandas as pd
6 | from src import bitermplus as btm
7 |
8 | # import time
9 | LOGGER = logging.getLogger(__name__)
10 |
11 |
12 | class TestBTM(unittest.TestCase):
13 |
14 | # Main tests
15 | def test_btm_class(self):
16 | # Importing and vectorizing text data
17 | df = pd.read_csv("dataset/SearchSnippets.txt.gz", header=None, names=["texts"])
18 | texts = df["texts"].str.strip().tolist()
19 |
20 | # Vectorizing documents, obtaining full vocabulary and biterms
21 | X, vocabulary, _ = btm.get_words_freqs(texts)
22 | docs_vec = btm.get_vectorized_docs(texts, vocabulary)
23 | biterms = btm.get_biterms(docs_vec)
24 |
25 | LOGGER.info("Modeling started")
26 | topics_num = 8
27 | model = btm.BTM(
28 | X,
29 | vocabulary,
30 | seed=52214,
31 | T=topics_num,
32 | M=20,
33 | alpha=50 / topics_num,
34 | beta=0.01,
35 | )
36 | # t1 = time.time()
37 | model.fit(biterms, iterations=20)
38 | # t2 = time.time()
39 | # LOGGER.info(t2 - t1)
40 | # LOGGER.info(model.theta_)
41 | self.assertIsInstance(model.matrix_topics_words_, np.ndarray)
42 | self.assertTupleEqual(
43 | model.matrix_topics_words_.shape, (topics_num, vocabulary.size)
44 | )
45 | LOGGER.info("Modeling finished")
46 |
47 | LOGGER.info('Inference "sum_b" started')
48 | docs_vec_subset = docs_vec[:1000]
49 | docs_vec_subset[100] = np.array([], dtype=np.int32)  # empty document to exercise the edge case
50 | p_zd = model.transform(docs_vec_subset)
51 | self.assertTupleEqual(p_zd.shape, (1000, topics_num))
52 | # LOGGER.info(p_zd)
53 | LOGGER.info('Inference "sum_b" finished')
54 |
55 | LOGGER.info("Model saving started")
56 | with open("model.pickle", "wb") as file:
57 | pkl.dump(model, file)
58 | LOGGER.info("Model saving finished")
59 |
60 | LOGGER.info('Inference "sum_w" started')
61 | p_zd = model.transform(docs_vec_subset, infer_type="sum_w")
62 | # LOGGER.info(p_zd)
63 | LOGGER.info('Inference "sum_w" finished')
64 |
65 | LOGGER.info('Inference "mix" started')
66 | p_zd = model.transform(docs_vec_subset, infer_type="mix")
67 | # LOGGER.info(p_zd)
68 | LOGGER.info('Inference "mix" finished')
69 |
70 | LOGGER.info("Perplexity testing started")
71 | perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, topics_num)
72 | self.assertAlmostEqual(perplexity, model.perplexity_)
73 | self.assertIsInstance(perplexity, float)
74 | self.assertNotEqual(perplexity, 0.0)
75 | LOGGER.info(f"Perplexity value: {perplexity}")
76 | LOGGER.info("Perplexity testing finished")
77 |
78 | LOGGER.info("Coherence testing started")
79 | coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
80 | self.assertTrue(np.allclose(coherence, model.coherence_))
81 | self.assertIsInstance(coherence, np.ndarray)
82 | self.assertGreater(coherence.shape[0], 0)
83 | LOGGER.info(f"Coherence value: {coherence}")
84 | LOGGER.info("Coherence testing finished")
85 |
86 | LOGGER.info("Entropy testing started")
87 | entropy = btm.entropy(model.matrix_topics_words_, True)
88 | self.assertNotEqual(entropy, 0)
89 | LOGGER.info(f"Entropy value: {entropy}")
90 | LOGGER.info("Entropy testing finished")
91 |
92 | LOGGER.info("Model loading started")
93 | with open("model.pickle", "rb") as file:
94 | self.assertIsInstance(pkl.load(file), btm._btm.BTM)
95 | LOGGER.info("Model loading finished")
96 |
97 |
98 | if __name__ == "__main__":
99 | unittest.main()
100 |
--------------------------------------------------------------------------------