├── .coveragerc ├── .github └── workflows │ ├── package-publish.yml │ └── package-test.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── dataset └── SearchSnippets.txt.gz ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── coherence.svg │ └── perplexity.svg │ ├── benchmarks.rst │ ├── bitermplus.metrics.rst │ ├── bitermplus.rst │ ├── bitermplus.util.rst │ ├── conf.py │ ├── index.rst │ ├── install.rst │ └── tutorial.rst ├── images └── topics_terms_plots.png ├── pyproject.toml ├── setup.py ├── src └── bitermplus │ ├── __init__.py │ ├── _btm.pyx │ ├── _metrics.pyx │ └── _util.py └── tests └── test_btm.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | plugins = Cython.Coverage 3 | source = src/bitermplus -------------------------------------------------------------------------------- /.github/workflows/package-publish.yml: -------------------------------------------------------------------------------- 1 | name: Package Upload to PyPi 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.x' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools wheel cython build twine 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python -m build -s 28 | twine upload dist/* 29 | -------------------------------------------------------------------------------- /.github/workflows/package-test.yml: -------------------------------------------------------------------------------- 1 | name: Package Test 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install . 21 | pip install .[test] 22 | - name: Testing package with pytest 23 | run: | 24 | cythonize -i src/bitermplus/*.pyx 25 | pytest -s 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.c 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | docs/build/ 14 | docs/source/_build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | src/**/*.html 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # vscode 136 | .vscode 137 | 138 | # pickles 139 | *.pickle -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | formats: 12 | - pdf 13 | 14 | python: 15 | install: 16 | - requirements: docs/requirements.txt 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Maksim Terpilowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
recursive-include src/bitermplus *.pyx
include LICENSE
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# Biterm Topic Model

![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/maximtrp/bitermplus/package-test.yml)
[![Documentation Status](https://readthedocs.org/projects/bitermplus/badge/?version=latest)](https://bitermplus.readthedocs.io/en/latest/?badge=latest)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/192b6a75449040ff868932a15ca28ce9)](https://www.codacy.com/gh/maximtrp/bitermplus/dashboard?utm_source=github.com&utm_medium=referral&utm_content=maximtrp/bitermplus&utm_campaign=Badge_Grade)
[![Issues](https://img.shields.io/github/issues/maximtrp/bitermplus.svg)](https://github.com/maximtrp/bitermplus/issues)
[![Downloads](https://static.pepy.tech/badge/bitermplus)](https://pepy.tech/project/bitermplus)
![PyPI](https://img.shields.io/pypi/v/bitermplus)

*Bitermplus* implements the [Biterm topic model](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.402.4032&rep=rep1&type=pdf) for short texts introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi Cheng. It is essentially a cythonized version of [BTM](https://github.com/xiaohuiyan/BTM). This package is also capable of computing *perplexity*, *semantic coherence*, and *entropy* metrics.

## Donate

If you find this package useful, please consider donating any amount of money. It will help me spend more time supporting open-source software.

Buy Me A Coffee

## Requirements

* cython
* numpy
* pandas
* scipy
* scikit-learn
* tqdm

## Setup

### Linux and Windows

Be sure to install Python headers if they are not included in your Python installation. For example, on Ubuntu it can be done using the following command (where `x` is the Python minor version number):

```bash
sudo apt-get install python3.x-dev
```

Apart from that, there should be no issues with installing *bitermplus* on these operating systems. You can install the package directly from PyPI:

```bash
pip install bitermplus
```

Or from this repo:

```bash
pip install git+https://github.com/maximtrp/bitermplus.git
```

### Mac OS

First, you need to install XCode CLT and [Homebrew](https://brew.sh).
Then, install `libomp` using `brew`:

```bash
xcode-select --install
brew install libomp
pip3 install bitermplus
```

If you run into a libomp issue (`fatal error: 'omp.h' file not found`), check the package info:

```bash
brew info libomp
```

You should see output similar to this:

```
libomp: stable 15.0.5 (bottled) [keg-only]
LLVM's OpenMP runtime library
https://openmp.llvm.org/
/opt/homebrew/Cellar/libomp/15.0.5 (7 files, 1.6MB)
  Poured from bottle on 2022-11-19 at 12:16:49
From: https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/libomp.rb
License: MIT
==> Dependencies
Build: cmake ✘, lit ✘
==> Caveats
libomp is keg-only, which means it was not symlinked into /opt/homebrew,
because it can override GCC headers and result in broken builds.

For compilers to find libomp you may need to set:
  export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
  export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"

==> Analytics
install: 192,197 (30 days), 373,389 (90 days), 1,285,192 (365 days)
install-on-request: 24,388 (30 days), 48,013 (90 days), 164,666 (365 days)
build-error: 0 (30 days)
```

Export `LDFLAGS` and `CPPFLAGS` as suggested in the brew output:

```bash
export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"
```

## Example

### Model fitting

```python
import bitermplus as btm
import numpy as np
import pandas as pd

# IMPORTING DATA
df = pd.read_csv(
    'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
texts = df['texts'].str.strip().tolist()

# PREPROCESSING
# Obtaining terms frequency in a sparse matrix and corpus vocabulary
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
tf = np.array(X.sum(axis=0)).ravel()
# Vectorizing documents
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
docs_lens = list(map(len, docs_vec))
# Generating biterms
biterms = btm.get_biterms(docs_vec)

# INITIALIZING AND RUNNING MODEL
model = btm.BTM(
    X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)
p_zd = model.transform(docs_vec)

# METRICS
perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
# or
perplexity = model.perplexity_
coherence = model.coherence_

# LABELS
model.labels_
# or
btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
```

### Results visualization

You need to install [tmplot](https://github.com/maximtrp/tmplot) first.

```python
import tmplot as tmp
tmp.report(model=model, docs=texts)
```

![Report interface](images/topics_terms_plots.png)

## Tutorial

There is a [tutorial](https://bitermplus.readthedocs.io/en/latest/tutorial.html)
in the documentation that covers the important steps of topic modeling (including
stability measures and results visualization).
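
As a quick supplement, you can also inspect the fitted topics with the package's own helper functions. A minimal sketch reusing the `model`, `texts`, and `p_zd` objects from the example above:

```python
import bitermplus as btm

# Top 10 most probable words per topic (returned as a pandas DataFrame)
top_words = btm.get_top_topic_words(model, words_num=10)

# Top 5 documents per topic
top_docs = btm.get_top_topic_docs(texts, p_zd, docs_num=5)

# Renyi entropy of the topics vs words matrix; lower values are preferable
# when comparing models fitted with different numbers of topics
renyi_entropy = btm.entropy(model.matrix_topics_words_)
```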
158 | -------------------------------------------------------------------------------- /dataset/SearchSnippets.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/bitermplus/20fd0d1601e007aa1567e6ed97a9c906fd869a7f/dataset/SearchSnippets.txt.gz -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | # sphinx-autogen -o source/generated source/*.rst 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
-------------------------------------------------------------------------------- /docs/requirements.txt: --------------------------------------------------------------------------------
sphinx_rtd_theme
git+https://github.com/maximtrp/bitermplus
-------------------------------------------------------------------------------- /docs/source/_static/coherence.svg: --------------------------------------------------------------------------------
[SVG line chart: semantic coherence (y axis, −800 to −600) vs. iterations (x axis, 0 to 2,000)]
-------------------------------------------------------------------------------- /docs/source/_static/perplexity.svg: --------------------------------------------------------------------------------
[SVG line chart: perplexity (y axis, 700 to 1,000) vs. iterations (x axis, 0 to 2,000)]
-------------------------------------------------------------------------------- /docs/source/benchmarks.rst: --------------------------------------------------------------------------------
Benchmarks
----------

This section presents the results of a series of benchmarks run on the
*SearchSnippets* dataset. Sixteen models were trained with different numbers
of iterations (from 10 to 2000) and default model parameters. The number of
topics was set to 8. Semantic topic coherence (``u_mass``) and perplexity
were calculated for each model.

.. image:: _static/perplexity.svg
   :alt: Perplexity

.. image:: _static/coherence.svg
   :alt: Semantic topic coherence

-------------------------------------------------------------------------------- /docs/source/bitermplus.metrics.rst: --------------------------------------------------------------------------------
Metrics
=======

.. currentmodule:: bitermplus

.. autofunction:: coherence
.. autofunction:: perplexity
.. autofunction:: entropy
-------------------------------------------------------------------------------- /docs/source/bitermplus.rst: --------------------------------------------------------------------------------
Model
=====

.. currentmodule:: bitermplus

.. autoclass:: BTM
   :members:
-------------------------------------------------------------------------------- /docs/source/bitermplus.util.rst: --------------------------------------------------------------------------------
Utility functions
=================

.. currentmodule:: bitermplus

.. autofunction:: get_words_freqs
.. autofunction:: get_vectorized_docs
.. autofunction:: get_biterms
.. autofunction:: get_top_topic_words
.. autofunction:: get_top_topic_docs
.. autofunction:: get_docs_top_topic
-------------------------------------------------------------------------------- /docs/source/conf.py: --------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options.
# For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'bitermplus'
copyright = '2021, Maksim Terpilowski'
author = 'Maksim Terpilowski'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autosummary',
    'sphinx.ext.napoleon',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
-------------------------------------------------------------------------------- /docs/source/index.rst: --------------------------------------------------------------------------------
bitermplus
==========

*Bitermplus* implements the `Biterm topic model
<https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.402.4032&rep=rep1&type=pdf>`_
for short texts introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi
Cheng. It is essentially a cythonized version of `BTM
<https://github.com/xiaohuiyan/BTM>`_. This package is also capable of computing
*perplexity*, *semantic coherence*, and *entropy* metrics.

.. toctree::
   :maxdepth: 2
   :caption: Usage
   :hidden:

   Installation <install>
   Tutorial <tutorial>
   Benchmarks <benchmarks>

.. toctree::
   :maxdepth: 2
   :caption: API
   :hidden:

   Model <bitermplus>
   Metrics <bitermplus.metrics>
   Utility functions <bitermplus.util>
-------------------------------------------------------------------------------- /docs/source/install.rst: --------------------------------------------------------------------------------
Setup
-----

Linux and Windows
~~~~~~~~~~~~~~~~~

There should be no issues with installing *bitermplus* on these operating
systems. You can install the package directly from PyPI:

.. code-block:: bash

    pip install bitermplus

Or from this repo:

.. code-block:: bash

    pip install git+https://github.com/maximtrp/bitermplus.git

Mac OS
~~~~~~

First, you need to install XCode CLT and `Homebrew <https://brew.sh>`_.
Then, install ``libomp`` using ``brew``:

.. code-block:: bash

    xcode-select --install
    brew install libomp
    pip3 install bitermplus

Requirements
~~~~~~~~~~~~

* cython
* numpy
* pandas
* scipy
* scikit-learn
* tqdm
-------------------------------------------------------------------------------- /docs/source/tutorial.rst: --------------------------------------------------------------------------------
Tutorial
========

Model fitting
-------------

Here is a simple example of model fitting.
It is assumed that you have already gone through the preprocessing
stage: cleaned, lemmatized or stemmed your documents, and removed stop words.

.. code-block:: python

    import bitermplus as btm
    import numpy as np
    import pandas as pd

    # Importing data
    df = pd.read_csv(
        'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
    texts = df['texts'].str.strip().tolist()

    # Vectorizing documents, obtaining full vocabulary and biterms
    # Internally, btm.get_words_freqs uses CountVectorizer from sklearn
    # You can pass any of its arguments to btm.get_words_freqs
    # For example, you can remove stop words:
    stop_words = ["word1", "word2", "word3"]
    X, vocabulary, vocab_dict = btm.get_words_freqs(texts, stop_words=stop_words)
    docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    biterms = btm.get_biterms(docs_vec)

    # Initializing and running model
    model = btm.BTM(
        X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
    model.fit(biterms, iterations=20)


Inference
---------

Now, we will calculate the documents vs topics probability matrix (i.e. make an inference).

.. code-block:: python

    p_zd = model.transform(docs_vec)

If you need to make an inference on a new dataset, you should
vectorize it using the vocabulary from the training set:

.. code-block:: python

    new_docs_vec = btm.get_vectorized_docs(new_texts, vocabulary)
    p_zd = model.transform(new_docs_vec)


Calculating metrics
-------------------

To calculate perplexity, we must provide the documents vs topics probability
matrix (``p_zd``) that we calculated in the previous step.

.. code-block:: python

    perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
    coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
    # or
    perplexity = model.perplexity_
    coherence = model.coherence_
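
In addition to perplexity and coherence, the package implements Renyi entropy
(``btm.entropy``). It can help to choose the optimal number of topics: fit
several models with different values of ``T`` and prefer the one with the
lowest entropy. A minimal sketch of this procedure (the ``models_by_T``
mapping below is hypothetical):

.. code-block:: python

    # Hypothetical mapping {number of topics: fitted BTM model},
    # e.g. models_by_T = {4: model4, 6: model6, 8: model8}
    entropies = {
        T: btm.entropy(model.matrix_topics_words_)
        for T, model in models_by_T.items()}
    best_T = min(entropies, key=entropies.get)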

Visualizing results
-------------------

For results visualization, we will use the `tmplot
<https://github.com/maximtrp/tmplot>`_ package.

.. code-block:: python

    import tmplot as tmp

    # Run the interactive report interface
    tmp.report(model=model, docs=texts)

Filtering stable topics
-----------------------

Unsupervised topic models (such as LDA) are subject to topic instability [1]_
[2]_ [3]_. There is a special method in the ``tmplot`` package for selecting
stable topics. It uses various distance metrics, such as Kullback-Leibler
divergence (symmetric and non-symmetric), Hellinger distance, Jeffreys
divergence, Jensen-Shannon divergence, Jaccard index, Bhattacharyya distance,
and total variation distance.

.. code-block:: python

    import pickle as pkl
    import numpy as np
    import tmplot as tmp
    import glob

    # Loading saved models
    models_files = sorted(glob.glob(r'results/model[0-9].pkl'))
    models = []
    for fn in models_files:
        with open(fn, 'rb') as file:
            models.append(pkl.load(file))

    # Choosing a reference model
    np.random.seed(122334)
    reference_model = np.random.randint(1, 6)

    # Getting close topics
    close_topics, close_kl = tmp.get_closest_topics(
        models, method="sklb", ref=reference_model)

    # Getting stable topics
    stable_topics, stable_kl = tmp.get_stable_topics(
        close_topics, close_kl, ref=reference_model, thres=0.7)

    # Stable topics indices list
    print(stable_topics[:, reference_model])


Model loading and saving
------------------------

Support for model serialization with `pickle
<https://docs.python.org/3/library/pickle.html>`_ was implemented in v0.5.3.
Here is how you can save and load a model:

.. code-block:: python

    import pickle as pkl
    # Saving
    with open("model.pkl", "wb") as file:
        pkl.dump(model, file)

    # Loading
    with open("model.pkl", "rb") as file:
        model = pkl.load(file)


References
----------

.. [1] Koltcov, S., Koltsova, O., & Nikolenko, S. (2014, June).
   Latent dirichlet allocation: stability and applications to studies of
   user-generated content. In Proceedings of the 2014 ACM conference on Web
   science (pp. 161-165).

.. [2] Mantyla, M. V., Claes, M., & Farooq, U. (2018, October).
   Measuring LDA topic stability from clusters of replicated runs. In
   Proceedings of the 12th ACM/IEEE international symposium on empirical
   software engineering and measurement (pp. 1-4).

.. [3] Greene, D., O’Callaghan, D., & Cunningham, P. (2014, September). How many
   topics? stability analysis for topic models. In Joint European conference on
   machine learning and knowledge discovery in databases (pp. 498-513). Springer,
   Berlin, Heidelberg.
159 | -------------------------------------------------------------------------------- /images/topics_terms_plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/bitermplus/20fd0d1601e007aa1567e6ed97a9c906fd869a7f/images/topics_terms_plots.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel", "cython"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bitermplus" 7 | dynamic = ["version"] 8 | description = "Biterm Topic Model" 9 | readme = "README.md" 10 | requires-python = ">=3.7" 11 | license.file = "LICENSE" 12 | authors = [ 13 | { name = "Maksim Terpilovskii", email = "maximtrp@gmail.com" }, 14 | ] 15 | keywords = [ 16 | "topic model", 17 | "machine learning", 18 | "nlp" 19 | ] 20 | classifiers = [ 21 | "License :: OSI Approved :: MIT License", 22 | "Operating System :: OS Independent", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Topic :: Scientific/Engineering :: Information Analysis", 28 | "Topic :: Text Processing :: General", 29 | ] 30 | urls.homepage = "https://github.com/maximtrp/bitermplus" 31 | urls.documentation = "https://bitermplus.readthedocs.io/" 32 | 33 | dependencies = [ 34 | "numpy", 35 | "cython", 36 | "pandas", 37 | "scipy", 38 | "scikit-learn>=1.0.0", 39 | "tqdm", 40 | ] 41 | 42 | [tool.setuptools] 43 | include-package-data = false 44 | 45 | [tool.setuptools.dynamic] 46 | version = {attr = "bitermplus.__version__"} 47 | 48 | [tool.setuptools.packages.find] 49 | where = ["src"] 50 | include = ["bitermplus"] 51 | exclude = ["tests"] 52 | 53 | [project.optional-dependencies] 54 | test = ["pytest"] 55 | 56 | [tool.pytest.ini_options] 57 | pythonpath = [ 58 | ".", "src", 59 | ] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from platform import system 2 | from setuptools import setup, Extension 3 | from Cython.Build import cythonize 4 | # from numpy import get_include 5 | 6 | extra_link_args = ['-lomp'] if system() == 'Darwin' else ['-fopenmp'] 7 | extra_compile_args = ['-Xpreprocessor', '-fopenmp']\ 8 | if system() == 'Darwin'\ 9 | else ['-fopenmp'] 10 | 11 | ext_modules = [ 12 | Extension( 13 | "bitermplus._btm", 14 | sources=["src/bitermplus/_btm.pyx"], 15 | extra_compile_args=extra_compile_args, 16 | extra_link_args=extra_link_args), 17 | Extension( 18 | "bitermplus._metrics", 19 | # include_dirs=[get_include()], 20 | # library_dirs=[get_include()], 21 | sources=["src/bitermplus/_metrics.pyx"], 22 | extra_compile_args=extra_compile_args, 23 | extra_link_args=extra_link_args), 24 | ] 25 | 26 | setup( 27 | ext_modules=cythonize( 28 | ext_modules, 29 | compiler_directives={ 30 | 'embedsignature': True, 31 | 'language_level': 3}) 32 | ) 33 | -------------------------------------------------------------------------------- /src/bitermplus/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.7.0' 2 | 3 | from ._btm import BTM 4 | from ._util import * 5 | from ._metrics import * 6 | 
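# The star imports above re-export the public API defined in each submodule's
# __all__: utility helpers (get_words_freqs, get_vectorized_docs, get_biterms,
# get_top_topic_words, get_top_topic_docs, get_docs_top_topic) and metrics
# (perplexity, coherence, entropy).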
-------------------------------------------------------------------------------- /src/bitermplus/_btm.pyx: -------------------------------------------------------------------------------- 1 | __all__ = ['BTM'] 2 | 3 | # from cython.parallel import prange 4 | from libc.time cimport time 5 | from cython.view cimport array 6 | from itertools import chain 7 | from cython import cdivision, wraparound, boundscheck, initializedcheck,\ 8 | auto_pickle, nonecheck 9 | import numpy as np 10 | import tqdm 11 | from pandas import DataFrame 12 | from ._metrics import coherence, perplexity 13 | 14 | 15 | @cdivision(True) 16 | @wraparound(False) 17 | @boundscheck(False) 18 | cdef int sample_mult(double[:] p, double random_factor): 19 | cdef int K = p.shape[0] 20 | cdef int i, k 21 | 22 | for i in range(1, K): 23 | p[i] += p[i - 1] 24 | 25 | for k in range(0, K): 26 | if p[k] >= random_factor * p[K - 1]: 27 | break 28 | 29 | return k 30 | 31 | 32 | @auto_pickle(False) 33 | cdef class BTM: 34 | """Biterm Topic Model. 35 | 36 | Parameters 37 | ---------- 38 | n_dw : csr.csr_matrix 39 | Documents vs words frequency matrix. Typically, it should be the output 40 | of `CountVectorizer` from sklearn package. 41 | vocabulary : list 42 | Vocabulary (a list of words). 43 | T : int 44 | Number of topics. 45 | M : int = 20 46 | Number of top words for coherence calculation. 47 | alpha : float = 1 48 | Model parameter. 49 | beta : float = 0.01 50 | Model parameter. 51 | seed : int = 0 52 | Random state seed. If seed is equal to 0 (default), 53 | use ``time(NULL)``. 54 | win : int = 15 55 | Biterms generation window. 56 | has_background : bool = False 57 | Use a background topic to accumulate highly frequent words. 58 | """ 59 | cdef: 60 | n_dw 61 | vocabulary 62 | int T 63 | int W 64 | int M 65 | double alpha 66 | double beta 67 | int win 68 | bint has_background 69 | double[:] n_bz # T x 1 70 | double[:] p_z # T x 1 71 | double[:, :] p_wz # T x W 72 | double[:, :] n_wz # T x W 73 | double[:, :] p_zd # D x T 74 | double[:] p_wb 75 | int[:, :] B 76 | int iters 77 | unsigned int seed 78 | 79 | # cdef dict __dict__ 80 | 81 | def __init__( 82 | self, n_dw, vocabulary, int T, int M=20, 83 | double alpha=1., double beta=0.01, unsigned int seed=0, 84 | int win=15, bint has_background=False): 85 | self.n_dw = n_dw 86 | self.vocabulary = vocabulary 87 | self.T = T 88 | self.W = len(vocabulary) 89 | self.M = M 90 | self.alpha = alpha 91 | self.beta = beta 92 | self.win = win 93 | self.seed = seed 94 | self.p_wb = np.asarray(n_dw.sum(axis=0) / n_dw.sum())[0] 95 | self.p_z = array( 96 | shape=(self.T, ), itemsize=sizeof(double), format="d", 97 | allocate_buffer=True) 98 | self.n_bz = array( 99 | shape=(self.T, ), itemsize=sizeof(double), format="d", 100 | allocate_buffer=True) 101 | self.n_wz = array( 102 | shape=(self.T, self.W), itemsize=sizeof(double), format="d", 103 | allocate_buffer=True) 104 | self.p_wz = array( 105 | shape=(self.T, self.W), itemsize=sizeof(double), format="d", 106 | allocate_buffer=True) 107 | self.p_zd = array( 108 | shape=(self.n_dw.shape[0], self.T), itemsize=sizeof(double), 109 | format="d", allocate_buffer=True) 110 | self.p_z[...] = 0. 111 | self.p_wz[...] = 0. 112 | self.p_zd[...] = 0. 113 | self.n_wz[...] = 0. 114 | self.n_bz[...] = 0. 
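        # The count buffers (n_bz: biterms per topic, n_wz: words per topic)
        # are filled during Gibbs sampling in fit(); the probability buffers
        # (p_z, p_wz, p_zd) are computed from those counts afterwards.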
115 | self.has_background = has_background 116 | self.iters = 0 117 | 118 | def __getstate__(self): 119 | return { 120 | 'alpha': self.alpha, 121 | 'beta': self.beta, 122 | 'B': np.asarray(self.B), 123 | 'T': self.T, 124 | 'W': self.W, 125 | 'M': self.M, 126 | 'win': self.win, 127 | 'n_dw': self.n_dw, 128 | 'vocabulary': self.vocabulary, 129 | 'has_background': self.has_background, 130 | 'iters': self.iters, 131 | 'alpha': self.alpha, 132 | 'n_bz': np.asarray(self.n_bz), 133 | 'n_wz': np.asarray(self.n_wz), 134 | 'p_zd': np.asarray(self.p_zd), 135 | 'p_wz': np.asarray(self.p_wz), 136 | 'p_wb': np.asarray(self.p_wb), 137 | 'p_z': np.asarray(self.p_z) 138 | } 139 | 140 | def __setstate__(self, state): 141 | self.alpha = state.get('alpha') 142 | self.beta = state.get('beta') 143 | self.B = state.get('B', np.zeros((0, 0))).astype(np.int32) 144 | self.T = state.get('T') 145 | self.W = state.get('W') 146 | self.M = state.get('M') 147 | self.win = state.get('win') 148 | self.n_dw = state.get('n_dw') 149 | self.vocabulary = state.get('vocabulary') 150 | self.has_background = state.get('has_background') 151 | self.iters = state.get('iters', 0) 152 | self.n_bz = state.get('n_bz') 153 | self.n_wz = state.get('n_wz') 154 | self.p_zd = state.get('p_zd') 155 | self.p_wz = state.get('p_wz') 156 | self.p_wb = state.get('p_wb') 157 | self.p_z = state.get('p_z') 158 | 159 | cdef int[:, :] _biterms_to_array(self, list B): 160 | rng = np.random.default_rng(self.seed if self.seed else time(NULL)) 161 | arr = np.asarray(list(chain(*B)), dtype=np.int32) 162 | random_topics = rng.integers( 163 | low=0, high=self.T, size=(arr.shape[0], 1), dtype=np.int32) 164 | arr = np.append(arr, random_topics, axis=1) 165 | return arr 166 | 167 | @initializedcheck(False) 168 | @boundscheck(False) 169 | @wraparound(False) 170 | @cdivision(True) 171 | cdef void _compute_p_wz(self): 172 | cdef int k, w 173 | for k in range(self.T): 174 | for w in range(self.W): 175 | self.p_wz[k][w] = (self.n_wz[k][w] + self.beta) / \ 176 | (self.n_bz[k] * 2. + self.W * self.beta) 177 | 178 | @boundscheck(False) 179 | @cdivision(True) 180 | @wraparound(False) 181 | @initializedcheck(False) 182 | cdef void _compute_p_zb(self, long i, double[:] p_z): 183 | cdef double pw1k, pw2k, pk, p_z_sum 184 | cdef int w1 = self.B[i, 0] 185 | cdef int w2 = self.B[i, 1] 186 | cdef int k 187 | 188 | for k in range(self.T): 189 | if self.has_background is True and k == 0: 190 | pw1k = self.p_wb[w1] 191 | pw2k = self.p_wb[w2] 192 | else: 193 | pw1k = (self.n_wz[k][w1] + self.beta) / \ 194 | (2. * self.n_bz[k] + self.W * self.beta) 195 | pw2k = (self.n_wz[k][w2] + self.beta) / \ 196 | (2. * self.n_bz[k] + 1. + self.W * self.beta) 197 | pk = (self.n_bz[k] + self.alpha) / \ 198 | (self.B.shape[0] + self.T * self.alpha) 199 | p_z[k] = pk * pw1k * pw2k 200 | 201 | # return p_z # self._normalize(p_z) 202 | 203 | @boundscheck(False) 204 | @cdivision(True) 205 | @wraparound(False) 206 | @initializedcheck(False) 207 | cdef void _normalize(self, double[:] p, double smoother=0.0): 208 | """Normalize values in place.""" 209 | cdef: 210 | int i = 0 211 | int num = p.shape[0] 212 | 213 | cdef double p_sum = 0. 214 | for i in range(num): 215 | p_sum += p[i] 216 | 217 | for i in range(num): 218 | p[i] = (p[i] + smoother) / (p_sum + num * smoother) 219 | 220 | @initializedcheck(False) 221 | @boundscheck(False) 222 | @wraparound(False) 223 | cpdef fit(self, list Bs, int iterations=600, bint verbose=True): 224 | """Biterm topic model fitting method. 
225 | 226 | Parameters 227 | ---------- 228 | Bs : list 229 | Biterms list. 230 | iterations : int = 600 231 | Iterations number. 232 | verbose : bool = True 233 | Show progress bar. 234 | """ 235 | self.B = self._biterms_to_array(Bs) 236 | # rng = np.random.default_rng(self.seed if self.seed else time(NULL)) 237 | # random_factors = rng.random( 238 | # low=0, high=self.T, size=(arr.shape[0], 1)) 239 | 240 | cdef: 241 | long i 242 | int j, w1, w2, topic 243 | long B_len = self.B.shape[0] 244 | double[:] p_z = array( 245 | shape=(self.T, ), itemsize=sizeof(double), format="d", 246 | allocate_buffer=True) 247 | double[:] rnd_uniform = array( 248 | shape=(B_len, ), itemsize=sizeof(double), format="d", 249 | allocate_buffer=True) 250 | 251 | rng = np.random.default_rng(self.seed if self.seed else time(NULL)) 252 | trange = tqdm.trange if verbose else range 253 | 254 | for i in range(B_len): 255 | w1 = self.B[i, 0] 256 | w2 = self.B[i, 1] 257 | topic = self.B[i, 2] 258 | self.n_bz[topic] += 1 259 | self.n_wz[topic][w1] += 1 260 | self.n_wz[topic][w2] += 1 261 | 262 | for j in trange(iterations): 263 | rnd_uniform = rng.uniform(0, 1, B_len) 264 | for i in range(B_len): 265 | w1 = self.B[i, 0] 266 | w2 = self.B[i, 1] 267 | topic = self.B[i, 2] 268 | 269 | self.n_bz[topic] -= 1 270 | self.n_wz[topic][w1] -= 1 271 | self.n_wz[topic][w2] -= 1 272 | 273 | # Topic reset 274 | # self.B[i, 2] = -1 275 | 276 | # Topic sample 277 | self._compute_p_zb(i, p_z) 278 | topic = sample_mult(p_z, rnd_uniform[i]) 279 | self.B[i, 2] = topic 280 | 281 | self.n_bz[topic] += 1 282 | self.n_wz[topic][w1] += 1 283 | self.n_wz[topic][w2] += 1 284 | 285 | self.iters = iterations 286 | self.p_z[:] = self.n_bz 287 | self._normalize(self.p_z, self.alpha) 288 | self._compute_p_wz() 289 | 290 | @cdivision(True) 291 | cdef long _count_biterms(self, int n, int win=15): 292 | cdef: 293 | int i, j 294 | long btn = 0 295 | for i in range(n-1): 296 | for j in range(i+1, min(i + win, n)): # range(i+1, n): 297 | btn += 1 298 | return btn 299 | 300 | @initializedcheck(False) 301 | @boundscheck(False) 302 | @wraparound(False) 303 | cdef int[:, :] _generate_biterms( 304 | self, 305 | int[:, :] biterms, 306 | int[:] words, 307 | int win=15): 308 | cdef int i, j, words_len = words.shape[0] 309 | cdef long n = 0 310 | 311 | for i in range(words_len-1): 312 | # for j in range(i+1, words_len): # min(i + win, words_len)): 313 | for j in range(i+1, min(i + win, words_len)): 314 | biterms[n, 0] = min(words[i], words[j]) 315 | biterms[n, 1] = max(words[i], words[j]) 316 | n += 1 317 | return biterms 318 | 319 | @initializedcheck(False) 320 | @boundscheck(False) 321 | @wraparound(False) 322 | cdef double[:] _infer_doc(self, int[:] doc, str infer_type, int doc_len): 323 | cdef double[:] p_zd = array( 324 | shape=(self.T, ), itemsize=sizeof(double), format="d", 325 | allocate_buffer=True) 326 | 327 | if (infer_type == "sum_b"): 328 | p_zd = self._infer_doc_sum_b(doc, doc_len) 329 | elif (infer_type == "sum_w"): 330 | p_zd = self._infer_doc_sum_w(doc, doc_len) 331 | elif (infer_type == "mix"): 332 | p_zd = self._infer_doc_mix(doc, doc_len) 333 | else: 334 | return None 335 | 336 | return p_zd 337 | 338 | @initializedcheck(False) 339 | @boundscheck(False) 340 | @wraparound(False) 341 | cdef double[:] _infer_doc_sum_b(self, int[:] doc, int doc_len): 342 | cdef double[:] p_zd = array( 343 | shape=(self.T, ), itemsize=sizeof(double), format="d", 344 | allocate_buffer=True) 345 | 346 | cdef double[:] p_zb = array( 347 | shape=(self.T, ), 
itemsize=sizeof(double), format="d", 348 | allocate_buffer=True) 349 | 350 | p_zd[...] = 0. 351 | p_zb[...] = 0. 352 | cdef long b, combs_num 353 | cdef int w1, w2 354 | cdef int[:, :] biterms 355 | 356 | if doc_len == 1: 357 | for t in range(self.T): 358 | p_zd[t] = self.p_z[t] * self.p_wz[t][doc[0]] 359 | else: 360 | combs_num = self._count_biterms(doc_len, self.win) 361 | biterms = array( 362 | shape=(combs_num, 2), itemsize=sizeof(int), format="i", 363 | allocate_buffer=True) 364 | biterms = self._generate_biterms(biterms, doc, self.win) 365 | 366 | for b in range(combs_num): 367 | w1 = biterms[b, 0] 368 | w2 = biterms[b, 1] 369 | 370 | if w2 >= self.W: 371 | continue 372 | 373 | for t in range(self.T): 374 | p_zb[t] = self.p_z[t] * self.p_wz[t][w1] * self.p_wz[t][w2] 375 | self._normalize(p_zb) 376 | 377 | for t in range(self.T): 378 | p_zd[t] += p_zb[t] 379 | self._normalize(p_zd) 380 | return p_zd 381 | 382 | @initializedcheck(False) 383 | @boundscheck(False) 384 | @wraparound(False) 385 | cdef double[:] _infer_doc_sum_w(self, int[:] doc, int doc_len): 386 | cdef int i 387 | cdef int w 388 | cdef double[:] p_zd = array( 389 | shape=(self.T, ), itemsize=sizeof(double), format="d", 390 | allocate_buffer=True) 391 | cdef double[:] p_zw = array( 392 | shape=(self.T, ), itemsize=sizeof(double), format="d", 393 | allocate_buffer=True) 394 | p_zd[...] = 0. 395 | p_zw[...] = 0. 396 | 397 | for i in range(doc_len): 398 | w = doc[i] 399 | if (w >= self.W): 400 | continue 401 | 402 | for t in range(self.T): 403 | p_zw[t] = self.p_z[t] * self.p_wz[t][w] 404 | 405 | self._normalize(p_zw) 406 | 407 | for t in range(self.T): 408 | p_zd[t] += p_zw[t] 409 | 410 | self._normalize(p_zd) 411 | return p_zd 412 | 413 | @initializedcheck(False) 414 | @boundscheck(False) 415 | @wraparound(False) 416 | cdef double[:] _infer_doc_mix(self, int[:] doc, int doc_len): 417 | cdef double[:] p_zd = array( 418 | shape=(self.T, ), itemsize=sizeof(double), format="d") 419 | p_zd[...] = 0. 420 | cdef int i, w, t 421 | 422 | for t in range(self.T): 423 | p_zd[t] = self.p_z[t] 424 | 425 | for i in range(doc_len): 426 | w = doc[i] 427 | if (w >= self.W): 428 | continue 429 | 430 | for t in range(self.T): 431 | p_zd[t] *= (self.p_wz[t][w] * self.W) 432 | 433 | self._normalize(p_zd) 434 | return p_zd 435 | 436 | @initializedcheck(False) 437 | @boundscheck(False) 438 | @wraparound(False) 439 | @nonecheck(False) 440 | cpdef transform( 441 | self, list docs, str infer_type='sum_b', bint verbose=True): 442 | """Return documents vs topics probability matrix. 443 | 444 | Parameters 445 | ---------- 446 | docs : list 447 | Documents list. Each document must be presented as 448 | a list of words ids. Typically, it can be the output of 449 | :meth:`bitermplus.get_vectorized_docs`. 450 | infer_type : str 451 | Inference type. The following options are available: 452 | 453 | 1) ``sum_b`` (default). 454 | 2) ``sum_w``. 455 | 3) ``mix``. 456 | verbose : bool = True 457 | Be verbose (show progress bar). 458 | 459 | Returns 460 | ------- 461 | p_zd : np.ndarray 462 | Documents vs topics probability matrix (D vs T). 463 | """ 464 | cdef int d 465 | cdef int doc_len 466 | cdef int docs_len = len(docs) 467 | cdef double[:, :] p_zd = array( 468 | shape=(docs_len, self.T), itemsize=sizeof(double), format="d", 469 | allocate_buffer=True) 470 | p_zd[...] = 0. 
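        # Each document is processed independently below: empty documents get
        # a zero topic distribution, all others are passed to _infer_doc()
        # with the chosen inference method.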
471 | cdef int[:] doc 472 | 473 | trange = tqdm.trange if verbose else range 474 | 475 | for d in trange(docs_len): 476 | doc = docs[d] 477 | doc_len = doc.shape[0] 478 | if doc_len > 0: 479 | p_zd[d, :] = self._infer_doc(doc, infer_type, doc_len) 480 | else: 481 | p_zd[d, :] = 0. 482 | 483 | self.p_zd = p_zd 484 | np_p_zd = np.asarray(self.p_zd) 485 | np_p_zd[np.isnan(np_p_zd)] = 0. 486 | return np_p_zd 487 | 488 | cpdef fit_transform( 489 | self, docs, list biterms, 490 | str infer_type='sum_b', int iterations=600, bint verbose=True): 491 | """Run model fitting and return documents vs topics matrix. 492 | 493 | Parameters 494 | ---------- 495 | docs : list 496 | Documents list. Each document must be presented as 497 | a list of words ids. Typically, it can be the output of 498 | :meth:`bitermplus.get_vectorized_docs`. 499 | biterms : list 500 | List of biterms. 501 | infer_type : str 502 | Inference type. The following options are available: 503 | 504 | 1) ``sum_b`` (default). 505 | 2) ``sum_w``. 506 | 3) ``mix``. 507 | iterations : int = 600 508 | Iterations number. 509 | verbose : bool = True 510 | Be verbose (show progress bars). 511 | 512 | Returns 513 | ------- 514 | p_zd : np.ndarray 515 | Documents vs topics matrix (D x T). 516 | """ 517 | self.fit(biterms, iterations=iterations, verbose=verbose) 518 | p_zd = self.transform( 519 | docs, infer_type=infer_type, verbose=verbose) 520 | return p_zd 521 | 522 | @property 523 | def matrix_topics_words_(self) -> np.ndarray: 524 | """Topics vs words probabilities matrix.""" 525 | return np.asarray(self.p_wz) 526 | 527 | @property 528 | def matrix_words_topics_(self) -> np.ndarray: 529 | """Words vs topics probabilities matrix.""" 530 | return np.asarray(self.p_wz).T 531 | 532 | @property 533 | def df_words_topics_(self) -> DataFrame: 534 | """Words vs topics probabilities in a DataFrame.""" 535 | return DataFrame(np.asarray(self.p_wz).T, index=self.vocabulary) 536 | 537 | @property 538 | def matrix_docs_topics_(self) -> np.ndarray: 539 | """Documents vs topics probabilities matrix.""" 540 | return np.asarray(self.p_zd) 541 | 542 | @property 543 | def matrix_topics_docs_(self) -> np.ndarray: 544 | """Topics vs documents probabilities matrix.""" 545 | return np.asarray(self.p_zd).T 546 | 547 | @property 548 | def coherence_(self) -> np.ndarray: 549 | """Semantic topics coherence.""" 550 | return coherence(self.p_wz, self.n_dw, M=self.M) 551 | 552 | @property 553 | def perplexity_(self) -> float: 554 | """Perplexity. 
555 | 556 | Run `transform` method before calculating perplexity""" 557 | return perplexity(self.p_wz, self.p_zd, self.n_dw, self.T) 558 | 559 | @property 560 | def vocabulary_(self) -> np.ndarray: 561 | """Vocabulary (list of words).""" 562 | return np.asarray(self.vocabulary) 563 | 564 | @property 565 | def alpha_(self) -> float: 566 | """Model parameter.""" 567 | return self.alpha 568 | 569 | @property 570 | def beta_(self) -> float: 571 | """Model parameter.""" 572 | return self.beta 573 | 574 | @property 575 | def window_(self) -> int: 576 | """Biterms generation window size.""" 577 | return self.win 578 | 579 | @property 580 | def has_background_(self) -> bool: 581 | """Specifies whether the model has a background topic 582 | to accumulate highly frequent words.""" 583 | return self.has_background 584 | 585 | @property 586 | def topics_num_(self) -> int: 587 | """Number of topics.""" 588 | return self.T 589 | 590 | @property 591 | def vocabulary_size_(self) -> int: 592 | """Vocabulary size (number of words).""" 593 | return len(self.vocabulary) 594 | 595 | @property 596 | def coherence_window_(self) -> int: 597 | """Number of top words for coherence calculation.""" 598 | return self.M 599 | 600 | @property 601 | def iterations_(self) -> int: 602 | """Number of iterations the model fitting process has 603 | gone through.""" 604 | return self.iters 605 | 606 | @property 607 | def theta_(self) -> np.ndarray: 608 | """Topics probabilities vector.""" 609 | return np.array(self.p_z) 610 | 611 | @property 612 | def biterms_(self) -> np.ndarray: 613 | """Model biterms. Terms are coded with the corresponding ids.""" 614 | return np.asarray(self.B) 615 | 616 | @property 617 | def labels_(self) -> np.ndarray: 618 | """Model document labels (most probable topic for each document).""" 619 | return np.asarray(self.p_zd).argmax(axis=1) 620 | -------------------------------------------------------------------------------- /src/bitermplus/_metrics.pyx: -------------------------------------------------------------------------------- 1 | __all__ = ['perplexity', 'coherence', 'entropy'] 2 | 3 | from cython.view cimport array 4 | from libc.math cimport exp, log 5 | from typing import Union 6 | from pandas import DataFrame 7 | from scipy.sparse import csr 8 | from cython.parallel import prange 9 | from cython import boundscheck, wraparound, cdivision 10 | import numpy as np 11 | 12 | 13 | @boundscheck(False) 14 | # @wraparound(False) 15 | cpdef double perplexity( 16 | double[:, :] p_wz, 17 | double[:, :] p_zd, 18 | n_dw, 19 | long T): 20 | """Perplexity calculation [1]_. 21 | 22 | Parameters 23 | ---------- 24 | p_wz : np.ndarray 25 | Topics vs words probabilities matrix (T x W). 26 | 27 | p_zd : np.ndarray 28 | Documents vs topics probabilities matrix (D x T). 29 | 30 | n_dw : scipy.sparse.csr_matrix 31 | Words frequency matrix for all documents (D x W). 32 | 33 | T : int 34 | Number of topics. 35 | 36 | Returns 37 | ------- 38 | perplexity : float 39 | Perplexity estimate. 40 | 41 | References 42 | ---------- 43 | .. [1] Heinrich, G. (2005). Parameter estimation for text analysis (pp. 44 | 1-32). Technical report. 45 | 46 | Example 47 | ------- 48 | >>> import bitermplus as btm 49 | >>> # Preprocessing step 50 | >>> # ... 51 | >>> # X, vocabulary, vocab_dict = btm.get_words_freqs(texts) 52 | >>> # Model fitting step 53 | >>> # model = ... 
54 | >>> # Inference step 55 | >>> # p_zd = model.transform(docs_vec_subset) 56 | >>> # Coherence calculation 57 | >>> perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8) 58 | """ 59 | cdef double pwz_pzd_sum = 0. 60 | cdef double exp_num = 0. 61 | cdef double perplexity = 0. 62 | cdef double n = 0 63 | cdef long d, w, t, w_i, w_ri, w_rj 64 | cdef long D = p_zd.shape[0] 65 | cdef long W = p_wz.shape[1] 66 | cdef long[:] n_dw_indptr = n_dw.indptr.astype(int) 67 | cdef long[:] n_dw_indices = n_dw.indices.astype(int) 68 | cdef double n_dw_sum = n_dw.sum() 69 | cdef double[:] n_dw_data = n_dw.data.astype(float) 70 | 71 | # Iterating over all documents 72 | for d in prange(D, nogil=True): 73 | 74 | w_ri = n_dw_indptr[d] 75 | # if d + 1 == D: 76 | # w_rj = W 77 | # else: 78 | w_rj = n_dw_indptr[d+1] 79 | 80 | for w_i in range(w_ri, w_rj): 81 | w = n_dw_indices[w_i] 82 | n = n_dw_data[w_i] 83 | 84 | pwz_pzd_sum = 0. 85 | for t in range(T): 86 | pwz_pzd_sum = pwz_pzd_sum + p_zd[d, t] * p_wz[t, w] 87 | if pwz_pzd_sum > 0: 88 | exp_num += n * log(pwz_pzd_sum) 89 | 90 | perplexity = exp(-exp_num / n_dw_sum) 91 | return perplexity 92 | 93 | 94 | @boundscheck(False) 95 | @wraparound(False) 96 | @cdivision(True) 97 | cpdef coherence( 98 | double[:, :] p_wz, 99 | n_dw, 100 | double eps=1., 101 | int M=20): 102 | """Semantic topic coherence calculation [1]_. 103 | 104 | Parameters 105 | ---------- 106 | p_wz : np.ndarray 107 | Topics vs words probabilities matrix (T x W). 108 | 109 | n_dw : scipy.sparse.csr_matrix 110 | Words frequency matrix for all documents (D x W). 111 | 112 | eps : float 113 | Calculation parameter. It is summed with a word pair 114 | conditional probability. 115 | 116 | M : int 117 | Number of top words in a topic to take. 118 | 119 | Returns 120 | ------- 121 | coherence : np.ndarray 122 | Semantic coherence estimates for all topics. 123 | 124 | References 125 | ---------- 126 | .. [1] Mimno, D., Wallach, H., Talley, E., Leenders, M., & McCallum, A. 127 | (2011, July). Optimizing semantic coherence in topic models. In 128 | Proceedings of the 2011 conference on empirical methods in natural 129 | language processing (pp. 262-272). 130 | 131 | Example 132 | ------- 133 | >>> import bitermplus as btm 134 | >>> # Preprocessing step 135 | >>> # ... 136 | >>> # X, vocabulary, vocab_dict = btm.get_words_freqs(texts) 137 | >>> # Model fitting step 138 | >>> # model = ... 139 | >>> # Coherence calculation 140 | >>> coherence = btm.coherence(model.matrix_topics_words_, X, M=20) 141 | """ 142 | cdef int d, i, j, k, t, tw, w_i, w_ri, w_rj, w 143 | cdef double logSum = 0. 144 | cdef long T = p_wz.shape[0] 145 | cdef long W = p_wz.shape[1] 146 | cdef long D = n_dw.shape[0] 147 | cdef long n 148 | cdef long[:] n_dw_indices = n_dw.indices.astype(int) 149 | cdef long[:] n_dw_indptr = n_dw.indptr.astype(int) 150 | cdef long n_dw_len = n_dw_indices.shape[0] 151 | cdef long[:] n_dw_data = n_dw.data.astype(int) 152 | cdef long[:, :] top_words = np.zeros((M, T), dtype=int) 153 | cdef double[:] coherence = np.zeros(T, dtype=float) 154 | cdef int w1 = 0 155 | cdef int w2 = 0 156 | cdef double D_ij = 0. 157 | cdef double D_j = 0. 158 | 159 | for t in range(T): 160 | words_idx_sorted = np.argsort(p_wz[t, :])[:-M-1:-1] 161 | for i in range(M): 162 | top_words[i, t] = words_idx_sorted[i] 163 | 164 | for t in range(T): 165 | logSum = 0. 166 | for i in prange(1, M, nogil=True): 167 | for j in range(0, i): 168 | D_ij = 0. 169 | D_j = 0. 
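                # D_ij counts the documents that contain both top words i and
                # j of topic t; D_j counts the documents that contain top word
                # j. They form the u_mass coherence term
                # log((D_ij + eps) / D_j) accumulated into logSum below.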

                for d in range(D):
                    w1 = 0
                    w2 = 0
                    w_ri = n_dw_indptr[d]
                    # n_dw_indptr has D+1 entries, so indexing d+1 is always
                    # valid (n_dw_indptr[D] equals the number of stored
                    # elements of the sparse matrix)
                    w_rj = n_dw_indptr[d+1]

                    for w_i in range(w_ri, w_rj):
                        w = n_dw_indices[w_i]
                        n = n_dw_data[w_i]
                        for tw in range(M):
                            if (top_words[i, t] == w and n > 0):
                                w1 = 1
                            elif (top_words[j, t] == w and n > 0):
                                w2 = 1
                    D_ij += float(w1 & w2)
                    D_j += float(w2)
                logSum += log((D_ij + eps) / D_j)
        coherence[t] = logSum

    return np.array(coherence)


@boundscheck(False)
@wraparound(False)
@cdivision(True)
cpdef entropy(
        double[:, :] p_wz,
        bint max_probs=True):
    """Renyi entropy calculation routine [1]_.

    Renyi entropy can be used to estimate the optimal number of topics: just fit
    several models with a different number of topics and choose the number of
    topics for which the Renyi entropy is the least.

    Parameters
    ----------
    p_wz : np.ndarray
        Topics vs words probabilities matrix (T x W).

    max_probs : bool
        Use maximum probabilities of terms per topics instead of all
        probability values.

    Returns
    -------
    renyi : double
        Renyi entropy value.

    References
    ----------
    .. [1] Koltcov, S. (2018). Application of Rényi and Tsallis entropies to
        topic modeling optimization. Physica A: Statistical Mechanics and its
        Applications, 512, 1192-1204.

    Example
    -------
    >>> import bitermplus as btm
    >>> # Preprocessing step
    >>> # ...
    >>> # Model fitting step
    >>> # model = ...
    >>> # Entropy calculation
    >>> entropy = btm.entropy(model.matrix_topics_words_)
    """
    # Words number
    cdef int W = p_wz.shape[1]
    # Topics number
    cdef int T = p_wz.shape[0]

    # Initializing variables
    cdef double word_ratio = 0.
    cdef double sum_prob = 0.
    cdef double shannon = 0.
    cdef double energy = 0.
    cdef double int_energy = 0.
    cdef double free_energy = 0.
    cdef double renyi = 0.
    cdef int t = 0
    cdef int w = 0

    # Setting threshold
    cdef double thresh = 1.
/ W 254 | 255 | for w in range(W): 256 | for t in range(T): 257 | if not max_probs or (max_probs and p_wz[t, w] > thresh): 258 | sum_prob += p_wz[t, w] 259 | word_ratio += 1 260 | 261 | # Shannon entropy 262 | shannon = log(word_ratio / (W * T)) 263 | 264 | # Internal energy 265 | int_energy = -log(sum_prob / T) 266 | 267 | # Free energy 268 | free_energy = int_energy - shannon * T 269 | 270 | # Renyi entropy 271 | if T == 1: 272 | renyi = free_energy / T 273 | else: 274 | renyi = free_energy / (T-1) 275 | 276 | return renyi 277 | -------------------------------------------------------------------------------- /src/bitermplus/_util.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'get_words_freqs', 'get_vectorized_docs', 3 | 'get_biterms', 'get_top_topic_words', 4 | 'get_top_topic_docs', 'get_docs_top_topic'] 5 | 6 | from typing import List, Union, Tuple, Dict, Sequence, Any 7 | from scipy.sparse import csr_matrix 8 | from pandas import DataFrame, Series, concat 9 | from sklearn.feature_extraction.text import CountVectorizer 10 | import numpy as np 11 | from ._btm import BTM 12 | 13 | 14 | def get_words_freqs( 15 | docs: Union[List[str], np.ndarray, Series], 16 | **kwargs: dict) -> Tuple[csr_matrix, np.ndarray, Dict]: 17 | """Compute words vs documents frequency matrix. 18 | 19 | Parameters 20 | ---------- 21 | docs : Union[List[str], np.ndarray, Series] 22 | Documents in any format that can be passed to 23 | :meth:`sklearn.feature_extraction.text.CountVectorizer` method. 24 | kwargs : dict 25 | Keyword arguments for 26 | :meth:`sklearn.feature_extraction.text.CountVectorizer` method. 27 | 28 | Returns 29 | ------- 30 | Tuple[scipy.sparse.csr_matrix, np.ndarray, Dict] 31 | Documents vs words matrix in CSR format, 32 | vocabulary as a numpy.ndarray of terms, 33 | and vocabulary as a dictionary of {term: id} pairs. 34 | 35 | Example 36 | ------- 37 | >>> import pandas as pd 38 | >>> import bitermplus as btm 39 | 40 | >>> # Loading data 41 | >>> df = pd.read_csv( 42 | ... 'dataset/SearchSnippets.txt.gz', header=None, names=['texts']) 43 | >>> texts = df['texts'].str.strip().tolist() 44 | 45 | >>> # Vectorizing documents, obtaining full vocabulary and biterms 46 | >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts) 47 | """ 48 | vec = CountVectorizer(**kwargs) 49 | X = vec.fit_transform(docs) 50 | words = np.array(vec.get_feature_names_out()) 51 | return X, words, vec.vocabulary_ 52 | 53 | 54 | def get_vectorized_docs( 55 | docs: Union[List[str], np.ndarray], 56 | vocab: Union[List[str], np.ndarray]) -> List[np.ndarray]: 57 | """Replace words with their ids in each document. 58 | 59 | Parameters 60 | ---------- 61 | docs : Union[List[str], np.ndarray] 62 | Documents (iterable of strings). 63 | vocab: Union[List[str], np.ndarray] 64 | Vocabulary (iterable of terms). 65 | 66 | Returns 67 | ------- 68 | docs : List[np.ndarray] 69 | Vectorised documents (list of ``numpy.ndarray`` 70 | objects with terms ids). 71 | 72 | Example 73 | ------- 74 | >>> import pandas as pd 75 | >>> import bitermplus as btm 76 | 77 | >>> # Loading data 78 | >>> df = pd.read_csv( 79 | ... 


def get_vectorized_docs(
        docs: Union[List[str], np.ndarray],
        vocab: Union[List[str], np.ndarray]) -> List[np.ndarray]:
    """Replace words with their ids in each document.

    Parameters
    ----------
    docs : Union[List[str], np.ndarray]
        Documents (iterable of strings).
    vocab : Union[List[str], np.ndarray]
        Vocabulary (iterable of terms).

    Returns
    -------
    docs : List[np.ndarray]
        Vectorized documents (list of ``numpy.ndarray``
        objects with term ids).

    Example
    -------
    >>> import pandas as pd
    >>> import bitermplus as btm

    >>> # Loading data
    >>> df = pd.read_csv(
    ...     'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
    >>> texts = df['texts'].str.strip().tolist()

    >>> # Vectorizing documents, obtaining full vocabulary and biterms
    >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
    >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    """
    vocab_idx = dict(zip(vocab, range(len(vocab))))

    def _parse_words(w):
        return vocab_idx.get(w)

    # Out-of-vocabulary words map to None and are dropped; the explicit
    # `is not None` check keeps the term with id 0, which a bare
    # `filter(None, ...)` would silently discard.
    return list(
        map(
            lambda doc:
            np.array(
                list(filter(
                    lambda wid: wid is not None,
                    map(_parse_words, doc.split()))),
                dtype=np.int32),
            docs))


def get_biterms(
        docs: List[np.ndarray],
        win: int = 15) -> List[List[int]]:
    """Biterms creation routine.

    Parameters
    ----------
    docs : List[np.ndarray]
        List of numpy.ndarray objects containing word indices.
    win : int = 15
        Biterms generation window.

    Returns
    -------
    List[List[int]]
        List of biterms for each document.

    Example
    -------
    >>> import pandas as pd
    >>> import bitermplus as btm

    >>> # Loading data
    >>> df = pd.read_csv(
    ...     'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
    >>> texts = df['texts'].str.strip().tolist()

    >>> # Vectorizing documents, obtaining full vocabulary and biterms
    >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
    >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    >>> biterms = btm.get_biterms(docs_vec)
    """
    biterms = []
    for doc in docs:
        doc_biterms = []
        doc_len = len(doc)
        # Documents shorter than two words yield no biterms and are skipped
        if doc_len < 2:
            continue
        for i in range(doc_len - 1):
            for j in range(i + 1, min(i + win, doc_len)):
                # Store each pair in canonical (smaller id, larger id) order
                wi = min(doc[i], doc[j])
                wj = max(doc[i], doc[j])
                doc_biterms.append([wi, wj])
        biterms.append(doc_biterms)
    return biterms
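# A small illustration (not part of the module) of what get_biterms returns:
# within the window, every unordered word pair of a document becomes one
# biterm, stored as [smaller_id, larger_id]:
#
#     >>> import numpy as np
#     >>> get_biterms([np.array([3, 1, 2], dtype=np.int32)])
#     [[[1, 3], [2, 3], [1, 2]]]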


def get_top_topic_words(
        model: BTM,
        words_num: int = 20,
        topics_idx: Sequence[Any] = None) -> DataFrame:
    """Select top topic words from a fitted model.

    Parameters
    ----------
    model : bitermplus._btm.BTM
        Fitted BTM model.
    words_num : int = 20
        The number of words to select.
    topics_idx : Sequence[Any] = None
        Topics indices. Meant to be used to select only stable
        topics.

    Returns
    -------
    DataFrame
        Words with the highest probabilities for each selected topic.

    Example
    -------
    >>> stable_topics = [0, 3, 10, 12, 18, 21]
    >>> top_words = btm.get_top_topic_words(
    ...     model,
    ...     words_num=100,
    ...     topics_idx=stable_topics)
    """
    def _select_words(model, topic_id: int):
        # Indices of the `words_num` most probable words, descending
        probs = model.matrix_topics_words_[topic_id, :]
        idx = np.argsort(probs)[:-words_num-1:-1]
        result = Series(model.vocabulary_[idx])
        result.name = 'topic{}'.format(topic_id)
        return result

    topics_num = model.topics_num_
    topics_idx = np.arange(topics_num) if topics_idx is None else topics_idx
    return concat(
        map(lambda x: _select_words(model, x), topics_idx), axis=1)


def get_top_topic_docs(
        docs: Sequence[Any],
        p_zd: np.ndarray,
        docs_num: int = 20,
        topics_idx: Sequence[Any] = None) -> DataFrame:
    """Select top topic docs from a fitted model.

    Parameters
    ----------
    docs : Sequence[Any]
        Iterable of documents (e.g. list of strings).
    p_zd : np.ndarray
        Documents vs topics probabilities matrix.
    docs_num : int = 20
        The number of documents to select.
    topics_idx : Sequence[Any] = None
        Topics indices. Meant to be used to select only stable
        topics.

    Returns
    -------
    DataFrame
        Documents with the highest probabilities in all selected topics.

    Example
    -------
    >>> top_docs = btm.get_top_topic_docs(
    ...     texts,
    ...     p_zd,
    ...     docs_num=100,
    ...     topics_idx=[1, 2, 3, 4])
    """
    def _select_docs(docs, p_zd, topic_id: int):
        # Indices of the `docs_num` most probable documents for the topic
        probs = p_zd[:, topic_id]
        idx = np.argsort(probs)[:-docs_num-1:-1]
        result = Series(np.asarray(docs)[idx])
        result.name = 'topic{}'.format(topic_id)
        return result

    topics_num = p_zd.shape[1]
    topics_idx = np.arange(topics_num) if topics_idx is None else topics_idx
    return concat(
        map(lambda x: _select_docs(docs, p_zd, x), topics_idx), axis=1)


def get_docs_top_topic(
        docs: Sequence[Any],
        p_zd: np.ndarray) -> DataFrame:
    """Select the most probable topic for each document.

    Parameters
    ----------
    docs : Sequence[Any]
        Iterable of documents (e.g. list of strings).
    p_zd : np.ndarray
        Documents vs topics probabilities matrix.

    Returns
    -------
    DataFrame
        Documents and the most probable topic for each of them.

    Example
    -------
    >>> import bitermplus as btm
    >>> # Read documents from file
    >>> # texts = ...
    >>> # Build and train a model
    >>> # model = ...
    >>> # model.fit(...)
    >>> btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
    """
    return DataFrame({'documents': docs, 'label': p_zd.argmax(axis=1)})
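# A short aside (not part of the module): the slice used in the helpers
# above, np.argsort(probs)[:-n-1:-1], picks the indices of the n largest
# values in descending order, e.g.:
#
#     >>> import numpy as np
#     >>> np.argsort(np.array([0.1, 0.7, 0.2]))[:-2-1:-1]
#     array([1, 2])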
--------------------------------------------------------------------------------
/tests/test_btm.py:
--------------------------------------------------------------------------------
import unittest
import pickle as pkl
import logging
import numpy as np
import pandas as pd
from src import bitermplus as btm

LOGGER = logging.getLogger(__name__)


class TestBTM(unittest.TestCase):

    # Main tests
    def test_btm_class(self):
        # Importing and vectorizing text data
        df = pd.read_csv(
            "dataset/SearchSnippets.txt.gz", header=None, names=["texts"])
        texts = df["texts"].str.strip().tolist()

        # Vectorizing documents, obtaining full vocabulary and biterms
        X, vocabulary, _ = btm.get_words_freqs(texts)
        docs_vec = btm.get_vectorized_docs(texts, vocabulary)
        biterms = btm.get_biterms(docs_vec)

        LOGGER.info("Modeling started")
        topics_num = 8
        model = btm.BTM(
            X,
            vocabulary,
            seed=52214,
            T=topics_num,
            M=20,
            alpha=50 / topics_num,
            beta=0.01,
        )
        model.fit(biterms, iterations=20)
        self.assertIsInstance(model.matrix_topics_words_, np.ndarray)
        self.assertTupleEqual(
            model.matrix_topics_words_.shape, (topics_num, vocabulary.size)
        )
        LOGGER.info("Modeling finished")

        LOGGER.info('Inference "sum_b" started')
        # An empty document is injected to check that inference handles it
        docs_vec_subset = docs_vec[:1000]
        docs_vec_subset[100] = np.array([], dtype=np.int32)
        p_zd = model.transform(docs_vec_subset)
        self.assertTupleEqual(p_zd.shape, (1000, topics_num))
        LOGGER.info('Inference "sum_b" finished')

        LOGGER.info("Model saving started")
        with open("model.pickle", "wb") as file:
            pkl.dump(model, file)
        LOGGER.info("Model saving finished")

        LOGGER.info('Inference "sum_w" started')
        p_zd = model.transform(docs_vec_subset, infer_type="sum_w")
        LOGGER.info('Inference "sum_w" finished')

        LOGGER.info('Inference "mix" started')
        p_zd = model.transform(docs_vec_subset, infer_type="mix")
        LOGGER.info('Inference "mix" finished')

        LOGGER.info("Perplexity testing started")
        perplexity = btm.perplexity(
            model.matrix_topics_words_, p_zd, X, topics_num)
        # The standalone routine should agree with the value cached on
        # the model
        self.assertAlmostEqual(perplexity, model.perplexity_)
        self.assertIsInstance(perplexity, float)
        self.assertNotEqual(perplexity, 0.0)
        LOGGER.info(f"Perplexity value: {perplexity}")
        LOGGER.info("Perplexity testing finished")

        LOGGER.info("Coherence testing started")
        coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
        self.assertTrue(np.allclose(coherence, model.coherence_))
        self.assertIsInstance(coherence, np.ndarray)
        self.assertGreater(coherence.shape[0], 0)
        LOGGER.info(f"Coherence value: {coherence}")
        LOGGER.info("Coherence testing finished")

        LOGGER.info("Entropy testing started")
        entropy = btm.entropy(model.matrix_topics_words_, max_probs=True)
        self.assertNotEqual(entropy, 0)
        LOGGER.info(f"Entropy value: {entropy}")
        LOGGER.info("Entropy testing finished")

        LOGGER.info("Model loading started")
        with open("model.pickle", "rb") as file:
            self.assertIsInstance(pkl.load(file), btm._btm.BTM)
        LOGGER.info("Model loading finished")


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------