├── .coveragerc ├── .github └── workflows │ ├── package-publish.yml │ └── package-test.yml ├── .gitignore ├── .readthedocs.yml ├── LICENSE ├── MANIFEST.in ├── README.md ├── dataset └── SearchSnippets.txt.gz ├── docs ├── Makefile ├── make.bat ├── requirements.txt └── source │ ├── _static │ ├── coherence.svg │ └── perplexity.svg │ ├── benchmarks.rst │ ├── bitermplus.metrics.rst │ ├── bitermplus.rst │ ├── bitermplus.util.rst │ ├── conf.py │ ├── index.rst │ ├── install.rst │ └── tutorial.rst ├── images └── topics_terms_plots.png ├── pyproject.toml ├── setup.py ├── src └── bitermplus │ ├── __init__.py │ ├── _btm.pyx │ ├── _metrics.pyx │ └── _util.py └── tests └── test_btm.py /.coveragerc: -------------------------------------------------------------------------------- 1 | [run] 2 | plugins = Cython.Coverage 3 | source = src/bitermplus -------------------------------------------------------------------------------- /.github/workflows/package-publish.yml: -------------------------------------------------------------------------------- 1 | name: Package Upload to PyPi 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - uses: actions/checkout@v2 14 | - name: Set up Python 15 | uses: actions/setup-python@v2 16 | with: 17 | python-version: '3.x' 18 | - name: Install dependencies 19 | run: | 20 | python -m pip install --upgrade pip 21 | pip install setuptools wheel cython build twine 22 | - name: Build and publish 23 | env: 24 | TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }} 25 | TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }} 26 | run: | 27 | python -m build -s 28 | twine upload dist/* 29 | -------------------------------------------------------------------------------- /.github/workflows/package-test.yml: -------------------------------------------------------------------------------- 1 | name: Package Test 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v5 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install . 21 | pip install .[test] 22 | - name: Testing package with pytest 23 | run: | 24 | cythonize -i src/bitermplus/*.pyx 25 | pytest -s 26 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | *.c 9 | 10 | # Distribution / packaging 11 | .Python 12 | build/ 13 | docs/build/ 14 | docs/source/_build/ 15 | develop-eggs/ 16 | dist/ 17 | downloads/ 18 | eggs/ 19 | .eggs/ 20 | lib/ 21 | lib64/ 22 | parts/ 23 | sdist/ 24 | var/ 25 | wheels/ 26 | pip-wheel-metadata/ 27 | share/python-wheels/ 28 | *.egg-info/ 29 | .installed.cfg 30 | *.egg 31 | MANIFEST 32 | 33 | # PyInstaller 34 | # Usually these files are written by a python script from a template 35 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
36 | *.manifest 37 | *.spec 38 | 39 | # Installer logs 40 | pip-log.txt 41 | pip-delete-this-directory.txt 42 | 43 | # Unit test / coverage reports 44 | htmlcov/ 45 | .tox/ 46 | .nox/ 47 | .coverage 48 | .coverage.* 49 | .cache 50 | nosetests.xml 51 | coverage.xml 52 | *.cover 53 | *.py,cover 54 | .hypothesis/ 55 | .pytest_cache/ 56 | src/**/*.html 57 | 58 | # Translations 59 | *.mo 60 | *.pot 61 | 62 | # Django stuff: 63 | *.log 64 | local_settings.py 65 | db.sqlite3 66 | db.sqlite3-journal 67 | 68 | # Flask stuff: 69 | instance/ 70 | .webassets-cache 71 | 72 | # Scrapy stuff: 73 | .scrapy 74 | 75 | # Sphinx documentation 76 | docs/_build/ 77 | 78 | # PyBuilder 79 | target/ 80 | 81 | # Jupyter Notebook 82 | .ipynb_checkpoints 83 | 84 | # IPython 85 | profile_default/ 86 | ipython_config.py 87 | 88 | # pyenv 89 | .python-version 90 | 91 | # pipenv 92 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 93 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 94 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 95 | # install all needed dependencies. 96 | #Pipfile.lock 97 | 98 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 99 | __pypackages__/ 100 | 101 | # Celery stuff 102 | celerybeat-schedule 103 | celerybeat.pid 104 | 105 | # SageMath parsed files 106 | *.sage.py 107 | 108 | # Environments 109 | .env 110 | .venv 111 | env/ 112 | venv/ 113 | ENV/ 114 | env.bak/ 115 | venv.bak/ 116 | 117 | # Spyder project settings 118 | .spyderproject 119 | .spyproject 120 | 121 | # Rope project settings 122 | .ropeproject 123 | 124 | # mkdocs documentation 125 | /site 126 | 127 | # mypy 128 | .mypy_cache/ 129 | .dmypy.json 130 | dmypy.json 131 | 132 | # Pyre type checker 133 | .pyre/ 134 | 135 | # vscode 136 | .vscode 137 | 138 | # pickles 139 | *.pickle -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | build: 4 | os: ubuntu-22.04 5 | tools: 6 | python: "3.11" 7 | 8 | sphinx: 9 | configuration: docs/source/conf.py 10 | 11 | formats: 12 | - pdf 13 | 14 | python: 15 | install: 16 | - requirements: docs/requirements.txt 17 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Maksim Terpilowski 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-------------------------------------------------------------------------------- /MANIFEST.in: --------------------------------------------------------------------------------
recursive-include src/bitermplus *.pyx
include LICENSE
-------------------------------------------------------------------------------- /README.md: --------------------------------------------------------------------------------
# Biterm Topic Model

![GitHub Workflow Status](https://img.shields.io/github/actions/workflow/status/maximtrp/bitermplus/package-test.yml)
[![Documentation Status](https://readthedocs.org/projects/bitermplus/badge/?version=latest)](https://bitermplus.readthedocs.io/en/latest/?badge=latest)
[![Codacy Badge](https://app.codacy.com/project/badge/Grade/192b6a75449040ff868932a15ca28ce9)](https://www.codacy.com/gh/maximtrp/bitermplus/dashboard?utm_source=github.com&utm_medium=referral&utm_content=maximtrp/bitermplus&utm_campaign=Badge_Grade)
[![Issues](https://img.shields.io/github/issues/maximtrp/bitermplus.svg)](https://github.com/maximtrp/bitermplus/issues)
[![Downloads](https://static.pepy.tech/badge/bitermplus)](https://pepy.tech/project/bitermplus)
![PyPI](https://img.shields.io/pypi/v/bitermplus)

*Bitermplus* implements the [Biterm topic model](https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.402.4032&rep=rep1&type=pdf) for short texts introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi Cheng. It is essentially a cythonized version of [BTM](https://github.com/xiaohuiyan/BTM). This package is also capable of computing *perplexity*, *semantic coherence*, and *entropy* metrics.

## Donate

If you find this package useful, please consider donating any amount of money. It will help me spend more time supporting open-source software.

Buy Me A Coffee

## Requirements

* cython
* numpy
* pandas
* scipy
* scikit-learn
* tqdm

## Setup

### Linux and Windows

Be sure to install Python headers if they are not included in your Python installation. For example, on Ubuntu it can be done using the following command (where `x` is the Python minor version number):

```bash
sudo apt-get install python3.x-dev
```

Apart from that, there should be no issues with installing *bitermplus* on these operating systems. You can install the package directly from PyPI:

```bash
pip install bitermplus
```

Or from this repo:

```bash
pip install git+https://github.com/maximtrp/bitermplus.git
```

### Mac OS

First, you need to install XCode CLT and [Homebrew](https://brew.sh).
Then, install `libomp` using `brew`:

```bash
xcode-select --install
brew install libomp
pip3 install bitermplus
```

If you run into a libomp issue (`fatal error: 'omp.h' file not found`), check the package info:

```bash
brew info libomp
```

You should see output similar to this:

```
libomp: stable 15.0.5 (bottled) [keg-only]
LLVM's OpenMP runtime library
https://openmp.llvm.org/
/opt/homebrew/Cellar/libomp/15.0.5 (7 files, 1.6MB)
  Poured from bottle on 2022-11-19 at 12:16:49
From: https://github.com/Homebrew/homebrew-core/blob/HEAD/Formula/libomp.rb
License: MIT
==> Dependencies
Build: cmake ✘, lit ✘
==> Caveats
libomp is keg-only, which means it was not symlinked into /opt/homebrew,
because it can override GCC headers and result in broken builds.

For compilers to find libomp you may need to set:
  export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
  export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"

==> Analytics
install: 192,197 (30 days), 373,389 (90 days), 1,285,192 (365 days)
install-on-request: 24,388 (30 days), 48,013 (90 days), 164,666 (365 days)
build-error: 0 (30 days)
```

Export `LDFLAGS` and `CPPFLAGS` as suggested in the brew output:

```bash
export LDFLAGS="-L/opt/homebrew/opt/libomp/lib"
export CPPFLAGS="-I/opt/homebrew/opt/libomp/include"
```

## Example

### Model fitting

```python
import bitermplus as btm
import numpy as np
import pandas as pd

# IMPORTING DATA
df = pd.read_csv(
    'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
texts = df['texts'].str.strip().tolist()

# PREPROCESSING
# Obtaining terms frequency in a sparse matrix and corpus vocabulary
X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
tf = np.array(X.sum(axis=0)).ravel()
# Vectorizing documents
docs_vec = btm.get_vectorized_docs(texts, vocabulary)
docs_lens = list(map(len, docs_vec))
# Generating biterms
biterms = btm.get_biterms(docs_vec)

# INITIALIZING AND RUNNING MODEL
model = btm.BTM(
    X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
model.fit(biterms, iterations=20)
p_zd = model.transform(docs_vec)

# METRICS
perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
# or
perplexity = model.perplexity_
coherence = model.coherence_

# LABELS
model.labels_
# or
btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
```

### Results visualization

You need to install [tmplot](https://github.com/maximtrp/tmplot) first.

```python
import tmplot as tmp
tmp.report(model=model, docs=texts)
```

![Report interface](images/topics_terms_plots.png)

## Tutorial

There is a [tutorial](https://bitermplus.readthedocs.io/en/latest/tutorial.html)
in the documentation that covers the important steps of topic modeling (including
stability measures and results visualization).
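
As a quick supplement, you can also inspect the fitted topics with the package's own helper functions. A minimal sketch reusing the `model`, `texts`, and `p_zd` objects from the example above:

```python
import bitermplus as btm

# Top 10 most probable words per topic (returned as a pandas DataFrame)
top_words = btm.get_top_topic_words(model, words_num=10)

# Top 5 documents per topic
top_docs = btm.get_top_topic_docs(texts, p_zd, docs_num=5)

# Renyi entropy of the topics vs words matrix; lower values are preferable
# when comparing models fitted with different numbers of topics
renyi_entropy = btm.entropy(model.matrix_topics_words_)
```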
158 | -------------------------------------------------------------------------------- /dataset/SearchSnippets.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/bitermplus/20fd0d1601e007aa1567e6ed97a9c906fd869a7f/dataset/SearchSnippets.txt.gz -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | # sphinx-autogen -o source/generated source/*.rst 21 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 22 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
-------------------------------------------------------------------------------- /docs/requirements.txt: --------------------------------------------------------------------------------
sphinx_rtd_theme
git+https://github.com/maximtrp/bitermplus
-------------------------------------------------------------------------------- /docs/source/_static/coherence.svg: --------------------------------------------------------------------------------
[SVG line chart: semantic coherence (y axis, −800 to −600) vs. iterations (x axis, 0 to 2,000)]
-------------------------------------------------------------------------------- /docs/source/_static/perplexity.svg: --------------------------------------------------------------------------------
[SVG line chart: perplexity (y axis, 700 to 1,000) vs. iterations (x axis, 0 to 2,000)]
-------------------------------------------------------------------------------- /docs/source/benchmarks.rst: --------------------------------------------------------------------------------
Benchmarks
----------

This section presents the results of a series of benchmarks run on the
*SearchSnippets* dataset. Sixteen models were trained with different numbers
of iterations (from 10 to 2000) and default model parameters. The number of
topics was set to 8. Semantic topic coherence (``u_mass``) and perplexity
were calculated for each model.

.. image:: _static/perplexity.svg
   :alt: Perplexity

.. image:: _static/coherence.svg
   :alt: Semantic topic coherence

-------------------------------------------------------------------------------- /docs/source/bitermplus.metrics.rst: --------------------------------------------------------------------------------
Metrics
=======

.. currentmodule:: bitermplus

.. autofunction:: coherence
.. autofunction:: perplexity
.. autofunction:: entropy
-------------------------------------------------------------------------------- /docs/source/bitermplus.rst: --------------------------------------------------------------------------------
Model
=====

.. currentmodule:: bitermplus

.. autoclass:: BTM
   :members:
-------------------------------------------------------------------------------- /docs/source/bitermplus.util.rst: --------------------------------------------------------------------------------
Utility functions
=================

.. currentmodule:: bitermplus

.. autofunction:: get_words_freqs
.. autofunction:: get_vectorized_docs
.. autofunction:: get_biterms
.. autofunction:: get_top_topic_words
.. autofunction:: get_top_topic_docs
.. autofunction:: get_docs_top_topic
-------------------------------------------------------------------------------- /docs/source/conf.py: --------------------------------------------------------------------------------
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options.
# For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = 'bitermplus'
copyright = '2021, Maksim Terpilowski'
author = 'Maksim Terpilowski'


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autosummary',
    'sphinx.ext.napoleon',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'sphinx_rtd_theme'

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
-------------------------------------------------------------------------------- /docs/source/index.rst: --------------------------------------------------------------------------------
bitermplus
==========

*Bitermplus* implements the `Biterm topic model
<https://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.402.4032&rep=rep1&type=pdf>`_
for short texts introduced by Xiaohui Yan, Jiafeng Guo, Yanyan Lan, and Xueqi
Cheng. It is essentially a cythonized version of `BTM
<https://github.com/xiaohuiyan/BTM>`_. This package is also capable of computing
*perplexity*, *semantic coherence*, and *entropy* metrics.

.. toctree::
   :maxdepth: 2
   :caption: Usage
   :hidden:

   Installation <install>
   Tutorial <tutorial>
   Benchmarks <benchmarks>

.. toctree::
   :maxdepth: 2
   :caption: API
   :hidden:

   Model <bitermplus>
   Metrics <bitermplus.metrics>
   Utility functions <bitermplus.util>
-------------------------------------------------------------------------------- /docs/source/install.rst: --------------------------------------------------------------------------------
Setup
-----

Linux and Windows
~~~~~~~~~~~~~~~~~

There should be no issues with installing *bitermplus* on these operating
systems. You can install the package directly from PyPI:

.. code-block:: bash

    pip install bitermplus

Or from this repo:

.. code-block:: bash

    pip install git+https://github.com/maximtrp/bitermplus.git

Mac OS
~~~~~~

First, you need to install XCode CLT and `Homebrew <https://brew.sh>`_.
Then, install ``libomp`` using ``brew``:

.. code-block:: bash

    xcode-select --install
    brew install libomp
    pip3 install bitermplus

Requirements
~~~~~~~~~~~~

* cython
* numpy
* pandas
* scipy
* scikit-learn
* tqdm
-------------------------------------------------------------------------------- /docs/source/tutorial.rst: --------------------------------------------------------------------------------
Tutorial
========

Model fitting
-------------

Here is a simple example of model fitting.
It is assumed that you have already gone through the preprocessing
stage: cleaned, lemmatized or stemmed your documents, and removed stop words.

.. code-block:: python

    import bitermplus as btm
    import numpy as np
    import pandas as pd

    # Importing data
    df = pd.read_csv(
        'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
    texts = df['texts'].str.strip().tolist()

    # Vectorizing documents, obtaining full vocabulary and biterms
    # Internally, btm.get_words_freqs uses CountVectorizer from sklearn
    # You can pass any of its arguments to btm.get_words_freqs
    # For example, you can remove stop words:
    stop_words = ["word1", "word2", "word3"]
    X, vocabulary, vocab_dict = btm.get_words_freqs(texts, stop_words=stop_words)
    docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    biterms = btm.get_biterms(docs_vec)

    # Initializing and running model
    model = btm.BTM(
        X, vocabulary, seed=12321, T=8, M=20, alpha=50/8, beta=0.01)
    model.fit(biterms, iterations=20)


Inference
---------

Now, we will calculate the documents vs topics probability matrix (i.e. make an inference).

.. code-block:: python

    p_zd = model.transform(docs_vec)

If you need to make an inference on a new dataset, you should
vectorize it using the vocabulary from the training set:

.. code-block:: python

    new_docs_vec = btm.get_vectorized_docs(new_texts, vocabulary)
    p_zd = model.transform(new_docs_vec)


Calculating metrics
-------------------

To calculate perplexity, we must provide the documents vs topics probability
matrix (``p_zd``) that we calculated in the previous step.

.. code-block:: python

    perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8)
    coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
    # or
    perplexity = model.perplexity_
    coherence = model.coherence_
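
In addition to perplexity and coherence, the package implements Renyi entropy
(``btm.entropy``). It can help to choose the optimal number of topics: fit
several models with different values of ``T`` and prefer the one with the
lowest entropy. A minimal sketch of this procedure (the ``models_by_T``
mapping below is hypothetical):

.. code-block:: python

    # Hypothetical mapping {number of topics: fitted BTM model},
    # e.g. models_by_T = {4: model4, 6: model6, 8: model8}
    entropies = {
        T: btm.entropy(model.matrix_topics_words_)
        for T, model in models_by_T.items()}
    best_T = min(entropies, key=entropies.get)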

Visualizing results
-------------------

For results visualization, we will use the `tmplot
<https://github.com/maximtrp/tmplot>`_ package.

.. code-block:: python

    import tmplot as tmp

    # Run the interactive report interface
    tmp.report(model=model, docs=texts)

Filtering stable topics
-----------------------

Unsupervised topic models (such as LDA) are subject to topic instability [1]_
[2]_ [3]_. There is a special method in the ``tmplot`` package for selecting
stable topics. It uses various distance metrics, such as Kullback-Leibler
divergence (symmetric and non-symmetric), Hellinger distance, Jeffreys
divergence, Jensen-Shannon divergence, Jaccard index, Bhattacharyya distance,
and total variation distance.

.. code-block:: python

    import pickle as pkl
    import numpy as np
    import tmplot as tmp
    import glob

    # Loading saved models
    models_files = sorted(glob.glob(r'results/model[0-9].pkl'))
    models = []
    for fn in models_files:
        with open(fn, 'rb') as file:
            models.append(pkl.load(file))

    # Choosing a reference model
    np.random.seed(122334)
    reference_model = np.random.randint(1, 6)

    # Getting close topics
    close_topics, close_kl = tmp.get_closest_topics(
        models, method="sklb", ref=reference_model)

    # Getting stable topics
    stable_topics, stable_kl = tmp.get_stable_topics(
        close_topics, close_kl, ref=reference_model, thres=0.7)

    # Stable topics indices list
    print(stable_topics[:, reference_model])


Model loading and saving
------------------------

Support for model serialization with `pickle
<https://docs.python.org/3/library/pickle.html>`_ was implemented in v0.5.3.
Here is how you can save and load a model:

.. code-block:: python

    import pickle as pkl
    # Saving
    with open("model.pkl", "wb") as file:
        pkl.dump(model, file)

    # Loading
    with open("model.pkl", "rb") as file:
        model = pkl.load(file)


References
----------

.. [1] Koltcov, S., Koltsova, O., & Nikolenko, S. (2014, June).
   Latent dirichlet allocation: stability and applications to studies of
   user-generated content. In Proceedings of the 2014 ACM conference on Web
   science (pp. 161-165).

.. [2] Mantyla, M. V., Claes, M., & Farooq, U. (2018, October).
   Measuring LDA topic stability from clusters of replicated runs. In
   Proceedings of the 12th ACM/IEEE international symposium on empirical
   software engineering and measurement (pp. 1-4).

.. [3] Greene, D., O’Callaghan, D., & Cunningham, P. (2014, September). How many
   topics? stability analysis for topic models. In Joint European conference on
   machine learning and knowledge discovery in databases (pp. 498-513). Springer,
   Berlin, Heidelberg.
159 | -------------------------------------------------------------------------------- /images/topics_terms_plots.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/maximtrp/bitermplus/20fd0d1601e007aa1567e6ed97a9c906fd869a7f/images/topics_terms_plots.png -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0", "wheel", "cython"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "bitermplus" 7 | dynamic = ["version"] 8 | description = "Biterm Topic Model" 9 | readme = "README.md" 10 | requires-python = ">=3.7" 11 | license.file = "LICENSE" 12 | authors = [ 13 | { name = "Maksim Terpilovskii", email = "maximtrp@gmail.com" }, 14 | ] 15 | keywords = [ 16 | "topic model", 17 | "machine learning", 18 | "nlp" 19 | ] 20 | classifiers = [ 21 | "License :: OSI Approved :: MIT License", 22 | "Operating System :: OS Independent", 23 | "Programming Language :: Python :: 3.8", 24 | "Programming Language :: Python :: 3.9", 25 | "Programming Language :: Python :: 3.10", 26 | "Programming Language :: Python :: 3.11", 27 | "Topic :: Scientific/Engineering :: Information Analysis", 28 | "Topic :: Text Processing :: General", 29 | ] 30 | urls.homepage = "https://github.com/maximtrp/bitermplus" 31 | urls.documentation = "https://bitermplus.readthedocs.io/" 32 | 33 | dependencies = [ 34 | "numpy", 35 | "cython", 36 | "pandas", 37 | "scipy", 38 | "scikit-learn>=1.0.0", 39 | "tqdm", 40 | ] 41 | 42 | [tool.setuptools] 43 | include-package-data = false 44 | 45 | [tool.setuptools.dynamic] 46 | version = {attr = "bitermplus.__version__"} 47 | 48 | [tool.setuptools.packages.find] 49 | where = ["src"] 50 | include = ["bitermplus"] 51 | exclude = ["tests"] 52 | 53 | [project.optional-dependencies] 54 | test = ["pytest"] 55 | 56 | [tool.pytest.ini_options] 57 | pythonpath = [ 58 | ".", "src", 59 | ] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from platform import system 2 | from setuptools import setup, Extension 3 | from Cython.Build import cythonize 4 | # from numpy import get_include 5 | 6 | extra_link_args = ['-lomp'] if system() == 'Darwin' else ['-fopenmp'] 7 | extra_compile_args = ['-Xpreprocessor', '-fopenmp']\ 8 | if system() == 'Darwin'\ 9 | else ['-fopenmp'] 10 | 11 | ext_modules = [ 12 | Extension( 13 | "bitermplus._btm", 14 | sources=["src/bitermplus/_btm.pyx"], 15 | extra_compile_args=extra_compile_args, 16 | extra_link_args=extra_link_args), 17 | Extension( 18 | "bitermplus._metrics", 19 | # include_dirs=[get_include()], 20 | # library_dirs=[get_include()], 21 | sources=["src/bitermplus/_metrics.pyx"], 22 | extra_compile_args=extra_compile_args, 23 | extra_link_args=extra_link_args), 24 | ] 25 | 26 | setup( 27 | ext_modules=cythonize( 28 | ext_modules, 29 | compiler_directives={ 30 | 'embedsignature': True, 31 | 'language_level': 3}) 32 | ) 33 | -------------------------------------------------------------------------------- /src/bitermplus/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = '0.7.0' 2 | 3 | from ._btm import BTM 4 | from ._util import * 5 | from ._metrics import * 6 | 
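# The star imports above re-export the public API defined in each submodule's
# __all__: utility helpers (get_words_freqs, get_vectorized_docs, get_biterms,
# get_top_topic_words, get_top_topic_docs, get_docs_top_topic) and metrics
# (perplexity, coherence, entropy).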
-------------------------------------------------------------------------------- /src/bitermplus/_btm.pyx: -------------------------------------------------------------------------------- 1 | __all__ = ['BTM'] 2 | 3 | # from cython.parallel import prange 4 | from libc.time cimport time 5 | from cython.view cimport array 6 | from itertools import chain 7 | from cython import cdivision, wraparound, boundscheck, initializedcheck,\ 8 | auto_pickle, nonecheck 9 | import numpy as np 10 | import tqdm 11 | from pandas import DataFrame 12 | from ._metrics import coherence, perplexity 13 | 14 | 15 | @cdivision(True) 16 | @wraparound(False) 17 | @boundscheck(False) 18 | cdef int sample_mult(double[:] p, double random_factor): 19 | cdef int K = p.shape[0] 20 | cdef int i, k 21 | 22 | for i in range(1, K): 23 | p[i] += p[i - 1] 24 | 25 | for k in range(0, K): 26 | if p[k] >= random_factor * p[K - 1]: 27 | break 28 | 29 | return k 30 | 31 | 32 | @auto_pickle(False) 33 | cdef class BTM: 34 | """Biterm Topic Model. 35 | 36 | Parameters 37 | ---------- 38 | n_dw : csr.csr_matrix 39 | Documents vs words frequency matrix. Typically, it should be the output 40 | of `CountVectorizer` from sklearn package. 41 | vocabulary : list 42 | Vocabulary (a list of words). 43 | T : int 44 | Number of topics. 45 | M : int = 20 46 | Number of top words for coherence calculation. 47 | alpha : float = 1 48 | Model parameter. 49 | beta : float = 0.01 50 | Model parameter. 51 | seed : int = 0 52 | Random state seed. If seed is equal to 0 (default), 53 | use ``time(NULL)``. 54 | win : int = 15 55 | Biterms generation window. 56 | has_background : bool = False 57 | Use a background topic to accumulate highly frequent words. 58 | """ 59 | cdef: 60 | n_dw 61 | vocabulary 62 | int T 63 | int W 64 | int M 65 | double alpha 66 | double beta 67 | int win 68 | bint has_background 69 | double[:] n_bz # T x 1 70 | double[:] p_z # T x 1 71 | double[:, :] p_wz # T x W 72 | double[:, :] n_wz # T x W 73 | double[:, :] p_zd # D x T 74 | double[:] p_wb 75 | int[:, :] B 76 | int iters 77 | unsigned int seed 78 | 79 | # cdef dict __dict__ 80 | 81 | def __init__( 82 | self, n_dw, vocabulary, int T, int M=20, 83 | double alpha=1., double beta=0.01, unsigned int seed=0, 84 | int win=15, bint has_background=False): 85 | self.n_dw = n_dw 86 | self.vocabulary = vocabulary 87 | self.T = T 88 | self.W = len(vocabulary) 89 | self.M = M 90 | self.alpha = alpha 91 | self.beta = beta 92 | self.win = win 93 | self.seed = seed 94 | self.p_wb = np.asarray(n_dw.sum(axis=0) / n_dw.sum())[0] 95 | self.p_z = array( 96 | shape=(self.T, ), itemsize=sizeof(double), format="d", 97 | allocate_buffer=True) 98 | self.n_bz = array( 99 | shape=(self.T, ), itemsize=sizeof(double), format="d", 100 | allocate_buffer=True) 101 | self.n_wz = array( 102 | shape=(self.T, self.W), itemsize=sizeof(double), format="d", 103 | allocate_buffer=True) 104 | self.p_wz = array( 105 | shape=(self.T, self.W), itemsize=sizeof(double), format="d", 106 | allocate_buffer=True) 107 | self.p_zd = array( 108 | shape=(self.n_dw.shape[0], self.T), itemsize=sizeof(double), 109 | format="d", allocate_buffer=True) 110 | self.p_z[...] = 0. 111 | self.p_wz[...] = 0. 112 | self.p_zd[...] = 0. 113 | self.n_wz[...] = 0. 114 | self.n_bz[...] = 0. 
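        # The count buffers (n_bz: biterms per topic, n_wz: words per topic)
        # are filled during Gibbs sampling in fit(); the probability buffers
        # (p_z, p_wz, p_zd) are computed from those counts afterwards.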
115 | self.has_background = has_background 116 | self.iters = 0 117 | 118 | def __getstate__(self): 119 | return { 120 | 'alpha': self.alpha, 121 | 'beta': self.beta, 122 | 'B': np.asarray(self.B), 123 | 'T': self.T, 124 | 'W': self.W, 125 | 'M': self.M, 126 | 'win': self.win, 127 | 'n_dw': self.n_dw, 128 | 'vocabulary': self.vocabulary, 129 | 'has_background': self.has_background, 130 | 'iters': self.iters, 131 | 'alpha': self.alpha, 132 | 'n_bz': np.asarray(self.n_bz), 133 | 'n_wz': np.asarray(self.n_wz), 134 | 'p_zd': np.asarray(self.p_zd), 135 | 'p_wz': np.asarray(self.p_wz), 136 | 'p_wb': np.asarray(self.p_wb), 137 | 'p_z': np.asarray(self.p_z) 138 | } 139 | 140 | def __setstate__(self, state): 141 | self.alpha = state.get('alpha') 142 | self.beta = state.get('beta') 143 | self.B = state.get('B', np.zeros((0, 0))).astype(np.int32) 144 | self.T = state.get('T') 145 | self.W = state.get('W') 146 | self.M = state.get('M') 147 | self.win = state.get('win') 148 | self.n_dw = state.get('n_dw') 149 | self.vocabulary = state.get('vocabulary') 150 | self.has_background = state.get('has_background') 151 | self.iters = state.get('iters', 0) 152 | self.n_bz = state.get('n_bz') 153 | self.n_wz = state.get('n_wz') 154 | self.p_zd = state.get('p_zd') 155 | self.p_wz = state.get('p_wz') 156 | self.p_wb = state.get('p_wb') 157 | self.p_z = state.get('p_z') 158 | 159 | cdef int[:, :] _biterms_to_array(self, list B): 160 | rng = np.random.default_rng(self.seed if self.seed else time(NULL)) 161 | arr = np.asarray(list(chain(*B)), dtype=np.int32) 162 | random_topics = rng.integers( 163 | low=0, high=self.T, size=(arr.shape[0], 1), dtype=np.int32) 164 | arr = np.append(arr, random_topics, axis=1) 165 | return arr 166 | 167 | @initializedcheck(False) 168 | @boundscheck(False) 169 | @wraparound(False) 170 | @cdivision(True) 171 | cdef void _compute_p_wz(self): 172 | cdef int k, w 173 | for k in range(self.T): 174 | for w in range(self.W): 175 | self.p_wz[k][w] = (self.n_wz[k][w] + self.beta) / \ 176 | (self.n_bz[k] * 2. + self.W * self.beta) 177 | 178 | @boundscheck(False) 179 | @cdivision(True) 180 | @wraparound(False) 181 | @initializedcheck(False) 182 | cdef void _compute_p_zb(self, long i, double[:] p_z): 183 | cdef double pw1k, pw2k, pk, p_z_sum 184 | cdef int w1 = self.B[i, 0] 185 | cdef int w2 = self.B[i, 1] 186 | cdef int k 187 | 188 | for k in range(self.T): 189 | if self.has_background is True and k == 0: 190 | pw1k = self.p_wb[w1] 191 | pw2k = self.p_wb[w2] 192 | else: 193 | pw1k = (self.n_wz[k][w1] + self.beta) / \ 194 | (2. * self.n_bz[k] + self.W * self.beta) 195 | pw2k = (self.n_wz[k][w2] + self.beta) / \ 196 | (2. * self.n_bz[k] + 1. + self.W * self.beta) 197 | pk = (self.n_bz[k] + self.alpha) / \ 198 | (self.B.shape[0] + self.T * self.alpha) 199 | p_z[k] = pk * pw1k * pw2k 200 | 201 | # return p_z # self._normalize(p_z) 202 | 203 | @boundscheck(False) 204 | @cdivision(True) 205 | @wraparound(False) 206 | @initializedcheck(False) 207 | cdef void _normalize(self, double[:] p, double smoother=0.0): 208 | """Normalize values in place.""" 209 | cdef: 210 | int i = 0 211 | int num = p.shape[0] 212 | 213 | cdef double p_sum = 0. 214 | for i in range(num): 215 | p_sum += p[i] 216 | 217 | for i in range(num): 218 | p[i] = (p[i] + smoother) / (p_sum + num * smoother) 219 | 220 | @initializedcheck(False) 221 | @boundscheck(False) 222 | @wraparound(False) 223 | cpdef fit(self, list Bs, int iterations=600, bint verbose=True): 224 | """Biterm topic model fitting method. 
225 | 226 | Parameters 227 | ---------- 228 | Bs : list 229 | Biterms list. 230 | iterations : int = 600 231 | Iterations number. 232 | verbose : bool = True 233 | Show progress bar. 234 | """ 235 | self.B = self._biterms_to_array(Bs) 236 | # rng = np.random.default_rng(self.seed if self.seed else time(NULL)) 237 | # random_factors = rng.random( 238 | # low=0, high=self.T, size=(arr.shape[0], 1)) 239 | 240 | cdef: 241 | long i 242 | int j, w1, w2, topic 243 | long B_len = self.B.shape[0] 244 | double[:] p_z = array( 245 | shape=(self.T, ), itemsize=sizeof(double), format="d", 246 | allocate_buffer=True) 247 | double[:] rnd_uniform = array( 248 | shape=(B_len, ), itemsize=sizeof(double), format="d", 249 | allocate_buffer=True) 250 | 251 | rng = np.random.default_rng(self.seed if self.seed else time(NULL)) 252 | trange = tqdm.trange if verbose else range 253 | 254 | for i in range(B_len): 255 | w1 = self.B[i, 0] 256 | w2 = self.B[i, 1] 257 | topic = self.B[i, 2] 258 | self.n_bz[topic] += 1 259 | self.n_wz[topic][w1] += 1 260 | self.n_wz[topic][w2] += 1 261 | 262 | for j in trange(iterations): 263 | rnd_uniform = rng.uniform(0, 1, B_len) 264 | for i in range(B_len): 265 | w1 = self.B[i, 0] 266 | w2 = self.B[i, 1] 267 | topic = self.B[i, 2] 268 | 269 | self.n_bz[topic] -= 1 270 | self.n_wz[topic][w1] -= 1 271 | self.n_wz[topic][w2] -= 1 272 | 273 | # Topic reset 274 | # self.B[i, 2] = -1 275 | 276 | # Topic sample 277 | self._compute_p_zb(i, p_z) 278 | topic = sample_mult(p_z, rnd_uniform[i]) 279 | self.B[i, 2] = topic 280 | 281 | self.n_bz[topic] += 1 282 | self.n_wz[topic][w1] += 1 283 | self.n_wz[topic][w2] += 1 284 | 285 | self.iters = iterations 286 | self.p_z[:] = self.n_bz 287 | self._normalize(self.p_z, self.alpha) 288 | self._compute_p_wz() 289 | 290 | @cdivision(True) 291 | cdef long _count_biterms(self, int n, int win=15): 292 | cdef: 293 | int i, j 294 | long btn = 0 295 | for i in range(n-1): 296 | for j in range(i+1, min(i + win, n)): # range(i+1, n): 297 | btn += 1 298 | return btn 299 | 300 | @initializedcheck(False) 301 | @boundscheck(False) 302 | @wraparound(False) 303 | cdef int[:, :] _generate_biterms( 304 | self, 305 | int[:, :] biterms, 306 | int[:] words, 307 | int win=15): 308 | cdef int i, j, words_len = words.shape[0] 309 | cdef long n = 0 310 | 311 | for i in range(words_len-1): 312 | # for j in range(i+1, words_len): # min(i + win, words_len)): 313 | for j in range(i+1, min(i + win, words_len)): 314 | biterms[n, 0] = min(words[i], words[j]) 315 | biterms[n, 1] = max(words[i], words[j]) 316 | n += 1 317 | return biterms 318 | 319 | @initializedcheck(False) 320 | @boundscheck(False) 321 | @wraparound(False) 322 | cdef double[:] _infer_doc(self, int[:] doc, str infer_type, int doc_len): 323 | cdef double[:] p_zd = array( 324 | shape=(self.T, ), itemsize=sizeof(double), format="d", 325 | allocate_buffer=True) 326 | 327 | if (infer_type == "sum_b"): 328 | p_zd = self._infer_doc_sum_b(doc, doc_len) 329 | elif (infer_type == "sum_w"): 330 | p_zd = self._infer_doc_sum_w(doc, doc_len) 331 | elif (infer_type == "mix"): 332 | p_zd = self._infer_doc_mix(doc, doc_len) 333 | else: 334 | return None 335 | 336 | return p_zd 337 | 338 | @initializedcheck(False) 339 | @boundscheck(False) 340 | @wraparound(False) 341 | cdef double[:] _infer_doc_sum_b(self, int[:] doc, int doc_len): 342 | cdef double[:] p_zd = array( 343 | shape=(self.T, ), itemsize=sizeof(double), format="d", 344 | allocate_buffer=True) 345 | 346 | cdef double[:] p_zb = array( 347 | shape=(self.T, ), 
itemsize=sizeof(double), format="d", 348 | allocate_buffer=True) 349 | 350 | p_zd[...] = 0. 351 | p_zb[...] = 0. 352 | cdef long b, combs_num 353 | cdef int w1, w2 354 | cdef int[:, :] biterms 355 | 356 | if doc_len == 1: 357 | for t in range(self.T): 358 | p_zd[t] = self.p_z[t] * self.p_wz[t][doc[0]] 359 | else: 360 | combs_num = self._count_biterms(doc_len, self.win) 361 | biterms = array( 362 | shape=(combs_num, 2), itemsize=sizeof(int), format="i", 363 | allocate_buffer=True) 364 | biterms = self._generate_biterms(biterms, doc, self.win) 365 | 366 | for b in range(combs_num): 367 | w1 = biterms[b, 0] 368 | w2 = biterms[b, 1] 369 | 370 | if w2 >= self.W: 371 | continue 372 | 373 | for t in range(self.T): 374 | p_zb[t] = self.p_z[t] * self.p_wz[t][w1] * self.p_wz[t][w2] 375 | self._normalize(p_zb) 376 | 377 | for t in range(self.T): 378 | p_zd[t] += p_zb[t] 379 | self._normalize(p_zd) 380 | return p_zd 381 | 382 | @initializedcheck(False) 383 | @boundscheck(False) 384 | @wraparound(False) 385 | cdef double[:] _infer_doc_sum_w(self, int[:] doc, int doc_len): 386 | cdef int i 387 | cdef int w 388 | cdef double[:] p_zd = array( 389 | shape=(self.T, ), itemsize=sizeof(double), format="d", 390 | allocate_buffer=True) 391 | cdef double[:] p_zw = array( 392 | shape=(self.T, ), itemsize=sizeof(double), format="d", 393 | allocate_buffer=True) 394 | p_zd[...] = 0. 395 | p_zw[...] = 0. 396 | 397 | for i in range(doc_len): 398 | w = doc[i] 399 | if (w >= self.W): 400 | continue 401 | 402 | for t in range(self.T): 403 | p_zw[t] = self.p_z[t] * self.p_wz[t][w] 404 | 405 | self._normalize(p_zw) 406 | 407 | for t in range(self.T): 408 | p_zd[t] += p_zw[t] 409 | 410 | self._normalize(p_zd) 411 | return p_zd 412 | 413 | @initializedcheck(False) 414 | @boundscheck(False) 415 | @wraparound(False) 416 | cdef double[:] _infer_doc_mix(self, int[:] doc, int doc_len): 417 | cdef double[:] p_zd = array( 418 | shape=(self.T, ), itemsize=sizeof(double), format="d") 419 | p_zd[...] = 0. 420 | cdef int i, w, t 421 | 422 | for t in range(self.T): 423 | p_zd[t] = self.p_z[t] 424 | 425 | for i in range(doc_len): 426 | w = doc[i] 427 | if (w >= self.W): 428 | continue 429 | 430 | for t in range(self.T): 431 | p_zd[t] *= (self.p_wz[t][w] * self.W) 432 | 433 | self._normalize(p_zd) 434 | return p_zd 435 | 436 | @initializedcheck(False) 437 | @boundscheck(False) 438 | @wraparound(False) 439 | @nonecheck(False) 440 | cpdef transform( 441 | self, list docs, str infer_type='sum_b', bint verbose=True): 442 | """Return documents vs topics probability matrix. 443 | 444 | Parameters 445 | ---------- 446 | docs : list 447 | Documents list. Each document must be presented as 448 | a list of words ids. Typically, it can be the output of 449 | :meth:`bitermplus.get_vectorized_docs`. 450 | infer_type : str 451 | Inference type. The following options are available: 452 | 453 | 1) ``sum_b`` (default). 454 | 2) ``sum_w``. 455 | 3) ``mix``. 456 | verbose : bool = True 457 | Be verbose (show progress bar). 458 | 459 | Returns 460 | ------- 461 | p_zd : np.ndarray 462 | Documents vs topics probability matrix (D vs T). 463 | """ 464 | cdef int d 465 | cdef int doc_len 466 | cdef int docs_len = len(docs) 467 | cdef double[:, :] p_zd = array( 468 | shape=(docs_len, self.T), itemsize=sizeof(double), format="d", 469 | allocate_buffer=True) 470 | p_zd[...] = 0. 
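        # Each document is processed independently below: empty documents get
        # a zero topic distribution, all others are passed to _infer_doc()
        # with the chosen inference method.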
471 | cdef int[:] doc 472 | 473 | trange = tqdm.trange if verbose else range 474 | 475 | for d in trange(docs_len): 476 | doc = docs[d] 477 | doc_len = doc.shape[0] 478 | if doc_len > 0: 479 | p_zd[d, :] = self._infer_doc(doc, infer_type, doc_len) 480 | else: 481 | p_zd[d, :] = 0. 482 | 483 | self.p_zd = p_zd 484 | np_p_zd = np.asarray(self.p_zd) 485 | np_p_zd[np.isnan(np_p_zd)] = 0. 486 | return np_p_zd 487 | 488 | cpdef fit_transform( 489 | self, docs, list biterms, 490 | str infer_type='sum_b', int iterations=600, bint verbose=True): 491 | """Run model fitting and return documents vs topics matrix. 492 | 493 | Parameters 494 | ---------- 495 | docs : list 496 | Documents list. Each document must be presented as 497 | a list of words ids. Typically, it can be the output of 498 | :meth:`bitermplus.get_vectorized_docs`. 499 | biterms : list 500 | List of biterms. 501 | infer_type : str 502 | Inference type. The following options are available: 503 | 504 | 1) ``sum_b`` (default). 505 | 2) ``sum_w``. 506 | 3) ``mix``. 507 | iterations : int = 600 508 | Iterations number. 509 | verbose : bool = True 510 | Be verbose (show progress bars). 511 | 512 | Returns 513 | ------- 514 | p_zd : np.ndarray 515 | Documents vs topics matrix (D x T). 516 | """ 517 | self.fit(biterms, iterations=iterations, verbose=verbose) 518 | p_zd = self.transform( 519 | docs, infer_type=infer_type, verbose=verbose) 520 | return p_zd 521 | 522 | @property 523 | def matrix_topics_words_(self) -> np.ndarray: 524 | """Topics vs words probabilities matrix.""" 525 | return np.asarray(self.p_wz) 526 | 527 | @property 528 | def matrix_words_topics_(self) -> np.ndarray: 529 | """Words vs topics probabilities matrix.""" 530 | return np.asarray(self.p_wz).T 531 | 532 | @property 533 | def df_words_topics_(self) -> DataFrame: 534 | """Words vs topics probabilities in a DataFrame.""" 535 | return DataFrame(np.asarray(self.p_wz).T, index=self.vocabulary) 536 | 537 | @property 538 | def matrix_docs_topics_(self) -> np.ndarray: 539 | """Documents vs topics probabilities matrix.""" 540 | return np.asarray(self.p_zd) 541 | 542 | @property 543 | def matrix_topics_docs_(self) -> np.ndarray: 544 | """Topics vs documents probabilities matrix.""" 545 | return np.asarray(self.p_zd).T 546 | 547 | @property 548 | def coherence_(self) -> np.ndarray: 549 | """Semantic topics coherence.""" 550 | return coherence(self.p_wz, self.n_dw, M=self.M) 551 | 552 | @property 553 | def perplexity_(self) -> float: 554 | """Perplexity. 
555 | 556 | Run `transform` method before calculating perplexity""" 557 | return perplexity(self.p_wz, self.p_zd, self.n_dw, self.T) 558 | 559 | @property 560 | def vocabulary_(self) -> np.ndarray: 561 | """Vocabulary (list of words).""" 562 | return np.asarray(self.vocabulary) 563 | 564 | @property 565 | def alpha_(self) -> float: 566 | """Model parameter.""" 567 | return self.alpha 568 | 569 | @property 570 | def beta_(self) -> float: 571 | """Model parameter.""" 572 | return self.beta 573 | 574 | @property 575 | def window_(self) -> int: 576 | """Biterms generation window size.""" 577 | return self.win 578 | 579 | @property 580 | def has_background_(self) -> bool: 581 | """Specifies whether the model has a background topic 582 | to accumulate highly frequent words.""" 583 | return self.has_background 584 | 585 | @property 586 | def topics_num_(self) -> int: 587 | """Number of topics.""" 588 | return self.T 589 | 590 | @property 591 | def vocabulary_size_(self) -> int: 592 | """Vocabulary size (number of words).""" 593 | return len(self.vocabulary) 594 | 595 | @property 596 | def coherence_window_(self) -> int: 597 | """Number of top words for coherence calculation.""" 598 | return self.M 599 | 600 | @property 601 | def iterations_(self) -> int: 602 | """Number of iterations the model fitting process has 603 | gone through.""" 604 | return self.iters 605 | 606 | @property 607 | def theta_(self) -> np.ndarray: 608 | """Topics probabilities vector.""" 609 | return np.array(self.p_z) 610 | 611 | @property 612 | def biterms_(self) -> np.ndarray: 613 | """Model biterms. Terms are coded with the corresponding ids.""" 614 | return np.asarray(self.B) 615 | 616 | @property 617 | def labels_(self) -> np.ndarray: 618 | """Model document labels (most probable topic for each document).""" 619 | return np.asarray(self.p_zd).argmax(axis=1) 620 | -------------------------------------------------------------------------------- /src/bitermplus/_metrics.pyx: -------------------------------------------------------------------------------- 1 | __all__ = ['perplexity', 'coherence', 'entropy'] 2 | 3 | from cython.view cimport array 4 | from libc.math cimport exp, log 5 | from typing import Union 6 | from pandas import DataFrame 7 | from scipy.sparse import csr 8 | from cython.parallel import prange 9 | from cython import boundscheck, wraparound, cdivision 10 | import numpy as np 11 | 12 | 13 | @boundscheck(False) 14 | # @wraparound(False) 15 | cpdef double perplexity( 16 | double[:, :] p_wz, 17 | double[:, :] p_zd, 18 | n_dw, 19 | long T): 20 | """Perplexity calculation [1]_. 21 | 22 | Parameters 23 | ---------- 24 | p_wz : np.ndarray 25 | Topics vs words probabilities matrix (T x W). 26 | 27 | p_zd : np.ndarray 28 | Documents vs topics probabilities matrix (D x T). 29 | 30 | n_dw : scipy.sparse.csr_matrix 31 | Words frequency matrix for all documents (D x W). 32 | 33 | T : int 34 | Number of topics. 35 | 36 | Returns 37 | ------- 38 | perplexity : float 39 | Perplexity estimate. 40 | 41 | References 42 | ---------- 43 | .. [1] Heinrich, G. (2005). Parameter estimation for text analysis (pp. 44 | 1-32). Technical report. 45 | 46 | Example 47 | ------- 48 | >>> import bitermplus as btm 49 | >>> # Preprocessing step 50 | >>> # ... 51 | >>> # X, vocabulary, vocab_dict = btm.get_words_freqs(texts) 52 | >>> # Model fitting step 53 | >>> # model = ... 
54 | >>> # Inference step 55 | >>> # p_zd = model.transform(docs_vec_subset) 56 | >>> # Coherence calculation 57 | >>> perplexity = btm.perplexity(model.matrix_topics_words_, p_zd, X, 8) 58 | """ 59 | cdef double pwz_pzd_sum = 0. 60 | cdef double exp_num = 0. 61 | cdef double perplexity = 0. 62 | cdef double n = 0 63 | cdef long d, w, t, w_i, w_ri, w_rj 64 | cdef long D = p_zd.shape[0] 65 | cdef long W = p_wz.shape[1] 66 | cdef long[:] n_dw_indptr = n_dw.indptr.astype(int) 67 | cdef long[:] n_dw_indices = n_dw.indices.astype(int) 68 | cdef double n_dw_sum = n_dw.sum() 69 | cdef double[:] n_dw_data = n_dw.data.astype(float) 70 | 71 | # Iterating over all documents 72 | for d in prange(D, nogil=True): 73 | 74 | w_ri = n_dw_indptr[d] 75 | # if d + 1 == D: 76 | # w_rj = W 77 | # else: 78 | w_rj = n_dw_indptr[d+1] 79 | 80 | for w_i in range(w_ri, w_rj): 81 | w = n_dw_indices[w_i] 82 | n = n_dw_data[w_i] 83 | 84 | pwz_pzd_sum = 0. 85 | for t in range(T): 86 | pwz_pzd_sum = pwz_pzd_sum + p_zd[d, t] * p_wz[t, w] 87 | if pwz_pzd_sum > 0: 88 | exp_num += n * log(pwz_pzd_sum) 89 | 90 | perplexity = exp(-exp_num / n_dw_sum) 91 | return perplexity 92 | 93 | 94 | @boundscheck(False) 95 | @wraparound(False) 96 | @cdivision(True) 97 | cpdef coherence( 98 | double[:, :] p_wz, 99 | n_dw, 100 | double eps=1., 101 | int M=20): 102 | """Semantic topic coherence calculation [1]_. 103 | 104 | Parameters 105 | ---------- 106 | p_wz : np.ndarray 107 | Topics vs words probabilities matrix (T x W). 108 | 109 | n_dw : scipy.sparse.csr_matrix 110 | Words frequency matrix for all documents (D x W). 111 | 112 | eps : float 113 | Calculation parameter. It is summed with a word pair 114 | conditional probability. 115 | 116 | M : int 117 | Number of top words in a topic to take. 118 | 119 | Returns 120 | ------- 121 | coherence : np.ndarray 122 | Semantic coherence estimates for all topics. 123 | 124 | References 125 | ---------- 126 | .. [1] Mimno, D., Wallach, H., Talley, E., Leenders, M., & McCallum, A. 127 | (2011, July). Optimizing semantic coherence in topic models. In 128 | Proceedings of the 2011 conference on empirical methods in natural 129 | language processing (pp. 262-272). 130 | 131 | Example 132 | ------- 133 | >>> import bitermplus as btm 134 | >>> # Preprocessing step 135 | >>> # ... 136 | >>> # X, vocabulary, vocab_dict = btm.get_words_freqs(texts) 137 | >>> # Model fitting step 138 | >>> # model = ... 139 | >>> # Coherence calculation 140 | >>> coherence = btm.coherence(model.matrix_topics_words_, X, M=20) 141 | """ 142 | cdef int d, i, j, k, t, tw, w_i, w_ri, w_rj, w 143 | cdef double logSum = 0. 144 | cdef long T = p_wz.shape[0] 145 | cdef long W = p_wz.shape[1] 146 | cdef long D = n_dw.shape[0] 147 | cdef long n 148 | cdef long[:] n_dw_indices = n_dw.indices.astype(int) 149 | cdef long[:] n_dw_indptr = n_dw.indptr.astype(int) 150 | cdef long n_dw_len = n_dw_indices.shape[0] 151 | cdef long[:] n_dw_data = n_dw.data.astype(int) 152 | cdef long[:, :] top_words = np.zeros((M, T), dtype=int) 153 | cdef double[:] coherence = np.zeros(T, dtype=float) 154 | cdef int w1 = 0 155 | cdef int w2 = 0 156 | cdef double D_ij = 0. 157 | cdef double D_j = 0. 158 | 159 | for t in range(T): 160 | words_idx_sorted = np.argsort(p_wz[t, :])[:-M-1:-1] 161 | for i in range(M): 162 | top_words[i, t] = words_idx_sorted[i] 163 | 164 | for t in range(T): 165 | logSum = 0. 166 | for i in prange(1, M, nogil=True): 167 | for j in range(0, i): 168 | D_ij = 0. 169 | D_j = 0. 
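                # D_ij counts the documents that contain both top words i and
                # j of topic t; D_j counts the documents that contain top word
                # j. They form the u_mass coherence term
                # log((D_ij + eps) / D_j) accumulated into logSum below.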

                for d in range(D):
                    w1 = 0
                    w2 = 0
                    w_ri = n_dw_indptr[d]
                    # n_dw_indptr has D+1 entries, so indexing d+1 is always
                    # valid (n_dw_indptr[D] equals the number of stored
                    # elements of the sparse matrix)
                    w_rj = n_dw_indptr[d+1]

                    for w_i in range(w_ri, w_rj):
                        w = n_dw_indices[w_i]
                        n = n_dw_data[w_i]
                        for tw in range(M):
                            if (top_words[i, t] == w and n > 0):
                                w1 = 1
                            elif (top_words[j, t] == w and n > 0):
                                w2 = 1
                    D_ij += float(w1 & w2)
                    D_j += float(w2)
                logSum += log((D_ij + eps) / D_j)
        coherence[t] = logSum

    return np.array(coherence)


@boundscheck(False)
@wraparound(False)
@cdivision(True)
cpdef entropy(
        double[:, :] p_wz,
        bint max_probs=True):
    """Renyi entropy calculation routine [1]_.

    Renyi entropy can be used to estimate the optimal number of topics: just fit
    several models with a different number of topics and choose the number of
    topics for which the Renyi entropy is the least.

    Parameters
    ----------
    p_wz : np.ndarray
        Topics vs words probabilities matrix (T x W).

    max_probs : bool
        Use maximum probabilities of terms per topics instead of all
        probability values.

    Returns
    -------
    renyi : double
        Renyi entropy value.

    References
    ----------
    .. [1] Koltcov, S. (2018). Application of Rényi and Tsallis entropies to
        topic modeling optimization. Physica A: Statistical Mechanics and its
        Applications, 512, 1192-1204.

    Example
    -------
    >>> import bitermplus as btm
    >>> # Preprocessing step
    >>> # ...
    >>> # Model fitting step
    >>> # model = ...
    >>> # Entropy calculation
    >>> entropy = btm.entropy(model.matrix_topics_words_)
    """
    # Words number
    cdef int W = p_wz.shape[1]
    # Topics number
    cdef int T = p_wz.shape[0]

    # Initializing variables
    cdef double word_ratio = 0.
    cdef double sum_prob = 0.
    cdef double shannon = 0.
    cdef double energy = 0.
    cdef double int_energy = 0.
    cdef double free_energy = 0.
    cdef double renyi = 0.
    cdef int t = 0
    cdef int w = 0

    # Setting threshold
    cdef double thresh = 1.
/ W 254 | 255 | for w in range(W): 256 | for t in range(T): 257 | if not max_probs or (max_probs and p_wz[t, w] > thresh): 258 | sum_prob += p_wz[t, w] 259 | word_ratio += 1 260 | 261 | # Shannon entropy 262 | shannon = log(word_ratio / (W * T)) 263 | 264 | # Internal energy 265 | int_energy = -log(sum_prob / T) 266 | 267 | # Free energy 268 | free_energy = int_energy - shannon * T 269 | 270 | # Renyi entropy 271 | if T == 1: 272 | renyi = free_energy / T 273 | else: 274 | renyi = free_energy / (T-1) 275 | 276 | return renyi 277 | -------------------------------------------------------------------------------- /src/bitermplus/_util.py: -------------------------------------------------------------------------------- 1 | __all__ = [ 2 | 'get_words_freqs', 'get_vectorized_docs', 3 | 'get_biterms', 'get_top_topic_words', 4 | 'get_top_topic_docs', 'get_docs_top_topic'] 5 | 6 | from typing import List, Union, Tuple, Dict, Sequence, Any 7 | from scipy.sparse import csr_matrix 8 | from pandas import DataFrame, Series, concat 9 | from sklearn.feature_extraction.text import CountVectorizer 10 | import numpy as np 11 | from ._btm import BTM 12 | 13 | 14 | def get_words_freqs( 15 | docs: Union[List[str], np.ndarray, Series], 16 | **kwargs: dict) -> Tuple[csr_matrix, np.ndarray, Dict]: 17 | """Compute words vs documents frequency matrix. 18 | 19 | Parameters 20 | ---------- 21 | docs : Union[List[str], np.ndarray, Series] 22 | Documents in any format that can be passed to 23 | :meth:`sklearn.feature_extraction.text.CountVectorizer` method. 24 | kwargs : dict 25 | Keyword arguments for 26 | :meth:`sklearn.feature_extraction.text.CountVectorizer` method. 27 | 28 | Returns 29 | ------- 30 | Tuple[scipy.sparse.csr_matrix, np.ndarray, Dict] 31 | Documents vs words matrix in CSR format, 32 | vocabulary as a numpy.ndarray of terms, 33 | and vocabulary as a dictionary of {term: id} pairs. 34 | 35 | Example 36 | ------- 37 | >>> import pandas as pd 38 | >>> import bitermplus as btm 39 | 40 | >>> # Loading data 41 | >>> df = pd.read_csv( 42 | ... 'dataset/SearchSnippets.txt.gz', header=None, names=['texts']) 43 | >>> texts = df['texts'].str.strip().tolist() 44 | 45 | >>> # Vectorizing documents, obtaining full vocabulary and biterms 46 | >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts) 47 | """ 48 | vec = CountVectorizer(**kwargs) 49 | X = vec.fit_transform(docs) 50 | words = np.array(vec.get_feature_names_out()) 51 | return X, words, vec.vocabulary_ 52 | 53 | 54 | def get_vectorized_docs( 55 | docs: Union[List[str], np.ndarray], 56 | vocab: Union[List[str], np.ndarray]) -> List[np.ndarray]: 57 | """Replace words with their ids in each document. 58 | 59 | Parameters 60 | ---------- 61 | docs : Union[List[str], np.ndarray] 62 | Documents (iterable of strings). 63 | vocab: Union[List[str], np.ndarray] 64 | Vocabulary (iterable of terms). 65 | 66 | Returns 67 | ------- 68 | docs : List[np.ndarray] 69 | Vectorised documents (list of ``numpy.ndarray`` 70 | objects with terms ids). 71 | 72 | Example 73 | ------- 74 | >>> import pandas as pd 75 | >>> import bitermplus as btm 76 | 77 | >>> # Loading data 78 | >>> df = pd.read_csv( 79 | ... 


def get_vectorized_docs(
        docs: Union[List[str], np.ndarray],
        vocab: Union[List[str], np.ndarray]) -> List[np.ndarray]:
    """Replace words with their ids in each document.

    Parameters
    ----------
    docs : Union[List[str], np.ndarray]
        Documents (iterable of strings).
    vocab : Union[List[str], np.ndarray]
        Vocabulary (iterable of terms).

    Returns
    -------
    docs : List[np.ndarray]
        Vectorized documents (list of ``numpy.ndarray``
        objects with term ids).

    Example
    -------
    >>> import pandas as pd
    >>> import bitermplus as btm

    >>> # Loading data
    >>> df = pd.read_csv(
    ...     'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
    >>> texts = df['texts'].str.strip().tolist()

    >>> # Vectorizing documents, obtaining full vocabulary and biterms
    >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
    >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    """
    vocab_idx = dict(zip(vocab, range(len(vocab))))

    def _parse_words(w):
        return vocab_idx.get(w)

    # Out-of-vocabulary words map to None and are dropped; the explicit
    # `is not None` check keeps the term with id 0, which a bare
    # `filter(None, ...)` would silently discard.
    return list(
        map(
            lambda doc:
            np.array(
                list(filter(
                    lambda wid: wid is not None,
                    map(_parse_words, doc.split()))),
                dtype=np.int32),
            docs))


def get_biterms(
        docs: List[np.ndarray],
        win: int = 15) -> List[List[int]]:
    """Biterms creation routine.

    Parameters
    ----------
    docs : List[np.ndarray]
        List of numpy.ndarray objects containing word indices.
    win : int = 15
        Biterms generation window.

    Returns
    -------
    List[List[int]]
        List of biterms for each document.

    Example
    -------
    >>> import pandas as pd
    >>> import bitermplus as btm

    >>> # Loading data
    >>> df = pd.read_csv(
    ...     'dataset/SearchSnippets.txt.gz', header=None, names=['texts'])
    >>> texts = df['texts'].str.strip().tolist()

    >>> # Vectorizing documents, obtaining full vocabulary and biterms
    >>> X, vocabulary, vocab_dict = btm.get_words_freqs(texts)
    >>> docs_vec = btm.get_vectorized_docs(texts, vocabulary)
    >>> biterms = btm.get_biterms(docs_vec)
    """
    biterms = []
    for doc in docs:
        doc_biterms = []
        doc_len = len(doc)
        # Documents shorter than two words yield no biterms and are skipped
        if doc_len < 2:
            continue
        for i in range(doc_len - 1):
            for j in range(i + 1, min(i + win, doc_len)):
                # Store each pair in canonical (smaller id, larger id) order
                wi = min(doc[i], doc[j])
                wj = max(doc[i], doc[j])
                doc_biterms.append([wi, wj])
        biterms.append(doc_biterms)
    return biterms
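# A small illustration (not part of the module) of what get_biterms returns:
# within the window, every unordered word pair of a document becomes one
# biterm, stored as [smaller_id, larger_id]:
#
#     >>> import numpy as np
#     >>> get_biterms([np.array([3, 1, 2], dtype=np.int32)])
#     [[[1, 3], [2, 3], [1, 2]]]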


def get_top_topic_words(
        model: BTM,
        words_num: int = 20,
        topics_idx: Sequence[Any] = None) -> DataFrame:
    """Select top topic words from a fitted model.

    Parameters
    ----------
    model : bitermplus._btm.BTM
        Fitted BTM model.
    words_num : int = 20
        The number of words to select.
    topics_idx : Sequence[Any] = None
        Topics indices. Meant to be used to select only stable
        topics.

    Returns
    -------
    DataFrame
        Words with the highest probabilities for each selected topic.

    Example
    -------
    >>> stable_topics = [0, 3, 10, 12, 18, 21]
    >>> top_words = btm.get_top_topic_words(
    ...     model,
    ...     words_num=100,
    ...     topics_idx=stable_topics)
    """
    def _select_words(model, topic_id: int):
        # Indices of the `words_num` most probable words, descending
        probs = model.matrix_topics_words_[topic_id, :]
        idx = np.argsort(probs)[:-words_num-1:-1]
        result = Series(model.vocabulary_[idx])
        result.name = 'topic{}'.format(topic_id)
        return result

    topics_num = model.topics_num_
    topics_idx = np.arange(topics_num) if topics_idx is None else topics_idx
    return concat(
        map(lambda x: _select_words(model, x), topics_idx), axis=1)


def get_top_topic_docs(
        docs: Sequence[Any],
        p_zd: np.ndarray,
        docs_num: int = 20,
        topics_idx: Sequence[Any] = None) -> DataFrame:
    """Select top topic docs from a fitted model.

    Parameters
    ----------
    docs : Sequence[Any]
        Iterable of documents (e.g. list of strings).
    p_zd : np.ndarray
        Documents vs topics probabilities matrix.
    docs_num : int = 20
        The number of documents to select.
    topics_idx : Sequence[Any] = None
        Topics indices. Meant to be used to select only stable
        topics.

    Returns
    -------
    DataFrame
        Documents with the highest probabilities in all selected topics.

    Example
    -------
    >>> top_docs = btm.get_top_topic_docs(
    ...     texts,
    ...     p_zd,
    ...     docs_num=100,
    ...     topics_idx=[1, 2, 3, 4])
    """
    def _select_docs(docs, p_zd, topic_id: int):
        # Indices of the `docs_num` most probable documents for the topic
        probs = p_zd[:, topic_id]
        idx = np.argsort(probs)[:-docs_num-1:-1]
        result = Series(np.asarray(docs)[idx])
        result.name = 'topic{}'.format(topic_id)
        return result

    topics_num = p_zd.shape[1]
    topics_idx = np.arange(topics_num) if topics_idx is None else topics_idx
    return concat(
        map(lambda x: _select_docs(docs, p_zd, x), topics_idx), axis=1)


def get_docs_top_topic(
        docs: Sequence[Any],
        p_zd: np.ndarray) -> DataFrame:
    """Select the most probable topic for each document.

    Parameters
    ----------
    docs : Sequence[Any]
        Iterable of documents (e.g. list of strings).
    p_zd : np.ndarray
        Documents vs topics probabilities matrix.

    Returns
    -------
    DataFrame
        Documents and the most probable topic for each of them.

    Example
    -------
    >>> import bitermplus as btm
    >>> # Read documents from file
    >>> # texts = ...
    >>> # Build and train a model
    >>> # model = ...
    >>> # model.fit(...)
    >>> btm.get_docs_top_topic(texts, model.matrix_docs_topics_)
    """
    return DataFrame({'documents': docs, 'label': p_zd.argmax(axis=1)})
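# A short aside (not part of the module): the slice used in the helpers
# above, np.argsort(probs)[:-n-1:-1], picks the indices of the n largest
# values in descending order, e.g.:
#
#     >>> import numpy as np
#     >>> np.argsort(np.array([0.1, 0.7, 0.2]))[:-2-1:-1]
#     array([1, 2])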
--------------------------------------------------------------------------------
/tests/test_btm.py:
--------------------------------------------------------------------------------
import unittest
import pickle as pkl
import logging
import numpy as np
import pandas as pd
from src import bitermplus as btm

LOGGER = logging.getLogger(__name__)


class TestBTM(unittest.TestCase):

    # Main tests
    def test_btm_class(self):
        # Importing and vectorizing text data
        df = pd.read_csv(
            "dataset/SearchSnippets.txt.gz", header=None, names=["texts"])
        texts = df["texts"].str.strip().tolist()

        # Vectorizing documents, obtaining full vocabulary and biterms
        X, vocabulary, _ = btm.get_words_freqs(texts)
        docs_vec = btm.get_vectorized_docs(texts, vocabulary)
        biterms = btm.get_biterms(docs_vec)

        LOGGER.info("Modeling started")
        topics_num = 8
        model = btm.BTM(
            X,
            vocabulary,
            seed=52214,
            T=topics_num,
            M=20,
            alpha=50 / topics_num,
            beta=0.01,
        )
        model.fit(biterms, iterations=20)
        self.assertIsInstance(model.matrix_topics_words_, np.ndarray)
        self.assertTupleEqual(
            model.matrix_topics_words_.shape, (topics_num, vocabulary.size)
        )
        LOGGER.info("Modeling finished")

        LOGGER.info('Inference "sum_b" started')
        # An empty document is injected to check that inference handles it
        docs_vec_subset = docs_vec[:1000]
        docs_vec_subset[100] = np.array([], dtype=np.int32)
        p_zd = model.transform(docs_vec_subset)
        self.assertTupleEqual(p_zd.shape, (1000, topics_num))
        LOGGER.info('Inference "sum_b" finished')

        LOGGER.info("Model saving started")
        with open("model.pickle", "wb") as file:
            pkl.dump(model, file)
        LOGGER.info("Model saving finished")

        LOGGER.info('Inference "sum_w" started')
        p_zd = model.transform(docs_vec_subset, infer_type="sum_w")
        LOGGER.info('Inference "sum_w" finished')

        LOGGER.info('Inference "mix" started')
        p_zd = model.transform(docs_vec_subset, infer_type="mix")
        LOGGER.info('Inference "mix" finished')

        LOGGER.info("Perplexity testing started")
        perplexity = btm.perplexity(
            model.matrix_topics_words_, p_zd, X, topics_num)
        # The standalone routine should agree with the value cached on
        # the model
        self.assertAlmostEqual(perplexity, model.perplexity_)
        self.assertIsInstance(perplexity, float)
        self.assertNotEqual(perplexity, 0.0)
        LOGGER.info(f"Perplexity value: {perplexity}")
        LOGGER.info("Perplexity testing finished")

        LOGGER.info("Coherence testing started")
        coherence = btm.coherence(model.matrix_topics_words_, X, M=20)
        self.assertTrue(np.allclose(coherence, model.coherence_))
        self.assertIsInstance(coherence, np.ndarray)
        self.assertGreater(coherence.shape[0], 0)
        LOGGER.info(f"Coherence value: {coherence}")
        LOGGER.info("Coherence testing finished")

        LOGGER.info("Entropy testing started")
        entropy = btm.entropy(model.matrix_topics_words_, max_probs=True)
        self.assertNotEqual(entropy, 0)
        LOGGER.info(f"Entropy value: {entropy}")
        LOGGER.info("Entropy testing finished")

        LOGGER.info("Model loading started")
        with open("model.pickle", "rb") as file:
            self.assertIsInstance(pkl.load(file), btm._btm.BTM)
        LOGGER.info("Model loading finished")


if __name__ == "__main__":
    unittest.main()
--------------------------------------------------------------------------------