├── .devcontainer └── devcontainer.json ├── .editorconfig ├── .gitattributes ├── .gitignore ├── .idea ├── .gitignore ├── inspectionProfiles │ └── profiles_settings.xml ├── misc.xml ├── modules.xml ├── other.xml ├── streamlit_topic_modeling.iml └── vcs.xml ├── Dockerfile ├── MANIFEST.in ├── Makefile ├── README.rst ├── data ├── Inkfree.ttf ├── Tweets.csv.zip ├── elonmusk.csv.zip ├── favicon.png ├── is-this-a-topic-modeling.jpg └── mf.png ├── docs ├── Makefile ├── _static │ └── .gitignore ├── _templates │ └── .gitignore ├── conf.py ├── index.rst ├── make.bat └── readme.rst ├── newsfragments └── .gitignore ├── pyproject.toml ├── pytest.ini ├── requirements.txt ├── setup.cfg ├── setup.py ├── streamlit_topic_modeling ├── __init__.py ├── app.py └── tests │ ├── __init__.py │ └── test_app.py ├── towncrier.toml └── tox.ini /.devcontainer/devcontainer.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "Python 3", 3 | // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile 4 | "image": "mcr.microsoft.com/devcontainers/python:1-3.11-bullseye", 5 | "customizations": { 6 | "codespaces": { 7 | "openFiles": [ 8 | "README.rst", 9 | "streamlit_topic_modeling/app.py" 10 | ] 11 | }, 12 | "vscode": { 13 | "settings": {}, 14 | "extensions": [ 15 | "ms-python.python", 16 | "ms-python.vscode-pylance" 17 | ] 18 | } 19 | }, 20 | "updateContentCommand": "[ -f packages.txt ] && sudo apt update && sudo apt upgrade -y && sudo xargs apt install -y -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.8 2 | LABEL maintainer="Bryan Patrick Wood <bpw1621@gmail.com>" 3 | 4 | WORKDIR /usr/src/app 5 | COPY . . 6 | RUN pip install -U pip && pip install --no-cache-dir -e . 7 | EXPOSE 8501 8 | ENTRYPOINT streamlit run ./streamlit_topic_modeling/app.py -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | include README.rst 2 | include pyproject.toml 3 | recursive-include data *.png *.zip *.ttf -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | sphinx-apidoc: 2 | sphinx-apidoc -f -o docs . 
setup.py -------------------------------------------------------------------------------- /README.rst: -------------------------------------------------------------------------------- 1 | README 2 | ****** 3 | 4 | A topic modeling GUI application using Streamlit deployed on Streamlit Sharing `here `_. 5 | 6 | .. image:: ./data/is-this-a-topic-modeling.jpg 7 | -------------------------------------------------------------------------------- /data/Inkfree.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/Inkfree.ttf -------------------------------------------------------------------------------- /data/Tweets.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/Tweets.csv.zip -------------------------------------------------------------------------------- /data/elonmusk.csv.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/elonmusk.csv.zip -------------------------------------------------------------------------------- /data/favicon.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/favicon.png -------------------------------------------------------------------------------- /data/is-this-a-topic-modeling.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/is-this-a-topic-modeling.jpg -------------------------------------------------------------------------------- /data/mf.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/data/mf.png -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /docs/_templates/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /docs/conf.py: -------------------------------------------------------------------------------- 1 | # Configuration file for the Sphinx documentation builder. 2 | # 3 | # This file only contains a selection of the most common options. For a full 4 | # list see the documentation: 5 | # https://www.sphinx-doc.org/en/master/usage/configuration.html 6 | 7 | # -- Path setup -------------------------------------------------------------- 8 | 9 | # If extensions (or modules to document with autodoc) are in another directory, 10 | # add these directories to sys.path here. If the directory is relative to the 11 | # documentation root, use os.path.abspath to make it absolute, like shown here. 12 | # 13 | import os 14 | import sys 15 | sys.path.insert(0, os.path.abspath('../../streamlit_topic_modeling')) 16 | 17 | 18 | # -- Project information ----------------------------------------------------- 19 | 20 | project = 'Streamlit Topic Modeling' 21 | copyright = '2021, Bryan Patrick Wood' 22 | author = 'Bryan Patrick Wood' 23 | 24 | # The full version, including alpha/beta/rc tags 25 | release = '0.0a0' 26 | 27 | 28 | # -- General configuration --------------------------------------------------- 29 | 30 | # Add any Sphinx extension module names here, as strings. They can be 31 | # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom 32 | # ones. 33 | extensions = [ 34 | 'sphinx.ext.autodoc', 35 | 'sphinx.ext.autosummary', 36 | 'sphinx.ext.coverage', 37 | 'sphinx.ext.doctest', 38 | 'sphinx.ext.todo', 39 | 'sphinx.ext.viewcode' 40 | ] 41 | 42 | # Add any paths that contain templates here, relative to this directory. 43 | templates_path = ['_templates'] 44 | 45 | # List of patterns, relative to source directory, that match files and 46 | # directories to ignore when looking for source files. 47 | # This pattern also affects html_static_path and html_extra_path. 48 | exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] 49 | 50 | 51 | # -- Options for HTML output ------------------------------------------------- 52 | 53 | # The theme to use for HTML and HTML Help pages. See the documentation for 54 | # a list of builtin themes. 55 | # 56 | html_theme = 'sphinx_rtd_theme' 57 | 58 | # Add any paths that contain custom static files (such as style sheets) here, 59 | # relative to this directory. They are copied after the builtin static files, 60 | # so a file named "default.css" will overwrite the builtin "default.css". 61 | html_static_path = ['_static'] -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. Streamlit Topic Modeling documentation master file, created by 2 | sphinx-quickstart on Sat Jan 9 11:24:07 2021. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 
5 | 6 | Welcome to Streamlit Topic Modeling's documentation! 7 | ========================================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 2 11 | :caption: Contents: 12 | 13 | readme 14 | 15 | 16 | 17 | Indices and tables 18 | ================== 19 | 20 | * :ref:`genindex` 21 | * :ref:`modindex` 22 | * :ref:`search` 23 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 11 | set BUILDDIR=_build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/readme.rst: -------------------------------------------------------------------------------- 1 | .. include:: ../README.rst -------------------------------------------------------------------------------- /newsfragments/.gitignore: -------------------------------------------------------------------------------- 1 | !.gitignore -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 51.1.1", "wheel"] 3 | build-backend = "setuptools.build_meta" -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | minversion = 6.0 3 | addopts = -ra -q 4 | testpaths = 5 | streamlit_topic_modeling/tests -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | altair==5.4.1 2 | attrs==24.2.0 3 | blinker==1.8.2 4 | cachetools==5.5.0 5 | certifi==2024.8.30 6 | charset-normalizer==3.3.2 7 | click==8.1.7 8 | colorama==0.4.6 9 | contourpy==1.3.0 10 | cycler==0.12.1 11 | fonttools==4.53.1 12 | funcy==2.0 13 | gensim==4.3.3 14 | gitdb==4.0.11 15 | GitPython==3.1.43 16 | idna==3.8 17 | Jinja2==3.1.4 18 | joblib==1.4.2 19 | jsonschema==4.23.0 20 | jsonschema-specifications==2023.12.1 21 | kiwisolver==1.4.7 22 | llvmlite==0.43.0 23 | markdown-it-py==3.0.0 24 | MarkupSafe==2.1.5 25 | matplotlib==3.9.2 26 | mdurl==0.1.2 27 | narwhals==1.6.2 28 | nltk==3.9.1 29 | numba==0.60.0 30 | numexpr==2.10.1 31 | numpy==1.26.4 32 | packaging==24.1 33 | pandas==2.2.2 34 | patsy==0.5.6 35 | pillow==10.4.0 36 | plotly==5.24.0 37 | plotly-express==0.4.1 38 | protobuf==5.28.0 39 | pyarrow==17.0.0 40 | pydeck==0.9.1 41 | Pygments==2.18.0 42 | pyLDAvis==3.4.1 43 | pynndescent==0.5.13 44 | pyparsing==3.1.4 45 | 
python-dateutil==2.9.0.post0 46 | pytz==2024.1 47 | referencing==0.35.1 48 | regex==2024.7.24 49 | requests==2.32.3 50 | rich==13.8.0 51 | rpds-py==0.20.0 52 | scikit-learn==1.5.1 53 | scipy==1.13.1 54 | seaborn==0.13.2 55 | six==1.16.0 56 | smart-open==7.0.4 57 | smmap==5.0.1 58 | statsmodels==0.14.2 59 | streamlit==1.38.0 60 | tenacity==8.5.0 61 | threadpoolctl==3.5.0 62 | toml==0.10.2 63 | tornado==6.4.1 64 | tqdm==4.66.5 65 | typing_extensions==4.12.2 66 | tzdata==2024.1 67 | umap-learn==0.5.6 68 | urllib3==2.2.2 69 | watchdog==4.0.2 70 | wordcloud==1.9.3 71 | wrapt==1.16.0 72 | -------------------------------------------------------------------------------- /setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = streamlit_topic_modeling 3 | version = attr:streamlit_topic_modeling.__version__ 4 | description = A topic modeling GUI application using Streamlit. 5 | description-file = README.rst 6 | long_description = file:README.rst 7 | long_description_content_type = text/x-rst 8 | author = Bryan Patrick Wood 9 | author_email = bpw1621@gmail.com 10 | url = https://github.com/bpw1621/streamlit_topic_modeling 11 | download_url = https://github.com/bpw1621/streamlit_topic_modeling/archive/master.zip 12 | project_urls = 13 | Homepage = https://bpw1621.github.io/streamlit_topic_modeling 14 | Source Code = https://github.com/bpw1621/streamlit_topic_modeling 15 | Documentation = https://streamlit_topic_modeling.readthedocs.io/en/latest/ 16 | Bug Tracker = https://github.com/bpw1621/streamlit_topic_modeling/issues 17 | classifiers = 18 | Development Status :: 3 - Alpha 19 | Programming Language :: Python 20 | Programming Language :: Python :: 3 21 | Programming Language :: Python :: 3.11 22 | Programming Language :: Python :: 3 :: Only 23 | 24 | ;keywords = ... 25 | ;license = ... 
26 | 27 | [options] 28 | zip_safe = False 29 | include_package_data = True 30 | packages = find: 31 | install_requires = 32 | gensim 33 | matplotlib 34 | nltk 35 | numpy 36 | pandas 37 | plotly-express 38 | plotly 39 | pyldavis 40 | regex 41 | scikit-learn 42 | seaborn 43 | streamlit 44 | umap-learn 45 | wordcloud 46 | tests_require = 47 | pytest 48 | pytest-mock 49 | pytest-cov 50 | setup_requires = 51 | setuptools 52 | pytest-runner 53 | 54 | [bdist_wheel] 55 | universal = true 56 | 57 | [options.extras_require] 58 | dev = 59 | flake8 60 | tox 61 | pretty_errors 62 | twine 63 | doc = 64 | sphinx 65 | sphinx_rtd_theme 66 | towncrier 67 | 68 | [aliases] 69 | test = pytest 70 | 71 | [tool:pytest] 72 | collect_ignore = ['setup.py'] -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | from setuptools import setup 3 | 4 | setup(setup_cfg=True) 5 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/__init__.py: -------------------------------------------------------------------------------- 1 | """Top-level streamlit_topic_modeling package.""" 2 | 3 | import logging 4 | from logging import NullHandler 5 | 6 | __author__ = 'Bryan Patrick Wood' 7 | __email__ = 'bpw1621@gmail.com' 8 | __version__ = '0.0a0' 9 | 10 | logging.getLogger(__name__).addHandler(NullHandler()) 11 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/app.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import gensim 4 | import matplotlib.colors as mcolors 5 | import matplotlib.pyplot as plt 6 | import nltk 7 | import numpy as np 8 | import pandas as pd 9 | import plotly.express as px 10 | import pyLDAvis.gensim_models 11 | import regex 12 | import seaborn as sns 13 | import streamlit as st 14 | import streamlit.components.v1 as components 15 | from gensim import corpora 16 | from gensim.models import CoherenceModel 17 | from gensim.utils import simple_preprocess 18 | from nltk.corpus import stopwords 19 | from sklearn.decomposition import PCA 20 | from sklearn.manifold import TSNE 21 | from umap import UMAP 22 | from wordcloud import WordCloud 23 | 24 | DEFAULT_HIGHLIGHT_PROBABILITY_MINIMUM = 0.001 25 | DEFAULT_NUM_TOPICS = 6 26 | 27 | nltk.download("stopwords") 28 | 29 | DATASETS = { 30 | 'Five Years of Elon Musk Tweets': { 31 | 'path': './data/elonmusk.csv.zip', 32 | 'column': 'tweet', 33 | 'url': 'https://www.kaggle.com/vidyapb/elon-musk-tweets-2015-to-2020', 34 | 'description': ( 35 | 'I scraped Elon Musk\'s tweets from the last 5 years using twint library. My inspiration behind this is to ' 36 | 'see how public personalities are influencing common people on Social Media Platforms. I would love to see ' 37 | 'some notebooks around this dataset, giving us insights like what are the topics which Tesla mostly tweets ' 38 | 'about? How are Tesla\'s stocks being influenced by his tweets?' 39 | ) 40 | }, 41 | 'Airline Tweets': { 42 | 'path': './data/Tweets.csv.zip', 43 | 'column': 'text', 44 | 'url': 'https://www.kaggle.com/crowdflower/twitter-airline-sentiment', 45 | 'description': ( 46 | 'A sentiment analysis job about the problems of each major U.S. airline. 
Twitter data was scraped from ' 47 | 'February of 2015 and contributors were asked to first classify positive, negative, and neutral tweets, ' 48 | 'followed by categorizing negative reasons (such as "late flight" or "rude service").' 49 | ) 50 | } 51 | } 52 | 53 | 54 | def lda_options(): 55 | return { 56 | 'num_topics': st.number_input('Number of Topics', min_value=1, value=9, 57 | help='The number of requested latent topics to be extracted from the training corpus.'), 58 | 'chunksize': st.number_input('Chunk Size', min_value=1, value=2000, 59 | help='Number of documents to be used in each training chunk.'), 60 | 'passes': st.number_input('Passes', min_value=1, value=1, 61 | help='Number of passes through the corpus during training.'), 62 | 'update_every': st.number_input('Update Every', min_value=1, value=1, 63 | help='Number of documents to be iterated through for each update. Set to 0 for batch learning, > 1 for online iterative learning.'), 64 | 'alpha': st.selectbox('𝛼', ('symmetric', 'asymmetric', 'auto'), 65 | help='A priori belief on document-topic distribution.'), 66 | 'eta': st.selectbox('𝜂', (None, 'symmetric', 'auto'), help='A-priori belief on topic-word distribution'), 67 | 'decay': st.number_input('𝜅', min_value=0.5, max_value=1.0, value=0.5, 68 | help='A number between (0.5, 1] to weight what percentage of the previous lambda value is forgotten when each new document is examined.'), 69 | 'offset': st.number_input('𝜏_0', value=1.0, 70 | help='Hyper-parameter that controls how much we will slow down the first steps the first few iterations.'), 71 | 'eval_every': st.number_input('Evaluate Every', min_value=1, value=10, 72 | help='Log perplexity is estimated every that many updates.'), 73 | 'iterations': st.number_input('Iterations', min_value=1, value=50, 74 | help='Maximum number of iterations through the corpus when inferring the topic distribution of a corpus.'), 75 | 'gamma_threshold': st.number_input('𝛾', min_value=0.0, value=0.001, 76 | help='Minimum change in the value of the gamma parameters to continue iterating.'), 77 | 'minimum_probability': st.number_input('Minimum Probability', min_value=0.0, max_value=1.0, value=0.01, 78 | help='Topics with a probability lower than this threshold will be filtered out.'), 79 | 'minimum_phi_value': st.number_input('𝜑', min_value=0.0, value=0.01, 80 | help='if per_word_topics is True, this represents a lower bound on the term probabilities.'), 81 | 'per_word_topics': st.checkbox('Per Word Topics', 82 | help='If True, the model also computes a list of topics, sorted in descending order of most likely topics for each word, along with their phi values multiplied by the feature length (i.e. word count).') 83 | } 84 | 85 | 86 | def nmf_options(): 87 | return { 88 | 'num_topics': st.number_input('Number of Topics', min_value=1, value=9, help='Number of topics to extract.'), 89 | 'chunksize': st.number_input('Chunk Size', min_value=1, value=2000, 90 | help='Number of documents to be used in each training chunk.'), 91 | 'passes': st.number_input('Passes', min_value=1, value=1, 92 | help='Number of full passes over the training corpus.'), 93 | 'kappa': st.number_input('𝜅', min_value=0.0, value=1.0, help='Gradient descent step size.'), 94 | 'minimum_probability': st.number_input('Minimum Probability', min_value=0.0, max_value=1.0, value=0.01, 95 | help='If normalize is True, topics with smaller probabilities are filtered out. If normalize is False, topics with smaller factors are filtered out. 
If set to None, a value of 1e-8 is used to prevent 0s.'), 96 | 'w_max_iter': st.number_input('W max iter', min_value=1, value=200, 97 | help='Maximum number of iterations to train W per each batch.'), 98 | 'w_stop_condition': st.number_input('W stop cond', min_value=0.0, value=0.0001, 99 | help=' If error difference gets less than that, training of W stops for the current batch.'), 100 | 'h_max_iter': st.number_input('H max iter', min_value=1, value=50, 101 | help='Maximum number of iterations to train h per each batch.'), 102 | 'h_stop_condition': st.number_input('W stop cond', min_value=0.0, value=0.001, 103 | help='If error difference gets less than that, training of h stops for the current batch.'), 104 | 'eval_every': st.number_input('Evaluate Every', min_value=1, value=10, 105 | help='Number of batches after which l2 norm of (v - Wh) is computed.'), 106 | 'normalize': st.selectbox('Normalize', (True, False, None), help='Whether to normalize the result.') 107 | } 108 | 109 | 110 | MODELS = { 111 | 'Latent Dirichlet Allocation': { 112 | 'options': lda_options, 113 | 'class': gensim.models.LdaModel, 114 | 'help': 'https://radimrehurek.com/gensim/models/ldamodel.html' 115 | }, 116 | 'Non-Negative Matrix Factorization': { 117 | 'options': nmf_options, 118 | 'class': gensim.models.Nmf, 119 | 'help': 'https://radimrehurek.com/gensim/models/nmf.html' 120 | } 121 | } 122 | 123 | COLORS = [color for color in mcolors.XKCD_COLORS.values()] 124 | 125 | WORDCLOUD_FONT_PATH = r'./data/Inkfree.ttf' 126 | 127 | EMAIL_REGEX_STR = r'\S*@\S*' 128 | MENTION_REGEX_STR = r'@\S*' 129 | HASHTAG_REGEX_STR = r'#\S+' 130 | URL_REGEX_STR = r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*' 131 | 132 | 133 | @st.cache_data() 134 | def generate_texts_df(selected_dataset: str): 135 | dataset = DATASETS[selected_dataset] 136 | return pd.read_csv(f'{dataset["path"]}') 137 | 138 | 139 | @st.cache_data() 140 | def denoise_docs(texts_df: pd.DataFrame, text_column: str): 141 | texts = texts_df[text_column].values.tolist() 142 | remove_regex = regex.compile(f'({EMAIL_REGEX_STR}|{MENTION_REGEX_STR}|{HASHTAG_REGEX_STR}|{URL_REGEX_STR})') 143 | texts = [regex.sub(remove_regex, '', text) for text in texts] 144 | docs = [[w for w in simple_preprocess(doc, deacc=True) if w not in stopwords.words('english')] for doc in texts] 145 | return docs 146 | 147 | 148 | @st.cache_data() 149 | def create_bigrams(docs): 150 | bigram_phrases = gensim.models.Phrases(docs) 151 | bigram_phraser = gensim.models.phrases.Phraser(bigram_phrases) 152 | docs = [bigram_phraser[doc] for doc in docs] 153 | return docs 154 | 155 | 156 | @st.cache_data() 157 | def create_trigrams(docs): 158 | bigram_phrases = gensim.models.Phrases(docs) 159 | bigram_phraser = gensim.models.phrases.Phraser(bigram_phrases) 160 | trigram_phrases = gensim.models.Phrases(bigram_phrases[docs]) 161 | trigram_phraser = gensim.models.phrases.Phraser(trigram_phrases) 162 | docs = [trigram_phraser[bigram_phraser[doc]] for doc in docs] 163 | return docs 164 | 165 | 166 | @st.cache_data() 167 | def generate_docs(texts_df: pd.DataFrame, text_column: str, ngrams: str = None): 168 | docs = denoise_docs(texts_df, text_column) 169 | if ngrams == 'bigrams': 170 | docs = create_bigrams(docs) 171 | if ngrams == 'trigrams': 172 | docs = create_trigrams(docs) 173 | return docs 174 | 175 | 176 | @st.cache_data() 177 | def generate_wordcloud(docs, collocations: bool = False): 178 | wordcloud_text = (' '.join(' '.join(doc) for doc in docs)) 179 | 
wordcloud = WordCloud(font_path=WORDCLOUD_FONT_PATH, width=700, height=600, 180 | background_color='white', collocations=collocations).generate(wordcloud_text) 181 | return wordcloud 182 | 183 | 184 | @st.cache_data() 185 | def prepare_training_data(docs): 186 | id2word = corpora.Dictionary(docs) 187 | corpus = [id2word.doc2bow(doc) for doc in docs] 188 | return id2word, corpus 189 | 190 | 191 | @st.cache_data() 192 | def train_model(docs, base_model, **kwargs): 193 | id2word, corpus = prepare_training_data(docs) 194 | model = base_model(corpus=corpus, id2word=id2word, **kwargs) 195 | return id2word, corpus, model 196 | 197 | 198 | def clear_session_state(): 199 | for key in ('model_kwargs', 'id2word', 'corpus', 'model', 'previous_perplexity', 'previous_coherence_model_value'): 200 | if key in st.session_state: 201 | del st.session_state[key] 202 | 203 | 204 | def calculate_perplexity(model, corpus): 205 | return np.exp2(-model.log_perplexity(corpus)) 206 | 207 | 208 | def calculate_coherence(model, corpus, coherence): 209 | coherence_model = CoherenceModel(model=model, corpus=corpus, coherence=coherence) 210 | return coherence_model.get_coherence() 211 | 212 | 213 | @st.cache_data() 214 | def white_or_black_text(background_color): 215 | # https://stackoverflow.com/questions/3942878/how-to-decide-font-color-in-white-or-black-depending-on-background-color 216 | red = int(background_color[1:3], 16) 217 | green = int(background_color[3:5], 16) 218 | blue = int(background_color[5:], 16) 219 | return 'black' if (red * 0.299 + green * 0.587 + blue * 0.114) > 186 else 'white' 220 | 221 | 222 | def perplexity_section(): 223 | with st.spinner('Calculating Perplexity ...'): 224 | perplexity = calculate_perplexity(st.session_state.model, st.session_state.corpus) 225 | key = 'previous_perplexity' 226 | delta = f'{perplexity - st.session_state[key]:.4}' if key in st.session_state else None 227 | st.metric(label='Perplexity', value=f'{perplexity:.4f}', delta=delta, delta_color='inverse') 228 | st.session_state[key] = perplexity 229 | st.markdown('Viz., https://en.wikipedia.org/wiki/Perplexity') 230 | st.latex(r'Perplexity = \exp\left(-\frac{\sum_d \log(p(w_d|\Phi, \alpha))}{N}\right)') 231 | 232 | 233 | def coherence_section(): 234 | with st.spinner('Calculating Coherence Score ...'): 235 | coherence = calculate_coherence(st.session_state.model, st.session_state.corpus, 'u_mass') 236 | key = 'previous_coherence_model_value' 237 | delta = f'{coherence - st.session_state[key]:.4f}' if key in st.session_state else None 238 | st.metric(label='Coherence Score', value=f'{coherence:.4f}', delta=delta) 239 | st.session_state[key] = coherence 240 | st.markdown('Viz., http://svn.aksw.org/papers/2015/WSDM_Topic_Evaluation/public.pdf') 241 | st.latex( 242 | r'C_{UMass} = \frac{2}{N \cdot (N - 1)}\sum_{i=2}^N\sum_{j=1}^{i-1}\log\frac{P(w_i, w_j) + \epsilon}{P(w_j)}') 243 | 244 | 245 | @st.cache_data() 246 | def train_projection(projection, n_components, df): 247 | if projection == 'PCA': 248 | projection_model = PCA(n_components=n_components) 249 | elif projection == 'T-SNE': 250 | projection_model = TSNE(n_components=n_components) 251 | elif projection == 'UMAP': 252 | projection_model = UMAP(n_components=n_components) 253 | else: 254 | raise ValueError(f'Unknown projection: {projection}') 255 | return projection_model.fit_transform(df) 256 | 257 | 258 | if __name__ == '__main__': 259 | st.set_page_config(page_title='Topic Modeling', page_icon='./data/favicon.png', layout='wide') 260 | 261 | preprocessing_options = 
st.sidebar.form('preprocessing-options') 262 | with preprocessing_options: 263 | st.header('Preprocessing Options') 264 | ngrams = st.selectbox('N-grams', [None, 'bigrams', 'trigrams'], help='TODO ...') # TODO ... 265 | st.form_submit_button('Preprocess') 266 | 267 | visualization_options = st.sidebar.form('visualization-options') 268 | with visualization_options: 269 | st.header('Visualization Options') 270 | collocations = st.checkbox('Enable WordCloud Collocations', 271 | help='Collocations in word clouds enable the display of phrases.') 272 | highlight_probability_minimum = st.select_slider('Highlight Probability Minimum', 273 | options=[10 ** exponent for exponent in range(-10, 1)], 274 | value=DEFAULT_HIGHLIGHT_PROBABILITY_MINIMUM, 275 | help='Minimum topic probability in order to color highlight a word in the _Topic Highlighted Sentences_ visualization.') 276 | st.form_submit_button('Apply') 277 | 278 | st.title('Topic Modeling') 279 | st.header('What is topic modeling?') 280 | with st.expander('Hero Image'): 281 | st.image('./data/is-this-a-topic-modeling.jpg', caption='No ... no it\'s not ...', use_column_width=True) 282 | st.markdown( 283 | 'Topic modeling is a broad term. It encompasses a number of specific statistical learning methods. ' 284 | 'These methods explain documents in terms of a set of topics and those topics in terms of ' 285 | 'a set of words. Two commonly used methods are Latent Dirichlet Allocation (LDA) and Non-Negative ' 286 | 'Matrix Factorization (NMF). Used without additional qualifiers, the approach is usually assumed ' 287 | 'to be unsupervised, although there are semi-supervised and supervised variants.' 288 | ) 289 | 290 | with st.expander('Additional Details'): 291 | st.markdown('The objective can be viewed as a matrix factorization.') 292 | st.image('./data/mf.png', use_column_width=True) 293 | st.markdown('This factorization makes the methods much more efficient than directly characterizing documents ' 294 | 'in terms of words.') 295 | st.markdown('More information on LDA and NMF can be found at ' 296 | 'https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation and ' 297 | 'https://en.wikipedia.org/wiki/Non-negative_matrix_factorization, respectively.') 298 | 299 | st.header('Datasets') 300 | st.markdown('A couple of small example datasets are preloaded to illustrate.') 301 | selected_dataset = st.selectbox('Dataset', [None, *sorted(list(DATASETS.keys()))], on_change=clear_session_state) 302 | if not selected_dataset: 303 | st.write('Choose a Dataset to Continue ...') 304 | st.stop() 305 | 306 | with st.expander('Dataset Description'): 307 | st.markdown(DATASETS[selected_dataset]['description']) 308 | st.markdown(DATASETS[selected_dataset]['url']) 309 | 310 | text_column = DATASETS[selected_dataset]['column'] 311 | texts_df = generate_texts_df(selected_dataset) 312 | docs = generate_docs(texts_df, text_column, ngrams=ngrams) 313 | 314 | with st.expander('Sample Documents'): 315 | sample_texts = texts_df[text_column].sample(5).values.tolist() 316 | for index, text in enumerate(sample_texts): 317 | st.markdown(f'**{index + 1}**: _{text}_') 318 | 319 | with st.expander('Frequency Sized Corpus Wordcloud'): 320 | wc = generate_wordcloud(docs) 321 | st.image(wc.to_image(), caption='Dataset Wordcloud (Not A Topic Model)', use_column_width=True) 322 | st.markdown('These are the remaining words after document preprocessing.') 323 | 324 | with st.expander('Document Word Count Distribution'): 325 | len_docs = [len(doc) for doc in 
docs] 326 | fig, ax = plt.subplots() 327 | sns.histplot(data=pd.DataFrame(len_docs, columns=['Words In Document']), discrete=True, ax=ax) 328 | st.pyplot(fig) 329 | 330 | model_key = st.sidebar.selectbox('Model', [None, *list(MODELS.keys())], on_change=clear_session_state) 331 | model_options = st.sidebar.form('model-options') 332 | if not model_key: 333 | with st.sidebar: 334 | st.write('Choose a Model to Continue ...') 335 | st.stop() 336 | with model_options: 337 | st.header('Model Options') 338 | model_kwargs = MODELS[model_key]['options']() 339 | st.session_state['model_kwargs'] = model_kwargs 340 | train_model_clicked = st.form_submit_button('Train Model') 341 | 342 | if train_model_clicked: 343 | with st.spinner('Training Model ...'): 344 | id2word, corpus, model = train_model(docs, MODELS[model_key]['class'], **st.session_state.model_kwargs) 345 | st.session_state.id2word = id2word 346 | st.session_state.corpus = corpus 347 | st.session_state.model = model 348 | 349 | if 'model' not in st.session_state: 350 | st.stop() 351 | 352 | st.header('Model') 353 | st.write(type(st.session_state.model).__name__) 354 | st.write(st.session_state.model_kwargs) 355 | 356 | st.header('Model Results') 357 | 358 | topics = st.session_state.model.show_topics(formatted=False, num_words=50, 359 | num_topics=st.session_state.model_kwargs['num_topics'], log=False) 360 | with st.expander('Topic Word-Weighted Summaries'): 361 | topic_summaries = {} 362 | for topic in topics: 363 | topic_index = topic[0] 364 | topic_word_weights = topic[1] 365 | topic_summaries[topic_index] = ' + '.join( 366 | f'{weight:.3f} * {word}' for word, weight in topic_word_weights[:10]) 367 | for topic_index, topic_summary in topic_summaries.items(): 368 | st.markdown(f'**Topic {topic_index}**: _{topic_summary}_') 369 | 370 | colors = random.sample(COLORS, k=model_kwargs['num_topics']) 371 | with st.expander('Top N Topic Keywords Wordclouds'): 372 | cols = st.columns(3) 373 | for index, topic in enumerate(topics): 374 | wc = WordCloud(font_path=WORDCLOUD_FONT_PATH, width=700, height=600, 375 | background_color='white', collocations=collocations, prefer_horizontal=1.0, 376 | color_func=lambda *args, **kwargs: colors[index]) 377 | with cols[index % 3]: 378 | wc.generate_from_frequencies(dict(topic[1])) 379 | st.image(wc.to_image(), caption=f'Topic #{index}', use_column_width=True) 380 | 381 | with st.expander('Topic Highlighted Sentences'): 382 | sample = texts_df.sample(10) 383 | for index, row in sample.iterrows(): 384 | html_elements = [] 385 | for token in row[text_column].split(): 386 | if st.session_state.id2word.token2id.get(token) is None: 387 | html_elements.append(f'<span style="text-decoration: line-through;">{token}</span>') # strike through tokens that were dropped from the dictionary during preprocessing 388 | else: 389 | term_topics = st.session_state.model.get_term_topics(token, minimum_probability=0) 390 | topic_probabilities = [term_topic[1] for term_topic in term_topics] 391 | max_topic_probability = max(topic_probabilities) if topic_probabilities else 0 392 | if max_topic_probability < highlight_probability_minimum: 393 | html_elements.append(token) 394 | else: 395 | max_topic_index = topic_probabilities.index(max_topic_probability) 396 | max_topic = term_topics[max_topic_index] 397 | background_color = colors[max_topic[0]] 398 | # color = 'white' 399 | color = white_or_black_text(background_color) 400 | html_elements.append( 401 | f'<span style="background-color: {background_color}; color: {color};">{token}</span>') # highlight the token with its dominant topic's color 402 | st.markdown(f'Document #{index}: {" ".join(html_elements)}', unsafe_allow_html=True) 403 | 404 | has_log_perplexity = hasattr(st.session_state.model, 'log_perplexity') 405 | with 
st.expander('Metrics'): 406 | if has_log_perplexity: 407 | left_column, right_column = st.columns(2) 408 | with left_column: 409 | perplexity_section() 410 | with right_column: 411 | coherence_section() 412 | else: 413 | coherence_section() 414 | 415 | with st.expander('Low Dimensional Projections'): 416 | with st.form('projections-form'): 417 | left_column, right_column = st.columns(2) 418 | projection = left_column.selectbox('Projection', ['PCA', 'T-SNE', 'UMAP'], help='TODO ...') 419 | plot_type = right_column.selectbox('Plot', ['2D', '3D'], help='TODO ...') 420 | n_components = 3 421 | columns = [f'proj{i}' for i in range(1, 4)] 422 | generate_projection_clicked = st.form_submit_button('Generate Projection') 423 | 424 | if generate_projection_clicked: 425 | topic_weights = [] 426 | for index, topic_weight in enumerate(st.session_state.model[st.session_state.corpus]): 427 | weight_vector = [0] * int(st.session_state.model_kwargs['num_topics']) 428 | for topic, weight in topic_weight: 429 | weight_vector[topic] = weight 430 | topic_weights.append(weight_vector) 431 | df = pd.DataFrame(topic_weights) 432 | dominant_topic = df.idxmax(axis='columns').astype('string') 433 | dominant_topic_percentage = df.max(axis='columns') 434 | df = df.assign(dominant_topic=dominant_topic, dominant_topic_percentage=dominant_topic_percentage, 435 | text=texts_df[text_column]) 436 | with st.spinner('Training Projection'): 437 | projections = train_projection(projection, n_components, df.drop(columns=['dominant_topic', 'dominant_topic_percentage', 'text']).add_prefix('topic_')) 438 | data = pd.concat([df, pd.DataFrame(projections, columns=columns)], axis=1) 439 | 440 | px_options = {'color': 'dominant_topic', 'size': 'dominant_topic_percentage', 441 | 'hover_data': ['dominant_topic', 'dominant_topic_percentage', 'text']} 442 | if plot_type == '2D': 443 | fig = px.scatter(data, x='proj1', y='proj2', **px_options) 444 | st.plotly_chart(fig) 445 | fig = px.scatter(data, x='proj1', y='proj3', **px_options) 446 | st.plotly_chart(fig) 447 | fig = px.scatter(data, x='proj2', y='proj3', **px_options) 448 | st.plotly_chart(fig) 449 | elif plot_type == '3D': 450 | fig = px.scatter_3d(data, x='proj1', y='proj2', z='proj3', **px_options) 451 | st.plotly_chart(fig) 452 | 453 | if hasattr(st.session_state.model, 'inference'): # gensim Nmf has no 'inference' attribute so pyLDAvis fails 454 | if st.button('Generate pyLDAvis'): 455 | with st.spinner('Creating pyLDAvis Visualization ...'): 456 | py_lda_vis_data = pyLDAvis.gensim_models.prepare(st.session_state.model, st.session_state.corpus, 457 | st.session_state.id2word) 458 | py_lda_vis_html = pyLDAvis.prepared_data_to_html(py_lda_vis_data) 459 | with st.expander('pyLDAvis', expanded=True): 460 | st.markdown('pyLDAvis is designed to help users interpret the topics in a topic model that has been ' 461 | 'fit to a corpus of text data. 
The package extracts information from a fitted LDA topic ' 462 | 'model to inform an interactive web-based visualization.') 463 | st.markdown('https://github.com/bmabey/pyLDAvis') 464 | components.html(py_lda_vis_html, width=1300, height=800) 465 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/tests/__init__.py: -------------------------------------------------------------------------------- 1 | """Tests for the streamlit_topic_modeling package.""" 2 | -------------------------------------------------------------------------------- /streamlit_topic_modeling/tests/test_app.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/bpw1621/streamlit-topic-modeling/d8cbd1624450ed8ec9f8532358986e7f8892ef3d/streamlit_topic_modeling/tests/test_app.py -------------------------------------------------------------------------------- /towncrier.toml: -------------------------------------------------------------------------------- 1 | [tool.towncrier] 2 | directory = "newsfragments" -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | # tox (https://tox.readthedocs.io/) is a tool for running tests 2 | # in multiple virtualenvs. This configuration file will run the 3 | # test suite on all supported python versions. To use it, "pip install tox" 4 | # and then run "tox" from this directory. 5 | 6 | [tox] 7 | envlist = py36, py37, py38, py39 8 | 9 | [testenv] 10 | deps = 11 | pytest 12 | pytest-cov 13 | pytest-mock 14 | commands = 15 | pytest 16 | --------------------------------------------------------------------------------
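
Everything above wires one small gensim pipeline into Streamlit forms, caching, and expanders. As a rough orientation aid, the sketch below condenses what app.py's denoise_docs, prepare_training_data, and train_model helpers do into a single Streamlit-free script. It is only an approximation of the app's flow, not part of the repository: it assumes the packages pinned in requirements.txt are installed, reuses the bundled ./data/elonmusk.csv.zip dataset and its 'tweet' column from the DATASETS table, and replaces the app's longer URL regex with a shorter pattern.

import gensim
import nltk
import pandas as pd
import regex
from gensim import corpora
from gensim.models import CoherenceModel
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords

nltk.download('stopwords')

# Noise patterns the app strips before tokenizing: emails, mentions, hashtags, and
# (here, simplified) URLs.
REMOVE_REGEX = regex.compile(r'(\S*@\S*|@\S*|#\S+|https?://\S+)')

# Load the bundled dataset and keep only the text column the app uses for it.
texts = pd.read_csv('./data/elonmusk.csv.zip')['tweet'].astype(str).tolist()

# Denoise and tokenize, dropping English stopwords (condenses denoise_docs).
stops = set(stopwords.words('english'))
docs = [[w for w in simple_preprocess(REMOVE_REGEX.sub('', text), deacc=True) if w not in stops]
        for text in texts]

# Build the dictionary and bag-of-words corpus (mirrors prepare_training_data).
id2word = corpora.Dictionary(docs)
corpus = [id2word.doc2bow(doc) for doc in docs]

# Train a small LDA model (mirrors train_model with a few of lda_options' defaults).
model = gensim.models.LdaModel(corpus=corpus, id2word=id2word, num_topics=6, passes=1, chunksize=2000)

# Inspect the topics and score them the same way the Metrics expander does.
for index, summary in model.show_topics(num_topics=6, num_words=10):
    print(f'Topic {index}: {summary}')
print('u_mass coherence:', CoherenceModel(model=model, corpus=corpus, coherence='u_mass').get_coherence())

Swapping gensim.models.Nmf in for gensim.models.LdaModel at the same call site is essentially what the app's Non-Negative Matrix Factorization option does, with the additional keyword arguments exposed in nmf_options.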