├── .flake8 ├── .github └── workflows │ └── unittest.yml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── cluestar.png ├── cluestar └── __init__.py ├── data └── tesco_support.csv ├── docs ├── index.html ├── plot_five.json ├── plot_four.json ├── plot_one.json ├── plot_six.json ├── plot_three.json └── plot_two.json ├── gif-multi.gif ├── gif.gif ├── notebooks └── overview.ipynb ├── setup.py └── tests ├── __init__.py └── test_smoke.py /.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | per-file-ignores = */__init__.py: F401 3 | max-line-length = 160 4 | ignore = E203 -------------------------------------------------------------------------------- /.github/workflows/unittest.yml: -------------------------------------------------------------------------------- 1 | name: Code Checks 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | pull_request: 8 | branches: 9 | - main 10 | 11 | jobs: 12 | build: 13 | runs-on: ubuntu-latest 14 | strategy: 15 | matrix: 16 | python-version: ["3.11"] 17 | 18 | steps: 19 | - uses: actions/checkout@v2 20 | - name: Set up Python ${{ matrix.python-version }} 21 | uses: actions/setup-python@v1 22 | with: 23 | python-version: ${{ matrix.python-version }} 24 | cache: 'pip' 25 | - name: Install Dependencies 26 | run: python -m pip install -e ".[dev]" 27 | - name: Unittest 28 | run: python -m pytest 29 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 vincent d warmerdam 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | black: 2 | black --target-version py38 cluestar tests setup.py 3 | 4 | flake: 5 | flake8 cluestar tests setup.py 6 | 7 | install: 8 | python -m pip install --upgrade pip 9 | python -m pip install -e ".[dev]" 10 | python -m pip install black flake8 isort interrogate twine wheel 11 | 12 | interrogate: 13 | interrogate -vv --ignore-nested-functions --ignore-semiprivate --ignore-private --ignore-magic --ignore-module --ignore-init-method --fail-under 100 cluestar 14 | 15 | pypi: 16 | python setup.py sdist 17 | python setup.py bdist_wheel --universal 18 | twine upload dist/* 19 | 20 | clean: 21 | rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache 22 | 23 | style: clean black flake interrogate clean 24 | 25 | check: clean black flake interrogate clean -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ### cluestar 4 | 5 | > Gain a clue by clustering! 6 | 7 | This library contains visualisation tools that might help you 8 | get started with classification tasks. The idea is that if you 9 | can inspect clusters easily, you might gain a clue on what 10 | good labels for your dataset might be! 11 | 12 | It generates charts that looks like this: 13 | 14 | ![Normal plot](gif.gif) 15 | 16 | There's even a fancy chart that can compare embedding techniques. 17 | 18 | ![Comparing two embeddings](gif-multi.gif) 19 | 20 | ## Install 21 | 22 | ```text 23 | python -m pip install cluestar 24 | ``` 25 | 26 | ## Interactive Demo 27 | 28 | You can see an interactive demo of the generated widgets [here](https://koaning.github.io/cluestar/). 29 | 30 | You can also toy around with the demo notebook found [here](https://github.com/koaning/cluestar/blob/main/notebooks/overview.ipynb). 31 | 32 | ## Usage 33 | 34 | The first step is to encode textdata in two dimensions, like below. 35 | 36 | ```python 37 | from sklearn.pipeline import make_pipeline 38 | from sklearn.decomposition import TruncatedSVD 39 | from sklearn.feature_extraction.text import TfidfVectorizer 40 | 41 | pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2)) 42 | 43 | X = pipe.fit_transform(texts) 44 | ``` 45 | 46 | From here you can make an interactive chart via; 47 | 48 | ```python 49 | from cluestar import plot_text 50 | 51 | plot_text(X, texts) 52 | ``` 53 | 54 | The best results are likely found when you use 55 | [umap](https://umap-learn.readthedocs.io/en/latest/) 56 | together with something like 57 | [universal sentence encoder](https://koaning.github.io/whatlies/api/language/universal_sentence/). 58 | 59 | You might also improve the understandability by highlighting points 60 | that have a certain word in it. 61 | 62 | ```python 63 | plot_text(X, texts, color_words=["plastic", "voucher", "deliver"]) 64 | ``` 65 | 66 | You can also use a numeric array, one that contains proba-values for prediction, 67 | to influence the color. 68 | 69 | ```python 70 | # First, get an array of pvals from some model 71 | p_vals = some_model.predict(texts)[:, 0] 72 | # Use these to assign pretty colors. 73 | plot_text(X, texts, color_array=p_vals) 74 | ``` 75 | 76 | You can also compare two embeddings interactively. To do this: 77 | 78 | ```python 79 | from cluestar import plot_text_comparison 80 | 81 | plot_text(X1=X, X2=X, texts) 82 | ``` 83 | -------------------------------------------------------------------------------- /cluestar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/cluestar.png -------------------------------------------------------------------------------- /cluestar/__init__.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import pandas as pd 3 | import altair as alt 4 | 5 | 6 | def plot_text(X, texts, color_array=None, color_words=None, disable_warning=True): 7 | """ 8 | Make a visualisation to help find clues in text data. 9 | 10 | Arguments: 11 | - `X`: the numeric features, should be a 2D numpy array 12 | - `texts`: list of text data 13 | - `color_words`: list of words to highlight 14 | - `color_array`: an array that represents color for the plot 15 | - `disable_warning`: disable the standard altair max rows warning 16 | """ 17 | if disable_warning: 18 | alt.data_transformers.disable_max_rows() 19 | 20 | if len(texts) != X.shape[0]: 21 | raise ValueError( 22 | f"The number of text examples ({len(texts)}) should match X array ({X.shape[0]})." 23 | ) 24 | 25 | df_ = pd.DataFrame({"x1": X[:, 0], "x2": X[:, 1], "text": texts}).assign( 26 | trunc_text=lambda d: d["text"].str[:120], r=0 27 | ) 28 | 29 | if color_array is not None: 30 | if len(color_array) != X.shape[0]: 31 | raise ValueError( 32 | f"The number of color array ({len(color_array)}) should match X array ({X.shape[0]})." 33 | ) 34 | df_ = df_.assign(color=color_array) 35 | 36 | if color_words: 37 | df_ = df_.assign(color="none") 38 | 39 | for w in color_words: 40 | predicate = df_["text"].str.lower().str.contains(w) 41 | df_ = df_.assign(color=lambda d: np.where(predicate, w, d["color"])) 42 | 43 | brush = alt.selection_interval() 44 | 45 | p1 = ( 46 | alt.Chart(df_) 47 | .mark_circle(opacity=0.6, size=20) 48 | .encode( 49 | x=alt.X("x1", axis=None, scale=alt.Scale(zero=False)), 50 | y=alt.Y("x2", axis=None, scale=alt.Scale(zero=False)), 51 | tooltip=["text"], 52 | ) 53 | .properties(width=350, height=350, title="embedding space") 54 | .add_params(brush) 55 | ) 56 | 57 | if color_words: 58 | p1 = ( 59 | alt.Chart(df_) 60 | .mark_circle(opacity=0.6, size=20) 61 | .encode( 62 | x=alt.X("x1", axis=None, scale=alt.Scale(zero=False)), 63 | y=alt.Y("x2", axis=None, scale=alt.Scale(zero=False)), 64 | tooltip=["text"], 65 | color=alt.Color("color", sort=["none"] + color_words), 66 | ) 67 | .properties(width=350, height=350, title="embedding space") 68 | .add_params(brush) 69 | ) 70 | 71 | if color_array is not None: 72 | p1 = ( 73 | alt.Chart(df_) 74 | .mark_circle(opacity=0.6, size=20) 75 | .encode( 76 | x=alt.X("x1", axis=None, scale=alt.Scale(zero=False)), 77 | y=alt.Y("x2", axis=None, scale=alt.Scale(zero=False)), 78 | tooltip=["text"], 79 | color=alt.Color("color"), 80 | ) 81 | .properties(width=350, height=350, title="embedding space") 82 | .add_params(brush) 83 | ) 84 | 85 | p2 = ( 86 | alt.Chart(df_) 87 | .mark_text() 88 | .encode( 89 | x=alt.X("r", axis=None), 90 | y=alt.Y("row_number:O", axis=None), 91 | text="trunc_text:N", 92 | ) 93 | .transform_window(row_number="row_number()") 94 | .transform_filter(brush) 95 | .transform_window(rank="rank(row_number)") 96 | .transform_filter(alt.datum.rank < 18) 97 | .properties(title="text") 98 | ) 99 | 100 | return (p1 | p2).configure_axis(grid=False).configure_view(strokeWidth=0) 101 | 102 | def _single_scatter_chart(df_, idx, brush, title="embedding space", color_words=None, color_array=None): 103 | cols = ("x1:Q", "y1:Q") if idx == 1 else ("x2:Q", "y2:Q") 104 | if color_words: 105 | color=alt.Color("color", sort=["none"] + color_words) 106 | elif color_array: 107 | color=alt.Color("color") 108 | else: 109 | color=alt.condition(brush, 'id:O', alt.value('lightgray'), legend=None) 110 | return ( 111 | alt.Chart(df_) 112 | .mark_circle(opacity=0.6, size=20) 113 | .encode( 114 | x=alt.X(cols[0], axis=None, scale=alt.Scale(zero=False)), 115 | y=alt.Y(cols[1], axis=None, scale=alt.Scale(zero=False)), 116 | tooltip=["text"], 117 | color=color, 118 | ) 119 | .properties(width=350, height=350, title=title) 120 | .add_params(brush) 121 | ) 122 | 123 | def plot_text_comparison(X1, X2, texts, disable_warning=True, color_array=None, color_words=None): 124 | """ 125 | Make a visualisation to help find clues in text data. 126 | 127 | Arguments: 128 | - `X1`: the numeric features, should be a 2D numpy array 129 | - `X2`: the numeric features, should be a 2D numpy array 130 | - `texts`: list of text data 131 | - `disable_warning`: disable the standard altair max rows warning 132 | - `color_words`: list of words to highlight 133 | - `color_array`: an array that represents color for the plot 134 | """ 135 | if disable_warning: 136 | alt.data_transformers.disable_max_rows() 137 | 138 | if (len(texts) != X1.shape[0]) or (len(texts) != X2.shape[0]): 139 | raise ValueError( 140 | f"The number of text examples ({len(texts)}) should match X1/x2 array X1=({X1.shape[0]}) X2=({X2.shape[0]})." 141 | ) 142 | 143 | df_ = pd.DataFrame({"x1": X1[:, 0], "y1": X1[:, 1], "x2": X2[:, 0], "y2": X2[:, 1], "text": texts}).assign( 144 | trunc_text=lambda d: d["text"].str[:120], r=0 145 | ) 146 | 147 | if color_array is not None: 148 | if len(color_array) != X1.shape[0]: 149 | raise ValueError( 150 | f"The number of color array ({len(color_array)}) should match X array ({X.shape[0]})." 151 | ) 152 | df_ = df_.assign(color=color_array) 153 | 154 | if color_words is not None: 155 | df_ = df_.assign(color="none") 156 | 157 | for w in color_words: 158 | predicate = df_["text"].str.lower().str.contains(w) 159 | df_ = df_.assign(color=lambda d: np.where(predicate, w, d["color"])) 160 | 161 | brush = alt.selection_interval() 162 | p1 = _single_scatter_chart(df_, 1, brush, title="embedding space X1", color_words=color_words, color_array=color_array) 163 | p2 = _single_scatter_chart(df_, 2, brush, title="embedding space X2", color_words=color_words, color_array=color_array) 164 | 165 | p3 = ( 166 | alt.Chart(df_) 167 | .mark_text() 168 | .encode( 169 | x=alt.X("r", axis=None), 170 | y=alt.Y("row_number:O", axis=None), 171 | text="trunc_text:N", 172 | ) 173 | .transform_window(row_number="row_number()") 174 | .transform_filter(brush) 175 | .transform_window(rank="rank(row_number)") 176 | .transform_filter(alt.datum.rank < 18) 177 | .properties(title="text") 178 | ) 179 | 180 | return (p1 | p2 | p3).configure_axis(grid=False).configure_view(strokeWidth=0) -------------------------------------------------------------------------------- /docs/index.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 |
8 |
9 |

TfIdf with SVD

10 |
11 | 12 | 13 |

TfIdf with UMAP

14 |
15 | 16 | 17 |

TfIdf with UMAP and Color

18 |
19 | 20 | 21 |

Universal Sentence Encoder with UMAP and Word Color

22 |
23 | 24 | 25 |

Universal Sentence Encoder with UMAP and Class Predicted Color

26 |
27 | 28 | 29 |

Comparison of two embeddings side by side

30 |
31 | 32 |
33 | -------------------------------------------------------------------------------- /gif-multi.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/gif-multi.gif -------------------------------------------------------------------------------- /gif.gif: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/gif.gif -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import pathlib 2 | from setuptools import setup, find_packages 3 | 4 | 5 | base_packages = ["altair>=5.0.0", "numpy>=1.19.2"] 6 | 7 | dev_packages = base_packages + [ 8 | "pytest>=4.0.2", 9 | "black>=19.3b0", 10 | "pytest-cov>=2.6.1", 11 | "pre-commit>=2.2.0", 12 | "jupyterlab", 13 | ] 14 | 15 | 16 | setup( 17 | name="cluestar", 18 | version="0.2.1", 19 | author="Vincent D. Warmerdam", 20 | packages=find_packages(exclude=["notebooks", "docs"]), 21 | description="Gain a clue by clustering!", 22 | long_description=pathlib.Path("README.md").read_text(), 23 | long_description_content_type="text/markdown", 24 | url="https://github.com/koaning/cluestar/", 25 | project_urls={ 26 | "Documentation": "https://github.com/koaning/cluestar/", 27 | "Source Code": "https://github.com/koaning/cluestar/", 28 | "Issue Tracker": "https://github.com/koaning/cluestar/issues", 29 | }, 30 | install_requires=base_packages, 31 | extras_require={"base": base_packages, "dev": dev_packages}, 32 | classifiers=[ 33 | "Intended Audience :: Science/Research", 34 | "Programming Language :: Python :: 3", 35 | "Programming Language :: Python :: 3.8", 36 | "Programming Language :: Python :: 3.9", 37 | "Programming Language :: Python :: 3.10", 38 | "Programming Language :: Python :: 3.11", 39 | "License :: OSI Approved :: MIT License", 40 | "Topic :: Scientific/Engineering", 41 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 42 | ], 43 | ) 44 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/tests/__init__.py -------------------------------------------------------------------------------- /tests/test_smoke.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | import numpy as np 3 | from cluestar import plot_text, plot_text_comparison 4 | 5 | 6 | @pytest.mark.parametrize("s", [500, 1000, 2000]) 7 | def test_smoke_plot_text(s): 8 | texts = ["random {i}" for i in range(s)] 9 | X = np.random.normal(0, 1, (s, 2)) 10 | assert texts[0] in plot_text(X, texts).to_json() 11 | 12 | 13 | @pytest.mark.parametrize("s", [500, 1000, 2000]) 14 | def test_smoke_plot_text_comparison(s): 15 | texts = ["random {i}" for i in range(s)] 16 | X1 = np.random.normal(0, 1, (s, 2)) 17 | X2 = np.random.normal(0, 1, (s, 2)) 18 | assert texts[0] in plot_text_comparison(X1, X2, texts).to_json() 19 | --------------------------------------------------------------------------------