├── .flake8
├── .github
    └── workflows
    │   └── unittest.yml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── cluestar.png
├── cluestar
    └── __init__.py
├── data
    └── tesco_support.csv
├── docs
    ├── index.html
    ├── plot_five.json
    ├── plot_four.json
    ├── plot_one.json
    ├── plot_six.json
    ├── plot_three.json
    └── plot_two.json
├── gif-multi.gif
├── gif.gif
├── notebooks
    └── overview.ipynb
├── setup.py
└── tests
    ├── __init__.py
    └── test_smoke.py


/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | per-file-ignores = */__init__.py: F401
3 | max-line-length = 160
4 | ignore = E203


--------------------------------------------------------------------------------
/.github/workflows/unittest.yml:
--------------------------------------------------------------------------------
 1 | name: Code Checks
 2 | 
 3 | on:
 4 |   push:
 5 |     branches:
 6 |     - main
 7 |   pull_request:
 8 |     branches:
 9 |     - main
10 | 
11 | jobs:
12 |   build:
13 |     runs-on: ubuntu-latest
14 |     strategy:
15 |       matrix:
16 |         python-version: ["3.11"]
17 | 
18 |     steps:
19 |     - uses: actions/checkout@v2
20 |     - name: Set up Python ${{ matrix.python-version }}
21 |       uses: actions/setup-python@v1
22 |       with:
23 |         python-version: ${{ matrix.python-version }}
24 |         cache: 'pip'
25 |     - name: Install Dependencies
26 |       run: python -m pip install -e ".[dev]"
27 |     - name: Unittest
28 |       run: python -m pytest
29 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 vincent d warmerdam 
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | black:
 2 | 	black --target-version py38 cluestar tests setup.py
 3 | 
 4 | flake:
 5 | 	flake8 cluestar tests setup.py
 6 | 
 7 | install:
 8 | 	python -m pip install --upgrade pip
 9 | 	python -m pip install -e ".[dev]"
10 | 	python -m pip install black flake8 isort interrogate twine wheel
11 | 
12 | interrogate:
13 | 	interrogate -vv --ignore-nested-functions --ignore-semiprivate --ignore-private --ignore-magic --ignore-module --ignore-init-method --fail-under 100 cluestar
14 | 
15 | pypi:
16 | 	python setup.py sdist
17 | 	python setup.py bdist_wheel --universal
18 | 	twine upload dist/*
19 | 
20 | clean:
21 | 	rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache
22 | 
23 | style: clean black flake interrogate clean
24 | 
25 | check: clean black flake interrogate clean


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | <img src="cluestar.png" width=175 align="right">
 2 | 
 3 | ### cluestar
 4 | 
 5 | > Gain a clue by clustering!
 6 | 
 7 | This library contains visualisation tools that might help you
 8 | get started with classification tasks. The idea is that if you
 9 | can inspect clusters easily, you might gain a clue on what
10 | good labels for your dataset might be!
11 | 
 12 | It generates charts that look like this:
13 | 
14 | ![Normal plot](gif.gif)
15 | 
16 | There's even a fancy chart that can compare embedding techniques.
17 | 
18 | ![Comparing two embeddings](gif-multi.gif)
19 | 
20 | ## Install
21 | 
22 | ```text
23 | python -m pip install cluestar
24 | ```
25 | 
26 | ## Interactive Demo
27 | 
28 | You can see an interactive demo of the generated widgets [here](https://koaning.github.io/cluestar/).
29 | 
30 | You can also toy around with the demo notebook found [here](https://github.com/koaning/cluestar/blob/main/notebooks/overview.ipynb).
31 | 
32 | ## Usage
33 | 
 34 | The first step is to encode the text data in two dimensions, like below.
35 | 
36 | ```python
37 | from sklearn.pipeline import make_pipeline
38 | from sklearn.decomposition import TruncatedSVD
39 | from sklearn.feature_extraction.text import TfidfVectorizer
40 | 
41 | pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2))
42 | 
43 | X = pipe.fit_transform(texts)
44 | ```
45 | 
46 | From here you can make an interactive chart via;
47 | 
48 | ```python
49 | from cluestar import plot_text
50 | 
51 | plot_text(X, texts)
52 | ```
53 | 
54 | The best results are likely found when you use
55 | [umap](https://umap-learn.readthedocs.io/en/latest/)
56 | together with something like
57 | [universal sentence encoder](https://koaning.github.io/whatlies/api/language/universal_sentence/).
58 | 
 59 | You can also make the chart easier to interpret by highlighting points
 60 | whose text contains a certain word.
61 | 
62 | ```python
63 | plot_text(X, texts, color_words=["plastic", "voucher", "deliver"])
64 | ```
65 | 
66 | You can also use a numeric array, one that contains proba-values for prediction,
67 | to influence the color.
68 | 
69 | ```python
70 | # First, get an array of pvals from some model
 71 | p_vals = some_model.predict_proba(texts)[:, 0]
72 | # Use these to assign pretty colors.
73 | plot_text(X, texts, color_array=p_vals)
74 | ```
75 | 
76 | You can also compare two embeddings interactively. To do this: 
77 | 
78 | ```python
79 | from cluestar import plot_text_comparison
80 | 
 81 | plot_text_comparison(X1=X, X2=X, texts=texts)
82 | ```
83 | 


--------------------------------------------------------------------------------
/cluestar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/cluestar.png


--------------------------------------------------------------------------------
/cluestar/__init__.py:
--------------------------------------------------------------------------------
  1 | import numpy as np
  2 | import pandas as pd
  3 | import altair as alt
  4 | 
  5 | 
  6 | def plot_text(X, texts, color_array=None, color_words=None, disable_warning=True):
  7 |     """
  8 |     Make a visualisation to help find clues in text data.
  9 | 
 10 |     Arguments:
 11 |         - `X`: the numeric features, should be a 2D numpy array
 12 |         - `texts`: list of text data
 13 |         - `color_words`: list of words to highlight
 14 |         - `color_array`: an array that represents color for the plot
 15 |         - `disable_warning`: disable the standard altair max rows warning
 16 |     """
 17 |     if disable_warning:
 18 |         alt.data_transformers.disable_max_rows()
 19 | 
 20 |     if len(texts) != X.shape[0]:
 21 |         raise ValueError(
 22 |             f"The number of text examples ({len(texts)}) should match X array ({X.shape[0]})."
 23 |         )
 24 | 
 25 |     df_ = pd.DataFrame({"x1": X[:, 0], "x2": X[:, 1], "text": texts}).assign(
 26 |         trunc_text=lambda d: d["text"].str[:120], r=0
 27 |     )
 28 | 
 29 |     if color_array is not None:
 30 |         if len(color_array) != X.shape[0]:
 31 |             raise ValueError(
 32 |                 f"The number of color array ({len(color_array)}) should match X array ({X.shape[0]})."
 33 |             )
 34 |         df_ = df_.assign(color=color_array)
 35 | 
 36 |     if color_words:
 37 |         df_ = df_.assign(color="none")
 38 | 
 39 |         for w in color_words:
 40 |             predicate = df_["text"].str.lower().str.contains(w)
 41 |             df_ = df_.assign(color=lambda d: np.where(predicate, w, d["color"]))
 42 | 
 43 |     brush = alt.selection_interval()
 44 | 
 45 |     p1 = (
 46 |         alt.Chart(df_)
 47 |         .mark_circle(opacity=0.6, size=20)
 48 |         .encode(
 49 |             x=alt.X("x1", axis=None, scale=alt.Scale(zero=False)),
 50 |             y=alt.Y("x2", axis=None, scale=alt.Scale(zero=False)),
 51 |             tooltip=["text"],
 52 |         )
 53 |         .properties(width=350, height=350, title="embedding space")
 54 |         .add_params(brush)
 55 |     )
 56 | 
 57 |     if color_words:
 58 |         p1 = (
 59 |             alt.Chart(df_)
 60 |             .mark_circle(opacity=0.6, size=20)
 61 |             .encode(
 62 |                 x=alt.X("x1", axis=None, scale=alt.Scale(zero=False)),
 63 |                 y=alt.Y("x2", axis=None, scale=alt.Scale(zero=False)),
 64 |                 tooltip=["text"],
 65 |                 color=alt.Color("color", sort=["none"] + color_words),
 66 |             )
 67 |             .properties(width=350, height=350, title="embedding space")
 68 |             .add_params(brush)
 69 |         )
 70 | 
 71 |     if color_array is not None:
 72 |         p1 = (
 73 |             alt.Chart(df_)
 74 |             .mark_circle(opacity=0.6, size=20)
 75 |             .encode(
 76 |                 x=alt.X("x1", axis=None, scale=alt.Scale(zero=False)),
 77 |                 y=alt.Y("x2", axis=None, scale=alt.Scale(zero=False)),
 78 |                 tooltip=["text"],
 79 |                 color=alt.Color("color"),
 80 |             )
 81 |             .properties(width=350, height=350, title="embedding space")
 82 |             .add_params(brush)
 83 |         )
 84 | 
 85 |     p2 = (
 86 |         alt.Chart(df_)
 87 |         .mark_text()
 88 |         .encode(
 89 |             x=alt.X("r", axis=None),
 90 |             y=alt.Y("row_number:O", axis=None),
 91 |             text="trunc_text:N",
 92 |         )
 93 |         .transform_window(row_number="row_number()")
 94 |         .transform_filter(brush)
 95 |         .transform_window(rank="rank(row_number)")
 96 |         .transform_filter(alt.datum.rank < 18)
 97 |         .properties(title="text")
 98 |     )
 99 | 
100 |     return (p1 | p2).configure_axis(grid=False).configure_view(strokeWidth=0)
101 | 
def _single_scatter_chart(df_, idx, brush, title="embedding space", color_words=None, color_array=None):
    """
    Build one brushable scatter panel.

    Arguments:
        - `df_`: dataframe holding the `x1`/`y1` and `x2`/`y2` coordinate
          columns, a `text` column and, when coloring is requested, a `color` column
        - `idx`: `1` plots `x1` against `y1`; any other value plots `x2` against `y2`
        - `brush`: the altair selection that is attached to the chart
        - `title`: title shown above the panel
        - `color_words`: list of words to highlight, takes precedence over `color_array`
        - `color_array`: an array that represents color for the plot

    Returns the altair chart for the panel.
    """
    cols = ("x1:Q", "y1:Q") if idx == 1 else ("x2:Q", "y2:Q")
    if color_words:
        # `list(...)` so that tuples and other sequences can be concatenated too.
        color = alt.Color("color", sort=["none"] + list(color_words))
    elif color_array is not None:
        # Compare against None explicitly: the truth value of a numpy array
        # with more than one element is ambiguous and raises a ValueError.
        color = alt.Color("color")
    else:
        # NOTE(review): no `id` column is visible in the dataframe built by the
        # caller - confirm which field this encoding is meant to use.
        color = alt.condition(brush, 'id:O', alt.value('lightgray'), legend=None)
    return (
        alt.Chart(df_)
        .mark_circle(opacity=0.6, size=20)
        .encode(
            x=alt.X(cols[0], axis=None, scale=alt.Scale(zero=False)),
            y=alt.Y(cols[1], axis=None, scale=alt.Scale(zero=False)),
            tooltip=["text"],
            color=color,
        )
        .properties(width=350, height=350, title=title)
        .add_params(brush)
    )
122 | 
def plot_text_comparison(X1, X2, texts, disable_warning=True, color_array=None, color_words=None):
    """
    Make a visualisation that compares two embeddings of the same texts.

    Two scatter panels (one per embedding) share a single brush, and a third
    panel lists the texts (truncated to 120 characters) of the brushed points.

    Arguments:
        - `X1`: the numeric features, should be a 2D numpy array
        - `X2`: the numeric features, should be a 2D numpy array
        - `texts`: list of text data
        - `disable_warning`: disable the standard altair max rows warning
        - `color_words`: list of words to highlight
        - `color_array`: an array that represents color for the plot

    Raises:
        - `ValueError`: when `texts` or `color_array` does not have exactly
          one entry per row of `X1` and `X2`
    """
    if disable_warning:
        alt.data_transformers.disable_max_rows()

    if (len(texts) != X1.shape[0]) or (len(texts) != X2.shape[0]):
        raise ValueError(
            f"The number of text examples ({len(texts)}) should match X1/x2 array X1=({X1.shape[0]}) X2=({X2.shape[0]})."
        )

    df_ = pd.DataFrame({"x1": X1[:, 0], "y1": X1[:, 1], "x2": X2[:, 0], "y2": X2[:, 1], "text": texts}).assign(
        trunc_text=lambda d: d["text"].str[:120], r=0
    )

    if color_array is not None:
        if len(color_array) != X1.shape[0]:
            # X1 and X2 were verified to have the same number of rows above,
            # so reporting the row count of X1 covers both.
            raise ValueError(
                f"The number of color array ({len(color_array)}) should match X array ({X1.shape[0]})."
            )
        df_ = df_.assign(color=color_array)

    # Truthiness check: an empty word list must not wipe the colors that
    # `color_array` just assigned.
    if color_words:
        df_ = df_.assign(color="none")

        for w in color_words:
            predicate = df_["text"].str.lower().str.contains(w)
            df_ = df_.assign(color=lambda d: np.where(predicate, w, d["color"]))

    # One shared brush so a selection in either scatter drives all panels.
    brush = alt.selection_interval()
    p1 = _single_scatter_chart(df_, 1, brush, title="embedding space X1", color_words=color_words, color_array=color_array)
    p2 = _single_scatter_chart(df_, 2, brush, title="embedding space X2", color_words=color_words, color_array=color_array)

    # Text panel: number the rows, keep the brushed ones, then show at most
    # the first 17 of them.
    p3 = (
        alt.Chart(df_)
        .mark_text()
        .encode(
            x=alt.X("r", axis=None),
            y=alt.Y("row_number:O", axis=None),
            text="trunc_text:N",
        )
        .transform_window(row_number="row_number()")
        .transform_filter(brush)
        .transform_window(rank="rank(row_number)")
        .transform_filter(alt.datum.rank < 18)
        .properties(title="text")
    )

    return (p1 | p2 | p3).configure_axis(grid=False).configure_view(strokeWidth=0)


--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
 1 | <script src="https://cdn.tailwindcss.com"></script>
 2 | <script src="https://cdn.jsdelivr.net/npm/vega@5"></script>
 3 | <script src="https://cdn.jsdelivr.net/npm/vega-lite@5"></script>
 4 | <script src="https://cdn.jsdelivr.net/npm/vega-embed@6"></script>
 5 | <script src="https://cdn.jsdelivr.net/gh/koaning/justcharts/justcharts.js"></script>
 6 | 
 7 | <div class="px-12">
 8 |     <br>
 9 |     <h2 class="font-bold text-2xl text-center">TfIdf with SVD</h2>
10 |     <br>
11 |     <vegachart schema-url="plot_one.json"></vegachart>
12 | 
13 |     <h2 class="font-bold text-2xl text-center">TfIdf with UMAP</h2>
14 |     <br>
15 |     <vegachart schema-url="plot_two.json"></vegachart>
16 | 
17 |     <h2 class="font-bold text-2xl text-center">TfIdf with UMAP and Color</h2>
18 |     <br>
19 |     <vegachart schema-url="plot_three.json"></vegachart>
20 | 
21 |     <h2 class="font-bold text-2xl text-center">Universal Sentence Encoder with UMAP and Word Color</h2>
22 |     <br>
23 |     <vegachart schema-url="plot_four.json"></vegachart>
24 | 
25 |     <h2 class="font-bold text-2xl text-center">Universal Sentence Encoder with UMAP and Class Predicted Color</h2>
26 |     <br>
27 |     <vegachart schema-url="plot_five.json"></vegachart>
28 | 
29 |     <h2 class="font-bold text-2xl text-center">Comparison of two embeddings side by side</h2>
30 |     <br>
31 |     <vegachart schema-url="plot_six.json"></vegachart>
32 | </div>
33 | 


--------------------------------------------------------------------------------
/gif-multi.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/gif-multi.gif


--------------------------------------------------------------------------------
/gif.gif:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/gif.gif


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | import pathlib
 2 | from setuptools import setup, find_packages
 3 | 
 4 | 
 5 | base_packages = ["altair>=5.0.0", "numpy>=1.19.2"]
 6 | 
 7 | dev_packages = base_packages + [
 8 |     "pytest>=4.0.2",
 9 |     "black>=19.3b0",
10 |     "pytest-cov>=2.6.1",
11 |     "pre-commit>=2.2.0",
12 |     "jupyterlab",
13 | ]
14 | 
15 | 
16 | setup(
17 |     name="cluestar",
18 |     version="0.2.1",
19 |     author="Vincent D. Warmerdam",
20 |     packages=find_packages(exclude=["notebooks", "docs"]),
21 |     description="Gain a clue by clustering!",
22 |     long_description=pathlib.Path("README.md").read_text(),
23 |     long_description_content_type="text/markdown",
24 |     url="https://github.com/koaning/cluestar/",
25 |     project_urls={
26 |         "Documentation": "https://github.com/koaning/cluestar/",
27 |         "Source Code": "https://github.com/koaning/cluestar/",
28 |         "Issue Tracker": "https://github.com/koaning/cluestar/issues",
29 |     },
30 |     install_requires=base_packages,
31 |     extras_require={"base": base_packages, "dev": dev_packages},
32 |     classifiers=[
33 |         "Intended Audience :: Science/Research",
34 |         "Programming Language :: Python :: 3",
35 |         "Programming Language :: Python :: 3.8",
36 |         "Programming Language :: Python :: 3.9",
37 |         "Programming Language :: Python :: 3.10",
38 |         "Programming Language :: Python :: 3.11",
39 |         "License :: OSI Approved :: MIT License",
40 |         "Topic :: Scientific/Engineering",
41 |         "Topic :: Scientific/Engineering :: Artificial Intelligence",
42 |     ],
43 | )
44 | 


--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/tests/__init__.py


--------------------------------------------------------------------------------
/tests/test_smoke.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | import numpy as np 
 3 | from cluestar import plot_text, plot_text_comparison
 4 | 
 5 | 
 6 | @pytest.mark.parametrize("s", [500, 1000, 2000])
 7 | def test_smoke_plot_text(s):
 8 |     texts = ["random {i}" for i in range(s)]
 9 |     X = np.random.normal(0, 1, (s, 2))
10 |     assert texts[0] in plot_text(X, texts).to_json()
11 | 
12 | 
@pytest.mark.parametrize("s", [500, 1000, 2000])
def test_smoke_plot_text_comparison(s):
    """Smoke test: the comparison chart renders and its JSON spec embeds the input texts."""
    # f-string so that every row gets its own distinct text.
    texts = [f"random {i}" for i in range(s)]
    X1 = np.random.normal(0, 1, (s, 2))
    X2 = np.random.normal(0, 1, (s, 2))
    assert texts[0] in plot_text_comparison(X1, X2, texts).to_json()
19 | 


--------------------------------------------------------------------------------