├── .flake8
├── .github
└── workflows
│ └── unittest.yml
├── .gitignore
├── LICENSE
├── Makefile
├── README.md
├── cluestar.png
├── cluestar
└── __init__.py
├── data
└── tesco_support.csv
├── docs
├── index.html
├── plot_five.json
├── plot_four.json
├── plot_one.json
├── plot_six.json
├── plot_three.json
└── plot_two.json
├── gif-multi.gif
├── gif.gif
├── notebooks
└── overview.ipynb
├── setup.py
└── tests
├── __init__.py
└── test_smoke.py
/.flake8:
--------------------------------------------------------------------------------
1 | [flake8]
2 | per-file-ignores = */__init__.py: F401
3 | max-line-length = 160
4 | ignore = E203
--------------------------------------------------------------------------------
/.github/workflows/unittest.yml:
--------------------------------------------------------------------------------
# Runs the test suite on pushes and pull requests that target `main`.
name: Code Checks

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ubuntu-latest
    strategy:
      matrix:
        python-version: ["3.11"]

    steps:
      - uses: actions/checkout@v4
      - name: Set up Python ${{ matrix.python-version }}
        # The `cache` input is not available in setup-python v1, so a newer
        # major version of the action is required for pip caching to work.
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
          cache: 'pip'
          # The repository declares its dependencies in setup.py only, so the
          # cache key has to be derived from that file explicitly.
          cache-dependency-path: setup.py
      - name: Install Dependencies
        run: python -m pip install -e ".[dev]"
      - name: Unittest
        run: python -m pytest
29 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | pip-wheel-metadata/
24 | share/python-wheels/
25 | *.egg-info/
26 | .installed.cfg
27 | *.egg
28 | MANIFEST
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .nox/
44 | .coverage
45 | .coverage.*
46 | .cache
47 | nosetests.xml
48 | coverage.xml
49 | *.cover
50 | *.py,cover
51 | .hypothesis/
52 | .pytest_cache/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | target/
76 |
77 | # Jupyter Notebook
78 | .ipynb_checkpoints
79 |
80 | # IPython
81 | profile_default/
82 | ipython_config.py
83 |
84 | # pyenv
85 | .python-version
86 |
87 | # pipenv
88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
91 | # install all needed dependencies.
92 | #Pipfile.lock
93 |
94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95 | __pypackages__/
96 |
97 | # Celery stuff
98 | celerybeat-schedule
99 | celerybeat.pid
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 |
128 | # Pyre type checker
129 | .pyre/
130 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2022 vincent d warmerdam
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
# Every target below is a command name, not a file to build. Declaring them
# phony keeps them runnable even if a file or directory with the same name
# ever appears in the repository root.
.PHONY: black flake install interrogate pypi clean style check

# Format the package, the tests and setup.py in place.
black:
	black --target-version py38 cluestar tests setup.py

# Lint with the settings from .flake8.
flake:
	flake8 cluestar tests setup.py

# Editable install with the dev extras plus the tooling used by this Makefile.
install:
	python -m pip install --upgrade pip
	python -m pip install -e ".[dev]"
	python -m pip install black flake8 isort interrogate twine wheel

# Fail unless every public function in cluestar has a docstring.
interrogate:
	interrogate -vv --ignore-nested-functions --ignore-semiprivate --ignore-private --ignore-magic --ignore-module --ignore-init-method --fail-under 100 cluestar

# Build the sdist and wheel, then upload them.
pypi:
	python setup.py sdist
	python setup.py bdist_wheel --universal
	twine upload dist/*

# Remove notebook checkpoints and Python/pytest caches.
clean:
	rm -rf **/.ipynb_checkpoints **/.pytest_cache **/__pycache__ **/**/__pycache__ .ipynb_checkpoints .pytest_cache

style: clean black flake interrogate clean

check: clean black flake interrogate clean
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | ### cluestar
4 |
5 | > Gain a clue by clustering!
6 |
7 | This library contains visualisation tools that might help you
8 | get started with classification tasks. The idea is that if you
9 | can inspect clusters easily, you might gain a clue on what
10 | good labels for your dataset might be!
11 |
12 | It generates charts that look like this:
13 |
14 | ![](gif.gif)
15 |
16 | There's even a fancy chart that can compare embedding techniques.
17 |
18 | ![](gif-multi.gif)
19 |
20 | ## Install
21 |
22 | ```text
23 | python -m pip install cluestar
24 | ```
25 |
26 | ## Interactive Demo
27 |
28 | You can see an interactive demo of the generated widgets [here](https://koaning.github.io/cluestar/).
29 |
30 | You can also toy around with the demo notebook found [here](https://github.com/koaning/cluestar/blob/main/notebooks/overview.ipynb).
31 |
32 | ## Usage
33 |
34 | The first step is to encode text data in two dimensions, like below.
35 |
36 | ```python
37 | from sklearn.pipeline import make_pipeline
38 | from sklearn.decomposition import TruncatedSVD
39 | from sklearn.feature_extraction.text import TfidfVectorizer
40 |
41 | pipe = make_pipeline(TfidfVectorizer(), TruncatedSVD(n_components=2))
42 |
43 | X = pipe.fit_transform(texts)
44 | ```
45 |
46 | From here you can make an interactive chart via:
47 |
48 | ```python
49 | from cluestar import plot_text
50 |
51 | plot_text(X, texts)
52 | ```
53 |
54 | The best results are likely found when you use
55 | [umap](https://umap-learn.readthedocs.io/en/latest/)
56 | together with something like
57 | [universal sentence encoder](https://koaning.github.io/whatlies/api/language/universal_sentence/).
58 |
59 | You might also make the chart easier to understand by highlighting
60 | the points whose text contains certain words.
61 |
62 | ```python
63 | plot_text(X, texts, color_words=["plastic", "voucher", "deliver"])
64 | ```
65 |
66 | You can also pass a numeric array, such as the predicted probabilities
67 | from a model, to influence the color.
68 |
69 | ```python
70 | # First, get an array of pvals from some model
71 | p_vals = some_model.predict_proba(texts)[:, 0]
72 | # Use these to assign pretty colors.
73 | plot_text(X, texts, color_array=p_vals)
74 | ```
75 |
76 | You can also compare two embeddings interactively. To do this:
77 |
78 | ```python
79 | from cluestar import plot_text_comparison
80 |
81 | plot_text_comparison(X1=X, X2=X, texts=texts)
82 | ```
83 |
--------------------------------------------------------------------------------
/cluestar.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/koaning/cluestar/734afadc533597a48fce629d6207f944e8a478a4/cluestar.png
--------------------------------------------------------------------------------
/cluestar/__init__.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import pandas as pd
3 | import altair as alt
4 |
5 |
def plot_text(X, texts, color_array=None, color_words=None, disable_warning=True):
    """
    Make a visualisation to help find clues in text data.

    The result has two panels: a scatter chart of the first two columns of `X`
    with an interval (brush) selection, and a text panel that lists the
    truncated texts (first 120 characters) of the brushed points.

    Arguments:
        - `X`: the numeric features, should be a 2D array (anything `np.asarray` accepts)
        - `texts`: list of text data, one item per row of `X`
        - `color_array`: an array that represents color for the plot
        - `color_words`: list of words to highlight, matched case-insensitively;
          when several words match a text, the last one in the list wins
        - `disable_warning`: disable the standard altair max rows warning

    Returns the combined altair chart.

    Raises `ValueError` when `texts` or `color_array` do not have one item per row of `X`.

    When both `color_array` and `color_words` are given, the word highlights
    overwrite the values of `color_array` in the color column.
    """
    if disable_warning:
        alt.data_transformers.disable_max_rows()

    # Accept lists and dataframes as well as numpy arrays.
    X = np.asarray(X)
    n_rows = X.shape[0]

    if len(texts) != n_rows:
        raise ValueError(
            f"The number of text examples ({len(texts)}) should match X array ({n_rows})."
        )

    # `r` is a constant x-position so the text panel renders as a single column.
    df_ = pd.DataFrame({"x1": X[:, 0], "x2": X[:, 1], "text": texts}).assign(
        trunc_text=lambda d: d["text"].str[:120], r=0
    )

    if color_array is not None:
        if len(color_array) != n_rows:
            raise ValueError(
                f"The number of color array ({len(color_array)}) should match X array ({n_rows})."
            )
        df_ = df_.assign(color=color_array)

    if color_words:
        df_ = df_.assign(color="none")

        for w in color_words:
            # case=False so that a word such as "Voucher" still matches; the
            # previous code lower-cased the text but not the word.
            # na=False so that missing texts are never highlighted.
            predicate = df_["text"].str.contains(w, case=False, na=False)
            df_ = df_.assign(color=np.where(predicate, w, df_["color"]))

    brush = alt.selection_interval()

    # Build the scatter chart once; only the color channel differs per mode.
    encodings = {
        "x": alt.X("x1", axis=None, scale=alt.Scale(zero=False)),
        "y": alt.Y("x2", axis=None, scale=alt.Scale(zero=False)),
        "tooltip": ["text"],
    }
    if color_array is not None:
        encodings["color"] = alt.Color("color")
    elif color_words:
        encodings["color"] = alt.Color("color", sort=["none"] + list(color_words))

    p1 = (
        alt.Chart(df_)
        .mark_circle(opacity=0.6, size=20)
        .encode(**encodings)
        .properties(width=350, height=350, title="embedding space")
        .add_params(brush)
    )

    # Text panel: number the rows, keep the brushed ones, then show only the
    # first 17 of them.
    p2 = (
        alt.Chart(df_)
        .mark_text()
        .encode(
            x=alt.X("r", axis=None),
            y=alt.Y("row_number:O", axis=None),
            text="trunc_text:N",
        )
        .transform_window(row_number="row_number()")
        .transform_filter(brush)
        .transform_window(rank="rank(row_number)")
        .transform_filter(alt.datum.rank < 18)
        .properties(title="text")
    )

    return (p1 | p2).configure_axis(grid=False).configure_view(strokeWidth=0)
101 |
def _single_scatter_chart(df_, idx, brush, title="embedding space", color_words=None, color_array=None):
    """
    Build one scatter panel for `plot_text_comparison`.

    Arguments:
        - `df_`: dataframe holding the coordinate columns, a `text` column and,
          when colors are used, a `color` column
        - `idx`: `1` plots columns `x1`/`y1`, any other value plots `x2`/`y2`
        - `brush`: the altair selection parameter attached to the chart
        - `title`: title of the panel
        - `color_words`: list of highlighted words, used to order the color legend
        - `color_array`: the color values; only checked for presence here

    Returns the altair chart for the panel.
    """
    x_field, y_field = ("x1:Q", "y1:Q") if idx == 1 else ("x2:Q", "y2:Q")

    if color_words:
        color = alt.Color("color", sort=["none"] + list(color_words))
    elif color_array is not None:
        # Explicit None check: the truth value of a numpy array with more than
        # one element is ambiguous and raises a ValueError.
        color = alt.Color("color")
    else:
        # NOTE(review): the dataframe built by plot_text_comparison has no `id`
        # column, so confirm that 'id:O' is the intended field here.
        color = alt.condition(brush, 'id:O', alt.value('lightgray'), legend=None)

    return (
        alt.Chart(df_)
        .mark_circle(opacity=0.6, size=20)
        .encode(
            x=alt.X(x_field, axis=None, scale=alt.Scale(zero=False)),
            y=alt.Y(y_field, axis=None, scale=alt.Scale(zero=False)),
            tooltip=["text"],
            color=color,
        )
        .properties(width=350, height=350, title=title)
        .add_params(brush)
    )
122 |
def plot_text_comparison(X1, X2, texts, disable_warning=True, color_array=None, color_words=None):
    """
    Make a visualisation that compares two embeddings of the same texts.

    The result has three panels: one scatter chart per embedding, sharing a
    single interval (brush) selection, and a text panel that lists the
    truncated texts (first 120 characters) of the brushed points.

    Arguments:
        - `X1`: the numeric features, should be a 2D array (anything `np.asarray` accepts)
        - `X2`: the numeric features, should be a 2D array (anything `np.asarray` accepts)
        - `texts`: list of text data, one item per row of `X1` and `X2`
        - `disable_warning`: disable the standard altair max rows warning
        - `color_array`: an array that represents color for the plot
        - `color_words`: list of words to highlight, matched case-insensitively;
          when several words match a text, the last one in the list wins

    Returns the combined altair chart.

    Raises `ValueError` when `texts` or `color_array` do not have one item per row of `X1`/`X2`.
    """
    if disable_warning:
        alt.data_transformers.disable_max_rows()

    # Accept lists and dataframes as well as numpy arrays.
    X1 = np.asarray(X1)
    X2 = np.asarray(X2)

    if (len(texts) != X1.shape[0]) or (len(texts) != X2.shape[0]):
        raise ValueError(
            f"The number of text examples ({len(texts)}) should match X1/x2 array X1=({X1.shape[0]}) X2=({X2.shape[0]})."
        )

    # `r` is a constant x-position so the text panel renders as a single column.
    df_ = pd.DataFrame({"x1": X1[:, 0], "y1": X1[:, 1], "x2": X2[:, 0], "y2": X2[:, 1], "text": texts}).assign(
        trunc_text=lambda d: d["text"].str[:120], r=0
    )

    if color_array is not None:
        if len(color_array) != X1.shape[0]:
            # The message used to reference an undefined name `X`, which turned
            # this ValueError into a NameError.
            raise ValueError(
                f"The number of color array ({len(color_array)}) should match X array ({X1.shape[0]})."
            )
        df_ = df_.assign(color=color_array)

    # Truthiness check (not `is not None`) so that an empty list does not wipe
    # the values of `color_array` from the color column.
    if color_words:
        df_ = df_.assign(color="none")

        for w in color_words:
            # case=False so that a word such as "Voucher" still matches; the
            # previous code lower-cased the text but not the word.
            # na=False so that missing texts are never highlighted.
            predicate = df_["text"].str.contains(w, case=False, na=False)
            df_ = df_.assign(color=np.where(predicate, w, df_["color"]))

    # Both scatter panels share one brush so a selection in either drives the text panel.
    brush = alt.selection_interval()
    p1 = _single_scatter_chart(df_, 1, brush, title="embedding space X1", color_words=color_words, color_array=color_array)
    p2 = _single_scatter_chart(df_, 2, brush, title="embedding space X2", color_words=color_words, color_array=color_array)

    # Text panel: number the rows, keep the brushed ones, then show only the
    # first 17 of them.
    p3 = (
        alt.Chart(df_)
        .mark_text()
        .encode(
            x=alt.X("r", axis=None),
            y=alt.Y("row_number:O", axis=None),
            text="trunc_text:N",
        )
        .transform_window(row_number="row_number()")
        .transform_filter(brush)
        .transform_window(rank="rank(row_number)")
        .transform_filter(alt.datum.rank < 18)
        .properties(title="text")
    )

    return (p1 | p2 | p3).configure_axis(grid=False).configure_view(strokeWidth=0)
--------------------------------------------------------------------------------
/docs/index.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |