├── experiments
├── 02_09_2023_16_54_32
│ ├── reducer_umap_15.pkl
│ ├── reducer_umap_2.pkl
│ ├── clusterer_hdbscan.pkl
│ ├── assets
│ │ ├── clusters_viz_1.png
│ │ ├── exemplars_viz_1.png
│ │ ├── exemplars_viz_2.png
│ │ ├── cluster0_subcluster0.png
│ │ ├── cluster0_subcluster1.png
│ │ ├── cluster1_subcluster2.png
│ │ ├── cluster1_subcluster3.png
│ │ ├── cluster1_subcluster4.png
│ │ └── cluster1_subcluster5.png
│ ├── clusterer_subs_hdbscan.pkl
│ ├── prompts_embeddings_all_mpnet_base_v2.pt
│ └── prompts_dataframe_cached_with_results.xlsx
├── 03_09_2023_15_14_39
│ ├── reducer_umap_15.pkl
│ ├── reducer_umap_2.pkl
│ ├── clusterer_hdbscan.pkl
│ ├── clusterer_subs_hdbscan.pkl
│ ├── prompts_embeddings_all_mpnet_base_v2.pt
│ └── prompts_dataframe_cached_with_results.xlsx
└── 04_09_2023_03_02_25
│ └── assets
│ ├── aspens_runway.jpeg
│ ├── batman_midjourney.png
│ ├── selected_5_themes.png
│ ├── futuristic_car_midjourney.png
│ ├── selected_25_cluster_themes.png
│ └── traveler_wanderer_runway.jpeg
├── requirements.txt
├── .gitignore
├── README.md
├── LICENSE
└── notebooks
└── stable-diffusion-prompts-clustering.ipynb
/experiments/02_09_2023_16_54_32/reducer_umap_15.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/reducer_umap_15.pkl
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/reducer_umap_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/reducer_umap_2.pkl
--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/reducer_umap_15.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/reducer_umap_15.pkl
--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/reducer_umap_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/reducer_umap_2.pkl
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/clusterer_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/clusterer_hdbscan.pkl
--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/clusterer_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/clusterer_hdbscan.pkl
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png
--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/aspens_runway.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/aspens_runway.jpeg
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/clusterer_subs_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/clusterer_subs_hdbscan.pkl
--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/clusterer_subs_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/clusterer_subs_hdbscan.pkl
--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/batman_midjourney.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/batman_midjourney.png
--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/selected_5_themes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/selected_5_themes.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster1.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster2.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster3.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster4.png
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster5.png
--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/futuristic_car_midjourney.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/futuristic_car_midjourney.png
--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/selected_25_cluster_themes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/selected_25_cluster_themes.png
--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/traveler_wanderer_runway.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/traveler_wanderer_runway.jpeg
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | tqdm
2 | datasets
3 | umap-learn
4 | hdbscan
5 | sentence-transformers
6 | numpy
7 | torch
8 | openai
9 | pandas
10 | openpyxl
11 | seaborn
12 | plotly
13 | UliPlot
14 | tiktoken
15 | cleantext
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/prompts_embeddings_all_mpnet_base_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/prompts_embeddings_all_mpnet_base_v2.pt
--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/prompts_embeddings_all_mpnet_base_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/prompts_embeddings_all_mpnet_base_v2.pt
--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/prompts_dataframe_cached_with_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/prompts_dataframe_cached_with_results.xlsx
--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/prompts_dataframe_cached_with_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/prompts_dataframe_cached_with_results.xlsx
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 |
29 | # PyInstaller
30 | # Usually these files are written by a python script from a template
31 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
32 | *.manifest
33 | *.spec
34 |
35 | # Installer logs
36 | pip-log.txt
37 | pip-delete-this-directory.txt
38 |
39 | # Unit test / coverage reports
40 | htmlcov/
41 | .tox/
42 | .nox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | *.py,cover
50 | .hypothesis/
51 | .pytest_cache/
52 | cover/
53 |
54 | # Translations
55 | *.mo
56 | *.pot
57 |
58 | # Django stuff:
59 | *.log
60 | local_settings.py
61 | db.sqlite3
62 | db.sqlite3-journal
63 |
64 | # Flask stuff:
65 | instance/
66 | .webassets-cache
67 |
68 | # Scrapy stuff:
69 | .scrapy
70 |
71 | # Sphinx documentation
72 | docs/_build/
73 |
74 | # PyBuilder
75 | .pybuilder/
76 | target/
77 |
78 | # Jupyter Notebook
79 | .ipynb_checkpoints
80 |
81 | # IPython
82 | profile_default/
83 | ipython_config.py
84 |
85 | # pyenv
86 | # For a library or package, you might want to ignore these files since the code is
87 | # intended to run in multiple environments; otherwise, check them in:
88 | # .python-version
89 |
90 | # pipenv
91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
94 | # install all needed dependencies.
95 | #Pipfile.lock
96 |
97 | # poetry
98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more
100 | # commonly ignored for libraries.
101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 |
104 | # pdm
105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | # in version control.
109 | # https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 |
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 |
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 |
119 | # SageMath parsed files
120 | *.sage.py
121 |
122 | # Environments
123 | .env
124 | .venv*
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 |
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 |
135 | # Rope project settings
136 | .ropeproject
137 |
138 | # mkdocs documentation
139 | /site
140 |
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 |
146 | # Pyre type checker
147 | .pyre/
148 |
149 | # pytype static type analyzer
150 | .pytype/
151 |
152 | # Cython debug symbols
153 | cython_debug/
154 |
155 | # PyCharm
156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | # and can be added to the global gitignore or merged into this file. For a more nuclear
159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Text clustering: HDBSCAN is probably all you need
2 |
3 | [License: Apache 2.0](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/blob/main/LICENSE)
4 | [Python 3.9](https://www.python.org/downloads/release/python-390/)
5 | [Code style: black](https://github.com/psf/black)
6 |
7 | ## Goal
8 |
9 | Segment common items in a text dataset to pinpoint core themes and their distribution.
10 |
11 | * Clusters cover the main topics/subtopics in the dataset
12 | * Clusters backed by accurate, LLM-generated summaries
13 |
14 | ## Background
15 |
16 | We employ [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/index.html) for probabilistic clustering. This algorithm is advantageous in many ways, including:
17 |
18 | * Don’t be wrong: Clusters can have varying densities, don’t need to be globular, and won’t include noise
19 | * Intuitive parameters: Choosing a minimum cluster size is very reasonable, and the number of clusters *k* does not need to be specified (HDBSCAN finds the optimal *k* for you)
20 | * Stability: HDBSCAN is stable over runs and subsampling and has good stability over parameter choices
21 | * Performance: When implemented well HDBSCAN can be very efficient; the current implementation has similar performance to fastcluster’s agglomerative clustering
22 |
23 | See the HDBSCAN docs on [comparing clustering algorithms](https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html#hdbscan) and [how hdbscan works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for more information.
24 |
25 | ## Citations
26 |
27 | * Datasets
28 | * [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts)
29 | * [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts)
30 | * Embedding models
31 | * [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)
32 |
33 | ## Experiments
34 |
35 | ## 1. Visualizing core themes in [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts)
36 |
37 | These figures correspond to [`experiments/02_09_2023_16_54_32`](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/tree/main/experiments/02_09_2023_16_54_32)
38 |
39 |
40 |
41 | ---
42 |
43 | 
44 |
45 | **Figure 1**. HDBSCAN splits the 153 text-to-text prompts from [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts) into two clusters: Cluster 1 with 44 prompts (orange) and Cluster 2 with 105 prompts (blue). The 4 remaining prompts (gray) were filtered out as outliers/noise.
46 |
47 | 
48 |
49 | **Figure 2**. The most persistent prompts in each leaf cluster are known as "exemplars". These represent the hearts around which the ultimate cluster formed. See the HDBSCAN docs on [soft clustering explanation](https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html#distance-based-membership) for supporting information and functions.
50 |
51 | 
52 |
53 | **Figure 3**. Additional clustering is conducted around the exemplars to identify sub-topics in the dataset. The cases in each sub-cluster subsequently serve as retrieved context for the LLM theme summarization calls below.
54 |
55 | 
56 |
57 | **Figure 4**. Visualizing the "*Computer Programming and Software Development*" theme, which covers 13% of the dataset. The summary was generated by [gpt-3.5-turbo-16k](https://platform.openai.com/docs/models/gpt-3-5). The above was created with [jsoncrack.com/editor](https://jsoncrack.com/editor).
58 |
59 |
60 |
61 |
62 | ## 2. Drift detection for [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts)
63 |
64 | These figures correspond to [`experiments/04_09_2023_03_02_25`](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/tree/main/experiments/04_09_2023_03_02_25)
65 |
66 |
67 |
68 |
69 |
70 | ---
71 |
72 | HDBSCAN splits the 73,718 text-to-image prompts from [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) into 78 clusters with 25,019 (34%) of the dataset represented. The remaining 48,699 (66%) were filtered out as outliers/noise. The 5 largest clusters cover 9.5% of the dataset — these are the segments we will examine for drift below.
73 |
74 | | cluster id | theme |
75 | |------------|--------------|
76 | | 56 | Portraits and artistic depictions of female anime characters, beautiful women, and fashionable young women |
77 | | 13 | Symmetrical portraits of people, characters, and sci-fi figures |
78 | | 61 | Futuristic sci-fi spaceship concept art |
79 | | 50 | Portraits of famous actresses as characters in various roles, outfits, and styles |
80 | | 74 | Surreal, cinematic, and futuristic digital art |
81 |
82 | | cluster id | train count<br>(73.7k rows) | test count<br>(8.19k rows) | drift detection<br>(% change) |
83 | |------------|-------------------------------|------------------------------|------------------|
84 | | 56 | 2530 (3.43%) | 310 (3.79%) | 10.50 |
85 | | 13 | 1343 (1.82%) | 149 (1.82%) | 0.00 |
86 | | 61 | 1287 (1.75%) | 131 (1.60%) | -8.57 |
87 | | 50 | 1055 (1.43%) | 135 (1.65%) | 15.38 |
88 | | 74 | 749 (1.02%) | 109 (1.33%) | 30.39 |
89 |
90 |
91 | **Tables 1 & 2**. Drift detection for the 5 largest clusters (bottom), alongside their [claude-2](https://claude.ai/) summaries (top).
92 |
93 |
94 |
95 |
96 |
97 | **Prompt**: "*Beautiful painting of an Aspen forest at sunset, digital art, award winning illustration, golden hour, smooth, sharp lines, concept art, trending on artstation*"
98 | **Model**: [Runway Gen-2](https://app.runwayml.com/video-tools/teams/dryanfurman/ai-tools/text-to-image) (accessed by Daniel Furman on Sep 4, 2023)
99 | **Theme**: Beautiful landscape paintings and matte art (cluster id: 75)
100 |
101 |
102 |
103 |
104 |
105 | **Prompt**: "*Futuristic batman, brush strokes, oil painting, greg rutkowski*"
106 | **Model**: [Midjourney V5.2](https://www.midjourney.com/app/) (accessed by Daniel Furman on Sep 4, 2023)
107 | **Theme**: Art and portraits of Batman characters (cluster id: 41)
108 |
109 |
110 |
111 | **Prompt**: "*Futuristic Porsche designed by Apple, a detailed matte painting by Kitagawa Utamaro, cgsociety, octane render, highly detailed, matte painting, concept art, sci-fi*"
112 | **Model**: [Midjourney V5.2](https://www.midjourney.com/app/) (accessed by Daniel Furman on Sep 4, 2023)
113 | **Theme**: Futuristic and fantasy vehicle concept art (cluster id: 52)
114 |
115 |
116 | **Figure 5**. A sample of 3 text-to-image generations with various models for prompts from the [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) dataset (alongside their cluster id).
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/notebooks/stable-diffusion-prompts-clustering.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "id": "LQ-HKp9u4kTR"
7 | },
8 | "source": [
9 | "# Text clustering: HDBSCAN is probably all you need\n",
10 | "\n",
11 | "\n",
12 | "\n",
13 | ""
14 | ]
15 | },
16 | {
17 | "cell_type": "markdown",
18 | "metadata": {
19 | "id": "IpEhs9ujlcQ1"
20 | },
21 | "source": [
22 | "## Sections\n",
23 | "\n",
24 | "1. Setup\n",
25 | "2. Data I/O\n",
26 | "3. Embed text\n",
27 | "4. Clustering\n",
28 | "5. Exemplar sub-clustering\n",
29 | "6. Knowledge graph theming\n",
30 | "7. Write final df results to disk\n",
31 | "8. Create a JSON knowledge graph viz"
32 | ]
33 | },
34 | {
35 | "cell_type": "markdown",
36 | "metadata": {
37 | "id": "ZMZjJxWGeSZP"
38 | },
39 | "source": [
40 | "## Setup"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "colab": {
48 | "base_uri": "https://localhost:8080/"
49 | },
50 | "id": "a6GR5Tfzx2z9",
51 | "outputId": "0ccecdb6-b57c-4c05-de12-196770056162"
52 | },
53 | "outputs": [],
54 | "source": [
55 | "# copying larger files to GDrive storage for this experiment\n",
56 | "\n",
57 | "from google.colab import drive\n",
58 | "\n",
59 | "drive.mount(\"/content/drive\")"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": null,
65 | "metadata": {
66 | "id": "-mA_PgxV4KV2"
67 | },
68 | "outputs": [],
69 | "source": [
70 | "!git clone https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering.git"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": null,
76 | "metadata": {
77 | "id": "mO55HvFB3egw"
78 | },
79 | "outputs": [],
80 | "source": [
81 | "# for a local run, use the commands below to set up a new venv\n",
82 | "\n",
83 | "#!python -m venv .venv_clust_demo\n",
84 | "#!source .venv_clust_demo/bin/activate\n",
85 | "#!pip install --upgrade pip\n",
86 | "#!pip list"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": null,
92 | "metadata": {
93 | "colab": {
94 | "base_uri": "https://localhost:8080/"
95 | },
96 | "id": "96JEWSNtljzH",
97 | "outputId": "04f47fc2-588e-4557-ac04-16989c152ee7"
98 | },
99 | "outputs": [],
100 | "source": [
101 | "import os\n",
102 | "\n",
103 | "os.chdir(\"/content/awesome-chatgpt-prompts-clustering\")\n",
104 | "!ls"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": null,
110 | "metadata": {
111 | "id": "GeFaTjyW2Gk7"
112 | },
113 | "outputs": [],
114 | "source": [
115 | "!pip install -qUr requirements.txt"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": null,
121 | "metadata": {
122 | "colab": {
123 | "base_uri": "https://localhost:8080/"
124 | },
125 | "id": "8nBmAAxvye3w",
126 | "outputId": "04ea9f9f-b092-40da-f5cb-39b468509561"
127 | },
128 | "outputs": [],
129 | "source": [
130 | "os.chdir(\"../..\")\n",
131 | "!ls"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": null,
137 | "metadata": {
138 | "id": "lPBnmvEd3egy"
139 | },
140 | "outputs": [],
141 | "source": [
142 | "#!pip list"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": null,
148 | "metadata": {
149 | "id": "NXc0G4wQ3egy"
150 | },
151 | "outputs": [],
152 | "source": [
153 | "import argparse\n",
154 | "import os\n",
155 | "from tqdm.notebook import tqdm\n",
156 | "import datetime\n",
157 | "import json\n",
158 | "import pickle\n",
159 | "import numpy as np\n",
160 | "from numpy import random\n",
161 | "import pandas as pd\n",
162 | "import seaborn as sns\n",
163 | "import matplotlib.pyplot as plt\n",
164 | "import plotly.graph_objects as go\n",
165 | "\n",
166 | "import umap\n",
167 | "from datasets import load_dataset\n",
168 | "from sentence_transformers import SentenceTransformer\n",
169 | "import torch\n",
170 | "import hdbscan\n",
171 | "from sklearn.metrics.pairwise import euclidean_distances\n",
172 | "import openai\n",
173 | "import tiktoken\n",
174 | "import cleantext\n",
175 | "\n",
176 | "from UliPlot.XLSX import auto_adjust_xlsx_column_width"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": null,
182 | "metadata": {
183 | "colab": {
184 | "base_uri": "https://localhost:8080/"
185 | },
186 | "id": "TnRCOUWK9hkn",
187 | "outputId": "e347ce5a-844b-4dfb-99ff-7f25a071619e"
188 | },
189 | "outputs": [],
190 | "source": [
191 | "args = argparse.Namespace()\n",
192 | "args.inference = True\n",
193 | "args"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "metadata": {
200 | "colab": {
201 | "base_uri": "https://localhost:8080/"
202 | },
203 | "id": "m89BDk6WXSZM",
204 | "outputId": "4ea9d179-75aa-41cd-9a8c-e3022e72d0ff"
205 | },
206 | "outputs": [],
207 | "source": [
208 | "now = datetime.datetime.now()\n",
209 | "# identifier format: dd_mm_YYYY_HH_MM_SS\n",
210 | "dt_string = now.strftime(\"%d_%m_%Y_%H_%M_%S\")\n",
211 | "\n",
212 | "# hardcode in an existing experiment datetime for inference runs\n",
213 | "\n",
214 | "if args.inference:\n",
215 | " # dt_string identifiers from cached experiments:\n",
216 | " dt_string = \"04_09_2023_03_02_25\"\n",
217 | "\n",
218 | "print(\"experiment's datetime identifier =\", dt_string)\n",
219 | "\n",
220 | "# create results folder if it doesn't exist\n",
221 | "if not os.path.isdir(\n",
222 | " f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n",
223 | "):\n",
224 | " os.mkdir(\n",
225 | " f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n",
226 | " )"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {
233 | "colab": {
234 | "base_uri": "https://localhost:8080/"
235 | },
236 | "id": "3EzBV-Mm6C90",
237 | "outputId": "42ff75a2-11c5-474a-90bd-a0204885e6ba"
238 | },
239 | "outputs": [],
240 | "source": [
241 | "args.cache_folder = (\n",
242 | " f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n",
243 | ")\n",
244 | "args"
245 | ]
246 | },
247 | {
248 | "cell_type": "markdown",
249 | "metadata": {
250 | "id": "ZRwI0I6IeVNr"
251 | },
252 | "source": [
253 | "## Data I/O"
254 | ]
255 | },
256 | {
257 | "cell_type": "code",
258 | "execution_count": null,
259 | "metadata": {
260 | "colab": {
261 | "base_uri": "https://localhost:8080/",
262 | "height": 424
263 | },
264 | "id": "_BNPvSprLRzM",
265 | "outputId": "b9559cd9-8111-42f6-edab-272b616ae73d"
266 | },
267 | "outputs": [],
268 | "source": [
269 | "ds_hf = load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")\n",
270 | "ds = ds_hf[\"train\"]\n",
271 | "\n",
272 | "ds = ds.to_pandas()\n",
273 | "ds[\"id\"] = ds.index\n",
274 | "ds = ds[[\"id\", \"Prompt\"]]\n",
275 | "ds"
276 | ]
277 | },
278 | {
279 | "cell_type": "markdown",
280 | "metadata": {
281 | "id": "h8uYPW_TeYOa"
282 | },
283 | "source": [
284 | "## Embed Text\n",
285 | "\n",
286 | "* See [pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information"
287 | ]
288 | },
289 | {
290 | "cell_type": "code",
291 | "execution_count": null,
292 | "metadata": {
293 | "id": "pLLXTuZd6QyV"
294 | },
295 | "outputs": [],
296 | "source": [
297 | "model = SentenceTransformer(\"all-mpnet-base-v2\")"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "metadata": {
304 | "id": "UIen4vsj3egz"
305 | },
306 | "outputs": [],
307 | "source": [
308 | "if not args.inference:\n",
309 | " embeddings = torch.zeros([len(ds), 768])\n",
310 | " for i in tqdm(range(len(ds))):\n",
311 | " emb = model.encode(ds.loc[i, \"Prompt\"], convert_to_tensor=True)\n",
312 | " embeddings[i, :] = emb\n",
313 | " embeddings"
314 | ]
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "metadata": {
320 | "colab": {
321 | "base_uri": "https://localhost:8080/"
322 | },
323 | "id": "5h2kpB6C2I_N",
324 | "outputId": "88bfa974-8686-49d7-9f93-b131b89ba9a3"
325 | },
326 | "outputs": [],
327 | "source": [
328 | "f_name = os.path.join(\n",
329 | " args.cache_folder, \"stable_diffusion_prompts_embeddings_all_mpnet_base_v2.pt\"\n",
330 | ")\n",
331 | "print(f_name, \"\\n\")\n",
332 | "\n",
333 | "if not args.inference:\n",
334 | " torch.save(embeddings, f_name)\n",
335 | "loaded_embeddings = torch.load(f_name)\n",
336 | "loaded_embeddings"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "metadata": {
343 | "colab": {
344 | "base_uri": "https://localhost:8080/"
345 | },
346 | "id": "fsWOJjwdzgeG",
347 | "outputId": "b91af737-4ec1-43af-a45d-705a6a893fdd"
348 | },
349 | "outputs": [],
350 | "source": [
351 | "loaded_embeddings.shape"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": null,
357 | "metadata": {
358 | "id": "DWdeAnld5xhH"
359 | },
360 | "outputs": [],
361 | "source": [
362 | "if not args.inference:\n",
363 | " torch.equal(loaded_embeddings.cpu(), embeddings.cpu())"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "metadata": {
370 | "colab": {
371 | "base_uri": "https://localhost:8080/"
372 | },
373 | "id": "j6f0cJmH3egz",
374 | "outputId": "e4b5aee8-31a9-4348-f8da-4e2e1b07da14"
375 | },
376 | "outputs": [],
377 | "source": [
378 | "test_itr = random.randint(low=0, high=len(ds))\n",
379 | "test_itr"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": null,
385 | "metadata": {
386 | "colab": {
387 | "base_uri": "https://localhost:8080/",
388 | "height": 89
389 | },
390 | "id": "2WoZs9mF3egz",
391 | "outputId": "708a1772-9947-4949-f4ad-89bf734aa6b3"
392 | },
393 | "outputs": [],
394 | "source": [
395 | "# test embeddings worked\n",
396 | "ds.loc[test_itr, \"Prompt\"]"
397 | ]
398 | },
399 | {
400 | "cell_type": "code",
401 | "execution_count": null,
402 | "metadata": {
403 | "colab": {
404 | "base_uri": "https://localhost:8080/"
405 | },
406 | "id": "78eiylg53egz",
407 | "outputId": "0a1eee8c-ed7a-4feb-a406-8e001d2fe657"
408 | },
409 | "outputs": [],
410 | "source": [
411 | "test_emb = model.encode(ds.loc[test_itr, \"Prompt\"], convert_to_tensor=True)\n",
412 | "a = np.array(test_emb.cpu())\n",
413 | "b = np.array(loaded_embeddings[test_itr, :].cpu())\n",
414 | "np.allclose(a, b, rtol=1e-02)"
415 | ]
416 | },
417 | {
418 | "cell_type": "markdown",
419 | "metadata": {
420 | "id": "THg1GieGesDQ"
421 | },
422 | "source": [
423 | "## Clustering\n",
424 | "\n",
425 | "* See [how hdbscan works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for supporting information"
426 | ]
427 | },
428 | {
429 | "cell_type": "code",
430 | "execution_count": null,
431 | "metadata": {
432 | "colab": {
433 | "base_uri": "https://localhost:8080/"
434 | },
435 | "id": "m2A8EHRg7dzr",
436 | "outputId": "e6f9135c-02c2-4e47-ca4a-75d56369ffd8"
437 | },
438 | "outputs": [],
439 | "source": [
440 | "# clustering pipeline (the second step, HDBSCAN clustering, runs in a later cell)\n",
441 | "\n",
442 | "# first, perform dimensionality reduction from 768 to 15\n",
443 | "f_name = os.path.join(args.cache_folder, \"reducer_umap_15.pkl\")\n",
444 | "print(f_name, \"\\n\")\n",
445 | "\n",
446 | "if not args.inference:\n",
447 | " reducer_15 = umap.UMAP(n_components=15)\n",
448 | " reducer_15.fit(loaded_embeddings)\n",
449 | " embeddings_umap_dim_15 = reducer_15.transform(loaded_embeddings)\n",
450 | " # Verify that the result of calling transform is\n",
451 | " # identical to accessing the embedding_ attribute\n",
452 | " assert np.all(embeddings_umap_dim_15 == reducer_15.embedding_)\n",
453 | "\n",
454 | " # cache fitted umap object\n",
455 | " pickle.dump(reducer_15, open(f_name, \"wb\"))\n",
456 | "\n",
457 | "loaded_reducer_15 = pickle.load((open(f_name, \"rb\")))\n",
458 | "\n",
459 | "embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)\n",
460 | "# Verify that the result of calling transform is\n",
461 | "# identical to accessing the embedding_ attribute\n",
462 | "assert np.all(embeddings_umap_dim_15 == loaded_reducer_15.embedding_)\n",
463 | "\n",
464 | "print(embeddings_umap_dim_15.shape)"
465 | ]
466 | },
467 | {
468 | "cell_type": "code",
469 | "execution_count": null,
470 | "metadata": {
471 | "colab": {
472 | "base_uri": "https://localhost:8080/"
473 | },
474 | "id": "0TGPS4pSySbc",
475 | "outputId": "8f0486e0-e09c-4947-b5ac-f0a556e76860"
476 | },
477 | "outputs": [],
478 | "source": [
479 | "args.inference = False\n",
480 | "args"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "metadata": {
487 | "colab": {
488 | "base_uri": "https://localhost:8080/"
489 | },
490 | "id": "ZZxn2DlOyOsr",
491 | "outputId": "758758e8-ac32-486f-db8d-b267f4ec8bf4"
492 | },
493 | "outputs": [],
494 | "source": [
495 | "f_name = os.path.join(args.cache_folder, \"clusterer_hdbscan.pkl\")\n",
496 | "print(f_name, \"\\n\")\n",
497 | "\n",
498 | "if not args.inference:\n",
499 | " clusterer = hdbscan.HDBSCAN(\n",
500 | " min_cluster_size=110, gen_min_span_tree=True, prediction_data=True\n",
501 | " )\n",
502 | " clusterer.fit(embeddings_umap_dim_15)\n",
503 | " pickle.dump(clusterer, open(f_name, \"wb\"))\n",
504 | "\n",
505 | "loaded_clusterer = pickle.load((open(f_name, \"rb\")))\n",
506 | "\n",
507 | "if not args.inference:\n",
508 | " print(\n",
509 | " pd.DataFrame.equals(\n",
510 | " pd.Series(clusterer.labels_).value_counts(),\n",
511 | " pd.Series(loaded_clusterer.labels_).value_counts(),\n",
512 | " )\n",
513 | " )\n",
514 | " print(\n",
515 | " pd.DataFrame.equals(\n",
516 | " pd.Series(clusterer.probabilities_).value_counts(),\n",
517 | " pd.Series(loaded_clusterer.probabilities_).value_counts(),\n",
518 | " )\n",
519 | " )\n",
520 | "\n",
521 | "num_ouliers = pd.Series(loaded_clusterer.labels_).value_counts().loc[-1]\n",
522 | "\n",
523 | "print(pd.Series(loaded_clusterer.labels_).value_counts())\n",
524 | "print(f\"\\nCluster outliers : {num_ouliers}\\n\")"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": null,
530 | "metadata": {
531 | "colab": {
532 | "base_uri": "https://localhost:8080/"
533 | },
534 | "id": "eqIqeraG-jTd",
535 | "outputId": "066f5e27-3bc6-4c4d-d388-2bb67ab9d23e"
536 | },
537 | "outputs": [],
538 | "source": [
539 | "# sum of top 25 cluster counts (skips the first entry, presumably the -1 outlier label)\n",
540 | "\n",
541 | "pd.Series(loaded_clusterer.labels_).value_counts()[1:26].sum()"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": null,
547 | "metadata": {
548 | "colab": {
549 | "base_uri": "https://localhost:8080/",
550 | "height": 424
551 | },
552 | "id": "zB9nU0ka_JpA",
553 | "outputId": "fb1f1bf9-581a-4db4-f9b5-fcd9117c2548"
554 | },
555 | "outputs": [],
556 | "source": [
557 | "ds[\"cluster\"] = loaded_clusterer.labels_\n",
558 | "ds[\"cluster membership prob\"] = loaded_clusterer.probabilities_\n",
559 | "ds"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": null,
565 | "metadata": {
566 | "colab": {
567 | "base_uri": "https://localhost:8080/",
568 | "height": 438
569 | },
570 | "id": "pDW5AVDn891V",
571 | "outputId": "983b99a0-0c2a-4aa2-ea63-53b97e5b5f90"
572 | },
573 | "outputs": [],
574 | "source": [
575 | "loaded_clusterer.condensed_tree_.plot()"
576 | ]
577 | },
578 | {
579 | "cell_type": "code",
580 | "execution_count": null,
581 | "metadata": {
582 | "colab": {
583 | "base_uri": "https://localhost:8080/",
584 | "height": 438
585 | },
586 | "id": "N-aJm6Sz9A2h",
587 | "outputId": "641b1b8d-e748-44eb-be4c-ae7799b0646a"
588 | },
589 | "outputs": [],
590 | "source": [
591 | "loaded_clusterer.condensed_tree_.plot(\n",
592 | " select_clusters=True, selection_palette=sns.color_palette()\n",
593 | ")"
594 | ]
595 | },
596 | {
597 | "cell_type": "code",
598 | "execution_count": null,
599 | "metadata": {
600 | "id": "9wphVnvqytn5"
601 | },
602 | "outputs": [],
603 | "source": [
604 | "args.inference = True"
605 | ]
606 | },
607 | {
608 | "cell_type": "code",
609 | "execution_count": null,
610 | "metadata": {
611 | "colab": {
612 | "base_uri": "https://localhost:8080/"
613 | },
614 | "id": "Bug6Ab-n8DqQ",
615 | "outputId": "18a5ca83-d5cd-43e0-aef2-ed0edabb336b"
616 | },
617 | "outputs": [],
618 | "source": [
619 | "# third, perform dimensionality reduction from 15 to 2\n",
620 | "\n",
621 | "f_name = os.path.join(args.cache_folder, \"reducer_umap_2.pkl\")\n",
622 | "print(f_name, \"\\n\")\n",
623 | "\n",
624 | "if not args.inference:\n",
625 | " reducer_2 = umap.UMAP(n_components=2)\n",
626 | " reducer_2.fit(embeddings_umap_dim_15)\n",
627 | " embeddings_umap_dim_2 = reducer_2.transform(embeddings_umap_dim_15)\n",
628 | "\n",
629 | " # Verify that the result of calling transform is\n",
630 | " # identical to accessing the embedding_ attribute\n",
631 | " assert np.all(embeddings_umap_dim_2 == reducer_2.embedding_)\n",
632 | "\n",
633 | " # cache fitted umap object\n",
634 | " pickle.dump(reducer_2, open(f_name, \"wb\"))\n",
635 | "\n",
636 | "loaded_reducer_2 = pickle.load((open(f_name, \"rb\")))\n",
637 | "\n",
638 | "embeddings_umap_dim_2 = loaded_reducer_2.transform(embeddings_umap_dim_15)\n",
639 | "# Verify that the result of calling transform is\n",
640 | "# identical to accessing the embedding_ attribute\n",
641 | "assert np.all(embeddings_umap_dim_2 == loaded_reducer_2.embedding_)\n",
642 | "embeddings_umap_dim_2.shape"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": null,
648 | "metadata": {
649 | "id": "PjOkZzekl1vx"
650 | },
651 | "outputs": [],
652 | "source": [
653 | "ds[\"x\"] = embeddings_umap_dim_2[:, 0]\n",
654 | "ds[\"y\"] = embeddings_umap_dim_2[:, 1]"
655 | ]
656 | },
657 | {
658 | "cell_type": "code",
659 | "execution_count": null,
660 | "metadata": {
661 | "colab": {
662 | "base_uri": "https://localhost:8080/",
663 | "height": 450
664 | },
665 | "id": "lcOHNlilQAeU",
666 | "outputId": "8593037f-af04-49dd-815d-7e4b871bc0d2"
667 | },
668 | "outputs": [],
669 | "source": [
670 | "# Visualize clusters\n",
671 | "fig, ax = plt.subplots(figsize=(20, 10))\n",
672 | "outliers = ds[ds[\"cluster\"] == -1]\n",
673 | "clustered = ds[ds[\"cluster\"] != -1]\n",
674 | "plt.scatter(outliers.x, outliers.y, color=\"#BDBDBD\", s=10, alpha=0.1)\n",
675 | "plt.scatter(\n",
676 | " clustered.x, clustered.y, c=clustered.cluster, s=10, alpha=0.35, cmap=\"viridis\"\n",
677 | ")"
678 | ]
679 | },
680 | {
681 | "cell_type": "code",
682 | "execution_count": null,
683 | "metadata": {
684 | "colab": {
685 | "base_uri": "https://localhost:8080/",
686 | "height": 837
687 | },
688 | "id": "jdCSkHYZ9VTK",
689 | "outputId": "2ab60bbd-afb1-47f6-edac-cb077616d153"
690 | },
691 | "outputs": [],
692 | "source": [
693 | "fig = go.Figure()\n",
694 | "fig.add_trace(\n",
695 | " go.Scatter(\n",
696 | " x=ds[\"x\"][ds[\"cluster\"] != -1],\n",
697 | " y=ds[\"y\"][ds[\"cluster\"] != -1],\n",
698 | " mode=\"markers\",\n",
699 | " marker_color=ds[\"cluster\"][ds[\"cluster\"] != -1],\n",
700 | " marker_colorscale=\"Viridis\",\n",
701 | " text=ds[\"cluster\"][ds[\"cluster\"] != -1],\n",
702 | " )\n",
703 | ")\n",
704 | "\n",
705 | "fig.update_traces(marker={\"size\": 5, \"opacity\": 0.45}, showlegend=False)\n",
706 | "fig.update_coloraxes(showscale=False)\n",
707 | "fig.update_layout(width=550 * 2, height=400 * 2)\n",
708 | "fig.show()"
709 | ]
710 | },
711 | {
712 | "cell_type": "markdown",
713 | "metadata": {
714 | "id": "fZl5qjp7r6QJ"
715 | },
716 | "source": [
717 | "## Exemplar Sub-Clustering\n",
718 | "\n",
719 | "* See [soft clustering explanation](https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html) for supporting information"
720 | ]
721 | },
722 | {
723 | "cell_type": "code",
724 | "execution_count": null,
725 | "metadata": {
726 | "id": "87hBN35f-J0-"
727 | },
728 | "outputs": [],
729 | "source": [
730 | "# function copied from:\n",
731 | "# https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html#distance-based-membership\n",
732 | "\n",
733 | "\n",
734 | "def exemplars(cluster_id, condensed_tree):\n",
735 | " raw_tree = condensed_tree._raw_tree\n",
736 | " # Just the cluster elements of the tree, excluding singleton points\n",
737 | " cluster_tree = raw_tree[raw_tree[\"child_size\"] > 1]\n",
738 | " # Get the leaf cluster nodes under the cluster we are considering\n",
739 | " leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)\n",
740 | " # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)\n",
741 | " result = np.array([])\n",
742 | " for leaf in leaves:\n",
743 | " max_lambda = raw_tree[\"lambda_val\"][raw_tree[\"parent\"] == leaf].max()\n",
744 | " points = raw_tree[\"child\"][\n",
745 | " (raw_tree[\"parent\"] == leaf) & (raw_tree[\"lambda_val\"] == max_lambda)\n",
746 | " ]\n",
747 | " result = np.hstack((result, points))\n",
748 | " return result.astype(int)"
749 | ]
750 | },
751 | {
752 | "cell_type": "code",
753 | "execution_count": null,
754 | "metadata": {
755 | "colab": {
756 | "base_uri": "https://localhost:8080/"
757 | },
758 | "id": "hu4rVHAHAoms",
759 | "outputId": "888845e2-4a84-4cb0-d72d-56867bdf01b8"
760 | },
761 | "outputs": [],
762 | "source": [
763 | "tree = loaded_clusterer.condensed_tree_\n",
764 | "\n",
765 | "exemplar_ids = []\n",
766 | "for i, c in enumerate(tree._select_clusters()):\n",
767 | " c_exemplars = exemplars(c, tree)\n",
768 | " print(f\"Cluster {i} has {len(c_exemplars)} exemplars\")\n",
769 | " exemplar_ids.extend(c_exemplars)"
770 | ]
771 | },
772 | {
773 | "cell_type": "code",
774 | "execution_count": null,
775 | "metadata": {
776 | "id": "3X3Wc86tnVRY"
777 | },
778 | "outputs": [],
779 | "source": [
780 | "ds[\"exemplars yes/no\"] = np.zeros(len(ds))\n",
781 | "ds.loc[exemplar_ids, \"exemplars yes/no\"] = 1\n",
782 | "\n",
783 | "assert len(ds[ds[\"exemplars yes/no\"] == 1]) == len(exemplar_ids)"
784 | ]
785 | },
786 | {
787 | "cell_type": "code",
788 | "execution_count": null,
789 | "metadata": {
790 | "colab": {
791 | "base_uri": "https://localhost:8080/",
792 | "height": 873
793 | },
794 | "id": "uyJW97LhrRp0",
795 | "outputId": "5176d228-8539-40f4-d8b5-f0acf58b8ffa"
796 | },
797 | "outputs": [],
798 | "source": [
799 | "print(\"\\n\")\n",
800 | "fig = go.Figure()\n",
801 | "\n",
802 | "custom_scale = [\n",
803 | " \"#949494\", # Gray\n",
804 | " \"#F65314\", # Google Red\n",
805 | " \"#4285F4\", # Google Blue\n",
806 | "]\n",
807 | "\n",
808 | "fig.add_trace(\n",
809 | " go.Scatter(\n",
810 | " x=ds[\"x\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
811 | " y=ds[\"y\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
812 | " mode=\"markers\",\n",
813 | " marker_color=custom_scale[0],\n",
814 | " text=ds[\"cluster\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
815 | " )\n",
816 | ")\n",
817 | "\n",
818 | "fig.add_trace(\n",
819 | " go.Scatter(\n",
820 | " x=ds[\"x\"][ds[\"exemplars yes/no\"] == 1],\n",
821 | " y=ds[\"y\"][ds[\"exemplars yes/no\"] == 1],\n",
822 | " mode=\"markers\",\n",
823 | " marker_color=ds[\"cluster\"][ds[\"exemplars yes/no\"] == 1],\n",
824 | " marker_colorscale=\"Viridis\",\n",
825 | " text=ds[\"cluster\"][ds[\"exemplars yes/no\"] == 1],\n",
826 | " )\n",
827 | ")\n",
828 | "\n",
829 | "fig.update_traces(marker={\"size\": 5, \"opacity\": 0.45}, showlegend=False)\n",
830 | "fig.update_coloraxes(showscale=False)\n",
831 | "fig.update_layout(width=550 * 2, height=400 * 2)\n",
832 | "fig.show()"
833 | ]
834 | },
835 | {
836 | "cell_type": "code",
837 | "execution_count": null,
838 | "metadata": {
839 | "colab": {
840 | "base_uri": "https://localhost:8080/"
841 | },
842 | "id": "y5bO5SZ4hjAT",
843 | "outputId": "82860182-688b-400c-bb60-162964b8bf23"
844 | },
845 | "outputs": [],
846 | "source": [
847 | "len(ds.loc[exemplar_ids])"
848 | ]
849 | },
850 | {
851 | "cell_type": "code",
852 | "execution_count": null,
853 | "metadata": {
854 | "colab": {
855 | "base_uri": "https://localhost:8080/"
856 | },
857 | "id": "OW44QJyYhibv",
858 | "outputId": "e24e08df-3557-406c-a056-4821b5661dd8"
859 | },
860 | "outputs": [],
861 | "source": [
862 | "embeddings_umap_dim_15[exemplar_ids].shape"
863 | ]
864 | },
865 | {
866 | "cell_type": "code",
867 | "execution_count": null,
868 | "metadata": {
869 | "colab": {
870 | "base_uri": "https://localhost:8080/"
871 | },
872 | "id": "J17BEw2vgvwA",
873 | "outputId": "0336d60f-c0d1-49aa-bd53-bc1da49f71f8"
874 | },
875 | "outputs": [],
876 | "source": [
877 | "# fourth, perform exemplar sub-clustering\n",
878 | "\n",
879 | "f_name = os.path.join(args.cache_folder, \"clusterer_subs_hdbscan.pkl\")\n",
880 | "print(f_name, \"\\n\")\n",
881 | "\n",
882 | "if not args.inference:\n",
883 | " sub_clusterer = hdbscan.HDBSCAN(\n",
884 | " min_cluster_size=4, gen_min_span_tree=True, prediction_data=True\n",
885 | " )\n",
886 | " sub_clusterer.fit(embeddings_umap_dim_15[exemplar_ids])\n",
887 | " pickle.dump(sub_clusterer, open(f_name, \"wb\"))\n",
888 | "\n",
889 | "loaded_sub_clusterer = pickle.load((open(f_name, \"rb\")))\n",
890 | "\n",
891 | "if not args.inference:\n",
892 | " print(\n",
893 | " pd.DataFrame.equals(\n",
894 | " pd.Series(sub_clusterer.labels_).value_counts(),\n",
895 | " pd.Series(loaded_sub_clusterer.labels_).value_counts(),\n",
896 | " )\n",
897 | " )\n",
898 | " print(\n",
899 | " pd.DataFrame.equals(\n",
900 | " pd.Series(sub_clusterer.probabilities_).value_counts(),\n",
901 | " pd.Series(loaded_sub_clusterer.probabilities_).value_counts(),\n",
902 | " )\n",
903 | " )\n",
904 | "\n",
905 | "print(\"\\nCluster value counts:\\n\")\n",
906 | "pd.Series(loaded_sub_clusterer.labels_).value_counts()"
907 | ]
908 | },
909 | {
910 | "cell_type": "code",
911 | "execution_count": null,
912 | "metadata": {
913 | "colab": {
914 | "base_uri": "https://localhost:8080/"
915 | },
916 | "id": "O9033VfDirUu",
917 | "outputId": "97d98be7-f645-4c81-8df4-d20d6a04e1c7"
918 | },
919 | "outputs": [],
920 | "source": [
921 | "loaded_sub_clusterer.labels_"
922 | ]
923 | },
924 | {
925 | "cell_type": "code",
926 | "execution_count": null,
927 | "metadata": {
928 | "colab": {
929 | "base_uri": "https://localhost:8080/"
930 | },
931 | "id": "buRR8jvui1ul",
932 | "outputId": "3b916ffb-a440-4c07-b452-06f993292e6b"
933 | },
934 | "outputs": [],
935 | "source": [
936 | "ds[\"exemplar sub-cluster\"] = np.repeat(np.nan, len(ds))\n",
937 | "ds[\"cluster XX.YY\"] = np.repeat(np.nan, len(ds))\n",
938 | "# ds.loc[exemplar_ids] = loaded_sub_clusterer.labels_\n",
939 | "ds\n",
940 | "for i in range(len(ds.loc[exemplar_ids])):\n",
941 | " row = ds.loc[exemplar_ids].iloc[i]\n",
942 | " ds.loc[row.id, \"exemplar sub-cluster\"] = loaded_sub_clusterer.labels_[i]\n",
943 | "for i in range(len(ds.loc[exemplar_ids])):\n",
944 | " row = ds.loc[exemplar_ids].iloc[i]\n",
945 | " ds.loc[row.id, \"cluster XX.YY\"] = (\n",
946 | " \"Cluster \"\n",
947 | " + str(row.cluster)\n",
948 | " + \", Sub-Cluster \"\n",
949 | " + str(int(row[\"exemplar sub-cluster\"]))\n",
950 | " )\n",
951 | "\n",
952 | "# ds.loc[exemplar_ids]\n",
953 | "# ds"
954 | ]
955 | },
956 | {
957 | "cell_type": "code",
958 | "execution_count": null,
959 | "metadata": {
960 | "colab": {
961 | "base_uri": "https://localhost:8080/"
962 | },
963 | "id": "lJdOWEjSr0NR",
964 | "outputId": "baaa50b4-475e-4f50-a406-700b59c1d0f3"
965 | },
966 | "outputs": [],
967 | "source": [
968 | "ds_inner_exemplars = ds[ds[\"exemplars yes/no\"] == 1]\n",
969 | "ds_inner_exemplars = ds_inner_exemplars[\n",
970 | " ds_inner_exemplars[\"exemplar sub-cluster\"] != -1\n",
971 | "]\n",
972 | "len(ds_inner_exemplars)"
973 | ]
974 | },
975 | {
976 | "cell_type": "code",
977 | "execution_count": null,
978 | "metadata": {
979 | "colab": {
980 | "base_uri": "https://localhost:8080/",
981 | "height": 673
982 | },
983 | "id": "NiW84puFkJtJ",
984 | "outputId": "854e83a8-fbea-4eed-bde4-8868d070d77b"
985 | },
986 | "outputs": [],
987 | "source": [
988 | "print(\"\\n\")\n",
989 | "\n",
990 | "fig = go.Figure()\n",
991 | "\n",
992 | "fig.add_trace(\n",
993 | " go.Scatter(\n",
994 | " x=ds[\"x\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
995 | " y=ds[\"y\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
996 | " mode=\"markers\",\n",
997 | " marker_color=custom_scale[0],\n",
998 | " text=ds[\"cluster XX.YY\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
999 | " )\n",
1000 | ")\n",
1001 | "\n",
1002 | "fig.add_trace(\n",
1003 | " go.Scatter(\n",
1004 | " x=ds_inner_exemplars[\"x\"],\n",
1005 | " y=ds_inner_exemplars[\"y\"],\n",
1006 | " mode=\"markers\",\n",
1007 | " marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n",
1008 | " marker_colorscale=\"Viridis\",\n",
1009 | " text=ds_inner_exemplars[\"cluster XX.YY\"],\n",
1010 | " )\n",
1011 | ")\n",
1012 | "\n",
1013 | "fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n",
1014 | "fig.update_coloraxes(showscale=False)\n",
1015 | "fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n",
1016 | "fig.show()"
1017 | ]
1018 | },
1019 | {
1020 | "cell_type": "code",
1021 | "execution_count": null,
1022 | "metadata": {
1023 | "colab": {
1024 | "base_uri": "https://localhost:8080/"
1025 | },
1026 | "id": "1JJB33r5qCSq",
1027 | "outputId": "9a7e4eae-7f27-4d36-94c0-37aa8712c3d2"
1028 | },
1029 | "outputs": [],
1030 | "source": [
1031 | "# Build a short hover label from the first 12 cleaned words of every prompt.\n",
1032 | "def _prompt_head(prompt, n_words=12):\n",
1033 | "    \"\"\"Join the first `n_words` items returned by cleantext.clean_words with spaces.\"\"\"\n",
1034 | "    words = cleantext.clean_words(\n",
1035 | "        prompt,\n",
1036 | "        clean_all=False, extra_spaces=True,  # only remove extra white spaces\n",
1037 | "        stemming=False, stopwords=False, lowercase=False,\n",
1038 | "        numbers=False, punct=False, stp_lang=\"english\",\n",
1039 | "    )\n",
1040 | "    return \" \".join(words[0:n_words])\n",
1041 | "\n",
1042 | "\n",
1043 | "# A column-wise apply works for any index; positional ds.loc[i, ...] writes need labels 0..n-1.\n",
1044 | "ds[\"Prompt head\"] = ds[\"Prompt\"].apply(_prompt_head)"
1045 | ]
1046 | },
1047 | {
1048 | "cell_type": "code",
1049 | "execution_count": null,
1050 | "metadata": {
1051 | "colab": {
1052 | "base_uri": "https://localhost:8080/",
1053 | "height": 1000
1054 | },
1055 | "id": "iEG1AZTurPkN",
1056 | "outputId": "a24923f0-225f-41fd-f9b8-14d09afce366"
1057 | },
1058 | "outputs": [],
1059 | "source": [
1060 | "ds"
1061 | ]
1062 | },
1063 | {
1064 | "cell_type": "code",
1065 | "execution_count": null,
1066 | "metadata": {
1067 | "colab": {
1068 | "base_uri": "https://localhost:8080/",
1069 | "height": 1000
1070 | },
1071 | "id": "UdQ3Rf3YpBSE",
1072 | "outputId": "fa65296d-ac17-466c-8903-4c6cec8a2e7c"
1073 | },
1074 | "outputs": [],
1075 | "source": [
1076 | "# Hover text used by the plots: cluster label, prompt id and the quoted prompt head.\n",
1077 | "cluster_txt = ds[\"cluster\"].astype(str)\n",
1078 | "prompt_id_txt = ds[\"id\"].astype(str)\n",
1079 | "quoted_head = '\"' + ds[\"Prompt head\"] + '\"'\n",
1080 | "\n",
1081 | "ds[\"cluster + Prompt\"] = (\n",
1082 | "    \"Cluster: \" + cluster_txt\n",
1083 | "    + \", Prompt id \" + prompt_id_txt\n",
1084 | "    + \": \" + quoted_head\n",
1085 | ")\n",
1086 | "ds"
1087 | ]
1088 | },
1089 | {
1090 | "cell_type": "code",
1091 | "execution_count": null,
1092 | "metadata": {
1093 | "colab": {
1094 | "base_uri": "https://localhost:8080/",
1095 | "height": 1000
1096 | },
1097 | "id": "w6l_0xa2luhO",
1098 | "outputId": "f24a901f-356c-45eb-ad69-b6b44d2fc6ee"
1099 | },
1100 | "outputs": [],
1101 | "source": [
1102 | "# visualize top 25 clusters by count\n",
1103 | "\n",
1104 | "# Drop the HDBSCAN noise label (-1) by name instead of assuming it is the\n",
1105 | "# most frequent label and skipping position 0 of the counts.\n",
1106 | "label_counts = pd.Series(loaded_clusterer.labels_).value_counts()\n",
1107 | "clust_to_zoom_list = label_counts.drop(labels=-1, errors=\"ignore\").index[:25]\n",
1108 | "\n",
1109 | "for clust_to_zoom in clust_to_zoom_list:\n",
1110 | "    print(f\"Cluster {clust_to_zoom}:\")\n",
1111 | "    in_cluster = ds[\"cluster\"] == clust_to_zoom\n",
1112 | "    # Non-exemplar members of this cluster form the background layer.\n",
1113 | "    ds_background = ds[(ds[\"exemplars yes/no\"] == 0) & in_cluster]\n",
1114 | "    # Exemplars of this cluster that were assigned to a sub-cluster.\n",
1115 | "    ds_inner_exemplars = ds[\n",
1116 | "        (ds[\"exemplars yes/no\"] == 1) & in_cluster & (ds[\"exemplar sub-cluster\"] != -1)\n",
1117 | "    ]\n",
1118 | "\n",
1119 | "    fig = go.Figure()\n",
1120 | "    fig.add_trace(\n",
1121 | "        go.Scatter(\n",
1122 | "            x=ds_background[\"x\"],\n",
1123 | "            y=ds_background[\"y\"],\n",
1124 | "            mode=\"markers\",\n",
1125 | "            marker_color=custom_scale[0],\n",
1126 | "            text=ds_background[\"cluster + Prompt\"],\n",
1127 | "        )\n",
1128 | "    )\n",
1129 | "    fig.add_trace(\n",
1130 | "        go.Scatter(\n",
1131 | "            x=ds_inner_exemplars[\"x\"],\n",
1132 | "            y=ds_inner_exemplars[\"y\"],\n",
1133 | "            mode=\"markers\",\n",
1134 | "            marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n",
1135 | "            marker_colorscale=\"Viridis\",\n",
1136 | "            text=ds_inner_exemplars[\"cluster + Prompt\"],\n",
1137 | "        )\n",
1138 | "    )\n",
1139 | "\n",
1140 | "    fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n",
1141 | "    fig.update_coloraxes(showscale=False)\n",
1142 | "    fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n",
1143 | "    fig.show()"
1144 | ]
1145 | },
1146 | {
1147 | "cell_type": "code",
1148 | "execution_count": null,
1149 | "metadata": {
1150 | "id": "PuTTU71N5AkX"
1151 | },
1152 | "outputs": [],
1153 | "source": [
1154 | "# Exemplars from every cluster that were assigned to a sub-cluster (label != -1).\n",
1155 | "is_exemplar = ds[\"exemplars yes/no\"] == 1\n",
1156 | "has_subcluster = ds[\"exemplar sub-cluster\"] != -1\n",
1157 | "ds_inner_exemplars = ds[is_exemplar & has_subcluster]"
1158 | ]
1159 | },
1160 | {
1161 | "cell_type": "code",
1162 | "execution_count": null,
1163 | "metadata": {
1164 | "colab": {
1165 | "base_uri": "https://localhost:8080/",
1166 | "height": 673
1167 | },
1168 | "id": "rx58ZS3-40gI",
1169 | "outputId": "786c3a8c-b837-4fba-cee8-a42bb05ca84d"
1170 | },
1171 | "outputs": [],
1172 | "source": [
1173 | "print(\"\\n\")\n",
1174 | "\n",
1175 | "# Background layer: non-exemplar points whose cluster label is not -1.\n",
1176 | "ds_background = ds[(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)]\n",
1177 | "background_trace = go.Scatter(\n",
1178 | "    x=ds_background[\"x\"],\n",
1179 | "    y=ds_background[\"y\"],\n",
1180 | "    mode=\"markers\",\n",
1181 | "    marker_color=custom_scale[0],\n",
1182 | "    text=ds_background[\"cluster + Prompt\"],\n",
1183 | ")\n",
1184 | "\n",
1185 | "# Foreground layer: exemplars, coloured by their sub-cluster id.\n",
1186 | "exemplar_trace = go.Scatter(\n",
1187 | "    x=ds_inner_exemplars[\"x\"],\n",
1188 | "    y=ds_inner_exemplars[\"y\"],\n",
1189 | "    mode=\"markers\",\n",
1190 | "    marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n",
1191 | "    marker_colorscale=\"Viridis\",\n",
1192 | "    text=ds_inner_exemplars[\"cluster + Prompt\"],\n",
1193 | ")\n",
1194 | "\n",
1195 | "fig = go.Figure()\n",
1196 | "fig.add_trace(background_trace)\n",
1197 | "fig.add_trace(exemplar_trace)\n",
1198 | "\n",
1199 | "# Marker styling shared by both traces, then the figure size.\n",
1200 | "fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n",
1201 | "fig.update_coloraxes(showscale=False)\n",
1202 | "fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n",
1203 | "fig.show()"
1204 | ]
1205 | },
1206 | {
1207 | "cell_type": "markdown",
1208 | "metadata": {
1209 | "id": "B9fADGA9gvwA"
1210 | },
1211 | "source": [
1212 | "## Create summary themes knowledge graph"
1213 | ]
1214 | },
1215 | {
1216 | "cell_type": "code",
1217 | "execution_count": null,
1218 | "metadata": {
1219 | "id": "kXz16XO5dd6j"
1220 | },
1221 | "outputs": [],
1222 | "source": [
1223 | "claude_prompt = \"Please identify and summarize the core theme for each Sub-Cluster. Respond as succinctly as possible. Each summary cannot be longer than 1 sentence. Do not skip any of the Sub-Clusters. Do not list out the names of individuals in the prompts. Let's think step by step before responding.\"\n",
1224 | "\n",
1225 | "num_subclusts = 0\n",
1226 | "subclusts_in_order = []\n",
1227 | "\n",
1228 | "exemplar_rows = ds[ds[\"exemplars yes/no\"] == 1]\n",
1229 | "for clust in np.unique(np.array(exemplar_rows[\"cluster\"])):\n",
1230 | "    # Only the clusters selected for zooming contribute to the prompt.\n",
1231 | "    if clust not in clust_to_zoom_list:\n",
1232 | "        continue\n",
1233 | "    sub_df = exemplar_rows[exemplar_rows[\"cluster\"] == clust]\n",
1234 | "\n",
1235 | "    for sub_clust in sorted(np.unique(np.array(sub_df[\"cluster XX.YY\"]))):\n",
1236 | "        # Exemplars without a sub-cluster carry the label -1 and are left out.\n",
1237 | "        if int(sub_clust.split(\"Sub-Cluster \")[-1]) == -1:\n",
1238 | "            continue\n",
1239 | "        num_subclusts += 1\n",
1240 | "        subclusts_in_order.append(sub_clust)\n",
1241 | "        sub_prompts = sub_df.loc[sub_df[\"cluster XX.YY\"] == sub_clust, \"Prompt\"].astype(str)\n",
1242 | "        claude_prompt += \"\\n\" + str(sub_clust.split(\", \")[1]) + \": \"\n",
1243 | "        # NOTE(review): str() of a Series is its pandas repr (index, dtype line, possible truncation) and the closing quote below has no opening partner - confirm this is intended.\n",
1244 | "        claude_prompt += \"\\n\" + str(sub_prompts) + '\"\\n'"
1245 | ]
1246 | },
1247 | {
1248 | "cell_type": "code",
1249 | "execution_count": null,
1250 | "metadata": {
1251 | "id": "y49WrTOLq_Dy"
1252 | },
1253 | "outputs": [],
1254 | "source": [
1255 | "# print(claude_prompt)\n",
1256 | "# subclusts_in_order"
1257 | ]
1258 | },
1259 | {
1260 | "cell_type": "code",
1261 | "execution_count": null,
1262 | "metadata": {
1263 | "colab": {
1264 | "base_uri": "https://localhost:8080/"
1265 | },
1266 | "id": "c0zRcjt0rBJd",
1267 | "outputId": "94f1503d-78b9-44e3-cc71-07b18f614e6d"
1268 | },
1269 | "outputs": [],
1270 | "source": [
1271 | "num_subclusts"
1272 | ]
1273 | },
1274 | {
1275 | "cell_type": "code",
1276 | "execution_count": null,
1277 | "metadata": {
1278 | "colab": {
1279 | "base_uri": "https://localhost:8080/"
1280 | },
1281 | "id": "Qt-LdSAFlvxh",
1282 | "outputId": "71994546-fcc8-4484-a8ea-3674b1f5f594"
1283 | },
1284 | "outputs": [],
1285 | "source": [
1286 | "claude_prompt.count(\"Sub-Cluster \")"
1287 | ]
1288 | },
1289 | {
1290 | "cell_type": "code",
1291 | "execution_count": null,
1292 | "metadata": {
1293 | "id": "vqR42TGDnLvh"
1294 | },
1295 | "outputs": [],
1296 | "source": [
1297 | "# Exemplars that belong to a sub-cluster, narrowed to the clusters chosen for zooming.\n",
1298 | "is_sub_exemplar = (ds[\"exemplars yes/no\"] == 1) & (ds[\"exemplar sub-cluster\"] != -1)\n",
1299 | "ds_exemps = ds[is_sub_exemplar]\n",
1300 | "mask = ds_exemps[\"cluster\"].isin(clust_to_zoom_list)\n",
1301 | "ds_exemps_of_interest = ds_exemps[mask]"
1302 | ]
1303 | },
1304 | {
1305 | "cell_type": "code",
1306 | "execution_count": null,
1307 | "metadata": {
1308 | "colab": {
1309 | "base_uri": "https://localhost:8080/"
1310 | },
1311 | "id": "TJkN6z9clcSR",
1312 | "outputId": "106b7414-2281-4612-af62-b1d1c73e7422"
1313 | },
1314 | "outputs": [],
1315 | "source": [
1316 | "len(np.unique(np.array(ds_exemps_of_interest[\"cluster XX.YY\"])))"
1317 | ]
1318 | },
1319 | {
1320 | "cell_type": "code",
1321 | "execution_count": null,
1322 | "metadata": {
1323 | "colab": {
1324 | "base_uri": "https://localhost:8080/"
1325 | },
1326 | "id": "fEqyYu9tn8vD",
1327 | "outputId": "92679d2f-605d-4236-ae12-8226e341f9b4"
1328 | },
1329 | "outputs": [],
1330 | "source": [
1331 | "tokenizer = tiktoken.encoding_for_model(\"gpt-3.5-turbo-16k\")\n",
1332 | "len(tokenizer.encode(claude_prompt))"
1333 | ]
1334 | },
1335 | {
1336 | "cell_type": "code",
1337 | "execution_count": null,
1338 | "metadata": {
1339 | "colab": {
1340 | "base_uri": "https://localhost:8080/",
1341 | "height": 214
1342 | },
1343 | "id": "VTTbFvQds-yc",
1344 | "outputId": "798ae301-742c-488a-bd9b-b4e991256fb0"
1345 | },
1346 | "outputs": [],
1347 | "source": [
1348 | "claude_prompt"
1349 | ]
1350 | },
1351 | {
1352 | "cell_type": "code",
1353 | "execution_count": null,
1354 | "metadata": {
1355 | "colab": {
1356 | "base_uri": "https://localhost:8080/"
1357 | },
1358 | "id": "DfAbeTWZp2M5",
1359 | "outputId": "3f9e0d5d-939d-4d00-d170-4efb4071ac08"
1360 | },
1361 | "outputs": [],
1362 | "source": [
1363 | "# saved response from claude-2 conversation\n",
1364 | "\n",
1365 | "text_generation = \"\"\"Sub-Cluster 19: Portraits of characters in lofi style by various artists.\n",
1366 | "\n",
1367 | "Sub-Cluster 17: Symmetry portraits of various people and characters.\n",
1368 | "\n",
1369 | "Sub-Cluster 18: Symmetry sci-fi portraits of characters and people.\n",
1370 | "\n",
1371 | "Sub-Cluster 128: Highly detailed illustrations of people, often describing hair and age.\n",
1372 | "\n",
1373 | "Sub-Cluster 162: Highly detailed illustrations of sadistic or aggressive looking people.\n",
1374 | "\n",
1375 | "Sub-Cluster 163: Highly detailed illustrations of attractive people, often with white hair.\n",
1376 | "\n",
1377 | "Sub-Cluster 75: Highly detailed illustrations of beautiful, fierce, or smug women.\n",
1378 | "\n",
1379 | "Sub-Cluster 76: Art of the League of Legends champion Vi.\n",
1380 | "\n",
1381 | "Sub-Cluster 66: Greg Manchess portrait paintings of various characters as different roles.\n",
1382 | "\n",
1383 | "Sub-Cluster 78: Art and portraits featuring Star Wars characters, especially Darth Vader.\n",
1384 | "\n",
1385 | "Sub-Cluster 91: Portraits and art of female cyborg characters.\n",
1386 | "\n",
1387 | "Sub-Cluster 93: Art and portraits of robots and humanoid AI characters.\n",
1388 | "\n",
1389 | "Sub-Cluster 107: Art of Vladimir Putin being killed or defeated.\n",
1390 | "\n",
1391 | "Sub-Cluster 168: Portraits of Putin and Biden as magical characters.\n",
1392 | "\n",
1393 | "Sub-Cluster 235: Art depicting Vladimir Putin as various monsters, animals, or in humiliating situations.\n",
1394 | "\n",
1395 | "Sub-Cluster 236: Art of Putin with Kim Jong Un's haircut.\n",
1396 | "\n",
1397 | "Sub-Cluster 164: Art of characters like aliens eating hamburgers.\n",
1398 | "\n",
1399 | "Sub-Cluster 191: Art of Final Fantasy 7 character Sephiroth.\n",
1400 | "\n",
1401 | "Sub-Cluster 192: Beautiful, award winning pencil drawings and illustrations.\n",
1402 | "\n",
1403 | "Sub-Cluster 240: Portraits of celebrities eating hamburgers.\n",
1404 | "\n",
1405 | "Sub-Cluster 241: Portraits of various real people and characters eating hamburgers.\n",
1406 | "\n",
1407 | "Sub-Cluster 85: Art and portraits of dragons in various settings.\n",
1408 | "\n",
1409 | "Sub-Cluster 92: Art depicting Donald Trump in various roles and situations.\n",
1410 | "\n",
1411 | "Sub-Cluster 89: Art and portraits of Batman characters.\n",
1412 | "\n",
1413 | "Sub-Cluster 90: Art of Spider-Man and related Marvel characters.\n",
1414 | "\n",
1415 | "Sub-Cluster 135: Award winning portrait commissions.\n",
1416 | "\n",
1417 | "Sub-Cluster 136: Award winning portrait commissions of furry characters.\n",
1418 | "\n",
1419 | "Sub-Cluster 143: Anthropomorphic furry fox characters.\n",
1420 | "\n",
1421 | "Sub-Cluster 184: Trending furry fox character art.\n",
1422 | "\n",
1423 | "Sub-Cluster 185: Beautiful portrait commissions of furry characters.\n",
1424 | "\n",
1425 | "Sub-Cluster 65: Art and portraits of fox characters in various outfits and settings.\n",
1426 | "\n",
1427 | "Sub-Cluster 121: Portraits and art of cats in various styles.\n",
1428 | "\n",
1429 | "Sub-Cluster 146: Portraits of goddesses and divine figures.\n",
1430 | "\n",
1431 | "Sub-Cluster 178: Portraits of Megan Fox as characters from video games.\n",
1432 | "\n",
1433 | "Sub-Cluster 186: Psychedelic and Lovecraftian portraits of Megan Fox.\n",
1434 | "\n",
1435 | "Sub-Cluster 187: Portraits of Megan Fox in various roles and outfits.\n",
1436 | "\n",
1437 | "Sub-Cluster 54: Portraits of Emma Watson in various roles and settings.\n",
1438 | "\n",
1439 | "Sub-Cluster 74: Alexandra Daddario and Megan Fox as Scarlet Witch.\n",
1440 | "\n",
1441 | "Sub-Cluster 84: Futuristic and fantasy vehicle concept art.\n",
1442 | "\n",
1443 | "Sub-Cluster 94: Highly detailed realistic portraits of men.\n",
1444 | "\n",
1445 | "Sub-Cluster 113: Anime girl character portraits and concept art.\n",
1446 | "\n",
1447 | "Sub-Cluster 117: Portraits of beautiful women in various settings.\n",
1448 | "\n",
1449 | "Sub-Cluster 118: Portraits of young women in various outfits and styles.\n",
1450 | "\n",
1451 | "Sub-Cluster 81: Cinematic concept art portraits by Jama Jurabaev.\n",
1452 | "\n",
1453 | "Sub-Cluster 82: Futuristic sci-fi spaceship concept art.\n",
1454 | "\n",
1455 | "Sub-Cluster 125: Concept art of knights and warriors.\n",
1456 | "\n",
1457 | "Sub-Cluster 132: Surreal, cinematic, and futuristic digital art.\n",
1458 | "\n",
1459 | "Sub-Cluster 167: Beautiful landscape paintings and matte art.\n",
1460 | "\n",
1461 | "Sub-Cluster 151: Futuristic cityscape concept art.\"\"\"\n",
1462 | "\n",
1463 | "text_generation = text_generation.split(\"\\n\\n\")\n",
1464 | "len(text_generation)"
1465 | ]
1466 | },
1467 | {
1468 | "cell_type": "code",
1469 | "execution_count": null,
1470 | "metadata": {
1471 | "colab": {
1472 | "base_uri": "https://localhost:8080/"
1473 | },
1474 | "id": "qHa4rgqcrQM0",
1475 | "outputId": "5d6d6cf1-431f-4c19-a3ee-cbe5e37b9e6d"
1476 | },
1477 | "outputs": [],
1478 | "source": [
1479 | "# Pair each sub-cluster key with its saved summary; a count mismatch fails here instead of mis-pairing silently.\n",
1480 | "assert len(subclusts_in_order) == len(text_generation), \"summary count does not match sub-cluster count\"\n",
1481 | "summaries_dict = dict(zip(subclusts_in_order, text_generation))\n",
1482 | "summaries_dict"
1483 | ]
1484 | },
1485 | {
1486 | "cell_type": "code",
1487 | "execution_count": null,
1488 | "metadata": {
1489 | "id": "kI0Qs_ITvNzG"
1490 | },
1491 | "outputs": [],
1492 | "source": [
1493 | "# Sanity check: each saved summary must start with the sub-cluster label of its key.\n",
1494 | "for key, summary in summaries_dict.items():\n",
1495 | "    key_subclust = key.split(\", \")[-1]\n",
1496 | "    summary_subclust = summary.split(\": \")[0]\n",
1497 | "    assert key_subclust == summary_subclust, (\n",
1498 | "        f\"summary {summary_subclust!r} is paired with key {key!r}\"\n",
1499 | "    )"
1500 | ]
1501 | },
1502 | {
1503 | "cell_type": "code",
1504 | "execution_count": null,
1505 | "metadata": {
1506 | "id": "ktj6ipyctjOs"
1507 | },
1508 | "outputs": [],
1509 | "source": [
1510 | "# for i in range(len(text_generation)):\n",
1511 | "# text_generation[i] = text_generation[i].split(\": \")[-1]"
1512 | ]
1513 | },
1514 | {
1515 | "cell_type": "code",
1516 | "execution_count": null,
1517 | "metadata": {
1518 | "colab": {
1519 | "base_uri": "https://localhost:8080/"
1520 | },
1521 | "id": "CmQLnLiXtf7u",
1522 | "outputId": "dce2694a-49b6-4a1a-a956-15176a1d8798"
1523 | },
1524 | "outputs": [],
1525 | "source": [
1526 | "# NOTE(review): the prefix-stripping cell before this one is commented out, so the\n",
1527 | "# values stored here still begin with their sub-cluster label.\n",
1528 | "summaries_dict_cleaned = {key: text_generation[pos] for pos, key in enumerate(subclusts_in_order)}\n",
1529 | "summaries_dict_cleaned"
1530 | ]
1531 | },
1532 | {
1533 | "cell_type": "code",
1534 | "execution_count": null,
1535 | "metadata": {
1536 | "colab": {
1537 | "base_uri": "https://localhost:8080/"
1538 | },
1539 | "id": "NQ8XhfRZuo8l",
1540 | "outputId": "365d59d2-f01b-442d-d01e-7cc5baa3e625"
1541 | },
1542 | "outputs": [],
1543 | "source": [
1544 | "# Look up every row's theme from its \"cluster XX.YY\" label in one vectorised pass.\n",
1545 | "# Labels without a summary map to NaN, and the \"theme\" column always exists\n",
1546 | "# afterwards, even when no label matches.\n",
1547 | "theme_by_row = ds[\"cluster XX.YY\"].map(summaries_dict_cleaned)\n",
1548 | "ds[\"theme\"] = theme_by_row"
1549 | ]
1550 | },
1551 | {
1552 | "cell_type": "code",
1553 | "execution_count": null,
1554 | "metadata": {
1555 | "colab": {
1556 | "base_uri": "https://localhost:8080/",
1557 | "height": 35
1558 | },
1559 | "id": "RgKbnRtut3YG",
1560 | "outputId": "125d5a2f-164e-4b9c-c0bd-c0325bfc8199"
1561 | },
1562 | "outputs": [],
1563 | "source": [
1564 | "ds[\"theme\"][ds[\"cluster XX.YY\"] == \"Cluster 77, Sub-Cluster 151\"].iloc[0]"
1565 | ]
1566 | },
1567 | {
1568 | "cell_type": "markdown",
1569 | "metadata": {
1570 | "id": "0CYEH057uyj8"
1571 | },
1572 | "source": [
1573 | "## Write final df results to disk"
1574 | ]
1575 | },
1576 | {
1577 | "cell_type": "code",
1578 | "execution_count": null,
1579 | "metadata": {
1580 | "colab": {
1581 | "base_uri": "https://localhost:8080/",
1582 | "height": 1000
1583 | },
1584 | "id": "Yjqesxe_oitS",
1585 | "outputId": "620d1909-6874-4e4a-a988-3a6f2df64c94"
1586 | },
1587 | "outputs": [],
1588 | "source": [
1589 | "# write final ds to disk\n",
1590 | "results_file = \"stable_diffusion_prompts_dataframe_cached_with_results.xlsx\"\n",
1591 | "f_name = os.path.join(args.cache_folder, results_file)\n",
1592 | "print(f_name, \"\\n\")\n",
1593 | "\n",
1594 | "# re-order cols\n",
1595 | "# Selecting by this list also drops every column that is not named in it.\n",
1596 | "final_columns = [\n",
1597 | "    \"id\",\n",
1598 | "    \"cluster\",\n",
1599 | "    \"x\",\n",
1600 | "    \"y\",\n",
1601 | "    \"cluster membership prob\",\n",
1602 | "    \"exemplars yes/no\",\n",
1603 | "    \"exemplar sub-cluster\",\n",
1604 | "    \"cluster XX.YY\",\n",
1605 | "    \"theme\",\n",
1606 | "    \"Prompt\",\n",
1607 | "]\n",
1608 | "\n",
1609 | "ds = ds[final_columns]\n",
1610 | "ds"
1611 | ]
1612 | },
1613 | {
1614 | "cell_type": "code",
1615 | "execution_count": null,
1616 | "metadata": {
1617 | "id": "CuoIn4KCvyPR"
1618 | },
1619 | "outputs": [],
1620 | "source": [
1621 | "# write with adjusted col width\n",
1622 | "# Runs unconditionally: the `if not args.inference:` guard is disabled in this notebook.\n",
1623 | "sheet = \"All Prompts\"\n",
1624 | "with pd.ExcelWriter(f_name) as writer:\n",
1625 | "    ds.to_excel(writer, sheet_name=sheet)\n",
1626 | "    auto_adjust_xlsx_column_width(ds, writer, sheet_name=sheet, margin=1)"
1627 | ]
1628 | },
1629 | {
1630 | "cell_type": "markdown",
1631 | "metadata": {
1632 | "id": "CNrkolSLu1tg"
1633 | },
1634 | "source": [
1635 | "## Format a JSON viz graph"
1636 | ]
1637 | },
1638 | {
1639 | "cell_type": "code",
1640 | "execution_count": null,
1641 | "metadata": {
1642 | "colab": {
1643 | "base_uri": "https://localhost:8080/",
1644 | "height": 35
1645 | },
1646 | "id": "DGsJa7GrN2cf",
1647 | "outputId": "65f53e9b-0a91-47ad-c800-9ff6db6e5e2e"
1648 | },
1649 | "outputs": [],
1650 | "source": [
1651 | "args.cache_folder"
1652 | ]
1653 | },
1654 | {
1655 | "cell_type": "code",
1656 | "execution_count": null,
1657 | "metadata": {
1658 | "colab": {
1659 | "base_uri": "https://localhost:8080/",
1660 | "height": 1000
1661 | },
1662 | "id": "bUhn-leKp0Yv",
1663 | "outputId": "ca475888-d96d-4f5a-d23d-d80c7e693172"
1664 | },
1665 | "outputs": [],
1666 | "source": [
1667 | "# optional ds cached loading\n",
1668 | "results_path = os.path.join(\n",
1669 | "    args.cache_folder, \"stable_diffusion_prompts_dataframe_cached_with_results.xlsx\"\n",
1670 | ")\n",
1671 | "# \"Unnamed: 0\" is presumably the unnamed index column written by ds.to_excel;\n",
1672 | "# it becomes the index again here.\n",
1673 | "ds_loaded = pd.read_excel(results_path, index_col=\"Unnamed: 0\")\n",
1674 | "ds_loaded"
1675 | ]
1676 | },
1677 | {
1678 | "cell_type": "code",
1679 | "execution_count": null,
1680 | "metadata": {
1681 | "colab": {
1682 | "base_uri": "https://localhost:8080/",
1683 | "height": 1000
1684 | },
1685 | "id": "829s5RJtxJ2j",
1686 | "outputId": "a37154d2-a6db-4ead-e0ba-a7600a18f100"
1687 | },
1688 | "outputs": [],
1689 | "source": [
1690 | "ds_clust = ds_loaded[ds_loaded[\"theme\"].notna()]\n",
1691 | "ds_clust"
1692 | ]
1693 | },
1694 | {
1695 | "cell_type": "code",
1696 | "execution_count": null,
1697 | "metadata": {
1698 | "colab": {
1699 | "base_uri": "https://localhost:8080/"
1700 | },
1701 | "id": "Q_H93u9QuC2u",
1702 | "outputId": "84bc7bd9-3eed-4f6f-cd74-a32291211c67"
1703 | },
1704 | "outputs": [],
1705 | "source": [
1706 | "len(np.unique(np.array(ds_clust[\"cluster XX.YY\"])))"
1707 | ]
1708 | },
1709 | {
1710 | "cell_type": "code",
1711 | "execution_count": null,
1712 | "metadata": {
1713 | "id": "oc1HFPyBxnck"
1714 | },
1715 | "outputs": [],
1716 | "source": [
1717 | "# Build one summary entry per themed sub-cluster.\n",
1718 | "knowledge_graphs = []\n",
1719 | "\n",
1720 | "# np.unique yields the \"cluster XX.YY\" labels de-duplicated and in sorted order.\n",
1721 | "for sub_clust in np.unique(np.array(ds_clust[\"cluster XX.YY\"])):\n",
1722 | "    ds_inner = ds_clust[ds_clust[\"cluster XX.YY\"] == sub_clust]\n",
1723 | "\n",
1724 | "    # Exemplar prompts of this sub-cluster. They are only consumed by the\n",
1725 | "    # commented-out \"exemplars\" field below.\n",
1726 | "    inner_exemplars = ds_inner[ds_inner[\"exemplars yes/no\"] == 1]\n",
1727 | "    prompts = [\n",
1728 | "        {\"Prompt\": prompt, \"id\": float(prompt_id)}\n",
1729 | "        for prompt, prompt_id in zip(inner_exemplars[\"Prompt\"], inner_exemplars[\"id\"])\n",
1730 | "    ]\n",
1731 | "\n",
1732 | "    # Every row of a sub-cluster shares its theme and label, so the first row suffices.\n",
1733 | "    first_row = ds_inner.iloc[0]\n",
1734 | "    viz = {\n",
1735 | "        \"core theme\": first_row[\"theme\"],\n",
1736 | "        \"cluster id\": first_row[\"cluster XX.YY\"],\n",
1737 | "        # \"frequency\": str(np.round(100 * len(ds_inner) / len(ds), 2)) + \"%\",\n",
1738 | "        # \"count\": len(ds_inner),\n",
1739 | "        # \"exemplars\": prompts,\n",
1740 | "    }\n",
1741 | "\n",
1742 | "    knowledge_graphs.append(viz)"
1743 | ]
1744 | },
1745 | {
1746 | "cell_type": "code",
1747 | "execution_count": null,
1748 | "metadata": {
1749 | "colab": {
1750 | "base_uri": "https://localhost:8080/"
1751 | },
1752 | "id": "iwpVNL_vOze_",
1753 | "outputId": "1b9c8202-67a1-4aad-b398-7cea61e36bb0"
1754 | },
1755 | "outputs": [],
1756 | "source": [
1757 | "# Pretty-print every entry. Iterating the list directly avoids recomputing\n",
1758 | "# np.unique on each pass and the unused sub_clust lookup.\n",
1759 | "for viz in knowledge_graphs:\n",
1760 | "    # Serializing json\n",
1761 | "    json_object = json.dumps(viz, indent=4)\n",
1762 | "    print(json_object)\n",
1763 | "    print(\"\\n\\n\")"
1764 | ]
1765 | },
1766 | {
1767 | "cell_type": "code",
1768 | "execution_count": null,
1769 | "metadata": {
1770 | "id": "uqu2JMCcw3I4"
1771 | },
1772 | "outputs": [],
1773 | "source": [
1774 | "# summaries_dict_cleaned[\"Cluster 75, Sub-Cluster 167\"]"
1775 | ]
1776 | },
1777 | {
1778 | "cell_type": "code",
1779 | "execution_count": null,
1780 | "metadata": {
1781 | "id": "sr7BqFUM5E96"
1782 | },
1783 | "outputs": [],
1784 | "source": [
1785 | "# Map every themed cluster id to the sorted, de-duplicated themes found in it.\n",
1786 | "summaries_dict_cluster_level = {\n",
1787 | "    clust: list(np.unique(np.array(ds_clust.loc[ds_clust[\"cluster\"] == clust, \"theme\"])))\n",
1788 | "    for clust in np.unique(np.array(ds_clust[\"cluster\"]))\n",
1789 | "}"
1790 | ]
1791 | },
1792 | {
1793 | "cell_type": "code",
1794 | "execution_count": null,
1795 | "metadata": {
1796 | "colab": {
1797 | "base_uri": "https://localhost:8080/"
1798 | },
1799 | "id": "_Oe7fGdKyK7q",
1800 | "outputId": "66f7173a-fc25-42d3-d380-9fc3ba50bfa0"
1801 | },
1802 | "outputs": [],
1803 | "source": [
1804 | "summaries_dict_cluster_level"
1805 | ]
1806 | },
1807 | {
1808 | "cell_type": "code",
1809 | "execution_count": null,
1810 | "metadata": {
1811 | "colab": {
1812 | "base_uri": "https://localhost:8080/"
1813 | },
1814 | "id": "Ju1VNC4w5yQD",
1815 | "outputId": "a5514fc5-dcd2-4c41-ddd5-3ceb8f2d9e0e"
1816 | },
1817 | "outputs": [],
1818 | "source": [
1819 | "summaries_dict_cluster_level[10]"
1820 | ]
1821 | },
1822 | {
1823 | "cell_type": "code",
1824 | "execution_count": null,
1825 | "metadata": {
1826 | "id": "rX2glhje2sYI"
1827 | },
1828 | "outputs": [],
1829 | "source": [
1830 | "knowledge_graphs = []\n",
1831 | "\n",
1832 | "# These lookups are loop-invariant, so compute them once up front.\n",
1833 | "cluster_counts = ds_loaded[\"cluster\"].value_counts()\n",
1834 | "themed_clusters = set(np.unique(np.array(ds_clust[\"cluster\"])))\n",
1835 | "n_total = len(ds_loaded)\n",
1836 | "\n",
1837 | "# value_counts() sorts by descending count, so entries are appended largest-first.\n",
1838 | "for clust, n_rows in cluster_counts.items():\n",
1839 | "    # Only clusters that received at least one theme are reported.\n",
1840 | "    if clust not in themed_clusters:\n",
1841 | "        continue\n",
1842 | "\n",
1843 | "    # The count is stored as a float, as in the original output format.\n",
1844 | "    count = float(n_rows)\n",
1845 | "    viz = {\n",
1846 | "        \"cluster id\": \"Cluster \" + str(clust),\n",
1847 | "        \"count\": count,\n",
1848 | "        \"frequency\": str(np.round(100 * count / n_total, 2)) + \"%\",\n",
1849 | "        \"core theme\": summaries_dict_cluster_level[clust],\n",
1850 | "    }\n",
1851 | "\n",
1852 | "    knowledge_graphs.append(viz)"
1853 | ]
1854 | },
1855 | {
1856 | "cell_type": "code",
1857 | "execution_count": null,
1858 | "metadata": {
1859 | "colab": {
1860 | "base_uri": "https://localhost:8080/"
1861 | },
1862 | "id": "_lgrhx7H4iC7",
1863 | "outputId": "19b187c7-a1b9-44d6-cee7-d63dce526198"
1864 | },
1865 | "outputs": [],
1866 | "source": [
1867 | "len(knowledge_graphs)"
1868 | ]
1869 | },
1870 | {
1871 | "cell_type": "code",
1872 | "execution_count": null,
1873 | "metadata": {
1874 | "id": "MKfP4tMj8PHL"
1875 | },
1876 | "outputs": [],
1877 | "source": [
1878 | "knowledge_graphs = {\"knowledge graph\": knowledge_graphs}"
1879 | ]
1880 | },
1881 | {
1882 | "cell_type": "code",
1883 | "execution_count": null,
1884 | "metadata": {
1885 | "colab": {
1886 | "base_uri": "https://localhost:8080/"
1887 | },
1888 | "id": "Gi-uJ6Ds2sYJ",
1889 | "outputId": "4c00ec28-6aa5-497e-ccc8-fb823fa947bf"
1890 | },
1891 | "outputs": [],
1892 | "source": [
1893 | "# Serializing json\n",
1894 | "json_object = json.dumps(knowledge_graphs, indent=4)\n",
1895 | "print(json_object)\n",
1896 | "print(\"\\n\\n\")"
1897 | ]
1898 | },
1899 | {
1900 | "cell_type": "markdown",
1901 | "metadata": {
1902 | "id": "p3NXLZVOPpFk"
1903 | },
1904 | "source": [
1905 | "## Drift detection on the top 25 clusters\n"
1906 | ]
1907 | },
1908 | {
1909 | "cell_type": "code",
1910 | "execution_count": null,
1911 | "metadata": {
1912 | "colab": {
1913 | "base_uri": "https://localhost:8080/",
1914 | "height": 424
1915 | },
1916 | "id": "t_zMrCONOhpd",
1917 | "outputId": "bf2461d3-e5c4-477f-8655-053096f19a99"
1918 | },
1919 | "outputs": [],
1920 | "source": [
1921 | "# Load the \"test\" split as a DataFrame and keep an id column plus the prompt text.\n",
1922 | "ds_hf = load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")\n",
1923 | "ds = ds_hf[\"test\"].to_pandas()\n",
1924 | "\n",
1925 | "ds[\"id\"] = ds.index\n",
1926 | "ds = ds[[\"id\", \"Prompt\"]]\n",
1927 | "ds"
1928 | ]
1929 | },
1930 | {
1931 | "cell_type": "code",
1932 | "execution_count": null,
1933 | "metadata": {
1934 | "colab": {
1935 | "base_uri": "https://localhost:8080/"
1936 | },
1937 | "id": "9KgCWCIlvsFn",
1938 | "outputId": "4069f096-8b20-4ba5-9fb9-2a2c064c8208"
1939 | },
1940 | "outputs": [],
1941 | "source": [
1942 | "args.inference = True\n",
1943 | "args"
1944 | ]
1945 | },
1946 | {
1947 | "cell_type": "code",
1948 | "execution_count": null,
1949 | "metadata": {
1950 | "id": "8B2Ay5OfvftL"
1951 | },
1952 | "outputs": [],
1953 | "source": [
1954 | "# Embed Text\n",
1955 | "# * See [pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information\n",
1956 | "\n",
1957 | "model = SentenceTransformer(\"all-mpnet-base-v2\")\n",
1958 | "\n",
1959 | "# Embeddings are only recomputed outside inference mode: one 768-wide row per prompt.\n",
1960 | "if not args.inference:\n",
1961 | "    embeddings = torch.zeros([len(ds), 768])\n",
1962 | "    for row in tqdm(range(len(ds))):\n",
1963 | "        embeddings[row, :] = model.encode(ds.loc[row, \"Prompt\"], convert_to_tensor=True)\n",
1964 | "    embeddings"
1965 | ]
1966 | },
1967 | {
1968 | "cell_type": "code",
1969 | "execution_count": null,
1970 | "metadata": {
1971 | "colab": {
1972 | "base_uri": "https://localhost:8080/"
1973 | },
1974 | "id": "PshHjy3NvgBA",
1975 | "outputId": "7805b3cb-7d9b-4ee6-93ae-bf771f755f2a"
1976 | },
1977 | "outputs": [],
1978 | "source": [
1979 | "embeddings_file = \"stable_diffusion_prompts_test_embeddings_all_mpnet_base_v2.pt\"\n",
1980 | "f_name = os.path.join(args.cache_folder, embeddings_file)\n",
1981 | "print(f_name, \"\\n\")\n",
1982 | "\n",
1983 | "# Outside inference mode the fresh embeddings are saved first; the file is read back in both modes.\n",
1984 | "if not args.inference:\n",
1985 | "    torch.save(embeddings, f_name)\n",
1986 | "loaded_embeddings = torch.load(f_name)\n",
1987 | "loaded_embeddings"
1988 | ]
1989 | },
1990 | {
1991 | "cell_type": "code",
1992 | "execution_count": null,
1993 | "metadata": {
1994 | "colab": {
1995 | "base_uri": "https://localhost:8080/"
1996 | },
1997 | "id": "HbsirlY0vgBB",
1998 | "outputId": "08e5f4fb-32a9-4e04-ca70-f2e4a407d5d1"
1999 | },
2000 | "outputs": [],
2001 | "source": [
2002 | "loaded_embeddings.shape"
2003 | ]
2004 | },
2005 | {
2006 | "cell_type": "code",
2007 | "execution_count": null,
2008 | "metadata": {
2009 | "id": "hvU350V2vgBB"
2010 | },
2011 | "outputs": [],
2012 | "source": [
2013 | "if not args.inference:  # a bare expression here is neither displayed nor checked, so assert it\n",
2014 | "    assert torch.equal(loaded_embeddings.cpu(), embeddings.cpu()), \"cached embeddings differ from the freshly computed ones\""
2015 | ]
2016 | },
2017 | {
2018 | "cell_type": "code",
2019 | "execution_count": null,
2020 | "metadata": {
2021 | "colab": {
2022 | "base_uri": "https://localhost:8080/"
2023 | },
2024 | "id": "leU5pK6mvgBB",
2025 | "outputId": "7546ce0b-3f7e-421f-edcd-b0063b51da79"
2026 | },
2027 | "outputs": [],
2028 | "source": [
2029 | "test_itr = random.randint(low=0, high=len(ds))\n",
2030 | "test_itr"
2031 | ]
2032 | },
2033 | {
2034 | "cell_type": "code",
2035 | "execution_count": null,
2036 | "metadata": {
2037 | "colab": {
2038 | "base_uri": "https://localhost:8080/",
2039 | "height": 71
2040 | },
2041 | "id": "M_M-XVkpvgBB",
2042 | "outputId": "3dc80f2c-06c8-4def-c70e-63720ff52f45"
2043 | },
2044 | "outputs": [],
2045 | "source": [
2046 | "# test embeddings worked\n",
2047 | "ds.loc[test_itr, \"Prompt\"]"
2048 | ]
2049 | },
2050 | {
2051 | "cell_type": "code",
2052 | "execution_count": null,
2053 | "metadata": {
2054 | "colab": {
2055 | "base_uri": "https://localhost:8080/"
2056 | },
2057 | "id": "iBffCA3NvgBB",
2058 | "outputId": "36a9dd66-77d2-44fd-c07b-6b250f955258"
2059 | },
2060 | "outputs": [],
2061 | "source": [
2062 | "test_emb = model.encode(ds.loc[test_itr, \"Prompt\"], convert_to_tensor=True)\n",
2063 | "a = np.array(test_emb.cpu())\n",
2064 | "b = np.array(loaded_embeddings[test_itr, :].cpu())\n",
2065 | "np.allclose(a, b, rtol=1e-02)"
2066 | ]
2067 | },
2068 | {
2069 | "cell_type": "code",
2070 | "execution_count": null,
2071 | "metadata": {
2072 | "colab": {
2073 | "base_uri": "https://localhost:8080/"
2074 | },
2075 | "id": "xgUPGdAfv62x",
2076 | "outputId": "467a7ec8-ce53-4a8f-e3de-ec524a7d9a76"
2077 | },
2078 | "outputs": [],
2079 | "source": [
2080 | "# Dimensionality reduction from 768 to 15 with the cached reducer (the step before clustering).\n",
2081 | "f_name = os.path.join(args.cache_folder, \"reducer_umap_15.pkl\")\n",
2082 | "print(f_name, \"\\n\")\n",
2083 | "\n",
2084 | "# NOTE(security): pickle.load can execute arbitrary code from the file; only load trusted caches.\n",
2085 | "# The context manager closes the file handle, which a bare open() call would leak.\n",
2086 | "with open(f_name, \"rb\") as reducer_file:\n",
2087 | "    loaded_reducer_15 = pickle.load(reducer_file)\n",
2088 | "\n",
2089 | "embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)\n",
2090 | "embeddings_umap_dim_15.shape"
2091 | ]
2092 | },
2093 | {
2094 | "cell_type": "code",
2095 | "execution_count": null,
2096 | "metadata": {
2097 | "colab": {
2098 | "base_uri": "https://localhost:8080/",
2099 | "height": 130
2100 | },
2101 | "id": "-PM40fYpxPRA",
2102 | "outputId": "b5058681-c52e-4fdf-c17a-75731ce94fae"
2103 | },
2104 | "outputs": [],
2105 | "source": [
2106 | "f_name = os.path.join(args.cache_folder, \"clusterer_hdbscan.pkl\")\n",
2107 | "print(f_name, \"\\n\")\n",
2108 | "with open(f_name, \"rb\") as clusterer_file:  # closes the handle; NOTE(security): only unpickle trusted files\n",
2109 | "    loaded_clusterer = pickle.load(clusterer_file)\n",
2110 | "loaded_clusterer"
2111 | ]
2112 | },
2113 | {
2114 | "cell_type": "code",
2115 | "execution_count": null,
2116 | "metadata": {
2117 | "colab": {
2118 | "base_uri": "https://localhost:8080/"
2119 | },
2120 | "id": "8dPZ614OxJoe",
2121 | "outputId": "242b4da9-bdf5-4109-fb46-0719291bd9b6"
2122 | },
2123 | "outputs": [],
2124 | "source": [
2125 | "test_labels, strengths = hdbscan.approximate_predict(\n",
2126 | " loaded_clusterer, embeddings_umap_dim_15\n",
2127 | ")\n",
2128 | "test_labels"
2129 | ]
2130 | },
2131 | {
2132 | "cell_type": "code",
2133 | "execution_count": null,
2134 | "metadata": {
2135 | "colab": {
2136 | "base_uri": "https://localhost:8080/",
2137 | "height": 179
2138 | },
2139 | "id": "fffVi_HuweCQ",
2140 | "outputId": "b26cee5f-0fa5-4203-cec9-6945841be352"
2141 | },
2142 | "outputs": [],
2143 | "source": [
2144 | "pd.Series(test_labels).value_counts()[1:26]"
2145 | ]
2146 | },
2147 | {
2148 | "cell_type": "code",
2149 | "execution_count": null,
2150 | "metadata": {
2151 | "id": "SKC6g7yHweG6"
2152 | },
2153 | "outputs": [],
2154 | "source": []
2155 | }
2156 | ],
2157 | "metadata": {
2158 | "accelerator": "GPU",
2159 | "colab": {
2160 | "gpuType": "T4",
2161 | "machine_shape": "hm",
2162 | "provenance": []
2163 | },
2164 | "interpreter": {
2165 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754"
2166 | },
2167 | "kernelspec": {
2168 | "display_name": "Python 3",
2169 | "name": "python3"
2170 | },
2171 | "language_info": {
2172 | "codemirror_mode": {
2173 | "name": "ipython",
2174 | "version": 3
2175 | },
2176 | "file_extension": ".py",
2177 | "mimetype": "text/x-python",
2178 | "name": "python",
2179 | "nbconvert_exporter": "python",
2180 | "pygments_lexer": "ipython3",
2181 | "version": "3.10.10"
2182 | },
2183 | "orig_nbformat": 4
2184 | },
2185 | "nbformat": 4,
2186 | "nbformat_minor": 0
2187 | }
2188 |
--------------------------------------------------------------------------------