├── experiments
    ├── 02_09_2023_16_54_32
    │   ├── reducer_umap_15.pkl
    │   ├── reducer_umap_2.pkl
    │   ├── clusterer_hdbscan.pkl
    │   ├── assets
    │   │   ├── clusters_viz_1.png
    │   │   ├── exemplars_viz_1.png
    │   │   ├── exemplars_viz_2.png
    │   │   ├── cluster0_subcluster0.png
    │   │   ├── cluster0_subcluster1.png
    │   │   ├── cluster1_subcluster2.png
    │   │   ├── cluster1_subcluster3.png
    │   │   ├── cluster1_subcluster4.png
    │   │   └── cluster1_subcluster5.png
    │   ├── clusterer_subs_hdbscan.pkl
    │   ├── prompts_embeddings_all_mpnet_base_v2.pt
    │   └── prompts_dataframe_cached_with_results.xlsx
    ├── 03_09_2023_15_14_39
    │   ├── reducer_umap_15.pkl
    │   ├── reducer_umap_2.pkl
    │   ├── clusterer_hdbscan.pkl
    │   ├── clusterer_subs_hdbscan.pkl
    │   ├── prompts_embeddings_all_mpnet_base_v2.pt
    │   └── prompts_dataframe_cached_with_results.xlsx
    └── 04_09_2023_03_02_25
    │   └── assets
    │       ├── aspens_runway.jpeg
    │       ├── batman_midjourney.png
    │       ├── selected_5_themes.png
    │       ├── futuristic_car_midjourney.png
    │       ├── selected_25_cluster_themes.png
    │       └── traveler_wanderer_runway.jpeg
├── requirements.txt
├── .gitignore
├── README.md
├── LICENSE
└── notebooks
    └── stable-diffusion-prompts-clustering.ipynb


/experiments/02_09_2023_16_54_32/reducer_umap_15.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/reducer_umap_15.pkl


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/reducer_umap_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/reducer_umap_2.pkl


--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/reducer_umap_15.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/reducer_umap_15.pkl


--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/reducer_umap_2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/reducer_umap_2.pkl


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/clusterer_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/clusterer_hdbscan.pkl


--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/clusterer_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/clusterer_hdbscan.pkl


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png


--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/aspens_runway.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/aspens_runway.jpeg


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/clusterer_subs_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/clusterer_subs_hdbscan.pkl


--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/clusterer_subs_hdbscan.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/clusterer_subs_hdbscan.pkl


--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/batman_midjourney.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/batman_midjourney.png


--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/selected_5_themes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/selected_5_themes.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster1.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster2.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster3.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster4.png


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster5.png


--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/futuristic_car_midjourney.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/futuristic_car_midjourney.png


--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/selected_25_cluster_themes.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/selected_25_cluster_themes.png


--------------------------------------------------------------------------------
/experiments/04_09_2023_03_02_25/assets/traveler_wanderer_runway.jpeg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/traveler_wanderer_runway.jpeg


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | tqdm
 2 | datasets
 3 | umap-learn
 4 | hdbscan
 5 | sentence-transformers
 6 | numpy
 7 | torch
 8 | openai
 9 | pandas
10 | openpyxl
11 | seaborn
12 | plotly 
13 | UliPlot
14 | tiktoken
15 | cleantext


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/prompts_embeddings_all_mpnet_base_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/prompts_embeddings_all_mpnet_base_v2.pt


--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/prompts_embeddings_all_mpnet_base_v2.pt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/prompts_embeddings_all_mpnet_base_v2.pt


--------------------------------------------------------------------------------
/experiments/02_09_2023_16_54_32/prompts_dataframe_cached_with_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/prompts_dataframe_cached_with_results.xlsx


--------------------------------------------------------------------------------
/experiments/03_09_2023_15_14_39/prompts_dataframe_cached_with_results.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/prompts_dataframe_cached_with_results.xlsx


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/#use-with-ide
110 | .pdm.toml
111 | 
112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113 | __pypackages__/
114 | 
115 | # Celery stuff
116 | celerybeat-schedule
117 | celerybeat.pid
118 | 
119 | # SageMath parsed files
120 | *.sage.py
121 | 
122 | # Environments
123 | .env
124 | .venv*
125 | env/
126 | venv/
127 | ENV/
128 | env.bak/
129 | venv.bak/
130 | 
131 | # Spyder project settings
132 | .spyderproject
133 | .spyproject
134 | 
135 | # Rope project settings
136 | .ropeproject
137 | 
138 | # mkdocs documentation
139 | /site
140 | 
141 | # mypy
142 | .mypy_cache/
143 | .dmypy.json
144 | dmypy.json
145 | 
146 | # Pyre type checker
147 | .pyre/
148 | 
149 | # pytype static type analyzer
150 | .pytype/
151 | 
152 | # Cython debug symbols
153 | cython_debug/
154 | 
155 | # PyCharm
156 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
159 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
160 | #.idea/
161 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Text clustering: HDBSCAN is probably all you need
  2 | 
  3 | [![License](https://img.shields.io/badge/License-Apache_2.0-green.svg)](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/blob/main/LICENSE) 
  4 | [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/release/python-390/) 
  5 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 
  6 | 
  7 | ## Goal
  8 | 
  9 | Segment common items in a text dataset to pinpoint core themes and their distribution. 
 10 | 
 11 | * Clusters cover the main topics/subtopics in the dataset
 12 | * Clusters backed by accurate, LLM-generated summaries
 13 | 
 14 | ## Background
 15 | 
 16 | We employ [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/index.html) for probabilistic clustering. This algorithm is advantageous in many ways, including:
 17 | 
 18 | * Don’t be wrong: Clusters can have varying densities, don’t need to be globular, and won’t include noise
 19 | * Intuitive parameters: Choosing a minimum cluster size is very reasonable, and the number of clusters *k* does not need to be specified (HDBSCAN finds the optimal *k* for you)
 20 | * Stability: HDBSCAN is stable over runs and subsampling and has good stability over parameter choices
 21 | * Performance: When implemented well HDBSCAN can be very efficient; the current implementation has similar performance to fastcluster’s agglomerative clustering
 22 | 
 23 | See the HDBSCAN docs on [comparing clustering algorithms](https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html#hdbscan) and [how hdbscan works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for more information.
 24 | 
 25 | ## Citations
 26 | 
 27 | * Datasets
 28 |     * [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts)
 29 |     * [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts)  
 30 | * Embedding models
 31 |     * [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2)
 32 | 
 33 | ## Experiments
 34 | 
 35 | ## 1. Visualizing core themes in [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts)
 36 | 
 37 | These figures correspond to [`experiments/02_09_2023_16_54_32`](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/tree/main/experiments/02_09_2023_16_54_32)
 38 | 
 39 | <a target="_blank" href="https://colab.research.google.com/github/daniel-furman/awesome-chatgpt-prompts-clustering/blob/main/notebooks/awesome-chatgpt-prompts-clustering.ipynb"> <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/> </a>
 40 | 
 41 | ---
 42 | 
 43 | ![](experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png)
 44 | 
 45 | **Figure 1**. HDBSCAN splits the 153 text to text prompts from [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts) into two clusters: Cluster 1 with 44 prompts (orange) and Cluster 2 with 105 prompts (blue). The 4 remaining prompts (gray) were filtered out as outliers/noise.
 46 | 
 47 | ![](experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png)
 48 | 
 49 | **Figure 2**. The most persistent prompts in each leaf cluster are known as "exemplars". These represent the hearts around which the ultimate cluster formed. See the HDBSCAN docs on [soft clustering explanation](https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html#distance-based-membership) for supporting information and functions.
 50 | 
 51 | ![](experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png)
 52 | 
 53 | **Figure 3**. Additional clustering is conducted around the exemplars to identify sub-topics in the dataset. The cases in each sub-cluster subsequently serve as retrieved context for the LLM theme summarization calls below.
 54 | 
 55 | ![](experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png)
 56 | 
 57 | **Figure 4**. Visualizing the "*Computer Programming and Software Development*" theme, which covers 13% of the dataset. The summary was generated by [gpt-3.5-turbo-16k](https://platform.openai.com/docs/models/gpt-3-5). The above was created with [jsoncrack.com/editor](https://jsoncrack.com/editor).
 58 | 
 59 | 
 60 | <br>
 61 | 
 62 | ## 2. Drift detection for [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts)
 63 | 
 64 | These figures correspond to [`experiments/04_09_2023_03_02_25`](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/tree/main/experiments/04_09_2023_03_02_25)
 65 | 
 66 | <a target="_blank" href="https://colab.research.google.com/github/daniel-furman/awesome-chatgpt-prompts-clustering/blob/main/notebooks/stable-diffusion-prompts-clustering.ipynb">
 67 |   <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
 68 | </a>
 69 | 
 70 | ---
 71 | 
 72 | HDBSCAN splits the 73,718 text to image prompts from [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) into 78 clusters with 25,019 (34%) of the dataset represented. The remaining 48,699 (66%) were filtered out as outliers/noise. The 5 largest clusters cover 9.5% of the dataset - these are the segments we will examine for drift below. 
 73 | 
 74 | | cluster id | theme |
 75 | |------------|--------------|
 76 | | 56         | Portraits and artistic depictions of female anime characters, beautiful women, and fashionable young women |
 77 | | 13         | Symmetrical portraits of people, characters, and sci-fi figures                                            |
 78 | | 61         | Futuristic sci-fi spaceship concept art                                                                    |
 79 | | 50         | Portraits of famous actresses as characters in various roles, outfits, and styles                          |
 80 | | 74         | Surreal, cinematic, and futuristic digital art                                                             |
 81 | 
 82 | | cluster id | train count<br />(73.7k rows) | test count<br />(8.19k rows) | drift detection<br />(% change) |
 83 | |------------|-------------------------------|------------------------------|------------------|
 84 | | 56         | 2530 (3.43%)                  | 310 (3.79%)                  | 10.50            |
 85 | | 13         | 1343 (1.82%)                  | 149 (1.82%)                  | 0.00             |
 86 | | 61         | 1287 (1.75%)                  | 131 (1.60%)                  | -8.57            |
 87 | | 50         | 1055 (1.43%)                  | 135 (1.65%)                  | 15.38            |
 88 | | 74         | 749 (1.02%)                   | 109 (1.33%)                  | 30.39            |
 89 | 
 90 | 
 91 | **Tables 1 & 2**. Drift detection for the top 5 largest clusters (bottom), alongside their [claude-2](https://claude.ai/) summaries (top).
 92 | 
 93 | <br>
 94 | 
 95 | <p align="center"> <img src="experiments/04_09_2023_03_02_25/assets/aspens_runway.jpeg" width="550" height="366"/> </p>
 96 | 
 97 | **Prompt**: "*Beautiful painting of an Aspen forest at sunset, digital art, award winning illustration, golden hour, smooth, sharp lines, concept art, trending on artstation*" <br>
 98 | **Model**: [Runway Gen-2](https://app.runwayml.com/video-tools/teams/dryanfurman/ai-tools/text-to-image) (accessed by Daniel Furman on Sep 4, 2023) <br>
 99 | **Theme**: Beautiful landscape paintings and matte art  (cluster id: 75)<br>
100 | 
101 | <br>
102 | 
103 | <p align="center"> <img src="experiments/04_09_2023_03_02_25/assets/batman_midjourney.png" width="375" height="375"/> </p>
104 | 
105 | **Prompt**: "*Futuristic batman, brush strokes, oil painting, greg rutkowski*" <br>
106 | **Model**: [Midjourney V5.2](https://www.midjourney.com/app/) (accessed by Daniel Furman on Sep 4, 2023) <br>
107 | **Theme**: Art and portraits of Batman characters (cluster id: 41)<br>
108 | 
109 | <p align="center"> <img src="experiments/04_09_2023_03_02_25/assets/futuristic_car_midjourney.png" width="500" height="384"/> </p>
110 | 
111 | **Prompt**: "*Futuristic Porsche designed by Apple, a detailed matte painting by Kitagawa Utamaro, cgsociety, octane render, highly detailed, matte painting, concept art, sci-fi*" <br>
112 | **Model**: [Midjourney V5.2](https://www.midjourney.com/app/) (accessed by Daniel Furman on Sep 4, 2023) <br>
113 | **Theme**: Futuristic and fantasy vehicle concept art  (cluster id: 52) <br>
114 | 
115 | 
116 | **Figure 5**. A sample of 3 text to image generations with various models for prompts from the [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) dataset (alongside their cluster id). 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/notebooks/stable-diffusion-prompts-clustering.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {
   6 |     "id": "LQ-HKp9u4kTR"
   7 |    },
   8 |    "source": [
   9 |     "# Text clustering: HDBSCAN is probably all you need\n",
  10 |     "\n",
  11 |     "<a target=\"_blank\" href=\"https://colab.research.google.com/github/daniel-furman/awesome-chatgpt-prompts-clustering/blob/main/notebooks/stable-diffusion-prompts-clustering.ipynb\">\n",
  12 |     "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n",
  13 |     "</a>"
  14 |    ]
  15 |   },
  16 |   {
  17 |    "cell_type": "markdown",
  18 |    "metadata": {
  19 |     "id": "IpEhs9ujlcQ1"
  20 |    },
  21 |    "source": [
  22 |     "## Sections\n",
  23 |     "\n",
  24 |     "1. Setup\n",
  25 |     "2. Data I/O\n",
  26 |     "3. Embed text\n",
  27 |     "4. Clustering\n",
  28 |     "5. Exemplar sub-clustering\n",
  29 |     "6. Knowledge graph theming\n",
  30 |     "7. Write final df results to disk\n",
  31 |     "8. Create a JSON knowledge graph viz"
  32 |    ]
  33 |   },
  34 |   {
  35 |    "cell_type": "markdown",
  36 |    "metadata": {
  37 |     "id": "ZMZjJxWGeSZP"
  38 |    },
  39 |    "source": [
  40 |     "## Setup"
  41 |    ]
  42 |   },
  43 |   {
  44 |    "cell_type": "code",
  45 |    "execution_count": null,
  46 |    "metadata": {
  47 |     "colab": {
  48 |      "base_uri": "https://localhost:8080/"
  49 |     },
  50 |     "id": "a6GR5Tfzx2z9",
  51 |     "outputId": "0ccecdb6-b57c-4c05-de12-196770056162"
  52 |    },
  53 |    "outputs": [],
  54 |    "source": [
  55 |     "# copying larger files to GDrive storage for this experiment\n",
  56 |     "\n",
  57 |     "from google.colab import drive\n",
  58 |     "\n",
  59 |     "drive.mount(\"/content/drive\")"
  60 |    ]
  61 |   },
  62 |   {
  63 |    "cell_type": "code",
  64 |    "execution_count": null,
  65 |    "metadata": {
  66 |     "id": "-mA_PgxV4KV2"
  67 |    },
  68 |    "outputs": [],
  69 |    "source": [
  70 |     "!git clone https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering.git"
  71 |    ]
  72 |   },
  73 |   {
  74 |    "cell_type": "code",
  75 |    "execution_count": null,
  76 |    "metadata": {
  77 |     "id": "mO55HvFB3egw"
  78 |    },
  79 |    "outputs": [],
  80 |    "source": [
  81 |     "# for local run, see below commands for setting up a new venv\n",
  82 |     "\n",
  83 |     "#!python -m venv .venv_clust_demo\n",
  84 |     "#!source .venv_clust_demo/bin/activate\n",
  85 |     "#!pip install --upgrade pip\n",
  86 |     "#!pip list"
  87 |    ]
  88 |   },
  89 |   {
  90 |    "cell_type": "code",
  91 |    "execution_count": null,
  92 |    "metadata": {
  93 |     "colab": {
  94 |      "base_uri": "https://localhost:8080/"
  95 |     },
  96 |     "id": "96JEWSNtljzH",
  97 |     "outputId": "04f47fc2-588e-4557-ac04-16989c152ee7"
  98 |    },
  99 |    "outputs": [],
 100 |    "source": [
 101 |     "import os\n",
 102 |     "\n",
 103 |     "os.chdir(\"/content/awesome-chatgpt-prompts-clustering\")\n",
 104 |     "!ls"
 105 |    ]
 106 |   },
 107 |   {
 108 |    "cell_type": "code",
 109 |    "execution_count": null,
 110 |    "metadata": {
 111 |     "id": "GeFaTjyW2Gk7"
 112 |    },
 113 |    "outputs": [],
 114 |    "source": [
 115 |     "!pip install -qUr requirements.txt"
 116 |    ]
 117 |   },
 118 |   {
 119 |    "cell_type": "code",
 120 |    "execution_count": null,
 121 |    "metadata": {
 122 |     "colab": {
 123 |      "base_uri": "https://localhost:8080/"
 124 |     },
 125 |     "id": "8nBmAAxvye3w",
 126 |     "outputId": "04ea9f9f-b092-40da-f5cb-39b468509561"
 127 |    },
 128 |    "outputs": [],
 129 |    "source": [
 130 |     "os.chdir(\"../..\")\n",
 131 |     "!ls"
 132 |    ]
 133 |   },
 134 |   {
 135 |    "cell_type": "code",
 136 |    "execution_count": null,
 137 |    "metadata": {
 138 |     "id": "lPBnmvEd3egy"
 139 |    },
 140 |    "outputs": [],
 141 |    "source": [
 142 |     "#!pip list"
 143 |    ]
 144 |   },
 145 |   {
 146 |    "cell_type": "code",
 147 |    "execution_count": null,
 148 |    "metadata": {
 149 |     "id": "NXc0G4wQ3egy"
 150 |    },
 151 |    "outputs": [],
 152 |    "source": [
 153 |     "import argparse\n",
 154 |     "import os\n",
 155 |     "from tqdm.notebook import tqdm\n",
 156 |     "import datetime\n",
 157 |     "import json\n",
 158 |     "import pickle\n",
 159 |     "import numpy as np\n",
 160 |     "from numpy import random\n",
 161 |     "import pandas as pd\n",
 162 |     "import seaborn as sns\n",
 163 |     "import matplotlib.pyplot as plt\n",
 164 |     "import plotly.graph_objects as go\n",
 165 |     "\n",
 166 |     "import umap\n",
 167 |     "from datasets import load_dataset\n",
 168 |     "from sentence_transformers import SentenceTransformer\n",
 169 |     "import torch\n",
 170 |     "import hdbscan\n",
 171 |     "from sklearn.metrics.pairwise import euclidean_distances\n",
 172 |     "import openai\n",
 173 |     "import tiktoken\n",
 174 |     "import cleantext\n",
 175 |     "\n",
 176 |     "from UliPlot.XLSX import auto_adjust_xlsx_column_width"
 177 |    ]
 178 |   },
 179 |   {
 180 |    "cell_type": "code",
 181 |    "execution_count": null,
 182 |    "metadata": {
 183 |     "colab": {
 184 |      "base_uri": "https://localhost:8080/"
 185 |     },
 186 |     "id": "TnRCOUWK9hkn",
 187 |     "outputId": "e347ce5a-844b-4dfb-99ff-7f25a071619e"
 188 |    },
 189 |    "outputs": [],
 190 |    "source": [
 191 |     "args = argparse.Namespace()\n",
 192 |     "args.inference = True\n",
 193 |     "args"
 194 |    ]
 195 |   },
 196 |   {
 197 |    "cell_type": "code",
 198 |    "execution_count": null,
 199 |    "metadata": {
 200 |     "colab": {
 201 |      "base_uri": "https://localhost:8080/"
 202 |     },
 203 |     "id": "m89BDk6WXSZM",
 204 |     "outputId": "4ea9d179-75aa-41cd-9a8c-e3022e72d0ff"
 205 |    },
 206 |    "outputs": [],
 207 |    "source": [
 208 |     "now = datetime.datetime.now()\n",
 209 |     "# timestamp format: dd_mm_YYYY_HH_MM_SS\n",
 210 |     "dt_string = now.strftime(\"%d_%m_%Y_%H_%M_%S\")\n",
 211 |     "\n",
 212 |     "# hardcode in an existing experiment datetime for inference runs\n",
 213 |     "\n",
 214 |     "if args.inference:\n",
 215 |     "    # dt_string identifiers from cached experiments:\n",
 216 |     "    dt_string = \"04_09_2023_03_02_25\"\n",
 217 |     "\n",
 218 |     "print(\"experiment's datetime identifier =\", dt_string)\n",
 219 |     "\n",
 220 |     "# create results folder if it doesn't exist\n",
 221 |     "if not os.path.isdir(\n",
 222 |     "    f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n",
 223 |     "):\n",
 224 |     "    os.mkdir(\n",
 225 |     "        f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n",
 226 |     "    )"
 227 |    ]
 228 |   },
 229 |   {
 230 |    "cell_type": "code",
 231 |    "execution_count": null,
 232 |    "metadata": {
 233 |     "colab": {
 234 |      "base_uri": "https://localhost:8080/"
 235 |     },
 236 |     "id": "3EzBV-Mm6C90",
 237 |     "outputId": "42ff75a2-11c5-474a-90bd-a0204885e6ba"
 238 |    },
 239 |    "outputs": [],
 240 |    "source": [
 241 |     "args.cache_folder = (\n",
 242 |     "    f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n",
 243 |     ")\n",
 244 |     "args"
 245 |    ]
 246 |   },
 247 |   {
 248 |    "cell_type": "markdown",
 249 |    "metadata": {
 250 |     "id": "ZRwI0I6IeVNr"
 251 |    },
 252 |    "source": [
 253 |     "## Data I/O"
 254 |    ]
 255 |   },
 256 |   {
 257 |    "cell_type": "code",
 258 |    "execution_count": null,
 259 |    "metadata": {
 260 |     "colab": {
 261 |      "base_uri": "https://localhost:8080/",
 262 |      "height": 424
 263 |     },
 264 |     "id": "_BNPvSprLRzM",
 265 |     "outputId": "b9559cd9-8111-42f6-edab-272b616ae73d"
 266 |    },
 267 |    "outputs": [],
 268 |    "source": [
 269 |     "ds_hf = load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")\n",
 270 |     "ds = ds_hf[\"train\"]\n",
 271 |     "\n",
 272 |     "ds = ds.to_pandas()\n",
 273 |     "ds[\"id\"] = ds.index\n",
 274 |     "ds = ds[[\"id\", \"Prompt\"]]\n",
 275 |     "ds"
 276 |    ]
 277 |   },
 278 |   {
 279 |    "cell_type": "markdown",
 280 |    "metadata": {
 281 |     "id": "h8uYPW_TeYOa"
 282 |    },
 283 |    "source": [
 284 |     "## Embed Text\n",
 285 |     "\n",
 286 |     "* See [pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information"
 287 |    ]
 288 |   },
 289 |   {
 290 |    "cell_type": "code",
 291 |    "execution_count": null,
 292 |    "metadata": {
 293 |     "id": "pLLXTuZd6QyV"
 294 |    },
 295 |    "outputs": [],
 296 |    "source": [
 297 |     "model = SentenceTransformer(\"all-mpnet-base-v2\")"
 298 |    ]
 299 |   },
 300 |   {
 301 |    "cell_type": "code",
 302 |    "execution_count": null,
 303 |    "metadata": {
 304 |     "id": "UIen4vsj3egz"
 305 |    },
 306 |    "outputs": [],
 307 |    "source": [
 308 |     "if not args.inference:\n",
 309 |     "    embeddings = torch.zeros([len(ds), 768])\n",
 310 |     "    for i in tqdm(range(len(ds))):\n",
 311 |     "        emb = model.encode(ds.loc[i, \"Prompt\"], convert_to_tensor=True)\n",
 312 |     "        embeddings[i, :] = emb\n",
 313 |     "    embeddings"
 314 |    ]
 315 |   },
 316 |   {
 317 |    "cell_type": "code",
 318 |    "execution_count": null,
 319 |    "metadata": {
 320 |     "colab": {
 321 |      "base_uri": "https://localhost:8080/"
 322 |     },
 323 |     "id": "5h2kpB6C2I_N",
 324 |     "outputId": "88bfa974-8686-49d7-9f93-b131b89ba9a3"
 325 |    },
 326 |    "outputs": [],
 327 |    "source": [
 328 |     "f_name = os.path.join(\n",
 329 |     "    args.cache_folder, \"stable_diffusion_prompts_embeddings_all_mpnet_base_v2.pt\"\n",
 330 |     ")\n",
 331 |     "print(f_name, \"\\n\")\n",
 332 |     "\n",
 333 |     "if not args.inference:\n",
 334 |     "    torch.save(embeddings, f_name)\n",
 335 |     "loaded_embeddings = torch.load(f_name)\n",
 336 |     "loaded_embeddings"
 337 |    ]
 338 |   },
 339 |   {
 340 |    "cell_type": "code",
 341 |    "execution_count": null,
 342 |    "metadata": {
 343 |     "colab": {
 344 |      "base_uri": "https://localhost:8080/"
 345 |     },
 346 |     "id": "fsWOJjwdzgeG",
 347 |     "outputId": "b91af737-4ec1-43af-a45d-705a6a893fdd"
 348 |    },
 349 |    "outputs": [],
 350 |    "source": [
 351 |     "loaded_embeddings.shape"
 352 |    ]
 353 |   },
 354 |   {
 355 |    "cell_type": "code",
 356 |    "execution_count": null,
 357 |    "metadata": {
 358 |     "id": "DWdeAnld5xhH"
 359 |    },
 360 |    "outputs": [],
 361 |    "source": [
 362 |     "if not args.inference:\n",
 363 |     "    torch.equal(loaded_embeddings.cpu(), embeddings.cpu())"
 364 |    ]
 365 |   },
 366 |   {
 367 |    "cell_type": "code",
 368 |    "execution_count": null,
 369 |    "metadata": {
 370 |     "colab": {
 371 |      "base_uri": "https://localhost:8080/"
 372 |     },
 373 |     "id": "j6f0cJmH3egz",
 374 |     "outputId": "e4b5aee8-31a9-4348-f8da-4e2e1b07da14"
 375 |    },
 376 |    "outputs": [],
 377 |    "source": [
 378 |     "test_itr = random.randint(low=0, high=len(ds))\n",
 379 |     "test_itr"
 380 |    ]
 381 |   },
 382 |   {
 383 |    "cell_type": "code",
 384 |    "execution_count": null,
 385 |    "metadata": {
 386 |     "colab": {
 387 |      "base_uri": "https://localhost:8080/",
 388 |      "height": 89
 389 |     },
 390 |     "id": "2WoZs9mF3egz",
 391 |     "outputId": "708a1772-9947-4949-f4ad-89bf734aa6b3"
 392 |    },
 393 |    "outputs": [],
 394 |    "source": [
 395 |     "# test embeddings worked\n",
 396 |     "ds.loc[test_itr, \"Prompt\"]"
 397 |    ]
 398 |   },
 399 |   {
 400 |    "cell_type": "code",
 401 |    "execution_count": null,
 402 |    "metadata": {
 403 |     "colab": {
 404 |      "base_uri": "https://localhost:8080/"
 405 |     },
 406 |     "id": "78eiylg53egz",
 407 |     "outputId": "0a1eee8c-ed7a-4feb-a406-8e001d2fe657"
 408 |    },
 409 |    "outputs": [],
 410 |    "source": [
 411 |     "test_emb = model.encode(ds.loc[test_itr, \"Prompt\"], convert_to_tensor=True)\n",
 412 |     "a = np.array(test_emb.cpu())\n",
 413 |     "b = np.array(loaded_embeddings[test_itr, :].cpu())\n",
 414 |     "np.allclose(a, b, rtol=1e-02)"
 415 |    ]
 416 |   },
 417 |   {
 418 |    "cell_type": "markdown",
 419 |    "metadata": {
 420 |     "id": "THg1GieGesDQ"
 421 |    },
 422 |    "source": [
 423 |     "## Clustering\n",
 424 |     "\n",
 425 |     "* See [how hdbscan works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for supporting information"
 426 |    ]
 427 |   },
 428 |   {
 429 |    "cell_type": "code",
 430 |    "execution_count": null,
 431 |    "metadata": {
 432 |     "colab": {
 433 |      "base_uri": "https://localhost:8080/"
 434 |     },
 435 |     "id": "m2A8EHRg7dzr",
 436 |     "outputId": "e6f9135c-02c2-4e47-ca4a-75d56369ffd8"
 437 |    },
 438 |    "outputs": [],
 439 |    "source": [
 440 |     "# pipeline: (1) UMAP 768 -> 15 dims, (2) HDBSCAN clustering, (3) UMAP 15 -> 2 dims for plotting\n",
 441 |     "\n",
 442 |     "# first, perform dimensionality reduction from 768 to 15\n",
 443 |     "f_name = os.path.join(args.cache_folder, \"reducer_umap_15.pkl\")\n",
 444 |     "print(f_name, \"\\n\")\n",
 445 |     "\n",
 446 |     "if not args.inference:\n",
 447 |     "    reducer_15 = umap.UMAP(n_components=15)\n",
 448 |     "    reducer_15.fit(loaded_embeddings)\n",
 449 |     "    embeddings_umap_dim_15 = reducer_15.transform(loaded_embeddings)\n",
 450 |     "    # Verify that the result of calling transform is\n",
 451 |     "    # identical to accessing the embedding_ attribute\n",
 452 |     "    assert np.all(embeddings_umap_dim_15 == reducer_15.embedding_)\n",
 453 |     "\n",
 454 |     "    # cache fitted umap object\n",
 455 |     "    pickle.dump(reducer_15, open(f_name, \"wb\"))\n",
 456 |     "\n",
 457 |     "loaded_reducer_15 = pickle.load((open(f_name, \"rb\")))\n",
 458 |     "\n",
 459 |     "embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)\n",
 460 |     "# Verify that the result of calling transform is\n",
 461 |     "# identical to accessing the embedding_ attribute\n",
 462 |     "assert np.all(embeddings_umap_dim_15 == loaded_reducer_15.embedding_)\n",
 463 |     "\n",
 464 |     "print(embeddings_umap_dim_15.shape)"
 465 |    ]
 466 |   },
 467 |   {
 468 |    "cell_type": "code",
 469 |    "execution_count": null,
 470 |    "metadata": {
 471 |     "colab": {
 472 |      "base_uri": "https://localhost:8080/"
 473 |     },
 474 |     "id": "0TGPS4pSySbc",
 475 |     "outputId": "8f0486e0-e09c-4947-b5ac-f0a556e76860"
 476 |    },
 477 |    "outputs": [],
 478 |    "source": [
 479 |     "args.inference = False\n",
 480 |     "args"
 481 |    ]
 482 |   },
 483 |   {
 484 |    "cell_type": "code",
 485 |    "execution_count": null,
 486 |    "metadata": {
 487 |     "colab": {
 488 |      "base_uri": "https://localhost:8080/"
 489 |     },
 490 |     "id": "ZZxn2DlOyOsr",
 491 |     "outputId": "758758e8-ac32-486f-db8d-b267f4ec8bf4"
 492 |    },
 493 |    "outputs": [],
 494 |    "source": [
 495 |     "f_name = os.path.join(args.cache_folder, \"clusterer_hdbscan.pkl\")\n",
 496 |     "print(f_name, \"\\n\")\n",
 497 |     "\n",
 498 |     "if not args.inference:\n",
 499 |     "    clusterer = hdbscan.HDBSCAN(\n",
 500 |     "        min_cluster_size=110, gen_min_span_tree=True, prediction_data=True\n",
 501 |     "    )\n",
 502 |     "    clusterer.fit(embeddings_umap_dim_15)\n",
 503 |     "    pickle.dump(clusterer, open(f_name, \"wb\"))\n",
 504 |     "\n",
 505 |     "loaded_clusterer = pickle.load((open(f_name, \"rb\")))\n",
 506 |     "\n",
 507 |     "if not args.inference:\n",
 508 |     "    print(\n",
 509 |     "        pd.DataFrame.equals(\n",
 510 |     "            pd.Series(clusterer.labels_).value_counts(),\n",
 511 |     "            pd.Series(loaded_clusterer.labels_).value_counts(),\n",
 512 |     "        )\n",
 513 |     "    )\n",
 514 |     "    print(\n",
 515 |     "        pd.DataFrame.equals(\n",
 516 |     "            pd.Series(clusterer.probabilities_).value_counts(),\n",
 517 |     "            pd.Series(loaded_clusterer.probabilities_).value_counts(),\n",
 518 |     "        )\n",
 519 |     "    )\n",
 520 |     "\n",
 521 |     "num_ouliers = pd.Series(loaded_clusterer.labels_).value_counts().loc[-1]\n",
 522 |     "\n",
 523 |     "print(pd.Series(loaded_clusterer.labels_).value_counts())\n",
 524 |     "print(f\"\\nCluster outliers : {num_ouliers}\\n\")"
 525 |    ]
 526 |   },
 527 |   {
 528 |    "cell_type": "code",
 529 |    "execution_count": null,
 530 |    "metadata": {
 531 |     "colab": {
 532 |      "base_uri": "https://localhost:8080/"
 533 |     },
 534 |     "id": "eqIqeraG-jTd",
 535 |     "outputId": "066f5e27-3bc6-4c4d-d388-2bb67ab9d23e"
 536 |    },
 537 |    "outputs": [],
 538 |    "source": [
 539 |     "# sum of the next 25 cluster counts after skipping the largest group (presumably the -1 outliers; verify)\n",
 540 |     "\n",
 541 |     "pd.Series(loaded_clusterer.labels_).value_counts()[1:26].sum()"
 542 |    ]
 543 |   },
 544 |   {
 545 |    "cell_type": "code",
 546 |    "execution_count": null,
 547 |    "metadata": {
 548 |     "colab": {
 549 |      "base_uri": "https://localhost:8080/",
 550 |      "height": 424
 551 |     },
 552 |     "id": "zB9nU0ka_JpA",
 553 |     "outputId": "fb1f1bf9-581a-4db4-f9b5-fcd9117c2548"
 554 |    },
 555 |    "outputs": [],
 556 |    "source": [
 557 |     "ds[\"cluster\"] = loaded_clusterer.labels_\n",
 558 |     "ds[\"cluster membership prob\"] = loaded_clusterer.probabilities_\n",
 559 |     "ds"
 560 |    ]
 561 |   },
 562 |   {
 563 |    "cell_type": "code",
 564 |    "execution_count": null,
 565 |    "metadata": {
 566 |     "colab": {
 567 |      "base_uri": "https://localhost:8080/",
 568 |      "height": 438
 569 |     },
 570 |     "id": "pDW5AVDn891V",
 571 |     "outputId": "983b99a0-0c2a-4aa2-ea63-53b97e5b5f90"
 572 |    },
 573 |    "outputs": [],
 574 |    "source": [
 575 |     "loaded_clusterer.condensed_tree_.plot()"
 576 |    ]
 577 |   },
 578 |   {
 579 |    "cell_type": "code",
 580 |    "execution_count": null,
 581 |    "metadata": {
 582 |     "colab": {
 583 |      "base_uri": "https://localhost:8080/",
 584 |      "height": 438
 585 |     },
 586 |     "id": "N-aJm6Sz9A2h",
 587 |     "outputId": "641b1b8d-e748-44eb-be4c-ae7799b0646a"
 588 |    },
 589 |    "outputs": [],
 590 |    "source": [
 591 |     "loaded_clusterer.condensed_tree_.plot(\n",
 592 |     "    select_clusters=True, selection_palette=sns.color_palette()\n",
 593 |     ")"
 594 |    ]
 595 |   },
 596 |   {
 597 |    "cell_type": "code",
 598 |    "execution_count": null,
 599 |    "metadata": {
 600 |     "id": "9wphVnvqytn5"
 601 |    },
 602 |    "outputs": [],
 603 |    "source": [
 604 |     "args.inference = True"
 605 |    ]
 606 |   },
 607 |   {
 608 |    "cell_type": "code",
 609 |    "execution_count": null,
 610 |    "metadata": {
 611 |     "colab": {
 612 |      "base_uri": "https://localhost:8080/"
 613 |     },
 614 |     "id": "Bug6Ab-n8DqQ",
 615 |     "outputId": "18a5ca83-d5cd-43e0-aef2-ed0edabb336b"
 616 |    },
 617 |    "outputs": [],
 618 |    "source": [
 619 |     "# third, perform dimensionality reduction from 15 to 2\n",
 620 |     "\n",
 621 |     "f_name = os.path.join(args.cache_folder, \"reducer_umap_2.pkl\")\n",
 622 |     "print(f_name, \"\\n\")\n",
 623 |     "\n",
 624 |     "if not args.inference:\n",
 625 |     "    reducer_2 = umap.UMAP(n_components=2)\n",
 626 |     "    reducer_2.fit(embeddings_umap_dim_15)\n",
 627 |     "    embeddings_umap_dim_2 = reducer_2.transform(embeddings_umap_dim_15)\n",
 628 |     "\n",
 629 |     "    # Verify that the result of calling transform is\n",
 630 |     "    # identical to accessing the embedding_ attribute\n",
 631 |     "    assert np.all(embeddings_umap_dim_2 == reducer_2.embedding_)\n",
 632 |     "\n",
 633 |     "    # cache fitted umap object\n",
 634 |     "    pickle.dump(reducer_2, open(f_name, \"wb\"))\n",
 635 |     "\n",
 636 |     "loaded_reducer_2 = pickle.load((open(f_name, \"rb\")))\n",
 637 |     "\n",
 638 |     "embeddings_umap_dim_2 = loaded_reducer_2.transform(embeddings_umap_dim_15)\n",
 639 |     "# Verify that the result of calling transform is\n",
 640 |     "# identical to accessing the embedding_ attribute\n",
 641 |     "assert np.all(embeddings_umap_dim_2 == loaded_reducer_2.embedding_)\n",
 642 |     "embeddings_umap_dim_2.shape"
 643 |    ]
 644 |   },
 645 |   {
 646 |    "cell_type": "code",
 647 |    "execution_count": null,
 648 |    "metadata": {
 649 |     "id": "PjOkZzekl1vx"
 650 |    },
 651 |    "outputs": [],
 652 |    "source": [
 653 |     "ds[\"x\"] = embeddings_umap_dim_2[:, 0]\n",
 654 |     "ds[\"y\"] = embeddings_umap_dim_2[:, 1]"
 655 |    ]
 656 |   },
 657 |   {
 658 |    "cell_type": "code",
 659 |    "execution_count": null,
 660 |    "metadata": {
 661 |     "colab": {
 662 |      "base_uri": "https://localhost:8080/",
 663 |      "height": 450
 664 |     },
 665 |     "id": "lcOHNlilQAeU",
 666 |     "outputId": "8593037f-af04-49dd-815d-7e4b871bc0d2"
 667 |    },
 668 |    "outputs": [],
 669 |    "source": [
 670 |     "# Visualize clusters\n",
 671 |     "fig, ax = plt.subplots(figsize=(20, 10))\n",
 672 |     "outliers = ds[ds[\"cluster\"] == -1]\n",
 673 |     "clustered = ds[ds[\"cluster\"] != -1]\n",
 674 |     "plt.scatter(outliers.x, outliers.y, color=\"#BDBDBD\", s=10, alpha=0.1)\n",
 675 |     "plt.scatter(\n",
 676 |     "    clustered.x, clustered.y, c=clustered.cluster, s=10, alpha=0.35, cmap=\"viridis\"\n",
 677 |     ")"
 678 |    ]
 679 |   },
 680 |   {
 681 |    "cell_type": "code",
 682 |    "execution_count": null,
 683 |    "metadata": {
 684 |     "colab": {
 685 |      "base_uri": "https://localhost:8080/",
 686 |      "height": 837
 687 |     },
 688 |     "id": "jdCSkHYZ9VTK",
 689 |     "outputId": "2ab60bbd-afb1-47f6-edac-cb077616d153"
 690 |    },
 691 |    "outputs": [],
 692 |    "source": [
 693 |     "fig = go.Figure()\n",
 694 |     "fig.add_trace(\n",
 695 |     "    go.Scatter(\n",
 696 |     "        x=ds[\"x\"][ds[\"cluster\"] != -1],\n",
 697 |     "        y=ds[\"y\"][ds[\"cluster\"] != -1],\n",
 698 |     "        mode=\"markers\",\n",
 699 |     "        marker_color=ds[\"cluster\"][ds[\"cluster\"] != -1],\n",
 700 |     "        marker_colorscale=\"Viridis\",\n",
 701 |     "        text=ds[\"cluster\"][ds[\"cluster\"] != -1],\n",
 702 |     "    )\n",
 703 |     ")\n",
 704 |     "\n",
 705 |     "fig.update_traces(marker={\"size\": 5, \"opacity\": 0.45}, showlegend=False)\n",
 706 |     "fig.update_coloraxes(showscale=False)\n",
 707 |     "fig.update_layout(width=550 * 2, height=400 * 2)\n",
 708 |     "fig.show()"
 709 |    ]
 710 |   },
 711 |   {
 712 |    "cell_type": "markdown",
 713 |    "metadata": {
 714 |     "id": "fZl5qjp7r6QJ"
 715 |    },
 716 |    "source": [
 717 |     "## Exemplar Sub-Clustering\n",
 718 |     "\n",
 719 |     "* See [soft clustering explanation](https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html) for supporting information"
 720 |    ]
 721 |   },
 722 |   {
 723 |    "cell_type": "code",
 724 |    "execution_count": null,
 725 |    "metadata": {
 726 |     "id": "87hBN35f-J0-"
 727 |    },
 728 |    "outputs": [],
 729 |    "source": [
 730 |     "# function copied from:\n",
 731 |     "# https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html#distance-based-membership\n",
 732 |     "\n",
 733 |     "\n",
 734 |     "def exemplars(cluster_id, condensed_tree):\n",
 735 |     "    raw_tree = condensed_tree._raw_tree\n",
 736 |     "    # Just the cluster elements of the tree, excluding singleton points\n",
 737 |     "    cluster_tree = raw_tree[raw_tree[\"child_size\"] > 1]\n",
 738 |     "    # Get the leaf cluster nodes under the cluster we are considering\n",
 739 |     "    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)\n",
 740 |     "    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)\n",
 741 |     "    result = np.array([])\n",
 742 |     "    for leaf in leaves:\n",
 743 |     "        max_lambda = raw_tree[\"lambda_val\"][raw_tree[\"parent\"] == leaf].max()\n",
 744 |     "        points = raw_tree[\"child\"][\n",
 745 |     "            (raw_tree[\"parent\"] == leaf) & (raw_tree[\"lambda_val\"] == max_lambda)\n",
 746 |     "        ]\n",
 747 |     "        result = np.hstack((result, points))\n",
 748 |     "    return result.astype(int)"
 749 |    ]
 750 |   },
 751 |   {
 752 |    "cell_type": "code",
 753 |    "execution_count": null,
 754 |    "metadata": {
 755 |     "colab": {
 756 |      "base_uri": "https://localhost:8080/"
 757 |     },
 758 |     "id": "hu4rVHAHAoms",
 759 |     "outputId": "888845e2-4a84-4cb0-d72d-56867bdf01b8"
 760 |    },
 761 |    "outputs": [],
 762 |    "source": [
 763 |     "tree = loaded_clusterer.condensed_tree_\n",
 764 |     "\n",
 765 |     "exemplar_ids = []\n",
 766 |     "for i, c in enumerate(tree._select_clusters()):\n",
 767 |     "    c_exemplars = exemplars(c, tree)\n",
 768 |     "    print(f\"Cluster {i} has {len(c_exemplars)} exemplars\")\n",
 769 |     "    exemplar_ids.extend(c_exemplars)"
 770 |    ]
 771 |   },
 772 |   {
 773 |    "cell_type": "code",
 774 |    "execution_count": null,
 775 |    "metadata": {
 776 |     "id": "3X3Wc86tnVRY"
 777 |    },
 778 |    "outputs": [],
 779 |    "source": [
 780 |     "ds[\"exemplars yes/no\"] = np.zeros(len(ds))\n",
 781 |     "ds.loc[exemplar_ids, \"exemplars yes/no\"] = 1\n",
 782 |     "\n",
 783 |     "assert len(ds[ds[\"exemplars yes/no\"] == 1]) == len(exemplar_ids)"
 784 |    ]
 785 |   },
 786 |   {
 787 |    "cell_type": "code",
 788 |    "execution_count": null,
 789 |    "metadata": {
 790 |     "colab": {
 791 |      "base_uri": "https://localhost:8080/",
 792 |      "height": 873
 793 |     },
 794 |     "id": "uyJW97LhrRp0",
 795 |     "outputId": "5176d228-8539-40f4-d8b5-f0acf58b8ffa"
 796 |    },
 797 |    "outputs": [],
 798 |    "source": [
 799 |     "print(\"\\n\")\n",
 800 |     "custom_scale = [\n",
 801 |     "    \"#949494\",  # Gray\n",
 802 |     "    \"#F65314\",  # Google Red\n",
 803 |     "    \"#4285F4\",  # Google Blue\n",
 804 |     "]\n",
 805 |     "gray_pts = ds[(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)]\n",
 806 |     "exemplar_pts = ds[ds[\"exemplars yes/no\"] == 1]\n",
 807 |     "fig = go.Figure()\n",
 808 |     "# layer 1: clustered non-exemplar points, drawn in gray\n",
 809 |     "fig.add_trace(\n",
 810 |     "    go.Scatter(\n",
 811 |     "        x=gray_pts[\"x\"],\n",
 812 |     "        y=gray_pts[\"y\"],\n",
 813 |     "        mode=\"markers\",\n",
 814 |     "        marker_color=custom_scale[0],\n",
 815 |     "        text=gray_pts[\"cluster\"],\n",
 816 |     "    )\n",
 817 |     ")\n",
 818 |     "# layer 2: exemplar points, coloured by their cluster label\n",
 819 |     "fig.add_trace(\n",
 820 |     "    go.Scatter(\n",
 821 |     "        x=exemplar_pts[\"x\"],\n",
 822 |     "        y=exemplar_pts[\"y\"],\n",
 823 |     "        mode=\"markers\",\n",
 824 |     "        marker_color=exemplar_pts[\"cluster\"],\n",
 825 |     "        marker_colorscale=\"Viridis\",\n",
 826 |     "        text=exemplar_pts[\"cluster\"],\n",
 827 |     "    )\n",
 828 |     ")\n",
 829 |     "fig.update_traces(marker={\"size\": 5, \"opacity\": 0.45}, showlegend=False)\n",
 830 |     "fig.update_coloraxes(showscale=False)\n",
 831 |     "fig.update_layout(width=550 * 2, height=400 * 2)\n",
 832 |     "fig.show()"
 833 |    ]
 834 |   },
 835 |   {
 836 |    "cell_type": "code",
 837 |    "execution_count": null,
 838 |    "metadata": {
 839 |     "colab": {
 840 |      "base_uri": "https://localhost:8080/"
 841 |     },
 842 |     "id": "y5bO5SZ4hjAT",
 843 |     "outputId": "82860182-688b-400c-bb60-162964b8bf23"
 844 |    },
 845 |    "outputs": [],
 846 |    "source": [
 847 |     "len(ds.loc[exemplar_ids])"
 848 |    ]
 849 |   },
 850 |   {
 851 |    "cell_type": "code",
 852 |    "execution_count": null,
 853 |    "metadata": {
 854 |     "colab": {
 855 |      "base_uri": "https://localhost:8080/"
 856 |     },
 857 |     "id": "OW44QJyYhibv",
 858 |     "outputId": "e24e08df-3557-406c-a056-4821b5661dd8"
 859 |    },
 860 |    "outputs": [],
 861 |    "source": [
 862 |     "embeddings_umap_dim_15[exemplar_ids].shape"
 863 |    ]
 864 |   },
 865 |   {
 866 |    "cell_type": "code",
 867 |    "execution_count": null,
 868 |    "metadata": {
 869 |     "colab": {
 870 |      "base_uri": "https://localhost:8080/"
 871 |     },
 872 |     "id": "J17BEw2vgvwA",
 873 |     "outputId": "0336d60f-c0d1-49aa-bd53-bc1da49f71f8"
 874 |    },
 875 |    "outputs": [],
 876 |    "source": [
 877 |     "# fourth, perform exemplar sub-clustering\n",
 878 |     "\n",
 879 |     "f_name = os.path.join(args.cache_folder, \"clusterer_subs_hdbscan.pkl\")\n",
 880 |     "print(f_name, \"\\n\")\n",
 881 |     "\n",
 882 |     "# training mode: fit a new sub-clusterer on the exemplar embeddings and cache it\n",
 883 |     "if not args.inference:\n",
 884 |     "    sub_clusterer = hdbscan.HDBSCAN(\n",
 885 |     "        min_cluster_size=4, gen_min_span_tree=True, prediction_data=True\n",
 886 |     "    )\n",
 887 |     "    sub_clusterer.fit(embeddings_umap_dim_15[exemplar_ids])\n",
 888 |     "    # close the handle deterministically so the cache is fully written before re-loading\n",
 889 |     "    with open(f_name, \"wb\") as f_out:\n",
 890 |     "        pickle.dump(sub_clusterer, f_out)\n",
 891 |     "\n",
 892 |     "# both modes continue with the pickled copy\n",
 893 |     "# NOTE(security): unpickling can run arbitrary code; only load caches you produced\n",
 894 |     "with open(f_name, \"rb\") as f_in:\n",
 895 |     "    loaded_sub_clusterer = pickle.load(f_in)\n",
 896 |     "\n",
 897 |     "if not args.inference:\n",
 898 |     "    # round-trip sanity check: the re-loaded model should reproduce the fitted\n",
 899 |     "    # labels and membership probabilities (compared through their value counts)\n",
 900 |     "    for attr in (\"labels_\", \"probabilities_\"):\n",
 901 |     "        fitted_counts = pd.Series(getattr(sub_clusterer, attr)).value_counts()\n",
 902 |     "        loaded_counts = pd.Series(getattr(loaded_sub_clusterer, attr)).value_counts()\n",
 903 |     "        print(fitted_counts.equals(loaded_counts))\n",
 904 |     "\n",
 905 |     "print(\"\\nCluster value counts:\\n\")\n",
 906 |     "pd.Series(loaded_sub_clusterer.labels_).value_counts()"
 907 |    ]
 908 |   },
 909 |   {
 910 |    "cell_type": "code",
 911 |    "execution_count": null,
 912 |    "metadata": {
 913 |     "colab": {
 914 |      "base_uri": "https://localhost:8080/"
 915 |     },
 916 |     "id": "O9033VfDirUu",
 917 |     "outputId": "97d98be7-f645-4c81-8df4-d20d6a04e1c7"
 918 |    },
 919 |    "outputs": [],
 920 |    "source": [
 921 |     "loaded_sub_clusterer.labels_"
 922 |    ]
 923 |   },
 924 |   {
 925 |    "cell_type": "code",
 926 |    "execution_count": null,
 927 |    "metadata": {
 928 |     "colab": {
 929 |      "base_uri": "https://localhost:8080/"
 930 |     },
 931 |     "id": "buRR8jvui1ul",
 932 |     "outputId": "3b916ffb-a440-4c07-b452-06f993292e6b"
 933 |    },
 934 |    "outputs": [],
 935 |    "source": [
 936 |     "ds[\"exemplar sub-cluster\"] = np.repeat(np.nan, len(ds))\n",
 937 |     "# object dtype up front: this column receives strings, and writing strings into\n",
 938 |     "# a float (NaN) column relies on silent upcasting that pandas has deprecated\n",
 939 |     "ds[\"cluster XX.YY\"] = pd.Series(np.nan, index=ds.index, dtype=object)\n",
 940 |     "# slice the exemplar rows once; re-slicing inside the loop made it quadratic\n",
 941 |     "exemplar_rows = ds.loc[exemplar_ids]\n",
 942 |     "for i in range(len(exemplar_rows)):\n",
 943 |     "    row = exemplar_rows.iloc[i]\n",
 944 |     "    sub_label = loaded_sub_clusterer.labels_[i]\n",
 945 |     "    ds.loc[row.id, \"exemplar sub-cluster\"] = sub_label\n",
 946 |     "    ds.loc[row.id, \"cluster XX.YY\"] = (\n",
 947 |     "        \"Cluster \"\n",
 948 |     "        + str(row.cluster)\n",
 949 |     "        + \", Sub-Cluster \"\n",
 950 |     "        + str(int(sub_label))\n",
 951 |     "    )\n",
 952 |     "\n",
 953 |     "# ds.loc[exemplar_ids]"
 954 |    ]
 955 |   },
 956 |   {
 957 |    "cell_type": "code",
 958 |    "execution_count": null,
 959 |    "metadata": {
 960 |     "colab": {
 961 |      "base_uri": "https://localhost:8080/"
 962 |     },
 963 |     "id": "lJdOWEjSr0NR",
 964 |     "outputId": "baaa50b4-475e-4f50-a406-700b59c1d0f3"
 965 |    },
 966 |    "outputs": [],
 967 |    "source": [
 968 |     "# exemplar rows whose sub-cluster label is not -1\n",
 969 |     "ds_inner_exemplars = ds[\n",
 970 |     "    (ds[\"exemplars yes/no\"] == 1) & (ds[\"exemplar sub-cluster\"] != -1)\n",
 971 |     "]\n",
 972 |     "len(ds_inner_exemplars)"
 973 |    ]
 974 |   },
 975 |   {
 976 |    "cell_type": "code",
 977 |    "execution_count": null,
 978 |    "metadata": {
 979 |     "colab": {
 980 |      "base_uri": "https://localhost:8080/",
 981 |      "height": 673
 982 |     },
 983 |     "id": "NiW84puFkJtJ",
 984 |     "outputId": "854e83a8-fbea-4eed-bde4-8868d070d77b"
 985 |    },
 986 |    "outputs": [],
 987 |    "source": [
 988 |     "print(\"\\n\")\n",
 989 |     "\n",
 990 |     "fig = go.Figure()\n",
 991 |     "\n",
 992 |     "fig.add_trace(\n",
 993 |     "    go.Scatter(\n",
 994 |     "        x=ds[\"x\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
 995 |     "        y=ds[\"y\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
 996 |     "        mode=\"markers\",\n",
 997 |     "        marker_color=custom_scale[0],\n",
 998 |     "        text=ds[\"cluster XX.YY\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n",
 999 |     "    )\n",
1000 |     ")\n",
1001 |     "\n",
1002 |     "fig.add_trace(\n",
1003 |     "    go.Scatter(\n",
1004 |     "        x=ds_inner_exemplars[\"x\"],\n",
1005 |     "        y=ds_inner_exemplars[\"y\"],\n",
1006 |     "        mode=\"markers\",\n",
1007 |     "        marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n",
1008 |     "        marker_colorscale=\"Viridis\",\n",
1009 |     "        text=ds_inner_exemplars[\"cluster XX.YY\"],\n",
1010 |     "    )\n",
1011 |     ")\n",
1012 |     "\n",
1013 |     "fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n",
1014 |     "fig.update_coloraxes(showscale=False)\n",
1015 |     "fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n",
1016 |     "fig.show()"
1017 |    ]
1018 |   },
1019 |   {
1020 |    "cell_type": "code",
1021 |    "execution_count": null,
1022 |    "metadata": {
1023 |     "colab": {
1024 |      "base_uri": "https://localhost:8080/"
1025 |     },
1026 |     "id": "1JJB33r5qCSq",
1027 |     "outputId": "9a7e4eae-7f27-4d36-94c0-37aa8712c3d2"
1028 |    },
1029 |    "outputs": [],
1030 |    "source": [
1031 |     "# short label per prompt (first 12 items from clean_words), built in one pass\n",
1032 |     "ds[\"Prompt head\"] = [\n",
1033 |     "    \" \".join(\n",
1034 |     "        cleantext.clean_words(\n",
1035 |     "            prompt,\n",
1036 |     "            clean_all=False,  # do not run every cleaning operation\n",
1037 |     "            extra_spaces=True,  # remove extra white spaces\n",
1038 |     "            stemming=False, stopwords=False, lowercase=False,  # keep words as written\n",
1039 |     "            numbers=False, punct=False,  # keep digits and punctuation\n",
1040 |     "            stp_lang=\"english\",  # language for stop words\n",
1041 |     "        )[0:12]\n",
1042 |     "    )\n",
1043 |     "    for prompt in ds[\"Prompt\"]\n",
1044 |     "]"
1045 |    ]
1046 |   },
1047 |   {
1048 |    "cell_type": "code",
1049 |    "execution_count": null,
1050 |    "metadata": {
1051 |     "colab": {
1052 |      "base_uri": "https://localhost:8080/",
1053 |      "height": 1000
1054 |     },
1055 |     "id": "iEG1AZTurPkN",
1056 |     "outputId": "a24923f0-225f-41fd-f9b8-14d09afce366"
1057 |    },
1058 |    "outputs": [],
1059 |    "source": [
1060 |     "ds"
1061 |    ]
1062 |   },
1063 |   {
1064 |    "cell_type": "code",
1065 |    "execution_count": null,
1066 |    "metadata": {
1067 |     "colab": {
1068 |      "base_uri": "https://localhost:8080/",
1069 |      "height": 1000
1070 |     },
1071 |     "id": "UdQ3Rf3YpBSE",
1072 |     "outputId": "fa65296d-ac17-466c-8903-4c6cec8a2e7c"
1073 |    },
1074 |    "outputs": [],
1075 |    "source": [
1076 |     "ds[\"cluster + Prompt\"] = (\n",
1077 |     "    \"Cluster: \"\n",
1078 |     "    + ds[\"cluster\"].astype(str)\n",
1079 |     "    + \", Prompt id \"\n",
1080 |     "    + ds[\"id\"].astype(str)\n",
1081 |     "    + \": \"\n",
1082 |     "    + '\"'\n",
1083 |     "    + ds[\"Prompt head\"]\n",
1084 |     "    + '\"'\n",
1085 |     ")\n",
1086 |     "ds"
1087 |    ]
1088 |   },
1089 |   {
1090 |    "cell_type": "code",
1091 |    "execution_count": null,
1092 |    "metadata": {
1093 |     "colab": {
1094 |      "base_uri": "https://localhost:8080/",
1095 |      "height": 1000
1096 |     },
1097 |     "id": "w6l_0xa2luhO",
1098 |     "outputId": "f24a901f-356c-45eb-ad69-b6b44d2fc6ee"
1099 |    },
1100 |    "outputs": [],
1101 |    "source": [
1102 |     "# visualize top 25 clusters by count; drop the noise label (-1) explicitly\n",
1103 |     "_label_counts = pd.Series(loaded_clusterer.labels_).value_counts()\n",
1104 |     "clust_to_zoom_list = _label_counts[_label_counts.index != -1].index[:25]\n",
1105 |     "\n",
1106 |     "for clust_to_zoom in clust_to_zoom_list:\n",
1107 |     "    print(f\"Cluster {clust_to_zoom}:\")\n",
1108 |     "    # row selections for this cluster, computed once per iteration\n",
1109 |     "    in_cluster = ds[\"cluster\"] == clust_to_zoom\n",
1110 |     "    gray_rows = ds[(ds[\"exemplars yes/no\"] == 0) & in_cluster]\n",
1111 |     "    ds_inner_exemplars = ds[\n",
1112 |     "        (ds[\"exemplars yes/no\"] == 1)\n",
1113 |     "        & in_cluster\n",
1114 |     "        & (ds[\"exemplar sub-cluster\"] != -1)\n",
1115 |     "    ]\n",
1116 |     "\n",
1117 |     "    fig = go.Figure()\n",
1118 |     "    # background layer: the cluster's non-exemplar points in gray\n",
1119 |     "    fig.add_trace(\n",
1120 |     "        go.Scatter(\n",
1121 |     "            x=gray_rows[\"x\"],\n",
1122 |     "            y=gray_rows[\"y\"],\n",
1123 |     "            mode=\"markers\",\n",
1124 |     "            marker_color=custom_scale[0],\n",
1125 |     "            text=gray_rows[\"cluster + Prompt\"],\n",
1126 |     "        )\n",
1127 |     "    )\n",
1128 |     "    # foreground layer: this cluster's exemplars, coloured by sub-cluster label\n",
1129 |     "    fig.add_trace(\n",
1130 |     "        go.Scatter(\n",
1131 |     "            x=ds_inner_exemplars[\"x\"],\n",
1132 |     "            y=ds_inner_exemplars[\"y\"],\n",
1133 |     "            mode=\"markers\",\n",
1134 |     "            marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n",
1135 |     "            marker_colorscale=\"Viridis\",\n",
1136 |     "            text=ds_inner_exemplars[\"cluster + Prompt\"],\n",
1137 |     "        )\n",
1138 |     "    )\n",
1139 |     "\n",
1140 |     "    fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n",
1141 |     "    fig.update_coloraxes(showscale=False)\n",
1142 |     "    fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n",
1143 |     "    fig.show()"
1144 |    ]
1145 |   },
1146 |   {
1147 |    "cell_type": "code",
1148 |    "execution_count": null,
1149 |    "metadata": {
1150 |     "id": "PuTTU71N5AkX"
1151 |    },
1152 |    "outputs": [],
1153 |    "source": [
1154 |     "# all exemplar rows whose sub-cluster label is not -1\n",
1155 |     "ds_inner_exemplars = ds[\n",
1156 |     "    (ds[\"exemplars yes/no\"] == 1) & (ds[\"exemplar sub-cluster\"] != -1)\n",
1157 |     "]"
1158 |    ]
1159 |   },
1160 |   {
1161 |    "cell_type": "code",
1162 |    "execution_count": null,
1163 |    "metadata": {
1164 |     "colab": {
1165 |      "base_uri": "https://localhost:8080/",
1166 |      "height": 673
1167 |     },
1168 |     "id": "rx58ZS3-40gI",
1169 |     "outputId": "786c3a8c-b837-4fba-cee8-a42bb05ca84d"
1170 |    },
1171 |    "outputs": [],
1172 |    "source": [
1173 |     "print(\"\\n\")\n",
1174 |     "\n",
1175 |     "# clustered points that are not exemplars form the gray background layer\n",
1176 |     "gray_rows = ds[(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)]\n",
1177 |     "\n",
1178 |     "fig = go.Figure()\n",
1179 |     "fig.add_trace(\n",
1180 |     "    go.Scatter(\n",
1181 |     "        x=gray_rows[\"x\"],\n",
1182 |     "        y=gray_rows[\"y\"],\n",
1183 |     "        mode=\"markers\",\n",
1184 |     "        marker_color=custom_scale[0],\n",
1185 |     "        text=gray_rows[\"cluster + Prompt\"],\n",
1186 |     "    )\n",
1187 |     ")\n",
1188 |     "# rows of ds_inner_exemplars on top, coloured by sub-cluster label\n",
1189 |     "fig.add_trace(\n",
1190 |     "    go.Scatter(\n",
1191 |     "        x=ds_inner_exemplars[\"x\"],\n",
1192 |     "        y=ds_inner_exemplars[\"y\"],\n",
1193 |     "        mode=\"markers\",\n",
1194 |     "        marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n",
1195 |     "        marker_colorscale=\"Viridis\",\n",
1196 |     "        text=ds_inner_exemplars[\"cluster + Prompt\"],\n",
1197 |     "    )\n",
1198 |     ")\n",
1199 |     "\n",
1200 |     "fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n",
1201 |     "fig.update_coloraxes(showscale=False)\n",
1202 |     "fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n",
1203 |     "fig.show()"
1204 |    ]
1205 |   },
1206 |   {
1207 |    "cell_type": "markdown",
1208 |    "metadata": {
1209 |     "id": "B9fADGA9gvwA"
1210 |    },
1211 |    "source": [
1212 |     "## Create summary themes knowledge graph"
1213 |    ]
1214 |   },
1215 |   {
1216 |    "cell_type": "code",
1217 |    "execution_count": null,
1218 |    "metadata": {
1219 |     "id": "kXz16XO5dd6j"
1220 |    },
1221 |    "outputs": [],
1222 |    "source": [
1223 |     "claude_prompt = \"Please identify and summarize the core theme for each Sub-Cluster. Respond as succinctly as possible. Each summary cannot be longer than 1 sentence. Do not skip any of the Sub-Clusters. Do not list out the names of individuals in the prompts. Let's think step by step before responding.\"\n",
1224 |     "\n",
1225 |     "num_subclusts = 0\n",
1226 |     "subclusts_in_order = []\n",
1227 |     "clusts_of_interest = set(clust_to_zoom_list)\n",
1228 |     "\n",
1229 |     "for clust in np.unique(np.array(ds[(ds[\"exemplars yes/no\"] == 1)][\"cluster\"])):\n",
1230 |     "    # only the clusters selected for zooming contribute to the prompt\n",
1231 |     "    if clust not in clusts_of_interest:\n",
1232 |     "        continue\n",
1233 |     "    sub_df = ds[(ds[\"cluster\"] == clust) & (ds[\"exemplars yes/no\"] == 1)]\n",
1234 |     "    # np.unique returns the names already sorted, so no extra sort is needed\n",
1235 |     "    for sub_clust in np.unique(np.array(sub_df[\"cluster XX.YY\"])):\n",
1236 |     "        # skip exemplars whose sub-cluster label is -1\n",
1237 |     "        if int(sub_clust.split(\"Sub-Cluster \")[-1]) == -1:\n",
1238 |     "            continue\n",
1239 |     "        num_subclusts += 1\n",
1240 |     "        subclusts_in_order.append(sub_clust)\n",
1241 |     "        sub_prompts = sub_df[sub_df[\"cluster XX.YY\"] == sub_clust][\"Prompt\"].astype(str)\n",
1242 |     "        claude_prompt += \"\\n\" + str(sub_clust.split(\", \")[1]) + \": \"\n",
1243 |     "        # wrap the listing in a balanced pair of quotes (was: closing quote only)\n",
1244 |     "        claude_prompt += '\\n\"' + str(sub_prompts) + '\"\\n'"
1245 |    ]
1246 |   },
1247 |   {
1248 |    "cell_type": "code",
1249 |    "execution_count": null,
1250 |    "metadata": {
1251 |     "id": "y49WrTOLq_Dy"
1252 |    },
1253 |    "outputs": [],
1254 |    "source": [
1255 |     "# print(claude_prompt)\n",
1256 |     "# subclusts_in_order"
1257 |    ]
1258 |   },
1259 |   {
1260 |    "cell_type": "code",
1261 |    "execution_count": null,
1262 |    "metadata": {
1263 |     "colab": {
1264 |      "base_uri": "https://localhost:8080/"
1265 |     },
1266 |     "id": "c0zRcjt0rBJd",
1267 |     "outputId": "94f1503d-78b9-44e3-cc71-07b18f614e6d"
1268 |    },
1269 |    "outputs": [],
1270 |    "source": [
1271 |     "num_subclusts"
1272 |    ]
1273 |   },
1274 |   {
1275 |    "cell_type": "code",
1276 |    "execution_count": null,
1277 |    "metadata": {
1278 |     "colab": {
1279 |      "base_uri": "https://localhost:8080/"
1280 |     },
1281 |     "id": "Qt-LdSAFlvxh",
1282 |     "outputId": "71994546-fcc8-4484-a8ea-3674b1f5f594"
1283 |    },
1284 |    "outputs": [],
1285 |    "source": [
1286 |     "claude_prompt.count(\"Sub-Cluster \")"
1287 |    ]
1288 |   },
1289 |   {
1290 |    "cell_type": "code",
1291 |    "execution_count": null,
1292 |    "metadata": {
1293 |     "id": "vqR42TGDnLvh"
1294 |    },
1295 |    "outputs": [],
1296 |    "source": [
1297 |     "ds_exemps = ds[(ds[\"exemplars yes/no\"] == 1) & (ds[\"exemplar sub-cluster\"] != -1)]\n",
1298 |     "\n",
1299 |     "mask = ds_exemps[\"cluster\"].isin(clust_to_zoom_list)\n",
1300 |     "ds_exemps_of_interest = ds_exemps[mask]\n",
1301 |     "# ds_exemps_of_interest"
1302 |    ]
1303 |   },
1304 |   {
1305 |    "cell_type": "code",
1306 |    "execution_count": null,
1307 |    "metadata": {
1308 |     "colab": {
1309 |      "base_uri": "https://localhost:8080/"
1310 |     },
1311 |     "id": "TJkN6z9clcSR",
1312 |     "outputId": "106b7414-2281-4612-af62-b1d1c73e7422"
1313 |    },
1314 |    "outputs": [],
1315 |    "source": [
1316 |     "len(np.unique(np.array(ds_exemps_of_interest[\"cluster XX.YY\"])))"
1317 |    ]
1318 |   },
1319 |   {
1320 |    "cell_type": "code",
1321 |    "execution_count": null,
1322 |    "metadata": {
1323 |     "colab": {
1324 |      "base_uri": "https://localhost:8080/"
1325 |     },
1326 |     "id": "fEqyYu9tn8vD",
1327 |     "outputId": "92679d2f-605d-4236-ae12-8226e341f9b4"
1328 |    },
1329 |    "outputs": [],
1330 |    "source": [
1331 |     "tokenizer = tiktoken.encoding_for_model(\"gpt-3.5-turbo-16k\")\n",
1332 |     "len(tokenizer.encode(claude_prompt))"
1333 |    ]
1334 |   },
1335 |   {
1336 |    "cell_type": "code",
1337 |    "execution_count": null,
1338 |    "metadata": {
1339 |     "colab": {
1340 |      "base_uri": "https://localhost:8080/",
1341 |      "height": 214
1342 |     },
1343 |     "id": "VTTbFvQds-yc",
1344 |     "outputId": "798ae301-742c-488a-bd9b-b4e991256fb0"
1345 |    },
1346 |    "outputs": [],
1347 |    "source": [
1348 |     "claude_prompt"
1349 |    ]
1350 |   },
1351 |   {
1352 |    "cell_type": "code",
1353 |    "execution_count": null,
1354 |    "metadata": {
1355 |     "colab": {
1356 |      "base_uri": "https://localhost:8080/"
1357 |     },
1358 |     "id": "DfAbeTWZp2M5",
1359 |     "outputId": "3f9e0d5d-939d-4d00-d170-4efb4071ac08"
1360 |    },
1361 |    "outputs": [],
1362 |    "source": [
1363 |     "# saved response from claude-2 conversation\n",
1364 |     "\n",
1365 |     "text_generation = \"\"\"Sub-Cluster 19: Portraits of characters in lofi style by various artists.\n",
1366 |     "\n",
1367 |     "Sub-Cluster 17: Symmetry portraits of various people and characters.\n",
1368 |     "\n",
1369 |     "Sub-Cluster 18: Symmetry sci-fi portraits of characters and people.\n",
1370 |     "\n",
1371 |     "Sub-Cluster 128: Highly detailed illustrations of people, often describing hair and age.\n",
1372 |     "\n",
1373 |     "Sub-Cluster 162: Highly detailed illustrations of sadistic or aggressive looking people.\n",
1374 |     "\n",
1375 |     "Sub-Cluster 163: Highly detailed illustrations of attractive people, often with white hair.\n",
1376 |     "\n",
1377 |     "Sub-Cluster 75: Highly detailed illustrations of beautiful, fierce, or smug women.\n",
1378 |     "\n",
1379 |     "Sub-Cluster 76: Art of the League of Legends champion Vi.\n",
1380 |     "\n",
1381 |     "Sub-Cluster 66: Greg Manchess portrait paintings of various characters as different roles.\n",
1382 |     "\n",
1383 |     "Sub-Cluster 78: Art and portraits featuring Star Wars characters, especially Darth Vader.\n",
1384 |     "\n",
1385 |     "Sub-Cluster 91: Portraits and art of female cyborg characters.\n",
1386 |     "\n",
1387 |     "Sub-Cluster 93: Art and portraits of robots and humanoid AI characters.\n",
1388 |     "\n",
1389 |     "Sub-Cluster 107: Art of Vladimir Putin being killed or defeated.\n",
1390 |     "\n",
1391 |     "Sub-Cluster 168: Portraits of Putin and Biden as magical characters.\n",
1392 |     "\n",
1393 |     "Sub-Cluster 235: Art depicting Vladimir Putin as various monsters, animals, or in humiliating situations.\n",
1394 |     "\n",
1395 |     "Sub-Cluster 236: Art of Putin with Kim Jong Un's haircut.\n",
1396 |     "\n",
1397 |     "Sub-Cluster 164: Art of characters like aliens eating hamburgers.\n",
1398 |     "\n",
1399 |     "Sub-Cluster 191: Art of Final Fantasy 7 character Sephiroth.\n",
1400 |     "\n",
1401 |     "Sub-Cluster 192: Beautiful, award winning pencil drawings and illustrations.\n",
1402 |     "\n",
1403 |     "Sub-Cluster 240: Portraits of celebrities eating hamburgers.\n",
1404 |     "\n",
1405 |     "Sub-Cluster 241: Portraits of various real people and characters eating hamburgers.\n",
1406 |     "\n",
1407 |     "Sub-Cluster 85: Art and portraits of dragons in various settings.\n",
1408 |     "\n",
1409 |     "Sub-Cluster 92: Art depicting Donald Trump in various roles and situations.\n",
1410 |     "\n",
1411 |     "Sub-Cluster 89: Art and portraits of Batman characters.\n",
1412 |     "\n",
1413 |     "Sub-Cluster 90: Art of Spider-Man and related Marvel characters.\n",
1414 |     "\n",
1415 |     "Sub-Cluster 135: Award winning portrait commissions.\n",
1416 |     "\n",
1417 |     "Sub-Cluster 136: Award winning portrait commissions of furry characters.\n",
1418 |     "\n",
1419 |     "Sub-Cluster 143: Anthropomorphic furry fox characters.\n",
1420 |     "\n",
1421 |     "Sub-Cluster 184: Trending furry fox character art.\n",
1422 |     "\n",
1423 |     "Sub-Cluster 185: Beautiful portrait commissions of furry characters.\n",
1424 |     "\n",
1425 |     "Sub-Cluster 65: Art and portraits of fox characters in various outfits and settings.\n",
1426 |     "\n",
1427 |     "Sub-Cluster 121: Portraits and art of cats in various styles.\n",
1428 |     "\n",
1429 |     "Sub-Cluster 146: Portraits of goddesses and divine figures.\n",
1430 |     "\n",
1431 |     "Sub-Cluster 178: Portraits of Megan Fox as characters from video games.\n",
1432 |     "\n",
1433 |     "Sub-Cluster 186: Psychedelic and Lovecraftian portraits of Megan Fox.\n",
1434 |     "\n",
1435 |     "Sub-Cluster 187: Portraits of Megan Fox in various roles and outfits.\n",
1436 |     "\n",
1437 |     "Sub-Cluster 54: Portraits of Emma Watson in various roles and settings.\n",
1438 |     "\n",
1439 |     "Sub-Cluster 74: Alexandra Daddario and Megan Fox as Scarlet Witch.\n",
1440 |     "\n",
1441 |     "Sub-Cluster 84: Futuristic and fantasy vehicle concept art.\n",
1442 |     "\n",
1443 |     "Sub-Cluster 94: Highly detailed realistic portraits of men.\n",
1444 |     "\n",
1445 |     "Sub-Cluster 113: Anime girl character portraits and concept art.\n",
1446 |     "\n",
1447 |     "Sub-Cluster 117: Portraits of beautiful women in various settings.\n",
1448 |     "\n",
1449 |     "Sub-Cluster 118: Portraits of young women in various outfits and styles.\n",
1450 |     "\n",
1451 |     "Sub-Cluster 81: Cinematic concept art portraits by Jama Jurabaev.\n",
1452 |     "\n",
1453 |     "Sub-Cluster 82: Futuristic sci-fi spaceship concept art.\n",
1454 |     "\n",
1455 |     "Sub-Cluster 125: Concept art of knights and warriors.\n",
1456 |     "\n",
1457 |     "Sub-Cluster 132: Surreal, cinematic, and futuristic digital art.\n",
1458 |     "\n",
1459 |     "Sub-Cluster 167: Beautiful landscape paintings and matte art.\n",
1460 |     "\n",
1461 |     "Sub-Cluster 151: Futuristic cityscape concept art.\"\"\"\n",
1462 |     "\n",
1463 |     "text_generation = text_generation.split(\"\\n\\n\")\n",
1464 |     "len(text_generation)"
1465 |    ]
1466 |   },
1467 |   {
1468 |    "cell_type": "code",
1469 |    "execution_count": null,
1470 |    "metadata": {
1471 |     "colab": {
1472 |      "base_uri": "https://localhost:8080/"
1473 |     },
1474 |     "id": "qHa4rgqcrQM0",
1475 |     "outputId": "5d6d6cf1-431f-4c19-a3ee-cbe5e37b9e6d"
1476 |    },
1477 |    "outputs": [],
1478 |    "source": [
1479 |     "summaries_dict = {\n",
1480 |     "    subclusts_in_order[i]: text_generation[i] for i in range(len(subclusts_in_order))\n",
1481 |     "}\n",
1482 |     "summaries_dict"
1483 |    ]
1484 |   },
1485 |   {
1486 |    "cell_type": "code",
1487 |    "execution_count": null,
1488 |    "metadata": {
1489 |     "id": "kI0Qs_ITvNzG"
1490 |    },
1491 |    "outputs": [],
1492 |    "source": [
1493 |     "# every saved summary must begin with the sub-cluster label of its key\n",
1494 |     "for key, summary in summaries_dict.items():\n",
1495 |     "    key_subclust = key.split(\", \")[-1]\n",
1496 |     "    summary_subclust = summary.split(\": \")[0]\n",
1497 |     "    assert key_subclust == summary_subclust, (\n",
1498 |     "        f\"summary mismatch: {key!r} is paired with {summary_subclust!r}\"\n",
1499 |     "    )"
1500 |    ]
1501 |   },
1502 |   {
1503 |    "cell_type": "code",
1504 |    "execution_count": null,
1505 |    "metadata": {
1506 |     "id": "ktj6ipyctjOs"
1507 |    },
1508 |    "outputs": [],
1509 |    "source": [
1510 |     "# for i in range(len(text_generation)):\n",
1511 |     "# text_generation[i] = text_generation[i].split(\": \")[-1]"
1512 |    ]
1513 |   },
1514 |   {
1515 |    "cell_type": "code",
1516 |    "execution_count": null,
1517 |    "metadata": {
1518 |     "colab": {
1519 |      "base_uri": "https://localhost:8080/"
1520 |     },
1521 |     "id": "CmQLnLiXtf7u",
1522 |     "outputId": "dce2694a-49b6-4a1a-a956-15176a1d8798"
1523 |    },
1524 |    "outputs": [],
1525 |    "source": [
1526 |     "summaries_dict_cleaned = {\n",
1527 |     "    subclusts_in_order[i]: text_generation[i] for i in range(len(subclusts_in_order))\n",
1528 |     "}\n",
1529 |     "summaries_dict_cleaned"
1530 |    ]
1531 |   },
1532 |   {
1533 |    "cell_type": "code",
1534 |    "execution_count": null,
1535 |    "metadata": {
1536 |     "colab": {
1537 |      "base_uri": "https://localhost:8080/"
1538 |     },
1539 |     "id": "NQ8XhfRZuo8l",
1540 |     "outputId": "365d59d2-f01b-442d-d01e-7cc5baa3e625"
1541 |    },
1542 |    "outputs": [],
1543 |    "source": [
1544 |     "# Look up each row's \"cluster XX.YY\" label in the summaries dict in one\n",
1545 |     "# vectorised pass. Rows whose label has no summary (including NaN labels)\n",
1546 |     "# get NaN, like the old per-row try/except did on a fresh column, and the\n",
1547 |     "# \"theme\" column is now created even when no label matches.\n",
1548 |     "ds[\"theme\"] = ds[\"cluster XX.YY\"].map(summaries_dict_cleaned)"
1549 |    ]
1550 |   },
1551 |   {
1552 |    "cell_type": "code",
1553 |    "execution_count": null,
1554 |    "metadata": {
1555 |     "colab": {
1556 |      "base_uri": "https://localhost:8080/",
1557 |      "height": 35
1558 |     },
1559 |     "id": "RgKbnRtut3YG",
1560 |     "outputId": "125d5a2f-164e-4b9c-c0bd-c0325bfc8199"
1561 |    },
1562 |    "outputs": [],
1563 |    "source": [
1564 |     "ds[\"theme\"][ds[\"cluster XX.YY\"] == \"Cluster 77, Sub-Cluster 151\"].iloc[0]"
1565 |    ]
1566 |   },
1567 |   {
1568 |    "cell_type": "markdown",
1569 |    "metadata": {
1570 |     "id": "0CYEH057uyj8"
1571 |    },
1572 |    "source": [
1573 |     "## Write final df results to disk"
1574 |    ]
1575 |   },
1576 |   {
1577 |    "cell_type": "code",
1578 |    "execution_count": null,
1579 |    "metadata": {
1580 |     "colab": {
1581 |      "base_uri": "https://localhost:8080/",
1582 |      "height": 1000
1583 |     },
1584 |     "id": "Yjqesxe_oitS",
1585 |     "outputId": "620d1909-6874-4e4a-a988-3a6f2df64c94"
1586 |    },
1587 |    "outputs": [],
1588 |    "source": [
1589 |     "# write final ds to disk\n",
1590 |     "f_name = os.path.join(\n",
1591 |     "    args.cache_folder, \"stable_diffusion_prompts_dataframe_cached_with_results.xlsx\"\n",
1592 |     ")\n",
1593 |     "print(f_name, \"\\n\")\n",
1594 |     "\n",
1595 |     "# re-order cols\n",
1596 |     "ds = ds[\n",
1597 |     "    [\n",
1598 |     "        \"id\",\n",
1599 |     "        \"cluster\",\n",
1600 |     "        \"x\",\n",
1601 |     "        \"y\",\n",
1602 |     "        \"cluster membership prob\",\n",
1603 |     "        \"exemplars yes/no\",\n",
1604 |     "        \"exemplar sub-cluster\",\n",
1605 |     "        \"cluster XX.YY\",\n",
1606 |     "        \"theme\",\n",
1607 |     "        \"Prompt\",\n",
1608 |     "    ]\n",
1609 |     "]\n",
1610 |     "ds"
1611 |    ]
1612 |   },
1613 |   {
1614 |    "cell_type": "code",
1615 |    "execution_count": null,
1616 |    "metadata": {
1617 |     "id": "CuoIn4KCvyPR"
1618 |    },
1619 |    "outputs": [],
1620 |    "source": [
1621 |     "# write with adjusted col width\n",
1622 |     "# if not args.inference:\n",
1623 |     "if True:\n",
1624 |     "    with pd.ExcelWriter(f_name) as writer:\n",
1625 |     "        ds.to_excel(writer, sheet_name=\"All Prompts\")\n",
1626 |     "        auto_adjust_xlsx_column_width(ds, writer, sheet_name=\"All Prompts\", margin=1)"
1627 |    ]
1628 |   },
1629 |   {
1630 |    "cell_type": "markdown",
1631 |    "metadata": {
1632 |     "id": "CNrkolSLu1tg"
1633 |    },
1634 |    "source": [
1635 |     "## Format a JSON viz graph"
1636 |    ]
1637 |   },
1638 |   {
1639 |    "cell_type": "code",
1640 |    "execution_count": null,
1641 |    "metadata": {
1642 |     "colab": {
1643 |      "base_uri": "https://localhost:8080/",
1644 |      "height": 35
1645 |     },
1646 |     "id": "DGsJa7GrN2cf",
1647 |     "outputId": "65f53e9b-0a91-47ad-c800-9ff6db6e5e2e"
1648 |    },
1649 |    "outputs": [],
1650 |    "source": [
1651 |     "args.cache_folder"
1652 |    ]
1653 |   },
1654 |   {
1655 |    "cell_type": "code",
1656 |    "execution_count": null,
1657 |    "metadata": {
1658 |     "colab": {
1659 |      "base_uri": "https://localhost:8080/",
1660 |      "height": 1000
1661 |     },
1662 |     "id": "bUhn-leKp0Yv",
1663 |     "outputId": "ca475888-d96d-4f5a-d23d-d80c7e693172"
1664 |    },
1665 |    "outputs": [],
1666 |    "source": [
1667 |     "# optional ds cached loading\n",
1668 |     "ds_loaded = pd.read_excel(\n",
1669 |     "    os.path.join(\n",
1670 |     "        args.cache_folder, \"stable_diffusion_prompts_dataframe_cached_with_results.xlsx\"\n",
1671 |     "    ),\n",
1672 |     "    index_col=\"Unnamed: 0\",\n",
1673 |     ")\n",
1674 |     "ds_loaded"
1675 |    ]
1676 |   },
1677 |   {
1678 |    "cell_type": "code",
1679 |    "execution_count": null,
1680 |    "metadata": {
1681 |     "colab": {
1682 |      "base_uri": "https://localhost:8080/",
1683 |      "height": 1000
1684 |     },
1685 |     "id": "829s5RJtxJ2j",
1686 |     "outputId": "a37154d2-a6db-4ead-e0ba-a7600a18f100"
1687 |    },
1688 |    "outputs": [],
1689 |    "source": [
1690 |     "ds_clust = ds_loaded[ds_loaded[\"theme\"].notna()]\n",
1691 |     "ds_clust"
1692 |    ]
1693 |   },
1694 |   {
1695 |    "cell_type": "code",
1696 |    "execution_count": null,
1697 |    "metadata": {
1698 |     "colab": {
1699 |      "base_uri": "https://localhost:8080/"
1700 |     },
1701 |     "id": "Q_H93u9QuC2u",
1702 |     "outputId": "84bc7bd9-3eed-4f6f-cd74-a32291211c67"
1703 |    },
1704 |    "outputs": [],
1705 |    "source": [
1706 |     "len(np.unique(np.array(ds_clust[\"cluster XX.YY\"])))"
1707 |    ]
1708 |   },
1709 |   {
1710 |    "cell_type": "code",
1711 |    "execution_count": null,
1712 |    "metadata": {
1713 |     "id": "oc1HFPyBxnck"
1714 |    },
1715 |    "outputs": [],
1716 |    "source": [
1717 |     "# Build one record per sub-cluster: its theme and its \"cluster XX.YY\" label.\n",
1718 |     "# The frequency / count / exemplars fields are currently commented out of the record.\n",
1719 |     "knowledge_graphs = []\n",
1720 |     "\n",
1721 |     "for sub_clust in np.unique(np.array(ds_clust[\"cluster XX.YY\"])):\n",
1722 |     "    # parent cluster number: the text between \"Cluster \" and the first comma\n",
1723 |     "    clust = sub_clust.split(\"Cluster \")[1].split(\",\")[0]\n",
1724 |     "    ds_inner = ds_clust[ds_clust[\"cluster XX.YY\"] == sub_clust]\n",
1725 |     "\n",
1726 |     "    # prompts flagged as exemplars; only the disabled \"exemplars\" field below reads them\n",
1727 |     "    exemplar_rows = ds_inner[ds_inner[\"exemplars yes/no\"] == 1]\n",
1728 |     "    prompts = [\n",
1729 |     "        {\"Prompt\": text, \"id\": float(prompt_id)}\n",
1730 |     "        for text, prompt_id in zip(exemplar_rows[\"Prompt\"], exemplar_rows[\"id\"])\n",
1731 |     "    ]\n",
1732 |     "\n",
1733 |     "    first_row = ds_inner.iloc[0]\n",
1734 |     "    viz = {\n",
1735 |     "        \"core theme\": first_row[\"theme\"],\n",
1736 |     "        \"cluster id\": first_row[\"cluster XX.YY\"],\n",
1737 |     "        # \"frequency\": str(np.round(100 * len(ds_inner) / len(ds), 2)) + \"%\",\n",
1738 |     "        # \"count\": len(ds_inner),\n",
1739 |     "        # \"exemplars\": prompts,\n",
1740 |     "    }\n",
1741 |     "\n",
1742 |     "    knowledge_graphs.append(viz)"
1743 |    ]
1744 |   },
1745 |   {
1746 |    "cell_type": "code",
1747 |    "execution_count": null,
1748 |    "metadata": {
1749 |     "colab": {
1750 |      "base_uri": "https://localhost:8080/"
1751 |     },
1752 |     "id": "iwpVNL_vOze_",
1753 |     "outputId": "1b9c8202-67a1-4aad-b398-7cea61e36bb0"
1754 |    },
1755 |    "outputs": [],
1756 |    "source": [
1757 |     "# Pretty-print each sub-cluster record as its own JSON document.\n",
1758 |     "# Walks the list directly rather than re-running np.unique for every index.\n",
1759 |     "for graph in knowledge_graphs:\n",
1760 |     "    # Serializing json\n",
1761 |     "    json_object = json.dumps(graph, indent=4)\n",
1762 |     "    print(json_object)\n",
1763 |     "    print(\"\\n\\n\")"
1764 |    ]
1765 |   },
1766 |   {
1767 |    "cell_type": "code",
1768 |    "execution_count": null,
1769 |    "metadata": {
1770 |     "id": "uqu2JMCcw3I4"
1771 |    },
1772 |    "outputs": [],
1773 |    "source": [
1774 |     "# summaries_dict_cleaned[\"Cluster 75, Sub-Cluster 167\"]"
1775 |    ]
1776 |   },
1777 |   {
1778 |    "cell_type": "code",
1779 |    "execution_count": null,
1780 |    "metadata": {
1781 |     "id": "sr7BqFUM5E96"
1782 |    },
1783 |    "outputs": [],
1784 |    "source": [
1785 |     "# cluster label -> sorted unique themes found among that cluster's themed rows\n",
1786 |     "summaries_dict_cluster_level = {\n",
1787 |     "    clust: list(np.unique(np.array(ds_clust.loc[ds_clust[\"cluster\"] == clust, \"theme\"])))\n",
1788 |     "    for clust in np.unique(np.array(ds_clust[\"cluster\"]))\n",
1789 |     "}"
1790 |    ]
1791 |   },
1792 |   {
1793 |    "cell_type": "code",
1794 |    "execution_count": null,
1795 |    "metadata": {
1796 |     "colab": {
1797 |      "base_uri": "https://localhost:8080/"
1798 |     },
1799 |     "id": "_Oe7fGdKyK7q",
1800 |     "outputId": "66f7173a-fc25-42d3-d380-9fc3ba50bfa0"
1801 |    },
1802 |    "outputs": [],
1803 |    "source": [
1804 |     "summaries_dict_cluster_level"
1805 |    ]
1806 |   },
1807 |   {
1808 |    "cell_type": "code",
1809 |    "execution_count": null,
1810 |    "metadata": {
1811 |     "colab": {
1812 |      "base_uri": "https://localhost:8080/"
1813 |     },
1814 |     "id": "Ju1VNC4w5yQD",
1815 |     "outputId": "a5514fc5-dcd2-4c41-ddd5-3ceb8f2d9e0e"
1816 |    },
1817 |    "outputs": [],
1818 |    "source": [
1819 |     "summaries_dict_cluster_level[10]"
1820 |    ]
1821 |   },
1822 |   {
1823 |    "cell_type": "code",
1824 |    "execution_count": null,
1825 |    "metadata": {
1826 |     "id": "rX2glhje2sYI"
1827 |    },
1828 |    "outputs": [],
1829 |    "source": [
1830 |     "# One record per themed cluster, ordered by how often the cluster occurs in ds_loaded.\n",
1831 |     "knowledge_graphs = []\n",
1832 |     "\n",
1833 |     "# loop-invariant lookups, computed once instead of on every iteration\n",
1834 |     "cluster_counts = ds_loaded[\"cluster\"].value_counts()\n",
1835 |     "themed_clusters = set(np.unique(np.array(ds_clust[\"cluster\"])))\n",
1836 |     "\n",
1837 |     "for clust in cluster_counts.index:\n",
1838 |     "    # clusters with no themed rows are left out\n",
1839 |     "    if clust not in themed_clusters:\n",
1840 |     "        continue\n",
1841 |     "    ds_inner = ds_clust[ds_clust[\"cluster\"] == int(clust)]\n",
1842 |     "    cluster_size = float(cluster_counts.loc[int(clust)])\n",
1843 |     "\n",
1844 |     "    viz = {\n",
1845 |     "        \"cluster id\": \"Cluster \" + str(ds_inner.iloc[0][\"cluster\"]),\n",
1846 |     "        \"count\": cluster_size,\n",
1847 |     "        # percentage of all rows in ds_loaded, rounded to 2 decimals\n",
1848 |     "        \"frequency\": str(np.round(100 * cluster_size / len(ds_loaded), 2)) + \"%\",\n",
1849 |     "        \"core theme\": summaries_dict_cluster_level[clust],\n",
1850 |     "    }\n",
1851 |     "\n",
1852 |     "    knowledge_graphs.append(viz)"
1853 |    ]
1854 |   },
1855 |   {
1856 |    "cell_type": "code",
1857 |    "execution_count": null,
1858 |    "metadata": {
1859 |     "colab": {
1860 |      "base_uri": "https://localhost:8080/"
1861 |     },
1862 |     "id": "_lgrhx7H4iC7",
1863 |     "outputId": "19b187c7-a1b9-44d6-cee7-d63dce526198"
1864 |    },
1865 |    "outputs": [],
1866 |    "source": [
1867 |     "len(knowledge_graphs)"
1868 |    ]
1869 |   },
1870 |   {
1871 |    "cell_type": "code",
1872 |    "execution_count": null,
1873 |    "metadata": {
1874 |     "id": "MKfP4tMj8PHL"
1875 |    },
1876 |    "outputs": [],
1877 |    "source": [
1878 |     "knowledge_graphs = {\"knowledge graph\": knowledge_graphs}"
1879 |    ]
1880 |   },
1881 |   {
1882 |    "cell_type": "code",
1883 |    "execution_count": null,
1884 |    "metadata": {
1885 |     "colab": {
1886 |      "base_uri": "https://localhost:8080/"
1887 |     },
1888 |     "id": "Gi-uJ6Ds2sYJ",
1889 |     "outputId": "4c00ec28-6aa5-497e-ccc8-fb823fa947bf"
1890 |    },
1891 |    "outputs": [],
1892 |    "source": [
1893 |     "# Serializing json\n",
1894 |     "json_object = json.dumps(knowledge_graphs, indent=4)\n",
1895 |     "print(json_object)\n",
1896 |     "print(\"\\n\\n\")"
1897 |    ]
1898 |   },
1899 |   {
1900 |    "cell_type": "markdown",
1901 |    "metadata": {
1902 |     "id": "p3NXLZVOPpFk"
1903 |    },
1904 |    "source": [
1905 |     "## Drift detection on the top 25 clusters\n"
1906 |    ]
1907 |   },
1908 |   {
1909 |    "cell_type": "code",
1910 |    "execution_count": null,
1911 |    "metadata": {
1912 |     "colab": {
1913 |      "base_uri": "https://localhost:8080/",
1914 |      "height": 424
1915 |     },
1916 |     "id": "t_zMrCONOhpd",
1917 |     "outputId": "bf2461d3-e5c4-477f-8655-053096f19a99"
1918 |    },
1919 |    "outputs": [],
1920 |    "source": [
1921 |     "ds_hf = load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")\n",
1922 |     "ds = ds_hf[\"test\"]\n",
1923 |     "\n",
1924 |     "ds = ds.to_pandas()\n",
1925 |     "ds[\"id\"] = ds.index\n",
1926 |     "ds = ds[[\"id\", \"Prompt\"]]\n",
1927 |     "ds"
1928 |    ]
1929 |   },
1930 |   {
1931 |    "cell_type": "code",
1932 |    "execution_count": null,
1933 |    "metadata": {
1934 |     "colab": {
1935 |      "base_uri": "https://localhost:8080/"
1936 |     },
1937 |     "id": "9KgCWCIlvsFn",
1938 |     "outputId": "4069f096-8b20-4ba5-9fb9-2a2c064c8208"
1939 |    },
1940 |    "outputs": [],
1941 |    "source": [
1942 |     "args.inference = True\n",
1943 |     "args"
1944 |    ]
1945 |   },
1946 |   {
1947 |    "cell_type": "code",
1948 |    "execution_count": null,
1949 |    "metadata": {
1950 |     "id": "8B2Ay5OfvftL"
1951 |    },
1952 |    "outputs": [],
1953 |    "source": [
1954 |     "# Embed Text\n",
1955 |     "# * See [pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information\n",
1956 |     "\n",
1957 |     "model = SentenceTransformer(\"all-mpnet-base-v2\")\n",
1958 |     "\n",
1959 |     "if not args.inference:\n",
1960 |     "    # one 768-wide row per prompt, filled in by encoding the prompts one at a time\n",
1961 |     "    embeddings = torch.zeros([len(ds), 768])\n",
1962 |     "    for i, prompt in enumerate(tqdm(ds[\"Prompt\"])):\n",
1963 |     "        embeddings[i, :] = model.encode(prompt, convert_to_tensor=True)\n",
1964 |     "    embeddings"
1965 |    ]
1966 |   },
1967 |   {
1968 |    "cell_type": "code",
1969 |    "execution_count": null,
1970 |    "metadata": {
1971 |     "colab": {
1972 |      "base_uri": "https://localhost:8080/"
1973 |     },
1974 |     "id": "PshHjy3NvgBA",
1975 |     "outputId": "7805b3cb-7d9b-4ee6-93ae-bf771f755f2a"
1976 |    },
1977 |    "outputs": [],
1978 |    "source": [
1979 |     "f_name = os.path.join(\n",
1980 |     "    args.cache_folder, \"stable_diffusion_prompts_test_embeddings_all_mpnet_base_v2.pt\"\n",
1981 |     ")\n",
1982 |     "print(f_name, \"\\n\")\n",
1983 |     "\n",
1984 |     "if not args.inference:\n",
1985 |     "    torch.save(embeddings, f_name)\n",
1986 |     "loaded_embeddings = torch.load(f_name)\n",
1987 |     "loaded_embeddings"
1988 |    ]
1989 |   },
1990 |   {
1991 |    "cell_type": "code",
1992 |    "execution_count": null,
1993 |    "metadata": {
1994 |     "colab": {
1995 |      "base_uri": "https://localhost:8080/"
1996 |     },
1997 |     "id": "HbsirlY0vgBB",
1998 |     "outputId": "08e5f4fb-32a9-4e04-ca70-f2e4a407d5d1"
1999 |    },
2000 |    "outputs": [],
2001 |    "source": [
2002 |     "loaded_embeddings.shape"
2003 |    ]
2004 |   },
2005 |   {
2006 |    "cell_type": "code",
2007 |    "execution_count": null,
2008 |    "metadata": {
2009 |     "id": "hvU350V2vgBB"
2010 |    },
2011 |    "outputs": [],
2012 |    "source": [
2013 |     "if not args.inference:\n",
2014 |     "    print(torch.equal(loaded_embeddings.cpu(), embeddings.cpu()))  # an expression nested in `if` is not auto-displayed, so print it"
2015 |    ]
2016 |   },
2017 |   {
2018 |    "cell_type": "code",
2019 |    "execution_count": null,
2020 |    "metadata": {
2021 |     "colab": {
2022 |      "base_uri": "https://localhost:8080/"
2023 |     },
2024 |     "id": "leU5pK6mvgBB",
2025 |     "outputId": "7546ce0b-3f7e-421f-edcd-b0063b51da79"
2026 |    },
2027 |    "outputs": [],
2028 |    "source": [
2029 |     "test_itr = random.randint(low=0, high=len(ds))\n",
2030 |     "test_itr"
2031 |    ]
2032 |   },
2033 |   {
2034 |    "cell_type": "code",
2035 |    "execution_count": null,
2036 |    "metadata": {
2037 |     "colab": {
2038 |      "base_uri": "https://localhost:8080/",
2039 |      "height": 71
2040 |     },
2041 |     "id": "M_M-XVkpvgBB",
2042 |     "outputId": "3dc80f2c-06c8-4def-c70e-63720ff52f45"
2043 |    },
2044 |    "outputs": [],
2045 |    "source": [
2046 |     "# test embeddings worked\n",
2047 |     "ds.loc[test_itr, \"Prompt\"]"
2048 |    ]
2049 |   },
2050 |   {
2051 |    "cell_type": "code",
2052 |    "execution_count": null,
2053 |    "metadata": {
2054 |     "colab": {
2055 |      "base_uri": "https://localhost:8080/"
2056 |     },
2057 |     "id": "iBffCA3NvgBB",
2058 |     "outputId": "36a9dd66-77d2-44fd-c07b-6b250f955258"
2059 |    },
2060 |    "outputs": [],
2061 |    "source": [
2062 |     "test_emb = model.encode(ds.loc[test_itr, \"Prompt\"], convert_to_tensor=True)\n",
2063 |     "a = np.array(test_emb.cpu())\n",
2064 |     "b = np.array(loaded_embeddings[test_itr, :].cpu())\n",
2065 |     "np.allclose(a, b, rtol=1e-02)"
2066 |    ]
2067 |   },
2068 |   {
2069 |    "cell_type": "code",
2070 |    "execution_count": null,
2071 |    "metadata": {
2072 |     "colab": {
2073 |      "base_uri": "https://localhost:8080/"
2074 |     },
2075 |     "id": "xgUPGdAfv62x",
2076 |     "outputId": "467a7ec8-ce53-4a8f-e3de-ec524a7d9a76"
2077 |    },
2078 |    "outputs": [],
2079 |    "source": [
2080 |     "# Clustering on the new prompts, step 1: reduce the embeddings from 768 to 15 dimensions\n",
2081 |     "f_name = os.path.join(args.cache_folder, \"reducer_umap_15.pkl\")\n",
2082 |     "print(f_name, \"\\n\")\n",
2083 |     "\n",
2084 |     "# NOTE: unpickling can run arbitrary code; only load cache files you trust\n",
2085 |     "with open(f_name, \"rb\") as reducer_file:\n",
2086 |     "    loaded_reducer_15 = pickle.load(reducer_file)\n",
2087 |     "\n",
2088 |     "embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)\n",
2089 |     "\n",
2090 |     "embeddings_umap_dim_15.shape"
2091 |    ]
2092 |   },
2093 |   {
2094 |    "cell_type": "code",
2095 |    "execution_count": null,
2096 |    "metadata": {
2097 |     "colab": {
2098 |      "base_uri": "https://localhost:8080/",
2099 |      "height": 130
2100 |     },
2101 |     "id": "-PM40fYpxPRA",
2102 |     "outputId": "b5058681-c52e-4fdf-c17a-75731ce94fae"
2103 |    },
2104 |    "outputs": [],
2105 |    "source": [
2106 |     "f_name = os.path.join(args.cache_folder, \"clusterer_hdbscan.pkl\")\n",
2107 |     "print(f_name, \"\\n\")\n",
2108 |     "with open(f_name, \"rb\") as clusterer_file:  # closes the handle once the clusterer is unpickled\n",
2109 |     "    loaded_clusterer = pickle.load(clusterer_file)\n",
2110 |     "loaded_clusterer"
2111 |    ]
2112 |   },
2113 |   {
2114 |    "cell_type": "code",
2115 |    "execution_count": null,
2116 |    "metadata": {
2117 |     "colab": {
2118 |      "base_uri": "https://localhost:8080/"
2119 |     },
2120 |     "id": "8dPZ614OxJoe",
2121 |     "outputId": "242b4da9-bdf5-4109-fb46-0719291bd9b6"
2122 |    },
2123 |    "outputs": [],
2124 |    "source": [
2125 |     "test_labels, strengths = hdbscan.approximate_predict(\n",
2126 |     "    loaded_clusterer, embeddings_umap_dim_15\n",
2127 |     ")\n",
2128 |     "test_labels"
2129 |    ]
2130 |   },
2131 |   {
2132 |    "cell_type": "code",
2133 |    "execution_count": null,
2134 |    "metadata": {
2135 |     "colab": {
2136 |      "base_uri": "https://localhost:8080/",
2137 |      "height": 179
2138 |     },
2139 |     "id": "fffVi_HuweCQ",
2140 |     "outputId": "b26cee5f-0fa5-4203-cec9-6945841be352"
2141 |    },
2142 |    "outputs": [],
2143 |    "source": [
2144 |     "pd.Series(test_labels[test_labels != -1]).value_counts().iloc[:25]  # 25 largest clusters, noise label -1 excluded"
2145 |    ]
2146 |   },
2147 |   {
2148 |    "cell_type": "code",
2149 |    "execution_count": null,
2150 |    "metadata": {
2151 |     "id": "SKC6g7yHweG6"
2152 |    },
2153 |    "outputs": [],
2154 |    "source": []
2155 |   }
2156 |  ],
2157 |  "metadata": {
2158 |   "accelerator": "GPU",
2159 |   "colab": {
2160 |    "gpuType": "T4",
2161 |    "machine_shape": "hm",
2162 |    "provenance": []
2163 |   },
2164 |   "interpreter": {
2165 |    "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754"
2166 |   },
2167 |   "kernelspec": {
2168 |    "display_name": "Python 3",
2169 |    "name": "python3"
2170 |   },
2171 |   "language_info": {
2172 |    "codemirror_mode": {
2173 |     "name": "ipython",
2174 |     "version": 3
2175 |    },
2176 |    "file_extension": ".py",
2177 |    "mimetype": "text/x-python",
2178 |    "name": "python",
2179 |    "nbconvert_exporter": "python",
2180 |    "pygments_lexer": "ipython3",
2181 |    "version": "3.10.10"
2182 |   },
2183 |   "orig_nbformat": 4
2184 |  },
2185 |  "nbformat": 4,
2186 |  "nbformat_minor": 0
2187 | }
2188 | 


--------------------------------------------------------------------------------