├── experiments ├── 02_09_2023_16_54_32 │ ├── reducer_umap_15.pkl │ ├── reducer_umap_2.pkl │ ├── clusterer_hdbscan.pkl │ ├── assets │ │ ├── clusters_viz_1.png │ │ ├── exemplars_viz_1.png │ │ ├── exemplars_viz_2.png │ │ ├── cluster0_subcluster0.png │ │ ├── cluster0_subcluster1.png │ │ ├── cluster1_subcluster2.png │ │ ├── cluster1_subcluster3.png │ │ ├── cluster1_subcluster4.png │ │ └── cluster1_subcluster5.png │ ├── clusterer_subs_hdbscan.pkl │ ├── prompts_embeddings_all_mpnet_base_v2.pt │ └── prompts_dataframe_cached_with_results.xlsx ├── 03_09_2023_15_14_39 │ ├── reducer_umap_15.pkl │ ├── reducer_umap_2.pkl │ ├── clusterer_hdbscan.pkl │ ├── clusterer_subs_hdbscan.pkl │ ├── prompts_embeddings_all_mpnet_base_v2.pt │ └── prompts_dataframe_cached_with_results.xlsx └── 04_09_2023_03_02_25 │ └── assets │ ├── aspens_runway.jpeg │ ├── batman_midjourney.png │ ├── selected_5_themes.png │ ├── futuristic_car_midjourney.png │ ├── selected_25_cluster_themes.png │ └── traveler_wanderer_runway.jpeg ├── requirements.txt ├── .gitignore ├── README.md ├── LICENSE └── notebooks └── stable-diffusion-prompts-clustering.ipynb /experiments/02_09_2023_16_54_32/reducer_umap_15.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/reducer_umap_15.pkl -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/reducer_umap_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/reducer_umap_2.pkl -------------------------------------------------------------------------------- /experiments/03_09_2023_15_14_39/reducer_umap_15.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/reducer_umap_15.pkl -------------------------------------------------------------------------------- /experiments/03_09_2023_15_14_39/reducer_umap_2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/reducer_umap_2.pkl -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/clusterer_hdbscan.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/clusterer_hdbscan.pkl -------------------------------------------------------------------------------- /experiments/03_09_2023_15_14_39/clusterer_hdbscan.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/clusterer_hdbscan.pkl -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png -------------------------------------------------------------------------------- /experiments/04_09_2023_03_02_25/assets/aspens_runway.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/aspens_runway.jpeg 
-------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/clusterer_subs_hdbscan.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/clusterer_subs_hdbscan.pkl -------------------------------------------------------------------------------- /experiments/03_09_2023_15_14_39/clusterer_subs_hdbscan.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/clusterer_subs_hdbscan.pkl -------------------------------------------------------------------------------- /experiments/04_09_2023_03_02_25/assets/batman_midjourney.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/batman_midjourney.png -------------------------------------------------------------------------------- /experiments/04_09_2023_03_02_25/assets/selected_5_themes.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/selected_5_themes.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/cluster0_subcluster1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster0_subcluster1.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/cluster1_subcluster2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster2.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/cluster1_subcluster3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster3.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/cluster1_subcluster4.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster4.png -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/assets/cluster1_subcluster5.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/assets/cluster1_subcluster5.png -------------------------------------------------------------------------------- /experiments/04_09_2023_03_02_25/assets/futuristic_car_midjourney.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/futuristic_car_midjourney.png -------------------------------------------------------------------------------- /experiments/04_09_2023_03_02_25/assets/selected_25_cluster_themes.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/selected_25_cluster_themes.png -------------------------------------------------------------------------------- /experiments/04_09_2023_03_02_25/assets/traveler_wanderer_runway.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/04_09_2023_03_02_25/assets/traveler_wanderer_runway.jpeg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | datasets 3 | umap-learn 4 | hdbscan 5 | sentence-transformers 6 | numpy 7 | torch 8 | openai 9 | 
pandas 10 | openpyxl 11 | seaborn 12 | plotly 13 | UliPlot 14 | tiktoken 15 | cleantext -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/prompts_embeddings_all_mpnet_base_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/prompts_embeddings_all_mpnet_base_v2.pt -------------------------------------------------------------------------------- /experiments/03_09_2023_15_14_39/prompts_embeddings_all_mpnet_base_v2.pt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/prompts_embeddings_all_mpnet_base_v2.pt -------------------------------------------------------------------------------- /experiments/02_09_2023_16_54_32/prompts_dataframe_cached_with_results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/02_09_2023_16_54_32/prompts_dataframe_cached_with_results.xlsx -------------------------------------------------------------------------------- /experiments/03_09_2023_15_14_39/prompts_dataframe_cached_with_results.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/daniel-furman/awesome-chatgpt-prompts-clustering/HEAD/experiments/03_09_2023_15_14_39/prompts_dataframe_cached_with_results.xlsx -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # 
Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv* 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
160 | #.idea/ 161 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Text clustering: HDBSCAN is probably all you need 2 | 3 | [![License](https://img.shields.io/badge/License-Apache_2.0-green.svg)](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/blob/main/LICENSE) 4 | [![Python 3.9+](https://img.shields.io/badge/python-3.9+-blue.svg)](https://www.python.org/downloads/release/python-390/) 5 | [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) 6 | 7 | ## Goal 8 | 9 | Segment common items in a text dataset to pinpoint core themes and their distribution. 10 | 11 | * Clusters cover the main topics/subtopics in the dataset 12 | * Clusters backed by accurate, LLM-generated summaries 13 | 14 | ## Background 15 | 16 | We employ [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/index.html) for probabilistic clustering. This algorithm is advantageous in many ways, including: 17 | 18 | * Don’t be wrong: Clusters can have varying densities, don’t need to be globular, and won’t include noise 19 | * Intuitive parameters: Choosing a minimum cluster size is very reasonable, and the number of *k* clusters does not need to be specified (HDBSCAN finds the optimal *k* for you) 20 | * Stability: HDBSCAN is stable over runs and subsampling and has good stability over parameter choices 21 | * Performance: When implemented well, HDBSCAN can be very efficient; the current implementation has similar performance to fastcluster’s agglomerative clustering 22 | 23 | See the HDBSCAN docs on [comparing clustering algorithms](https://hdbscan.readthedocs.io/en/latest/comparing_clustering_algorithms.html#hdbscan) and [how hdbscan works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for more information. 
24 | 25 | ## Citations 26 | 27 | * Datasets 28 | * [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts) 29 | * [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) 30 | * Embedding models 31 | * [sentence-transformers/all-mpnet-base-v2](https://huggingface.co/sentence-transformers/all-mpnet-base-v2) 32 | 33 | ## Experiments 34 | 35 | ## 1. Visualizing core themes in [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts) 36 | 37 | These figures correspond to [`experiments/02_09_2023_16_54_32`](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/tree/main/experiments/02_09_2023_16_54_32) 38 | 39 | Open In Colab 40 | 41 | --- 42 | 43 | ![](experiments/02_09_2023_16_54_32/assets/clusters_viz_1.png) 44 | 45 | **Figure 1**. HDBSCAN splits the 153 text to text prompts from [fka/awesome-chatgpt-prompts](https://huggingface.co/datasets/fka/awesome-chatgpt-prompts) into two clusters: Cluster 1 with 44 prompts (orange) and Cluster 2 with 105 prompts (blue). The 4 remaining prompts (gray) were filtered out as outliers/noise. 46 | 47 | ![](experiments/02_09_2023_16_54_32/assets/exemplars_viz_1.png) 48 | 49 | **Figure 2**. The most persistent prompts in each leaf cluster are known as "exemplars". These represent the hearts around which the ultimate cluster formed. See the HDBSCAN docs on [soft clustering explanation](https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html#distance-based-membership) for supporting information and functions. 50 | 51 | ![](experiments/02_09_2023_16_54_32/assets/exemplars_viz_2.png) 52 | 53 | **Figure 3**. Additional clustering is conducted around the exemplars to identify sub-topics in the dataset. The cases in each sub-cluster subsequently serve as retrieved context for the LLM theme summarization calls below. 
54 | 55 | ![](experiments/02_09_2023_16_54_32/assets/cluster0_subcluster0.png) 56 | 57 | **Figure 4**. Visualizing the "*Computer Programming and Software Development*" theme, which covers 13% of the dataset. The summary was generated by [gpt-3.5-turbo-16k](https://platform.openai.com/docs/models/gpt-3-5). The above was created with [jsoncrack.com/editor](https://jsoncrack.com/editor). 58 | 59 | 60 |
61 | 62 | ## 2. Drift detection for [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) 63 | 64 | These figures correspond to [`experiments/04_09_2023_03_02_25`](https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering/tree/main/experiments/04_09_2023_03_02_25) 65 | 66 |
67 | Open In Colab 68 | 69 | 70 | --- 71 | 72 | HDBSCAN splits the 73,718 text-to-image prompts from [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) into 78 clusters, which together contain 25,019 prompts (34% of the dataset). The remaining 48,699 prompts (66%) were filtered out as outliers/noise. The 5 largest clusters cover 9.5% of the dataset - these are the segments we will examine for drift below. 73 | 74 | | cluster id | theme | 75 | |------------|--------------| 76 | | 56 | Portraits and artistic depictions of female anime characters, beautiful women, and fashionable young women | 77 | | 13 | Symmetrical portraits of people, characters, and sci-fi figures | 78 | | 61 | Futuristic sci-fi spaceship concept art | 79 | | 50 | Portraits of famous actresses as characters in various roles, outfits, and styles | 80 | | 74 | Surreal, cinematic, and futuristic digital art | 81 | 82 | | cluster id | train count
(73.7k rows) | test count
(8.19k rows) | drift detection
(% change) | 83 | |------------|-------------------------------|------------------------------|------------------| 84 | | 56 | 2530 (3.43%) | 310 (3.79%) | 10.50 | 85 | | 13 | 1343 (1.82%) | 149 (1.82%) | 0.00 | 86 | | 61 | 1287 (1.75%) | 131 (1.60%) | -8.57 | 87 | | 50 | 1055 (1.43%) | 135 (1.65%) | 15.38 | 88 | | 74 | 749 (1.02%) | 109 (1.33%) | 30.39 | 89 | 90 | 91 | **Tables 1 & 2**. Drift detection for the top 5 largest clusters (bottom), alongside their [claude-2](https://claude.ai/) summaries (top). 92 | 93 |
94 | 95 |

96 | 97 | **Prompt**: "*Beautiful painting of an Aspen forest at sunset, digital art, award winning illustration, golden hour, smooth, sharp lines, concept art, trending on artstation*"
98 | **Model**: [Runway Gen-2](https://app.runwayml.com/video-tools/teams/dryanfurman/ai-tools/text-to-image) (accessed by Daniel Furman on Sep 4, 2023)
99 | **Theme**: Beautiful landscape paintings and matte art (cluster id: 75)
100 | 101 |
102 | 103 |

104 | 105 | **Prompt**: "*Futuristic batman, brush strokes, oil painting, greg rutkowski*"
106 | **Model**: [Midjourney V5.2](https://www.midjourney.com/app/) (accessed by Daniel Furman on Sep 4, 2023)
107 | **Theme**: Art and portraits of Batman characters (cluster id: 41)
108 | 109 |

110 | 111 | **Prompt**: "*Futuristic Porsche designed by Apple, a detailed matte painting by Kitagawa Utamaro, cgsociety, octane render, highly detailed, matte painting, concept art, sci-fi*"
112 | **Model**: [Midjourney V5.2](https://www.midjourney.com/app/) (accessed by Daniel Furman on Sep 4, 2023)
113 | **Theme**: Futuristic and fantasy vehicle concept art (cluster id: 52)
114 | 115 | 116 | **Figure 5**. A sample of 3 text to image generations with various models for prompts from the [gustavosta/stable-diffusion-prompts](https://huggingface.co/datasets/Gustavosta/Stable-Diffusion-Prompts) dataset (alongside their cluster id). -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. 
Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. 
In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. 
We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /notebooks/stable-diffusion-prompts-clustering.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "LQ-HKp9u4kTR" 7 | }, 8 | "source": [ 9 | "# Text clustering: HDBSCAN is probably all you need\n", 10 | "\n", 11 | "\n", 12 | " \"Open\n", 13 | "" 14 | ] 15 | }, 16 | { 17 | "cell_type": "markdown", 18 | "metadata": { 19 | "id": "IpEhs9ujlcQ1" 20 | }, 21 | "source": [ 22 | "## Sections\n", 23 | "\n", 24 | "1. Setup\n", 25 | "2. Data I/O\n", 26 | "3. Embed text\n", 27 | "4. Clustering\n", 28 | "5. Exemplar sub-clustering\n", 29 | "6. Knowledge graph theming\n", 30 | "7. Write final df results to disk\n", 31 | "8. 
Create a JSON knowledge graph viz" 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "ZMZjJxWGeSZP" 38 | }, 39 | "source": [ 40 | "## Setup" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "colab": { 48 | "base_uri": "https://localhost:8080/" 49 | }, 50 | "id": "a6GR5Tfzx2z9", 51 | "outputId": "0ccecdb6-b57c-4c05-de12-196770056162" 52 | }, 53 | "outputs": [], 54 | "source": [ 55 | "# copying larger files to GDrive storage for this experiment\n", 56 | "\n", 57 | "from google.colab import drive\n", 58 | "\n", 59 | "drive.mount(\"/content/drive\")" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": null, 65 | "metadata": { 66 | "id": "-mA_PgxV4KV2" 67 | }, 68 | "outputs": [], 69 | "source": [ 70 | "!git clone https://github.com/daniel-furman/awesome-chatgpt-prompts-clustering.git" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": null, 76 | "metadata": { 77 | "id": "mO55HvFB3egw" 78 | }, 79 | "outputs": [], 80 | "source": [ 81 | "# for local run, see below commands for setting up a new venv\n", 82 | "\n", 83 | "#!python -m venv .venv_clust_demo\n", 84 | "#!source .venv_clust_demo/bin/activate\n", 85 | "#!pip install --upgrade pip\n", 86 | "#!pip list" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": null, 92 | "metadata": { 93 | "colab": { 94 | "base_uri": "https://localhost:8080/" 95 | }, 96 | "id": "96JEWSNtljzH", 97 | "outputId": "04f47fc2-588e-4557-ac04-16989c152ee7" 98 | }, 99 | "outputs": [], 100 | "source": [ 101 | "import os\n", 102 | "\n", 103 | "os.chdir(\"/content/awesome-chatgpt-prompts-clustering\")\n", 104 | "!ls" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": null, 110 | "metadata": { 111 | "id": "GeFaTjyW2Gk7" 112 | }, 113 | "outputs": [], 114 | "source": [ 115 | "!pip install -qUr requirements.txt" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | 
"execution_count": null, 121 | "metadata": { 122 | "colab": { 123 | "base_uri": "https://localhost:8080/" 124 | }, 125 | "id": "8nBmAAxvye3w", 126 | "outputId": "04ea9f9f-b092-40da-f5cb-39b468509561" 127 | }, 128 | "outputs": [], 129 | "source": [ 130 | "os.chdir(\"../..\")\n", 131 | "!ls" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": null, 137 | "metadata": { 138 | "id": "lPBnmvEd3egy" 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "#!pip list" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": null, 148 | "metadata": { 149 | "id": "NXc0G4wQ3egy" 150 | }, 151 | "outputs": [], 152 | "source": [ 153 | "import argparse\n", 154 | "import os\n", 155 | "from tqdm.notebook import tqdm\n", 156 | "import datetime\n", 157 | "import json\n", 158 | "import pickle\n", 159 | "import numpy as np\n", 160 | "from numpy import random\n", 161 | "import pandas as pd\n", 162 | "import seaborn as sns\n", 163 | "import matplotlib.pyplot as plt\n", 164 | "import plotly.graph_objects as go\n", 165 | "\n", 166 | "import umap\n", 167 | "from datasets import load_dataset\n", 168 | "from sentence_transformers import SentenceTransformer\n", 169 | "import torch\n", 170 | "import hdbscan\n", 171 | "from sklearn.metrics.pairwise import euclidean_distances\n", 172 | "import openai\n", 173 | "import tiktoken\n", 174 | "import cleantext\n", 175 | "\n", 176 | "from UliPlot.XLSX import auto_adjust_xlsx_column_width" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": null, 182 | "metadata": { 183 | "colab": { 184 | "base_uri": "https://localhost:8080/" 185 | }, 186 | "id": "TnRCOUWK9hkn", 187 | "outputId": "e347ce5a-844b-4dfb-99ff-7f25a071619e" 188 | }, 189 | "outputs": [], 190 | "source": [ 191 | "args = argparse.Namespace()\n", 192 | "args.inference = True\n", 193 | "args" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": null, 199 | "metadata": { 200 | "colab": { 201 | "base_uri": 
"https://localhost:8080/" 202 | }, 203 | "id": "m89BDk6WXSZM", 204 | "outputId": "4ea9d179-75aa-41cd-9a8c-e3022e72d0ff" 205 | }, 206 | "outputs": [], 207 | "source": [ 208 | "now = datetime.datetime.now()\n", 209 | "# dd/mm/YY H:M:S\n", 210 | "dt_string = now.strftime(\"%d_%m_%Y_%H_%M_%S\")\n", 211 | "\n", 212 | "# hardcode in an existing experiment datetime for inference runs\n", 213 | "\n", 214 | "if args.inference:\n", 215 | " # dt_string identifiers from cached experiments:\n", 216 | " dt_string = \"04_09_2023_03_02_25\"\n", 217 | "\n", 218 | "print(\"experiment's datetime identifier =\", dt_string)\n", 219 | "\n", 220 | "# create results folder if it doesn't exist\n", 221 | "if not os.path.isdir(\n", 222 | " f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n", 223 | "):\n", 224 | " os.mkdir(\n", 225 | " f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n", 226 | " )" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": null, 232 | "metadata": { 233 | "colab": { 234 | "base_uri": "https://localhost:8080/" 235 | }, 236 | "id": "3EzBV-Mm6C90", 237 | "outputId": "42ff75a2-11c5-474a-90bd-a0204885e6ba" 238 | }, 239 | "outputs": [], 240 | "source": [ 241 | "args.cache_folder = (\n", 242 | " f\"/content/drive/MyDrive/colab_files/text_clustering/experiments/{dt_string}\"\n", 243 | ")\n", 244 | "args" 245 | ] 246 | }, 247 | { 248 | "cell_type": "markdown", 249 | "metadata": { 250 | "id": "ZRwI0I6IeVNr" 251 | }, 252 | "source": [ 253 | "## Data I/O" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": null, 259 | "metadata": { 260 | "colab": { 261 | "base_uri": "https://localhost:8080/", 262 | "height": 424 263 | }, 264 | "id": "_BNPvSprLRzM", 265 | "outputId": "b9559cd9-8111-42f6-edab-272b616ae73d" 266 | }, 267 | "outputs": [], 268 | "source": [ 269 | "ds_hf = load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")\n", 270 | "ds = ds_hf[\"train\"]\n", 271 | "\n", 
# %% Data I/O (continued)
# Keep only an explicit integer id column plus the prompt text.
ds = ds.to_pandas()
ds["id"] = ds.index
ds = ds[["id", "Prompt"]]
ds

# %% [markdown]
# ## Embed Text
#
# * See [pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information

# %%
model = SentenceTransformer("all-mpnet-base-v2")

# %% Embed every prompt (skipped for inference runs, which reuse the cache)
if not args.inference:
    # Size the buffer from the model rather than hard-coding 768, so that
    # swapping the encoder cannot silently break the row assignment below.
    embeddings = torch.zeros([len(ds), model.get_sentence_embedding_dimension()])
    for i in tqdm(range(len(ds))):
        emb = model.encode(ds.loc[i, "Prompt"], convert_to_tensor=True)
        embeddings[i, :] = emb

# %% Cache the embeddings, then always read them back from the cache
f_name = os.path.join(
    args.cache_folder, "stable_diffusion_prompts_embeddings_all_mpnet_base_v2.pt"
)
print(f_name, "\n")

if not args.inference:
    torch.save(embeddings, f_name)
loaded_embeddings = torch.load(f_name)
loaded_embeddings

# %%
loaded_embeddings.shape

# %% Round-trip check: the cached tensor must equal the freshly computed one
if not args.inference:
    # Printed explicitly: an expression nested inside `if` is never displayed
    # by the notebook, so the original check produced no visible result.
    print(torch.equal(loaded_embeddings.cpu(), embeddings.cpu()))

# %% Pick one random row for a spot check
test_itr = random.randint(low=0, high=len(ds))
test_itr

# %%
# test embeddings worked
ds.loc[test_itr, "Prompt"]

# %% Re-encode the sampled prompt and compare it with the cached row
test_emb = model.encode(ds.loc[test_itr, "Prompt"], convert_to_tensor=True)
a = np.array(test_emb.cpu())
b = np.array(loaded_embeddings[test_itr, :].cpu())
np.allclose(a, b, rtol=1e-02)

# %% [markdown]
# ## Clustering
#
# * See [how hdbscan works](https://hdbscan.readthedocs.io/en/latest/how_hdbscan_works.html) for supporting information

# %%
# first, perform dimensionality reduction from the embedding width to 15
f_name = os.path.join(args.cache_folder, "reducer_umap_15.pkl")
print(f_name, "\n")

if not args.inference:
    reducer_15 = umap.UMAP(n_components=15)
    reducer_15.fit(loaded_embeddings)
    embeddings_umap_dim_15 = reducer_15.transform(loaded_embeddings)
    # Verify that the result of calling transform is
    # identical to accessing the embedding_ attribute
    assert np.all(embeddings_umap_dim_15 == reducer_15.embedding_)

    # cache fitted umap object (context manager so the handle is closed)
    with open(f_name, "wb") as fh:
        pickle.dump(reducer_15, fh)

# NOTE(review): pickle.load executes arbitrary code from the file; this is
# only acceptable because the file is this experiment's own cache.
with open(f_name, "rb") as fh:
    loaded_reducer_15 = pickle.load(fh)

embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)
# Verify that the result of calling transform is
# identical to accessing the embedding_ attribute
assert np.all(embeddings_umap_dim_15 == loaded_reducer_15.embedding_)

print(embeddings_umap_dim_15.shape)

# %% Force the HDBSCAN cell below to re-fit even during an inference run
args.inference = False
args

# %%
# second, perform clustering
f_name = os.path.join(args.cache_folder, "clusterer_hdbscan.pkl")
print(f_name, "\n")

if not args.inference:
    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=110, gen_min_span_tree=True, prediction_data=True
    )
    clusterer.fit(embeddings_umap_dim_15)
    with open(f_name, "wb") as fh:
        pickle.dump(clusterer, fh)

# NOTE(review): same pickle caveat as above - trusted local cache only.
with open(f_name, "rb") as fh:
    loaded_clusterer = pickle.load(fh)

if not args.inference:
    # Sanity check that the pickled clusterer round-trips unchanged.
    print(
        pd.Series(clusterer.labels_)
        .value_counts()
        .equals(pd.Series(loaded_clusterer.labels_).value_counts())
    )
    print(
        pd.Series(clusterer.probabilities_)
        .value_counts()
        .equals(pd.Series(loaded_clusterer.probabilities_).value_counts())
    )

cluster_counts = pd.Series(loaded_clusterer.labels_).value_counts()
# Label -1 is HDBSCAN's noise bucket; it is absent when every point was
# clustered, so fall back to 0 instead of raising KeyError.
# (Variable name kept as spelled in the notebook in case later cells use it.)
num_ouliers = cluster_counts.get(-1, 0)

print(cluster_counts)
print(f"\nCluster outliers : {num_ouliers}\n")

# %%
# sum of top 25 cluster counts
# Drop the noise label explicitly rather than assuming it is the largest
# group and therefore sits at position 0 of value_counts().
(
    pd.Series(loaded_clusterer.labels_)
    .value_counts()
    .drop(-1, errors="ignore")
    .head(25)
    .sum()
)

# %% Attach the cluster label and membership strength to every prompt
ds["cluster"] = loaded_clusterer.labels_
ds["cluster membership prob"] = loaded_clusterer.probabilities_
ds

# %%
loaded_clusterer.condensed_tree_.plot()

# %%
loaded_clusterer.condensed_tree_.plot(
    select_clusters=True, selection_palette=sns.color_palette()
)

# %% Switch back to cached artefacts for the remaining cells
args.inference = True

# %%
# third, perform dimensionality reduction from 15 to 2
f_name = os.path.join(args.cache_folder, "reducer_umap_2.pkl")
print(f_name, "\n")

if not args.inference:
    reducer_2 = umap.UMAP(n_components=2)
    reducer_2.fit(embeddings_umap_dim_15)
    embeddings_umap_dim_2 = reducer_2.transform(embeddings_umap_dim_15)

    # Verify that the result of calling transform is
    # identical to accessing the embedding_ attribute
    assert np.all(embeddings_umap_dim_2 == reducer_2.embedding_)

    # cache fitted umap object (context manager so the handle is closed)
    with open(f_name, "wb") as fh:
        pickle.dump(reducer_2, fh)

# NOTE(review): same pickle caveat as above - trusted local cache only.
with open(f_name, "rb") as fh:
    loaded_reducer_2 = pickle.load(fh)

embeddings_umap_dim_2 = loaded_reducer_2.transform(embeddings_umap_dim_15)
# Verify that the result of calling transform is
# identical to accessing the embedding_ attribute
assert np.all(embeddings_umap_dim_2 == loaded_reducer_2.embedding_)
embeddings_umap_dim_2.shape

# %% Store the 2-D coordinates used by every plot below
ds["x"] = embeddings_umap_dim_2[:, 0]
ds["y"] = embeddings_umap_dim_2[:, 1]

# %%
# Visualize clusters
fig, ax = plt.subplots(figsize=(20, 10))
outliers = ds[ds["cluster"] == -1]
clustered = ds[ds["cluster"] != -1]
plt.scatter(outliers.x, outliers.y, color="#BDBDBD", s=10, alpha=0.1)
plt.scatter(
    clustered.x, clustered.y, c=clustered.cluster, s=10, alpha=0.35, cmap="viridis"
)
# function adapted from:
# https://hdbscan.readthedocs.io/en/latest/soft_clustering_explanation.html#distance-based-membership


def exemplars(cluster_id, condensed_tree):
    """Return the exemplar point ids of one selected HDBSCAN cluster.

    For every leaf cluster found under ``cluster_id`` in the condensed tree,
    the children attached to that leaf at its maximum ``lambda_val`` are
    collected (the last points to remain in the leaf).

    Args:
        cluster_id: Node id of the cluster within the condensed tree.
        condensed_tree: Object exposing ``_raw_tree``, a structured array
            with the fields ``parent``, ``child``, ``lambda_val`` and
            ``child_size`` that are read below.

    Returns:
        1-D integer ``numpy.ndarray`` of the collected ``child`` ids, in leaf
        order; empty when no leaves are found.
    """
    raw_tree = condensed_tree._raw_tree
    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree["child_size"] > 1]
    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)
    result = np.array([])
    for leaf in leaves:
        in_leaf = raw_tree["parent"] == leaf
        max_lambda = raw_tree["lambda_val"][in_leaf].max()
        points = raw_tree["child"][in_leaf & (raw_tree["lambda_val"] == max_lambda)]
        result = np.hstack((result, points))
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24, where `astype(np.int)` raises AttributeError.
    return result.astype(int)
!= -1)],\n", 812 | " mode=\"markers\",\n", 813 | " marker_color=custom_scale[0],\n", 814 | " text=ds[\"cluster\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n", 815 | " )\n", 816 | ")\n", 817 | "\n", 818 | "fig.add_trace(\n", 819 | " go.Scatter(\n", 820 | " x=ds[\"x\"][ds[\"exemplars yes/no\"] == 1],\n", 821 | " y=ds[\"y\"][ds[\"exemplars yes/no\"] == 1],\n", 822 | " mode=\"markers\",\n", 823 | " marker_color=ds[\"cluster\"][ds[\"exemplars yes/no\"] == 1],\n", 824 | " marker_colorscale=\"Viridis\",\n", 825 | " text=ds[\"cluster\"][ds[\"exemplars yes/no\"] == 1],\n", 826 | " )\n", 827 | ")\n", 828 | "\n", 829 | "fig.update_traces(marker={\"size\": 5, \"opacity\": 0.45}, showlegend=False)\n", 830 | "fig.update_coloraxes(showscale=False)\n", 831 | "fig.update_layout(width=550 * 2, height=400 * 2)\n", 832 | "fig.show()" 833 | ] 834 | }, 835 | { 836 | "cell_type": "code", 837 | "execution_count": null, 838 | "metadata": { 839 | "colab": { 840 | "base_uri": "https://localhost:8080/" 841 | }, 842 | "id": "y5bO5SZ4hjAT", 843 | "outputId": "82860182-688b-400c-bb60-162964b8bf23" 844 | }, 845 | "outputs": [], 846 | "source": [ 847 | "len(ds.loc[exemplar_ids])" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": { 854 | "colab": { 855 | "base_uri": "https://localhost:8080/" 856 | }, 857 | "id": "OW44QJyYhibv", 858 | "outputId": "e24e08df-3557-406c-a056-4821b5661dd8" 859 | }, 860 | "outputs": [], 861 | "source": [ 862 | "embeddings_umap_dim_15[exemplar_ids].shape" 863 | ] 864 | }, 865 | { 866 | "cell_type": "code", 867 | "execution_count": null, 868 | "metadata": { 869 | "colab": { 870 | "base_uri": "https://localhost:8080/" 871 | }, 872 | "id": "J17BEw2vgvwA", 873 | "outputId": "0336d60f-c0d1-49aa-bd53-bc1da49f71f8" 874 | }, 875 | "outputs": [], 876 | "source": [ 877 | "# fourth, perform exemplar sub-clustering\n", 878 | "\n", 879 | "f_name = os.path.join(args.cache_folder, \"clusterer_subs_hdbscan.pkl\")\n", 
880 | "print(f_name, \"\\n\")\n", 881 | "\n", 882 | "if not args.inference:\n", 883 | " sub_clusterer = hdbscan.HDBSCAN(\n", 884 | " min_cluster_size=4, gen_min_span_tree=True, prediction_data=True\n", 885 | " )\n", 886 | " sub_clusterer.fit(embeddings_umap_dim_15[exemplar_ids])\n", 887 | " pickle.dump(sub_clusterer, open(f_name, \"wb\"))\n", 888 | "\n", 889 | "loaded_sub_clusterer = pickle.load((open(f_name, \"rb\")))\n", 890 | "\n", 891 | "if not args.inference:\n", 892 | " print(\n", 893 | " pd.DataFrame.equals(\n", 894 | " pd.Series(sub_clusterer.labels_).value_counts(),\n", 895 | " pd.Series(loaded_sub_clusterer.labels_).value_counts(),\n", 896 | " )\n", 897 | " )\n", 898 | " print(\n", 899 | " pd.DataFrame.equals(\n", 900 | " pd.Series(sub_clusterer.probabilities_).value_counts(),\n", 901 | " pd.Series(loaded_sub_clusterer.probabilities_).value_counts(),\n", 902 | " )\n", 903 | " )\n", 904 | "\n", 905 | "print(\"\\nCluster value counts:\\n\")\n", 906 | "pd.Series(loaded_sub_clusterer.labels_).value_counts()" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": null, 912 | "metadata": { 913 | "colab": { 914 | "base_uri": "https://localhost:8080/" 915 | }, 916 | "id": "O9033VfDirUu", 917 | "outputId": "97d98be7-f645-4c81-8df4-d20d6a04e1c7" 918 | }, 919 | "outputs": [], 920 | "source": [ 921 | "loaded_sub_clusterer.labels_" 922 | ] 923 | }, 924 | { 925 | "cell_type": "code", 926 | "execution_count": null, 927 | "metadata": { 928 | "colab": { 929 | "base_uri": "https://localhost:8080/" 930 | }, 931 | "id": "buRR8jvui1ul", 932 | "outputId": "3b916ffb-a440-4c07-b452-06f993292e6b" 933 | }, 934 | "outputs": [], 935 | "source": [ 936 | "ds[\"exemplar sub-cluster\"] = np.repeat(np.nan, len(ds))\n", 937 | "ds[\"cluster XX.YY\"] = np.repeat(np.nan, len(ds))\n", 938 | "# ds.loc[exemplar_ids] = loaded_sub_clusterer.labels_\n", 939 | "ds\n", 940 | "for i in range(len(ds.loc[exemplar_ids])):\n", 941 | " row = ds.loc[exemplar_ids].iloc[i]\n", 942 | " 
ds.loc[row.id, \"exemplar sub-cluster\"] = loaded_sub_clusterer.labels_[i]\n", 943 | "for i in range(len(ds.loc[exemplar_ids])):\n", 944 | " row = ds.loc[exemplar_ids].iloc[i]\n", 945 | " ds.loc[row.id, \"cluster XX.YY\"] = (\n", 946 | " \"Cluster \"\n", 947 | " + str(row.cluster)\n", 948 | " + \", Sub-Cluster \"\n", 949 | " + str(int(row[\"exemplar sub-cluster\"]))\n", 950 | " )\n", 951 | "\n", 952 | "# ds.loc[exemplar_ids]\n", 953 | "# ds" 954 | ] 955 | }, 956 | { 957 | "cell_type": "code", 958 | "execution_count": null, 959 | "metadata": { 960 | "colab": { 961 | "base_uri": "https://localhost:8080/" 962 | }, 963 | "id": "lJdOWEjSr0NR", 964 | "outputId": "baaa50b4-475e-4f50-a406-700b59c1d0f3" 965 | }, 966 | "outputs": [], 967 | "source": [ 968 | "ds_inner_exemplars = ds[ds[\"exemplars yes/no\"] == 1]\n", 969 | "ds_inner_exemplars = ds_inner_exemplars[\n", 970 | " ds_inner_exemplars[\"exemplar sub-cluster\"] != -1\n", 971 | "]\n", 972 | "len(ds_inner_exemplars)" 973 | ] 974 | }, 975 | { 976 | "cell_type": "code", 977 | "execution_count": null, 978 | "metadata": { 979 | "colab": { 980 | "base_uri": "https://localhost:8080/", 981 | "height": 673 982 | }, 983 | "id": "NiW84puFkJtJ", 984 | "outputId": "854e83a8-fbea-4eed-bde4-8868d070d77b" 985 | }, 986 | "outputs": [], 987 | "source": [ 988 | "print(\"\\n\")\n", 989 | "\n", 990 | "fig = go.Figure()\n", 991 | "\n", 992 | "fig.add_trace(\n", 993 | " go.Scatter(\n", 994 | " x=ds[\"x\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n", 995 | " y=ds[\"y\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n", 996 | " mode=\"markers\",\n", 997 | " marker_color=custom_scale[0],\n", 998 | " text=ds[\"cluster XX.YY\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n", 999 | " )\n", 1000 | ")\n", 1001 | "\n", 1002 | "fig.add_trace(\n", 1003 | " go.Scatter(\n", 1004 | " x=ds_inner_exemplars[\"x\"],\n", 1005 | " y=ds_inner_exemplars[\"y\"],\n", 1006 | " mode=\"markers\",\n", 1007 | " 
marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n", 1008 | " marker_colorscale=\"Viridis\",\n", 1009 | " text=ds_inner_exemplars[\"cluster XX.YY\"],\n", 1010 | " )\n", 1011 | ")\n", 1012 | "\n", 1013 | "fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n", 1014 | "fig.update_coloraxes(showscale=False)\n", 1015 | "fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n", 1016 | "fig.show()" 1017 | ] 1018 | }, 1019 | { 1020 | "cell_type": "code", 1021 | "execution_count": null, 1022 | "metadata": { 1023 | "colab": { 1024 | "base_uri": "https://localhost:8080/" 1025 | }, 1026 | "id": "1JJB33r5qCSq", 1027 | "outputId": "9a7e4eae-7f27-4d36-94c0-37aa8712c3d2" 1028 | }, 1029 | "outputs": [], 1030 | "source": [ 1031 | "for i in range(len(ds)):\n", 1032 | " ds.loc[i, \"Prompt head\"] = \" \".join(\n", 1033 | " cleantext.clean_words(\n", 1034 | " ds.loc[i, \"Prompt\"],\n", 1035 | " clean_all=False, # Execute all cleaning operations\n", 1036 | " extra_spaces=True, # Remove extra white spaces\n", 1037 | " stemming=False, # Stem the words\n", 1038 | " stopwords=False, # Remove stop words\n", 1039 | " lowercase=False, # Convert to lowercase\n", 1040 | " numbers=False, # Remove all digits\n", 1041 | " punct=False, # Remove all punctuations\n", 1042 | " stp_lang=\"english\", # Language for stop words\n", 1043 | " )[0:12]\n", 1044 | " )" 1045 | ] 1046 | }, 1047 | { 1048 | "cell_type": "code", 1049 | "execution_count": null, 1050 | "metadata": { 1051 | "colab": { 1052 | "base_uri": "https://localhost:8080/", 1053 | "height": 1000 1054 | }, 1055 | "id": "iEG1AZTurPkN", 1056 | "outputId": "a24923f0-225f-41fd-f9b8-14d09afce366" 1057 | }, 1058 | "outputs": [], 1059 | "source": [ 1060 | "ds" 1061 | ] 1062 | }, 1063 | { 1064 | "cell_type": "code", 1065 | "execution_count": null, 1066 | "metadata": { 1067 | "colab": { 1068 | "base_uri": "https://localhost:8080/", 1069 | "height": 1000 1070 | }, 1071 | "id": "UdQ3Rf3YpBSE", 1072 | "outputId": 
"fa65296d-ac17-466c-8903-4c6cec8a2e7c" 1073 | }, 1074 | "outputs": [], 1075 | "source": [ 1076 | "ds[\"cluster + Prompt\"] = (\n", 1077 | " \"Cluster: \"\n", 1078 | " + ds[\"cluster\"].astype(str)\n", 1079 | " + \", Prompt id \"\n", 1080 | " + ds[\"id\"].astype(str)\n", 1081 | " + \": \"\n", 1082 | " + '\"'\n", 1083 | " + ds[\"Prompt head\"]\n", 1084 | " + '\"'\n", 1085 | ")\n", 1086 | "ds" 1087 | ] 1088 | }, 1089 | { 1090 | "cell_type": "code", 1091 | "execution_count": null, 1092 | "metadata": { 1093 | "colab": { 1094 | "base_uri": "https://localhost:8080/", 1095 | "height": 1000 1096 | }, 1097 | "id": "w6l_0xa2luhO", 1098 | "outputId": "f24a901f-356c-45eb-ad69-b6b44d2fc6ee" 1099 | }, 1100 | "outputs": [], 1101 | "source": [ 1102 | "# visualize top 25 clusters by count\n", 1103 | "\n", 1104 | "clust_to_zoom_list = pd.Series(loaded_clusterer.labels_).value_counts().index[1:26]\n", 1105 | "\n", 1106 | "for clust_to_zoom in clust_to_zoom_list:\n", 1107 | " print(f\"Cluster {clust_to_zoom}:\")\n", 1108 | " ds_inner_exemplars = ds[\n", 1109 | " (ds[\"exemplars yes/no\"] == 1) & (ds[\"cluster\"] == clust_to_zoom)\n", 1110 | " ]\n", 1111 | " ds_inner_exemplars = ds_inner_exemplars[\n", 1112 | " ds_inner_exemplars[\"exemplar sub-cluster\"] != -1\n", 1113 | " ]\n", 1114 | "\n", 1115 | " fig = go.Figure()\n", 1116 | "\n", 1117 | " fig.add_trace(\n", 1118 | " go.Scatter(\n", 1119 | " x=ds[\"x\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] == clust_to_zoom)],\n", 1120 | " y=ds[\"y\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] == clust_to_zoom)],\n", 1121 | " mode=\"markers\",\n", 1122 | " marker_color=custom_scale[0],\n", 1123 | " text=ds[\"cluster + Prompt\"][\n", 1124 | " (ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] == clust_to_zoom)\n", 1125 | " ],\n", 1126 | " )\n", 1127 | " )\n", 1128 | "\n", 1129 | " fig.add_trace(\n", 1130 | " go.Scatter(\n", 1131 | " x=ds_inner_exemplars[\"x\"],\n", 1132 | " y=ds_inner_exemplars[\"y\"],\n", 1133 | " 
mode=\"markers\",\n", 1134 | " marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n", 1135 | " marker_colorscale=\"Viridis\",\n", 1136 | " text=ds_inner_exemplars[\"cluster + Prompt\"],\n", 1137 | " )\n", 1138 | " )\n", 1139 | "\n", 1140 | " fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n", 1141 | " fig.update_coloraxes(showscale=False)\n", 1142 | " fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n", 1143 | " fig.show()" 1144 | ] 1145 | }, 1146 | { 1147 | "cell_type": "code", 1148 | "execution_count": null, 1149 | "metadata": { 1150 | "id": "PuTTU71N5AkX" 1151 | }, 1152 | "outputs": [], 1153 | "source": [ 1154 | "ds_inner_exemplars = ds[ds[\"exemplars yes/no\"] == 1]\n", 1155 | "ds_inner_exemplars = ds_inner_exemplars[\n", 1156 | " ds_inner_exemplars[\"exemplar sub-cluster\"] != -1\n", 1157 | "]" 1158 | ] 1159 | }, 1160 | { 1161 | "cell_type": "code", 1162 | "execution_count": null, 1163 | "metadata": { 1164 | "colab": { 1165 | "base_uri": "https://localhost:8080/", 1166 | "height": 673 1167 | }, 1168 | "id": "rx58ZS3-40gI", 1169 | "outputId": "786c3a8c-b837-4fba-cee8-a42bb05ca84d" 1170 | }, 1171 | "outputs": [], 1172 | "source": [ 1173 | "print(\"\\n\")\n", 1174 | "\n", 1175 | "fig = go.Figure()\n", 1176 | "\n", 1177 | "fig.add_trace(\n", 1178 | " go.Scatter(\n", 1179 | " x=ds[\"x\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n", 1180 | " y=ds[\"y\"][(ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)],\n", 1181 | " mode=\"markers\",\n", 1182 | " marker_color=custom_scale[0],\n", 1183 | " text=ds[\"cluster + Prompt\"][\n", 1184 | " (ds[\"exemplars yes/no\"] == 0) & (ds[\"cluster\"] != -1)\n", 1185 | " ],\n", 1186 | " )\n", 1187 | ")\n", 1188 | "\n", 1189 | "fig.add_trace(\n", 1190 | " go.Scatter(\n", 1191 | " x=ds_inner_exemplars[\"x\"],\n", 1192 | " y=ds_inner_exemplars[\"y\"],\n", 1193 | " mode=\"markers\",\n", 1194 | " marker_color=ds_inner_exemplars[\"exemplar sub-cluster\"],\n", 1195 | " 
marker_colorscale=\"Viridis\",\n", 1196 | " text=ds_inner_exemplars[\"cluster + Prompt\"],\n", 1197 | " )\n", 1198 | ")\n", 1199 | "\n", 1200 | "fig.update_traces(marker={\"size\": 11, \"opacity\": 0.55}, showlegend=False)\n", 1201 | "fig.update_coloraxes(showscale=False)\n", 1202 | "fig.update_layout(width=550 * 1.5, height=400 * 1.5)\n", 1203 | "fig.show()" 1204 | ] 1205 | }, 1206 | { 1207 | "cell_type": "markdown", 1208 | "metadata": { 1209 | "id": "B9fADGA9gvwA" 1210 | }, 1211 | "source": [ 1212 | "## Create summary themes knowledge graph" 1213 | ] 1214 | }, 1215 | { 1216 | "cell_type": "code", 1217 | "execution_count": null, 1218 | "metadata": { 1219 | "id": "kXz16XO5dd6j" 1220 | }, 1221 | "outputs": [], 1222 | "source": [ 1223 | "claude_prompt = \"Please identify and summarize the core theme for each Sub-Cluster. Respond as succinctly as possible. Each summary cannot be longer than 1 sentence. Do not skip any of the Sub-Clusters. Do not list out the names of individuals in the prompts. 
Let's think step by step before responding.\"\n", 1224 | "\n", 1225 | "num_subclusts = 0\n", 1226 | "subclusts_in_order = []\n", 1227 | "\n", 1228 | "for clust in np.unique(np.array(ds[(ds[\"exemplars yes/no\"] == 1)][\"cluster\"])):\n", 1229 | " sub_df = ds[(ds[\"cluster\"] == clust) & (ds[\"exemplars yes/no\"] == 1)]\n", 1230 | "\n", 1231 | " for clust_to_zoom in clust_to_zoom_list:\n", 1232 | " if sub_df[\"cluster\"].iloc[0] == clust_to_zoom:\n", 1233 | " sub_clusts = list(np.unique(np.array(sub_df[\"cluster XX.YY\"])))\n", 1234 | " sub_clusts.sort()\n", 1235 | " for sub_clust in sub_clusts:\n", 1236 | " if int(sub_clust.split(\"Sub-Cluster \")[-1]) != -1:\n", 1237 | " # print(sub_clust)\n", 1238 | " num_subclusts += 1\n", 1239 | " subclusts_in_order.append(sub_clust)\n", 1240 | " sub_prompts = sub_df.loc[sub_df[\"cluster XX.YY\"] == sub_clust, \"Prompt\"].astype(str)\n", 1241 | " claude_prompt += \"\\n\" + str(sub_clust.split(\", \")[1]) + \": \"\n", 1242 | " # join the prompt texts themselves: str(sub_prompts) would embed the pandas display\n", 1243 | " # repr (index, truncated text, dtype footer); the old code also appended an unmatched double quote\n", 1244 | " claude_prompt += \"\\n\" + \"\\n\".join(sub_prompts) + \"\\n\"" 1245 | ] 1246 | }, 1247 | { 1248 | "cell_type": "code", 1249 | "execution_count": null, 1250 | "metadata": { 1251 | "id": "y49WrTOLq_Dy" 1252 | }, 1253 | "outputs": [], 1254 | "source": [ 1255 | "# print(claude_prompt)\n", 1256 | "# subclusts_in_order" 1257 | ] 1258 | }, 1259 | { 1260 | "cell_type": "code", 1261 | "execution_count": null, 1262 | "metadata": { 1263 | "colab": { 1264 | "base_uri": "https://localhost:8080/" 1265 | }, 1266 | "id": "c0zRcjt0rBJd", 1267 | "outputId": "94f1503d-78b9-44e3-cc71-07b18f614e6d" 1268 | }, 1269 | "outputs": [], 1270 | "source": [ 1271 | "num_subclusts" 1272 | ] 1273 | }, 1274 | { 1275 | "cell_type": "code", 1276 | "execution_count": null, 1277 | "metadata": { 1278 | "colab": { 1279 | "base_uri": "https://localhost:8080/" 1280 | }, 1281 | "id": "Qt-LdSAFlvxh", 1282 | "outputId": "71994546-fcc8-4484-a8ea-3674b1f5f594" 1283 | }, 1284 | "outputs": [], 1285 | "source": [ 1286 |
"claude_prompt.count(\"Sub-Cluster \")" 1287 | ] 1288 | }, 1289 | { 1290 | "cell_type": "code", 1291 | "execution_count": null, 1292 | "metadata": { 1293 | "id": "vqR42TGDnLvh" 1294 | }, 1295 | "outputs": [], 1296 | "source": [ 1297 | "ds_exemps = ds[(ds[\"exemplars yes/no\"] == 1) & (ds[\"exemplar sub-cluster\"] != -1)]\n", 1298 | "\n", 1299 | "mask = ds_exemps[\"cluster\"].isin(clust_to_zoom_list)\n", 1300 | "ds_exemps_of_interest = ds_exemps[mask]\n", 1301 | "# ds_exemps_of_interest" 1302 | ] 1303 | }, 1304 | { 1305 | "cell_type": "code", 1306 | "execution_count": null, 1307 | "metadata": { 1308 | "colab": { 1309 | "base_uri": "https://localhost:8080/" 1310 | }, 1311 | "id": "TJkN6z9clcSR", 1312 | "outputId": "106b7414-2281-4612-af62-b1d1c73e7422" 1313 | }, 1314 | "outputs": [], 1315 | "source": [ 1316 | "len(np.unique(np.array(ds_exemps_of_interest[\"cluster XX.YY\"])))" 1317 | ] 1318 | }, 1319 | { 1320 | "cell_type": "code", 1321 | "execution_count": null, 1322 | "metadata": { 1323 | "colab": { 1324 | "base_uri": "https://localhost:8080/" 1325 | }, 1326 | "id": "fEqyYu9tn8vD", 1327 | "outputId": "92679d2f-605d-4236-ae12-8226e341f9b4" 1328 | }, 1329 | "outputs": [], 1330 | "source": [ 1331 | "tokenizer = tiktoken.encoding_for_model(\"gpt-3.5-turbo-16k\")\n", 1332 | "len(tokenizer.encode(claude_prompt))" 1333 | ] 1334 | }, 1335 | { 1336 | "cell_type": "code", 1337 | "execution_count": null, 1338 | "metadata": { 1339 | "colab": { 1340 | "base_uri": "https://localhost:8080/", 1341 | "height": 214 1342 | }, 1343 | "id": "VTTbFvQds-yc", 1344 | "outputId": "798ae301-742c-488a-bd9b-b4e991256fb0" 1345 | }, 1346 | "outputs": [], 1347 | "source": [ 1348 | "claude_prompt" 1349 | ] 1350 | }, 1351 | { 1352 | "cell_type": "code", 1353 | "execution_count": null, 1354 | "metadata": { 1355 | "colab": { 1356 | "base_uri": "https://localhost:8080/" 1357 | }, 1358 | "id": "DfAbeTWZp2M5", 1359 | "outputId": "3f9e0d5d-939d-4d00-d170-4efb4071ac08" 1360 | }, 1361 | "outputs": [], 1362 
| "source": [ 1363 | "# saved response from claude-2 conversation\n", 1364 | "\n", 1365 | "text_generation = \"\"\"Sub-Cluster 19: Portraits of characters in lofi style by various artists.\n", 1366 | "\n", 1367 | "Sub-Cluster 17: Symmetry portraits of various people and characters.\n", 1368 | "\n", 1369 | "Sub-Cluster 18: Symmetry sci-fi portraits of characters and people.\n", 1370 | "\n", 1371 | "Sub-Cluster 128: Highly detailed illustrations of people, often describing hair and age.\n", 1372 | "\n", 1373 | "Sub-Cluster 162: Highly detailed illustrations of sadistic or aggressive looking people.\n", 1374 | "\n", 1375 | "Sub-Cluster 163: Highly detailed illustrations of attractive people, often with white hair.\n", 1376 | "\n", 1377 | "Sub-Cluster 75: Highly detailed illustrations of beautiful, fierce, or smug women.\n", 1378 | "\n", 1379 | "Sub-Cluster 76: Art of the League of Legends champion Vi.\n", 1380 | "\n", 1381 | "Sub-Cluster 66: Greg Manchess portrait paintings of various characters as different roles.\n", 1382 | "\n", 1383 | "Sub-Cluster 78: Art and portraits featuring Star Wars characters, especially Darth Vader.\n", 1384 | "\n", 1385 | "Sub-Cluster 91: Portraits and art of female cyborg characters.\n", 1386 | "\n", 1387 | "Sub-Cluster 93: Art and portraits of robots and humanoid AI characters.\n", 1388 | "\n", 1389 | "Sub-Cluster 107: Art of Vladimir Putin being killed or defeated.\n", 1390 | "\n", 1391 | "Sub-Cluster 168: Portraits of Putin and Biden as magical characters.\n", 1392 | "\n", 1393 | "Sub-Cluster 235: Art depicting Vladimir Putin as various monsters, animals, or in humiliating situations.\n", 1394 | "\n", 1395 | "Sub-Cluster 236: Art of Putin with Kim Jong Un's haircut.\n", 1396 | "\n", 1397 | "Sub-Cluster 164: Art of characters like aliens eating hamburgers.\n", 1398 | "\n", 1399 | "Sub-Cluster 191: Art of Final Fantasy 7 character Sephiroth.\n", 1400 | "\n", 1401 | "Sub-Cluster 192: Beautiful, award winning pencil drawings and 
illustrations.\n", 1402 | "\n", 1403 | "Sub-Cluster 240: Portraits of celebrities eating hamburgers.\n", 1404 | "\n", 1405 | "Sub-Cluster 241: Portraits of various real people and characters eating hamburgers.\n", 1406 | "\n", 1407 | "Sub-Cluster 85: Art and portraits of dragons in various settings.\n", 1408 | "\n", 1409 | "Sub-Cluster 92: Art depicting Donald Trump in various roles and situations.\n", 1410 | "\n", 1411 | "Sub-Cluster 89: Art and portraits of Batman characters.\n", 1412 | "\n", 1413 | "Sub-Cluster 90: Art of Spider-Man and related Marvel characters.\n", 1414 | "\n", 1415 | "Sub-Cluster 135: Award winning portrait commissions.\n", 1416 | "\n", 1417 | "Sub-Cluster 136: Award winning portrait commissions of furry characters.\n", 1418 | "\n", 1419 | "Sub-Cluster 143: Anthropomorphic furry fox characters.\n", 1420 | "\n", 1421 | "Sub-Cluster 184: Trending furry fox character art.\n", 1422 | "\n", 1423 | "Sub-Cluster 185: Beautiful portrait commissions of furry characters.\n", 1424 | "\n", 1425 | "Sub-Cluster 65: Art and portraits of fox characters in various outfits and settings.\n", 1426 | "\n", 1427 | "Sub-Cluster 121: Portraits and art of cats in various styles.\n", 1428 | "\n", 1429 | "Sub-Cluster 146: Portraits of goddesses and divine figures.\n", 1430 | "\n", 1431 | "Sub-Cluster 178: Portraits of Megan Fox as characters from video games.\n", 1432 | "\n", 1433 | "Sub-Cluster 186: Psychedelic and Lovecraftian portraits of Megan Fox.\n", 1434 | "\n", 1435 | "Sub-Cluster 187: Portraits of Megan Fox in various roles and outfits.\n", 1436 | "\n", 1437 | "Sub-Cluster 54: Portraits of Emma Watson in various roles and settings.\n", 1438 | "\n", 1439 | "Sub-Cluster 74: Alexandra Daddario and Megan Fox as Scarlet Witch.\n", 1440 | "\n", 1441 | "Sub-Cluster 84: Futuristic and fantasy vehicle concept art.\n", 1442 | "\n", 1443 | "Sub-Cluster 94: Highly detailed realistic portraits of men.\n", 1444 | "\n", 1445 | "Sub-Cluster 113: Anime girl character portraits 
and concept art.\n", 1446 | "\n", 1447 | "Sub-Cluster 117: Portraits of beautiful women in various settings.\n", 1448 | "\n", 1449 | "Sub-Cluster 118: Portraits of young women in various outfits and styles.\n", 1450 | "\n", 1451 | "Sub-Cluster 81: Cinematic concept art portraits by Jama Jurabaev.\n", 1452 | "\n", 1453 | "Sub-Cluster 82: Futuristic sci-fi spaceship concept art.\n", 1454 | "\n", 1455 | "Sub-Cluster 125: Concept art of knights and warriors.\n", 1456 | "\n", 1457 | "Sub-Cluster 132: Surreal, cinematic, and futuristic digital art.\n", 1458 | "\n", 1459 | "Sub-Cluster 167: Beautiful landscape paintings and matte art.\n", 1460 | "\n", 1461 | "Sub-Cluster 151: Futuristic cityscape concept art.\"\"\"\n", 1462 | "\n", 1463 | "text_generation = text_generation.split(\"\\n\\n\")\n", 1464 | "len(text_generation)" 1465 | ] 1466 | }, 1467 | { 1468 | "cell_type": "code", 1469 | "execution_count": null, 1470 | "metadata": { 1471 | "colab": { 1472 | "base_uri": "https://localhost:8080/" 1473 | }, 1474 | "id": "qHa4rgqcrQM0", 1475 | "outputId": "5d6d6cf1-431f-4c19-a3ee-cbe5e37b9e6d" 1476 | }, 1477 | "outputs": [], 1478 | "source": [ 1479 | "summaries_dict = {\n", 1480 | " subclusts_in_order[i]: text_generation[i] for i in range(len(subclusts_in_order))\n", 1481 | "}\n", 1482 | "summaries_dict" 1483 | ] 1484 | }, 1485 | { 1486 | "cell_type": "code", 1487 | "execution_count": null, 1488 | "metadata": { 1489 | "id": "kI0Qs_ITvNzG" 1490 | }, 1491 | "outputs": [], 1492 | "source": [ 1493 | "for i in range(len(list(summaries_dict.keys()))):\n", 1494 | " pass\n", 1495 | " key = list(summaries_dict.keys())[i]\n", 1496 | " summary = summaries_dict[key]\n", 1497 | " key_subclust = key.split(\", \")[-1]\n", 1498 | " summary_subclust = summary.split(\": \")[0]\n", 1499 | " assert key_subclust == summary_subclust" 1500 | ] 1501 | }, 1502 | { 1503 | "cell_type": "code", 1504 | "execution_count": null, 1505 | "metadata": { 1506 | "id": "ktj6ipyctjOs" 1507 | }, 1508 | "outputs": [], 
1509 | "source": [ 1510 | "# for i in range(len(text_generation)):\n", 1511 | "# text_generation[i] = text_generation[i].split(\": \")[-1]" 1512 | ] 1513 | }, 1514 | { 1515 | "cell_type": "code", 1516 | "execution_count": null, 1517 | "metadata": { 1518 | "colab": { 1519 | "base_uri": "https://localhost:8080/" 1520 | }, 1521 | "id": "CmQLnLiXtf7u", 1522 | "outputId": "dce2694a-49b6-4a1a-a956-15176a1d8798" 1523 | }, 1524 | "outputs": [], 1525 | "source": [ 1526 | "summaries_dict_cleaned = {\n", 1527 | " subclusts_in_order[i]: text_generation[i] for i in range(len(subclusts_in_order))\n", 1528 | "}\n", 1529 | "summaries_dict_cleaned" 1530 | ] 1531 | }, 1532 | { 1533 | "cell_type": "code", 1534 | "execution_count": null, 1535 | "metadata": { 1536 | "colab": { 1537 | "base_uri": "https://localhost:8080/" 1538 | }, 1539 | "id": "NQ8XhfRZuo8l", 1540 | "outputId": "365d59d2-f01b-442d-d01e-7cc5baa3e625" 1541 | }, 1542 | "outputs": [], 1543 | "source": [ 1544 | "# Look up each row's sub-cluster label in the summaries dict. Series.map\n", 1545 | "# leaves NaN for labels without a summary (what the old try/except KeyError\n", 1546 | "# loop did), does not depend on ds having a 0..n-1 index, and always creates\n", 1547 | "# the \"theme\" column even when no label matches.\n", 1548 | "ds[\"theme\"] = ds[\"cluster XX.YY\"].map(summaries_dict_cleaned)" 1549 | ] 1550 | }, 1551 | { 1552 | "cell_type": "code", 1553 | "execution_count": null, 1554 | "metadata": { 1555 | "colab": { 1556 | "base_uri": "https://localhost:8080/", 1557 | "height": 35 1558 | }, 1559 | "id": "RgKbnRtut3YG", 1560 | "outputId": "125d5a2f-164e-4b9c-c0bd-c0325bfc8199" 1561 | }, 1562 | "outputs": [], 1563 | "source": [ 1564 | "ds[\"theme\"][ds[\"cluster XX.YY\"] == \"Cluster 77, Sub-Cluster 151\"].iloc[0]" 1565 | ] 1566 | }, 1567 | { 1568 | "cell_type": "markdown", 1569 | "metadata": { 1570 | "id": "0CYEH057uyj8" 1571 | }, 1572 | "source": [ 1573 | "## Write final df results to disk" 1574 | ] 1575 | }, 1576 | { 1577 | "cell_type": "code", 1578 | "execution_count": null, 1579 | "metadata": { 1580 | "colab": { 1581 | "base_uri": "https://localhost:8080/", 1582 | "height": 1000 1583 | }, 1584 | "id": "Yjqesxe_oitS", 1585
| "outputId": "620d1909-6874-4e4a-a988-3a6f2df64c94" 1586 | }, 1587 | "outputs": [], 1588 | "source": [ 1589 | "# write final ds to disk\n", 1590 | "f_name = os.path.join(\n", 1591 | " args.cache_folder, \"stable_diffusion_prompts_dataframe_cached_with_results.xlsx\"\n", 1592 | ")\n", 1593 | "print(f_name, \"\\n\")\n", 1594 | "\n", 1595 | "# re-order cols\n", 1596 | "ds = ds[\n", 1597 | " [\n", 1598 | " \"id\",\n", 1599 | " \"cluster\",\n", 1600 | " \"x\",\n", 1601 | " \"y\",\n", 1602 | " \"cluster membership prob\",\n", 1603 | " \"exemplars yes/no\",\n", 1604 | " \"exemplar sub-cluster\",\n", 1605 | " \"cluster XX.YY\",\n", 1606 | " \"theme\",\n", 1607 | " \"Prompt\",\n", 1608 | " ]\n", 1609 | "]\n", 1610 | "ds" 1611 | ] 1612 | }, 1613 | { 1614 | "cell_type": "code", 1615 | "execution_count": null, 1616 | "metadata": { 1617 | "id": "CuoIn4KCvyPR" 1618 | }, 1619 | "outputs": [], 1620 | "source": [ 1621 | "# write with adjusted col width\n", 1622 | "# if not args.inference:\n", 1623 | "if True:\n", 1624 | " with pd.ExcelWriter(f_name) as writer:\n", 1625 | " ds.to_excel(writer, sheet_name=\"All Prompts\")\n", 1626 | " auto_adjust_xlsx_column_width(ds, writer, sheet_name=\"All Prompts\", margin=1)" 1627 | ] 1628 | }, 1629 | { 1630 | "cell_type": "markdown", 1631 | "metadata": { 1632 | "id": "CNrkolSLu1tg" 1633 | }, 1634 | "source": [ 1635 | "## Format a JSON viz graph" 1636 | ] 1637 | }, 1638 | { 1639 | "cell_type": "code", 1640 | "execution_count": null, 1641 | "metadata": { 1642 | "colab": { 1643 | "base_uri": "https://localhost:8080/", 1644 | "height": 35 1645 | }, 1646 | "id": "DGsJa7GrN2cf", 1647 | "outputId": "65f53e9b-0a91-47ad-c800-9ff6db6e5e2e" 1648 | }, 1649 | "outputs": [], 1650 | "source": [ 1651 | "args.cache_folder" 1652 | ] 1653 | }, 1654 | { 1655 | "cell_type": "code", 1656 | "execution_count": null, 1657 | "metadata": { 1658 | "colab": { 1659 | "base_uri": "https://localhost:8080/", 1660 | "height": 1000 1661 | }, 1662 | "id": "bUhn-leKp0Yv", 1663 | 
"outputId": "ca475888-d96d-4f5a-d23d-d80c7e693172" 1664 | }, 1665 | "outputs": [], 1666 | "source": [ 1667 | "# optional ds cached loading\n", 1668 | "ds_loaded = pd.read_excel(\n", 1669 | " os.path.join(\n", 1670 | " args.cache_folder, \"stable_diffusion_prompts_dataframe_cached_with_results.xlsx\"\n", 1671 | " ),\n", 1672 | " index_col=\"Unnamed: 0\",\n", 1673 | ")\n", 1674 | "ds_loaded" 1675 | ] 1676 | }, 1677 | { 1678 | "cell_type": "code", 1679 | "execution_count": null, 1680 | "metadata": { 1681 | "colab": { 1682 | "base_uri": "https://localhost:8080/", 1683 | "height": 1000 1684 | }, 1685 | "id": "829s5RJtxJ2j", 1686 | "outputId": "a37154d2-a6db-4ead-e0ba-a7600a18f100" 1687 | }, 1688 | "outputs": [], 1689 | "source": [ 1690 | "ds_clust = ds_loaded[ds_loaded[\"theme\"].notna()]\n", 1691 | "ds_clust" 1692 | ] 1693 | }, 1694 | { 1695 | "cell_type": "code", 1696 | "execution_count": null, 1697 | "metadata": { 1698 | "colab": { 1699 | "base_uri": "https://localhost:8080/" 1700 | }, 1701 | "id": "Q_H93u9QuC2u", 1702 | "outputId": "84bc7bd9-3eed-4f6f-cd74-a32291211c67" 1703 | }, 1704 | "outputs": [], 1705 | "source": [ 1706 | "len(np.unique(np.array(ds_clust[\"cluster XX.YY\"])))" 1707 | ] 1708 | }, 1709 | { 1710 | "cell_type": "code", 1711 | "execution_count": null, 1712 | "metadata": { 1713 | "id": "oc1HFPyBxnck" 1714 | }, 1715 | "outputs": [], 1716 | "source": [ 1717 | "knowledge_graphs = []\n", 1718 | "\n", 1719 | "for sub_clust in np.unique(np.array(ds_clust[\"cluster XX.YY\"])):\n", 1720 | " clust = sub_clust.split(\"Cluster \")[1].split(\",\")[0]\n", 1721 | "\n", 1722 | " prompts = []\n", 1723 | " ds_inner = ds_clust[ds_clust[\"cluster XX.YY\"] == sub_clust]\n", 1724 | " for i in range(len(ds_inner)):\n", 1725 | " row = ds_inner.iloc[i]\n", 1726 | " if row[\"exemplars yes/no\"] == 1:\n", 1727 | " prompts.append(\n", 1728 | " {\n", 1729 | " \"Prompt\": row.Prompt,\n", 1730 | " \"id\": float(row[\"id\"]),\n", 1731 | " }\n", 1732 | " )\n", 1733 | "\n", 1734 | " 
viz = {\n", 1735 | " \"core theme\": ds_inner.iloc[0][\"theme\"],\n", 1736 | " \"cluster id\": ds_inner.iloc[0][\"cluster XX.YY\"],\n", 1737 | " # \"frequency\": str(np.round(100 * len(ds_inner) / len(ds), 2)) + \"%\",\n", 1738 | " # \"count\": len(ds_inner),\n", 1739 | " # \"exemplars\": prompts,\n", 1740 | " }\n", 1741 | "\n", 1742 | " knowledge_graphs.append(viz)" 1743 | ] 1744 | }, 1745 | { 1746 | "cell_type": "code", 1747 | "execution_count": null, 1748 | "metadata": { 1749 | "colab": { 1750 | "base_uri": "https://localhost:8080/" 1751 | }, 1752 | "id": "iwpVNL_vOze_", 1753 | "outputId": "1b9c8202-67a1-4aad-b398-7cea61e36bb0" 1754 | }, 1755 | "outputs": [], 1756 | "source": [ 1757 | "for i in range(len(np.unique(np.array(ds_clust[\"cluster XX.YY\"])))):\n", 1758 | " sub_clust = np.unique(np.array(ds_clust[\"cluster XX.YY\"]))[i]\n", 1759 | "\n", 1760 | " # Serializing json\n", 1761 | " json_object = json.dumps(knowledge_graphs[i], indent=4)\n", 1762 | " print(json_object)\n", 1763 | " print(\"\\n\\n\")" 1764 | ] 1765 | }, 1766 | { 1767 | "cell_type": "code", 1768 | "execution_count": null, 1769 | "metadata": { 1770 | "id": "uqu2JMCcw3I4" 1771 | }, 1772 | "outputs": [], 1773 | "source": [ 1774 | "# summaries_dict_cleaned[\"Cluster 75, Sub-Cluster 167\"]" 1775 | ] 1776 | }, 1777 | { 1778 | "cell_type": "code", 1779 | "execution_count": null, 1780 | "metadata": { 1781 | "id": "sr7BqFUM5E96" 1782 | }, 1783 | "outputs": [], 1784 | "source": [ 1785 | "summaries_dict_cluster_level = {}\n", 1786 | "for clust in np.unique(np.array(ds_clust[\"cluster\"])):\n", 1787 | " summaries_dict_cluster_level[clust] = list(\n", 1788 | " np.unique(np.array(ds_clust[ds_clust[\"cluster\"] == clust][\"theme\"]))\n", 1789 | " )" 1790 | ] 1791 | }, 1792 | { 1793 | "cell_type": "code", 1794 | "execution_count": null, 1795 | "metadata": { 1796 | "colab": { 1797 | "base_uri": "https://localhost:8080/" 1798 | }, 1799 | "id": "_Oe7fGdKyK7q", 1800 | "outputId": 
"66f7173a-fc25-42d3-d380-9fc3ba50bfa0" 1801 | }, 1802 | "outputs": [], 1803 | "source": [ 1804 | "summaries_dict_cluster_level" 1805 | ] 1806 | }, 1807 | { 1808 | "cell_type": "code", 1809 | "execution_count": null, 1810 | "metadata": { 1811 | "colab": { 1812 | "base_uri": "https://localhost:8080/" 1813 | }, 1814 | "id": "Ju1VNC4w5yQD", 1815 | "outputId": "a5514fc5-dcd2-4c41-ddd5-3ceb8f2d9e0e" 1816 | }, 1817 | "outputs": [], 1818 | "source": [ 1819 | "summaries_dict_cluster_level[10]" 1820 | ] 1821 | }, 1822 | { 1823 | "cell_type": "code", 1824 | "execution_count": null, 1825 | "metadata": { 1826 | "id": "rX2glhje2sYI" 1827 | }, 1828 | "outputs": [], 1829 | "source": [ 1830 | "knowledge_graphs = []\n", 1831 | "\n", 1832 | "itr = 0\n", 1833 | "for clust in ds_loaded[\"cluster\"].value_counts().index:\n", 1834 | " if clust in list(np.unique(np.array(ds_clust[\"cluster\"]))):\n", 1835 | " ds_inner = ds_clust[ds_clust[\"cluster\"] == int(clust)]\n", 1836 | "\n", 1837 | " viz = {\n", 1838 | " \"cluster id\": \"Cluster \" + str(ds_inner.iloc[0][\"cluster\"]),\n", 1839 | " \"count\": float(ds_loaded[\"cluster\"].value_counts().loc[int(clust)]),\n", 1840 | " \"frequency\": str(\n", 1841 | " np.round(\n", 1842 | " 100\n", 1843 | " * float(ds_loaded[\"cluster\"].value_counts().loc[int(clust)])\n", 1844 | " / len(ds_loaded),\n", 1845 | " 2,\n", 1846 | " )\n", 1847 | " )\n", 1848 | " + \"%\",\n", 1849 | " \"core theme\": summaries_dict_cluster_level[clust],\n", 1850 | " }\n", 1851 | "\n", 1852 | " knowledge_graphs.append(viz)" 1853 | ] 1854 | }, 1855 | { 1856 | "cell_type": "code", 1857 | "execution_count": null, 1858 | "metadata": { 1859 | "colab": { 1860 | "base_uri": "https://localhost:8080/" 1861 | }, 1862 | "id": "_lgrhx7H4iC7", 1863 | "outputId": "19b187c7-a1b9-44d6-cee7-d63dce526198" 1864 | }, 1865 | "outputs": [], 1866 | "source": [ 1867 | "len(knowledge_graphs)" 1868 | ] 1869 | }, 1870 | { 1871 | "cell_type": "code", 1872 | "execution_count": null, 1873 | "metadata": 
{ 1874 | "id": "MKfP4tMj8PHL" 1875 | }, 1876 | "outputs": [], 1877 | "source": [ 1878 | "knowledge_graphs = {\"knowledge graph\": knowledge_graphs}" 1879 | ] 1880 | }, 1881 | { 1882 | "cell_type": "code", 1883 | "execution_count": null, 1884 | "metadata": { 1885 | "colab": { 1886 | "base_uri": "https://localhost:8080/" 1887 | }, 1888 | "id": "Gi-uJ6Ds2sYJ", 1889 | "outputId": "4c00ec28-6aa5-497e-ccc8-fb823fa947bf" 1890 | }, 1891 | "outputs": [], 1892 | "source": [ 1893 | "# Serializing json\n", 1894 | "json_object = json.dumps(knowledge_graphs, indent=4)\n", 1895 | "print(json_object)\n", 1896 | "print(\"\\n\\n\")" 1897 | ] 1898 | }, 1899 | { 1900 | "cell_type": "markdown", 1901 | "metadata": { 1902 | "id": "p3NXLZVOPpFk" 1903 | }, 1904 | "source": [ 1905 | "## Drift detection on the top 25 clusters\n" 1906 | ] 1907 | }, 1908 | { 1909 | "cell_type": "code", 1910 | "execution_count": null, 1911 | "metadata": { 1912 | "colab": { 1913 | "base_uri": "https://localhost:8080/", 1914 | "height": 424 1915 | }, 1916 | "id": "t_zMrCONOhpd", 1917 | "outputId": "bf2461d3-e5c4-477f-8655-053096f19a99" 1918 | }, 1919 | "outputs": [], 1920 | "source": [ 1921 | "ds_hf = load_dataset(\"Gustavosta/Stable-Diffusion-Prompts\")\n", 1922 | "ds = ds_hf[\"test\"]\n", 1923 | "\n", 1924 | "ds = ds.to_pandas()\n", 1925 | "ds[\"id\"] = ds.index\n", 1926 | "ds = ds[[\"id\", \"Prompt\"]]\n", 1927 | "ds" 1928 | ] 1929 | }, 1930 | { 1931 | "cell_type": "code", 1932 | "execution_count": null, 1933 | "metadata": { 1934 | "colab": { 1935 | "base_uri": "https://localhost:8080/" 1936 | }, 1937 | "id": "9KgCWCIlvsFn", 1938 | "outputId": "4069f096-8b20-4ba5-9fb9-2a2c064c8208" 1939 | }, 1940 | "outputs": [], 1941 | "source": [ 1942 | "args.inference = True\n", 1943 | "args" 1944 | ] 1945 | }, 1946 | { 1947 | "cell_type": "code", 1948 | "execution_count": null, 1949 | "metadata": { 1950 | "id": "8B2Ay5OfvftL" 1951 | }, 1952 | "outputs": [], 1953 | "source": [ 1954 | "# Embed Text\n", 1955 | "# * See 
[pretrained models](https://www.sbert.net/docs/pretrained_models.html) for supporting information\n", 1956 | "\n", 1957 | "model = SentenceTransformer(\"all-mpnet-base-v2\")\n", 1958 | "\n", 1959 | "if not args.inference:\n", 1960 | " embeddings = torch.zeros([len(ds), 768])\n", 1961 | " for i in tqdm(range(len(ds))):\n", 1962 | " emb = model.encode(ds.loc[i, \"Prompt\"], convert_to_tensor=True)\n", 1963 | " embeddings[i, :] = emb\n", 1964 | " embeddings" 1965 | ] 1966 | }, 1967 | { 1968 | "cell_type": "code", 1969 | "execution_count": null, 1970 | "metadata": { 1971 | "colab": { 1972 | "base_uri": "https://localhost:8080/" 1973 | }, 1974 | "id": "PshHjy3NvgBA", 1975 | "outputId": "7805b3cb-7d9b-4ee6-93ae-bf771f755f2a" 1976 | }, 1977 | "outputs": [], 1978 | "source": [ 1979 | "f_name = os.path.join(\n", 1980 | " args.cache_folder, \"stable_diffusion_prompts_test_embeddings_all_mpnet_base_v2.pt\"\n", 1981 | ")\n", 1982 | "print(f_name, \"\\n\")\n", 1983 | "\n", 1984 | "if not args.inference:\n", 1985 | " torch.save(embeddings, f_name)\n", 1986 | "loaded_embeddings = torch.load(f_name)\n", 1987 | "loaded_embeddings" 1988 | ] 1989 | }, 1990 | { 1991 | "cell_type": "code", 1992 | "execution_count": null, 1993 | "metadata": { 1994 | "colab": { 1995 | "base_uri": "https://localhost:8080/" 1996 | }, 1997 | "id": "HbsirlY0vgBB", 1998 | "outputId": "08e5f4fb-32a9-4e04-ca70-f2e4a407d5d1" 1999 | }, 2000 | "outputs": [], 2001 | "source": [ 2002 | "loaded_embeddings.shape" 2003 | ] 2004 | }, 2005 | { 2006 | "cell_type": "code", 2007 | "execution_count": null, 2008 | "metadata": { 2009 | "id": "hvU350V2vgBB" 2010 | }, 2011 | "outputs": [], 2012 | "source": [ 2013 | "if not args.inference:\n", 2014 | " assert torch.equal(loaded_embeddings.cpu(), embeddings.cpu()), \"cached embeddings differ from the freshly computed ones\"" 2015 | ] 2016 | }, 2017 | { 2018 | "cell_type": "code", 2019 | "execution_count": null, 2020 | "metadata": { 2021 | "colab": { 2022 | "base_uri": "https://localhost:8080/" 2023 | }, 2024 | "id": "leU5pK6mvgBB", 2025 |
"outputId": "7546ce0b-3f7e-421f-edcd-b0063b51da79" 2026 | }, 2027 | "outputs": [], 2028 | "source": [ 2029 | "test_itr = random.randint(low=0, high=len(ds))\n", 2030 | "test_itr" 2031 | ] 2032 | }, 2033 | { 2034 | "cell_type": "code", 2035 | "execution_count": null, 2036 | "metadata": { 2037 | "colab": { 2038 | "base_uri": "https://localhost:8080/", 2039 | "height": 71 2040 | }, 2041 | "id": "M_M-XVkpvgBB", 2042 | "outputId": "3dc80f2c-06c8-4def-c70e-63720ff52f45" 2043 | }, 2044 | "outputs": [], 2045 | "source": [ 2046 | "# test embeddings worked\n", 2047 | "ds.loc[test_itr, \"Prompt\"]" 2048 | ] 2049 | }, 2050 | { 2051 | "cell_type": "code", 2052 | "execution_count": null, 2053 | "metadata": { 2054 | "colab": { 2055 | "base_uri": "https://localhost:8080/" 2056 | }, 2057 | "id": "iBffCA3NvgBB", 2058 | "outputId": "36a9dd66-77d2-44fd-c07b-6b250f955258" 2059 | }, 2060 | "outputs": [], 2061 | "source": [ 2062 | "test_emb = model.encode(ds.loc[test_itr, \"Prompt\"], convert_to_tensor=True)\n", 2063 | "a = np.array(test_emb.cpu())\n", 2064 | "b = np.array(loaded_embeddings[test_itr, :].cpu())\n", 2065 | "np.allclose(a, b, rtol=1e-02)" 2066 | ] 2067 | }, 2068 | { 2069 | "cell_type": "code", 2070 | "execution_count": null, 2071 | "metadata": { 2072 | "colab": { 2073 | "base_uri": "https://localhost:8080/" 2074 | }, 2075 | "id": "xgUPGdAfv62x", 2076 | "outputId": "467a7ec8-ce53-4a8f-e3de-ec524a7d9a76" 2077 | }, 2078 | "outputs": [], 2079 | "source": [ 2080 | "# Clustering the test prompts, step 1: reduce the embeddings from 768 to 15\n", 2081 | "# dimensions with the cached reducer.\n", 2082 | "# The file handle is closed via `with`; unpickle only cache files you trust.\n", 2083 | "f_name = os.path.join(args.cache_folder, \"reducer_umap_15.pkl\")\n", 2084 | "print(f_name, \"\\n\")\n", 2085 | "with open(f_name, \"rb\") as f:\n", 2086 | " loaded_reducer_15 = pickle.load(f)\n", 2087 | "\n", 2088 | "embeddings_umap_dim_15 = loaded_reducer_15.transform(loaded_embeddings)\n", 2089 | "\n", 2090 | "embeddings_umap_dim_15.shape" 2091 | ] 2092 | }, 2093 | { 2094 | "cell_type": "code", 2095 |
"execution_count": null, 2096 | "metadata": { 2097 | "colab": { 2098 | "base_uri": "https://localhost:8080/", 2099 | "height": 130 2100 | }, 2101 | "id": "-PM40fYpxPRA", 2102 | "outputId": "b5058681-c52e-4fdf-c17a-75731ce94fae" 2103 | }, 2104 | "outputs": [], 2105 | "source": [ 2106 | "f_name = os.path.join(args.cache_folder, \"clusterer_hdbscan.pkl\")\n", 2107 | "print(f_name, \"\\n\")\n", 2108 | "with open(f_name, \"rb\") as f: # close the handle once the model is loaded\n", 2109 | " loaded_clusterer = pickle.load(f)\n", 2110 | "loaded_clusterer" 2111 | ] 2112 | }, 2113 | { 2114 | "cell_type": "code", 2115 | "execution_count": null, 2116 | "metadata": { 2117 | "colab": { 2118 | "base_uri": "https://localhost:8080/" 2119 | }, 2120 | "id": "8dPZ614OxJoe", 2121 | "outputId": "242b4da9-bdf5-4109-fb46-0719291bd9b6" 2122 | }, 2123 | "outputs": [], 2124 | "source": [ 2125 | "test_labels, strengths = hdbscan.approximate_predict(\n", 2126 | " loaded_clusterer, embeddings_umap_dim_15\n", 2127 | ")\n", 2128 | "test_labels" 2129 | ] 2130 | }, 2131 | { 2132 | "cell_type": "code", 2133 | "execution_count": null, 2134 | "metadata": { 2135 | "colab": { 2136 | "base_uri": "https://localhost:8080/", 2137 | "height": 179 2138 | }, 2139 | "id": "fffVi_HuweCQ", 2140 | "outputId": "b26cee5f-0fa5-4203-cec9-6945841be352" 2141 | }, 2142 | "outputs": [], 2143 | "source": [ 2144 | "pd.Series(test_labels).value_counts().drop(-1, errors=\"ignore\").head(25) # top 25 clusters by count, noise label -1 excluded" 2145 | ] 2146 | }, 2147 | { 2148 | "cell_type": "code", 2149 | "execution_count": null, 2150 | "metadata": { 2151 | "id": "SKC6g7yHweG6" 2152 | }, 2153 | "outputs": [], 2154 | "source": [] 2155 | } 2156 | ], 2157 | "metadata": { 2158 | "accelerator": "GPU", 2159 | "colab": { 2160 | "gpuType": "T4", 2161 | "machine_shape": "hm", 2162 | "provenance": [] 2163 | }, 2164 | "interpreter": { 2165 | "hash": "bd385fe162c5ca0c84973b7dd5c518456272446b2b64e67c2a69f949ca7a1754" 2166 | }, 2167 | "kernelspec": { 2168 | "display_name": "Python 3", 2169 | "name": "python3" 2170 | }, 2171 | "language_info": { 2172 | "codemirror_mode": {
2173 | "name": "ipython", 2174 | "version": 3 2175 | }, 2176 | "file_extension": ".py", 2177 | "mimetype": "text/x-python", 2178 | "name": "python", 2179 | "nbconvert_exporter": "python", 2180 | "pygments_lexer": "ipython3", 2181 | "version": "3.10.10" 2182 | }, 2183 | "orig_nbformat": 4 2184 | }, 2185 | "nbformat": 4, 2186 | "nbformat_minor": 0 2187 | } 2188 | --------------------------------------------------------------------------------