├── src
    ├── plot_utils.py
    └── text_clustering.py
├── examples
    └── README.md
├── README.md
├── run_pipeline.py
└── LICENSE


/src/plot_utils.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | import matplotlib.pyplot as plt
 4 | import pandas as pd
 5 | import seaborn as sns
 6 | from datasets import load_dataset
 7 | 
 8 | 
 9 | def get_size(ds):
10 |     return len([e for i in range(len(ds)) for e in ds[i]["examples"]])
11 | 
12 | 
def extract_score(example):
    """Split a cluster summary into its category and educational score.

    The summary is expected to look like ``"<category>. Educational score: <score>"``.

    Args:
        example: Mapping with a ``"summary"`` string.

    Returns:
        Dict with ``"category"`` (text before ``". Educational"``) and
        ``"educational_score"`` (text after the score marker, as a string).
        When the marker is absent the score is ``""`` instead of raising
        ``IndexError``; ``plot_distributions`` filters empty scores out.
    """
    summary = example["summary"]
    category = summary.split(". Educational")[0].strip()
    pieces = summary.split(" Educational score: ")
    # A summary without the marker splits into a single piece.
    score = pieces[1].strip() if len(pieces) > 1 else ""
    return {"category": category, "educational_score": score}
18 | 
19 | 
def plot_distributions(ds_path, image_path="."):
    """Plot the educational-score histogram and the number of samples per topic.

    Loads the ``train`` split of the dataset at ``ds_path``, derives the
    ``category`` and ``educational_score`` columns from each row's ``summary``
    and writes three artifacts into ``image_path``:

    * ``educational_score.png``: histogram of the numeric educational scores.
    * ``df_categories_count.csv``: number of examples per category.
    * the per-category bar plot (see the note on its file name below).

    Args:
        ds_path: Dataset identifier handed to ``datasets.load_dataset``; the
            ``HF_TOKEN`` environment variable is used as access token.
        image_path: Directory receiving the outputs; created when missing.
    """
    if image_path:
        os.makedirs(image_path, exist_ok=True)

    ds = load_dataset(ds_path, split="train", num_proc=2, token=os.getenv("HF_TOKEN"))
    ds = ds.map(extract_score)
    print(ds["category"])
    # Drop clusters for which no score could be extracted.
    ds = ds.filter(lambda x: x["educational_score"] not in ["None", ""])

    # distribution of scores
    df = ds.to_pandas()
    df["educational_score"] = pd.to_numeric(df["educational_score"], errors="coerce")
    # Non-numeric scores were coerced to NaN above; discard them.
    df.dropna(subset=["educational_score"], inplace=True)

    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.histplot(df["educational_score"], kde=False, bins=10)
    plt.title("Distribution of Educational Scores")
    plt.xlabel("Educational Score")
    plt.ylabel("Frequency")
    plt.savefig(f"{image_path}/educational_score.png", bbox_inches="tight")

    # distribution of samples: one row per example, then count rows per category
    df = ds.to_pandas().explode("examples")
    sorted_filtered_ds = df.groupby(by="category").size().sort_values(ascending=False)
    category_df = sorted_filtered_ds.reset_index()
    category_df.columns = ["category", "number_files"]
    print(f"Saving csv in {image_path}!")
    category_df.to_csv(f"{image_path}/df_categories_count.csv")

    sns.set_theme(style="whitegrid")
    plt.figure(figsize=(25, 20))

    sns.barplot(
        x="number_files", y="category", data=category_df, palette="Blues_d", ci=None
    )

    plt.xlabel("Number of Examples")
    plt.ylabel("Categories")
    plt.title("Histogram of Categories and their number of FW files")
    plt.tight_layout(pad=1.0)
    # Save before showing: with interactive backends the figure may be torn
    # down once the window opened by show() is closed, so saving afterwards can
    # write an empty canvas.
    # NOTE(review): the name has no extension, so matplotlib should fall back
    # to its default format and write "topics_distpng.png"; kept as-is because
    # existing links use that name.
    plt.savefig(f"{image_path}/topics_distpng", bbox_inches="tight", dpi=200)
    plt.show()
60 | 


--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
 1 | # Examples
 2 | 
 3 | ## Cosmopedia experiments: clustering of web samples
 4 | 
 5 | Here you can find the commands we used during the selection of web samples for [Cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia) prompts. 
 6 | 
 7 | Our goal was to find the topics in random web samples and their educational score. The topics were used in the creation of prompts for synthetic data generation and helped us understand the range of domains covered. Initially, we clustered **100,000 samples**, yielding **145 clusters**. Then we assigned **15 million samples** to these clusters using the inference mode of `text-clustering`; however, half of them did not fit into any cluster and were excluded from prompt creation.
 8 | 
 9 | For illustration, we will use [AutoMathText](https://huggingface.co/datasets/math-ai/AutoMathText) here. In Cosmopedia we used samples from a web dataset like [RefinedWeb](https://huggingface.co/datasets/tiiuae/falcon-refinedweb).
10 | 
11 | We will run the clustering with `--topic_mode single_topic`, which labels each cluster with a single topic and an educational score. This pipeline clusters files and prompts an LLM (by default [Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)) to find the topic of each cluster and give it an educational score. We then plot the distribution of samples over topics and the distribution of the educational scores, and save the plots in the `save_load_path` folder.
12 | 
13 | ```bash
14 | python run_pipeline.py --mode run \
15 |   --save_load_path './web_samples_100k' \
16 |   --input_dataset math-ai/AutoMathText \
17 |   --data_subset "web-0.70-to-1.00" \
18 |   --input_content text \
19 |   --n_samples 100000 \
20 |   --build_hf_ds \
21 |   --topic_mode single_topic \
22 |   --dbscan_eps 0.08 \
23 |   --dbscan_min_samples 50
24 | ```
25 | 
26 | 
27 | This detects 213 clusters that you can visualize in this [plot](https://huggingface.co/datasets/HuggingFaceTB/miscellaneous/blob/main/AMT_plots/topics_distpng.png) along with the [educational scores](https://huggingface.co/datasets/HuggingFaceTB/miscellaneous/blob/main/AMT_plots/educational_score.png), which are very high for this AutoMathText dataset.
28 | 
29 | When using general web datasets, you might want to filter out files with a lower quality by discarding clusters with a low educational score (e.g. Explicit Adult Content). You can check this [demo](https://huggingface.co/spaces/HuggingFaceTB/inspect_clusters_free_topics) for an example.
30 | 
31 | 
32 | <div align="center">
33 |     <img src="https://huggingface.co/datasets/HuggingFaceTB/miscellaneous/resolve/main/AMT_plots/topics_distpng.png" alt="clusters" width="1000" height="700">
34 |     <p>The clusters of AutoMathText</p>
35 | </div>
36 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Text Clustering
  2 | 
  3 | The Text Clustering repository contains tools to easily embed and cluster texts as well as label clusters semantically. This repository is a work in progress and serves as a minimal codebase that can be modified and adapted to other use cases.
  4 | 
  5 | <center><img src="https://cdn-uploads.huggingface.co/production/uploads/61c141342aac764ce1654e43/jMKGaE_UnEfH3j8iZYXVN.png"></center>
  6 | <center>Clustering of texts in the <a href="https://huggingface.co/datasets/HuggingFaceTB/cosmopedia">Cosmopedia dataset</a>.</center>
  7 | 
  8 | 
  9 | ## How it works
 10 | The pipeline consists of several distinct blocks that can be customized and the whole pipeline can run in a few minutes on a consumer laptop. Each block uses existing standard methods and works quite robustly.
 11 | 
 12 | <center><img src="https://huggingface.co/datasets/lvwerra/admin/resolve/main/text-clustering.png"></center>
 13 | <center>Text clustering pipeline.</center>
 14 | 
 15 | 
 16 | ## Install 
 17 | Install the following libraries to get started:
 18 | ```bash
 19 | pip install scikit-learn umap-learn sentence_transformers faiss-cpu plotly matplotlib seaborn pandas datasets
 20 | ```
 21 | Clone this repository and navigate to the folder:
 22 | ```bash
 23 | git clone https://github.com/huggingface/text-clustering.git
 24 | cd text-clustering
 25 | ```
 26 | 
 27 | ## Usage
 28 | 
 29 | Run pipeline and visualize results:
 30 | 
 31 | ```python
 32 | from src.text_clustering import ClusterClassifier
 33 | from datasets import load_dataset
 34 | 
 35 | SAMPLE = 100_000
 36 | 
 37 | texts = load_dataset("HuggingFaceTB/cosmopedia-100k", split="train").select(range(SAMPLE))["text"]
 38 | 
 39 | cc = ClusterClassifier(embed_device="mps")
 40 | 
 41 | # run the pipeline:
 42 | embs, labels, summaries = cc.fit(texts)
 43 | 
 44 | # show the results
 45 | cc.show()
 46 | 
 47 | # save 
 48 | cc.save("./cc_100k")
 49 | ```
 50 | 
 51 | Load classifier and run inference:
 52 | ```python
 53 | from src.text_clustering import ClusterClassifier
 54 | 
 55 | cc = ClusterClassifier(embed_device="mps")
 56 | 
 57 | # load state
 58 | cc.load("./cc_100k")
 59 | 
 60 | # visualize
 61 | cc.show()
 62 | 
 63 | # classify new texts with k-nearest neighbour search
 64 | cluster_labels, embeddings = cc.infer(some_texts, top_k=1)
 65 | ```
 66 | 
 67 | If you want to reproduce the color scheme in the plot above you can add the following code before you run `cc.show()`:
 68 | ```python
 69 | from cycler import cycler
 70 | import matplotlib.pyplot as plt
 71 | 
 72 | default_cycler = (cycler(color=[
 73 |     "0F0A0A",
 74 |     "FF6600",
 75 |     "FFBE00",
 76 |     "496767",
 77 |     "87A19E",
 78 |     "FF9200",
 79 |     "0F3538",
 80 |     "F8E08E",
 81 |     "0F2021",
 82 |     "FAFAF0"])
 83 |     )
 84 | plt.rc('axes', prop_cycle=default_cycler)
 85 | ```
 86 | If you would like to customize the plotting further the easiest way is to customize or overwrite the `_show_mpl` and `_show_plotly` methods.
 87 | 
 88 | You can also run the pipeline using a script with:
 89 | ```bash
 90 | # run a new pipeline
 91 | python run_pipeline.py --mode run  --save_load_path './cc_100k' --n_samples 100000 --build_hf_ds
 92 | # load existing pipeline
 93 | python run_pipeline.py --mode load --save_load_path './cc_100k' --build_hf_ds
 94 | # inference mode on new texts from an input dataset
 95 | python run_pipeline.py --mode infer --save_load_path './cc_100k'  --n_samples <NB_INFERENCE_SAMPLES> --input_dataset <HF_DATA_FOR_INFERENCE>
 96 | ```
 97 | The `build_hf_ds` flag builds and pushes HF datasets, for the files and clusters, that can be directly used in the FW visualization space. In `infer` mode, we push the clusters dataset by default.
 98 | 
 99 | You can also change how the clusters are labeled (multiple topics (default) vs single topic with an educational score) using the flag `--topic_mode`.
100 | 
101 | ## Examples
102 | 
103 | Check the `examples` folder for an example of clustering and topic labeling applied to the [AutoMathText](https://huggingface.co/datasets/math-ai/AutoMathText/) dataset, utilizing [Cosmopedia](https://huggingface.co/datasets/HuggingFaceTB/cosmopedia)'s web labeling approach.


--------------------------------------------------------------------------------
/run_pipeline.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | import textwrap
  3 | import time
  4 | 
  5 | import numpy as np
  6 | import pandas as pd
  7 | from datasets import Dataset, load_dataset
  8 | 
  9 | from src.plot_utils import plot_distributions
 10 | from src.text_clustering import ClusterClassifier
 11 | 
 12 | INSTRUCTION_SINGLE_TOPIC = "The examples below are web samples from the same cluster, identify the topic they have in common, for example: Philosophy, Lifesyle, Linear Algebra, Biochemistry, Economics...\
 13 | Additionally determine if the topics in the examples \
 14 | are broadly suitable as college/school material, while being mindful to exclude any sensitive/inappropriate/irrelevant content, \
 15 | including but not limited to sex, explicit violence, ads & scams, and other non-academic subjects. Consider a wide range of content including scientific, \
 16 | educational, historical, cultural, and practical applications and give a rating of how educational these topics could be from 1 to 10, 1 being extremely un-educational \
 17 | and inapproriate for an education setting and 10 being highly educational. The output format should be like this: Topic: the_topic, Educational value rating: score."
 18 | INSTRUCTION_MULTIPLE_TOPICS = "Use three words total (comma separated)\
 19 | to describe general topics in above texts. Under no circumstances use enumeration. \
 20 | Example format: Tree, Cat, Fireman"
 21 | 
 22 | 
 23 | TEMPLATE_MULTIPLE_TOPICS = "<s>[INST]{examples}\n\n{instruction}[/INST]"
 24 | TEMPLATE_SINGLE_TOPIC = "<s>[INST]{instruction}\n\nExamples:\n{examples}\nRemember that the output format should be like this: Topic: the_topic, Educational value rating: score.[/INST]"
 25 | 
 26 | 
 27 | def get_args():
 28 |     parser = argparse.ArgumentParser()
 29 |     parser.add_argument("--n_samples", type=int, default=100_000)
 30 |     parser.add_argument("--start", type=int, default=-1)
 31 |     parser.add_argument("--end", type=int, default=100_000)
 32 |     parser.add_argument("--device", type=str, default="cuda")
 33 |     parser.add_argument("--save_load_path", type=str, default="./cc_100k")
 34 |     parser.add_argument(
 35 |         "--input_dataset",
 36 |         type=str,
 37 |         default="HuggingFaceFW/FW-12-12-2023-CC-2023-06",
 38 |         help="dataset with the samples to use for clustering",
 39 |     )
 40 |     parser.add_argument(
 41 |         "--data_subset",
 42 |         type=str,
 43 |         default=None,
 44 |         help="dataset subset",
 45 |     )
 46 |     parser.add_argument("--input_content", type=str, default="content")
 47 |     parser.add_argument(
 48 |         "--topic_mode",
 49 |         type=str,
 50 |         choices=["single_topic", "multiple_topics"],
 51 |         default="multiple_topics",
 52 |         help="Specify 'single_topic' to generate only one topic and score its educational value, or 'multiple_topics' to generate the 3 most relevant topics in the cluster.",
 53 |     )
 54 |     parser.add_argument(
 55 |         "--dbscan_eps",
 56 |         type=float,
 57 |         default=0.08,
 58 |         help="The maximum distance between two samples for them to be considered as in the neighborhood of each other.",
 59 |     )
 60 |     parser.add_argument(
 61 |         "--dbscan_min_samples",
 62 |         type=int,
 63 |         default=50,
 64 |         help="The number of samples in a neighborhood for a point to be considered as a core point.",
 65 |     )
 66 |     parser.add_argument(
 67 |         "--mode",
 68 |         choices=["run", "load", "infer"],
 69 |         default="run",
 70 |         help="Run the pipeline from scratch/load existing model to build hf datasets or to infer on new texts",
 71 |     )
 72 |     parser.add_argument(
 73 |         "--inference_repo_name",
 74 |         type=str,
 75 |         default="infer_fw_on_ultrachat",
 76 |         help="HF repo name for the clusters dataset in inference mode",
 77 |     )
 78 |     parser.add_argument(
 79 |         "--build_hf_ds",
 80 |         action="store_true",
 81 |         help="Builds HF datasets used for space visualization and pushes them to the hub",
 82 |     )
 83 |     parser.add_argument("--username", type=str, default="loubnabnl")
 84 |     return parser.parse_args()
 85 | 
 86 | 
 87 | def extract_res(example):
 88 |     summary = example["summary"]
 89 |     category = summary.split(". Educational")[0].strip()
 90 |     score = summary.split(" Educational score: ")[1].strip()
 91 |     return {"category": category, "educational_score": score}
 92 | 
 93 | 
 94 | def build_hf_data_clusters(cc, texts=None, labels=None):
 95 |     """
 96 |     Build an HF dataset containing information on each cluster.
 97 | 
 98 |     Args:
 99 |         cc: ClusterClassifier object.
100 |         texts: list of texts used for inference mode.
101 |         labels: list of cluster labels corresponding to the texts for inference mode.
102 | 
103 |     If `texts` and `labels` are not provided, the function will use the data available in `cc`
104 |     to construct the dataset. Otherwise it will run in inference mode on texts.
105 |     """
106 |     cluster_data = []
107 |     for cluster_id in cc.label2docs.keys():
108 |         if cluster_id == -1:
109 |             continue
110 | 
111 |         # inference mode
112 |         if texts is not None and labels is not None:
113 |             labels_array = np.array(labels)
114 |             files_in_cluster = np.where(labels_array == cluster_id)[0]
115 |             examples = [texts[doc_id] for doc_id in files_in_cluster]
116 |         else:
117 |             doc_ids = cc.label2docs[cluster_id]
118 |             examples = [cc.texts[doc_id] for doc_id in doc_ids]
119 | 
120 |         cluster_info = {
121 |             "cluster_id": cluster_id,
122 |             "summary": cc.cluster_summaries[cluster_id],
123 |             "examples": examples,
124 |         }
125 | 
126 |         if not texts:
127 |             cluster_info["position"] = cc.cluster_centers[cluster_id]
128 | 
129 |         cluster_data.append(cluster_info)
130 | 
131 |     return Dataset.from_pandas(pd.DataFrame(cluster_data))
132 | 
133 | 
def build_hf_data_files(cc):
    """
    Build an HF dataset with one row per file: the first two projection
    coordinates (`X`, `Y`), the cluster label and a display version of the text.
    """
    columns = {}
    columns["X"] = cc.projections[:, 0]
    columns["Y"] = cc.projections[:, 1]
    columns["labels"] = cc.cluster_labels

    # Keep the first 1024 characters of each text, wrapped at 64 columns.
    wrapped = []
    for raw_text in cc.texts:
        wrapped.append(textwrap.fill(raw_text[:1024], 64))
    columns["content_display"] = wrapped

    files_frame = pd.DataFrame(data=columns)
    return Dataset.from_pandas(files_frame)
148 | 
149 | 
def build_and_push(cc, args):
    """Build the HF clusters & files datasets and push them to the hub.

    Args:
        cc: ClusterClassifier object holding the clustering results.
        args: Parsed CLI namespace; reads `save_load_path` and `username`.

    The clusters dataset (one row per cluster, with `category` and
    `educational_score` added by `extract_res`) is pushed to
    `<username>/<repo_name>` and the files dataset to
    `<username>/<repo_name>_clusters`, where `repo_name` is the last path
    component of `save_load_path`. Both repositories are private.
    """
    print("Building HF datasets...")
    clusters_ds = build_hf_data_clusters(cc).map(extract_res)
    files_ds = build_hf_data_files(cc)
    # Labels match the contents: the first dataset describes clusters, the second files.
    print(f"Clusters dataset {clusters_ds}\nFiles dataset {files_ds}")

    # Strip a trailing slash so "./cc_100k/" does not yield an empty repo name.
    repo_name = args.save_load_path.rstrip("/").split("/")[-1]
    print(f"Pushing to the hub at {repo_name}...")
    # Repository names are kept unchanged for existing consumers.
    clusters_ds.push_to_hub(f"{args.username}/{repo_name}", private=True)
    files_ds.push_to_hub(f"{args.username}/{repo_name}_clusters", private=True)
162 | 
163 | 
def _load_train_split(args):
    """Load the `train` split of the input dataset, honouring an optional subset."""
    dataset_args = (
        (args.input_dataset, args.data_subset)
        if args.data_subset
        else (args.input_dataset,)
    )
    return load_dataset(*dataset_args, split="train", token=True)


def _sample_indexes(args, ds_len):
    """Return the row range to process, clamped to the dataset length."""
    if args.start >= 0:
        return range(min(args.start, ds_len), min(args.end, ds_len))
    return range(min(args.n_samples, ds_len))


def _plot_if_single_topic(args):
    """Plot score/topic distributions from the pushed clusters dataset in `single_topic` mode."""
    if args.topic_mode != "single_topic":
        return
    # Same repo naming as the push step: last component of save_load_path.
    repo_name = args.save_load_path.rstrip("/").split("/")[-1]
    plot_distributions(f"{args.username}/{repo_name}", image_path=args.save_load_path)
    print("📊 Saved plots for educational score and files distribution.")


def main():
    """Entry point: run, load or apply (infer) a clustering pipeline.

    * `run`: cluster texts from the input dataset, save the classifier and,
      with `--build_hf_ds`, push the datasets and plot the distributions.
    * `infer`: load a saved classifier, label new texts and push the
      resulting clusters dataset.
    * `load`: load a saved classifier and, with `--build_hf_ds`, push the
      datasets and plot the distributions.
    """
    args = get_args()

    # `--topic_mode` only accepts "single_topic" or "multiple_topics".
    multiple_topics = args.topic_mode == "multiple_topics"
    template = TEMPLATE_MULTIPLE_TOPICS if multiple_topics else TEMPLATE_SINGLE_TOPIC
    instruction = (
        INSTRUCTION_MULTIPLE_TOPICS if multiple_topics else INSTRUCTION_SINGLE_TOPIC
    )
    print(f"Using {args.topic_mode} for topic labeling")
    cc = ClusterClassifier(
        embed_device=args.device,
        topic_mode=args.topic_mode,
        summary_template=template,
        summary_instruction=instruction,
        dbscan_eps=args.dbscan_eps,
        dbscan_min_samples=args.dbscan_min_samples,
    )

    # A non-negative --start selects [start, end); the default of -1 means
    # "use the first n_samples rows". Both modes share this rule, so an
    # explicit `--start 0` is honoured in run mode as well.
    text_start = f" starting from {args.start}" if args.start >= 0 else ""

    if args.mode == "run":
        # Run a new pipeline on texts
        ds = _load_train_split(args).shuffle(seed=42)

        print(ds)
        indexes = _sample_indexes(args, len(ds))
        print(f"Processing {len(indexes)} samples{text_start}")

        texts = ds.select(indexes)[args.input_content]

        _, _, summaries = cc.fit(texts)
        print(f"10 example Summaries:\n{list(summaries.values())[:10]}")

        cc.save(args.save_load_path)
        print(f"Saved clusters in {args.save_load_path}.")

        if args.build_hf_ds:
            build_and_push(cc, args)
            # The plots read the dataset pushed just above.
            _plot_if_single_topic(args)
        elif args.topic_mode == "single_topic":
            # Nothing was pushed, so there is no hub dataset to plot from.
            print("Skipping plots: they require the datasets pushed with --build_hf_ds.")

    elif args.mode == "infer":
        # Run inference mode on texts using an existing pipeline
        cc.load(args.save_load_path)
        ds = _load_train_split(args)
        indexes = _sample_indexes(args, len(ds))
        print(
            f"Running inference on {len(indexes)} samples{text_start} of {args.input_dataset} using clusters in {args.save_load_path}."
        )
        texts = ds.select(indexes)[args.input_content]

        start_time = time.time()
        cluster_labels, _ = cc.infer(texts, top_k=1)

        ds = build_hf_data_clusters(cc, texts, cluster_labels)
        print(f"Total time is {(time.time() - start_time)/60}min")
        target_repo = f"{args.username}/{args.inference_repo_name}"
        print(f"Samples with clusters: {ds}")
        print(f"Pushing to hub at {target_repo}...")
        ds.push_to_hub(f"{target_repo}", private=True)

    else:
        # Load existing pipeline
        if args.build_hf_ds:
            cc.load(args.save_load_path)
            build_and_push(cc, args)
            _plot_if_single_topic(args)
        else:
            print("Using mode=load but build_hf_ds is False, nothing to be done.")

    print("Done 🎉")


if __name__ == "__main__":
    main()
258 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2023 Hugging Face
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/src/text_clustering.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import logging
  3 | import os
  4 | import random
  5 | import textwrap
  6 | from collections import Counter, defaultdict
  7 | 
  8 | import faiss
  9 | import matplotlib.pyplot as plt
 10 | import numpy as np
 11 | import pandas as pd
 12 | import plotly.express as px
 13 | from huggingface_hub import InferenceClient
 14 | from sentence_transformers import SentenceTransformer
 15 | from sklearn.cluster import DBSCAN
 16 | from tqdm import tqdm
 17 | from umap import UMAP
 18 | 
 19 | logging.basicConfig(level=logging.INFO)
 20 | 
 21 | 
 22 | DEFAULT_INSTRUCTION = (
 23 |     instruction
 24 | ) = "Use three words total (comma separated)\
 25 | to describe general topics in above texts. Under no circumstances use enumeration. \
 26 | Example format: Tree, Cat, Fireman"
 27 | 
 28 | DEFAULT_TEMPLATE = "<s>[INST]{examples}\n\n{instruction}[/INST]"
 29 | 
 30 | 
 31 | class ClusterClassifier:
 32 |     def __init__(
 33 |         self,
 34 |         embed_model_name="all-MiniLM-L6-v2",
 35 |         embed_device="cpu",
 36 |         embed_batch_size=64,
 37 |         embed_max_seq_length=512,
 38 |         embed_agg_strategy=None,
 39 |         umap_components=2,
 40 |         umap_metric="cosine",
 41 |         dbscan_eps=0.08,
 42 |         dbscan_min_samples=50,
 43 |         dbscan_n_jobs=16,
 44 |         summary_create=True,
 45 |         summary_model="mistralai/Mixtral-8x7B-Instruct-v0.1",
 46 |         topic_mode="multiple_topics",
 47 |         summary_n_examples=10,
 48 |         summary_chunk_size=420,
 49 |         summary_model_token=True,
 50 |         summary_template=None,
 51 |         summary_instruction=None,
 52 |     ):
 53 |         self.embed_model_name = embed_model_name
 54 |         self.embed_device = embed_device
 55 |         self.embed_batch_size = embed_batch_size
 56 |         self.embed_max_seq_length = embed_max_seq_length
 57 |         self.embed_agg_strategy = embed_agg_strategy
 58 | 
 59 |         self.umap_components = umap_components
 60 |         self.umap_metric = umap_metric
 61 | 
 62 |         self.dbscan_eps = dbscan_eps
 63 |         self.dbscan_min_samples = dbscan_min_samples
 64 |         self.dbscan_n_jobs = dbscan_n_jobs
 65 | 
 66 |         self.summary_create = summary_create
 67 |         self.summary_model = summary_model
 68 |         self.topic_mode = topic_mode
 69 |         self.summary_n_examples = summary_n_examples
 70 |         self.summary_chunk_size = summary_chunk_size
 71 |         self.summary_model_token = summary_model_token
 72 | 
 73 |         if summary_template is None:
 74 |             self.summary_template = DEFAULT_TEMPLATE
 75 |         else:
 76 |             self.summary_template = summary_template
 77 | 
 78 |         if summary_instruction is None:
 79 |             self.summary_instruction = DEFAULT_INSTRUCTION
 80 |         else:
 81 |             self.summary_instruction = summary_instruction
 82 | 
 83 |         self.embeddings = None
 84 |         self.faiss_index = None
 85 |         self.cluster_labels = None
 86 |         self.texts = None
 87 |         self.projections = None
 88 |         self.umap_mapper = None
 89 |         self.id2label = None
 90 |         self.label2docs = None
 91 | 
 92 |         self.embed_model = SentenceTransformer(
 93 |             self.embed_model_name, device=self.embed_device
 94 |         )
 95 |         self.embed_model.max_seq_length = self.embed_max_seq_length
 96 | 
 97 |     def fit(self, texts, embeddings=None):
 98 |         self.texts = texts
 99 | 
100 |         if embeddings is None:
101 |             logging.info("embedding texts...")
102 |             self.embeddings = self.embed(texts)
103 |         else:
104 |             logging.info("using precomputed embeddings...")
105 |             self.embeddings = embeddings
106 | 
107 |         logging.info("building faiss index...")
108 |         self.faiss_index = self.build_faiss_index(self.embeddings)
109 |         logging.info("projecting with umap...")
110 |         self.projections, self.umap_mapper = self.project(self.embeddings)
111 |         logging.info("dbscan clustering...")
112 |         self.cluster_labels = self.cluster(self.projections)
113 | 
114 |         self.id2cluster = {
115 |             index: label for index, label in enumerate(self.cluster_labels)
116 |         }
117 |         self.label2docs = defaultdict(list)
118 |         for i, label in enumerate(self.cluster_labels):
119 |             self.label2docs[label].append(i)
120 | 
121 |         self.cluster_centers = {}
122 |         for label in self.label2docs.keys():
123 |             x = np.mean([self.projections[doc, 0] for doc in self.label2docs[label]])
124 |             y = np.mean([self.projections[doc, 1] for doc in self.label2docs[label]])
125 |             self.cluster_centers[label] = (x, y)
126 | 
127 |         if self.summary_create:
128 |             logging.info("summarizing cluster centers...")
129 |             self.cluster_summaries = self.summarize(self.texts, self.cluster_labels)
130 |         else:
131 |             self.cluster_summaries = None
132 | 
133 |         return self.embeddings, self.cluster_labels, self.cluster_summaries
134 | 
135 |     def infer(self, texts, top_k=1):
136 |         embeddings = self.embed(texts)
137 | 
138 |         dist, neighbours = self.faiss_index.search(embeddings, top_k)
139 |         inferred_labels = []
140 |         for i in tqdm(range(embeddings.shape[0])):
141 |             labels = [self.cluster_labels[doc] for doc in neighbours[i]]
142 |             inferred_labels.append(Counter(labels).most_common(1)[0][0])
143 | 
144 |         return inferred_labels, embeddings
145 | 
146 |     def embed(self, texts):
147 |         embeddings = self.embed_model.encode(
148 |             texts,
149 |             batch_size=self.embed_batch_size,
150 |             show_progress_bar=True,
151 |             convert_to_numpy=True,
152 |             normalize_embeddings=True,
153 |         )
154 | 
155 |         return embeddings
156 | 
157 |     def project(self, embeddings):
158 |         mapper = UMAP(n_components=self.umap_components, metric=self.umap_metric).fit(
159 |             embeddings
160 |         )
161 |         return mapper.embedding_, mapper
162 | 
163 |     def cluster(self, embeddings):
164 |         print(
165 |             f"Using DBSCAN (eps, nim_samples)=({self.dbscan_eps,}, {self.dbscan_min_samples})"
166 |         )
167 |         clustering = DBSCAN(
168 |             eps=self.dbscan_eps,
169 |             min_samples=self.dbscan_min_samples,
170 |             n_jobs=self.dbscan_n_jobs,
171 |         ).fit(embeddings)
172 | 
173 |         return clustering.labels_
174 | 
175 |     def build_faiss_index(self, embeddings):
176 |         index = faiss.IndexFlatL2(embeddings.shape[1])
177 |         index.add(embeddings)
178 |         return index
179 | 
180 |     def summarize(self, texts, labels):
181 |         unique_labels = len(set(labels)) - 1  # exclude the "-1" label
182 |         client = InferenceClient(self.summary_model, token=self.summary_model_token)
183 |         cluster_summaries = {-1: "None"}
184 | 
185 |         for label in range(unique_labels):
186 |             ids = np.random.choice(self.label2docs[label], self.summary_n_examples)
187 |             examples = "\n\n".join(
188 |                 [
189 |                     f"Example {i+1}:\n{texts[_id][:self.summary_chunk_size]}"
190 |                     for i, _id in enumerate(ids)
191 |                 ]
192 |             )
193 | 
194 |             request = self.summary_template.format(
195 |                 examples=examples, instruction=self.summary_instruction
196 |             )
197 |             response = client.text_generation(request)
198 |             if label == 0:
199 |                 print(f"Request:\n{request}")
200 |             cluster_summaries[label] = self._postprocess_response(response)
201 |         print(f"Number of clusters is {len(cluster_summaries)}")
202 |         return cluster_summaries
203 | 
204 |     def _postprocess_response(self, response):
205 |         if self.topic_mode == "multiple_topics":
206 |             summary = response.split("\n")[0].split(".")[0].split("(")[0]
207 |             summary = ",".join(
208 |                 [txt for txt in summary.strip().split(",") if len(txt) > 0]
209 |             )
210 |             return summary
211 |         elif self.topic_mode == "single_topic":
212 |             first_line = response.split("\n")[0]
213 |             topic, score = None, None
214 |             try:
215 |                 topic = first_line.split("Topic:")[1].split("(")[0].split(",")[0].strip()
216 |             except IndexError:
217 |                 print("No topic found")
218 |             try:
219 |                 score = first_line.split("Educational value rating:")[1].strip().split(".")[0].strip()
220 |             except IndexError:
221 |                 print("No educational score found")
222 |             full_output = f"{topic}. Educational score: {score}"
223 |             return full_output
224 |         else:
225 |             raise ValueError(
226 |                 f"Topic labeling mode {self.topic_mode} is not supported, use single_topic or multiple_topics instead."
227 |             )
228 | 
229 |     def save(self, folder):
230 |         if not os.path.exists(folder):
231 |             os.makedirs(folder)
232 | 
233 |         with open(f"{folder}/embeddings.npy", "wb") as f:
234 |             np.save(f, self.embeddings)
235 | 
236 |         faiss.write_index(self.faiss_index, f"{folder}/faiss.index")
237 | 
238 |         with open(f"{folder}/projections.npy", "wb") as f:
239 |             np.save(f, self.projections)
240 | 
241 |         with open(f"{folder}/cluster_labels.npy", "wb") as f:
242 |             np.save(f, self.cluster_labels)
243 | 
244 |         with open(f"{folder}/texts.json", "w") as f:
245 |             json.dump(self.texts, f)
246 | 
247 |         with open(f"{folder}/mistral_prompt.txt", "w") as f:
248 |             f.write(DEFAULT_INSTRUCTION)
249 | 
250 |         if self.cluster_summaries is not None:
251 |             with open(f"{folder}/cluster_summaries.json", "w") as f:
252 |                 json.dump(self.cluster_summaries, f)
253 | 
254 |     def load(self, folder):
255 |         if not os.path.exists(folder):
256 |             raise ValueError(f"The folder '{folder}' does not exsit.")
257 | 
258 |         with open(f"{folder}/embeddings.npy", "rb") as f:
259 |             self.embeddings = np.load(f)
260 | 
261 |         self.faiss_index = faiss.read_index(f"{folder}/faiss.index")
262 | 
263 |         with open(f"{folder}/projections.npy", "rb") as f:
264 |             self.projections = np.load(f)
265 | 
266 |         with open(f"{folder}/cluster_labels.npy", "rb") as f:
267 |             self.cluster_labels = np.load(f)
268 | 
269 |         with open(f"{folder}/texts.json", "r") as f:
270 |             self.texts = json.load(f)
271 | 
272 |         if os.path.exists(f"{folder}/cluster_summaries.json"):
273 |             with open(f"{folder}/cluster_summaries.json", "r") as f:
274 |                 self.cluster_summaries = json.load(f)
275 |                 keys = list(self.cluster_summaries.keys())
276 |                 for key in keys:
277 |                     self.cluster_summaries[int(key)] = self.cluster_summaries.pop(key)
278 | 
279 |         # those objects can be inferred and don't need to be saved/loaded
280 |         self.id2cluster = {
281 |             index: label for index, label in enumerate(self.cluster_labels)
282 |         }
283 |         self.label2docs = defaultdict(list)
284 |         for i, label in enumerate(self.cluster_labels):
285 |             self.label2docs[label].append(i)
286 | 
287 |         self.cluster_centers = {}
288 |         for label in self.label2docs.keys():
289 |             x = np.mean([self.projections[doc, 0] for doc in self.label2docs[label]])
290 |             y = np.mean([self.projections[doc, 1] for doc in self.label2docs[label]])
291 |             self.cluster_centers[label] = (x, y)
292 | 
293 |     def show(self, interactive=False):
294 |         df = pd.DataFrame(
295 |             data={
296 |                 "X": self.projections[:, 0],
297 |                 "Y": self.projections[:, 1],
298 |                 "labels": self.cluster_labels,
299 |                 "content_display": [
300 |                     textwrap.fill(txt[:1024], 64) for txt in self.texts
301 |                 ],
302 |             }
303 |         )
304 | 
305 |         if interactive:
306 |             self._show_plotly(df)
307 |         else:
308 |             self._show_mpl(df)
309 | 
310 |     def _show_mpl(self, df):
311 |         fig, ax = plt.subplots(figsize=(12, 8), dpi=300)
312 | 
313 |         df["color"] = df["labels"].apply(lambda x: "C0" if x==-1 else f"C{(x%9)+1}")
314 | 
315 |         df.plot(
316 |             kind="scatter",
317 |             x="X",
318 |             y="Y",
319 |             c="labels",
320 |             s=0.75,
321 |             alpha=0.8,
322 |             linewidth=0,
323 |             color=df["color"],
324 |             ax=ax,
325 |             colorbar=False,
326 |         )
327 | 
328 |         for label in self.cluster_summaries.keys():
329 |             if label == -1:
330 |                 continue
331 |             summary = self.cluster_summaries[label]
332 |             position = self.cluster_centers[label]
333 |             t= ax.text(
334 |                 position[0],
335 |                 position[1],
336 |                 summary,
337 |                 horizontalalignment='center',
338 |                 verticalalignment='center',
339 |                 fontsize=4,
340 |             )
341 |             t.set_bbox(dict(facecolor='white', alpha=0.9, linewidth=0, boxstyle='square,pad=0.1'))
342 |         ax.set_axis_off()
343 | 
344 |     def _show_plotly(self, df):
345 |         fig = px.scatter(
346 |             df,
347 |             x="X",
348 |             y="Y",
349 |             color="labels",
350 |             hover_data={"content_display": True, "X": False, "Y": False},
351 |             width=1600,
352 |             height=800,
353 |             color_continuous_scale="HSV",
354 |         )
355 | 
356 |         fig.update_traces(hovertemplate="%{customdata[0]}<extra></extra>")
357 | 
358 |         fig.update_traces(
359 |             marker=dict(size=1, opacity=0.8),  # color="white"
360 |             selector=dict(mode="markers"),
361 |         )
362 | 
363 |         fig.update_layout(
364 |             template="plotly_dark",
365 |         )
366 | 
367 |         # show cluster summaries
368 |         for label in self.cluster_summaries.keys():
369 |             if label == -1:
370 |                 continue
371 |             summary = self.cluster_summaries[label]
372 |             position = self.cluster_centers[label]
373 | 
374 |             fig.add_annotation(
375 |                 x=position[0],
376 |                 y=position[1],
377 |                 text=summary,
378 |                 showarrow=False,
379 |                 yshift=0,
380 |             )
381 | 
382 |         fig.show()
383 | 


--------------------------------------------------------------------------------