├── reddit_data ├── praw.ini ├── extract_splits.py └── download_data.py ├── requirements.txt ├── LICENSE ├── scripts ├── run_cteb_de.py ├── FlexibleClusteringEvaluator.py ├── ClusteringTasks.py └── run_fine_tuned_cteb_de.py ├── README.md └── results └── tecb-de-full-results.csv /reddit_data/praw.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | client_id= 3 | client_secret= 4 | user_agent=s -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hdbscan==0.8.29 2 | mteb==1.0.1 3 | numba==0.56.4 4 | river==0.14.0 5 | sentence-transformers==2.2.2 6 | umap-learn==0.5.3 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Climate and Societal Analytics 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /reddit_data/extract_splits.py: -------------------------------------------------------------------------------- 1 | """Script to generate splits for benchmarking text embedding clustering. 
2 | Based on Reddit data as retrieved by the official Reddit API.""" 3 | import random 4 | 5 | import jsonlines 6 | import pandas as pd 7 | 8 | NUM_SPLITS = 10 9 | MIN_LABELS = 10 10 | MAX_LABELS = 50 11 | 12 | random.seed(42) 13 | 14 | 15 | def get_split(submissions, labels_mask, col_name): 16 | return ( 17 | submissions[labels_mask] 18 | .sample(frac=1.0) 19 | .rename(columns={col_name: "sentences"})[["sentences", "labels"]] 20 | .to_dict("list") 21 | ) 22 | 23 | 24 | def write_sets(name, sets): 25 | with jsonlines.open(name, "w") as f_out: 26 | f_out.write_all(sets) 27 | 28 | 29 | submissions = pd.read_csv("submissions.tsv", delimiter="\t") 30 | submissions.head() 31 | 32 | submissions = submissions.rename( 33 | columns={"title": "s2s", "selftext": "p2p", "subreddit": "labels"} 34 | ) 35 | submissions["p2p"] = submissions["s2s"] + " " + submissions["p2p"] 36 | 37 | subreddits = list(submissions["labels"].unique()) 38 | test_sets_s2s, test_sets_p2p = [], [] 39 | for _ in range(NUM_SPLITS): 40 | num_labels = random.randint(MIN_LABELS, MAX_LABELS) 41 | random.shuffle(subreddits) 42 | labels = subreddits[:num_labels] 43 | 44 | labels_mask = submissions.labels.isin(labels) 45 | test_sets_s2s.append(get_split(submissions, labels_mask, "s2s")) 46 | test_sets_p2p.append(get_split(submissions, labels_mask, "p2p")) 47 | 48 | write_sets("s2s_test.jsonl", test_sets_s2s) 49 | write_sets("p2p_test.jsonl", test_sets_p2p) 50 | -------------------------------------------------------------------------------- /reddit_data/download_data.py: -------------------------------------------------------------------------------- 1 | """Download Reddit submissions given a list of submission ids.""" 2 | import csv 3 | import re 4 | 5 | import praw 6 | import tqdm 7 | from tqdm import tqdm 8 | 9 | 10 | # based on: https://huggingface.co/datasets/sentence-transformers/reddit-title-body/blob/main/extraction_script/extract_title_selftext.py 11 | def clean_text(text): 12 | text = text.strip() 13 | text = re.sub(r"\[(.*)\]\(.*\)", "\g<1>", text) # Markdown 14 | text = re.sub( 15 | r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", 16 | "", 17 | text, 18 | ) # URLs 19 | return text 20 | 21 | 22 | # credentials must be specificed in praw.ini 23 | # read-only mode is enough but an application has to be registered on Reddit: https://praw.readthedocs.io/en/stable/getting_started/authentication.html 24 | reddit = praw.Reddit() 25 | 26 | ids = open("submission_ids.txt", "r").read().splitlines()[1:] 27 | 28 | submissions = [] 29 | for s in tqdm(reddit.info(fullnames=["t3_" + id_ for id_ in ids]), total=len(ids)): 30 | # ignore deleted or removed submissions (if submission or user is deleted) 31 | if s.selftext in ["[removed]", "[deleted]"]: 32 | continue 33 | 34 | title = clean_text(s.title) 35 | selftext = clean_text(s.selftext) 36 | submissions.append([s.id, title, selftext, s.subreddit]) 37 | 38 | with open("submissions.tsv", "w", encoding="utf8", newline="") as tsv_file: 39 | tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n") 40 | tsv_writer.writerow(["id", "title", "selftext", "subreddit"]) 41 | i = 0 42 | for row in submissions: 43 | i += 1 44 | tsv_writer.writerow(row) 45 | -------------------------------------------------------------------------------- /scripts/run_cteb_de.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | from ClusteringTasks import ( 5 | BlurbsClusteringP2P, 6 | 
BlurbsClusteringS2S, 7 | RedditClusteringP2P, 8 | RedditClusteringS2S, 9 | TenKGnadClusteringP2P, 10 | TenKGnadClusteringS2S, 11 | ) 12 | from mteb import MTEB 13 | from sentence_transformers import SentenceTransformer 14 | 15 | # logging.basicConfig(level=logging.INFO) 16 | 17 | base_tasks = [ 18 | BlurbsClusteringS2S, 19 | BlurbsClusteringP2P, 20 | TenKGnadClusteringS2S, 21 | TenKGnadClusteringP2P, 22 | ] 23 | 24 | # change task config here 25 | task_configs = [ 26 | {"dim_red": None, "clustering_alg": "minibatch_kmeans"}, 27 | {"dim_red": "pca", "clustering_alg": "minibatch_kmeans"}, 28 | {"dim_red": "umap", "clustering_alg": "minibatch_kmeans"}, 29 | {"dim_red": "pca+umap", "clustering_alg": "minibatch_kmeans"}, 30 | {"dim_red": None, "clustering_alg": "agglomerative"}, 31 | {"dim_red": "pca", "clustering_alg": "agglomerative"}, 32 | {"dim_red": "umap", "clustering_alg": "agglomerative"}, 33 | {"dim_red": "pca+umap", "clustering_alg": "agglomerative"}, 34 | {"dim_red": "pca", "clustering_alg": "hdbscan"}, 35 | {"dim_red": "umap", "clustering_alg": "hdbscan"}, 36 | {"dim_red": "pca+umap", "clustering_alg": "hdbscan"}, 37 | {"dim_red": "pca", "clustering_alg": "dbstream"}, 38 | {"dim_red": "umap", "clustering_alg": "dbstream"}, 39 | {"dim_red": "pca+umap", "clustering_alg": "dbstream"}, 40 | ] 41 | 42 | # change models here 43 | model_names = [ 44 | "deepset/gbert-base", 45 | "deepset/gbert-large", 46 | "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 47 | "sentence-transformers/sentence-t5-xxl", 48 | ] 49 | 50 | 51 | def main(args: argparse.ArgumentParser): 52 | if args.include_reddit: 53 | base_tasks.extend([RedditClusteringS2S, RedditClusteringP2P]) 54 | for model_name in model_names: 55 | model = SentenceTransformer(model_name) 56 | evaluation = MTEB( 57 | tasks=[task(**config) for task in base_tasks for config in task_configs] 58 | ) 59 | evaluation.run(model, output_folder=f"results/{model_name.split('/')[-1]}") 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("--include-reddit", action="store_true") 65 | 66 | args = parser.parse_args() 67 | main(args) 68 | -------------------------------------------------------------------------------- /scripts/FlexibleClusteringEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | import hdbscan 5 | import numpy as np 6 | import sklearn 7 | import umap 8 | from mteb.evaluation.evaluators.Evaluator import Evaluator 9 | from river import cluster, stream 10 | from sklearn.pipeline import make_pipeline 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class DBSTREAMWrapper(cluster.DBSTREAM): 16 | """Wrapper for river.cluster.DBSTREAM so it can be used similarly to sklearn API. Wrapper is used to store 17 | online predictions in self.labels_ attribute (as in sklearn), i.e., any new predictions will be added to this attribute. 
18 | """ 19 | 20 | def __init__(self, compute_labels: bool = True, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | self.compute_labels = compute_labels 23 | self.labels_ = [] 24 | 25 | def _learn_many(self, X, sample_weights: Optional[list] = None): 26 | """Update model with multiple data points.""" 27 | sample_weights = [None] * len(X) if sample_weights is None else sample_weights 28 | for i, (x, _) in enumerate(stream.iter_array(X)): 29 | self.learn_one(x, sample_weight=sample_weights[i]) 30 | 31 | def _predict_many(self, X: np.array, sample_weights: Optional[list] = None) -> list: 32 | """Predict multiple data points.""" 33 | labels = [] 34 | sample_weights = [None] * len(X) if sample_weights is None else sample_weights 35 | for i, (x, _) in enumerate(stream.iter_array(X)): 36 | labels.append(self.predict_one(x, sample_weight=sample_weights[i])) 37 | return labels 38 | 39 | def fit(self, X: np.array, sample_weights: Optional[list] = None): 40 | """sklearn API logic to handle model updates.""" 41 | self._learn_many(X, sample_weights=sample_weights) 42 | if self.compute_labels: 43 | self.predict(X, sample_weights=sample_weights) 44 | return self 45 | 46 | def fit_predict(self, X: np.array, sample_weights: Optional[list] = None): 47 | """sklearn API logic to handle simultaneous model updates and predictions.""" 48 | self._learn_many(X, sample_weights=sample_weights) 49 | return self.predict(X, sample_weights=sample_weights) 50 | 51 | def predict(self, X: np.array, sample_weights: Optional[list] = None): 52 | """sklearn API logic to handle model predictions.""" 53 | self.labels_.extend(self._predict_many(X, sample_weights=sample_weights)) 54 | return self.labels_ 55 | 56 | 57 | class FlexibleClusteringEvaluator(Evaluator): 58 | def __init__( 59 | self, 60 | sentences: list[str], 61 | labels: list[int], 62 | clustering_alg: str = "minibatch_kmeans", 63 | clustering_params: Optional[dict] = None, 64 | dim_red: Optional[str] = None, 65 | dim_red_params: Optional[dict] = None, 66 | limit: Optional[int] = None, 67 | **kwargs, 68 | ): 69 | super().__init__(**kwargs) 70 | if limit is not None: 71 | sentences = sentences[:limit] 72 | labels = labels[:limit] 73 | self.sentences = sentences 74 | self.labels = labels 75 | 76 | if clustering_params is None: 77 | clustering_params = {} 78 | 79 | if dim_red_params is None: 80 | dim_red_params = {} 81 | 82 | nr_labels = len(set(self.labels)) 83 | if clustering_alg == "agglomerative": 84 | self.clustering_model = sklearn.cluster.AgglomerativeClustering( 85 | n_clusters=nr_labels, **clustering_params 86 | ) 87 | elif clustering_alg == "dbstream": 88 | self.clustering_model = DBSTREAMWrapper(**clustering_params) 89 | 90 | elif clustering_alg == "hdbscan": 91 | self.clustering_model = hdbscan.HDBSCAN(**clustering_params) 92 | 93 | elif clustering_alg == "minibatch_kmeans": 94 | if "batch_size" not in clustering_params: 95 | clustering_params["batch_size"] = 500 96 | if "n_init" not in clustering_params: 97 | clustering_params["n_init"] = "auto" 98 | self.clustering_model = sklearn.cluster.MiniBatchKMeans( 99 | n_clusters=nr_labels, **clustering_params 100 | ) 101 | else: 102 | raise ValueError("Option not implemented") 103 | 104 | if dim_red == "pca": 105 | self.dim_red_model = sklearn.decomposition.PCA( 106 | n_components=2, **dim_red_params 107 | ) 108 | elif dim_red == "umap": 109 | self.dim_red_model = umap.UMAP(metric="cosine", **dim_red_params) 110 | elif dim_red == "pca+umap": 111 | self.dim_red_model = make_pipeline( 112 | 
sklearn.decomposition.PCA(n_components=50), 113 | umap.UMAP(metric="cosine", **dim_red_params), 114 | ) 115 | elif dim_red is None: 116 | self.dim_red_model = None 117 | else: 118 | raise ValueError("Option not implemented") 119 | 120 | self.model_name = f"{dim_red}>{clustering_alg}" if dim_red else clustering_alg 121 | 122 | def __call__(self, model): 123 | logger.info(f"Encoding {len(self.sentences)} sentences...") 124 | corpus_embeddings = np.asarray(model.encode(self.sentences)) 125 | 126 | logger.info(f"Fitting {self.model_name} model...") 127 | if self.dim_red_model is not None: 128 | corpus_embeddings = self.dim_red_model.fit_transform(corpus_embeddings) 129 | self.clustering_model.fit(corpus_embeddings) 130 | cluster_assignment = self.clustering_model.labels_ 131 | 132 | logger.info("Evaluating...") 133 | v_measure = sklearn.metrics.cluster.v_measure_score( 134 | self.labels, cluster_assignment 135 | ) 136 | 137 | return {"v_measure": v_measure} 138 | -------------------------------------------------------------------------------- /scripts/ClusteringTasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from typing import Callable, Optional, TypeVar 5 | 6 | import datasets 7 | import numpy as np 8 | import tqdm 9 | from FlexibleClusteringEvaluator import FlexibleClusteringEvaluator 10 | from mteb.abstasks.AbsTask import AbsTask 11 | from sentence_transformers import SentenceTransformer 12 | from typing_extensions import ParamSpec 13 | 14 | T = TypeVar("T") 15 | P = ParamSpec("P") 16 | 17 | 18 | def dynamic_description(d: Callable[P, T]) -> Callable[P, T]: 19 | @property 20 | def wrapper(self) -> dict: 21 | desc = d(self) 22 | desc["name"] = ( 23 | desc["name"] 24 | + "{" 25 | + (self.dim_red + "," if self.dim_red else "") 26 | + self.clustering_alg 27 | + ( 28 | "{" 29 | + ",".join( 30 | [ 31 | str(key) + "=" + str(value) 32 | for key, value in self.clustering_params.items() 33 | ] 34 | ) 35 | + "}" 36 | if len(self.clustering_params) > 0 37 | else "" 38 | ) 39 | + "}" 40 | ) 41 | return desc 42 | 43 | return wrapper 44 | 45 | 46 | class AbsTaskFlexibleClustering(AbsTask): 47 | def __init__( 48 | self, 49 | clustering_alg: str = "minibatch_kmeans", 50 | clustering_params: Optional[dict] = None, 51 | dim_red: Optional[str] = None, 52 | dim_red_params: Optional[dict] = None, 53 | **kwargs, 54 | ): 55 | super().__init__(**kwargs) 56 | 57 | self.clustering_alg = clustering_alg 58 | self.dim_red = dim_red 59 | 60 | if clustering_params is None: 61 | self.clustering_params = {} 62 | else: 63 | self.clustering_params = clustering_params 64 | if dim_red_params is None: 65 | self.dim_red_params = {} 66 | else: 67 | self.dim_red_params = dim_red_params 68 | 69 | def evaluate(self, model, split: str = "test", **kwargs): 70 | if not self.data_loaded: 71 | self.load_data() 72 | 73 | v_measures = [] 74 | for cluster_set in tqdm.tqdm(self.dataset[split], desc="Clustering"): 75 | evaluator = FlexibleClusteringEvaluator( 76 | cluster_set["sentences"], 77 | cluster_set["labels"], 78 | clustering_alg=self.clustering_alg, 79 | clustering_params=self.clustering_params.copy(), 80 | dim_red=self.dim_red, 81 | dim_red_params=self.dim_red_params.copy(), 82 | ) 83 | 84 | metrics = evaluator(model) 85 | v_measures.append(metrics["v_measure"]) 86 | 87 | v_mean = np.mean(v_measures) 88 | v_std = np.std(v_measures) 89 | return {"v_measure": v_mean, "v_measure_std": v_std} 90 | 91 | @dynamic_description 92 | def 
description(self): 93 | raise NotImplementedError 94 | 95 | 96 | class BlurbsClusteringS2S(AbsTaskFlexibleClustering): 97 | @dynamic_description 98 | def description(self) -> dict: 99 | return { 100 | "name": "BlurbsClusteringS2S", 101 | "hf_hub_name": "slvnwhrl/blurbs-clustering-s2s", 102 | "description": "Clustering of book blurbs (titles only).", 103 | "type": "Clustering", 104 | "category": "s2s", 105 | "eval_splits": ["test"], 106 | "eval_langs": ["de"], 107 | "main_score": ["v_measure"], 108 | } 109 | 110 | 111 | class BlurbsClusteringP2P(AbsTaskFlexibleClustering): 112 | @dynamic_description 113 | def description(self) -> dict: 114 | return { 115 | "name": "BlurbsClusteringP2P", 116 | "hf_hub_name": "slvnwhrl/blurbs-clustering-p2p", 117 | "description": "Clustering of book blurbs (titles + blurbs).", 118 | "type": "Clustering", 119 | "category": "p2p", 120 | "eval_splits": ["test"], 121 | "eval_langs": ["de"], 122 | "main_score": ["v_measure"], 123 | } 124 | 125 | 126 | class TenKGnadClusteringS2S(AbsTaskFlexibleClustering): 127 | @dynamic_description 128 | def description(self) -> dict: 129 | return { 130 | "name": "TenKGnadClusteringS2S", 131 | "hf_hub_name": "slvnwhrl/tenkgnad-clustering-s2s", 132 | "description": "Clustering of German news articles titles.", 133 | "type": "Clustering", 134 | "category": "s2s", 135 | "eval_splits": ["test"], 136 | "eval_langs": ["de"], 137 | "main_score": ["v_measure"], 138 | } 139 | 140 | 141 | class TenKGnadClusteringP2P(AbsTaskFlexibleClustering): 142 | @dynamic_description 143 | def description(self) -> dict: 144 | return { 145 | "name": "TenKGnadClusteringP2P", 146 | "hf_hub_name": "slvnwhrl/tenkgnad-clustering-p2p", 147 | "description": "Clustering of German news articles (titles + body).", 148 | "type": "Clustering", 149 | "category": "p2p", 150 | "eval_splits": ["test"], 151 | "eval_langs": ["de"], 152 | "main_score": ["v_measure"], 153 | } 154 | 155 | 156 | reddit_data_path = os.path.join(Path(__file__).parent.parent.absolute(), "reddit_data") 157 | 158 | 159 | class RedditClusteringS2S(AbsTaskFlexibleClustering): 160 | @dynamic_description 161 | def description(self) -> dict: 162 | return { 163 | "name": "RedditClusteringS2S", 164 | "description": "Clustering of German reddit submission titles.", 165 | "type": "Clustering", 166 | "category": "s2s", 167 | "eval_splits": ["test"], 168 | "eval_langs": ["de"], 169 | "main_score": ["v_measure"], 170 | } 171 | 172 | def load_data(self, **kwargs): 173 | if self.data_loaded: 174 | return 175 | 176 | self.dataset = datasets.load_dataset( 177 | reddit_data_path, 178 | data_files={"test": "s2s_test.jsonl"}, 179 | ) 180 | self.data_loaded = True 181 | 182 | 183 | class RedditClusteringP2P(AbsTaskFlexibleClustering): 184 | @dynamic_description 185 | def description(self) -> dict: 186 | return { 187 | "name": "RedditClusteringP2P", 188 | "description": "Clustering of German reddit submission (titles + body).", 189 | "type": "Clustering", 190 | "category": "p2p", 191 | "eval_splits": ["test"], 192 | "eval_langs": ["de"], 193 | "main_score": ["v_measure"], 194 | } 195 | 196 | def load_data(self, **kwargs): 197 | if self.data_loaded: 198 | return 199 | 200 | self.dataset = datasets.load_dataset( 201 | reddit_data_path, 202 | data_files={"test": "p2p_test.jsonl"}, 203 | ) 204 | self.data_loaded = True 205 | -------------------------------------------------------------------------------- /scripts/run_fine_tuned_cteb_de.py: -------------------------------------------------------------------------------- 1 
| import argparse 2 | import csv 3 | import logging 4 | import os 5 | import random 6 | import types 7 | from typing import Optional 8 | 9 | import numpy as np 10 | import torch 11 | from ClusteringTasks import ( 12 | AbsTaskFlexibleClustering, 13 | BlurbsClusteringP2P, 14 | BlurbsClusteringS2S, 15 | RedditClusteringP2P, 16 | RedditClusteringS2S, 17 | TenKGnadClusteringP2P, 18 | TenKGnadClusteringS2S, 19 | ) 20 | from datasets import load_dataset 21 | from sentence_transformers import SentenceTransformer, datasets, losses 22 | from sentence_transformers.util import batch_to_device 23 | from tqdm.autonotebook import trange 24 | from transformers import ( 25 | AutoModelForMaskedLM, 26 | AutoTokenizer, 27 | DataCollatorForWholeWordMask, 28 | Trainer, 29 | TrainingArguments, 30 | ) 31 | 32 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 33 | 34 | # logging.basicConfig(level=logging.INFO) 35 | logger = logging.getLogger("main") 36 | 37 | 38 | def write_results( 39 | task: AbsTaskFlexibleClustering, 40 | model, 41 | output_path: str, 42 | epoch: int, 43 | steps: int, 44 | v_measure: float, 45 | ): 46 | """Write evaluation results to file.""" 47 | csv_headers = ["epoch", "steps", "v_measure"] 48 | csv_path = os.path.join(output_path, f"{task.description['name']}.json") 49 | if not os.path.isfile(csv_path): 50 | with open(csv_path, newline="", mode="w", encoding="utf-8") as f: 51 | writer = csv.writer(f) 52 | writer.writerow(csv_headers) 53 | writer.writerow([epoch, steps, v_measure]) 54 | else: 55 | with open(csv_path, newline="", mode="a", encoding="utf-8") as f: 56 | writer = csv.writer(f) 57 | writer.writerow([epoch, steps, v_measure]) 58 | 59 | 60 | def eval_wrapper(tasks: list, output_path: Optional[str] = None): 61 | """Wrapper for TSDAE training procedure so custom mteb tasks can be evaluated.""" 62 | 63 | def eval(model, output_path=output_path, epoch=0, steps=0): 64 | v_measures = [] 65 | 66 | model.eval() 67 | with torch.no_grad(): 68 | for task in tasks: 69 | v_measure = task.evaluate(model)["v_measure"] 70 | if output_path is not None: 71 | write_results(task, model, output_path, epoch, steps, v_measure) 72 | v_measures.append(v_measure) 73 | 74 | return np.mean(v_measures) 75 | 76 | return eval 77 | 78 | 79 | def encode_wrapper(tokenizer, batch_size: int = 32): 80 | """Wrapper to make instances of transformer models compatible with mteb (which uses encode method similar to SentenceTransformer models).""" 81 | 82 | def encode(self, sentences): 83 | embeddings = [] 84 | length_sorted_idx = np.argsort([-len(sen) for sen in sentences]) 85 | sentences_sorted = [sentences[idx] for idx in length_sorted_idx] 86 | for i in range(0, len(sentences), batch_size): 87 | encoded_input = encoded_input = tokenizer( 88 | sentences_sorted[i : i + batch_size], 89 | return_tensors="pt", 90 | padding=True, 91 | truncation=True, 92 | max_length=512, 93 | ) 94 | encoded_input = batch_to_device(encoded_input, self.device) 95 | output = self(**encoded_input) 96 | 97 | token_embeddings = output[0] 98 | attention_mask = encoded_input.attention_mask 99 | input_mask_expanded = ( 100 | attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 101 | ) 102 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 103 | sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) 104 | 105 | embeddings.extend((sum_embeddings / sum_mask).detach().cpu()) 106 | 107 | embeddings = [embeddings[idx] for idx in np.argsort(length_sorted_idx)] 108 | return np.asarray([emb.numpy() for emb in 
embeddings]) 109 | 110 | return encode 111 | 112 | 113 | class CustomTrainer(Trainer): 114 | """Subclassed Hugging Face Trainer so custom mteb tasks can be evaluated.""" 115 | 116 | def __init__(self, tasks: list, **kwargs): 117 | super().__init__(**kwargs) 118 | self.tasks = tasks 119 | self.performed_evals = 0 120 | self.model.encode = types.MethodType( 121 | encode_wrapper( 122 | self.data_collator.tokenizer, self.args.per_device_eval_batch_size 123 | ), 124 | self.model, 125 | ) 126 | 127 | def evaluate(self, **kwargs): 128 | self.performed_evals += 1 129 | steps = self.performed_evals * self.args.eval_steps 130 | steps_per_epoch = len(self.train_dataset) / ( 131 | self.args.per_device_train_batch_size 132 | * self.args.gradient_accumulation_steps 133 | ) 134 | epoch = int(np.floor((steps - 1) / steps_per_epoch)) 135 | 136 | v_measures = [] 137 | 138 | self.model.eval() 139 | with torch.no_grad(): 140 | for task in self.tasks: 141 | v_measure = task.evaluate(self.model)["v_measure"] 142 | if self.args.output_dir is not None: 143 | write_results( 144 | task, self.model, self.args.output_dir, epoch, steps, v_measure 145 | ) 146 | v_measures.append(v_measure) 147 | 148 | return np.mean(v_measures) 149 | 150 | 151 | class TokenizedSentencesDataset: 152 | """Wrapper for on-the-fly tokenization for MLM training.""" 153 | 154 | def __init__( 155 | self, 156 | sentences: list[str], 157 | tokenizer, 158 | max_length: int, 159 | cache_tokenization: bool = False, 160 | ): 161 | self.tokenizer = tokenizer 162 | self.sentences = sentences 163 | self.max_length = max_length 164 | self.cache_tokenization = cache_tokenization 165 | 166 | def __getitem__(self, item): 167 | if not self.cache_tokenization: 168 | return self.tokenizer( 169 | self.sentences[item], 170 | add_special_tokens=True, 171 | truncation=True, 172 | max_length=self.max_length, 173 | return_special_tokens_mask=True, 174 | ) 175 | 176 | if isinstance(self.sentences[item], str): 177 | self.sentences[item] = self.tokenizer( 178 | self.sentences[item], 179 | add_special_tokens=True, 180 | truncation=True, 181 | max_length=self.max_length, 182 | return_special_tokens_mask=True, 183 | ) 184 | return self.sentences[item] 185 | 186 | def __len__(self): 187 | return len(self.sentences) 188 | 189 | 190 | def seed_worker(worker_id): 191 | worker_seed = torch.initial_seed() % 2**32 192 | np.random.seed(worker_seed) 193 | random.seed(worker_seed) 194 | 195 | 196 | model_names = [ 197 | "deepset/gbert-base", 198 | ] 199 | 200 | base_tasks = [ 201 | BlurbsClusteringS2S, 202 | BlurbsClusteringP2P, 203 | TenKGnadClusteringS2S, 204 | TenKGnadClusteringP2P, 205 | ] 206 | task_configs = [ 207 | { 208 | "dim_red": None, 209 | "clustering_alg": "minibatch_kmeans", 210 | "clustering_params": {"random_state": 42}, 211 | }, 212 | ] 213 | 214 | seed_list = [42, 1, 2] 215 | use_fp16 = False 216 | 217 | # MLM 218 | mlm_prob = 0.15 219 | 220 | 221 | def main(args: argparse.ArgumentParser): 222 | if args.include_reddit: 223 | base_tasks.extend([RedditClusteringS2S, RedditClusteringP2P]) 224 | 225 | for model_name in model_names: 226 | for base_task in base_tasks: 227 | tasks = [base_task(**config) for config in task_configs] 228 | logger.info(tasks[0].description["name"]) 229 | base_task_name = tasks[0].description["name"].split("{")[0] 230 | 231 | tasks[0].load_data() 232 | train_sentences = list( 233 | set( 234 | [ 235 | sent 236 | for split in tasks[0].dataset["test"]["sentences"] 237 | for sent in split 238 | ] 239 | ) 240 | ) 241 | 242 | # TSDAE 243 | 
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences) 244 | 245 | for seed in seed_list: 246 | torch.manual_seed(seed) 247 | 248 | model = SentenceTransformer(model_name) 249 | 250 | result_folder = os.path.join( 251 | "results", 252 | model_name.split("/")[-1], 253 | f"tsdae_ft_{str(seed)}", 254 | base_task_name, 255 | ) 256 | 257 | g = torch.Generator() 258 | g.manual_seed(seed) 259 | train_dataloader = torch.utils.data.DataLoader( 260 | train_dataset, 261 | batch_size=8, 262 | shuffle=True, 263 | drop_last=False, 264 | worker_init_fn=seed_worker, 265 | generator=g, 266 | ) 267 | train_loss = losses.DenoisingAutoEncoderLoss( 268 | model, decoder_name_or_path=model_name, tie_encoder_decoder=True 269 | ) 270 | 271 | dev_evaluator = eval_wrapper(tasks, output_path=result_folder) 272 | 273 | logger.info(f"Start TSDAE training <>") 274 | model.fit( 275 | train_objectives=[(train_dataloader, train_loss)], 276 | evaluator=dev_evaluator, 277 | evaluation_steps=512, 278 | epochs=30, 279 | weight_decay=0, 280 | scheduler="constantlr", 281 | optimizer_params={"lr": 3e-5}, 282 | output_path=result_folder, 283 | save_best_model=True, 284 | checkpoint_save_total_limit=1, 285 | show_progress_bar=True, 286 | use_amp=use_fp16, 287 | ) 288 | 289 | logger.info("Training done") 290 | 291 | # MLM 292 | for seed in seed_list: 293 | model = AutoModelForMaskedLM.from_pretrained(model_name) 294 | tokenizer = AutoTokenizer.from_pretrained(model_name) 295 | 296 | train_dataset = TokenizedSentencesDataset( 297 | train_sentences, tokenizer, 512 298 | ) # handle max seq length 299 | data_collator = DataCollatorForWholeWordMask( 300 | tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob 301 | ) 302 | 303 | result_folder = os.path.join( 304 | "results", 305 | model_name.split("/")[-1], 306 | f"mlm_ft_{str(seed)}", 307 | base_task_name, 308 | ) 309 | 310 | training_args = TrainingArguments( 311 | output_dir=result_folder, 312 | overwrite_output_dir=True, 313 | num_train_epochs=30, 314 | evaluation_strategy="steps", 315 | per_device_train_batch_size=32, 316 | per_device_eval_batch_size=128, 317 | eval_steps=25, 318 | save_steps=25, 319 | logging_steps=25, 320 | gradient_accumulation_steps=8, 321 | save_total_limit=1, 322 | load_best_model_at_end=True, 323 | metric_for_best_model="v_measure", 324 | prediction_loss_only=True, 325 | fp16=use_fp16, 326 | learning_rate=1e-04, 327 | weight_decay=0.01, 328 | lr_scheduler_type="constant_with_warmup", 329 | warmup_ratio=0.06, 330 | seed=seed, 331 | ) 332 | 333 | trainer = CustomTrainer( 334 | model=model, 335 | args=training_args, 336 | data_collator=data_collator, 337 | train_dataset=train_dataset, 338 | eval_dataset="placeholder", 339 | tasks=tasks, 340 | ) 341 | 342 | logger.info(f"Start MLM training <>") 343 | trainer.train() 344 | logger.info("Training done") 345 | 346 | 347 | if __name__ == "__main__": 348 | parser = argparse.ArgumentParser() 349 | parser.add_argument("--include-reddit", action="store_true") 350 | 351 | args = parser.parse_args() 352 | main(args) 353 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # German Text Embedding Clustering Benchmark 2 | 3 | Shortcut: [Datasets](https://github.com/ClimSocAna/tecb-de#datasets) - [Results](https://github.com/ClimSocAna/tecb-de#results) - [Installation](https://github.com/ClimSocAna/tecb-de#installation) - [Usage](https://github.com/ClimSocAna/tecb-de#usage) - 
[Citation](https://github.com/ClimSocAna/tecb-de#citation) 4 | 5 | ## Remarks 6 | This repository contains code to evaluate language models for clustering text embeddings as used in neural topic modelling (see for example [BERTopic](https://github.com/MaartenGr/BERTopic)), specifically for German. This work builds on the [Massive Text Embedding Benchmark (MTEB)](https://github.com/embeddings-benchmark/mteb), which provides benchmark datasets and results for a wide range of tasks. 7 | 8 | More specifically, this work contributes to MTEB in the following ways: 9 | - clustering datasets in German (MTEB only considers English datasets) 10 | - the evaluation of more clustering algorithms 11 | 12 | :trophy: Note that you can contribute results to the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) as our datasets are officially part of MTEB (apart from the Reddit datasets, see below)! You can either use this library or MTEB directly to produce results. If you run into any problems, please raise an issue. :trophy: 13 | 14 | 15 | ## Datasets 16 | 17 | Currently, we provide 4 datasets. The datasets are built similarly to the English clustering datasets in MTEB. Unfortunately, there are fewer datasets available for German and, therefore, we were not able to build as many datasets (e.g. Arxiv only contains very few German papers). However, we plan to add more datasets in the future. 18 | 19 | | **Name** | **Hub URL** | **Description** | 20 | |-----------------------|----------------------------------|--------------------------------------------------------------| 21 | | BlurbsClusteringS2S
([data ref.](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html)) | [slvnwhrl/blurbs-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-s2s/tree/main) | Clustering of book titles: 17'726 unique samples, 28 splits with 177 to 16'425 samples and 4 to 93 unique classes (as represented by genres, e.g. fantasy). On average, a sample is 23.17 chars long. Splits are built similarly to MTEB's [ArxivClusteringS2S](https://huggingface.co/datasets/mteb/arxiv-clustering-s2s) ([paper](https://arxiv.org/abs/2210.07316)). | 22 | | BlurbsClusteringP2P
([data ref.](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html)) | [slvnwhrl/blurbs-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-p2p/tree/main) | Clustering of book blurbs (title + blurb): 18'084 unique samples, 28 splits with 177 to 16'425 samples and 4 to 93 unique classes (as represented by genres, e.g. fantasy). On average, a sample is 663.91 chars long. Splits are built similarly to MTEB's [ArxivClusteringP2P](https://huggingface.co/datasets/mteb/arxiv-clustering-p2p) ([paper](https://arxiv.org/abs/2210.07316)). | 23 | | TenKGNADClusteringS2S
([data ref.](https://ofai.github.io/million-post-corpus/)) | [slvnwhrl/tenkgnad-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-s2s) | Clustering of news article titles: 10'267 unique samples, 10 splits with 1'436 to 9'962 samples and 9 unique classes (as represented by news categories, e.g. politics). On average, a sample is 50.97 chars long. Splits are built similarly to MTEB's [TwentyNewsgroupsClustering](https://huggingface.co/datasets/mteb/twentynewsgroups-clustering) ([paper](https://arxiv.org/abs/2210.07316)). | 24 | | TenKGNADClusteringP2P
([data ref.](https://ofai.github.io/million-post-corpus/)) | [slvnwhrl/tenkgnad-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-p2p) | Clustering of news articles (title + article body): 10'275 unique samples, 10 splits with 1'436 to 9'962 samples and 9 unique classes (as represented by news categories, e.g. politics). On average, a sample is 2648.46 chars long. Splits are built similarly to MTEB's [TwentyNewsgroupsClustering](https://huggingface.co/datasets/mteb/twentynewsgroups-clustering) ([paper](https://arxiv.org/abs/2210.07316)). | 25 | 26 | ### Reddit datasets 27 | 28 | We also include two Reddit datasets in the benchmark (similar to MTEB's [RedditClustering](https://huggingface.co/datasets/mteb/reddit-clustering) and [RedditClusteringP2P](https://huggingface.co/datasets/mteb/reddit-clustering-p2p) datasets). However, we only provide ids, and if you want to use these datasets, you need to download the data yourself (see [Including the Reddit dataset](https://github.com/ClimSocAna/tecb-de#including-the-reddit-dataset) for instructions). The datasets contain "hot" and "top" submissions to 80 popular German subreddits and were extracted using [PRAW](https://praw.readthedocs.io/en/stable/). 29 | 30 | | **Name** | **Description** | 31 | |---------------------|-----------------| 32 | | RedditClusteringS2S | Clustering of Reddit submission titles: 40'181 unique samples, 10 splits with 9'288 to 26'221 samples and 10 to 50 unique classes (as represented by subreddits, e.g. r/Finanzen). On average, a sample is 52.16 chars long. Splits are built similarly to MTEB's [RedditClustering](https://huggingface.co/datasets/mteb/reddit-clustering) ([paper](https://arxiv.org/abs/2210.07316)). | 33 | | RedditClusteringP2P | Clustering of Reddit submissions (title + body): 40'305 unique samples, 10 splits with 9'288 to 26'221 samples and 10 to 50 unique classes (as represented by subreddits, e.g. r/Finanzen). On average, a sample is 901.78 chars long. Splits are built similarly to MTEB's [RedditClusteringP2P](https://huggingface.co/datasets/mteb/reddit-clustering-p2p) ([paper](https://arxiv.org/abs/2210.07316)). | 34 | 35 | ***Important**: As of June 19, 2023, new [Data API Terms](https://www.redditinc.com/policies/data-api-terms) become effective for Reddit. Most likely, it will no longer be allowed to use Reddit data for such purposes (see especially "2.4 User Content" in the terms). Make sure you understand these terms and use Reddit data accordingly.* 36 | 37 | ## Results 38 | All results show the [V-measure](https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure) (multiplied by 100 and rounded to two decimal points).
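The score can be computed with scikit-learn's `v_measure_score`, which is also what the evaluator in this repository calls internally. A minimal sketch (the label and cluster ids below are made up for illustration):

```python
from sklearn.metrics.cluster import v_measure_score

# gold classes (e.g. genres, news categories or subreddits) and predicted cluster ids;
# the V-measure does not depend on how the cluster ids are numbered
true_labels = [0, 0, 1, 1, 2, 2]
cluster_assignment = [1, 1, 0, 0, 2, 2]

score = v_measure_score(true_labels, cluster_assignment)
print(round(100 * score, 2))  # the tables report V-measure * 100, rounded to two decimals
```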
39 | ### k-means (same as MTEB) 40 | | **Model** | **BlurbsClusteringS2S** | **BlurbsClusteringP2P** | **TenKGNADClusteringS2S** | **TenKGNADClusteringP2P** | **RedditClusteringS2S** | **RedditClusteringP2P** | **AVG** | 41 | |----|-------:|-------:|-------:|-------:|-------:|-------:|-------:| 42 | | [deepset/gbert-base](https://huggingface.co/deepset/gbert-base) | 11.27 | 35.36 | 24.23 | 37.16 | 28.57 | 35.30 | 28.66 | 43 | | [deepset/gbert-large](https://huggingface.co/deepset/gbert-large) | 13.34 | 39.30 | **34.97** | 41.69 | 34.35 | 44.61 | 34.71 | 44 | | [deepset/gelectra-base](https://huggingface.co/deepset/gelectra-base) | 7.74 | 10.06 | 4.11 | 9.02 | 6.59 | 7.73 | 7.54 | 45 | | [deepset/gelectra-large](https://huggingface.co/deepset/gelectra-large) | 7.57 | 13.96 | 3.91 | 11.49 | 7.59 | 10.54 | 9.18 | 46 | | [uklfr/gottbert-base](https://huggingface.co/uklfr/gottbert-base) | 8.37 | 34.49 | 9.34 | 33.66 | 16.07 | 19.46 | 20.23 | 47 | | [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) | 14.33 | 32.46 | 22.26 | 36.13 | 33.33 | 44.59 | 30.52 | 48 | | [sentence-transformers/paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | 15.81 | 34.38 | 22.00 | 35.96 | 36.39 | 48.43 | 32.16 | 49 | | [T-Systems-onsite/cross-en-de-roberta-sentence-transformer](https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer) | 12.69 | 30.81 | 10.94 | 23.50 | 27.98 | 33.01 | 23.16 | 50 | | [sentence-transformers/use-cmlm-multilingual](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) | 15.24 | 29.63 | 25.64 | 37.10 | 33.62 | 49.70 | 31.82 | 51 | | [sentence-transformers/sentence-t5-base](https://huggingface.co/sentence-transformers/sentence-t5-base) | 11.57 | 30.59 | 18.11 | **44.88** | 31.99 | 45.80 | 30.49 | 52 | | [sentence-transformers/sentence-t5-xxl](https://huggingface.co/sentence-transformers/sentence-t5-xxl) | **15.94** | **39.91** | 19.69 | 43.43 | **38.54** | **55.90** | **35.57** | 53 | | [xlm-roberta-large](https://huggingface.co/xlm-roberta-large) | 7.29 | 29.84 | 6.16 | 32.46 | 10.19 | 23.50 | 18.24 | 54 | 55 | ### Additional clustering algorithms 56 | In addition to [k-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html), we evaluate the following clustering algorithms: 57 | - [Agglomerative clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn-cluster-agglomerativeclustering) (distance-based, number of clusters is assumed) 58 | - [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/) (density-based, number of clusters is *not* assumed) 59 | - [DBSTREAM](https://riverml.xyz/dev/api/cluster/DBSTREAM/#dbstream) (streaming algorithm, number of clusters is *not* assumed) 60 | 61 | Inspired by [BERTopic](https://github.com/MaartenGr/BERTopic), we also evaluate [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn-decomposition-pca) and [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html) as a "preprocessing" step.
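A minimal sketch of such a dimensionality-reduction + clustering combination, mirroring what `scripts/FlexibleClusteringEvaluator.py` does (the embeddings and labels below are random stand-ins for illustration; in the benchmark they come from a sentence-transformer model and a test split):

```python
import hdbscan
import numpy as np
import umap
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import v_measure_score
from sklearn.pipeline import make_pipeline

rng = np.random.default_rng(42)
embeddings = rng.normal(size=(500, 768)).astype("float32")  # stand-in for sentence embeddings
labels = rng.integers(0, 10, size=500)                      # stand-in for gold classes

# "pca+umap" preprocessing: PCA down to 50 dimensions, then UMAP with cosine distance
reducer = make_pipeline(PCA(n_components=50), umap.UMAP(metric="cosine"))
reduced = reducer.fit_transform(embeddings)

# k-means is told the number of clusters, HDBSCAN determines it on its own
kmeans = MiniBatchKMeans(n_clusters=len(set(labels)), batch_size=500).fit(reduced)
print(v_measure_score(labels, kmeans.labels_))

clusterer = hdbscan.HDBSCAN().fit(reduced)
print(v_measure_score(labels, clusterer.labels_))
```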
62 | 63 | *If you want to add/evaluate more algorithms, please have a look at [FlexibleClusteringEvaluator.py](https://github.com/ClimSocAna/tecb-de/blob/main/scripts/FlexibleClusteringEvaluator.py) to see how to achieve that.* 64 | 65 | 66 | ### UMAP + {k-means, HDBSCAN, DBSTREAM} 67 | *For all results, have a look at [results/tecb-de-full-results.csv](https://github.com/ClimSocAna/tecb-de/blob/3364f94faba7b235c7498a2bb724324064ac4537/results/tecb-de-full-results.csv).* 68 | 69 | | **Model** | **Algorithm** | **BlurbsClusteringS2S** | **BlurbsClusteringP2P** | **10KGNADClusteringS2S** | **10KGNADClusteringP2P** | **RedditClusteringS2S** | **RedditClusteringP2P** | **AVG** | 70 | |------------|----------------------------------|------------------------:|------------------------:|-------------------------:|-------------------------:|--------:|--------:|--------:| 71 | | [deepset/gbert-base](https://huggingface.co/deepset/gbert-base) | k-means
HDBSCAN
DBSTREAM | 12.81
14.31
12.70 | **38.81**
22.83
37.06 | 29.31
05.44
**28.92** | 43.61
32.45
42.74 | 31.77
17.21
31.70 | 46.06
31.99
44.84 | 33.73
20.71
**32.99** | 72 | | [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) | k-means
HDBSCAN
DBSTREAM | 13.80
**20.00**
15.77 | 34.16
30.67
32.47 | 25.22
24.90
26.44 | **43.75**
35.53
41.31 | 32.64
32.23
**33.14** | **47.46**
39.70
46.47 | 32.84
30.51
32.60 | 73 | 74 | 75 | ## Installation 76 | If you want to run the code from this repository (for creating the Reddit datasets or model evaluation), clone it and move to the downloaded folder 77 | 78 | ``` 79 | git clone https://github.com/ClimSocAna/tecb-de.git 80 | ``` 81 | 82 | and create a new environment with the necessary packages 83 | ``` 84 | python -m venv tecb-de 85 | 86 | source tecb-de/bin/activate # Linux, MacOS 87 | tecb-de\Scripts\activate.bat # Windows 88 | 89 | pip install -r requirements.txt 90 | ``` 91 | 92 | ## Usage 93 | ### Running the evaluation 94 | Simply run `python scripts/run_cteb_de.py`. This will produce a `results` folder. You can modify the script to run the evaluation for models and clustering algorithms (and configurations) of your choosing. 95 | 96 | ### Including the Reddit dataset 97 | If you want to use the Reddit datasets, you first have to download the data 98 | ``` 99 | # move to the reddit_data folder in tecb-de 100 | # make sure you have PRAW and tqdm installed: pip install praw tqdm 101 | 102 | # downloads the data and saves it to submissions.tsv 103 | python download_data.py 104 | ``` 105 | Note that for this to work, you have to edit `reddit_data/praw.ini` with your client data. You can find instructions [here](https://praw.readthedocs.io/en/stable/getting_started/authentication.html). 106 | 107 | Then you can create the datasets 108 | ``` 109 | # creates the splits for both tasks (RedditClusteringS2S and RedditClusteringP2P) 110 | # and saves them in the reddit_data folder 111 | python extract_splits.py 112 | ``` 113 | 114 | Finally, you can run the evaluation using the `--include-reddit` flag 115 | ``` 116 | # assuming you are in the top-level folder 117 | python scripts/run_cteb_de.py --include-reddit 118 | ``` 119 | 120 | ### Adaptive pre-training 121 | If you want to experiment with adaptive pre-training, you can have a look at `scripts/run_fine_tuned_cteb_de.py`. Basically, it allows you to train models using whole word masking (WWM) and [TSDAE](https://arxiv.org/abs/2104.06979) and to evaluate on the clustering tasks during training.
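As a rough orientation, here is a stripped-down TSDAE sketch distilled from that script (the training corpus is a placeholder; the full script additionally wires in seeding, a clustering evaluator, and checkpointing):

```python
from sentence_transformers import SentenceTransformer, datasets, losses
from torch.utils.data import DataLoader

# placeholder corpus; the script instead uses the unique sentences of a benchmark task
train_sentences = ["Ein Beispielsatz.", "Noch ein Beispielsatz.", "Etwas ganz anderes."]

model_name = "deepset/gbert-base"
model = SentenceTransformer(model_name)

# TSDAE: reconstruct the original sentence from a noisy (word-dropped) version of it
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
train_loss = losses.DenoisingAutoEncoderLoss(
    model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,  # the script trains for 30 epochs and evaluates clustering every 512 steps
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
    weight_decay=0,
    show_progress_bar=True,
)
```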
122 | 123 | ## Citation 124 | If you make use of this [work](https://aclanthology.org/2023.konvens-main.20/), please cite: 125 | ``` 126 | @inproceedings{wehrli-etal-2023-german, 127 | title = "{G}erman Text Embedding Clustering Benchmark", 128 | author = "Wehrli, Silvan and 129 | Arnrich, Bert and 130 | Irrgang, Christopher", 131 | editor = "Georges, Munir and 132 | Herygers, Aaricia and 133 | Friedrich, Annemarie and 134 | Roth, Benjamin", 135 | booktitle = "Proceedings of the 19th Conference on Natural Language Processing (KONVENS 2023)", 136 | month = sep, 137 | year = "2023", 138 | address = "Ingolstadt, Germany", 139 | publisher = "Association for Computational Lingustics", 140 | url = "https://aclanthology.org/2023.konvens-main.20", 141 | pages = "187--201", 142 | } 143 | ``` 144 | 145 | -------------------------------------------------------------------------------- /results/tecb-de-full-results.csv: -------------------------------------------------------------------------------- 1 | transformer,"clustering 2 | algorithm","dimensionality 3 | reduction",BlurbsClusteringS2S,BlurbsClusteringSP2P,10kGNADClusteringS2S,10kGNADClusteringP2P,RedditClusteringS2S,RedditClusteringP2P 4 | GBERT-base,minibatch_kmeans,-,0.112671049316939,0.353583482102813,0.242344552391346,0.371566057467428,0.285676283561016,0.353022032105768 5 | GBERT-base,minibatch_kmeans,pca,0.0777671041805759,0.282103942265806,0.161621841318318,0.223432937769083,0.116485294575606,0.148203164549033 6 | GBERT-base,minibatch_kmeans,umap,0.128132315578683,0.388111794621666,0.293149898962799,0.436132825338465,0.317563157505019,0.460595202703818 7 | GBERT-base,minibatch_kmeans,pca+umap,0.130091638633461,0.381730338558007,0.288475293664304,0.40041151901687,0.317337410195825,0.409005177020971 8 | GBERT-base,agglomerative,-,0.122555007222791,0.365500728379183,0.233698977687029,0.390790344115844,0.29663587891276,0.377253253266381 9 | GBERT-base,agglomerative,pca,0.0764838065513696,0.281541533230531,0.156090401197698,0.215749839242043,0.115666322019327,0.14788684839578 10 | GBERT-base,agglomerative,umap,0.128414238366791,0.386473044041221,0.284943294898888,0.432163776799657,0.316535914208462,0.460153958922544 11 | GBERT-base,agglomerative,pca+umap,0.135298949874227,0.378635201892333,0.286346037071778,0.406316649434935,0.31901894616499,0.41342137179276 12 | GBERT-base,hdbscan,pca,0.092841455320237,0.121429731876474,0.0878780147634584,0.112088120454238,0.14268405884612,0.17842937399194 13 | GBERT-base,hdbscan,umap,0.143147353990695,0.228356662475019,0.0543865219456264,0.324525869290628,0.172064090037639,0.319903379129749 14 | GBERT-base,hdbscan,pca+umap,0.145646923652315,0.242303154361018,0.162201771418459,0.352124570489061,0.217985362145401,0.198177856655754 15 | GBERT-base,dbstream,pca,0.117158951149945,0.261506054193205,0.162466842655547,0.218331474577554,0.123380259546021,0.151373101721904 16 | GBERT-base,dbstream,umap,0.126968078297751,0.370571106215641,0.289172894805377,0.427386392167265,0.316957936467504,0.448401802063983 17 | GBERT-base,dbstream,pca+umap,0.135526610923928,0.363080746655188,0.292474309841386,0.412041561853931,0.312073808060135,0.402044907553444 18 | GBERT-large,minibatch_kmeans,-,0.133771100864799,0.392968164291731,0.349667381745043,0.416860247351736,0.344661288673389,0.446109745930674 19 | GBERT-large,minibatch_kmeans,pca,0.100443756439491,0.308775265057465,0.246710867213275,0.281775578873874,0.134243982216329,0.171871809632161 20 | 
GBERT-large,minibatch_kmeans,umap,0.156253205331452,0.423950492830575,0.407429584861096,0.483115805893804,0.412444628594533,0.53406057923917 21 | GBERT-large,minibatch_kmeans,pca+umap,0.16155232224427,0.420486875135654,0.422991965269453,0.470110554797457,0.408114696584046,0.515781058414509 22 | GBERT-large,agglomerative,-,0.15346046365093,0.417414864554982,0.357465069383771,0.455382211806237,0.371736021694433,0.476017771864391 23 | GBERT-large,agglomerative,pca,0.100496751158374,0.312347273528722,0.242867854737821,0.278716878244411,0.133246053268639,0.171189869000204 24 | GBERT-large,agglomerative,umap,0.156750709690904,0.425642820144678,0.414465348015092,0.499101116065403,0.417677737736199,0.545820076431979 25 | GBERT-large,agglomerative,pca+umap,0.162417052178325,0.418969823081815,0.416535509533656,0.483820186361541,0.413522308477915,0.520477794301093 26 | GBERT-large,hdbscan,pca,0.0688053584191239,0.131955201072378,0.0866870186629883,0.17368387849622,0.170946875457958,0.172281670663991 27 | GBERT-large,hdbscan,umap,0.164851651926161,0.302589576982622,0.30047817241985,0.38842013121876,0.253130648500744,0.319903379129749 28 | GBERT-large,hdbscan,pca+umap,0.160635359866094,0.348198697135544,0.233143414646736,0.385744616169722,0.276748823137632,0.435959870184842 29 | GBERT-large,dbstream,pca,0.106698549211481,0.295978705611152,0.234910103348833,0.276880781462188,0.136330874897316,0.17605394161897 30 | GBERT-large,dbstream,umap,0.159465218791482,0.406349632716036,0.397319423736426,0.468089123992303,0.408302307486785,0.525689818695595 31 | GBERT-large,dbstream,pca+umap,0.166484777774721,0.40287889845942,0.392131715328795,0.454491213561863,0.397626487765563,0.502095687897378 32 | GELECTRA-base,minibatch_kmeans,-,0.0773802653856308,0.100585204166098,0.0411202271800372,0.0901582074000665,0.0659322534895928,0.0773094974486189 33 | GELECTRA-base,minibatch_kmeans,pca,0.070499734843759,0.0733878212922891,0.0398844118151661,0.0837270797322409,0.0462823492581079,0.0628378246232701 34 | GELECTRA-base,minibatch_kmeans,umap,0.0832639892802492,0.150989178222435,0.0368744634793366,0.126632031872035,0.0757934402337694,0.0867223381899473 35 | GELECTRA-base,minibatch_kmeans,pca+umap,0.0821402843835579,0.111147670127865,0.0439943667547346,0.099359124202817,0.0688833827432366,0.0756486802686404 36 | GELECTRA-base,agglomerative,-,0.0805738718522408,0.100437845777627,0.0415656710931226,0.0938546916920069,0.0700148688131128,0.0758744191728137 37 | GELECTRA-base,agglomerative,pca,0.0689162175103905,0.0703669587491154,0.0388949809210833,0.0812604389117952,0.0460949752690999,0.0628422944868003 38 | GELECTRA-base,agglomerative,umap,0.0829724184211444,0.147433226414405,0.037138338638813,0.134257261521715,0.0773489329783762,0.0866563731090761 39 | GELECTRA-base,agglomerative,pca+umap,0.0828431480521785,0.113517957291586,0.0451961770136774,0.102051956539211,0.0705261969672755,0.0770632640910475 40 | GELECTRA-base,hdbscan,pca,0.0795215157301383,0.0681243106140609,0.0573689275392659,0.0614447918758936,0.134271051369632,0.15047581912456 41 | GELECTRA-base,hdbscan,umap,0.0803815035008141,0.105646071345495,0.0214482012053893,0.0158125145229272,0.0715719345933788,0.0897987629752991 42 | GELECTRA-base,hdbscan,pca+umap,0.0603676964248639,0.0873807761634588,0.0125080425581685,0.0222740008863571,0.0917062282030941,0.116773941216348 43 | GELECTRA-base,dbstream,pca,0.0747123404435195,0.0490477571830085,0.0493326702365123,0.0895059618949814,0.0503590995282146,0.0658460594781778 44 | 
GELECTRA-base,dbstream,umap,0.0812069360094833,0.144200548562882,0.0537496715910561,0.165618198863684,0.0844168634808037,0.0946369012953892 45 | GELECTRA-base,dbstream,pca+umap,0.0832343762177447,0.123659678514983,0.0596532877599249,0.137048073813919,0.0801875071892082,0.0862277466845775 46 | GELECTRA-large,minibatch_kmeans,-,0.0756562949711355,0.139649776313644,0.0391456840437532,0.114856819040429,0.0758819635082978,0.105449787326776 47 | GELECTRA-large,minibatch_kmeans,pca,0.0690597768133245,0.100969510677138,0.0382481011456329,0.096978027986081,0.0476178361184982,0.0725538725129464 48 | GELECTRA-large,minibatch_kmeans,umap,0.0797416843974231,0.14995345938628,0.0428616810166543,0.129387587119327,0.0786477675730954,0.100098610516064 49 | GELECTRA-large,minibatch_kmeans,pca+umap,0.0778985275607628,0.148094534491185,0.0429929292565679,0.129223718235341,0.0807834894512504,0.101722519357038 50 | GELECTRA-large,agglomerative,-,0.0794046738058272,0.144175117529874,0.036954786402055,0.113359827010857,0.0808854297797115,0.101090516583789 51 | GELECTRA-large,agglomerative,pca,0.0676768420972781,0.0997378573724905,0.0364053640599704,0.0957223001611836,0.0484190086354535,0.0731042975964091 52 | GELECTRA-large,agglomerative,umap,0.0800819728660301,0.150797893634037,0.0423621116101067,0.134342915078929,0.0809906315914207,0.101013587651946 53 | GELECTRA-large,agglomerative,pca+umap,0.0795740246073101,0.150844299339887,0.0429223185369049,0.13224703421997,0.0819430797767072,0.102097736541606 54 | GELECTRA-large,hdbscan,pca,0.0850417837907285,0.0951759874223363,0.0419533905566293,0.0605897439215855,0.0699755894445285,0.139432001736411 55 | GELECTRA-large,hdbscan,umap,0.0685202308843401,0.124137654104527,0.0216641106966775,0.039247795635016,0.0818448120576973,0.0802031392817206 56 | GELECTRA-large,hdbscan,pca+umap,0.0570058537366616,0.133192871612496,0.0205543208434148,0.0167069179709859,0.0637752742196379,0.0830144614433586 57 | GELECTRA-large,dbstream,pca,0.0957672356104231,0.0864211123476051,0.0585879181326365,0.10479185085122,0.060389512217864,0.0781944401467215 58 | GELECTRA-large,dbstream,umap,0.0781451796558462,0.140578476542862,0.0581150858127453,0.151993671575655,0.0867163416866888,0.106302135013381 59 | GELECTRA-large,dbstream,pca+umap,0.0795784065932305,0.140536357474925,0.0580540290268102,0.150899417299781,0.0860368355559608,0.107050532468636 60 | GottBERT,minibatch_kmeans,-,0.0837044646117393,0.34485091823134,0.0934176142975861,0.336570384849751,0.160661859855629,0.194636791866662 61 | GottBERT,minibatch_kmeans,pca,0.068051687111486,0.260148765123045,0.0386391242699837,0.18466914757721,0.0667620484203464,0.0216689797288547 62 | GottBERT,minibatch_kmeans,umap,0.098948086568718,0.384135061654815,0.156570290402716,0.456459404652668,0.190989379873784,0.349863795382876 63 | GottBERT,minibatch_kmeans,pca+umap,0.0996830939741513,0.367756163474323,0.153183411536109,0.419939942912584,0.173617694400565,0.226030592014639 64 | GottBERT,agglomerative,-,0.100965058637155,0.357053102548251,0.109507995279931,0.361436147241082,0.186460317717443,0.239189611017973 65 | GottBERT,agglomerative,pca,0.0671109123925713,0.259060982597514,0.0351029542411723,0.180102166777926,0.0669233668519784,0.0962431574687843 66 | GottBERT,agglomerative,umap,0.0995300270666096,0.37943546066005,0.146403253052514,0.447852339753709,0.196918284264601,0.349092060755467 67 | GottBERT,agglomerative,pca+umap,0.100446731958692,0.374259090061671,0.14835085955226,0.424651830004859,0.172546473294592,0.227011409651262 68 | 
GottBERT,hdbscan,pca,0.0744033072554521,0.143314883231155,0.0477826085962066,0.117618424072631,0.13637429350962,0.161942551115745 69 | GottBERT,hdbscan,umap,0.105805079879835,0.206115307193552,0.0425455018104545,0.272757736302706,0.169065873235678,0.209252753882957 70 | GottBERT,hdbscan,pca+umap,0.109761131518183,0.198171542304016,0.0150279535499423,0.284081039265491,0.131515868624308,0.160698074901665 71 | GottBERT,dbstream,pca,0.0652861036486094,0.248920864808596,0.0429044677961951,0.172668458905421,0.0677674958104433,0.0968130165380901 72 | GottBERT,dbstream,umap,0.0921964865954673,0.363893050252978,0.183648502009858,0.425414468569382,0.199013810541506,0.339471877055011 73 | GottBERT,dbstream,pca+umap,0.102584996618217,0.350788940697117,0.187715019275042,0.40893035812329,0.176815805091496,0.229809023262869 74 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,-,0.126922500216336,0.308182108428217,0.109383325261002,0.235009855126727,0.279809048751092,0.330123183048337 75 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,pca,0.101889909333311,0.241805177060928,0.0597552027128782,0.143614178383154,0.130708685936294,0.115293414189967 76 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,umap,0.139959761599388,0.352253942462203,0.170335032943622,0.419897355645651,0.342321952129765,0.434055874735054 77 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,pca+umap,0.137707954971196,0.327624828932546,0.131777754994125,0.358844279058356,0.311601921374999,0.390146779328902 78 | cross-en-de-roberta-sentence-transformer,agglomerative,-,0.134736552840553,0.317603548274424,0.116231901966189,0.287051113571006,0.290739229022921,0.359270566865045 79 | cross-en-de-roberta-sentence-transformer,agglomerative,pca,0.101441749756072,0.241938363962351,0.0582658196433544,0.140378409327646,0.129762926565247,0.11380776389892 80 | cross-en-de-roberta-sentence-transformer,agglomerative,umap,0.14238737023567,0.349198972050411,0.164938400738108,0.413533593465502,0.344095484305058,0.432259404463818 81 | cross-en-de-roberta-sentence-transformer,agglomerative,pca+umap,0.138511189632746,0.334581339480389,0.129400903997604,0.364054155686124,0.313701616589672,0.387463111351893 82 | cross-en-de-roberta-sentence-transformer,hdbscan,pca,0.120963497089993,0.168990060915356,0.0300083321630728,0.0434155029160883,0.166376697915542,0.133542021450355 83 | cross-en-de-roberta-sentence-transformer,hdbscan,umap,0.190105444179433,0.174899427159922,0.191232709480606,0.187535783156795,0.323913776293285,0.2397046537872 84 | cross-en-de-roberta-sentence-transformer,hdbscan,pca+umap,0.180613178895982,0.197241556801549,0.167758693775905,0.28666990860315,0.301241449075686,0.19257281426062 85 | cross-en-de-roberta-sentence-transformer,dbstream,pca,0.0883563768649365,0.221087792098191,0.067723480387753,0.151692351164815,0.130229408885868,0.114899146229848 86 | cross-en-de-roberta-sentence-transformer,dbstream,umap,0.14548254652485,0.329144468311907,0.184334253008602,0.385416082187906,0.341173927808671,0.422533044940713 87 | cross-en-de-roberta-sentence-transformer,dbstream,pca+umap,0.141861757546557,0.312588998213157,0.15008447777113,0.34939155383041,0.307636704091922,0.380199089343415 88 | paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,-,0.158123291626306,0.343757380287587,0.220025131753068,0.359641464232561,0.363883363255256,0.484310751591357 89 | 
paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,pca,0.125896791552805,0.273244279623173,0.170173986207635,0.257857889611419,0.202982120072103,0.276100300730128 90 | paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,umap,0.151443842892341,0.355880613392402,0.281772388405197,0.458497179011279,0.36697526415137,0.509440343971668 91 | paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,pca+umap,0.156507916618269,0.352779308405236,0.267460936265054,0.457806763220476,0.374216472427233,0.0356072494262668 92 | paraphrase-multilingual-mpnet-base-v2,agglomerative,-,0.157046014794122,0.340099873422921,0.231485210022495,0.39278421349265,0.338707270361313,0.469328836409909 93 | paraphrase-multilingual-mpnet-base-v2,agglomerative,pca,0.122447138811386,0.267452330792705,0.164009842079526,0.252186755920444,0.201879465115538,0.275048671029251 94 | paraphrase-multilingual-mpnet-base-v2,agglomerative,umap,0.154301708800618,0.354311913879471,0.283234851207097,0.459723984972906,0.361890505633581,0.513401182650167 95 | paraphrase-multilingual-mpnet-base-v2,agglomerative,pca+umap,0.156173947197582,0.356721558069448,0.266927652058493,0.45604623892775,0.37598851849118,0.515899565007311 96 | paraphrase-multilingual-mpnet-base-v2,hdbscan,pca,0.127035310941449,0.141762226418964,0.0818351026043876,0.0733421112001386,0.135266821942914,0.214931844385903 97 | paraphrase-multilingual-mpnet-base-v2,hdbscan,umap,0.201236395432702,0.274649493406005,0.260983527433073,0.360823558170628,0.3337765900697,0.417602492915373 98 | paraphrase-multilingual-mpnet-base-v2,hdbscan,pca+umap,0.202344394697466,0.262882064874875,0.260146676816762,0.36366036232013,0.336973232402841,0.39034305879441 99 | paraphrase-multilingual-mpnet-base-v2,dbstream,pca,0.0849065598349565,0.22880722218876,0.135534308643385,0.206324391024045,0.191258191065933,0.252781816917832 100 | paraphrase-multilingual-mpnet-base-v2,dbstream,umap,0.166980199469946,0.342111226742814,0.288731030116555,0.423665080926009,0.364034005702961,0.498943886937568 101 | paraphrase-multilingual-mpnet-base-v2,dbstream,pca+umap,0.172956188862994,0.339471333903835,0.283070859375078,0.422958088887883,0.369348873755288,0.499738008422725 102 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,-,0.143270359549276,0.324644806309516,0.222630823956282,0.361341840550912,0.333360860095429,0.44591558586307 103 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,pca,0.113823511279356,0.262694275694776,0.156415191235443,0.22762359752661,0.195508336363258,0.258826217792971 104 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,umap,0.137999493541642,0.341580795289891,0.252211317772711,0.437469491596867,0.326359774702181,0.474564540394354 105 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,pca+umap,0.143009218184408,0.336986580271545,0.248742508388581,0.435084335125087,0.336444460531342,0.474902003632741 106 | paraphrase-multilingual-MiniLM-L12-v2,agglomerative,-,0.146536382523302,0.317431549139208,0.213014658008777,0.380760237973582,0.300456488748825,0.428889617479689 107 | paraphrase-multilingual-MiniLM-L12-v2,agglomerative,pca,0.115779352365279,0.256410239991004,0.15032021458738,0.224178693769346,0.195123755282815,0.258506724848499 108 | paraphrase-multilingual-MiniLM-L12-v2,agglomerative,umap,0.139682521581287,0.342734047702612,0.246290466453738,0.442441125052441,0.324053238239874,0.474193210063386 109 | 
paraphrase-multilingual-MiniLM-L12-v2,agglomerative,pca+umap,0.145008875778656,0.340618821131352,0.247826494356137,0.43909161580558,0.339124182809281,0.475158193791564 110 | paraphrase-multilingual-MiniLM-L12-v2,hdbscan,pca,0.106742407287862,0.155525989691692,0.0995948924699086,0.125624599957583,0.188164326875565,0.19045868597509 111 | paraphrase-multilingual-MiniLM-L12-v2,hdbscan,umap,0.198883103430658,0.306704774323358,0.248962542645798,0.355306608281706,0.322269995394452,0.397043077631587 112 | paraphrase-multilingual-MiniLM-L12-v2,hdbscan,pca+umap,0.19673681182096,0.283696920569254,0.245202118373034,0.337511019460207,0.320714118485378,0.362949368319386 113 | paraphrase-multilingual-MiniLM-L12-v2,dbstream,pca,0.0987312265630075,0.243655986909509,0.150418611013133,0.222556064613291,0.197586573793126,0.253777193556048 114 | paraphrase-multilingual-MiniLM-L12-v2,dbstream,umap,0.157660274772465,0.324704812605918,0.264405938038737,0.41305520382289,0.331401225159257,0.464706108218722 115 | paraphrase-multilingual-MiniLM-L12-v2,dbstream,pca+umap,0.163624915072092,0.322838184260603,0.258488811106791,0.408413966317955,0.337677315573374,0.460363639591615 116 | use-cmlm-multilingual,minibatch_kmeans,-,0.152393753822012,0.296268068879049,0.256400687832879,0.371041645369963,0.336201098358395,0.497047679337723 117 | use-cmlm-multilingual,minibatch_kmeans,pca,0.116494579302846,0.238005376737296,0.193574697899891,0.202009001379832,0.165994886403545,0.196308371589102 118 | use-cmlm-multilingual,minibatch_kmeans,umap,0.14985167015025,0.327247085116927,0.32157530808599,0.456417606855682,0.379542193966009,0.546814043563995 119 | use-cmlm-multilingual,minibatch_kmeans,pca+umap,0.15510483255276,0.315655367262148,0.288724564321235,0.471530107598779,0.337027189391226,0.536756135865593 120 | use-cmlm-multilingual,agglomerative,-,0.146547015429232,0.283304677098585,0.231158809846395,0.398723051831382,0.336453169331313,0.483293138708737 121 | use-cmlm-multilingual,agglomerative,pca,0.11609266255309,0.235870451509495,0.188140806042258,0.202159352500591,0.165035923826414,0.194792598883006 122 | use-cmlm-multilingual,agglomerative,umap,0.152976721976694,0.325475611432752,0.316539957617672,0.454053297196607,0.378616757107362,0.550820533259835 123 | use-cmlm-multilingual,agglomerative,pca+umap,0.154833736233735,0.317824528936831,0.279209048894811,0.471933143162109,0.33932457568417,0.541088159859084 124 | use-cmlm-multilingual,hdbscan,pca,0.119097059058545,0.11786982294329,0.103759125713307,0.0918062786214231,0.193351879195506,0.144941464960226 125 | use-cmlm-multilingual,hdbscan,umap,0.174974232413532,0.239241093961519,0.138601805587234,0.325121936727263,0.348007320884774,0.446376397150588 126 | use-cmlm-multilingual,hdbscan,pca+umap,0.177388443591204,0.242643170981103,0.259199234914231,0.332705297333893,0.274873156514591,0.447342824463761 127 | use-cmlm-multilingual,dbstream,pca,0,0,0,0,0.0154841696264087,0 128 | use-cmlm-multilingual,dbstream,umap,0.153740540810348,0.316003890266619,0.312504571317317,0.42524666601542,0.369821053977469,0.53586992704974 129 | use-cmlm-multilingual,dbstream,pca+umap,0.155561936402515,0.308245103579314,0.292175461902841,0.422363422904517,0.335584315848073,0.521365277702398 130 | sentence-t5-base,minibatch_kmeans,-,0.115675300517631,0.305921599517768,0.181057125760787,0.448832789492342,0.319928501316492,0.458018030992708 131 | sentence-t5-base,minibatch_kmeans,pca,0.0915559352199022,0.262675084843275,0.129879358918697,0.349841259278436,0.135629905100003,0.233550909602233 132 | 
sentence-t5-base,minibatch_kmeans,umap,0.130772330756021,0.324953989590045,0.227648303681791,0.44739162490236,0.362103490766273,0.511945757865168 133 | sentence-t5-base,minibatch_kmeans,pca+umap,0.12203323705175,0.311633187208728,0.208373717494624,0.462246733586854,0.335873376538146,0.472643096636938 134 | sentence-t5-base,agglomerative,-,0.12764498347471,0.302068577986526,0.185381996953064,0.43556037724739,0.311204249694432,0.452394731219337 135 | sentence-t5-base,agglomerative,pca,0.0913065210825795,0.259159854414581,0.122216723506138,0.347369601581619,0.13472269401308,0.232663726002279 136 | sentence-t5-base,agglomerative,umap,0.127418551941871,0.323204936462978,0.224144638401092,0.455177461345823,0.365557049191391,0.517477292664523 137 | sentence-t5-base,agglomerative,pca+umap,0.123820570828254,0.312173952185688,0.206866039729986,0.47228531651795,0.338438708778302,0.475773508642492 138 | sentence-t5-base,hdbscan,pca,0.0923035199181703,0.156712135550618,0.0845452428117134,0.174146206497858,0.138170198046753,0.203538467502638 139 | sentence-t5-base,hdbscan,umap,0.179520552043039,0.251778153082897,0.217615526417385,0.356194725742828,0.336027474130198,0.414066493440279 140 | sentence-t5-base,hdbscan,pca+umap,0.17138933736827,0.17239407272415,0.199116397327886,0.37587963127474,0.241371176314441,0.406802441369697 141 | sentence-t5-base,dbstream,pca,0,0,0,0,0,0 142 | sentence-t5-base,dbstream,umap,0.135375043482475,0.31261482838458,0.227727385718491,0.430964861023771,0.358516370578722,0.0312831192054188 143 | sentence-t5-base,dbstream,pca+umap,0.132470514631532,0.304373714095415,0.217329018477362,0.436990434223905,0.330473523993147,0.470442875694425 144 | sentence-t5-large,minibatch_kmeans,-,0.132674364827593,0.353313857300341,0.17264312500235,0.441123059571999,0.353669393966877,0.516390444915677 145 | sentence-t5-large,minibatch_kmeans,pca,0.108177015006833,0.284629725184563,0.091824478080705,0.315681635568616,0.145708485497716,0.257067009080023 146 | sentence-t5-large,minibatch_kmeans,umap,0.143984669694599,0.372837780183789,0.242942125966471,0.457481102325703,0.39403518632197,0.554087132962702 147 | sentence-t5-large,minibatch_kmeans,pca+umap,0.141606930306109,0.367488455588896,0.22584054081795,0.479036853190647,0.366268845186034,0.536285328781896 148 | sentence-t5-large,agglomerative,-,0.141097653445106,0.355376300274668,0.199683300770897,0.439790230192988,0.341605348522922,0.510222005967502 149 | sentence-t5-large,agglomerative,pca,0.107374768508383,0.283706380410588,0.0871744621056125,0.313753178808346,0.143654069788621,0.254513149214071 150 | sentence-t5-large,agglomerative,umap,0.0870710234486703,0.373039424149989,0.240889346643162,0.463332678124837,0.396569004688373,0.563207299963289 151 | sentence-t5-large,agglomerative,pca+umap,0.146214706759965,0.367876820623957,0.217975900531938,0.481872766836543,0.371180890700693,0.541086434437474 152 | sentence-t5-large,hdbscan,pca,0.106529816725044,0.179689828701708,0.0619063851451809,0.107428154931518,0.139674678670881,0.185251335632262 153 | sentence-t5-large,hdbscan,umap,0.190838542453084,0.274027306372176,0.231716132081408,0.378559186655157,0.344199446128254,0.449320319359473 154 | sentence-t5-large,hdbscan,pca+umap,0.180387595658813,0.176266124434699,0.0119189356311079,0.38435805139819,0.255262271093516,0.456274217164703 155 | sentence-t5-large,dbstream,pca,0,0,0,0,0,0 156 | sentence-t5-large,dbstream,umap,0.147369788244745,0.359191888215373,0.249041008925446,0.443325349491755,0.386819925270799,0.540183676275449 157 | 
sentence-t5-large,dbstream,pca+umap,0.148844242241185,0.350740951929464,0.235562611878174,0.448121255485833,0.360092686834314,0.522102675553645 158 | sentence-t5-xxl,minibatch_kmeans,-,0.15935051142789,0.399108862901951,0.196874386472812,0.434348084580018,0.385370978707388,0.558955647217913 159 | sentence-t5-xxl,minibatch_kmeans,pca,0.12570384699927,0.306572596380755,0.103692173864114,0.265053226841241,0.151195306846239,0.255351017507615 160 | sentence-t5-xxl,minibatch_kmeans,umap,0.174986786082349,0.40254267763501,0.315534120586959,0.459345136464197,0.440062162556857,0.589952812356259 161 | sentence-t5-xxl,minibatch_kmeans,pca+umap,0.170605295428964,0.407179982390405,0.275048929797589,0.467889923470059,0.396857453580994,0.591698543713776 162 | sentence-t5-xxl,agglomerative,-,0.158865040708109,0.385456940017055,0.237959927743163,0.442556380964094,0.373688181633057,0.549462971610934 163 | sentence-t5-xxl,agglomerative,pca,0.1253796781399,0.305435768121001,0.0995964032631627,0.262091575644706,0.149967880567176,0.254449947865279 164 | sentence-t5-xxl,agglomerative,umap,0.176141240235063,0.407671753893563,0.308957570419374,0.455741908863803,0.440150360519924,0.594349281058195 165 | sentence-t5-xxl,agglomerative,pca+umap,0.171582233571898,0.402478256574296,0.268773163808219,0.46602323058645,0.398394468821512,0.59181063988529 166 | sentence-t5-xxl,hdbscan,pca,0.112877823175006,0.175668626775163,0.0838045821791226,0.137295083932818,0.142228600698664,0.161371079983652 167 | sentence-t5-xxl,hdbscan,umap,0.183423585223045,0.314729331573393,0.217023269231727,0.382516958853735,0.335810997794907,0.47471979165273 168 | sentence-t5-xxl,hdbscan,pca+umap,0.197929483001782,0.291193111112434,0.229666031091625,0.364343935312214,0.329913032357581,0.487997454866726 169 | sentence-t5-xxl,dbstream,pca,0,0,0,0,0,0 170 | sentence-t5-xxl,dbstream,umap,0.171959893336347,0.384603415332465,0.311092826313863,0.441835249669936,0.427493180639225,0.570723989613225 171 | sentence-t5-xxl,dbstream,pca+umap,0.174645391590583,0.389184230256408,0.280893967439587,0.448626654853136,0.388972898797556,0.563125955132959 172 | XLM-RoBERTa-large,minibatch_kmeans,-,0.0728861946091863,0.298423391336534,0.0616183106143701,0.324574040090756,0.101917006949572,0.235017049723222 173 | XLM-RoBERTa-large,minibatch_kmeans,pca,0.0667658843045096,0.209413384159574,0.0311372569415547,0.150766398997943,0.040436730972003,0.118929367059747 174 | XLM-RoBERTa-large,minibatch_kmeans,umap,0.0864387783942206,0.350404086153387,0.105776865919053,0.456327945759019,0.134101392050016,0.356030791413997 175 | XLM-RoBERTa-large,minibatch_kmeans,pca+umap,0.0881057536722783,0.333455673409919,0.0966469382936431,0.41051126094098,0.115634835753774,0.0378167944039251 176 | XLM-RoBERTa-large,agglomerative,-,0.0854002509620479,0.317779172767462,0.075827172915677,0.34916468777658,0.131361361440265,0.290799046872692 177 | XLM-RoBERTa-large,agglomerative,pca,0.0658881486783091,0.206542571147759,0.031663043023735,0.149419344682624,0.0402264042491285,0.119386004502601 178 | XLM-RoBERTa-large,agglomerative,umap,0.0857636592722182,0.353799629400483,0.103986208208367,0.463823871963038,0.133662609040372,0.354871370123206 179 | XLM-RoBERTa-large,agglomerative,pca+umap,0.0882723295291705,0.330118708964256,0.0948692020431576,0.412716135639609,0.117394056942406,0.299242225288215 180 | XLM-RoBERTa-large,hdbscan,pca,0.0817370344914418,0.0976405115400701,0.0446446838193045,0.100375750191014,0.131276495367633,0.168521129173072 181 | 
XLM-RoBERTa-large,hdbscan,umap,0.106531300179375,0.172668458791544,0.0202942550256425,0.345458130906793,0.155062418947832,0.265007469196041 182 | XLM-RoBERTa-large,hdbscan,pca+umap,0.106109268730964,0.199335101548672,0.0119919301083495,0.312241245253147,0.12165550564456,0.163180565353472 183 | XLM-RoBERTa-large,dbstream,pca,0.0372369629650578,0.0672694846154926,0.0329614375735175,0.0528708808795484,0.0327779205845074,0.0916759930516592 184 | XLM-RoBERTa-large,dbstream,umap,0.0829279984660521,0.334595800525538,0.136469851639152,0.433921143881,0.144932674569599,0.351920884470888 185 | XLM-RoBERTa-large,dbstream,pca+umap,0.091275187106685,0.320110599891455,0.12869199062505,0.402554155784173,0.126074714902633,0.302206666584313 186 | --------------------------------------------------------------------------------