├── reddit_data ├── praw.ini ├── extract_splits.py └── download_data.py ├── requirements.txt ├── LICENSE ├── scripts ├── run_cteb_de.py ├── FlexibleClusteringEvaluator.py ├── ClusteringTasks.py └── run_fine_tuned_cteb_de.py ├── README.md └── results └── tecb-de-full-results.csv /reddit_data/praw.ini: -------------------------------------------------------------------------------- 1 | [DEFAULT] 2 | client_id= 3 | client_secret= 4 | user_agent=s -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | hdbscan==0.8.29 2 | mteb==1.0.1 3 | numba==0.56.4 4 | river==0.14.0 5 | sentence-transformers==2.2.2 6 | umap-learn==0.5.3 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Climate and Societal Analytics 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /reddit_data/extract_splits.py: -------------------------------------------------------------------------------- 1 | """Script to generate splits for benchmarking text embedding clustering. 
2 | Based on Reddit data as retrieved by the official Reddit API.""" 3 | import random 4 | 5 | import jsonlines 6 | import pandas as pd 7 | 8 | NUM_SPLITS = 10 9 | MIN_LABELS = 10 10 | MAX_LABELS = 50 11 | 12 | random.seed(42) 13 | 14 | 15 | def get_split(submissions, labels_mask, col_name): 16 | return ( 17 | submissions[labels_mask] 18 | .sample(frac=1.0) 19 | .rename(columns={col_name: "sentences"})[["sentences", "labels"]] 20 | .to_dict("list") 21 | ) 22 | 23 | 24 | def write_sets(name, sets): 25 | with jsonlines.open(name, "w") as f_out: 26 | f_out.write_all(sets) 27 | 28 | 29 | submissions = pd.read_csv("submissions.tsv", delimiter="\t") 30 | submissions.head() 31 | 32 | submissions = submissions.rename( 33 | columns={"title": "s2s", "selftext": "p2p", "subreddit": "labels"} 34 | ) 35 | submissions["p2p"] = submissions["s2s"] + " " + submissions["p2p"] 36 | 37 | subreddits = list(submissions["labels"].unique()) 38 | test_sets_s2s, test_sets_p2p = [], [] 39 | for _ in range(NUM_SPLITS): 40 | num_labels = random.randint(MIN_LABELS, MAX_LABELS) 41 | random.shuffle(subreddits) 42 | labels = subreddits[:num_labels] 43 | 44 | labels_mask = submissions.labels.isin(labels) 45 | test_sets_s2s.append(get_split(submissions, labels_mask, "s2s")) 46 | test_sets_p2p.append(get_split(submissions, labels_mask, "p2p")) 47 | 48 | write_sets("s2s_test.jsonl", test_sets_s2s) 49 | write_sets("p2p_test.jsonl", test_sets_p2p) 50 | -------------------------------------------------------------------------------- /reddit_data/download_data.py: -------------------------------------------------------------------------------- 1 | """Download Reddit submissions given a list of submission ids.""" 2 | import csv 3 | import re 4 | 5 | import praw 6 | import tqdm 7 | from tqdm import tqdm 8 | 9 | 10 | # based on: https://huggingface.co/datasets/sentence-transformers/reddit-title-body/blob/main/extraction_script/extract_title_selftext.py 11 | def clean_text(text): 12 | text = text.strip() 13 | text = re.sub(r"\[(.*)\]\(.*\)", "\g<1>", text) # Markdown 14 | text = re.sub( 15 | r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", 16 | "", 17 | text, 18 | ) # URLs 19 | return text 20 | 21 | 22 | # credentials must be specificed in praw.ini 23 | # read-only mode is enough but an application has to be registered on Reddit: https://praw.readthedocs.io/en/stable/getting_started/authentication.html 24 | reddit = praw.Reddit() 25 | 26 | ids = open("submission_ids.txt", "r").read().splitlines()[1:] 27 | 28 | submissions = [] 29 | for s in tqdm(reddit.info(fullnames=["t3_" + id_ for id_ in ids]), total=len(ids)): 30 | # ignore deleted or removed submissions (if submission or user is deleted) 31 | if s.selftext in ["[removed]", "[deleted]"]: 32 | continue 33 | 34 | title = clean_text(s.title) 35 | selftext = clean_text(s.selftext) 36 | submissions.append([s.id, title, selftext, s.subreddit]) 37 | 38 | with open("submissions.tsv", "w", encoding="utf8", newline="") as tsv_file: 39 | tsv_writer = csv.writer(tsv_file, delimiter="\t", lineterminator="\n") 40 | tsv_writer.writerow(["id", "title", "selftext", "subreddit"]) 41 | i = 0 42 | for row in submissions: 43 | i += 1 44 | tsv_writer.writerow(row) 45 | -------------------------------------------------------------------------------- /scripts/run_cteb_de.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import logging 3 | 4 | from ClusteringTasks import ( 5 | BlurbsClusteringP2P, 6 | 
BlurbsClusteringS2S, 7 | RedditClusteringP2P, 8 | RedditClusteringS2S, 9 | TenKGnadClusteringP2P, 10 | TenKGnadClusteringS2S, 11 | ) 12 | from mteb import MTEB 13 | from sentence_transformers import SentenceTransformer 14 | 15 | # logging.basicConfig(level=logging.INFO) 16 | 17 | base_tasks = [ 18 | BlurbsClusteringS2S, 19 | BlurbsClusteringP2P, 20 | TenKGnadClusteringS2S, 21 | TenKGnadClusteringP2P, 22 | ] 23 | 24 | # change task config here 25 | task_configs = [ 26 | {"dim_red": None, "clustering_alg": "minibatch_kmeans"}, 27 | {"dim_red": "pca", "clustering_alg": "minibatch_kmeans"}, 28 | {"dim_red": "umap", "clustering_alg": "minibatch_kmeans"}, 29 | {"dim_red": "pca+umap", "clustering_alg": "minibatch_kmeans"}, 30 | {"dim_red": None, "clustering_alg": "agglomerative"}, 31 | {"dim_red": "pca", "clustering_alg": "agglomerative"}, 32 | {"dim_red": "umap", "clustering_alg": "agglomerative"}, 33 | {"dim_red": "pca+umap", "clustering_alg": "agglomerative"}, 34 | {"dim_red": "pca", "clustering_alg": "hdbscan"}, 35 | {"dim_red": "umap", "clustering_alg": "hdbscan"}, 36 | {"dim_red": "pca+umap", "clustering_alg": "hdbscan"}, 37 | {"dim_red": "pca", "clustering_alg": "dbstream"}, 38 | {"dim_red": "umap", "clustering_alg": "dbstream"}, 39 | {"dim_red": "pca+umap", "clustering_alg": "dbstream"}, 40 | ] 41 | 42 | # change models here 43 | model_names = [ 44 | "deepset/gbert-base", 45 | "deepset/gbert-large", 46 | "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", 47 | "sentence-transformers/sentence-t5-xxl", 48 | ] 49 | 50 | 51 | def main(args: argparse.ArgumentParser): 52 | if args.include_reddit: 53 | base_tasks.extend([RedditClusteringS2S, RedditClusteringP2P]) 54 | for model_name in model_names: 55 | model = SentenceTransformer(model_name) 56 | evaluation = MTEB( 57 | tasks=[task(**config) for task in base_tasks for config in task_configs] 58 | ) 59 | evaluation.run(model, output_folder=f"results/{model_name.split('/')[-1]}") 60 | 61 | 62 | if __name__ == "__main__": 63 | parser = argparse.ArgumentParser() 64 | parser.add_argument("--include-reddit", action="store_true") 65 | 66 | args = parser.parse_args() 67 | main(args) 68 | -------------------------------------------------------------------------------- /scripts/FlexibleClusteringEvaluator.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import Optional 3 | 4 | import hdbscan 5 | import numpy as np 6 | import sklearn 7 | import umap 8 | from mteb.evaluation.evaluators.Evaluator import Evaluator 9 | from river import cluster, stream 10 | from sklearn.pipeline import make_pipeline 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | 15 | class DBSTREAMWrapper(cluster.DBSTREAM): 16 | """Wrapper for river.cluster.DBSTREAM so it can be used similarly to sklearn API. Wrapper is used to store 17 | online predictions in self.labels_ attribute (as in sklearn), i.e., any new predictions will be added to this attribute. 
18 | """ 19 | 20 | def __init__(self, compute_labels: bool = True, *args, **kwargs): 21 | super().__init__(*args, **kwargs) 22 | self.compute_labels = compute_labels 23 | self.labels_ = [] 24 | 25 | def _learn_many(self, X, sample_weights: Optional[list] = None): 26 | """Update model with multiple data points.""" 27 | sample_weights = [None] * len(X) if sample_weights is None else sample_weights 28 | for i, (x, _) in enumerate(stream.iter_array(X)): 29 | self.learn_one(x, sample_weight=sample_weights[i]) 30 | 31 | def _predict_many(self, X: np.array, sample_weights: Optional[list] = None) -> list: 32 | """Predict multiple data points.""" 33 | labels = [] 34 | sample_weights = [None] * len(X) if sample_weights is None else sample_weights 35 | for i, (x, _) in enumerate(stream.iter_array(X)): 36 | labels.append(self.predict_one(x, sample_weight=sample_weights[i])) 37 | return labels 38 | 39 | def fit(self, X: np.array, sample_weights: Optional[list] = None): 40 | """sklearn API logic to handle model updates.""" 41 | self._learn_many(X, sample_weights=sample_weights) 42 | if self.compute_labels: 43 | self.predict(X, sample_weights=sample_weights) 44 | return self 45 | 46 | def fit_predict(self, X: np.array, sample_weights: Optional[list] = None): 47 | """sklearn API logic to handle simultaneous model updates and predictions.""" 48 | self._learn_many(X, sample_weights=sample_weights) 49 | return self.predict(X, sample_weights=sample_weights) 50 | 51 | def predict(self, X: np.array, sample_weights: Optional[list] = None): 52 | """sklearn API logic to handle model predictions.""" 53 | self.labels_.extend(self._predict_many(X, sample_weights=sample_weights)) 54 | return self.labels_ 55 | 56 | 57 | class FlexibleClusteringEvaluator(Evaluator): 58 | def __init__( 59 | self, 60 | sentences: list[str], 61 | labels: list[int], 62 | clustering_alg: str = "minibatch_kmeans", 63 | clustering_params: Optional[dict] = None, 64 | dim_red: Optional[str] = None, 65 | dim_red_params: Optional[dict] = None, 66 | limit: Optional[int] = None, 67 | **kwargs, 68 | ): 69 | super().__init__(**kwargs) 70 | if limit is not None: 71 | sentences = sentences[:limit] 72 | labels = labels[:limit] 73 | self.sentences = sentences 74 | self.labels = labels 75 | 76 | if clustering_params is None: 77 | clustering_params = {} 78 | 79 | if dim_red_params is None: 80 | dim_red_params = {} 81 | 82 | nr_labels = len(set(self.labels)) 83 | if clustering_alg == "agglomerative": 84 | self.clustering_model = sklearn.cluster.AgglomerativeClustering( 85 | n_clusters=nr_labels, **clustering_params 86 | ) 87 | elif clustering_alg == "dbstream": 88 | self.clustering_model = DBSTREAMWrapper(**clustering_params) 89 | 90 | elif clustering_alg == "hdbscan": 91 | self.clustering_model = hdbscan.HDBSCAN(**clustering_params) 92 | 93 | elif clustering_alg == "minibatch_kmeans": 94 | if "batch_size" not in clustering_params: 95 | clustering_params["batch_size"] = 500 96 | if "n_init" not in clustering_params: 97 | clustering_params["n_init"] = "auto" 98 | self.clustering_model = sklearn.cluster.MiniBatchKMeans( 99 | n_clusters=nr_labels, **clustering_params 100 | ) 101 | else: 102 | raise ValueError("Option not implemented") 103 | 104 | if dim_red == "pca": 105 | self.dim_red_model = sklearn.decomposition.PCA( 106 | n_components=2, **dim_red_params 107 | ) 108 | elif dim_red == "umap": 109 | self.dim_red_model = umap.UMAP(metric="cosine", **dim_red_params) 110 | elif dim_red == "pca+umap": 111 | self.dim_red_model = make_pipeline( 112 | 
sklearn.decomposition.PCA(n_components=50), 113 | umap.UMAP(metric="cosine", **dim_red_params), 114 | ) 115 | elif dim_red is None: 116 | self.dim_red_model = None 117 | else: 118 | raise ValueError("Option not implemented") 119 | 120 | self.model_name = f"{dim_red}>{clustering_alg}" if dim_red else clustering_alg 121 | 122 | def __call__(self, model): 123 | logger.info(f"Encoding {len(self.sentences)} sentences...") 124 | corpus_embeddings = np.asarray(model.encode(self.sentences)) 125 | 126 | logger.info(f"Fitting {self.model_name} model...") 127 | if self.dim_red_model is not None: 128 | corpus_embeddings = self.dim_red_model.fit_transform(corpus_embeddings) 129 | self.clustering_model.fit(corpus_embeddings) 130 | cluster_assignment = self.clustering_model.labels_ 131 | 132 | logger.info("Evaluating...") 133 | v_measure = sklearn.metrics.cluster.v_measure_score( 134 | self.labels, cluster_assignment 135 | ) 136 | 137 | return {"v_measure": v_measure} 138 | -------------------------------------------------------------------------------- /scripts/ClusteringTasks.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | from pathlib import Path 4 | from typing import Callable, Optional, TypeVar 5 | 6 | import datasets 7 | import numpy as np 8 | import tqdm 9 | from FlexibleClusteringEvaluator import FlexibleClusteringEvaluator 10 | from mteb.abstasks.AbsTask import AbsTask 11 | from sentence_transformers import SentenceTransformer 12 | from typing_extensions import ParamSpec 13 | 14 | T = TypeVar("T") 15 | P = ParamSpec("P") 16 | 17 | 18 | def dynamic_description(d: Callable[P, T]) -> Callable[P, T]: 19 | @property 20 | def wrapper(self) -> dict: 21 | desc = d(self) 22 | desc["name"] = ( 23 | desc["name"] 24 | + "{" 25 | + (self.dim_red + "," if self.dim_red else "") 26 | + self.clustering_alg 27 | + ( 28 | "{" 29 | + ",".join( 30 | [ 31 | str(key) + "=" + str(value) 32 | for key, value in self.clustering_params.items() 33 | ] 34 | ) 35 | + "}" 36 | if len(self.clustering_params) > 0 37 | else "" 38 | ) 39 | + "}" 40 | ) 41 | return desc 42 | 43 | return wrapper 44 | 45 | 46 | class AbsTaskFlexibleClustering(AbsTask): 47 | def __init__( 48 | self, 49 | clustering_alg: str = "minibatch_kmeans", 50 | clustering_params: Optional[dict] = None, 51 | dim_red: Optional[str] = None, 52 | dim_red_params: Optional[dict] = None, 53 | **kwargs, 54 | ): 55 | super().__init__(**kwargs) 56 | 57 | self.clustering_alg = clustering_alg 58 | self.dim_red = dim_red 59 | 60 | if clustering_params is None: 61 | self.clustering_params = {} 62 | else: 63 | self.clustering_params = clustering_params 64 | if dim_red_params is None: 65 | self.dim_red_params = {} 66 | else: 67 | self.dim_red_params = dim_red_params 68 | 69 | def evaluate(self, model, split: str = "test", **kwargs): 70 | if not self.data_loaded: 71 | self.load_data() 72 | 73 | v_measures = [] 74 | for cluster_set in tqdm.tqdm(self.dataset[split], desc="Clustering"): 75 | evaluator = FlexibleClusteringEvaluator( 76 | cluster_set["sentences"], 77 | cluster_set["labels"], 78 | clustering_alg=self.clustering_alg, 79 | clustering_params=self.clustering_params.copy(), 80 | dim_red=self.dim_red, 81 | dim_red_params=self.dim_red_params.copy(), 82 | ) 83 | 84 | metrics = evaluator(model) 85 | v_measures.append(metrics["v_measure"]) 86 | 87 | v_mean = np.mean(v_measures) 88 | v_std = np.std(v_measures) 89 | return {"v_measure": v_mean, "v_measure_std": v_std} 90 | 91 | @dynamic_description 92 | def 
description(self): 93 | raise NotImplementedError 94 | 95 | 96 | class BlurbsClusteringS2S(AbsTaskFlexibleClustering): 97 | @dynamic_description 98 | def description(self) -> dict: 99 | return { 100 | "name": "BlurbsClusteringS2S", 101 | "hf_hub_name": "slvnwhrl/blurbs-clustering-s2s", 102 | "description": "Clustering of book blurbs (titles only).", 103 | "type": "Clustering", 104 | "category": "s2s", 105 | "eval_splits": ["test"], 106 | "eval_langs": ["de"], 107 | "main_score": ["v_measure"], 108 | } 109 | 110 | 111 | class BlurbsClusteringP2P(AbsTaskFlexibleClustering): 112 | @dynamic_description 113 | def description(self) -> dict: 114 | return { 115 | "name": "BlurbsClusteringP2P", 116 | "hf_hub_name": "slvnwhrl/blurbs-clustering-p2p", 117 | "description": "Clustering of book blurbs (titles + blurbs).", 118 | "type": "Clustering", 119 | "category": "p2p", 120 | "eval_splits": ["test"], 121 | "eval_langs": ["de"], 122 | "main_score": ["v_measure"], 123 | } 124 | 125 | 126 | class TenKGnadClusteringS2S(AbsTaskFlexibleClustering): 127 | @dynamic_description 128 | def description(self) -> dict: 129 | return { 130 | "name": "TenKGnadClusteringS2S", 131 | "hf_hub_name": "slvnwhrl/tenkgnad-clustering-s2s", 132 | "description": "Clustering of German news articles titles.", 133 | "type": "Clustering", 134 | "category": "s2s", 135 | "eval_splits": ["test"], 136 | "eval_langs": ["de"], 137 | "main_score": ["v_measure"], 138 | } 139 | 140 | 141 | class TenKGnadClusteringP2P(AbsTaskFlexibleClustering): 142 | @dynamic_description 143 | def description(self) -> dict: 144 | return { 145 | "name": "TenKGnadClusteringP2P", 146 | "hf_hub_name": "slvnwhrl/tenkgnad-clustering-p2p", 147 | "description": "Clustering of German news articles (titles + body).", 148 | "type": "Clustering", 149 | "category": "p2p", 150 | "eval_splits": ["test"], 151 | "eval_langs": ["de"], 152 | "main_score": ["v_measure"], 153 | } 154 | 155 | 156 | reddit_data_path = os.path.join(Path(__file__).parent.parent.absolute(), "reddit_data") 157 | 158 | 159 | class RedditClusteringS2S(AbsTaskFlexibleClustering): 160 | @dynamic_description 161 | def description(self) -> dict: 162 | return { 163 | "name": "RedditClusteringS2S", 164 | "description": "Clustering of German reddit submission titles.", 165 | "type": "Clustering", 166 | "category": "s2s", 167 | "eval_splits": ["test"], 168 | "eval_langs": ["de"], 169 | "main_score": ["v_measure"], 170 | } 171 | 172 | def load_data(self, **kwargs): 173 | if self.data_loaded: 174 | return 175 | 176 | self.dataset = datasets.load_dataset( 177 | reddit_data_path, 178 | data_files={"test": "s2s_test.jsonl"}, 179 | ) 180 | self.data_loaded = True 181 | 182 | 183 | class RedditClusteringP2P(AbsTaskFlexibleClustering): 184 | @dynamic_description 185 | def description(self) -> dict: 186 | return { 187 | "name": "RedditClusteringP2P", 188 | "description": "Clustering of German reddit submission (titles + body).", 189 | "type": "Clustering", 190 | "category": "p2p", 191 | "eval_splits": ["test"], 192 | "eval_langs": ["de"], 193 | "main_score": ["v_measure"], 194 | } 195 | 196 | def load_data(self, **kwargs): 197 | if self.data_loaded: 198 | return 199 | 200 | self.dataset = datasets.load_dataset( 201 | reddit_data_path, 202 | data_files={"test": "p2p_test.jsonl"}, 203 | ) 204 | self.data_loaded = True 205 | -------------------------------------------------------------------------------- /scripts/run_fine_tuned_cteb_de.py: -------------------------------------------------------------------------------- 1 
| import argparse 2 | import csv 3 | import logging 4 | import os 5 | import random 6 | import types 7 | from typing import Optional 8 | 9 | import numpy as np 10 | import torch 11 | from ClusteringTasks import ( 12 | AbsTaskFlexibleClustering, 13 | BlurbsClusteringP2P, 14 | BlurbsClusteringS2S, 15 | RedditClusteringP2P, 16 | RedditClusteringS2S, 17 | TenKGnadClusteringP2P, 18 | TenKGnadClusteringS2S, 19 | ) 20 | from datasets import load_dataset 21 | from sentence_transformers import SentenceTransformer, datasets, losses 22 | from sentence_transformers.util import batch_to_device 23 | from tqdm.autonotebook import trange 24 | from transformers import ( 25 | AutoModelForMaskedLM, 26 | AutoTokenizer, 27 | DataCollatorForWholeWordMask, 28 | Trainer, 29 | TrainingArguments, 30 | ) 31 | 32 | os.environ["TOKENIZERS_PARALLELISM"] = "false" 33 | 34 | # logging.basicConfig(level=logging.INFO) 35 | logger = logging.getLogger("main") 36 | 37 | 38 | def write_results( 39 | task: AbsTaskFlexibleClustering, 40 | model, 41 | output_path: str, 42 | epoch: int, 43 | steps: int, 44 | v_measure: float, 45 | ): 46 | """Write evaluation results to file.""" 47 | csv_headers = ["epoch", "steps", "v_measure"] 48 | csv_path = os.path.join(output_path, f"{task.description['name']}.json") 49 | if not os.path.isfile(csv_path): 50 | with open(csv_path, newline="", mode="w", encoding="utf-8") as f: 51 | writer = csv.writer(f) 52 | writer.writerow(csv_headers) 53 | writer.writerow([epoch, steps, v_measure]) 54 | else: 55 | with open(csv_path, newline="", mode="a", encoding="utf-8") as f: 56 | writer = csv.writer(f) 57 | writer.writerow([epoch, steps, v_measure]) 58 | 59 | 60 | def eval_wrapper(tasks: list, output_path: Optional[str] = None): 61 | """Wrapper for TSDAE training procedure so custom mteb tasks can be evaluated.""" 62 | 63 | def eval(model, output_path=output_path, epoch=0, steps=0): 64 | v_measures = [] 65 | 66 | model.eval() 67 | with torch.no_grad(): 68 | for task in tasks: 69 | v_measure = task.evaluate(model)["v_measure"] 70 | if output_path is not None: 71 | write_results(task, model, output_path, epoch, steps, v_measure) 72 | v_measures.append(v_measure) 73 | 74 | return np.mean(v_measures) 75 | 76 | return eval 77 | 78 | 79 | def encode_wrapper(tokenizer, batch_size: int = 32): 80 | """Wrapper to make instances of transformer models compatible with mteb (which uses encode method similar to SentenceTransformer models).""" 81 | 82 | def encode(self, sentences): 83 | embeddings = [] 84 | length_sorted_idx = np.argsort([-len(sen) for sen in sentences]) 85 | sentences_sorted = [sentences[idx] for idx in length_sorted_idx] 86 | for i in range(0, len(sentences), batch_size): 87 | encoded_input = encoded_input = tokenizer( 88 | sentences_sorted[i : i + batch_size], 89 | return_tensors="pt", 90 | padding=True, 91 | truncation=True, 92 | max_length=512, 93 | ) 94 | encoded_input = batch_to_device(encoded_input, self.device) 95 | output = self(**encoded_input) 96 | 97 | token_embeddings = output[0] 98 | attention_mask = encoded_input.attention_mask 99 | input_mask_expanded = ( 100 | attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 101 | ) 102 | sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1) 103 | sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9) 104 | 105 | embeddings.extend((sum_embeddings / sum_mask).detach().cpu()) 106 | 107 | embeddings = [embeddings[idx] for idx in np.argsort(length_sorted_idx)] 108 | return np.asarray([emb.numpy() for emb in 
embeddings]) 109 | 110 | return encode 111 | 112 | 113 | class CustomTrainer(Trainer): 114 | """Subclassed Hugging Face Trainer so custom mteb tasks can be evaluated.""" 115 | 116 | def __init__(self, tasks: list, **kwargs): 117 | super().__init__(**kwargs) 118 | self.tasks = tasks 119 | self.performed_evals = 0 120 | self.model.encode = types.MethodType( 121 | encode_wrapper( 122 | self.data_collator.tokenizer, self.args.per_device_eval_batch_size 123 | ), 124 | self.model, 125 | ) 126 | 127 | def evaluate(self, **kwargs): 128 | self.performed_evals += 1 129 | steps = self.performed_evals * self.args.eval_steps 130 | steps_per_epoch = len(self.train_dataset) / ( 131 | self.args.per_device_train_batch_size 132 | * self.args.gradient_accumulation_steps 133 | ) 134 | epoch = int(np.floor((steps - 1) / steps_per_epoch)) 135 | 136 | v_measures = [] 137 | 138 | self.model.eval() 139 | with torch.no_grad(): 140 | for task in self.tasks: 141 | v_measure = task.evaluate(self.model)["v_measure"] 142 | if self.args.output_dir is not None: 143 | write_results( 144 | task, self.model, self.args.output_dir, epoch, steps, v_measure 145 | ) 146 | v_measures.append(v_measure) 147 | 148 | return np.mean(v_measures) 149 | 150 | 151 | class TokenizedSentencesDataset: 152 | """Wrapper for on-the-fly tokenization for MLM training.""" 153 | 154 | def __init__( 155 | self, 156 | sentences: list[str], 157 | tokenizer, 158 | max_length: int, 159 | cache_tokenization: bool = False, 160 | ): 161 | self.tokenizer = tokenizer 162 | self.sentences = sentences 163 | self.max_length = max_length 164 | self.cache_tokenization = cache_tokenization 165 | 166 | def __getitem__(self, item): 167 | if not self.cache_tokenization: 168 | return self.tokenizer( 169 | self.sentences[item], 170 | add_special_tokens=True, 171 | truncation=True, 172 | max_length=self.max_length, 173 | return_special_tokens_mask=True, 174 | ) 175 | 176 | if isinstance(self.sentences[item], str): 177 | self.sentences[item] = self.tokenizer( 178 | self.sentences[item], 179 | add_special_tokens=True, 180 | truncation=True, 181 | max_length=self.max_length, 182 | return_special_tokens_mask=True, 183 | ) 184 | return self.sentences[item] 185 | 186 | def __len__(self): 187 | return len(self.sentences) 188 | 189 | 190 | def seed_worker(worker_id): 191 | worker_seed = torch.initial_seed() % 2**32 192 | np.random.seed(worker_seed) 193 | random.seed(worker_seed) 194 | 195 | 196 | model_names = [ 197 | "deepset/gbert-base", 198 | ] 199 | 200 | base_tasks = [ 201 | BlurbsClusteringS2S, 202 | BlurbsClusteringP2P, 203 | TenKGnadClusteringS2S, 204 | TenKGnadClusteringP2P, 205 | ] 206 | task_configs = [ 207 | { 208 | "dim_red": None, 209 | "clustering_alg": "minibatch_kmeans", 210 | "clustering_params": {"random_state": 42}, 211 | }, 212 | ] 213 | 214 | seed_list = [42, 1, 2] 215 | use_fp16 = False 216 | 217 | # MLM 218 | mlm_prob = 0.15 219 | 220 | 221 | def main(args: argparse.ArgumentParser): 222 | if args.include_reddit: 223 | base_tasks.extend([RedditClusteringS2S, RedditClusteringP2P]) 224 | 225 | for model_name in model_names: 226 | for base_task in base_tasks: 227 | tasks = [base_task(**config) for config in task_configs] 228 | logger.info(tasks[0].description["name"]) 229 | base_task_name = tasks[0].description["name"].split("{")[0] 230 | 231 | tasks[0].load_data() 232 | train_sentences = list( 233 | set( 234 | [ 235 | sent 236 | for split in tasks[0].dataset["test"]["sentences"] 237 | for sent in split 238 | ] 239 | ) 240 | ) 241 | 242 | # TSDAE 243 | 
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences) 244 | 245 | for seed in seed_list: 246 | torch.manual_seed(seed) 247 | 248 | model = SentenceTransformer(model_name) 249 | 250 | result_folder = os.path.join( 251 | "results", 252 | model_name.split("/")[-1], 253 | f"tsdae_ft_{str(seed)}", 254 | base_task_name, 255 | ) 256 | 257 | g = torch.Generator() 258 | g.manual_seed(seed) 259 | train_dataloader = torch.utils.data.DataLoader( 260 | train_dataset, 261 | batch_size=8, 262 | shuffle=True, 263 | drop_last=False, 264 | worker_init_fn=seed_worker, 265 | generator=g, 266 | ) 267 | train_loss = losses.DenoisingAutoEncoderLoss( 268 | model, decoder_name_or_path=model_name, tie_encoder_decoder=True 269 | ) 270 | 271 | dev_evaluator = eval_wrapper(tasks, output_path=result_folder) 272 | 273 | logger.info(f"Start TSDAE training <>") 274 | model.fit( 275 | train_objectives=[(train_dataloader, train_loss)], 276 | evaluator=dev_evaluator, 277 | evaluation_steps=512, 278 | epochs=30, 279 | weight_decay=0, 280 | scheduler="constantlr", 281 | optimizer_params={"lr": 3e-5}, 282 | output_path=result_folder, 283 | save_best_model=True, 284 | checkpoint_save_total_limit=1, 285 | show_progress_bar=True, 286 | use_amp=use_fp16, 287 | ) 288 | 289 | logger.info("Training done") 290 | 291 | # MLM 292 | for seed in seed_list: 293 | model = AutoModelForMaskedLM.from_pretrained(model_name) 294 | tokenizer = AutoTokenizer.from_pretrained(model_name) 295 | 296 | train_dataset = TokenizedSentencesDataset( 297 | train_sentences, tokenizer, 512 298 | ) # handle max seq length 299 | data_collator = DataCollatorForWholeWordMask( 300 | tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob 301 | ) 302 | 303 | result_folder = os.path.join( 304 | "results", 305 | model_name.split("/")[-1], 306 | f"mlm_ft_{str(seed)}", 307 | base_task_name, 308 | ) 309 | 310 | training_args = TrainingArguments( 311 | output_dir=result_folder, 312 | overwrite_output_dir=True, 313 | num_train_epochs=30, 314 | evaluation_strategy="steps", 315 | per_device_train_batch_size=32, 316 | per_device_eval_batch_size=128, 317 | eval_steps=25, 318 | save_steps=25, 319 | logging_steps=25, 320 | gradient_accumulation_steps=8, 321 | save_total_limit=1, 322 | load_best_model_at_end=True, 323 | metric_for_best_model="v_measure", 324 | prediction_loss_only=True, 325 | fp16=use_fp16, 326 | learning_rate=1e-04, 327 | weight_decay=0.01, 328 | lr_scheduler_type="constant_with_warmup", 329 | warmup_ratio=0.06, 330 | seed=seed, 331 | ) 332 | 333 | trainer = CustomTrainer( 334 | model=model, 335 | args=training_args, 336 | data_collator=data_collator, 337 | train_dataset=train_dataset, 338 | eval_dataset="placeholder", 339 | tasks=tasks, 340 | ) 341 | 342 | logger.info(f"Start MLM training <>") 343 | trainer.train() 344 | logger.info("Training done") 345 | 346 | 347 | if __name__ == "__main__": 348 | parser = argparse.ArgumentParser() 349 | parser.add_argument("--include-reddit", action="store_true") 350 | 351 | args = parser.parse_args() 352 | main(args) 353 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # German Text Embedding Clustering Benchmark 2 | 3 | Shortcut: [Datasets](https://github.com/ClimSocAna/tecb-de#datasets) - [Results](https://github.com/ClimSocAna/tecb-de#results) - [Installation](https://github.com/ClimSocAna/tecb-de#installation) - [Usage](https://github.com/ClimSocAna/tecb-de#usage) - 
[Citation](https://github.com/ClimSocAna/tecb-de#citation) 4 | 5 | ## Remarks 6 | This repository contains code to evaluate language models for clustering text embeddings as used in neural topic modelling (see for example [BERTopic](https://github.com/MaartenGr/BERTopic)), specifically for German. This work builds on the [Massive Text Embedding Benchmark (MTEB)](https://github.com/embeddings-benchmark/mteb), which provides benchmark datasets and results for a wide range of tasks. 7 | 8 | More specifically, this work contributes to MTEB in the following ways: 9 | - clustering datasets in German (MTEB only considers English datasets) 10 | - the evaluation of more clustering algorithms 11 | 12 | :trophy: Note that you can contribute results to the [MTEB leaderboard](https://huggingface.co/spaces/mteb/leaderboard) as our datasets are officially part of MTEB (apart from the Reddit datasets, see below)! You can either use this library or MTEB directly to produce results. If you run into any problems, please raise an issue. :trophy: 13 | 14 | 15 | ## Datasets 16 | 17 | Currently, we provide 4 datasets. The datasets are built similarly to the English clustering datasets in MTEB. Unfortunately, there are fewer datasets available for German and, therefore, we were not able to build as many datasets (e.g. Arxiv only contains very few German papers). However, we plan to add more datasets in the future. 18 | 19 | | **Name** | **Hub URL** | **Description** | 20 | |-----------------------|----------------------------------|--------------------------------------------------------------| 21 | | BlurbsClusteringS2S
([data ref.](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html)) | [slvnwhrl/blurbs-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-s2s/tree/main) | Clustering of book titles: 17'726 unique samples, 28 splits with 177 to 16'425 samples and 4 to 93 unique classes (as represented by genres, e.g. fantasy). On average, a sample is 23.17 chars long. Splits are built similarly to MTEB's [ArxivClusteringS2S](https://huggingface.co/datasets/mteb/arxiv-clustering-s2s) ([paper](https://arxiv.org/abs/2210.07316)). | 22 | | BlurbsClusteringP2P
([data ref.](https://www.inf.uni-hamburg.de/en/inst/ab/lt/resources/data/germeval-2019-hmc.html)) | [slvnwhrl/blurbs-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/blurbs-clustering-p2p/tree/main) | Clustering of book blurbs (title + blurb): 18'084 unique samples, 28 splits with 177 to 16'425 samples and 4 to 93 unique classes (as represented by genres, e.g. fantasy). On average, a sample is 663.91 chars long. Splits are built similarly to MTEB's [ArxivClusteringP2P](https://huggingface.co/datasets/mteb/arxiv-clustering-p2p) ([paper](https://arxiv.org/abs/2210.07316)). | 23 | | TenKGNADClusteringS2S
([data ref.](https://ofai.github.io/million-post-corpus/)) | [slvnwhrl/tenkgnad-clustering-s2s](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-s2s) | Clustering of news article titles: 10'267 unique samples, 10 splits with 1'436 to 9'962 samples and 9 unique classes (as represented by news categories, e.g. politics). On average, a sample is 50.97 chars long. Splits are built similarly to MTEB's [TwentyNewsgroupsClustering](https://huggingface.co/datasets/mteb/twentynewsgroups-clustering) ([paper](https://arxiv.org/abs/2210.07316)). | 24 | | TenKGNADClusteringP2P
([data ref.](https://ofai.github.io/million-post-corpus/)) | [slvnwhrl/tenkgnad-clustering-p2p](https://huggingface.co/datasets/slvnwhrl/tenkgnad-clustering-p2p) | Clustering of news articles (title + article body): 10'275 unique samples, 10 splits with 1'436 to 9'962 samples and 9 unique classes (as represented by news categories, e.g. politics). On average, a sample is 2648.46 chars long. Splits are built similarly to MTEB's [TwentyNewsgroupsClustering](https://huggingface.co/datasets/mteb/twentynewsgroups-clustering) ([paper](https://arxiv.org/abs/2210.07316)). | 25 | 26 | ### Reddit datasets 27 | 28 | We also include two Reddit datasets in the benchmark (similar to MTEB's [RedditClustering](https://huggingface.co/datasets/mteb/reddit-clustering) and [RedditClusteringP2P](https://huggingface.co/datasets/mteb/reddit-clustering-p2p) datasets). However, we only provide ids, and if you want to use these datasets, you need to download the data yourself (see [Including the Reddit dataset](https://github.com/ClimSocAna/tecb-de#including-the-reddit-dataset) for instructions). The datasets contain "hot" and "top" submissions to 80 popular German subreddits and were extracted using [PRAW](https://praw.readthedocs.io/en/stable/). 29 | 30 | | **Name** | **Description** | 31 | |---------------------|-----------------| 32 | | RedditClusteringS2S | Clustering of Reddit submission titles: 40'181 unique samples, 10 splits with 9'288 to 26'221 samples and 10 to 50 unique classes (as represented by subreddits, e.g. r/Finanzen). On average, a sample is 52.16 chars long. Splits are built similarly to MTEB's [RedditClustering](https://huggingface.co/datasets/mteb/reddit-clustering) ([paper](https://arxiv.org/abs/2210.07316)). | 33 | | RedditClusteringP2P | Clustering of Reddit submissions (title + body): 40'305 unique samples, 10 splits with 9'288 to 26'221 samples and 10 to 50 unique classes (as represented by subreddits, e.g. r/Finanzen). On average, a sample is 901.78 chars long. Splits are built similarly to MTEB's [RedditClusteringP2P](https://huggingface.co/datasets/mteb/reddit-clustering-p2p) ([paper](https://arxiv.org/abs/2210.07316)). | 34 | 35 | ***Important**: As of June 19, 2023, new [Data API Terms](https://www.redditinc.com/policies/data-api-terms) become effective for Reddit. Most likely, it will no longer be allowed to use Reddit data for such purposes (see especially "2.4 User Content" in the terms). Make sure you understand these terms and use Reddit data accordingly.* 36 | 37 | ## Results 38 | All results show the [V-measure](https://scikit-learn.org/stable/modules/clustering.html#homogeneity-completeness-and-v-measure) (multiplied by 100 and rounded to two decimal points).
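The score can be computed with scikit-learn's `v_measure_score`, which is also what the evaluator in this repository calls internally. A minimal sketch (the label and cluster ids below are made up for illustration):

```python
from sklearn.metrics.cluster import v_measure_score

# gold classes (e.g. genres, news categories or subreddits) and predicted cluster ids;
# the V-measure does not depend on how the cluster ids are numbered
true_labels = [0, 0, 1, 1, 2, 2]
cluster_assignment = [1, 1, 0, 0, 2, 2]

score = v_measure_score(true_labels, cluster_assignment)
print(round(100 * score, 2))  # the tables report V-measure * 100, rounded to two decimals
```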
39 | ### k-means (same as MTEB) 40 | | **Model** | **BlurbsClusteringS2S** | **BlurbsClusteringP2P** | **TenKGNADClusteringS2S** | **TenKGNADClusteringP2P** | **RedditClusteringS2S** | **RedditClusteringP2P** | **AVG** | 41 | |----|-------:|-------:|-------:|-------:|-------:|-------:|-------:| 42 | | [deepset/gbert-base](https://huggingface.co/deepset/gbert-base) | 11.27 | 35.36 | 24.23 | 37.16 | 28.57 | 35.30 | 28.66 | 43 | | [deepset/gbert-large](https://huggingface.co/deepset/gbert-large) | 13.34 | 39.30 | **34.97** | 41.69 | 34.35 | 44.61 | 34.71 | 44 | | [deepset/gelectra-base](https://huggingface.co/deepset/gelectra-base) | 7.74 | 10.06 | 4.11 | 9.02 | 6.59 | 7.73 | 7.54 | 45 | | [deepset/gelectra-large](https://huggingface.co/deepset/gelectra-large) | 7.57 | 13.96 | 3.91 | 11.49 | 7.59 | 10.54 | 9.18 | 46 | | [uklfr/gottbert-base](https://huggingface.co/uklfr/gottbert-base) | 8.37 | 34.49 | 9.34 | 33.66 | 16.07 | 19.46 | 20.23 | 47 | | [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) | 14.33 | 32.46 | 22.26 | 36.13 | 33.33 | 44.59 | 30.52 | 48 | | [sentence-transformers/paraphrase-multilingual-mpnet-base-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2) | 15.81 | 34.38 | 22.00 | 35.96 | 36.39 | 48.43 | 32.16 | 49 | | [T-Systems-onsite/cross-en-de-roberta-sentence-transformer](https://huggingface.co/T-Systems-onsite/cross-en-de-roberta-sentence-transformer) | 12.69 | 30.81 | 10.94 | 23.50 | 27.98 | 33.01 | 23.16 | 50 | | [sentence-transformers/use-cmlm-multilingual](https://huggingface.co/sentence-transformers/use-cmlm-multilingual) | 15.24 | 29.63 | 25.64 | 37.10 | 33.62 | 49.70 | 31.82 | 51 | | [sentence-transformers/sentence-t5-base](https://huggingface.co/sentence-transformers/sentence-t5-base) | 11.57 | 30.59 | 18.11 | **44.88** | 31.99 | 45.80 | 30.49 | 52 | | [sentence-transformers/sentence-t5-xxl](https://huggingface.co/sentence-transformers/sentence-t5-xxl) | **15.94** | **39.91** | 19.69 | 43.43 | **38.54** | **55.90** | **35.57** | 53 | | [xlm-roberta-large](https://huggingface.co/xlm-roberta-large) | 7.29 | 29.84 | 6.16 | 32.46 | 10.19 | 23.50 | 18.24 | 54 | 55 | ### Additional clustering algorithms 56 | In addition to [k-means](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html), we evaluate the following clustering algorithms: 57 | - [Agglomerative clustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html#sklearn-cluster-agglomerativeclustering) (distance-based, number of clusters is assumed) 58 | - [HDBSCAN](https://hdbscan.readthedocs.io/en/latest/) (density-based, number of clusters is *not* assumed) 59 | - [DBSTREAM](https://riverml.xyz/dev/api/cluster/DBSTREAM/#dbstream) (streaming algorithm, number of clusters is *not* assumed) 60 | 61 | Inspired by [BERTopic](https://github.com/MaartenGr/BERTopic), we also evaluate [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html#sklearn-decomposition-pca) and [UMAP](https://umap-learn.readthedocs.io/en/latest/index.html) as a "preprocessing" step.
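A minimal sketch of such a dimensionality-reduction + clustering combination, mirroring what `scripts/FlexibleClusteringEvaluator.py` does (the embeddings and labels below are random stand-ins for illustration; in the benchmark they come from a sentence-transformer model and a test split):

```python
import hdbscan
import numpy as np
import umap
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.metrics.cluster import v_measure_score
from sklearn.pipeline import make_pipeline

rng = np.random.default_rng(42)
embeddings = rng.normal(size=(500, 768)).astype("float32")  # stand-in for sentence embeddings
labels = rng.integers(0, 10, size=500)                      # stand-in for gold classes

# "pca+umap" preprocessing: PCA down to 50 dimensions, then UMAP with cosine distance
reducer = make_pipeline(PCA(n_components=50), umap.UMAP(metric="cosine"))
reduced = reducer.fit_transform(embeddings)

# k-means is told the number of clusters, HDBSCAN determines it on its own
kmeans = MiniBatchKMeans(n_clusters=len(set(labels)), batch_size=500).fit(reduced)
print(v_measure_score(labels, kmeans.labels_))

clusterer = hdbscan.HDBSCAN().fit(reduced)
print(v_measure_score(labels, clusterer.labels_))
```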
62 | 63 | *If you want to add/evaluate more algorithms, please have a look at [FlexibleClusteringEvaluator.py](https://github.com/ClimSocAna/tecb-de/blob/main/scripts/FlexibleClusteringEvaluator.py) to see how to achieve that.* 64 | 65 | 66 | ### UMAP + {k-means, HDBSCAN, DBSTREAM} 67 | *For all results, have a look at [results/tecb-de-full-results.csv](https://github.com/ClimSocAna/tecb-de/blob/3364f94faba7b235c7498a2bb724324064ac4537/results/tecb-de-full-results.csv).* 68 | 69 | | **Model** | **Algorithm** | **BlurbsClusteringS2S** | **BlurbsClusteringP2P** | **10KGNADClusteringS2S** | **10KGNADClusteringP2P** | **RedditClusteringS2S** | **RedditClusteringP2P** | **AVG** | 70 | |------------|----------------------------------|------------------------:|------------------------:|-------------------------:|-------------------------:|--------:|--------:|--------:| 71 | | [deepset/gbert-base](https://huggingface.co/deepset/gbert-base) | k-means
HDBSCAN
DBSTREAM | 12.81
14.31
12.70 | **38.81**
22.83
37.06 | 29.31
05.44
**28.92** | 43.61
32.45
42.74 | 31.77
17.21
31.70 | 46.06
31.99
44.84 | 33.73
20.71
**32.99** | 72 | | [sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2](https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2) | k-means
HDBSCAN
DBSTREAM | 13.80
**20.00**
15.77 | 34.16
30.67
32.47 | 25.22
24.90
26.44 | **43.75**
35.53
41.31 | 32.64
32.23
**33.14** | **47.46**
39.70
46.47 | 32.84
30.51
32.60 | 73 | 74 | 75 | ## Installation 76 | If you want to run the code from this repository (for creating the Reddit datasets or model evaluation), clone it and move to the downloaded folder 77 | 78 | ``` 79 | git clone https://github.com/ClimSocAna/tecb-de.git 80 | ``` 81 | 82 | and create a new environment with the necessary packages 83 | ``` 84 | python -m venv tecb-de 85 | 86 | source tecb-de/bin/activate # Linux, MacOS 87 | tecb-de\Scripts\activate.bat # Windows 88 | 89 | pip install -r requirements.txt 90 | ``` 91 | 92 | ## Usage 93 | ### Running the evaluation 94 | Simply run `python scripts/run_cteb_de.py`. This will produce a `results` folder. You can modify the script to run the evaluation for models and clustering algorithms (and configurations) of your choosing. 95 | 96 | ### Including the Reddit dataset 97 | If you want to use the Reddit datasets, you first have to download the data 98 | ``` 99 | # move to the reddit_data folder in tecb-de 100 | # make sure you have PRAW and tqdm installed: pip install praw tqdm 101 | 102 | # downloads the data and saves it to submissions.tsv 103 | python download_data.py 104 | ``` 105 | Note that for this to work, you have to edit `reddit_data/praw.ini` with your client data. You can find instructions [here](https://praw.readthedocs.io/en/stable/getting_started/authentication.html). 106 | 107 | Then you can create the datasets 108 | ``` 109 | # creates the splits for both tasks (RedditClusteringS2S and RedditClusteringP2P) 110 | # and saves them in the reddit_data folder 111 | python extract_splits.py 112 | ``` 113 | 114 | Finally, you can run the evaluation using the `--include-reddit` flag 115 | ``` 116 | # assuming you are in the top-level folder 117 | python scripts/run_cteb_de.py --include-reddit 118 | ``` 119 | 120 | ### Adaptive pre-training 121 | If you want to experiment with adaptive pre-training, you can have a look at `scripts/run_fine_tuned_cteb_de.py`. Basically, it allows you to train models using whole word masking (WWM) and [TSDAE](https://arxiv.org/abs/2104.06979) and to evaluate on the clustering tasks during training.
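As a rough orientation, here is a stripped-down TSDAE sketch distilled from that script (the training corpus is a placeholder; the full script additionally wires in seeding, a clustering evaluator, and checkpointing):

```python
from sentence_transformers import SentenceTransformer, datasets, losses
from torch.utils.data import DataLoader

# placeholder corpus; the script instead uses the unique sentences of a benchmark task
train_sentences = ["Ein Beispielsatz.", "Noch ein Beispielsatz.", "Etwas ganz anderes."]

model_name = "deepset/gbert-base"
model = SentenceTransformer(model_name)

# TSDAE: reconstruct the original sentence from a noisy (word-dropped) version of it
train_dataset = datasets.DenoisingAutoEncoderDataset(train_sentences)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
train_loss = losses.DenoisingAutoEncoderLoss(
    model, decoder_name_or_path=model_name, tie_encoder_decoder=True
)

model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=1,  # the script trains for 30 epochs and evaluates clustering every 512 steps
    scheduler="constantlr",
    optimizer_params={"lr": 3e-5},
    weight_decay=0,
    show_progress_bar=True,
)
```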
122 | 123 | ## Citation 124 | If you make use of this [work](https://aclanthology.org/2023.konvens-main.20/), please cite: 125 | ``` 126 | @inproceedings{wehrli-etal-2023-german, 127 | title = "{G}erman Text Embedding Clustering Benchmark", 128 | author = "Wehrli, Silvan and 129 | Arnrich, Bert and 130 | Irrgang, Christopher", 131 | editor = "Georges, Munir and 132 | Herygers, Aaricia and 133 | Friedrich, Annemarie and 134 | Roth, Benjamin", 135 | booktitle = "Proceedings of the 19th Conference on Natural Language Processing (KONVENS 2023)", 136 | month = sep, 137 | year = "2023", 138 | address = "Ingolstadt, Germany", 139 | publisher = "Association for Computational Lingustics", 140 | url = "https://aclanthology.org/2023.konvens-main.20", 141 | pages = "187--201", 142 | } 143 | ``` 144 | 145 | -------------------------------------------------------------------------------- /results/tecb-de-full-results.csv: -------------------------------------------------------------------------------- 1 | transformer,"clustering 2 | algorithm","dimensionality 3 | reduction",BlurbsClusteringS2S,BlurbsClusteringSP2P,10kGNADClusteringS2S,10kGNADClusteringP2P,RedditClusteringS2S,RedditClusteringP2P 4 | GBERT-base,minibatch_kmeans,-,0.112671049316939,0.353583482102813,0.242344552391346,0.371566057467428,0.285676283561016,0.353022032105768 5 | GBERT-base,minibatch_kmeans,pca,0.0777671041805759,0.282103942265806,0.161621841318318,0.223432937769083,0.116485294575606,0.148203164549033 6 | GBERT-base,minibatch_kmeans,umap,0.128132315578683,0.388111794621666,0.293149898962799,0.436132825338465,0.317563157505019,0.460595202703818 7 | GBERT-base,minibatch_kmeans,pca+umap,0.130091638633461,0.381730338558007,0.288475293664304,0.40041151901687,0.317337410195825,0.409005177020971 8 | GBERT-base,agglomerative,-,0.122555007222791,0.365500728379183,0.233698977687029,0.390790344115844,0.29663587891276,0.377253253266381 9 | GBERT-base,agglomerative,pca,0.0764838065513696,0.281541533230531,0.156090401197698,0.215749839242043,0.115666322019327,0.14788684839578 10 | GBERT-base,agglomerative,umap,0.128414238366791,0.386473044041221,0.284943294898888,0.432163776799657,0.316535914208462,0.460153958922544 11 | GBERT-base,agglomerative,pca+umap,0.135298949874227,0.378635201892333,0.286346037071778,0.406316649434935,0.31901894616499,0.41342137179276 12 | GBERT-base,hdbscan,pca,0.092841455320237,0.121429731876474,0.0878780147634584,0.112088120454238,0.14268405884612,0.17842937399194 13 | GBERT-base,hdbscan,umap,0.143147353990695,0.228356662475019,0.0543865219456264,0.324525869290628,0.172064090037639,0.319903379129749 14 | GBERT-base,hdbscan,pca+umap,0.145646923652315,0.242303154361018,0.162201771418459,0.352124570489061,0.217985362145401,0.198177856655754 15 | GBERT-base,dbstream,pca,0.117158951149945,0.261506054193205,0.162466842655547,0.218331474577554,0.123380259546021,0.151373101721904 16 | GBERT-base,dbstream,umap,0.126968078297751,0.370571106215641,0.289172894805377,0.427386392167265,0.316957936467504,0.448401802063983 17 | GBERT-base,dbstream,pca+umap,0.135526610923928,0.363080746655188,0.292474309841386,0.412041561853931,0.312073808060135,0.402044907553444 18 | GBERT-large,minibatch_kmeans,-,0.133771100864799,0.392968164291731,0.349667381745043,0.416860247351736,0.344661288673389,0.446109745930674 19 | GBERT-large,minibatch_kmeans,pca,0.100443756439491,0.308775265057465,0.246710867213275,0.281775578873874,0.134243982216329,0.171871809632161 20 | 
GBERT-large,minibatch_kmeans,umap,0.156253205331452,0.423950492830575,0.407429584861096,0.483115805893804,0.412444628594533,0.53406057923917 21 | GBERT-large,minibatch_kmeans,pca+umap,0.16155232224427,0.420486875135654,0.422991965269453,0.470110554797457,0.408114696584046,0.515781058414509 22 | GBERT-large,agglomerative,-,0.15346046365093,0.417414864554982,0.357465069383771,0.455382211806237,0.371736021694433,0.476017771864391 23 | GBERT-large,agglomerative,pca,0.100496751158374,0.312347273528722,0.242867854737821,0.278716878244411,0.133246053268639,0.171189869000204 24 | GBERT-large,agglomerative,umap,0.156750709690904,0.425642820144678,0.414465348015092,0.499101116065403,0.417677737736199,0.545820076431979 25 | GBERT-large,agglomerative,pca+umap,0.162417052178325,0.418969823081815,0.416535509533656,0.483820186361541,0.413522308477915,0.520477794301093 26 | GBERT-large,hdbscan,pca,0.0688053584191239,0.131955201072378,0.0866870186629883,0.17368387849622,0.170946875457958,0.172281670663991 27 | GBERT-large,hdbscan,umap,0.164851651926161,0.302589576982622,0.30047817241985,0.38842013121876,0.253130648500744,0.319903379129749 28 | GBERT-large,hdbscan,pca+umap,0.160635359866094,0.348198697135544,0.233143414646736,0.385744616169722,0.276748823137632,0.435959870184842 29 | GBERT-large,dbstream,pca,0.106698549211481,0.295978705611152,0.234910103348833,0.276880781462188,0.136330874897316,0.17605394161897 30 | GBERT-large,dbstream,umap,0.159465218791482,0.406349632716036,0.397319423736426,0.468089123992303,0.408302307486785,0.525689818695595 31 | GBERT-large,dbstream,pca+umap,0.166484777774721,0.40287889845942,0.392131715328795,0.454491213561863,0.397626487765563,0.502095687897378 32 | GELECTRA-base,minibatch_kmeans,-,0.0773802653856308,0.100585204166098,0.0411202271800372,0.0901582074000665,0.0659322534895928,0.0773094974486189 33 | GELECTRA-base,minibatch_kmeans,pca,0.070499734843759,0.0733878212922891,0.0398844118151661,0.0837270797322409,0.0462823492581079,0.0628378246232701 34 | GELECTRA-base,minibatch_kmeans,umap,0.0832639892802492,0.150989178222435,0.0368744634793366,0.126632031872035,0.0757934402337694,0.0867223381899473 35 | GELECTRA-base,minibatch_kmeans,pca+umap,0.0821402843835579,0.111147670127865,0.0439943667547346,0.099359124202817,0.0688833827432366,0.0756486802686404 36 | GELECTRA-base,agglomerative,-,0.0805738718522408,0.100437845777627,0.0415656710931226,0.0938546916920069,0.0700148688131128,0.0758744191728137 37 | GELECTRA-base,agglomerative,pca,0.0689162175103905,0.0703669587491154,0.0388949809210833,0.0812604389117952,0.0460949752690999,0.0628422944868003 38 | GELECTRA-base,agglomerative,umap,0.0829724184211444,0.147433226414405,0.037138338638813,0.134257261521715,0.0773489329783762,0.0866563731090761 39 | GELECTRA-base,agglomerative,pca+umap,0.0828431480521785,0.113517957291586,0.0451961770136774,0.102051956539211,0.0705261969672755,0.0770632640910475 40 | GELECTRA-base,hdbscan,pca,0.0795215157301383,0.0681243106140609,0.0573689275392659,0.0614447918758936,0.134271051369632,0.15047581912456 41 | GELECTRA-base,hdbscan,umap,0.0803815035008141,0.105646071345495,0.0214482012053893,0.0158125145229272,0.0715719345933788,0.0897987629752991 42 | GELECTRA-base,hdbscan,pca+umap,0.0603676964248639,0.0873807761634588,0.0125080425581685,0.0222740008863571,0.0917062282030941,0.116773941216348 43 | GELECTRA-base,dbstream,pca,0.0747123404435195,0.0490477571830085,0.0493326702365123,0.0895059618949814,0.0503590995282146,0.0658460594781778 44 | 
GELECTRA-base,dbstream,umap,0.0812069360094833,0.144200548562882,0.0537496715910561,0.165618198863684,0.0844168634808037,0.0946369012953892 45 | GELECTRA-base,dbstream,pca+umap,0.0832343762177447,0.123659678514983,0.0596532877599249,0.137048073813919,0.0801875071892082,0.0862277466845775 46 | GELECTRA-large,minibatch_kmeans,-,0.0756562949711355,0.139649776313644,0.0391456840437532,0.114856819040429,0.0758819635082978,0.105449787326776 47 | GELECTRA-large,minibatch_kmeans,pca,0.0690597768133245,0.100969510677138,0.0382481011456329,0.096978027986081,0.0476178361184982,0.0725538725129464 48 | GELECTRA-large,minibatch_kmeans,umap,0.0797416843974231,0.14995345938628,0.0428616810166543,0.129387587119327,0.0786477675730954,0.100098610516064 49 | GELECTRA-large,minibatch_kmeans,pca+umap,0.0778985275607628,0.148094534491185,0.0429929292565679,0.129223718235341,0.0807834894512504,0.101722519357038 50 | GELECTRA-large,agglomerative,-,0.0794046738058272,0.144175117529874,0.036954786402055,0.113359827010857,0.0808854297797115,0.101090516583789 51 | GELECTRA-large,agglomerative,pca,0.0676768420972781,0.0997378573724905,0.0364053640599704,0.0957223001611836,0.0484190086354535,0.0731042975964091 52 | GELECTRA-large,agglomerative,umap,0.0800819728660301,0.150797893634037,0.0423621116101067,0.134342915078929,0.0809906315914207,0.101013587651946 53 | GELECTRA-large,agglomerative,pca+umap,0.0795740246073101,0.150844299339887,0.0429223185369049,0.13224703421997,0.0819430797767072,0.102097736541606 54 | GELECTRA-large,hdbscan,pca,0.0850417837907285,0.0951759874223363,0.0419533905566293,0.0605897439215855,0.0699755894445285,0.139432001736411 55 | GELECTRA-large,hdbscan,umap,0.0685202308843401,0.124137654104527,0.0216641106966775,0.039247795635016,0.0818448120576973,0.0802031392817206 56 | GELECTRA-large,hdbscan,pca+umap,0.0570058537366616,0.133192871612496,0.0205543208434148,0.0167069179709859,0.0637752742196379,0.0830144614433586 57 | GELECTRA-large,dbstream,pca,0.0957672356104231,0.0864211123476051,0.0585879181326365,0.10479185085122,0.060389512217864,0.0781944401467215 58 | GELECTRA-large,dbstream,umap,0.0781451796558462,0.140578476542862,0.0581150858127453,0.151993671575655,0.0867163416866888,0.106302135013381 59 | GELECTRA-large,dbstream,pca+umap,0.0795784065932305,0.140536357474925,0.0580540290268102,0.150899417299781,0.0860368355559608,0.107050532468636 60 | GottBERT,minibatch_kmeans,-,0.0837044646117393,0.34485091823134,0.0934176142975861,0.336570384849751,0.160661859855629,0.194636791866662 61 | GottBERT,minibatch_kmeans,pca,0.068051687111486,0.260148765123045,0.0386391242699837,0.18466914757721,0.0667620484203464,0.0216689797288547 62 | GottBERT,minibatch_kmeans,umap,0.098948086568718,0.384135061654815,0.156570290402716,0.456459404652668,0.190989379873784,0.349863795382876 63 | GottBERT,minibatch_kmeans,pca+umap,0.0996830939741513,0.367756163474323,0.153183411536109,0.419939942912584,0.173617694400565,0.226030592014639 64 | GottBERT,agglomerative,-,0.100965058637155,0.357053102548251,0.109507995279931,0.361436147241082,0.186460317717443,0.239189611017973 65 | GottBERT,agglomerative,pca,0.0671109123925713,0.259060982597514,0.0351029542411723,0.180102166777926,0.0669233668519784,0.0962431574687843 66 | GottBERT,agglomerative,umap,0.0995300270666096,0.37943546066005,0.146403253052514,0.447852339753709,0.196918284264601,0.349092060755467 67 | GottBERT,agglomerative,pca+umap,0.100446731958692,0.374259090061671,0.14835085955226,0.424651830004859,0.172546473294592,0.227011409651262 68 | 
GottBERT,hdbscan,pca,0.0744033072554521,0.143314883231155,0.0477826085962066,0.117618424072631,0.13637429350962,0.161942551115745 69 | GottBERT,hdbscan,umap,0.105805079879835,0.206115307193552,0.0425455018104545,0.272757736302706,0.169065873235678,0.209252753882957 70 | GottBERT,hdbscan,pca+umap,0.109761131518183,0.198171542304016,0.0150279535499423,0.284081039265491,0.131515868624308,0.160698074901665 71 | GottBERT,dbstream,pca,0.0652861036486094,0.248920864808596,0.0429044677961951,0.172668458905421,0.0677674958104433,0.0968130165380901 72 | GottBERT,dbstream,umap,0.0921964865954673,0.363893050252978,0.183648502009858,0.425414468569382,0.199013810541506,0.339471877055011 73 | GottBERT,dbstream,pca+umap,0.102584996618217,0.350788940697117,0.187715019275042,0.40893035812329,0.176815805091496,0.229809023262869 74 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,-,0.126922500216336,0.308182108428217,0.109383325261002,0.235009855126727,0.279809048751092,0.330123183048337 75 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,pca,0.101889909333311,0.241805177060928,0.0597552027128782,0.143614178383154,0.130708685936294,0.115293414189967 76 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,umap,0.139959761599388,0.352253942462203,0.170335032943622,0.419897355645651,0.342321952129765,0.434055874735054 77 | cross-en-de-roberta-sentence-transformer,minibatch_kmeans,pca+umap,0.137707954971196,0.327624828932546,0.131777754994125,0.358844279058356,0.311601921374999,0.390146779328902 78 | cross-en-de-roberta-sentence-transformer,agglomerative,-,0.134736552840553,0.317603548274424,0.116231901966189,0.287051113571006,0.290739229022921,0.359270566865045 79 | cross-en-de-roberta-sentence-transformer,agglomerative,pca,0.101441749756072,0.241938363962351,0.0582658196433544,0.140378409327646,0.129762926565247,0.11380776389892 80 | cross-en-de-roberta-sentence-transformer,agglomerative,umap,0.14238737023567,0.349198972050411,0.164938400738108,0.413533593465502,0.344095484305058,0.432259404463818 81 | cross-en-de-roberta-sentence-transformer,agglomerative,pca+umap,0.138511189632746,0.334581339480389,0.129400903997604,0.364054155686124,0.313701616589672,0.387463111351893 82 | cross-en-de-roberta-sentence-transformer,hdbscan,pca,0.120963497089993,0.168990060915356,0.0300083321630728,0.0434155029160883,0.166376697915542,0.133542021450355 83 | cross-en-de-roberta-sentence-transformer,hdbscan,umap,0.190105444179433,0.174899427159922,0.191232709480606,0.187535783156795,0.323913776293285,0.2397046537872 84 | cross-en-de-roberta-sentence-transformer,hdbscan,pca+umap,0.180613178895982,0.197241556801549,0.167758693775905,0.28666990860315,0.301241449075686,0.19257281426062 85 | cross-en-de-roberta-sentence-transformer,dbstream,pca,0.0883563768649365,0.221087792098191,0.067723480387753,0.151692351164815,0.130229408885868,0.114899146229848 86 | cross-en-de-roberta-sentence-transformer,dbstream,umap,0.14548254652485,0.329144468311907,0.184334253008602,0.385416082187906,0.341173927808671,0.422533044940713 87 | cross-en-de-roberta-sentence-transformer,dbstream,pca+umap,0.141861757546557,0.312588998213157,0.15008447777113,0.34939155383041,0.307636704091922,0.380199089343415 88 | paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,-,0.158123291626306,0.343757380287587,0.220025131753068,0.359641464232561,0.363883363255256,0.484310751591357 89 | 
paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,pca,0.125896791552805,0.273244279623173,0.170173986207635,0.257857889611419,0.202982120072103,0.276100300730128 90 | paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,umap,0.151443842892341,0.355880613392402,0.281772388405197,0.458497179011279,0.36697526415137,0.509440343971668 91 | paraphrase-multilingual-mpnet-base-v2,minibatch_kmeans,pca+umap,0.156507916618269,0.352779308405236,0.267460936265054,0.457806763220476,0.374216472427233,0.0356072494262668 92 | paraphrase-multilingual-mpnet-base-v2,agglomerative,-,0.157046014794122,0.340099873422921,0.231485210022495,0.39278421349265,0.338707270361313,0.469328836409909 93 | paraphrase-multilingual-mpnet-base-v2,agglomerative,pca,0.122447138811386,0.267452330792705,0.164009842079526,0.252186755920444,0.201879465115538,0.275048671029251 94 | paraphrase-multilingual-mpnet-base-v2,agglomerative,umap,0.154301708800618,0.354311913879471,0.283234851207097,0.459723984972906,0.361890505633581,0.513401182650167 95 | paraphrase-multilingual-mpnet-base-v2,agglomerative,pca+umap,0.156173947197582,0.356721558069448,0.266927652058493,0.45604623892775,0.37598851849118,0.515899565007311 96 | paraphrase-multilingual-mpnet-base-v2,hdbscan,pca,0.127035310941449,0.141762226418964,0.0818351026043876,0.0733421112001386,0.135266821942914,0.214931844385903 97 | paraphrase-multilingual-mpnet-base-v2,hdbscan,umap,0.201236395432702,0.274649493406005,0.260983527433073,0.360823558170628,0.3337765900697,0.417602492915373 98 | paraphrase-multilingual-mpnet-base-v2,hdbscan,pca+umap,0.202344394697466,0.262882064874875,0.260146676816762,0.36366036232013,0.336973232402841,0.39034305879441 99 | paraphrase-multilingual-mpnet-base-v2,dbstream,pca,0.0849065598349565,0.22880722218876,0.135534308643385,0.206324391024045,0.191258191065933,0.252781816917832 100 | paraphrase-multilingual-mpnet-base-v2,dbstream,umap,0.166980199469946,0.342111226742814,0.288731030116555,0.423665080926009,0.364034005702961,0.498943886937568 101 | paraphrase-multilingual-mpnet-base-v2,dbstream,pca+umap,0.172956188862994,0.339471333903835,0.283070859375078,0.422958088887883,0.369348873755288,0.499738008422725 102 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,-,0.143270359549276,0.324644806309516,0.222630823956282,0.361341840550912,0.333360860095429,0.44591558586307 103 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,pca,0.113823511279356,0.262694275694776,0.156415191235443,0.22762359752661,0.195508336363258,0.258826217792971 104 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,umap,0.137999493541642,0.341580795289891,0.252211317772711,0.437469491596867,0.326359774702181,0.474564540394354 105 | paraphrase-multilingual-MiniLM-L12-v2,minibatch_kmeans,pca+umap,0.143009218184408,0.336986580271545,0.248742508388581,0.435084335125087,0.336444460531342,0.474902003632741 106 | paraphrase-multilingual-MiniLM-L12-v2,agglomerative,-,0.146536382523302,0.317431549139208,0.213014658008777,0.380760237973582,0.300456488748825,0.428889617479689 107 | paraphrase-multilingual-MiniLM-L12-v2,agglomerative,pca,0.115779352365279,0.256410239991004,0.15032021458738,0.224178693769346,0.195123755282815,0.258506724848499 108 | paraphrase-multilingual-MiniLM-L12-v2,agglomerative,umap,0.139682521581287,0.342734047702612,0.246290466453738,0.442441125052441,0.324053238239874,0.474193210063386 109 | 
paraphrase-multilingual-MiniLM-L12-v2,agglomerative,pca+umap,0.145008875778656,0.340618821131352,0.247826494356137,0.43909161580558,0.339124182809281,0.475158193791564 110 | paraphrase-multilingual-MiniLM-L12-v2,hdbscan,pca,0.106742407287862,0.155525989691692,0.0995948924699086,0.125624599957583,0.188164326875565,0.19045868597509 111 | paraphrase-multilingual-MiniLM-L12-v2,hdbscan,umap,0.198883103430658,0.306704774323358,0.248962542645798,0.355306608281706,0.322269995394452,0.397043077631587 112 | paraphrase-multilingual-MiniLM-L12-v2,hdbscan,pca+umap,0.19673681182096,0.283696920569254,0.245202118373034,0.337511019460207,0.320714118485378,0.362949368319386 113 | paraphrase-multilingual-MiniLM-L12-v2,dbstream,pca,0.0987312265630075,0.243655986909509,0.150418611013133,0.222556064613291,0.197586573793126,0.253777193556048 114 | paraphrase-multilingual-MiniLM-L12-v2,dbstream,umap,0.157660274772465,0.324704812605918,0.264405938038737,0.41305520382289,0.331401225159257,0.464706108218722 115 | paraphrase-multilingual-MiniLM-L12-v2,dbstream,pca+umap,0.163624915072092,0.322838184260603,0.258488811106791,0.408413966317955,0.337677315573374,0.460363639591615 116 | use-cmlm-multilingual,minibatch_kmeans,-,0.152393753822012,0.296268068879049,0.256400687832879,0.371041645369963,0.336201098358395,0.497047679337723 117 | use-cmlm-multilingual,minibatch_kmeans,pca,0.116494579302846,0.238005376737296,0.193574697899891,0.202009001379832,0.165994886403545,0.196308371589102 118 | use-cmlm-multilingual,minibatch_kmeans,umap,0.14985167015025,0.327247085116927,0.32157530808599,0.456417606855682,0.379542193966009,0.546814043563995 119 | use-cmlm-multilingual,minibatch_kmeans,pca+umap,0.15510483255276,0.315655367262148,0.288724564321235,0.471530107598779,0.337027189391226,0.536756135865593 120 | use-cmlm-multilingual,agglomerative,-,0.146547015429232,0.283304677098585,0.231158809846395,0.398723051831382,0.336453169331313,0.483293138708737 121 | use-cmlm-multilingual,agglomerative,pca,0.11609266255309,0.235870451509495,0.188140806042258,0.202159352500591,0.165035923826414,0.194792598883006 122 | use-cmlm-multilingual,agglomerative,umap,0.152976721976694,0.325475611432752,0.316539957617672,0.454053297196607,0.378616757107362,0.550820533259835 123 | use-cmlm-multilingual,agglomerative,pca+umap,0.154833736233735,0.317824528936831,0.279209048894811,0.471933143162109,0.33932457568417,0.541088159859084 124 | use-cmlm-multilingual,hdbscan,pca,0.119097059058545,0.11786982294329,0.103759125713307,0.0918062786214231,0.193351879195506,0.144941464960226 125 | use-cmlm-multilingual,hdbscan,umap,0.174974232413532,0.239241093961519,0.138601805587234,0.325121936727263,0.348007320884774,0.446376397150588 126 | use-cmlm-multilingual,hdbscan,pca+umap,0.177388443591204,0.242643170981103,0.259199234914231,0.332705297333893,0.274873156514591,0.447342824463761 127 | use-cmlm-multilingual,dbstream,pca,0,0,0,0,0.0154841696264087,0 128 | use-cmlm-multilingual,dbstream,umap,0.153740540810348,0.316003890266619,0.312504571317317,0.42524666601542,0.369821053977469,0.53586992704974 129 | use-cmlm-multilingual,dbstream,pca+umap,0.155561936402515,0.308245103579314,0.292175461902841,0.422363422904517,0.335584315848073,0.521365277702398 130 | sentence-t5-base,minibatch_kmeans,-,0.115675300517631,0.305921599517768,0.181057125760787,0.448832789492342,0.319928501316492,0.458018030992708 131 | sentence-t5-base,minibatch_kmeans,pca,0.0915559352199022,0.262675084843275,0.129879358918697,0.349841259278436,0.135629905100003,0.233550909602233 132 | 
sentence-t5-base,minibatch_kmeans,umap,0.130772330756021,0.324953989590045,0.227648303681791,0.44739162490236,0.362103490766273,0.511945757865168 133 | sentence-t5-base,minibatch_kmeans,pca+umap,0.12203323705175,0.311633187208728,0.208373717494624,0.462246733586854,0.335873376538146,0.472643096636938 134 | sentence-t5-base,agglomerative,-,0.12764498347471,0.302068577986526,0.185381996953064,0.43556037724739,0.311204249694432,0.452394731219337 135 | sentence-t5-base,agglomerative,pca,0.0913065210825795,0.259159854414581,0.122216723506138,0.347369601581619,0.13472269401308,0.232663726002279 136 | sentence-t5-base,agglomerative,umap,0.127418551941871,0.323204936462978,0.224144638401092,0.455177461345823,0.365557049191391,0.517477292664523 137 | sentence-t5-base,agglomerative,pca+umap,0.123820570828254,0.312173952185688,0.206866039729986,0.47228531651795,0.338438708778302,0.475773508642492 138 | sentence-t5-base,hdbscan,pca,0.0923035199181703,0.156712135550618,0.0845452428117134,0.174146206497858,0.138170198046753,0.203538467502638 139 | sentence-t5-base,hdbscan,umap,0.179520552043039,0.251778153082897,0.217615526417385,0.356194725742828,0.336027474130198,0.414066493440279 140 | sentence-t5-base,hdbscan,pca+umap,0.17138933736827,0.17239407272415,0.199116397327886,0.37587963127474,0.241371176314441,0.406802441369697 141 | sentence-t5-base,dbstream,pca,0,0,0,0,0,0 142 | sentence-t5-base,dbstream,umap,0.135375043482475,0.31261482838458,0.227727385718491,0.430964861023771,0.358516370578722,0.0312831192054188 143 | sentence-t5-base,dbstream,pca+umap,0.132470514631532,0.304373714095415,0.217329018477362,0.436990434223905,0.330473523993147,0.470442875694425 144 | sentence-t5-large,minibatch_kmeans,-,0.132674364827593,0.353313857300341,0.17264312500235,0.441123059571999,0.353669393966877,0.516390444915677 145 | sentence-t5-large,minibatch_kmeans,pca,0.108177015006833,0.284629725184563,0.091824478080705,0.315681635568616,0.145708485497716,0.257067009080023 146 | sentence-t5-large,minibatch_kmeans,umap,0.143984669694599,0.372837780183789,0.242942125966471,0.457481102325703,0.39403518632197,0.554087132962702 147 | sentence-t5-large,minibatch_kmeans,pca+umap,0.141606930306109,0.367488455588896,0.22584054081795,0.479036853190647,0.366268845186034,0.536285328781896 148 | sentence-t5-large,agglomerative,-,0.141097653445106,0.355376300274668,0.199683300770897,0.439790230192988,0.341605348522922,0.510222005967502 149 | sentence-t5-large,agglomerative,pca,0.107374768508383,0.283706380410588,0.0871744621056125,0.313753178808346,0.143654069788621,0.254513149214071 150 | sentence-t5-large,agglomerative,umap,0.0870710234486703,0.373039424149989,0.240889346643162,0.463332678124837,0.396569004688373,0.563207299963289 151 | sentence-t5-large,agglomerative,pca+umap,0.146214706759965,0.367876820623957,0.217975900531938,0.481872766836543,0.371180890700693,0.541086434437474 152 | sentence-t5-large,hdbscan,pca,0.106529816725044,0.179689828701708,0.0619063851451809,0.107428154931518,0.139674678670881,0.185251335632262 153 | sentence-t5-large,hdbscan,umap,0.190838542453084,0.274027306372176,0.231716132081408,0.378559186655157,0.344199446128254,0.449320319359473 154 | sentence-t5-large,hdbscan,pca+umap,0.180387595658813,0.176266124434699,0.0119189356311079,0.38435805139819,0.255262271093516,0.456274217164703 155 | sentence-t5-large,dbstream,pca,0,0,0,0,0,0 156 | sentence-t5-large,dbstream,umap,0.147369788244745,0.359191888215373,0.249041008925446,0.443325349491755,0.386819925270799,0.540183676275449 157 | 
sentence-t5-large,dbstream,pca+umap,0.148844242241185,0.350740951929464,0.235562611878174,0.448121255485833,0.360092686834314,0.522102675553645 158 | sentence-t5-xxl,minibatch_kmeans,-,0.15935051142789,0.399108862901951,0.196874386472812,0.434348084580018,0.385370978707388,0.558955647217913 159 | sentence-t5-xxl,minibatch_kmeans,pca,0.12570384699927,0.306572596380755,0.103692173864114,0.265053226841241,0.151195306846239,0.255351017507615 160 | sentence-t5-xxl,minibatch_kmeans,umap,0.174986786082349,0.40254267763501,0.315534120586959,0.459345136464197,0.440062162556857,0.589952812356259 161 | sentence-t5-xxl,minibatch_kmeans,pca+umap,0.170605295428964,0.407179982390405,0.275048929797589,0.467889923470059,0.396857453580994,0.591698543713776 162 | sentence-t5-xxl,agglomerative,-,0.158865040708109,0.385456940017055,0.237959927743163,0.442556380964094,0.373688181633057,0.549462971610934 163 | sentence-t5-xxl,agglomerative,pca,0.1253796781399,0.305435768121001,0.0995964032631627,0.262091575644706,0.149967880567176,0.254449947865279 164 | sentence-t5-xxl,agglomerative,umap,0.176141240235063,0.407671753893563,0.308957570419374,0.455741908863803,0.440150360519924,0.594349281058195 165 | sentence-t5-xxl,agglomerative,pca+umap,0.171582233571898,0.402478256574296,0.268773163808219,0.46602323058645,0.398394468821512,0.59181063988529 166 | sentence-t5-xxl,hdbscan,pca,0.112877823175006,0.175668626775163,0.0838045821791226,0.137295083932818,0.142228600698664,0.161371079983652 167 | sentence-t5-xxl,hdbscan,umap,0.183423585223045,0.314729331573393,0.217023269231727,0.382516958853735,0.335810997794907,0.47471979165273 168 | sentence-t5-xxl,hdbscan,pca+umap,0.197929483001782,0.291193111112434,0.229666031091625,0.364343935312214,0.329913032357581,0.487997454866726 169 | sentence-t5-xxl,dbstream,pca,0,0,0,0,0,0 170 | sentence-t5-xxl,dbstream,umap,0.171959893336347,0.384603415332465,0.311092826313863,0.441835249669936,0.427493180639225,0.570723989613225 171 | sentence-t5-xxl,dbstream,pca+umap,0.174645391590583,0.389184230256408,0.280893967439587,0.448626654853136,0.388972898797556,0.563125955132959 172 | XLM-RoBERTa-large,minibatch_kmeans,-,0.0728861946091863,0.298423391336534,0.0616183106143701,0.324574040090756,0.101917006949572,0.235017049723222 173 | XLM-RoBERTa-large,minibatch_kmeans,pca,0.0667658843045096,0.209413384159574,0.0311372569415547,0.150766398997943,0.040436730972003,0.118929367059747 174 | XLM-RoBERTa-large,minibatch_kmeans,umap,0.0864387783942206,0.350404086153387,0.105776865919053,0.456327945759019,0.134101392050016,0.356030791413997 175 | XLM-RoBERTa-large,minibatch_kmeans,pca+umap,0.0881057536722783,0.333455673409919,0.0966469382936431,0.41051126094098,0.115634835753774,0.0378167944039251 176 | XLM-RoBERTa-large,agglomerative,-,0.0854002509620479,0.317779172767462,0.075827172915677,0.34916468777658,0.131361361440265,0.290799046872692 177 | XLM-RoBERTa-large,agglomerative,pca,0.0658881486783091,0.206542571147759,0.031663043023735,0.149419344682624,0.0402264042491285,0.119386004502601 178 | XLM-RoBERTa-large,agglomerative,umap,0.0857636592722182,0.353799629400483,0.103986208208367,0.463823871963038,0.133662609040372,0.354871370123206 179 | XLM-RoBERTa-large,agglomerative,pca+umap,0.0882723295291705,0.330118708964256,0.0948692020431576,0.412716135639609,0.117394056942406,0.299242225288215 180 | XLM-RoBERTa-large,hdbscan,pca,0.0817370344914418,0.0976405115400701,0.0446446838193045,0.100375750191014,0.131276495367633,0.168521129173072 181 | 
XLM-RoBERTa-large,hdbscan,umap,0.106531300179375,0.172668458791544,0.0202942550256425,0.345458130906793,0.155062418947832,0.265007469196041 182 | XLM-RoBERTa-large,hdbscan,pca+umap,0.106109268730964,0.199335101548672,0.0119919301083495,0.312241245253147,0.12165550564456,0.163180565353472 183 | XLM-RoBERTa-large,dbstream,pca,0.0372369629650578,0.0672694846154926,0.0329614375735175,0.0528708808795484,0.0327779205845074,0.0916759930516592 184 | XLM-RoBERTa-large,dbstream,umap,0.0829279984660521,0.334595800525538,0.136469851639152,0.433921143881,0.144932674569599,0.351920884470888 185 | XLM-RoBERTa-large,dbstream,pca+umap,0.091275187106685,0.320110599891455,0.12869199062505,0.402554155784173,0.126074714902633,0.302206666584313 186 | --------------------------------------------------------------------------------