├── tests
│   └── __init__.py
├── diversity
│   ├── utils
│   │   ├── __init__.py
│   │   ├── memoize.py
│   │   └── openai.py
│   ├── patterns
│   │   ├── __init__.py
│   │   ├── Token.py
│   │   └── part_of_speech.py
│   ├── qudsim_modules
│   │   ├── __init__.py
│   │   ├── qudsim_preprocessing
│   │   │   └── number.py
│   │   ├── qudsim_alignment
│   │   │   ├── answer.py
│   │   │   ├── align.py
│   │   │   ├── metric.py
│   │   │   └── similarity.py
│   │   └── qudsim_qud_generation
│   │       ├── decontextualize.py
│   │       ├── qud.py
│   │       ├── segment.py
│   │       └── pipeline.py
│   ├── __init__.py
│   ├── ngram_diversity.py
│   ├── self_repetition.py
│   ├── compression.py
│   ├── embedding.py
│   ├── homogenization.py
│   ├── functions.py
│   ├── template.py
│   ├── compute_all_metrics.py
│   └── qudsim.py
├── .gitignore
├── config.yaml
├── .pre-commit-config.yaml
├── pyproject.toml
├── test-diversity
│   └── hom_tests.py
├── .github
│   └── workflows
│       └── publish.yaml
├── examples
│   └── summarization.py
├── scripts
│   └── timing.py
├── LICENSE.md
└── README.md

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/diversity/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .memoize import memoized
2 | from .openai import GPT
--------------------------------------------------------------------------------
/diversity/patterns/__init__.py:
--------------------------------------------------------------------------------
1 | from .Token import token_patterns
2 | from .part_of_speech import get_pos, pos_patterns
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | .DS_Store
3 | diversity/__pycache__/
4 | diversity/patterns/__pycache__/
5 | test.ipynb
6 | __pycache__
7 | .nova/
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | qg_gpt_model: "gpt-4o-2024-08-06"
2 | qa_gpt_model: "gpt-4o-2024-08-06"
3 | level: 1
4 | temperature: 1
5 | threshold: 0.20
6 | max_tries: 3
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 |   - repo: https://github.com/pre-commit/pre-commit-hooks
5 |     rev: v4.0.1
6 |     hooks:
7 |       - id: check-toml
8 |       - id: check-yaml
9 |       - id: end-of-file-fixer
10 |       - id: mixed-line-ending
--------------------------------------------------------------------------------
/diversity/qudsim_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .qudsim_preprocessing.number import number_text
2 | from .qudsim_qud_generation.segment import segment
3 | from .qudsim_qud_generation.decontextualize import decontextualize
4 | from .qudsim_qud_generation.qud import generate_quds
5 | from .qudsim_qud_generation.pipeline import get_quds
6 | from .qudsim_alignment.metric import FrequencyBasedSimilarity
7 | from .qudsim_alignment.similarity import get_harmonic_similarity
8 | from .qudsim_alignment.align import align
9 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
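# Illustrative quick start for this package (a sketch; assumes the package is installed,
# e.g. `pip install diversity` or `poetry install` from this repository):
#
#   >>> from diversity import compression_ratio, ngram_diversity_score
#   >>> docs = ["the cat sat on the mat", "the cat sat on the rug"]
#   >>> compression_ratio(docs, 'gzip')       # original size / gzip-compressed size
#   >>> ngram_diversity_score(docs, num_n=2)  # unique/total n-grams, summed over n = 1..2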
1 | [tool.poetry]
2 | name = "diversity"
3 | version = "0.3.0"
4 | description = ""
5 | authors = ["Chantal Shaib "]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = ">=3.10,<3.13"
10 | typer = ">=0.9.0"
11 | nltk = "^3.8.1"
12 | rouge-score = "^0.1.2"
13 | evaluate = "^0.4.1"
14 | sentence-transformers = "^5.1.0"
15 | transformers = "^4.41.0"
16 | 
17 | [tool.poetry.group.dev.dependencies]
18 | pre-commit = "^3.4.0"
19 | pandas = "^2.1.2"
20 | numpy = "^1.26.1"
21 | 
22 | [build-system]
23 | requires = ["poetry-core"]
24 | build-backend = "poetry.core.masonry.api"
25 | 
--------------------------------------------------------------------------------
/diversity/__init__.py:
--------------------------------------------------------------------------------
1 | from .compression import compression_ratio
2 | from .patterns.Token import token_patterns
3 | from .patterns.part_of_speech import pos_patterns, get_pos
4 | from .utils.memoize import memoized
5 | from .homogenization import homogenization_score
6 | from .ngram_diversity import ngram_diversity_score
7 | from .functions import extract_patterns, match_patterns
8 | from .self_repetition import self_repetition_score
9 | from .template import template_rate, templates_per_token
10 | from .qudsim import qudsim
11 | from .embedding import remote_clique, chamfer_dist
12 | from .compute_all_metrics import compute_all_metrics
--------------------------------------------------------------------------------
/diversity/ngram_diversity.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import nltk
3 | 
4 | def ngram_diversity_score(
5 |     data: List[str],
6 |     num_n: int = 4,
7 | ) -> float:
8 |     """ Calculates corpus-level n-gram diversity based on unique n-grams
9 |     (e.g., https://arxiv.org/pdf/2202.00666.pdf).
10 | 
11 |     Args:
12 |         data (List[str]): List of documents.
13 |         num_n (int): Max n-gram size to test up to. Defaults to 4.
14 | 
15 |     Returns:
16 |         float: n-gram diversity score.
17 |     """
18 |     score = 0
19 |     data = ' '.join(data).split(' ') # format to list of words
20 | 
21 |     for i in range(1, num_n + 1):
22 |         ngrams = list(nltk.ngrams(data, i))
23 |         # num unique ngrams / all ngrams for each size n
24 |         score += len(set(ngrams)) / len(ngrams)
25 | 
26 |     return round(score, 3)
--------------------------------------------------------------------------------
/diversity/patterns/Token.py:
--------------------------------------------------------------------------------
1 | 
2 | from typing import List, Tuple
3 | import nltk
4 | 
5 | 
6 | def token_patterns(
7 |     data: List[str],
8 |     n: int,
9 |     top_n: int = 10
10 | ) -> List[Tuple[str, int]]:
11 |     """ Finds n-gram patterns in the data.
12 | 
13 |     Args:
14 |         data (List[str]): Data to run frequency counts on.
15 |         n (int): N-gram length.
16 |         top_n (int, optional): Top patterns to display. Defaults to 10.
17 | 
18 |     Returns:
19 |         List[Tuple[str, int]]: Sorted list of top n-gram patterns.
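    Example (illustrative; ties in frequency keep first-occurrence order):
        >>> token_patterns(["to be or not to be", "to be is to do"], n=2, top_n=3)
        [('to be', 3), ('be or', 1), ('or not', 1)]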
20 | """ 21 | 22 | # treat data as one string 23 | all_data = ' '.join(data) 24 | 25 | ngrams = list(nltk.ngrams(all_data.split(' '), n)) 26 | frequency = nltk.FreqDist(ngrams) 27 | 28 | sorted_frequency = sorted(frequency.items(), key=lambda kv: kv[1], reverse=True)[:top_n] 29 | 30 | sorted_frequency = [(' '.join(x[0]), x[1]) for x in sorted_frequency] 31 | 32 | return sorted_frequency 33 | -------------------------------------------------------------------------------- /diversity/utils/memoize.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import functools 3 | 4 | class memoized(object): 5 | ''' 6 | Decorator. Caches a function's return value each time it is called. 7 | If called later with the same arguments, the cached value is returned 8 | (not re-evaluated). 9 | From https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize. 10 | ''' 11 | 12 | def __init__(self, func): 13 | self.func = func 14 | self.cache = {} 15 | 16 | def __call__(self, *args): 17 | if not isinstance(args, collections.abc.Hashable): 18 | # better to not cache than blow up. 19 | return self.func(*args) 20 | if args in self.cache: 21 | return self.cache[args] 22 | else: 23 | value = self.func(*args) 24 | self.cache[args] = value 25 | return value 26 | 27 | def __repr__(self): 28 | '''Return the function's docstring.''' 29 | return self.func.__doc__ 30 | 31 | def __get__(self, obj, objtype): 32 | '''Support instance methods.''' 33 | return functools.partial(self.__call__, obj) 34 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_preprocessing/number.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.download('punkt_tab') 3 | from nltk.tokenize import sent_tokenize 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def _tokenize_sentences(new_text): 9 | sentences = sent_tokenize(new_text) 10 | number_sentence_dict = {key:val for key, val in zip(range(1, len(sentences)+1), sentences)} 11 | return number_sentence_dict 12 | 13 | def number_text(text: str): 14 | """ Pre-processes and numbers each sentence in the document. 15 | 16 | Args: 17 | text (str): document 18 | 19 | Returns: 20 | str: document with sentence numbers prepending each sentence 21 | dict: mapping between sentence number and its corresponding sentence 22 | """ 23 | new_text = text.replace("“", "\'") 24 | new_text = new_text.replace("”", "\'") 25 | try: 26 | number_sentence_dict = _tokenize_sentences(new_text) 27 | except Exception as e: 28 | logger.error(e) 29 | return None, None 30 | 31 | 32 | numbered_text = "" 33 | for key in number_sentence_dict: 34 | numbered_text += "[%d] " % key 35 | numbered_text += number_sentence_dict[key] 36 | 37 | return numbered_text, number_sentence_dict -------------------------------------------------------------------------------- /test-diversity/hom_tests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Times and tests BERTScore, ROUGE and BLEU score computation over CNN-DailyMail 3 | summaries. 
4 | ''' 5 | 6 | from datasets import load_dataset 7 | from homogenization import homogenization_score 8 | from time import perf_counter as pc 9 | from numpy import round 10 | 11 | if __name__ == '__main__': 12 | data = load_dataset("argilla/cnn-dailymail-summaries")["train"].to_pandas().highlights.sample(500, random_state=1).values.tolist() 13 | 14 | start = pc() 15 | bs = homogenization_score(data, measure='bertscore', verbose=True, model="distilbert-base-uncased") 16 | end = pc() 17 | 18 | print(f"Time taken to compute BERTScore over CNN summaries: {round(end-start,2)}\nBERTScore over 500 reference summaries: {bs}") 19 | 20 | start = pc() 21 | rl = homogenization_score(data, measure='rougel', verbose=True) 22 | end = pc() 23 | 24 | print(f"Time taken to compute Rouge over CNN summaries: {round(end-start,2)} secs\nRouge over 500 reference summaries: {rl}") 25 | 26 | start = pc() 27 | rl = homogenization_score(data, measure='bleu', verbose=True) 28 | end = pc() 29 | 30 | print(f"Time taken to compute BLEU over CNN summaries: {round(end-start,2)} secs\nBLEU over 500 reference summaries: {rl}") 31 | -------------------------------------------------------------------------------- /diversity/utils/openai.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from pydantic import BaseModel 3 | import os 4 | 5 | class GPT(): 6 | ''' 7 | Wrapper. Helps instantiate and make requests to openai clients. 8 | ''' 9 | def __init__(self, gpt_model, key): 10 | self.model = gpt_model 11 | openai.api_key = key 12 | self.client = openai.OpenAI() 13 | 14 | def call(self, prompt: str, system_prompt: str): 15 | response = self.client.chat.completions.create( 16 | model=self.model, 17 | messages=[ 18 | {"role": "system", "content": system_prompt}, 19 | {"role": "user", "content": prompt} 20 | ] 21 | ) 22 | msg = response.choices[0].message 23 | return msg.content 24 | 25 | 26 | def call_gpt_format(self, prompt: str, system_prompt: str, format): 27 | try: 28 | completion = self.client.beta.chat.completions.parse( 29 | model=self.model, 30 | messages=[ 31 | {"role": "system", "content": system_prompt}, 32 | {"role": "user", "content":prompt} 33 | ], 34 | response_format=format, 35 | ) 36 | 37 | answer = completion.choices[0].message.parsed 38 | return answer 39 | except: 40 | raise TypeError 41 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.11' 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install build twine 23 | 24 | - name: Build package 25 | run: python -m build 26 | 27 | - name: Check package with twine 28 | run: twine check dist/* 29 | 30 | - name: Upload artifacts 31 | uses: actions/upload-artifact@v4 32 | with: 33 | name: python-package-distributions 34 | path: dist/ 35 | 36 | publish-to-pypi: 37 | name: Publish to PyPI 38 | needs: build 39 | runs-on: ubuntu-latest 40 | environment: 41 | name: pypi 42 | url: https://pypi.org/p/diversity 43 | permissions: 44 | id-token: write 45 | 46 | steps: 47 | - name: Download artifacts 48 | uses: actions/download-artifact@v4 
49 | with: 50 | name: python-package-distributions 51 | path: dist/ 52 | 53 | - name: Publish to PyPI 54 | uses: pypa/gh-action-pypi-publish@release/v1 55 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/answer.py: -------------------------------------------------------------------------------- 1 | from ...utils import openai 2 | 3 | system_prompt = "You are an expert reading comprehension agent. You will be given a passage with numbered sentences and a series of questions. For each question, your task is to extract all sentences that directly help answer it. You must return the question and a list of sentence numbers and sentences that answer it. The question may not always be answerable. In that case, return an empty list. Do NOT overgenerate. Do not modify the original text." 4 | 5 | class Answer(openai.BaseModel): 6 | question: str 7 | sentence_nums: list[int] 8 | sentences: list[str] 9 | 10 | class Response(openai.BaseModel): 11 | excerpts: list[Answer] 12 | 13 | 14 | def get_answer(gpt_model, numbered_segments: str, qud_list: str, num_quds: int, num_target_sentences: int, max_tries: int): 15 | for i in range(max_tries): 16 | try: 17 | prompt = "Passage: %s\nQuestions:\n%s" % (numbered_segments, qud_list) 18 | answer = gpt_model.call_gpt_format(prompt, system_prompt, Response) 19 | 20 | if len(answer.excerpts)!=num_quds: 21 | continue 22 | 23 | for ans in answer.excerpts: 24 | for sentence in ans.sentence_nums: 25 | if sentence<0 or sentence>num_target_sentences: 26 | continue 27 | 28 | return answer.model_dump_json() 29 | except Exception as e: 30 | print(e) 31 | continue 32 | 33 | return None -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_qud_generation/decontextualize.py: -------------------------------------------------------------------------------- 1 | from ...utils import openai 2 | 3 | system_prompt = "You will be given several numbered paragraphs. Decontextualize each paragraph such that the paragraph's general plot is captured. Names, places, extraneous details and descriptive language should all be abstracted away." 
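# A minimal usage sketch of the GPT wrapper + pydantic response formats that the qudsim
# modules rely on (illustration only, nothing in this module runs it; the model name is
# taken from config.yaml and the environment-variable key handling is an assumption):
#
#     import os
#     from pydantic import BaseModel
#     from diversity.utils.openai import GPT
#
#     class Blurb(BaseModel):
#         summary: str
#
#     gpt = GPT("gpt-4o-2024-08-06", os.environ["OPENAI_API_KEY"])
#     parsed = gpt.call_gpt_format("Summarize: ...", "You are a concise summarizer.", Blurb)
#     print(parsed.summary)  # structured output; free-text calls go through gpt.call(...)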
4 | 
5 | class Paragraph(openai.BaseModel):
6 |     para_num: int
7 |     para: str
8 | 
9 | class Answer(openai.BaseModel):
10 |     decontextualized_paragraphs: list[Paragraph]
11 | 
12 | def decontextualize(gpt_model: openai.GPT, text: str, num_segments: int, max_tries: int):
13 |     """Extracts and removes entities from segments
14 | 
15 |     Args:
16 |         gpt_model: an OpenAI client
17 |         text (str): string concatenation of segments (default delimiter is two newlines)
18 |         num_segments (int): number of segments being entity-extracted
19 |         max_tries (int): maximum number of attempts the client can make in case of failure
20 | 
21 |     Returns:
22 |         Answer: decontextualized segments wrapped in an openai.BaseModel class
23 |     """
24 | 
25 |     for i in range(max_tries):
26 |         try:
27 |             decontextualized = gpt_model.call_gpt_format(text, system_prompt, Answer)
28 | 
29 |             if not decontextualized:
30 |                 continue
31 | 
32 |             if len(decontextualized.decontextualized_paragraphs)!=num_segments:
33 |                 continue
34 | 
35 |             return decontextualized
36 |         except Exception as e:
37 |             continue
38 | 
39 |     return None
40 | 
41 | 
42 | 
--------------------------------------------------------------------------------
/diversity/qudsim_modules/qudsim_qud_generation/qud.py:
--------------------------------------------------------------------------------
1 | from ...utils import openai
2 | 
3 | system_prompt_high_level = "You will be given a paragraph. We are interested in forming unique, high-level, abstract QUDs with minimal details such that when they are answered, we understand the main themes of the paragraph. Details specific to the content should be omitted. QUDs should look like: What were the individual's greatest accomplishments? What legacy did the individual leave behind? First answer the minimum number of QUD(s) required. Then list the QUDs. Do not use conjunctions in the QUDs."
4 | 
5 | class QUD(openai.BaseModel):
6 |     qud: str
7 | 
8 | class Answer(openai.BaseModel):
9 |     num_quds: int
10 |     quds: list[QUD]
11 | 
12 | def generate_quds(gpt_model: openai.GPT, segment: str, max_tries: int):
13 |     """Generate Question(s) Under Discussion for a given segment
14 | 
15 |     Args:
16 |         gpt_model: an OpenAI client
17 |         segment (str): the segment for which QUDs need to be generated (QUDs are answered by the segment)
18 |         max_tries (int): maximum number of attempts the client can make in case of failure
19 | 
20 |     Returns:
21 |         Answer: QUDs wrapped in an openai.BaseModel class
22 |     """
23 |     for i in range(max_tries):
24 |         try:
25 |             quds = gpt_model.call_gpt_format(segment, system_prompt_high_level, Answer)
26 | 
27 |             if not quds or len(quds.quds)==0:
28 |                 continue
29 | 
30 |             return quds
31 |         except Exception as e:
32 |             continue
33 | 
34 |     return None
35 | 
--------------------------------------------------------------------------------
/diversity/self_repetition.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from nltk.util import ngrams
3 | 
4 | from tqdm import tqdm
5 | from typing import List
6 | from collections import Counter
7 | 
8 | def self_repetition_score(
9 |     dataset: List[str],
10 |     n: int = 4,
11 |     verbose: bool = True
12 | ) -> float:
13 |     """
14 |     Calculates a self-repetition score for a dataset based on the
15 |     repetition of ngrams within the corpus.
16 | 
17 |     Args:
18 |         dataset (List[str]): A list of documents (strings) to analyze.
19 |         n (int): Size of the ngrams to check for repetition. Defaults to 4.
20 |         verbose (bool): Whether to show a progress bar. Defaults to True.
21 | 
22 |     Returns:
23 |         float: The self-repetition score, averaged over the dataset.
24 |     """
25 |     total_sum = 0
26 | 
27 |     # Get all unique ngrams per doc
28 |     ngram_docs = [list(set([' '.join(ngram) for ngram in ngrams(doc.split(), n)])) for doc in dataset]
29 | 
30 |     # Count occurrences of unique ngrams across whole dataset
31 |     all_ngrams = sum(ngram_docs, [])
32 |     ngram_counts = Counter(all_ngrams)
33 | 
34 |     for ngram_doc in tqdm(ngram_docs, desc="Calculating self-repetition score", disable=(not verbose)):
35 |         # Find the total occurrence of an n-gram and subtract current doc's n-gram count
36 |         # to get the count of occurrences of an n-gram in other docs
37 |         sum_ni = sum([ngram_counts[ngram] for ngram in ngram_doc]) - len(ngram_doc)
38 | 
39 |         # add-one to avoid zero error
40 |         total_sum += np.log(sum_ni + 1)
41 |     return total_sum / len(dataset)
--------------------------------------------------------------------------------
/diversity/compression.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from pathlib import Path
3 | 
4 | import tempfile
5 | import gzip
6 | import os
7 | import lzma as xz
8 | 
9 | 
10 | def compression_ratio(
11 |     data: List[str],
12 |     algorithm: str = 'gzip',
13 |     verbose: bool = False,
14 |     path: Optional[str] = None
15 | ) -> float:
16 |     """ Calculates the compression ratio for a collection of text.
17 |     Args:
18 |         data (List[str]): Strings to compress.
19 |         algorithm (str, optional): Either 'gzip' or 'xz'. Defaults to 'gzip'.
20 |         verbose (bool, optional): Print out the original and compressed size separately. Defaults to False.
21 |         path (str, optional): Path to store temporarily zipped files. Defaults to a temporary directory.
22 |     Returns:
23 |         float: Compression ratio (original size / compressed size)
24 |     """
25 | 
26 |     temp_dir = None
27 |     if not path:
28 |         temp_dir = tempfile.TemporaryDirectory()
29 |         path = Path(temp_dir.name)
30 |     else:
31 |         path = Path(path)
32 | 
33 |     with (path / 'original.txt').open('w+') as f:
34 |         f.write(' '.join(data))
35 | 
36 |     original_size = os.path.getsize(os.path.join(path, "original.txt"))
37 | 
38 |     if algorithm == 'gzip':
39 | 
40 |         with gzip.GzipFile(str(path / 'compressed.gz'), 'w+') as f:
41 |             f.write(gzip.compress(' '.join(data).encode('utf-8')))
42 | 
43 |         compressed_size = os.path.getsize(os.path.join(path, "compressed.gz"))
44 | 
45 |     elif algorithm == 'xz':
46 | 
47 |         with xz.open(str(path / 'compressed.gz'), 'wb') as f:
48 |             f.write(' '.join(data).encode('utf-8'))
49 | 
50 |         compressed_size = (path / "compressed.gz").stat().st_size
51 | 
52 |     if verbose:
53 |         print(f"Original Size: {original_size}\nCompressed Size: {compressed_size}")
54 | 
55 |     if temp_dir:
56 |         temp_dir.cleanup()
57 | 
58 |     return round(original_size / compressed_size, 3)
59 | 
60 | 
--------------------------------------------------------------------------------
/examples/summarization.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from diversity import get_pos, pos_patterns, token_patterns, compression_ratio
3 | from transformers import pipeline
4 | from datasets import load_dataset
5 | 
6 | import typer
7 | 
8 | app = typer.Typer()
9 | 
10 | @app.command()
11 | def summarization(
12 |     dataset: str,
13 |     column: str,
14 |     split: str,
15 |     model: str = "t5-base",
16 |     tokenizer: Optional[str] = None,
17 |     ngram: Optional[int] = 5
18 | ):
19 |     tokenizer = tokenizer or model
20 | 
21 |     summarizer = 
pipeline( 22 | "summarization", 23 | model=model, 24 | tokenizer=tokenizer, 25 | return_text=True) 26 | 27 | # load dataset (either custom CSV or dataset from HF) 28 | if dataset.endswith('.csv'): 29 | data = load_dataset("csv", data_files=dataset)[split][:10][column] 30 | else: 31 | data = load_dataset(dataset) 32 | data = data[split][:10][column] 33 | 34 | # generate the summaries 35 | outputs = summarizer(data) 36 | outputs = [instance['summary_text'] for instance in outputs] 37 | 38 | # get the token-level patterns 39 | patterns_token = token_patterns(outputs, ngram) 40 | 41 | # get the POS patterns 42 | joined_pos, tuples = get_pos(outputs) 43 | ngrams_pos = token_patterns(joined_pos, ngram) 44 | 45 | # for the top n-gram patterns, cycle through and get the matching text 46 | text_matches = {} 47 | 48 | for pattern, _ in ngrams_pos: 49 | text_matches['pattern'] = pos_patterns(tuples, pattern) 50 | 51 | # get the compression score 52 | compression = compression_ratio(outputs, 'gzip') 53 | 54 | # TODO: function to nicely display results 55 | print(patterns_token) 56 | print(text_matches) 57 | print(compression) 58 | 59 | # TODO: compare between two models 60 | return 61 | 62 | 63 | def display_results(): 64 | pass 65 | 66 | 67 | if __name__ == "__main__": 68 | app() 69 | -------------------------------------------------------------------------------- /scripts/timing.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from diversity import compression_ratio, homogenization_score, ngram_diversity_score 6 | 7 | # Replace this with the actual path where your datasets are located 8 | dataset_path = "synthetic_datasets/" 9 | 10 | # The number of times to repeat each experiment 11 | num_experiments = 10 12 | 13 | # Initialize results dictionary 14 | results = { 15 | 'Dataset': [], 16 | 'Function': [], 17 | 'Mean Time': [], 18 | 'Std Dev Time': [], 19 | } 20 | 21 | # Define the timing function 22 | def time_function(func, *args): 23 | times = [] 24 | for _ in range(num_experiments): 25 | start_time = time.time() 26 | func(*args) 27 | end_time = time.time() 28 | times.append(end_time - start_time) 29 | return np.mean(times), np.std(times) 30 | 31 | # Iterate over datasets and functions 32 | for i in range(1, 6): 33 | dataset_filename = f"dataset_{i}.txt" 34 | dataset = [x.strip() for x in open(dataset_path + dataset_filename).read().split("\n")][:10] 35 | 36 | # Measure compression_ratio 37 | cr_mean, cr_std = time_function(compression_ratio, dataset, 'gzip') 38 | results['Dataset'].append(i) 39 | results['Function'].append('compression_ratio') 40 | results['Mean Time'].append(cr_mean) 41 | results['Std Dev Time'].append(cr_std) 42 | 43 | # Measure homogenization_score with rougel 44 | hs_mean, hs_std = time_function(homogenization_score, dataset, 'rougel') 45 | results['Dataset'].append(i) 46 | results['Function'].append('homogenization_score_rougel') 47 | results['Mean Time'].append(hs_mean) 48 | results['Std Dev Time'].append(hs_std) 49 | 50 | # Measure ngram_diversity_score 51 | nds_mean, nds_std = time_function(ngram_diversity_score, dataset, 4) 52 | results['Dataset'].append(i) 53 | results['Function'].append('ngram_diversity_score') 54 | results['Mean Time'].append(nds_mean) 55 | results['Std Dev Time'].append(nds_std) 56 | 57 | # Convert results to DataFrame and save as CSV 58 | results_df = pd.DataFrame(results) 59 | results_csv_path = dataset_path + 'timing_experiments_results.csv' 
60 | results_df.to_csv(results_csv_path, index=False) 61 | 62 | print(f"Results saved to {results_csv_path}") 63 | -------------------------------------------------------------------------------- /diversity/patterns/part_of_speech.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Any, Set 2 | 3 | import spacy 4 | 5 | def get_pos( 6 | data: List[str] 7 | ) -> Tuple[List[str], List[Tuple[str, str]]]: 8 | """ Turns a sequence into parts of speech. 9 | 10 | Args: 11 | data (List[str]): Data to tranform into part of speech tags. 12 | 13 | Returns: 14 | Tuple[List[str], List[Tuple[str, str]]]: Part-of-speech tags only, tuple of (token, part-of-speech tag). 15 | """ 16 | nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"]) 17 | 18 | pos_tuples = [] 19 | joined_pos = [] 20 | joined_text = [] 21 | 22 | docs = nlp.pipe(data, n_process=4, batch_size=1000) 23 | 24 | for doc in docs: 25 | joined_text.append(' '.join([token.text for token in doc])) 26 | joined_pos.append(' '.join([token.tag_ for token in doc])) 27 | pos_tuples.append([(token.text, token.tag_) for token in doc]) 28 | 29 | return joined_pos, pos_tuples 30 | 31 | 32 | def _find_sub_list( 33 | sl: List[Any], 34 | l: List[Any] 35 | ) -> List[Any]: 36 | """ Given a pattern and a list of strings, returns sublists matching the pattern. """ 37 | 38 | results = [] 39 | sll = len(sl) 40 | for ind in (i for i,e in enumerate(l) if e==sl[0]): 41 | if l[ind:ind+sll]==sl: 42 | results.append((ind, ind+sll-1)) 43 | 44 | return results 45 | 46 | 47 | def pos_patterns( 48 | text: List[List[Tuple[str, str]]], 49 | pattern: str 50 | ) -> Set[str]: 51 | """ Finds all substrings matching a part of speech pattern. 52 | 53 | Args: 54 | text (List[List[Tuple[str, str]]]): Text containing words and part-of-speech tags. 55 | pattern (str): Part-of-speech tag pattern to search for. 56 | 57 | Returns: 58 | Set[str]: Returns all the string matching the pattern. 
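    Example (illustrative; exact tags depend on the spaCy model, cf. `extract_patterns`):
        >>> joined_pos, tuples = get_pos(["The quick brown fox jumps over the lazy dog."])
        >>> pos_patterns(tuples, "DT JJ NN NN")
        {'The quick brown fox'}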
59 | """ 60 | 61 | pos = [] 62 | word = [] 63 | 64 | # text is a list of lists of tuples (word, part of speech) 65 | for doc in text: 66 | pos.append([i[1] for i in doc]) 67 | word.append([i[0] for i in doc]) 68 | 69 | pos = [' '.join(x) for x in pos] 70 | word = [' '.join(x) for x in word] 71 | 72 | all_matches = [] 73 | 74 | # return positions of each tag and the corresponding tokens 75 | for w, p in zip(word, pos): 76 | 77 | test = _find_sub_list(pattern.split(), p.split()) 78 | 79 | if test: 80 | for occ in test: 81 | splits = w.split()[int(occ[0]):int(occ[1]+1)] 82 | all_matches.append(" ".join(splits)) 83 | 84 | return set(all_matches) 85 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/align.py: -------------------------------------------------------------------------------- 1 | from .answer import get_answer 2 | from .similarity import get_harmonic_similarity 3 | import numpy as np 4 | 5 | def align(gpt_model, 6 | source_quds, 7 | target_text, 8 | target_quds, 9 | source_text, 10 | num_source_segments, 11 | num_target_segments, 12 | source_segment_qud_dict, 13 | target_segment_qud_dict, 14 | source_segments, 15 | target_segments, 16 | num_source_sentences, 17 | num_target_sentences, 18 | threshold, 19 | max_tries): 20 | 21 | # get [A_q for q in source_quds] 22 | quds = [] 23 | for q_group in source_quds: 24 | for q in eval(q_group)['quds']: 25 | quds.append(q['qud']) 26 | 27 | source_qud_list = "\n\n".join(quds) 28 | source_qud_answers = get_answer(gpt_model, target_text, source_qud_list, len(quds), num_target_sentences, max_tries) 29 | 30 | if source_qud_answers is None: 31 | print("Finding an answer given source QUDs and target document was unsuccessful") 32 | return None, None, [], [] 33 | else: 34 | source_qud_answers = eval(source_qud_answers) 35 | 36 | # get [A_q for q in target_quds] 37 | quds = [] 38 | for q_group in target_quds: 39 | for q in eval(q_group)['quds']: 40 | quds.append(q['qud']) 41 | 42 | target_qud_list = "\n\n".join(quds) 43 | target_qud_answers = get_answer(gpt_model, source_text, target_qud_list, len(quds), num_source_sentences, max_tries) 44 | 45 | if target_qud_answers is None: 46 | print("Finding an answer given target QUDs and source document was unsuccessful") 47 | return None, None, [], [] 48 | else: 49 | target_qud_answers = eval(target_qud_answers) 50 | 51 | # get harmonic similarity 52 | harmonic_mean_scores = get_harmonic_similarity(num_target_segments, 53 | num_source_segments, 54 | source_qud_answers, 55 | source_segment_qud_dict, 56 | target_segments, 57 | target_qud_answers, 58 | target_segment_qud_dict, 59 | source_segments) 60 | 61 | aligned_segments = np.where(np.array(harmonic_mean_scores) float: 21 | """ 22 | Calculates the remote clique score for a set of documents (corpus-level). 23 | This is the average mean pairwise distance of a data instance to other instances. 24 | Args: 25 | data (List[str]): Strings to score. 26 | model(str, optional): Model to use for embedding. Defaults to 'Qwen/Qwen3-Embedding-0.6B'. 27 | verbose(bool, optional): Whether to display progress bar. Defaults to True. 28 | batch_size(int, optional): Batch size for embedding. Defaults to 64. 29 | Returns: 30 | float: Remote clique score. 
31 | """ 32 | model = SentenceTransformer(model) 33 | embeddings = model.encode(data, batch_size=batch_size, show_progress_bar=verbose) 34 | distances = cosine_distances(embeddings) 35 | mean_distances = np.mean(distances, axis=1) 36 | return np.mean(mean_distances).round(3) 37 | 38 | 39 | def chamfer_dist( 40 | data: List[str], 41 | model: Optional[str] = 'Qwen/Qwen3-Embedding-0.6B', 42 | verbose: Optional[bool] = True, 43 | batch_size: Optional[int] = 64 44 | ) -> float: 45 | """ 46 | Calculates the chamfer distance for a set of documents (corpus-level). 47 | This is the average minimum pairwise distance of a data instance to other instances. 48 | Args: 49 | data (List[str]): Strings to score. 50 | model(str, optional): Model to use for embedding. Defaults to 'Qwen/Qwen3-Embedding-0.6B'. 51 | verbose(bool, optional): Whether to display progress bar. Defaults to True. 52 | batch_size(int, optional): Batch size for embedding. Defaults to 64. 53 | Returns: 54 | float: Chamfer distance. 55 | """ 56 | model = SentenceTransformer(model) 57 | embeddings = model.encode(data, batch_size=batch_size, show_progress_bar=verbose) 58 | distances = cosine_distances(embeddings) 59 | min_distances = np.min(distances + np.eye(len(distances)) * 1e9, axis=1) 60 | return np.mean(min_distances).round(3) 61 | -------------------------------------------------------------------------------- /diversity/homogenization.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from tqdm import tqdm 3 | from rouge_score import rouge_scorer 4 | from evaluate import load 5 | 6 | def homogenization_score( 7 | data: List[str], 8 | measure: str = 'rougel', 9 | use_stemmer: Optional[str] = False, 10 | model: Optional[str] = "microsoft/deberta-base-mnli", 11 | verbose: Optional[bool] = True, 12 | batch_size: Optional[int] = 64 13 | ) -> float: 14 | """ 15 | Calculates the homogenization score for a set of documents (corpus-level). 16 | From https://arxiv.org/pdf/2309.05196.pdf 17 | Args: 18 | data (List[str]): Strings to score. 19 | measure (str, optional): Either 'rougel', 'bertscore', or 'bleu'. Defaults to 'rougel'. 20 | use_stemmer(str, optional): Whether to use stemming in the ROUGE-L calculation. Defaults to False. 21 | model(str, optional): Model to use for BERTScore. Defaults to 'microsoft/deberta-base-mnli'. 22 | verbose(bool, optional): Whether to display progress bar. Defaults to True. 23 | Returns: 24 | float: Homogenization score. 
25 | """ 26 | 27 | if measure == 'rougel': 28 | scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=use_stemmer) 29 | elif measure == 'bertscore': 30 | scorer = load("bertscore") 31 | elif measure == 'bleu': 32 | scorer = load("bleu") 33 | else: 34 | raise ValueError("Scoring measure must be one of `rougel`, `bleu`, or `bertscore`.") 35 | 36 | corpus_score = 0 37 | 38 | if verbose: 39 | print('==> Scoring all pairs') 40 | 41 | for i, ref in tqdm(enumerate(data), total=len(data), disable=(not verbose)): 42 | # Get all the other utterances to compare against a specific utterance 43 | preds = [x for j,x in enumerate(data) if j!=i] 44 | refs = [ref for _ in range(len(preds))] 45 | 46 | # Get scores over whole batch and sum it up 47 | if measure=='rougel': 48 | doc_score = sum([scorer.score(pred, ref)['rougeL'].fmeasure for pred in preds]) 49 | elif measure=='bertscore': 50 | doc_score = sum(scorer.compute(predictions=preds, 51 | references=refs, 52 | model_type=model, 53 | batch_size=batch_size)['f1']) 54 | elif measure=='bleu': 55 | # Need to double check that this is right 56 | doc_score = scorer.compute(predictions=preds, 57 | references=[[r] for r in refs])['bleu'] 58 | # Then average 59 | corpus_score += doc_score / (len(data) - 1) 60 | 61 | # case where all strings are the exact same in the list 62 | if corpus_score == 0: 63 | corpus_score += len(data) 64 | 65 | # returns corpus level homogenization score 66 | return round(corpus_score/len(data), 3) 67 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_qud_generation/segment.py: -------------------------------------------------------------------------------- 1 | 2 | from ...utils import openai 3 | 4 | system_prompt = "You will be given text with numbered sentences and your task is to redraw the paragraph boundaries such that each chunk is about one atomic topic. Each segment cannot be about multiple topics or about a complex topic. You may not change the text or change the order of the sentences. For each segment, provide the list of sentence numbers that belong to that segment." 
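# Illustrative input/output sketch for segment() below (hypothetical data, nothing here is
# produced or required by this module):
#
#     passage:       "[1] Ada Lovelace was born in 1815. [2] She described the Analytical
#                     Engine. [3] The engine itself was never completed."
#     sentence_dict: {1: "Ada Lovelace was born in 1815.", 2: "She described ...", 3: "The engine ..."}
#     returns:       (Answer(segmentation=[Segment(sentences=[1, 2]), Segment(sentences=[3])]),
#                     ["Ada Lovelace was born in 1815. She described the Analytical Engine. ",
#                      "The engine itself was never completed. "])
#
# i.e. each Segment lists the sentence numbers that form one thematically atomic chunk, and
# the second element holds the corresponding concatenated sentence text.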
5 | 6 | class Segment(openai.BaseModel): 7 | sentences: list[int] 8 | 9 | class Answer(openai.BaseModel): 10 | segmentation: list[Segment] 11 | 12 | def segment(gpt_model: openai.GPT, passage: str, sentence_dict: dict, max_tries: int): 13 | """Segment the passage such that each segment is thematically atomic 14 | 15 | Args: 16 | gpt_model: an OpenAI client 17 | passage (str): Passage/document 18 | sentence_dict (dict): Dictionary mapping between sentence numbers and corresponding sentences in the passage 19 | max_tries (int): maximum number of attempts the client can make in case of failure 20 | 21 | Returns: 22 | Answer: Segments wrapped in an openai.BaseModel class 23 | """ 24 | text = " ".join(passage.split("\n\n")) 25 | 26 | document_sentences = [k for k,v in sentence_dict.items()] 27 | document_sentences = [int(num) for num in document_sentences if type(num)!=int] 28 | 29 | 30 | for i in range(max_tries): 31 | try: 32 | segments = gpt_model.call_gpt_format(text, system_prompt, Answer) 33 | 34 | if not segments: 35 | continue 36 | 37 | # all sentences in document must be accounted for 38 | segmented_sentences = set() 39 | for segment in segments.segmentation: 40 | for sentence_num in segment.sentences: 41 | segmented_sentences.add(sentence_num) 42 | if len(set(document_sentences).difference(set(segmented_sentences)))!=0: 43 | # extraneous or missing sentences 44 | continue 45 | 46 | 47 | # all sentences in each segments must be in document 48 | successful_segmentation = True 49 | segmented_text = [] 50 | for segment in segments.segmentation: 51 | segment_text = "" 52 | for sentence_num in segment.sentences: 53 | try: 54 | sentence = sentence_dict[str(sentence_num)] 55 | segment_text += sentence + " " 56 | except: 57 | try: 58 | sentence = sentence_dict[sentence_num] 59 | segment_text += sentence + " " 60 | except: 61 | successful_segmentation = False 62 | segmented_text.append(segment_text) 63 | 64 | if successful_segmentation: 65 | return segments, segmented_text 66 | 67 | except Exception as e: 68 | print(e) 69 | continue 70 | 71 | return None, None 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SimilarityMetric(): 4 | def _calculate_document_similarity(self, segment_scores): 5 | max_sim_scores = np.max(segment_scores, axis=1) 6 | overall_similarity = np.average(max_sim_scores) 7 | return overall_similarity 8 | 9 | class FrequencyBasedSimilarity(SimilarityMetric): 10 | 11 | def _count_sentences(self, qud_answers, 12 | num_target_segments:int, 13 | target_segments): 14 | 15 | arr = np.zeros((len(qud_answers['excerpts']), num_target_segments)) 16 | 17 | for i, qud_ans in enumerate(qud_answers['excerpts']): 18 | answer_sentences = qud_ans['sentence_nums'] 19 | if len(answer_sentences)==0: 20 | continue 21 | for j, target_segment in enumerate(eval(target_segments)['segmentation']): 22 | target_segment_sentences = set(target_segment['sentences']) 23 | intersection = target_segment_sentences.intersection(answer_sentences) 24 | arr[i][j] = len(intersection) / len(answer_sentences) 25 | 26 | return arr 27 | 28 | def _get_segment_scores(self, sentence_count_map, 29 | source_seg_qud_dict, 30 | num_source_segments, 31 | num_target_segments): 32 | 33 | segment_scores = np.zeros((num_source_segments, num_target_segments)) 34 | 35 | for src, quds in 
source_seg_qud_dict.items(): 36 | segment_scores[int(src)] = np.average(sentence_count_map[quds], axis=0) 37 | 38 | return segment_scores 39 | 40 | def calculate_similarity(self, num_target_segments:int, 41 | num_source_segments:int, 42 | qud_answers:dict, 43 | source_seg_qud_dict: dict, 44 | target_segments: str): 45 | """ Calculates directional similarity between a source and target document 46 | (Figure 2 in https://arxiv.org/pdf/2504.09373) 47 | 48 | Args: 49 | num_target_segments (int): Number of target segments 50 | num_source_segments (int): Number of source segments 51 | qud_answers (dict): JSON representation of answers to source quds given target document 52 | source_seg_qud_dict (dict): Mapping between source segment indices and a list of corresponding QUD indices 53 | target_segments (str): JSON string representation of target segments (qudsim_qud_generation.segment.Answer) 54 | 55 | Returns: 56 | ndarray: array of dimensions (num_source_segments, num_target_segments) 57 | representing similarity scores between each pair of segments 58 | """ 59 | sentence_count_map = self._count_sentences(qud_answers, 60 | num_target_segments, 61 | target_segments) 62 | 63 | segment_scores = self._get_segment_scores(sentence_count_map, 64 | source_seg_qud_dict, 65 | num_source_segments, 66 | num_target_segments) 67 | 68 | return segment_scores -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/similarity.py: -------------------------------------------------------------------------------- 1 | from .metric import FrequencyBasedSimilarity 2 | import numpy as np 3 | 4 | def _get_frequency_similarities(num_target_segments, 5 | num_source_segments, 6 | qud_answers, 7 | source_segment_qud_dict, 8 | target_segments): 9 | similarity_metric = FrequencyBasedSimilarity() 10 | 11 | segment_scores = similarity_metric.calculate_similarity(num_target_segments, 12 | num_source_segments, 13 | qud_answers, 14 | source_segment_qud_dict, 15 | target_segments) 16 | return segment_scores 17 | 18 | def get_harmonic_similarity(num_target_segments, 19 | num_source_segments, 20 | source_qud_answers, 21 | source_segment_qud_dict, 22 | target_segments, 23 | target_qud_answers, 24 | target_segment_qud_dict, 25 | source_segments): 26 | """ Calculates harmonic mean between source->target and target->source similarities 27 | 28 | Args: 29 | num_target_segments (int): Number of target segments 30 | num_source_segments (int): Number of source segments 31 | source_qud_answers (dict): JSON representation of answers to source quds given target document 32 | source_seg_qud_dict (dict): Mapping between source segment indices and a list of corresponding QUD indices 33 | target_segments (str): JSON string representation of target segments (qudsim_qud_generation.segment.Answer) 34 | target_qud_answers (dict): JSON representation of answers to target quds given source document 35 | target_seg_qud_dict (dict): Mapping between target segment indices and a list of corresponding QUD indices 36 | source_segments (str): JSON string representation of source segments (qudsim_qud_generation.segment.Answer) 37 | 38 | Returns: 39 | ndarray: array of dimensions (num_source_segments, num_target_segments) 40 | representing harmonic mean of direction similarity scores between each pair of segments 41 | """ 42 | 43 | src_to_tgt_segment_scores = _get_frequency_similarities(num_target_segments, 44 | num_source_segments, 45 | source_qud_answers, 46 | source_segment_qud_dict, 47 | 
target_segments) 48 | tgt_to_src_segment_scores = _get_frequency_similarities(num_source_segments, 49 | num_target_segments, 50 | target_qud_answers, 51 | target_segment_qud_dict, 52 | source_segments) 53 | 54 | denom = src_to_tgt_segment_scores + np.transpose(tgt_to_src_segment_scores) 55 | denom = np.where(denom>0, denom, 1) 56 | 57 | harmonic_mean_scores = 2*(src_to_tgt_segment_scores*np.transpose(tgt_to_src_segment_scores))/denom 58 | 59 | return harmonic_mean_scores -------------------------------------------------------------------------------- /diversity/functions.py: -------------------------------------------------------------------------------- 1 | """ Functions for extracting patterns and matching text to patterns. """ 2 | 3 | import numpy as np 4 | import itertools 5 | from typing import List, Optional 6 | from tqdm import tqdm 7 | from .patterns import token_patterns, get_pos, pos_patterns 8 | from nltk.tokenize import sent_tokenize 9 | 10 | def extract_patterns(text: List[str], 11 | n: int = 5, 12 | top_n: int = 100 13 | ) -> dict: 14 | """ Extracts text and part-of-speech patterns from text input. 15 | Used to return a dictionary of patterns and the corresponding text match. 16 | Args: 17 | text (List[str]): List of strings to extract patterns from. 18 | n (int, optional): N-gram size. Defaults to 5. 19 | top_n (int, optional): Number of top patterns to extract. Defaults to 100. 20 | Returns: 21 | dict: Dictionary of patterns and their corresponding text. 22 | 23 | Example Usage: 24 | >>> text = ["The quick brown fox jumps over the lazy dog.", 25 | "The slow red fox walks on the hyper dog."] 26 | >>> extract_patterns(text, 4) 27 | 28 | {'DT JJ NN NN': {'The quick brown fox'}, 29 | 'JJ NN NN VBZ': {'quick brown fox jumps'}, 30 | 'NN NN VBZ IN': {'brown fox jumps over'}, 31 | 'NN VBZ IN DT': {'fox jumps over the'}, 32 | 'VBZ IN DT JJ': {'jumps over the lazy'}, 33 | 'IN DT JJ NN': {'over the lazy dog.'}, 34 | 'DT JJ JJ NN': {'The slow red fox'}, 35 | 'JJ JJ NN NNS': {'slow red fox walks'}, 36 | 'JJ NN NNS IN': {'red fox walks on'}, 37 | 'NN NNS IN DT': {'fox walks on the'}, 38 | 'NNS IN DT NN': {'walks on the hyper'}, 39 | 'IN DT NN NN': {'on the hyper dog.'}} 40 | """ 41 | 42 | # sentence tokenize then search for patterns in the entire list 43 | outputs = sent_tokenize(' '.join(text)) 44 | 45 | # get the token (word)-level patterns 46 | patterns_token = token_patterns(outputs, n) 47 | 48 | # get the part-of-speech patterns (only include top_n patterns) 49 | joined_pos, tuples = get_pos(outputs) 50 | ngrams_pos = token_patterns(joined_pos, n, top_n) 51 | 52 | # for the top n-gram patterns, cycle through and get the matching text 53 | text_matches = {} 54 | 55 | for pattern, _ in ngrams_pos: 56 | text_matches[pattern] = pos_patterns(tuples, pattern) 57 | 58 | return text_matches 59 | 60 | 61 | def match_patterns(text: str, 62 | patterns: dict 63 | ) -> List[tuple]: 64 | """ Matches text to part-of-speech patterns extracted from the `extract_patterns` function. 65 | Given set of patterns, used to identify which patterns appears in a single input text. 66 | Args: 67 | text (str): Text to match patterns to. 68 | patterns (dict): Dictionary of patterns and their corresponding text. 69 | Returns: 70 | List[tuple]: List of tuples with the pattern and the text that matched. 71 | 72 | Example Usage: 73 | >>> text = ["The quick brown fox jumps over the lazy dog.", 74 | "The slow red fox walks on the hyper dog.", 75 | "The cranky blue cat scratches the calm fish." 
] 76 | >>> patterns = extract_patterns(text, 4) 77 | >>> match_patterns(text[2], patterns) 78 | 79 | [('DT NN JJ NN', 'The cranky blue cat'), 80 | ('NN JJ NN VBZ', 'cranky blue cat scratches'), 81 | ('JJ NN VBZ DT', 'blue cat scratches the'), 82 | ('NN VBZ DT NN', 'cat scratches the calm'), 83 | ('VBZ DT NN NN', 'scratches the calm fish.')] 84 | """ 85 | 86 | matches = [] 87 | 88 | for pattern, text_match in patterns.items(): 89 | for substr in text_match: 90 | if substr in text: 91 | matches.append((pattern, substr)) 92 | 93 | return matches 94 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_qud_generation/pipeline.py: -------------------------------------------------------------------------------- 1 | from .decontextualize import decontextualize 2 | from .qud import generate_quds 3 | from .segment import segment 4 | 5 | def _get_qud_dict(quds): 6 | qud_segment_dict = {} 7 | segment_qud_dict = {} 8 | num_quds = 0 9 | 10 | for i, source_qud in enumerate(quds): 11 | qud_idx_list = [] 12 | for q in eval(source_qud)['quds']: 13 | qud_segment_dict[num_quds] = i 14 | qud_idx_list.append(num_quds) 15 | num_quds+=1 16 | segment_qud_dict[i] = qud_idx_list 17 | 18 | return segment_qud_dict, qud_segment_dict 19 | 20 | def get_quds(gpt_model, text: str, sentence_num_dict: dict, level: int, max_tries: int): 21 | """Performs segmentation, entity extraction and QUD generation on the text at the specified level 22 | 23 | Args: 24 | gpt_model: an OpenAI client 25 | text (str): passage/document 26 | sentence_dict (dict): Dictionary mapping between sentence numbers and corresponding sentences in the passage 27 | level (0/1): Abstraction level of QUDs (1=abstract; 0=specific) 28 | max_tries (int): maximum number of attempts the client can make in case of failure 29 | 30 | Returns: 31 | dict: results of the pipeline: 32 | segments: segmentation json 33 | segment_dict: Maps sentence number to its corresponding segment 34 | entity_abstracted_segments: segments post-entity-extraction 35 | quds: quds for the document in json 36 | segment_qud_dict: Maps segment index to a list of corresponding QUD indices 37 | qud_segment_dict: Maps QUD indices to their corresponding segment indices 38 | """ 39 | 40 | # SEGMENTATION 41 | segments, segmented_text = segment(gpt_model, text, sentence_num_dict, max_tries) 42 | if segments is None or segmented_text is None: 43 | print("Segmentation was unsuccessful") 44 | return None 45 | 46 | segments_json = segments.model_dump_json() 47 | 48 | # DICTIONARY: SENTENCE --> SEGMENT 49 | segment_dict = {} 50 | for i, s in enumerate(segments.segmentation): 51 | for sentence_num in s.sentences: 52 | segment_dict[sentence_num] = i 53 | 54 | if level==1: 55 | # ENTITY ABSTRACTION 56 | numbered_segment_text = "\n\n".join(["[" + str(i)+ "] " + seg for i, seg in enumerate(segmented_text)]) 57 | decontextualized_segments = decontextualize(gpt_model, 58 | numbered_segment_text, 59 | len(segmented_text), 60 | max_tries) 61 | if decontextualized_segments is None: 62 | print("Entity abstraction was unsuccessful.") 63 | return None 64 | 65 | decontextualized_segments_json = decontextualized_segments.model_dump_json() 66 | 67 | # QUD GENERATION - Level 1 68 | quds = [(generate_quds(gpt_model, seg.para, max_tries)).model_dump_json() 69 | for seg in decontextualized_segments.decontextualized_paragraphs] 70 | if quds is None: 71 | return None 72 | else: 73 | decontextualized_segments_json = None 74 | 75 | # QUD GENERATION - Level 0 76 | 
quds = [(generate_quds(gpt_model, seg, max_tries)).model_dump_json() for seg in segmented_text]
77 |         if quds is None:
78 |             return None
79 | 
80 |     segment_qud_dict, qud_segment_dict = _get_qud_dict(quds)
81 |     qg_output_item = {"segments": segments_json,
82 |                     "segment_dict": segment_dict,
83 |                     "entity_abstracted_segments": decontextualized_segments_json,
84 |                     "quds": quds,
85 |                     "segment_qud_dict": segment_qud_dict,
86 |                     "qud_segment_dict": qud_segment_dict}
87 | 
88 |     return qg_output_item
--------------------------------------------------------------------------------
/diversity/template.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Optional
3 | from .functions import extract_patterns
4 | from typing import Dict, Iterable, List, Optional
5 | 
6 | 
7 | def template_rate(
8 |     data: List[str],
9 |     templates: Optional[Dict[str, Iterable[str]]] = None,
10 |     shard_size: int = 500,
11 | 
12 | ) -> float:
13 |     """
14 |     Calculates the template rate (fraction of texts in a corpus that contain at least 1 template)
15 |     for a set of documents (corpus-level), following https://arxiv.org/abs/2407.00211.
16 | 
17 |     Args:
18 |         data (List[str]): A list of strings to score.
19 |         templates (dict, optional): Dictionary containing the templates extracted from the corpus. Defaults to None.
20 |         shard_size (int, optional): Size of regex shards to compile. Defaults to 500.
21 | 
22 |     Returns:
23 |         float: Template rate, a value between 0 and 1 indicating the fraction of texts that contain at least one template.
24 |     """
25 |     if not data: return 0.0
26 | 
27 |     if templates is None:
28 |         # get the templates if not passed in
29 |         templates = extract_patterns(data)
30 | 
31 |     matched_text = _gather_substrings(templates)
32 | 
33 |     if not matched_text: return 0.0
34 | 
35 |     regexes = _compile_regex_shards(matched_text, shard_size=shard_size)
36 |     match = sum(1 for doc in data if _has_any(doc, regexes))
37 | 
38 |     return match / len(data)
39 | 
40 | 
41 | def templates_per_token(
42 |     data: List[str],
43 |     templates: Optional[Dict[str, Iterable[str]]] = None,
44 |     shard_size: int = 500,
45 | ) -> List[float]:
46 |     """
47 |     Calculates the templates-per-token rate from https://arxiv.org/abs/2407.00211.
48 | 
49 |     Args:
50 |         data (List[str]): A list of strings to score.
51 |         templates (dict, optional): Dictionary containing the templates extracted from the corpus. Defaults to None.
52 |         shard_size (int, optional): Size of regex shards to compile. Defaults to 500.
53 | 
54 | 
55 |     Returns:
56 |         List[float]: List of templates-per-token rates for each document in the corpus.
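    Example (illustrative; each returned value is the number of template occurrences in a
    document divided by its whitespace token count):
        >>> docs = ["thank you for reaching out to us today",
        ...         "thank you for reaching out to our support team"]
        >>> patterns = extract_patterns(docs, n=5)
        >>> templates_per_token(docs, templates=patterns)  # doctest: +SKIP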
57 | """ 58 | if not data: 59 | return [] 60 | 61 | # Build templates if not provided 62 | if templates is None: 63 | templates = extract_patterns(data) 64 | 65 | substrings = _gather_substrings(templates) 66 | if not substrings: 67 | return [0.0] * len(data) 68 | 69 | # Use lookahead shards to count overlapping occurrences 70 | shards = _compile_regex_shards(substrings, shard_size, overlap=True) 71 | 72 | # Compute per-doc TPT 73 | tpt: List[float] = [] 74 | for doc in data: 75 | word_count = len(doc.split()) 76 | if word_count == 0: 77 | tpt.append(0.0) 78 | continue 79 | 80 | occ = 0 81 | for rx in shards: 82 | occ += sum(1 for _ in rx.finditer(doc)) # each match = one occurrence start 83 | tpt.append(occ / word_count) 84 | 85 | return tpt 86 | 87 | 88 | def _compile_regex_shards( 89 | substrings: List[str], 90 | shard_size: int = 500, 91 | *, 92 | overlap: bool = False, 93 | ) -> List[re.Pattern]: 94 | """ 95 | Reusable shard compiler. 96 | - overlap=False: plain alternation (fast existence tests). 97 | - overlap=True: lookahead alternation for counting overlapping matches. 98 | """ 99 | regs: List[re.Pattern] = [] 100 | 101 | for i in range(0, len(substrings), shard_size): 102 | chunk = substrings[i:i + shard_size] 103 | 104 | if not chunk: 105 | continue 106 | 107 | alt = "|".join(map(re.escape, chunk)) 108 | pat = f"(?=(?:{alt}))" if overlap else f"(?:{alt})" 109 | 110 | regs.append(re.compile(pat)) 111 | 112 | return regs 113 | 114 | 115 | def _has_any( 116 | text: str, 117 | regexes: List[re.Pattern] 118 | ) -> bool: 119 | for rx in regexes: 120 | # faster search 121 | if rx.search(text): 122 | return True 123 | return False 124 | 125 | 126 | def _gather_substrings( 127 | templates: Dict[str, Iterable[str]] 128 | ) -> List[str]: 129 | """ 130 | Gathers all substrings from the templates dictionary. 131 | 132 | Args: 133 | templates (Dict[str, Iterable[str]]): Dictionary of templates with their corresponding text matches. 134 | 135 | Returns: 136 | List[str]: List of unique substrings extracted from the templates. 137 | """ 138 | # get the flattened values from the extracted patterns 139 | matched_text = set() 140 | 141 | for v in templates.values(): 142 | matched_text.update(v) 143 | 144 | return list(matched_text) 145 | -------------------------------------------------------------------------------- /diversity/compute_all_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Comprehensive diversity metrics computation function. 3 | This module computes all lexical diversity metrics plus embedding metrics for a given corpus of text. 
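Example (illustrative):

    from diversity import compute_all_metrics

    corpus = ["first synthetic document ...", "second synthetic document ...", "third one ..."]
    results = compute_all_metrics(corpus, output_format="markdown", verbose=False)
    print(results["compression_ratio_gzip"], results["ngram_diversity"])
    print(results["formatted_table"])  # only present for output_format="markdown" or "latex"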
4 | """ 5 | 6 | from typing import List, Optional, Dict, Any 7 | import logging 8 | from .compression import compression_ratio 9 | from .homogenization import homogenization_score 10 | from .ngram_diversity import ngram_diversity_score 11 | from .self_repetition import self_repetition_score 12 | from .embedding import remote_clique, chamfer_dist 13 | from .template import template_rate, templates_per_token 14 | from .functions import extract_patterns 15 | 16 | 17 | def compute_all_metrics( 18 | corpus: List[str], 19 | output_format: str = "dict", 20 | embedding_model: Optional[str] = "Qwen/Qwen3-Embedding-0.6B", 21 | homogenization_measure: str = "rougel", 22 | compression_algorithm: str = "gzip", 23 | ngram_n: int = 4, 24 | self_repetition_n: int = 4, 25 | template_shard_size: int = 500, 26 | verbose: bool = True, 27 | batch_size: int = 64 28 | ) -> Dict[str, Any]: 29 | """ 30 | Computes all available diversity metrics for a corpus of text. 31 | 32 | Args: 33 | corpus (List[str]): List of text documents to analyze 34 | output_format (str): Format for output - "dict", "markdown", or "latex" 35 | embedding_model (str): Model to use for embedding-based metrics 36 | homogenization_measure (str): Measure for homogenization score ("rougel", "bertscore", "bleu") 37 | compression_algorithm (str): Algorithm for compression ratio ("gzip", "xz") 38 | ngram_n (int): Maximum n-gram size for n-gram diversity 39 | self_repetition_n (int): N-gram size for self-repetition score 40 | template_shard_size (int): Shard size for template processing 41 | verbose (bool): Whether to show progress messages 42 | batch_size (int): Batch size for embedding computations 43 | 44 | Returns: 45 | Dict[str, Any]: Dictionary containing all computed metrics and formatted table if requested 46 | """ 47 | 48 | if verbose: 49 | print("Computing diversity metrics for corpus...") 50 | print(f"Corpus size: {len(corpus)} documents") 51 | 52 | results = {} 53 | 54 | # Compression-based metrics 55 | if verbose: 56 | print("Computing compression ratio...") 57 | results["compression_ratio_gzip"] = compression_ratio( 58 | corpus, algorithm="gzip", verbose=False 59 | ) 60 | 61 | if compression_algorithm == "xz": 62 | results["compression_ratio_xz"] = compression_ratio( 63 | corpus, algorithm="xz", verbose=False 64 | ) 65 | 66 | # Homogenization score 67 | if verbose: 68 | print(f"Computing homogenization score using {homogenization_measure}...") 69 | results[f"homogenization_score_{homogenization_measure}"] = homogenization_score( 70 | corpus, measure=homogenization_measure, verbose=verbose, batch_size=batch_size 71 | ) 72 | 73 | # N-gram diversity 74 | if verbose: 75 | print(f"Computing n-gram diversity (n={ngram_n})...") 76 | results["ngram_diversity"] = ngram_diversity_score(corpus, num_n=ngram_n) 77 | 78 | # Self-repetition score 79 | if verbose: 80 | print(f"Computing self-repetition score (n={self_repetition_n})...") 81 | results["self_repetition_score"] = self_repetition_score( 82 | corpus, n=self_repetition_n, verbose=verbose 83 | ) 84 | 85 | # Embedding-based metrics 86 | if verbose: 87 | print(f"Computing embedding-based metrics using {embedding_model}...") 88 | 89 | try: 90 | results["remote_clique_score"] = remote_clique( 91 | corpus, model=embedding_model, verbose=verbose, batch_size=batch_size 92 | ) 93 | results["chamfer_distance"] = chamfer_dist( 94 | corpus, model=embedding_model, verbose=verbose, batch_size=batch_size 95 | ) 96 | except Exception as e: 97 | if verbose: 98 | print(f"⚠️ Warning: Could not compute 
embedding metrics - {e}") 99 | results["remote_clique_score"] = None 100 | results["chamfer_distance"] = None 101 | 102 | # Template-based metrics 103 | if verbose: 104 | print("Extracting patterns for template metrics...") 105 | 106 | try: 107 | patterns = extract_patterns(corpus) 108 | 109 | if verbose: 110 | print("Computing template rate...") 111 | results["template_rate"] = template_rate( 112 | corpus, templates=patterns, shard_size=template_shard_size 113 | ) 114 | 115 | if verbose: 116 | print("Computing templates per token...") 117 | tpt_scores = templates_per_token( 118 | corpus, templates=patterns, shard_size=template_shard_size 119 | ) 120 | results["avg_templates_per_token"] = sum(tpt_scores) / len(tpt_scores) if tpt_scores else 0.0 121 | results["templates_per_token_scores"] = tpt_scores 122 | 123 | except Exception as e: 124 | if verbose: 125 | print(f"Warning: Could not compute template metrics - {e}") 126 | results["template_rate"] = None 127 | results["avg_templates_per_token"] = None 128 | results["templates_per_token_scores"] = None 129 | 130 | if verbose: 131 | print("All metrics computed successfully!") 132 | 133 | # Format output based on requested format 134 | if output_format.lower() == "markdown": 135 | results["formatted_table"] = _format_markdown_table(results) 136 | elif output_format.lower() == "latex": 137 | results["formatted_table"] = _format_latex_table(results) 138 | 139 | return results 140 | 141 | 142 | def _format_markdown_table(results: Dict[str, Any]) -> str: 143 | """Format results as a markdown table.""" 144 | 145 | table = "# Diversity Metrics Results\n\n" 146 | table += "| Metric | Value |\n" 147 | table += "|--------|-------|\n" 148 | 149 | for metric, value in results.items(): 150 | if metric in ["formatted_table", "templates_per_token_scores"]: 151 | continue 152 | 153 | if value is None: 154 | value_str = "N/A" 155 | elif isinstance(value, float): 156 | value_str = f"{value:.3f}" 157 | else: 158 | value_str = str(value) 159 | 160 | table += f"| {metric.replace('_', ' ').title()} | {value_str} |\n" 161 | 162 | return table 163 | 164 | 165 | def _format_latex_table(results: Dict[str, Any]) -> str: 166 | """Format results as a LaTeX table. 
No extra LaTeX packages (e.g., booktabs) are required."""
167 | 
168 |     table = "\\begin{table}[htbp]\n"
169 |     table += "\\centering\n"
170 |     table += "\\caption{Diversity Metrics Results}\n"
171 |     table += "\\begin{tabular}{lc}\n"
172 |     table += "\\hline\n"
173 |     table += "\\textbf{Metric} & \\textbf{Value} \\\\\n"
174 |     table += "\\hline\n"
175 | 
176 |     for metric, value in results.items():
177 |         if metric in ["formatted_table", "templates_per_token_scores"]:
178 |             continue
179 | 
180 |         if value is None:
181 |             value_str = "N/A"
182 |         elif isinstance(value, float):
183 |             value_str = f"{value:.3f}"
184 |         else:
185 |             value_str = str(value)
186 | 
187 |         metric_name = metric.replace('_', ' ').title()
188 |         table += f"{metric_name} & {value_str} \\\\\n"
189 |     table += "\\hline\n"
190 | 
191 |     table += "\\end{tabular}\n"
192 |     table += "\\end{table}\n"
193 | 
194 |     return table
195 | 
--------------------------------------------------------------------------------
/diversity/qudsim.py:
--------------------------------------------------------------------------------
1 | from .qudsim_modules import number_text, get_quds, align
2 | from .utils import openai
3 | from tqdm import tqdm
4 | import itertools
5 | import json
6 | import yaml
7 | import os
8 | 
9 | class Document:
10 |     def __init__(self, document):
11 |         self.document = document
12 | 
13 |     def preprocess_document(self):
14 | 
15 |         numbered_text, number_sentence_dict = number_text(self.document)
16 | 
17 |         if not numbered_text or not number_sentence_dict:
18 |             print("Could not preprocess document: ", self.document)
19 |             self.numbered_text = None
20 |             self.number_sentence_dict = None
21 |             return False
22 | 
23 |         self.numbered_text = numbered_text
24 |         self.number_sentence_dict = number_sentence_dict
25 |         return True
26 | 
27 |     def generate_quds(self, gpt_model, config):
28 |         try:
29 |             level = config['level']
30 |             max_tries = config['max_tries']
31 |         except (KeyError, TypeError):
32 |             print("Failed to parse configurations")
33 |             return False
34 | 
35 |         qg_item = get_quds(gpt_model, self.numbered_text, self.number_sentence_dict, level, max_tries)
36 | 
37 |         if qg_item is None:
38 |             print("Could not segment, abstract or generate quds")
39 |             self.segments = None
40 |             self.entity_abstracted_segments = None
41 |             self.quds = None
42 |             self.segment_qud_dict = None
43 |             self.qud_segment_dict = None
44 |             return False
45 |         else:
46 |             self.segments = qg_item['segments']
47 |             self.entity_abstracted_segments = qg_item['entity_abstracted_segments']
48 |             self.quds = qg_item['quds']
49 |             self.segment_qud_dict = qg_item['segment_qud_dict']
50 |             self.qud_segment_dict = qg_item['qud_segment_dict']
51 |             return True
52 | 
53 | class AlignmentPair:
54 |     def __init__(self, document1: Document, document2: Document):
55 |         self.source_document = document1
56 |         self.target_document = document2
57 | 
58 |     def align_documents(self, gpt_model, config):
59 |         try:
60 |             threshold = config['threshold']
61 |             max_tries = config['max_tries']
62 |         except (KeyError, TypeError):
63 |             print("Failed to parse configurations")
64 |             return
65 | 
66 |         num_source_segments = len(self.source_document.segment_qud_dict)
67 |         num_target_segments = len(self.target_document.segment_qud_dict)
68 |         num_source_sentences = len(self.source_document.number_sentence_dict)
69 |         num_target_sentences = len(self.target_document.number_sentence_dict)
70 |         source_segments = self.source_document.segments
71 |         target_segments = self.target_document.segments
72 | 
73 |         source_qud_answers, target_qud_answers, harmonic_mean_scores, aligned_segments = align(gpt_model, 
74 | 
self.source_document.quds, 75 | self.target_document.numbered_text, 76 | self.target_document.quds, 77 | self.source_document.numbered_text, 78 | num_source_segments, 79 | num_target_segments, 80 | self.source_document.segment_qud_dict, 81 | self.target_document.segment_qud_dict, 82 | self.source_document.segments, 83 | self.target_document.segments, 84 | num_source_sentences, 85 | num_target_sentences, 86 | threshold, 87 | max_tries) 88 | 89 | self.source_qud_answers = source_qud_answers 90 | self.target_qud_answers = target_qud_answers 91 | self.harmonic_mean_scores = harmonic_mean_scores.tolist() 92 | self.aligned_segments = aligned_segments.tolist() 93 | 94 | aligned_segment_text = [] 95 | for i, src in enumerate(aligned_segments): 96 | for j, tgt in enumerate(src): 97 | if tgt>0: 98 | # alignment exists 99 | source_sentences = eval(source_segments)['segmentation'][i]['sentences'] 100 | try: 101 | source_text = [self.source_document.number_sentence_dict[str(num)] for num in source_sentences] 102 | except: 103 | try: 104 | source_text = [self.source_document.number_sentence_dict[num] for num in source_sentences] 105 | except Exception as e: 106 | print(e) 107 | 108 | target_sentences = eval(target_segments)['segmentation'][j]['sentences'] 109 | try: 110 | target_text = [self.target_document.number_sentence_dict[str(num)] for num in target_sentences] 111 | except: 112 | try: 113 | target_text = [self.target_document.number_sentence_dict[num] for num in target_sentences] 114 | except Exception as e: 115 | print(e) 116 | 117 | aligned_segment_text.append((" ".join(source_text), " ".join(target_text))) 118 | 119 | self.aligned_segment_text = aligned_segment_text 120 | 121 | 122 | def _compile_documents(documents: list[str], qg_gpt_model: openai.GPT, config): 123 | document_list = [] 124 | for document in tqdm(documents, total=len(documents), desc="Generating QUDs"): 125 | document_obj = Document(document=document) 126 | 127 | preprocessing_status = document_obj.preprocess_document() 128 | if not preprocessing_status: 129 | continue 130 | 131 | qud_generation_status = document_obj.generate_quds(gpt_model=qg_gpt_model, config=config) 132 | if not qud_generation_status: 133 | continue 134 | 135 | document_list.append(document_obj) 136 | 137 | return document_list 138 | 139 | def _custom_serializer(obj): 140 | if hasattr(obj, '__dict__'): 141 | return obj.__dict__ 142 | raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") 143 | 144 | 145 | def qudsim(documents: list[str], key=None, config_file=None): 146 | """ 147 | Args: 148 | documents (list[str]): a list of texts to be aligned (all combinations of pairs will be computed) 149 | key (str): OpenAI Key 150 | config_file (str): a .yaml or .yml file that contains the necessary configurations (see config.yaml for the default config) 151 | 152 | Returns: 153 | 154 | 155 | """ 156 | 157 | if not config_file: 158 | config_file = os.path.join(os.path.dirname(__file__), '../config.yaml') 159 | 160 | with open(config_file, 'r') as file: 161 | configs = yaml.safe_load(file) 162 | 163 | try: 164 | qg_gpt_model_name = configs['qg_gpt_model'] 165 | qa_gpt_model_name = configs['qa_gpt_model'] 166 | level = configs['level'] 167 | threshold = configs['threshold'] 168 | max_tries = configs['max_tries'] 169 | except: 170 | print("Failed to parse configurations") 171 | return 172 | 173 | 174 | 175 | qg_gpt_model = openai.GPT(qg_gpt_model_name, key=key) 176 | qa_gpt_model = openai.GPT(qa_gpt_model_name, key=key) 177 | 178 | # level 
of abstraction of QUDs, with 0 being highly specific and 1 being abstractive 179 | 180 | if level!=0 and level!=1: 181 | print("Levels 0 and 1 are supported, 0 being specific and 1 being abstract. Value passed was an unsupported level.") 182 | return 183 | 184 | if threshold < 0 or threshold > 1: 185 | print("Threshold value is outside the valid range (0,1).") 186 | return 187 | 188 | if max_tries<=0: 189 | print("Maximum number of attempts to successfully align pairs must be at least 1.") 190 | return 191 | 192 | if len(documents)<2: 193 | print("At least two documents must be provided.") 194 | return 195 | 196 | # create document objects (one Document per document) 197 | document_list = _compile_documents(documents=documents, qg_gpt_model=qg_gpt_model, config=configs) 198 | 199 | if len(document_list)<2: 200 | print("At least two documents must successfully generate QUDs.") 201 | return 202 | 203 | # make pairs from the list of Document objects 204 | pair_combinations = list(itertools.combinations(document_list, 2)) 205 | alignment_pairs = [] 206 | for doc1, doc2 in tqdm(pair_combinations, total=len(pair_combinations), desc='Aligning Document Pairs'): 207 | alignment_pair = AlignmentPair(document1=doc1, document2=doc2) 208 | alignment_pair.align_documents(gpt_model=qa_gpt_model, config=configs) 209 | alignment_pairs.append(alignment_pair) 210 | 211 | 212 | json_str = json.dumps(alignment_pairs, default=_custom_serializer) 213 | return json_str 214 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # diversity
2 | [![PyPI version](https://img.shields.io/pypi/v/diversity.svg)](https://pypi.org/project/diversity/)
3 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
4 | [![ArXiv](https://img.shields.io/badge/arXiv-2403.00553-b31b1b.svg)](https://arxiv.org/abs/2403.00553)
5 | 
6 | ### **A Python toolkit for measuring diversity in text.**
7 | 
8 | ---
9 | 
10 | ## Table of Contents
11 | - [Installation](#installation)
12 | - [Quick Start](#quick-start)
13 | - [Lexical Diversity Measures](#lexical-diversity-measures)
14 |     - [`compression_ratio`](#compression_ratiotexts-algorithmgzip)
15 |     - [`homogenization_score`](#homogenization_scoretexts-measurerougel)
16 |     - [`ngram_diversity_score`](#ngram_diversity_scoretexts-num_n4)
17 |     - [`self_repetition_score`](#self_repetition_scoretexts-n4)
18 | - [Syntactic Diversity Measures](#syntactic-diversity-measures)
19 |     - [`extract_patterns`](#extract_patternstext-n5-top_n100)
20 |     - [`match_patterns`](#match_patternstext-patterns)
21 |     - [`template_rate`](#template_ratedata-templatesnone-shard_size500)
22 |     - [`templates_per_token`](#templates_per_tokendata-templatesnone-shard_size500)
23 | - [Embedding-Based Diversity Measures](#embedding-based-diversity-measures)
24 |     - [`remote_clique`](#remote_cliquedata-modelqwenqwen3-embedding-06b-verbosetrue-batch_size64)
25 |     - [`chamfer_dist`](#chamfer_distdata-modelqwenqwen3-embedding-06b-verbosetrue-batch_size64)
26 | - [QUDSim (Question Under Discussion Similarity)](#qudsim-question-under-discussion-similarity)
27 |     - [`qudsim`](#qudsimdocuments-keynone-config_filenone)
28 | - [Citations](#citations)
29 | - [Requirements](#requirements)
30 | - [License](#license)
31 | - [Contributing](#contributing)
32 | 
33 | ---
34 | 
35 | ## Installation
36 | 
37 | Install via pip:
38 | 
39 | ```bash
40 | pip install diversity
41 | ```
42 | 
43 | Or from source:
44 | 
45 | ```bash
46 | git clone https://github.com/cshaib/diversity.git
47 | cd diversity
48 | pip install .
49 | ```
50 | 
51 | ----------
52 | 
53 | ## Quick Start
54 | 
55 | The function `compute_all_metrics` returns a dictionary containing all of the diversity metrics described in the sections below, and can optionally include a Markdown- or LaTeX-formatted results table (`output_format="markdown"` or `"latex"`).
56 | 
57 | ```python
58 | from diversity import compute_all_metrics
59 | import json
60 | 
61 | texts = [
62 |     "The quick brown fox jumps over the lazy dog.",
63 |     "The quick brown fox jumps over the lazy dog again.",
64 |     "Suddenly, the quick brown fox leaps swiftly over the sleeping dog."
65 | ]
66 | 
67 | # Compute metrics
68 | results = compute_all_metrics(corpus=texts)
69 | 
70 | # Remove the list of per-document scores for cleaner dict output
71 | clean_results = {k: v for k, v in results.items()
72 |                  if k != "templates_per_token_scores"}
73 | output_content = json.dumps(clean_results, indent=2)
74 | 
75 | with open('diversity_metrics.json', 'w', encoding='utf-8') as f:
76 |     f.write(output_content)
77 | ```
78 | 
79 | ### Lexical Diversity Measures
80 | 
81 | We provide implementations for Compression Ratio, Homogenization Score, n-gram Diversity Score, and Self-Repetition Score:
82 | 
83 | ```python
84 | from diversity import (
85 |     compression_ratio,
86 |     homogenization_score,
87 |     ngram_diversity_score, self_repetition_score,
88 | )
89 | 
90 | texts = [
91 |     "The quick brown fox jumps over the lazy dog.",
92 |     "The quick brown fox jumps over the lazy dog again.",
93 |     "Suddenly, the quick brown fox leaps swiftly over the sleeping dog."
94 | ]
95 | 
96 | # Compression ratio
97 | cr = compression_ratio(texts, algorithm='gzip')
98 | print(f"Compression Ratio: {cr:.4f}")
99 | 
100 | # Homogenization score (Self-BLEU)
101 | hs = homogenization_score(texts, measure='bleu')
102 | print(f"Homogenization (Self-BLEU): {hs:.4f}")
103 | 
104 | # N-gram diversity
105 | ngd = ngram_diversity_score(texts, num_n=3)
106 | print(f"3-gram Diversity: {ngd:.4f}")
107 | 
108 | # Self-repetition score
109 | srs = self_repetition_score(texts)
110 | print(f"Self-repetition score: {srs:.4f}")
111 | ```
112 | #### `compression_ratio(texts, algorithm='gzip')`
113 | 
114 | - **Parameters:**
115 |     - `texts` (list): List of text strings
116 |     - `algorithm` (str): Compression algorithm ('gzip' or 'xz')
117 | - **Returns:** Float, higher = more repetitive
118 | 
119 | #### `homogenization_score(texts, measure='rougel')`
120 | 
121 | - **Parameters:**
122 |     - `texts` (list): List of text strings
123 |     - `measure` (str): Scoring measure ('rougel', 'bertscore', 'bleu')
124 | - **Returns:** Float, higher = more homogeneous
125 | 
126 | #### `ngram_diversity_score(texts, num_n=4)`
127 | 
128 | - **Parameters:**
129 |     - `texts` (list): List of text strings
130 |     - `num_n` (int): Maximum n-gram size
131 | - **Returns:** Float, higher = more diverse
132 | 
133 | #### `self_repetition_score(texts, n=4)`
134 | 
135 | - **Parameters:**
136 |     - `texts` (list): List of text strings
137 | - **Returns:** Float, higher = more repetition across documents
138 | ----------
139 | 
140 | ### Syntactic Diversity Measures
141 | 
142 | We also provide functions for extracting and analyzing Part-of-Speech (POS) patterns to identify repetitive syntactic structures in your text:
143 | 
144 | ```python
145 | from diversity import (
146 |     extract_patterns,
147 |     match_patterns,
148 |     template_rate,
149 |     templates_per_token
150 | )
151 | 
152 | texts = [
153 |     "The quick brown fox jumps over the lazy dog.",
154 |     "The quick brown fox jumps over the lazy dog again.",
155 |     "Suddenly, the quick brown fox leaps swiftly over the sleeping dog."
156 | ]
157 | 
158 | # POS pattern extraction
159 | patterns = extract_patterns(texts, n=4, top_n=5)
160 | print("Top POS patterns:", patterns)
161 | # Example output: {'DT JJ JJ NN': {'the quick brown fox', ...}, ...}
162 | 
163 | # Match patterns in a single text
164 | matches = match_patterns(texts[2], patterns)
165 | print("Patterns in 3rd sentence:", matches)
166 | # Example output: [('DT JJ JJ NN', 'the quick brown fox'), ...]
167 | 
168 | # Template rate (fraction of documents containing at least one template)
169 | tr = template_rate(texts, templates=patterns)
170 | print("Template Rate:", tr)
171 | 
172 | # Templates per token (matches normalized by text length, one score per document)
173 | tpt = templates_per_token(texts, templates=patterns)
174 | print("Templates per Token:", tpt)
175 | ```
176 | 
177 | #### `extract_patterns(text, n=5, top_n=100)`
178 | 
179 | - **text (list of str):** Documents to extract syntactic patterns from.
180 | 
181 | - **n (int):** N-gram size for POS pattern extraction (default: `5`).
182 | 
183 | - **top_n (int):** Number of most frequent patterns to keep (default: `100`).
184 | 
185 | - **Returns:** `dict` — dictionary mapping POS patterns (e.g., `"DT JJ NN NN"`) to sets of text spans that match the patterns
186 | 
187 | 
188 | #### `match_patterns(text, patterns)`
189 | 
190 | - **text (str):** Input text to search for patterns.
191 | 
192 | - **patterns (dict):** Dictionary of patterns and their text matches as returned by `extract_patterns`.
193 | 
194 | - **Returns:** `list[tuple]` — list of `(pattern, text)` pairs showing which syntactic patterns appear in the input and the exact spans that match
195 | 
196 | 
197 | #### `template_rate(data, templates=None, shard_size=500)`
198 | 
199 | - **data (list of str):** Documents to score.
200 | 
201 | - **templates (dict, optional):** Dictionary of templates extracted from the corpus. If `None`, templates are computed using `extract_patterns`.
202 | 
203 | - **shard_size (int):** Number of regex patterns to compile per shard (default: `500`).
204 | 
205 | - **Returns:** `float` — fraction of documents in the corpus that contain at least one template (higher = more templated, lower = more original).
206 | 
207 | 
208 | #### `templates_per_token(data, templates=None, shard_size=500)`
209 | 
210 | - **data (list of str):** Documents to score.
211 | 
212 | - **templates (dict, optional):** Dictionary of templates extracted from the corpus. If `None`, templates are computed using `extract_patterns`.
213 | 
214 | - **shard_size (int):** Number of regex patterns to compile per shard (default: `500`).
215 | 
216 | - **Returns:** `list[float]` — per-document ratio of template matches to tokens (higher = more templated per word, lower = more diverse writing).
217 | 
218 | ----------
219 | 
220 | ### Embedding-Based Diversity Measures
221 | 
222 | You can also measure semantic diversity using *embedding*-based similarity. These scores compute distances between document embeddings to quantify how spread out or clustered the texts are:
223 | 
224 | ```python
225 | from diversity.embedding import remote_clique, chamfer_dist
226 | 
227 | texts = [
228 |     "The quick brown fox jumps over the lazy dog.",
229 |     "A swift auburn fox vaulted a sleeping canine.",
230 |     "I brewed coffee and read the paper."
231 | ] 232 | 233 | # Remote Clique Score 234 | rc = remote_clique(texts, model="Qwen/Qwen3-Embedding-0.6B") 235 | print(f"Remote Clique: {rc:.3f}") 236 | 237 | # Chamfer Distance 238 | cd = chamfer_dist(texts, model="Qwen/Qwen3-Embedding-0.6B") 239 | print(f"Chamfer Distance: {cd:.3f}") 240 | ``` 241 | #### `remote_clique(data, model='Qwen/Qwen3-Embedding-0.6B', verbose=True, batch_size=64)` 242 | 243 | - **data (list of str):** Documents to score. 244 | 245 | - **model (str):** HuggingFace/Sentence-Transformers embedding model to use (default: `"Qwen/Qwen3-Embedding-0.6B"`). 246 | 247 | - **verbose (bool):** Whether to show a progress bar during encoding (default: `True`). 248 | 249 | - **batch_size (int):** Batch size for embedding (default: `64`). 250 | 251 | - **Returns:** `float` — average mean pairwise cosine distance between documents (higher = more spread out / diverse). 252 | 253 | 254 | #### `chamfer_dist(data, model='Qwen/Qwen3-Embedding-0.6B', verbose=True, batch_size=64)` 255 | 256 | - **data (list of str):** Documents to score. 257 | 258 | - **model (str):** HuggingFace/Sentence-Transformers embedding model to use (default: `"Qwen/Qwen3-Embedding-0.6B"`). 259 | 260 | - **verbose (bool):** Whether to show a progress bar during encoding (default: `True`). 261 | 262 | - **batch_size (int):** Batch size for embedding (default: `64`). 263 | 264 | - **Returns:** `float` — average minimum pairwise cosine distance (sensitive to near-duplicates; higher = less redundancy). 265 | 266 | ---------- 267 | 268 | ### QUDSim (Question Under Discussion Similarity) 269 | 270 | QUDSim aligns document segments based on Questions Under Discussion (QUDs) --- implicit questions that segments of text address ([QUDsim: Quantifying Discourse Similarities in LLM-Generated Text](https://arxiv.org/abs/2504.09373)). 271 | 272 | This function requires OpenAI API access. 273 | 274 | ```python 275 | from diversity import qudsim 276 | 277 | # Two documents about the same topic 278 | document1 = "In the heart of ancient Macedonia, Philip II ascended to the throne in 359 BC..." 279 | document2 = "The sun beat down on the rough-hewn hills of ancient Macedonia..." 
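# Optional (illustrative): qudsim reads its settings (model names, QUD level,
# alignment threshold, max_tries) from a YAML config; pass config_file to
# override the packaged config.yaml. The filename below is hypothetical.
# alignment = qudsim([document1, document2], key=key, config_file="my_config.yaml")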
280 | 
281 | # Requires OpenAI API key
282 | import os, json
283 | key = os.environ.get('OPENAI_API_KEY')  # or your API key
284 | 
285 | # Generate QUD-based alignment
286 | alignment = qudsim([document1, document2], key=key)
287 | 
288 | # Access alignment results (qudsim returns a JSON string)
289 | results = json.loads(alignment)[0]  # First document pair
290 | 
291 | # View aligned segments
292 | for source_text, target_text in results['aligned_segment_text']:
293 |     print(f"Source: {source_text[:100]}...")
294 |     print(f"Target: {target_text[:100]}...")
295 |     print("---")
296 | 
297 | # View alignment scores (harmonic mean scores matrix)
298 | scores = results['harmonic_mean_scores']
299 | print(f"Alignment scores shape: {len(scores)}x{len(scores[0])}")
300 | 
301 | # Other available fields:
302 | # - results['source_qud_answers']: answers to the source document's QUDs
303 | # - results['target_qud_answers']: answers to the target document's QUDs
304 | # - results['aligned_segments']: segment-pair alignment matrix (nonzero entries mark aligned pairs)
305 | ```
306 | 
307 | #### `qudsim(documents, key=None, config_file=None)`
308 | 
309 | - **Parameters:**
310 |     - `documents` (list): List of texts to align (all pairwise combinations are aligned)
311 |     - `key` (str): OpenAI API key used for QUD generation and answering
312 |     - `config_file` (str, optional): Path to a `.yaml`/`.yml` configuration file; if omitted, the packaged `config.yaml` defaults are used
313 |         - Config keys: the QUD-generation and QUD-answering model names, the QUD abstraction `level` (0 or 1), the alignment `threshold` (between 0 and 1), and `max_tries`
314 | - **Returns:** A JSON string with one record per document pair: the QUD answers, the harmonic-mean score matrix, the segment alignment matrix, and the aligned segment text
315 | ----------
316 | 
317 | ## Citation(s)
318 | 
319 | If you use this package, please cite:
320 | 
321 | ```bibtex
322 | @misc{shaib2025standardizingmeasurementtextdiversity,
323 |       title={Standardizing the Measurement of Text Diversity: A Tool and a Comparative Analysis of Scores},
324 |       author={Chantal Shaib and Joe Barrow and Jiuding Sun and Alexa F. Siu and Byron C. Wallace and Ani Nenkova},
325 |       year={2025},
326 |       eprint={2403.00553},
327 |       archivePrefix={arXiv},
328 |       primaryClass={cs.CL},
329 |       url={https://arxiv.org/abs/2403.00553},
330 | }
331 | ```
332 | 
333 | If you use QUDSim, please **also** cite:
334 | 
335 | ```bibtex
336 | @inproceedings{
337 | namuduri2025qudsim,
338 | title={{QUD}sim: Quantifying Discourse Similarities in {LLM}-Generated Text},
339 | author={Ramya Namuduri and Yating Wu and Anshun Asher Zheng and Manya Wadhwa and Greg Durrett and Junyi Jessy Li},
340 | booktitle={Second Conference on Language Modeling},
341 | year={2025},
342 | url={https://openreview.net/forum?id=zFz1BJu211}
343 | }
344 | ```
345 | 
346 | ----------
347 | 
348 | ## Requirements
349 | 
350 | - Python 3.10-3.12
351 | - Core dependencies:
352 |   - `numpy`
353 |   - `nltk`
354 |   - `scikit-learn`
355 | - For embedding-based metrics:
356 |   - `sentence-transformers`
357 |   - `torch`
358 | - For QUDSim:
359 |   - `openai`
360 |   - `tqdm`
361 | 
362 | ----------
363 | 
364 | ## License
365 | 
366 | This package is released under the **Apache License 2.0**.
367 | 
368 | ----------
369 | 
370 | ## Contributing
371 | 
372 | Contributions are welcome!
373 | Please open an issue or submit a pull request on GitHub.
374 | 
375 | ----------
376 | 
--------------------------------------------------------------------------------