├── tests
│   └── __init__.py
├── diversity
│   ├── utils
│   │   ├── __init__.py
│   │   ├── memoize.py
│   │   └── openai.py
│   ├── patterns
│   │   ├── __init__.py
│   │   ├── Token.py
│   │   └── part_of_speech.py
│   ├── qudsim_modules
│   │   ├── __init__.py
│   │   ├── qudsim_preprocessing
│   │   │   └── number.py
│   │   ├── qudsim_alignment
│   │   │   ├── answer.py
│   │   │   ├── align.py
│   │   │   ├── metric.py
│   │   │   └── similarity.py
│   │   └── qudsim_qud_generation
│   │       ├── decontextualize.py
│   │       ├── qud.py
│   │       ├── segment.py
│   │       └── pipeline.py
│   ├── __init__.py
│   ├── ngram_diversity.py
│   ├── self_repetition.py
│   ├── compression.py
│   ├── embedding.py
│   ├── homogenization.py
│   ├── functions.py
│   ├── template.py
│   ├── compute_all_metrics.py
│   └── qudsim.py
├── .gitignore
├── config.yaml
├── .pre-commit-config.yaml
├── pyproject.toml
├── test-diversity
│   └── hom_tests.py
├── .github
│   └── workflows
│       └── publish.yaml
├── examples
│   └── summarization.py
├── scripts
│   └── timing.py
├── LICENSE.md
└── README.md

/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/diversity/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .memoize import memoized
2 | from .openai import GPT
--------------------------------------------------------------------------------
/diversity/patterns/__init__.py:
--------------------------------------------------------------------------------
1 | from .Token import token_patterns
2 | from .part_of_speech import get_pos, pos_patterns
3 | 
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | dist/
2 | .DS_Store
3 | diversity/__pycache__/
4 | diversity/patterns/__pycache__/
5 | test.ipynb
6 | __pycache__
7 | .nova/
--------------------------------------------------------------------------------
/config.yaml:
--------------------------------------------------------------------------------
1 | qg_gpt_model: "gpt-4o-2024-08-06"
2 | qa_gpt_model: "gpt-4o-2024-08-06"
3 | level: 1
4 | temperature: 1
5 | threshold: 0.20
6 | max_tries: 3
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
1 | # See https://pre-commit.com for more information
2 | # See https://pre-commit.com/hooks.html for more hooks
3 | repos:
4 |   - repo: https://github.com/pre-commit/pre-commit-hooks
5 |     rev: v4.0.1
6 |     hooks:
7 |       - id: check-toml
8 |       - id: check-yaml
9 |       - id: end-of-file-fixer
10 |       - id: mixed-line-ending
--------------------------------------------------------------------------------
/diversity/qudsim_modules/__init__.py:
--------------------------------------------------------------------------------
1 | from .qudsim_preprocessing.number import number_text
2 | from .qudsim_qud_generation.segment import segment
3 | from .qudsim_qud_generation.decontextualize import decontextualize
4 | from .qudsim_qud_generation.qud import generate_quds
5 | from .qudsim_qud_generation.pipeline import get_quds
6 | from .qudsim_alignment.metric import FrequencyBasedSimilarity
7 | from .qudsim_alignment.similarity import get_harmonic_similarity
8 | from .qudsim_alignment.align import align
9 | 
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
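# Illustrative quick start for this package (a sketch; assumes the package is installed,
# e.g. `pip install diversity` or `poetry install` from this repository):
#
#   >>> from diversity import compression_ratio, ngram_diversity_score
#   >>> docs = ["the cat sat on the mat", "the cat sat on the rug"]
#   >>> compression_ratio(docs, 'gzip')       # original size / gzip-compressed size
#   >>> ngram_diversity_score(docs, num_n=2)  # unique/total n-grams, summed over n = 1..2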
1 | [tool.poetry]
2 | name = "diversity"
3 | version = "0.3.0"
4 | description = ""
5 | authors = ["Chantal Shaib "]
6 | readme = "README.md"
7 | 
8 | [tool.poetry.dependencies]
9 | python = ">=3.10,<3.13"
10 | typer = ">=0.9.0"
11 | nltk = "^3.8.1"
12 | rouge-score = "^0.1.2"
13 | evaluate = "^0.4.1"
14 | sentence-transformers = "^5.1.0"
15 | transformers = "^4.41.0"
16 | 
17 | [tool.poetry.group.dev.dependencies]
18 | pre-commit = "^3.4.0"
19 | pandas = "^2.1.2"
20 | numpy = "^1.26.1"
21 | 
22 | [build-system]
23 | requires = ["poetry-core"]
24 | build-backend = "poetry.core.masonry.api"
25 | 
--------------------------------------------------------------------------------
/diversity/__init__.py:
--------------------------------------------------------------------------------
1 | from .compression import compression_ratio
2 | from .patterns.Token import token_patterns
3 | from .patterns.part_of_speech import pos_patterns, get_pos
4 | from .utils.memoize import memoized
5 | from .homogenization import homogenization_score
6 | from .ngram_diversity import ngram_diversity_score
7 | from .functions import extract_patterns, match_patterns
8 | from .self_repetition import self_repetition_score
9 | from .template import template_rate, templates_per_token
10 | from .qudsim import qudsim
11 | from .embedding import remote_clique, chamfer_dist
12 | from .compute_all_metrics import compute_all_metrics
--------------------------------------------------------------------------------
/diversity/ngram_diversity.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | import nltk
3 | 
4 | def ngram_diversity_score(
5 |     data: List[str],
6 |     num_n: int = 4,
7 | ) -> float:
8 |     """ Calculates corpus-level n-gram diversity based on unique n-grams
9 |     (e.g., https://arxiv.org/pdf/2202.00666.pdf).
10 | 
11 |     Args:
12 |         data (List[str]): List of documents.
13 |         num_n (int): Max n-gram size to test up to. Defaults to 4.
14 | 
15 |     Returns:
16 |         float: n-gram diversity score.
17 |     """
18 |     score = 0
19 |     data = ' '.join(data).split(' ') # format to list of words
20 | 
21 |     for i in range(1, num_n + 1):
22 |         ngrams = list(nltk.ngrams(data, i))
23 |         # num unique ngrams / all ngrams for each size n
24 |         score += len(set(ngrams)) / len(ngrams)
25 | 
26 |     return round(score, 3)
--------------------------------------------------------------------------------
/diversity/patterns/Token.py:
--------------------------------------------------------------------------------
1 | 
2 | from typing import List, Tuple
3 | import nltk
4 | 
5 | 
6 | def token_patterns(
7 |     data: List[str],
8 |     n: int,
9 |     top_n: int = 10
10 | ) -> List[Tuple[str, int]]:
11 |     """ Finds n-gram patterns in the data.
12 | 
13 |     Args:
14 |         data (List[str]): Data to run frequency counts on.
15 |         n (int): N-gram length.
16 |         top_n (int, optional): Top patterns to display. Defaults to 10.
17 | 
18 |     Returns:
19 |         List[Tuple[str, int]]: Sorted list of top n-gram patterns.
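    Example (illustrative; ties in frequency keep first-occurrence order):
        >>> token_patterns(["to be or not to be", "to be is to do"], n=2, top_n=3)
        [('to be', 3), ('be or', 1), ('or not', 1)]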
20 | """ 21 | 22 | # treat data as one string 23 | all_data = ' '.join(data) 24 | 25 | ngrams = list(nltk.ngrams(all_data.split(' '), n)) 26 | frequency = nltk.FreqDist(ngrams) 27 | 28 | sorted_frequency = sorted(frequency.items(), key=lambda kv: kv[1], reverse=True)[:top_n] 29 | 30 | sorted_frequency = [(' '.join(x[0]), x[1]) for x in sorted_frequency] 31 | 32 | return sorted_frequency 33 | -------------------------------------------------------------------------------- /diversity/utils/memoize.py: -------------------------------------------------------------------------------- 1 | import collections 2 | import functools 3 | 4 | class memoized(object): 5 | ''' 6 | Decorator. Caches a function's return value each time it is called. 7 | If called later with the same arguments, the cached value is returned 8 | (not re-evaluated). 9 | From https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize. 10 | ''' 11 | 12 | def __init__(self, func): 13 | self.func = func 14 | self.cache = {} 15 | 16 | def __call__(self, *args): 17 | if not isinstance(args, collections.abc.Hashable): 18 | # better to not cache than blow up. 19 | return self.func(*args) 20 | if args in self.cache: 21 | return self.cache[args] 22 | else: 23 | value = self.func(*args) 24 | self.cache[args] = value 25 | return value 26 | 27 | def __repr__(self): 28 | '''Return the function's docstring.''' 29 | return self.func.__doc__ 30 | 31 | def __get__(self, obj, objtype): 32 | '''Support instance methods.''' 33 | return functools.partial(self.__call__, obj) 34 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_preprocessing/number.py: -------------------------------------------------------------------------------- 1 | import nltk 2 | nltk.download('punkt_tab') 3 | from nltk.tokenize import sent_tokenize 4 | import logging 5 | logger = logging.getLogger(__name__) 6 | 7 | 8 | def _tokenize_sentences(new_text): 9 | sentences = sent_tokenize(new_text) 10 | number_sentence_dict = {key:val for key, val in zip(range(1, len(sentences)+1), sentences)} 11 | return number_sentence_dict 12 | 13 | def number_text(text: str): 14 | """ Pre-processes and numbers each sentence in the document. 15 | 16 | Args: 17 | text (str): document 18 | 19 | Returns: 20 | str: document with sentence numbers prepending each sentence 21 | dict: mapping between sentence number and its corresponding sentence 22 | """ 23 | new_text = text.replace("“", "\'") 24 | new_text = new_text.replace("”", "\'") 25 | try: 26 | number_sentence_dict = _tokenize_sentences(new_text) 27 | except Exception as e: 28 | logger.error(e) 29 | return None, None 30 | 31 | 32 | numbered_text = "" 33 | for key in number_sentence_dict: 34 | numbered_text += "[%d] " % key 35 | numbered_text += number_sentence_dict[key] 36 | 37 | return numbered_text, number_sentence_dict -------------------------------------------------------------------------------- /test-diversity/hom_tests.py: -------------------------------------------------------------------------------- 1 | ''' 2 | Times and tests BERTScore, ROUGE and BLEU score computation over CNN-DailyMail 3 | summaries. 
4 | ''' 5 | 6 | from datasets import load_dataset 7 | from homogenization import homogenization_score 8 | from time import perf_counter as pc 9 | from numpy import round 10 | 11 | if __name__ == '__main__': 12 | data = load_dataset("argilla/cnn-dailymail-summaries")["train"].to_pandas().highlights.sample(500, random_state=1).values.tolist() 13 | 14 | start = pc() 15 | bs = homogenization_score(data, measure='bertscore', verbose=True, model="distilbert-base-uncased") 16 | end = pc() 17 | 18 | print(f"Time taken to compute BERTScore over CNN summaries: {round(end-start,2)}\nBERTScore over 500 reference summaries: {bs}") 19 | 20 | start = pc() 21 | rl = homogenization_score(data, measure='rougel', verbose=True) 22 | end = pc() 23 | 24 | print(f"Time taken to compute Rouge over CNN summaries: {round(end-start,2)} secs\nRouge over 500 reference summaries: {rl}") 25 | 26 | start = pc() 27 | rl = homogenization_score(data, measure='bleu', verbose=True) 28 | end = pc() 29 | 30 | print(f"Time taken to compute BLEU over CNN summaries: {round(end-start,2)} secs\nBLEU over 500 reference summaries: {rl}") 31 | -------------------------------------------------------------------------------- /diversity/utils/openai.py: -------------------------------------------------------------------------------- 1 | import openai 2 | from pydantic import BaseModel 3 | import os 4 | 5 | class GPT(): 6 | ''' 7 | Wrapper. Helps instantiate and make requests to openai clients. 8 | ''' 9 | def __init__(self, gpt_model, key): 10 | self.model = gpt_model 11 | openai.api_key = key 12 | self.client = openai.OpenAI() 13 | 14 | def call(self, prompt: str, system_prompt: str): 15 | response = self.client.chat.completions.create( 16 | model=self.model, 17 | messages=[ 18 | {"role": "system", "content": system_prompt}, 19 | {"role": "user", "content": prompt} 20 | ] 21 | ) 22 | msg = response.choices[0].message 23 | return msg.content 24 | 25 | 26 | def call_gpt_format(self, prompt: str, system_prompt: str, format): 27 | try: 28 | completion = self.client.beta.chat.completions.parse( 29 | model=self.model, 30 | messages=[ 31 | {"role": "system", "content": system_prompt}, 32 | {"role": "user", "content":prompt} 33 | ], 34 | response_format=format, 35 | ) 36 | 37 | answer = completion.choices[0].message.parsed 38 | return answer 39 | except: 40 | raise TypeError 41 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish to PyPI 2 | 3 | on: 4 | release: 5 | types: [published] 6 | workflow_dispatch: 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | 14 | - name: Set up Python 15 | uses: actions/setup-python@v5 16 | with: 17 | python-version: '3.11' 18 | 19 | - name: Install dependencies 20 | run: | 21 | python -m pip install --upgrade pip 22 | pip install build twine 23 | 24 | - name: Build package 25 | run: python -m build 26 | 27 | - name: Check package with twine 28 | run: twine check dist/* 29 | 30 | - name: Upload artifacts 31 | uses: actions/upload-artifact@v4 32 | with: 33 | name: python-package-distributions 34 | path: dist/ 35 | 36 | publish-to-pypi: 37 | name: Publish to PyPI 38 | needs: build 39 | runs-on: ubuntu-latest 40 | environment: 41 | name: pypi 42 | url: https://pypi.org/p/diversity 43 | permissions: 44 | id-token: write 45 | 46 | steps: 47 | - name: Download artifacts 48 | uses: actions/download-artifact@v4 
49 | with: 50 | name: python-package-distributions 51 | path: dist/ 52 | 53 | - name: Publish to PyPI 54 | uses: pypa/gh-action-pypi-publish@release/v1 55 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/answer.py: -------------------------------------------------------------------------------- 1 | from ...utils import openai 2 | 3 | system_prompt = "You are an expert reading comprehension agent. You will be given a passage with numbered sentences and a series of questions. For each question, your task is to extract all sentences that directly help answer it. You must return the question and a list of sentence numbers and sentences that answer it. The question may not always be answerable. In that case, return an empty list. Do NOT overgenerate. Do not modify the original text." 4 | 5 | class Answer(openai.BaseModel): 6 | question: str 7 | sentence_nums: list[int] 8 | sentences: list[str] 9 | 10 | class Response(openai.BaseModel): 11 | excerpts: list[Answer] 12 | 13 | 14 | def get_answer(gpt_model, numbered_segments: str, qud_list: str, num_quds: int, num_target_sentences: int, max_tries: int): 15 | for i in range(max_tries): 16 | try: 17 | prompt = "Passage: %s\nQuestions:\n%s" % (numbered_segments, qud_list) 18 | answer = gpt_model.call_gpt_format(prompt, system_prompt, Response) 19 | 20 | if len(answer.excerpts)!=num_quds: 21 | continue 22 | 23 | for ans in answer.excerpts: 24 | for sentence in ans.sentence_nums: 25 | if sentence<0 or sentence>num_target_sentences: 26 | continue 27 | 28 | return answer.model_dump_json() 29 | except Exception as e: 30 | print(e) 31 | continue 32 | 33 | return None -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_qud_generation/decontextualize.py: -------------------------------------------------------------------------------- 1 | from ...utils import openai 2 | 3 | system_prompt = "You will be given several numbered paragraphs. Decontextualize each paragraph such that the paragraph's general plot is captured. Names, places, extraneous details and descriptive language should all be abstracted away." 
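# A minimal usage sketch of the GPT wrapper + pydantic response formats that the qudsim
# modules rely on (illustration only, nothing in this module runs it; the model name is
# taken from config.yaml and the environment-variable key handling is an assumption):
#
#     import os
#     from pydantic import BaseModel
#     from diversity.utils.openai import GPT
#
#     class Blurb(BaseModel):
#         summary: str
#
#     gpt = GPT("gpt-4o-2024-08-06", os.environ["OPENAI_API_KEY"])
#     parsed = gpt.call_gpt_format("Summarize: ...", "You are a concise summarizer.", Blurb)
#     print(parsed.summary)  # structured output; free-text calls go through gpt.call(...)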
4 | 
5 | class Paragraph(openai.BaseModel):
6 |     para_num: int
7 |     para: str
8 | 
9 | class Answer(openai.BaseModel):
10 |     decontextualized_paragraphs: list[Paragraph]
11 | 
12 | def decontextualize(gpt_model: openai.GPT, text: str, num_segments: int, max_tries: int):
13 |     """Extracts and removes entities from segments
14 | 
15 |     Args:
16 |         gpt_model: an OpenAI client
17 |         text (str): string concatenation of segments (default delimiter is two newlines)
18 |         num_segments (int): number of segments being entity-extracted
19 |         max_tries (int): maximum number of attempts the client can make in case of failure
20 | 
21 |     Returns:
22 |         Answer: decontextualized segments wrapped in an openai.BaseModel class
23 |     """
24 | 
25 |     for i in range(max_tries):
26 |         try:
27 |             decontextualized = gpt_model.call_gpt_format(text, system_prompt, Answer)
28 | 
29 |             if not decontextualized:
30 |                 continue
31 | 
32 |             if len(decontextualized.decontextualized_paragraphs)!=num_segments:
33 |                 continue
34 | 
35 |             return decontextualized
36 |         except Exception as e:
37 |             continue
38 | 
39 |     return None
40 | 
41 | 
42 | 
--------------------------------------------------------------------------------
/diversity/qudsim_modules/qudsim_qud_generation/qud.py:
--------------------------------------------------------------------------------
1 | from ...utils import openai
2 | 
3 | system_prompt_high_level = "You will be given a paragraph. We are interested in forming unique, high-level, abstract QUDs with minimal details such that when they are answered, we understand the main themes of the paragraph. Details specific to the content should be omitted. QUDs should look like: What were the individual's greatest accomplishments? What legacy did the individual leave behind? First answer the minimum number of QUD(s) required. Then list the QUDs. Do not use conjunctions in the QUDs."
4 | 
5 | class QUD(openai.BaseModel):
6 |     qud: str
7 | 
8 | class Answer(openai.BaseModel):
9 |     num_quds: int
10 |     quds: list[QUD]
11 | 
12 | def generate_quds(gpt_model: openai.GPT, segment: str, max_tries: int):
13 |     """Generate Question(s) Under Discussion for a given segment
14 | 
15 |     Args:
16 |         gpt_model: an OpenAI client
17 |         segment (str): the segment for which QUDs need to be generated (QUDs are answered by the segment)
18 |         max_tries (int): maximum number of attempts the client can make in case of failure
19 | 
20 |     Returns:
21 |         Answer: QUDs wrapped in an openai.BaseModel class
22 |     """
23 |     for i in range(max_tries):
24 |         try:
25 |             quds = gpt_model.call_gpt_format(segment, system_prompt_high_level, Answer)
26 | 
27 |             if not quds or len(quds.quds)==0:
28 |                 continue
29 | 
30 |             return quds
31 |         except Exception as e:
32 |             continue
33 | 
34 |     return None
35 | 
--------------------------------------------------------------------------------
/diversity/self_repetition.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from nltk.util import ngrams
3 | 
4 | from tqdm import tqdm
5 | from typing import List
6 | from collections import Counter
7 | 
8 | def self_repetition_score(
9 |     dataset: List[str],
10 |     n: int = 4,
11 |     verbose: bool = True
12 | ) -> float:
13 |     """
14 |     Calculates a self-repetition score for a dataset based on the
15 |     repetition of ngrams within the corpus.
16 | 
17 |     Args:
18 |         dataset (List[str]): A list of documents (strings) to analyze.
19 |         n (int): Size of the ngrams to check for repetition. Defaults to 4.
20 |         verbose (bool): Whether to show a progress bar. Defaults to True.
21 | 
22 |     Returns:
23 |         float: The self-repetition score, averaged over the dataset.
24 |     """
25 |     total_sum = 0
26 | 
27 |     # Get all unique ngrams per doc
28 |     ngram_docs = [list(set([' '.join(ngram) for ngram in ngrams(doc.split(), n)])) for doc in dataset]
29 | 
30 |     # Count occurrences of unique ngrams across whole dataset
31 |     all_ngrams = sum(ngram_docs, [])
32 |     ngram_counts = Counter(all_ngrams)
33 | 
34 |     for ngram_doc in tqdm(ngram_docs, desc="Calculating self-repetition score", disable=(not verbose)):
35 |         # Find the total occurrence of an n-gram and subtract current doc's n-gram count
36 |         # to get the count of occurrences of an n-gram in other docs
37 |         sum_ni = sum([ngram_counts[ngram] for ngram in ngram_doc]) - len(ngram_doc)
38 | 
39 |         # add-one to avoid zero error
40 |         total_sum += np.log(sum_ni + 1)
41 |     return total_sum / len(dataset)
--------------------------------------------------------------------------------
/diversity/compression.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from pathlib import Path
3 | 
4 | import tempfile
5 | import gzip
6 | import os
7 | import lzma as xz
8 | 
9 | 
10 | def compression_ratio(
11 |     data: List[str],
12 |     algorithm: str = 'gzip',
13 |     verbose: bool = False,
14 |     path: Optional[str] = None
15 | ) -> float:
16 |     """ Calculates the compression ratio for a collection of text.
17 |     Args:
18 |         data (List[str]): Strings to compress.
19 |         algorithm (str, optional): Either 'gzip' or 'xz'. Defaults to 'gzip'.
20 |         verbose (bool, optional): Print out the original and compressed size separately. Defaults to False.
21 |         path (str, optional): Path to store temporarily zipped files. Defaults to a temporary directory.
22 |     Returns:
23 |         float: Compression ratio (original size / compressed size)
24 |     """
25 | 
26 |     temp_dir = None
27 |     if not path:
28 |         temp_dir = tempfile.TemporaryDirectory()
29 |         path = Path(temp_dir.name)
30 |     else:
31 |         path = Path(path)
32 | 
33 |     with (path / 'original.txt').open('w+') as f:
34 |         f.write(' '.join(data))
35 | 
36 |     original_size = os.path.getsize(os.path.join(path, "original.txt"))
37 | 
38 |     if algorithm == 'gzip':
39 | 
40 |         with gzip.GzipFile(str(path / 'compressed.gz'), 'w+') as f:
41 |             f.write(gzip.compress(' '.join(data).encode('utf-8')))
42 | 
43 |         compressed_size = os.path.getsize(os.path.join(path, "compressed.gz"))
44 | 
45 |     elif algorithm == 'xz':
46 | 
47 |         with xz.open(str(path / 'compressed.gz'), 'wb') as f:
48 |             f.write(' '.join(data).encode('utf-8'))
49 | 
50 |         compressed_size = (path / "compressed.gz").stat().st_size
51 | 
52 |     if verbose:
53 |         print(f"Original Size: {original_size}\nCompressed Size: {compressed_size}")
54 | 
55 |     if temp_dir:
56 |         temp_dir.cleanup()
57 | 
58 |     return round(original_size / compressed_size, 3)
59 | 
60 | 
--------------------------------------------------------------------------------
/examples/summarization.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from diversity import get_pos, pos_patterns, token_patterns, compression_ratio
3 | from transformers import pipeline
4 | from datasets import load_dataset
5 | 
6 | import typer
7 | 
8 | app = typer.Typer()
9 | 
10 | @app.command()
11 | def summarization(
12 |     dataset: str,
13 |     column: str,
14 |     split: str,
15 |     model: str = "t5-base",
16 |     tokenizer: Optional[str] = None,
17 |     ngram: Optional[int] = 5
18 | ):
19 |     tokenizer = tokenizer or model
20 | 
21 |     summarizer = 
pipeline( 22 | "summarization", 23 | model=model, 24 | tokenizer=tokenizer, 25 | return_text=True) 26 | 27 | # load dataset (either custom CSV or dataset from HF) 28 | if dataset.endswith('.csv'): 29 | data = load_dataset("csv", data_files=dataset)[split][:10][column] 30 | else: 31 | data = load_dataset(dataset) 32 | data = data[split][:10][column] 33 | 34 | # generate the summaries 35 | outputs = summarizer(data) 36 | outputs = [instance['summary_text'] for instance in outputs] 37 | 38 | # get the token-level patterns 39 | patterns_token = token_patterns(outputs, ngram) 40 | 41 | # get the POS patterns 42 | joined_pos, tuples = get_pos(outputs) 43 | ngrams_pos = token_patterns(joined_pos, ngram) 44 | 45 | # for the top n-gram patterns, cycle through and get the matching text 46 | text_matches = {} 47 | 48 | for pattern, _ in ngrams_pos: 49 | text_matches['pattern'] = pos_patterns(tuples, pattern) 50 | 51 | # get the compression score 52 | compression = compression_ratio(outputs, 'gzip') 53 | 54 | # TODO: function to nicely display results 55 | print(patterns_token) 56 | print(text_matches) 57 | print(compression) 58 | 59 | # TODO: compare between two models 60 | return 61 | 62 | 63 | def display_results(): 64 | pass 65 | 66 | 67 | if __name__ == "__main__": 68 | app() 69 | -------------------------------------------------------------------------------- /scripts/timing.py: -------------------------------------------------------------------------------- 1 | import time 2 | import pandas as pd 3 | import numpy as np 4 | 5 | from diversity import compression_ratio, homogenization_score, ngram_diversity_score 6 | 7 | # Replace this with the actual path where your datasets are located 8 | dataset_path = "synthetic_datasets/" 9 | 10 | # The number of times to repeat each experiment 11 | num_experiments = 10 12 | 13 | # Initialize results dictionary 14 | results = { 15 | 'Dataset': [], 16 | 'Function': [], 17 | 'Mean Time': [], 18 | 'Std Dev Time': [], 19 | } 20 | 21 | # Define the timing function 22 | def time_function(func, *args): 23 | times = [] 24 | for _ in range(num_experiments): 25 | start_time = time.time() 26 | func(*args) 27 | end_time = time.time() 28 | times.append(end_time - start_time) 29 | return np.mean(times), np.std(times) 30 | 31 | # Iterate over datasets and functions 32 | for i in range(1, 6): 33 | dataset_filename = f"dataset_{i}.txt" 34 | dataset = [x.strip() for x in open(dataset_path + dataset_filename).read().split("\n")][:10] 35 | 36 | # Measure compression_ratio 37 | cr_mean, cr_std = time_function(compression_ratio, dataset, 'gzip') 38 | results['Dataset'].append(i) 39 | results['Function'].append('compression_ratio') 40 | results['Mean Time'].append(cr_mean) 41 | results['Std Dev Time'].append(cr_std) 42 | 43 | # Measure homogenization_score with rougel 44 | hs_mean, hs_std = time_function(homogenization_score, dataset, 'rougel') 45 | results['Dataset'].append(i) 46 | results['Function'].append('homogenization_score_rougel') 47 | results['Mean Time'].append(hs_mean) 48 | results['Std Dev Time'].append(hs_std) 49 | 50 | # Measure ngram_diversity_score 51 | nds_mean, nds_std = time_function(ngram_diversity_score, dataset, 4) 52 | results['Dataset'].append(i) 53 | results['Function'].append('ngram_diversity_score') 54 | results['Mean Time'].append(nds_mean) 55 | results['Std Dev Time'].append(nds_std) 56 | 57 | # Convert results to DataFrame and save as CSV 58 | results_df = pd.DataFrame(results) 59 | results_csv_path = dataset_path + 'timing_experiments_results.csv' 
60 | results_df.to_csv(results_csv_path, index=False) 61 | 62 | print(f"Results saved to {results_csv_path}") 63 | -------------------------------------------------------------------------------- /diversity/patterns/part_of_speech.py: -------------------------------------------------------------------------------- 1 | from typing import List, Tuple, Any, Set 2 | 3 | import spacy 4 | 5 | def get_pos( 6 | data: List[str] 7 | ) -> Tuple[List[str], List[Tuple[str, str]]]: 8 | """ Turns a sequence into parts of speech. 9 | 10 | Args: 11 | data (List[str]): Data to tranform into part of speech tags. 12 | 13 | Returns: 14 | Tuple[List[str], List[Tuple[str, str]]]: Part-of-speech tags only, tuple of (token, part-of-speech tag). 15 | """ 16 | nlp = spacy.load("en_core_web_sm", enable=["tok2vec", "tagger"]) 17 | 18 | pos_tuples = [] 19 | joined_pos = [] 20 | joined_text = [] 21 | 22 | docs = nlp.pipe(data, n_process=4, batch_size=1000) 23 | 24 | for doc in docs: 25 | joined_text.append(' '.join([token.text for token in doc])) 26 | joined_pos.append(' '.join([token.tag_ for token in doc])) 27 | pos_tuples.append([(token.text, token.tag_) for token in doc]) 28 | 29 | return joined_pos, pos_tuples 30 | 31 | 32 | def _find_sub_list( 33 | sl: List[Any], 34 | l: List[Any] 35 | ) -> List[Any]: 36 | """ Given a pattern and a list of strings, returns sublists matching the pattern. """ 37 | 38 | results = [] 39 | sll = len(sl) 40 | for ind in (i for i,e in enumerate(l) if e==sl[0]): 41 | if l[ind:ind+sll]==sl: 42 | results.append((ind, ind+sll-1)) 43 | 44 | return results 45 | 46 | 47 | def pos_patterns( 48 | text: List[List[Tuple[str, str]]], 49 | pattern: str 50 | ) -> Set[str]: 51 | """ Finds all substrings matching a part of speech pattern. 52 | 53 | Args: 54 | text (List[List[Tuple[str, str]]]): Text containing words and part-of-speech tags. 55 | pattern (str): Part-of-speech tag pattern to search for. 56 | 57 | Returns: 58 | Set[str]: Returns all the string matching the pattern. 
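    Example (illustrative; exact tags depend on the spaCy model, cf. `extract_patterns`):
        >>> joined_pos, tuples = get_pos(["The quick brown fox jumps over the lazy dog."])
        >>> pos_patterns(tuples, "DT JJ NN NN")
        {'The quick brown fox'}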
59 | """ 60 | 61 | pos = [] 62 | word = [] 63 | 64 | # text is a list of lists of tuples (word, part of speech) 65 | for doc in text: 66 | pos.append([i[1] for i in doc]) 67 | word.append([i[0] for i in doc]) 68 | 69 | pos = [' '.join(x) for x in pos] 70 | word = [' '.join(x) for x in word] 71 | 72 | all_matches = [] 73 | 74 | # return positions of each tag and the corresponding tokens 75 | for w, p in zip(word, pos): 76 | 77 | test = _find_sub_list(pattern.split(), p.split()) 78 | 79 | if test: 80 | for occ in test: 81 | splits = w.split()[int(occ[0]):int(occ[1]+1)] 82 | all_matches.append(" ".join(splits)) 83 | 84 | return set(all_matches) 85 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/align.py: -------------------------------------------------------------------------------- 1 | from .answer import get_answer 2 | from .similarity import get_harmonic_similarity 3 | import numpy as np 4 | 5 | def align(gpt_model, 6 | source_quds, 7 | target_text, 8 | target_quds, 9 | source_text, 10 | num_source_segments, 11 | num_target_segments, 12 | source_segment_qud_dict, 13 | target_segment_qud_dict, 14 | source_segments, 15 | target_segments, 16 | num_source_sentences, 17 | num_target_sentences, 18 | threshold, 19 | max_tries): 20 | 21 | # get [A_q for q in source_quds] 22 | quds = [] 23 | for q_group in source_quds: 24 | for q in eval(q_group)['quds']: 25 | quds.append(q['qud']) 26 | 27 | source_qud_list = "\n\n".join(quds) 28 | source_qud_answers = get_answer(gpt_model, target_text, source_qud_list, len(quds), num_target_sentences, max_tries) 29 | 30 | if source_qud_answers is None: 31 | print("Finding an answer given source QUDs and target document was unsuccessful") 32 | return None, None, [], [] 33 | else: 34 | source_qud_answers = eval(source_qud_answers) 35 | 36 | # get [A_q for q in target_quds] 37 | quds = [] 38 | for q_group in target_quds: 39 | for q in eval(q_group)['quds']: 40 | quds.append(q['qud']) 41 | 42 | target_qud_list = "\n\n".join(quds) 43 | target_qud_answers = get_answer(gpt_model, source_text, target_qud_list, len(quds), num_source_sentences, max_tries) 44 | 45 | if target_qud_answers is None: 46 | print("Finding an answer given target QUDs and source document was unsuccessful") 47 | return None, None, [], [] 48 | else: 49 | target_qud_answers = eval(target_qud_answers) 50 | 51 | # get harmonic similarity 52 | harmonic_mean_scores = get_harmonic_similarity(num_target_segments, 53 | num_source_segments, 54 | source_qud_answers, 55 | source_segment_qud_dict, 56 | target_segments, 57 | target_qud_answers, 58 | target_segment_qud_dict, 59 | source_segments) 60 | 61 | aligned_segments = np.where(np.array(harmonic_mean_scores) float: 21 | """ 22 | Calculates the remote clique score for a set of documents (corpus-level). 23 | This is the average mean pairwise distance of a data instance to other instances. 24 | Args: 25 | data (List[str]): Strings to score. 26 | model(str, optional): Model to use for embedding. Defaults to 'Qwen/Qwen3-Embedding-0.6B'. 27 | verbose(bool, optional): Whether to display progress bar. Defaults to True. 28 | batch_size(int, optional): Batch size for embedding. Defaults to 64. 29 | Returns: 30 | float: Remote clique score. 
31 | """ 32 | model = SentenceTransformer(model) 33 | embeddings = model.encode(data, batch_size=batch_size, show_progress_bar=verbose) 34 | distances = cosine_distances(embeddings) 35 | mean_distances = np.mean(distances, axis=1) 36 | return np.mean(mean_distances).round(3) 37 | 38 | 39 | def chamfer_dist( 40 | data: List[str], 41 | model: Optional[str] = 'Qwen/Qwen3-Embedding-0.6B', 42 | verbose: Optional[bool] = True, 43 | batch_size: Optional[int] = 64 44 | ) -> float: 45 | """ 46 | Calculates the chamfer distance for a set of documents (corpus-level). 47 | This is the average minimum pairwise distance of a data instance to other instances. 48 | Args: 49 | data (List[str]): Strings to score. 50 | model(str, optional): Model to use for embedding. Defaults to 'Qwen/Qwen3-Embedding-0.6B'. 51 | verbose(bool, optional): Whether to display progress bar. Defaults to True. 52 | batch_size(int, optional): Batch size for embedding. Defaults to 64. 53 | Returns: 54 | float: Chamfer distance. 55 | """ 56 | model = SentenceTransformer(model) 57 | embeddings = model.encode(data, batch_size=batch_size, show_progress_bar=verbose) 58 | distances = cosine_distances(embeddings) 59 | min_distances = np.min(distances + np.eye(len(distances)) * 1e9, axis=1) 60 | return np.mean(min_distances).round(3) 61 | -------------------------------------------------------------------------------- /diversity/homogenization.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional 2 | from tqdm import tqdm 3 | from rouge_score import rouge_scorer 4 | from evaluate import load 5 | 6 | def homogenization_score( 7 | data: List[str], 8 | measure: str = 'rougel', 9 | use_stemmer: Optional[str] = False, 10 | model: Optional[str] = "microsoft/deberta-base-mnli", 11 | verbose: Optional[bool] = True, 12 | batch_size: Optional[int] = 64 13 | ) -> float: 14 | """ 15 | Calculates the homogenization score for a set of documents (corpus-level). 16 | From https://arxiv.org/pdf/2309.05196.pdf 17 | Args: 18 | data (List[str]): Strings to score. 19 | measure (str, optional): Either 'rougel', 'bertscore', or 'bleu'. Defaults to 'rougel'. 20 | use_stemmer(str, optional): Whether to use stemming in the ROUGE-L calculation. Defaults to False. 21 | model(str, optional): Model to use for BERTScore. Defaults to 'microsoft/deberta-base-mnli'. 22 | verbose(bool, optional): Whether to display progress bar. Defaults to True. 23 | Returns: 24 | float: Homogenization score. 
25 | """ 26 | 27 | if measure == 'rougel': 28 | scorer = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=use_stemmer) 29 | elif measure == 'bertscore': 30 | scorer = load("bertscore") 31 | elif measure == 'bleu': 32 | scorer = load("bleu") 33 | else: 34 | raise ValueError("Scoring measure must be one of `rougel`, `bleu`, or `bertscore`.") 35 | 36 | corpus_score = 0 37 | 38 | if verbose: 39 | print('==> Scoring all pairs') 40 | 41 | for i, ref in tqdm(enumerate(data), total=len(data), disable=(not verbose)): 42 | # Get all the other utterances to compare against a specific utterance 43 | preds = [x for j,x in enumerate(data) if j!=i] 44 | refs = [ref for _ in range(len(preds))] 45 | 46 | # Get scores over whole batch and sum it up 47 | if measure=='rougel': 48 | doc_score = sum([scorer.score(pred, ref)['rougeL'].fmeasure for pred in preds]) 49 | elif measure=='bertscore': 50 | doc_score = sum(scorer.compute(predictions=preds, 51 | references=refs, 52 | model_type=model, 53 | batch_size=batch_size)['f1']) 54 | elif measure=='bleu': 55 | # Need to double check that this is right 56 | doc_score = scorer.compute(predictions=preds, 57 | references=[[r] for r in refs])['bleu'] 58 | # Then average 59 | corpus_score += doc_score / (len(data) - 1) 60 | 61 | # case where all strings are the exact same in the list 62 | if corpus_score == 0: 63 | corpus_score += len(data) 64 | 65 | # returns corpus level homogenization score 66 | return round(corpus_score/len(data), 3) 67 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_qud_generation/segment.py: -------------------------------------------------------------------------------- 1 | 2 | from ...utils import openai 3 | 4 | system_prompt = "You will be given text with numbered sentences and your task is to redraw the paragraph boundaries such that each chunk is about one atomic topic. Each segment cannot be about multiple topics or about a complex topic. You may not change the text or change the order of the sentences. For each segment, provide the list of sentence numbers that belong to that segment." 
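# Illustrative input/output sketch for segment() below (hypothetical data, nothing here is
# produced or required by this module):
#
#     passage:       "[1] Ada Lovelace was born in 1815. [2] She described the Analytical
#                     Engine. [3] The engine itself was never completed."
#     sentence_dict: {1: "Ada Lovelace was born in 1815.", 2: "She described ...", 3: "The engine ..."}
#     returns:       (Answer(segmentation=[Segment(sentences=[1, 2]), Segment(sentences=[3])]),
#                     ["Ada Lovelace was born in 1815. She described the Analytical Engine. ",
#                      "The engine itself was never completed. "])
#
# i.e. each Segment lists the sentence numbers that form one thematically atomic chunk, and
# the second element holds the corresponding concatenated sentence text.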
5 | 6 | class Segment(openai.BaseModel): 7 | sentences: list[int] 8 | 9 | class Answer(openai.BaseModel): 10 | segmentation: list[Segment] 11 | 12 | def segment(gpt_model: openai.GPT, passage: str, sentence_dict: dict, max_tries: int): 13 | """Segment the passage such that each segment is thematically atomic 14 | 15 | Args: 16 | gpt_model: an OpenAI client 17 | passage (str): Passage/document 18 | sentence_dict (dict): Dictionary mapping between sentence numbers and corresponding sentences in the passage 19 | max_tries (int): maximum number of attempts the client can make in case of failure 20 | 21 | Returns: 22 | Answer: Segments wrapped in an openai.BaseModel class 23 | """ 24 | text = " ".join(passage.split("\n\n")) 25 | 26 | document_sentences = [k for k,v in sentence_dict.items()] 27 | document_sentences = [int(num) for num in document_sentences if type(num)!=int] 28 | 29 | 30 | for i in range(max_tries): 31 | try: 32 | segments = gpt_model.call_gpt_format(text, system_prompt, Answer) 33 | 34 | if not segments: 35 | continue 36 | 37 | # all sentences in document must be accounted for 38 | segmented_sentences = set() 39 | for segment in segments.segmentation: 40 | for sentence_num in segment.sentences: 41 | segmented_sentences.add(sentence_num) 42 | if len(set(document_sentences).difference(set(segmented_sentences)))!=0: 43 | # extraneous or missing sentences 44 | continue 45 | 46 | 47 | # all sentences in each segments must be in document 48 | successful_segmentation = True 49 | segmented_text = [] 50 | for segment in segments.segmentation: 51 | segment_text = "" 52 | for sentence_num in segment.sentences: 53 | try: 54 | sentence = sentence_dict[str(sentence_num)] 55 | segment_text += sentence + " " 56 | except: 57 | try: 58 | sentence = sentence_dict[sentence_num] 59 | segment_text += sentence + " " 60 | except: 61 | successful_segmentation = False 62 | segmented_text.append(segment_text) 63 | 64 | if successful_segmentation: 65 | return segments, segmented_text 66 | 67 | except Exception as e: 68 | print(e) 69 | continue 70 | 71 | return None, None 72 | 73 | 74 | 75 | 76 | 77 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | class SimilarityMetric(): 4 | def _calculate_document_similarity(self, segment_scores): 5 | max_sim_scores = np.max(segment_scores, axis=1) 6 | overall_similarity = np.average(max_sim_scores) 7 | return overall_similarity 8 | 9 | class FrequencyBasedSimilarity(SimilarityMetric): 10 | 11 | def _count_sentences(self, qud_answers, 12 | num_target_segments:int, 13 | target_segments): 14 | 15 | arr = np.zeros((len(qud_answers['excerpts']), num_target_segments)) 16 | 17 | for i, qud_ans in enumerate(qud_answers['excerpts']): 18 | answer_sentences = qud_ans['sentence_nums'] 19 | if len(answer_sentences)==0: 20 | continue 21 | for j, target_segment in enumerate(eval(target_segments)['segmentation']): 22 | target_segment_sentences = set(target_segment['sentences']) 23 | intersection = target_segment_sentences.intersection(answer_sentences) 24 | arr[i][j] = len(intersection) / len(answer_sentences) 25 | 26 | return arr 27 | 28 | def _get_segment_scores(self, sentence_count_map, 29 | source_seg_qud_dict, 30 | num_source_segments, 31 | num_target_segments): 32 | 33 | segment_scores = np.zeros((num_source_segments, num_target_segments)) 34 | 35 | for src, quds in 
source_seg_qud_dict.items(): 36 | segment_scores[int(src)] = np.average(sentence_count_map[quds], axis=0) 37 | 38 | return segment_scores 39 | 40 | def calculate_similarity(self, num_target_segments:int, 41 | num_source_segments:int, 42 | qud_answers:dict, 43 | source_seg_qud_dict: dict, 44 | target_segments: str): 45 | """ Calculates directional similarity between a source and target document 46 | (Figure 2 in https://arxiv.org/pdf/2504.09373) 47 | 48 | Args: 49 | num_target_segments (int): Number of target segments 50 | num_source_segments (int): Number of source segments 51 | qud_answers (dict): JSON representation of answers to source quds given target document 52 | source_seg_qud_dict (dict): Mapping between source segment indices and a list of corresponding QUD indices 53 | target_segments (str): JSON string representation of target segments (qudsim_qud_generation.segment.Answer) 54 | 55 | Returns: 56 | ndarray: array of dimensions (num_source_segments, num_target_segments) 57 | representing similarity scores between each pair of segments 58 | """ 59 | sentence_count_map = self._count_sentences(qud_answers, 60 | num_target_segments, 61 | target_segments) 62 | 63 | segment_scores = self._get_segment_scores(sentence_count_map, 64 | source_seg_qud_dict, 65 | num_source_segments, 66 | num_target_segments) 67 | 68 | return segment_scores -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_alignment/similarity.py: -------------------------------------------------------------------------------- 1 | from .metric import FrequencyBasedSimilarity 2 | import numpy as np 3 | 4 | def _get_frequency_similarities(num_target_segments, 5 | num_source_segments, 6 | qud_answers, 7 | source_segment_qud_dict, 8 | target_segments): 9 | similarity_metric = FrequencyBasedSimilarity() 10 | 11 | segment_scores = similarity_metric.calculate_similarity(num_target_segments, 12 | num_source_segments, 13 | qud_answers, 14 | source_segment_qud_dict, 15 | target_segments) 16 | return segment_scores 17 | 18 | def get_harmonic_similarity(num_target_segments, 19 | num_source_segments, 20 | source_qud_answers, 21 | source_segment_qud_dict, 22 | target_segments, 23 | target_qud_answers, 24 | target_segment_qud_dict, 25 | source_segments): 26 | """ Calculates harmonic mean between source->target and target->source similarities 27 | 28 | Args: 29 | num_target_segments (int): Number of target segments 30 | num_source_segments (int): Number of source segments 31 | source_qud_answers (dict): JSON representation of answers to source quds given target document 32 | source_seg_qud_dict (dict): Mapping between source segment indices and a list of corresponding QUD indices 33 | target_segments (str): JSON string representation of target segments (qudsim_qud_generation.segment.Answer) 34 | target_qud_answers (dict): JSON representation of answers to target quds given source document 35 | target_seg_qud_dict (dict): Mapping between target segment indices and a list of corresponding QUD indices 36 | source_segments (str): JSON string representation of source segments (qudsim_qud_generation.segment.Answer) 37 | 38 | Returns: 39 | ndarray: array of dimensions (num_source_segments, num_target_segments) 40 | representing harmonic mean of direction similarity scores between each pair of segments 41 | """ 42 | 43 | src_to_tgt_segment_scores = _get_frequency_similarities(num_target_segments, 44 | num_source_segments, 45 | source_qud_answers, 46 | source_segment_qud_dict, 47 | 
target_segments) 48 | tgt_to_src_segment_scores = _get_frequency_similarities(num_source_segments, 49 | num_target_segments, 50 | target_qud_answers, 51 | target_segment_qud_dict, 52 | source_segments) 53 | 54 | denom = src_to_tgt_segment_scores + np.transpose(tgt_to_src_segment_scores) 55 | denom = np.where(denom>0, denom, 1) 56 | 57 | harmonic_mean_scores = 2*(src_to_tgt_segment_scores*np.transpose(tgt_to_src_segment_scores))/denom 58 | 59 | return harmonic_mean_scores -------------------------------------------------------------------------------- /diversity/functions.py: -------------------------------------------------------------------------------- 1 | """ Functions for extracting patterns and matching text to patterns. """ 2 | 3 | import numpy as np 4 | import itertools 5 | from typing import List, Optional 6 | from tqdm import tqdm 7 | from .patterns import token_patterns, get_pos, pos_patterns 8 | from nltk.tokenize import sent_tokenize 9 | 10 | def extract_patterns(text: List[str], 11 | n: int = 5, 12 | top_n: int = 100 13 | ) -> dict: 14 | """ Extracts text and part-of-speech patterns from text input. 15 | Used to return a dictionary of patterns and the corresponding text match. 16 | Args: 17 | text (List[str]): List of strings to extract patterns from. 18 | n (int, optional): N-gram size. Defaults to 5. 19 | top_n (int, optional): Number of top patterns to extract. Defaults to 100. 20 | Returns: 21 | dict: Dictionary of patterns and their corresponding text. 22 | 23 | Example Usage: 24 | >>> text = ["The quick brown fox jumps over the lazy dog.", 25 | "The slow red fox walks on the hyper dog."] 26 | >>> extract_patterns(text, 4) 27 | 28 | {'DT JJ NN NN': {'The quick brown fox'}, 29 | 'JJ NN NN VBZ': {'quick brown fox jumps'}, 30 | 'NN NN VBZ IN': {'brown fox jumps over'}, 31 | 'NN VBZ IN DT': {'fox jumps over the'}, 32 | 'VBZ IN DT JJ': {'jumps over the lazy'}, 33 | 'IN DT JJ NN': {'over the lazy dog.'}, 34 | 'DT JJ JJ NN': {'The slow red fox'}, 35 | 'JJ JJ NN NNS': {'slow red fox walks'}, 36 | 'JJ NN NNS IN': {'red fox walks on'}, 37 | 'NN NNS IN DT': {'fox walks on the'}, 38 | 'NNS IN DT NN': {'walks on the hyper'}, 39 | 'IN DT NN NN': {'on the hyper dog.'}} 40 | """ 41 | 42 | # sentence tokenize then search for patterns in the entire list 43 | outputs = sent_tokenize(' '.join(text)) 44 | 45 | # get the token (word)-level patterns 46 | patterns_token = token_patterns(outputs, n) 47 | 48 | # get the part-of-speech patterns (only include top_n patterns) 49 | joined_pos, tuples = get_pos(outputs) 50 | ngrams_pos = token_patterns(joined_pos, n, top_n) 51 | 52 | # for the top n-gram patterns, cycle through and get the matching text 53 | text_matches = {} 54 | 55 | for pattern, _ in ngrams_pos: 56 | text_matches[pattern] = pos_patterns(tuples, pattern) 57 | 58 | return text_matches 59 | 60 | 61 | def match_patterns(text: str, 62 | patterns: dict 63 | ) -> List[tuple]: 64 | """ Matches text to part-of-speech patterns extracted from the `extract_patterns` function. 65 | Given set of patterns, used to identify which patterns appears in a single input text. 66 | Args: 67 | text (str): Text to match patterns to. 68 | patterns (dict): Dictionary of patterns and their corresponding text. 69 | Returns: 70 | List[tuple]: List of tuples with the pattern and the text that matched. 71 | 72 | Example Usage: 73 | >>> text = ["The quick brown fox jumps over the lazy dog.", 74 | "The slow red fox walks on the hyper dog.", 75 | "The cranky blue cat scratches the calm fish." 
] 76 | >>> patterns = extract_patterns(text, 4) 77 | >>> match_patterns(text[2], patterns) 78 | 79 | [('DT NN JJ NN', 'The cranky blue cat'), 80 | ('NN JJ NN VBZ', 'cranky blue cat scratches'), 81 | ('JJ NN VBZ DT', 'blue cat scratches the'), 82 | ('NN VBZ DT NN', 'cat scratches the calm'), 83 | ('VBZ DT NN NN', 'scratches the calm fish.')] 84 | """ 85 | 86 | matches = [] 87 | 88 | for pattern, text_match in patterns.items(): 89 | for substr in text_match: 90 | if substr in text: 91 | matches.append((pattern, substr)) 92 | 93 | return matches 94 | -------------------------------------------------------------------------------- /diversity/qudsim_modules/qudsim_qud_generation/pipeline.py: -------------------------------------------------------------------------------- 1 | from .decontextualize import decontextualize 2 | from .qud import generate_quds 3 | from .segment import segment 4 | 5 | def _get_qud_dict(quds): 6 | qud_segment_dict = {} 7 | segment_qud_dict = {} 8 | num_quds = 0 9 | 10 | for i, source_qud in enumerate(quds): 11 | qud_idx_list = [] 12 | for q in eval(source_qud)['quds']: 13 | qud_segment_dict[num_quds] = i 14 | qud_idx_list.append(num_quds) 15 | num_quds+=1 16 | segment_qud_dict[i] = qud_idx_list 17 | 18 | return segment_qud_dict, qud_segment_dict 19 | 20 | def get_quds(gpt_model, text: str, sentence_num_dict: dict, level: int, max_tries: int): 21 | """Performs segmentation, entity extraction and QUD generation on the text at the specified level 22 | 23 | Args: 24 | gpt_model: an OpenAI client 25 | text (str): passage/document 26 | sentence_dict (dict): Dictionary mapping between sentence numbers and corresponding sentences in the passage 27 | level (0/1): Abstraction level of QUDs (1=abstract; 0=specific) 28 | max_tries (int): maximum number of attempts the client can make in case of failure 29 | 30 | Returns: 31 | dict: results of the pipeline: 32 | segments: segmentation json 33 | segment_dict: Maps sentence number to its corresponding segment 34 | entity_abstracted_segments: segments post-entity-extraction 35 | quds: quds for the document in json 36 | segment_qud_dict: Maps segment index to a list of corresponding QUD indices 37 | qud_segment_dict: Maps QUD indices to their corresponding segment indices 38 | """ 39 | 40 | # SEGMENTATION 41 | segments, segmented_text = segment(gpt_model, text, sentence_num_dict, max_tries) 42 | if segments is None or segmented_text is None: 43 | print("Segmentation was unsuccessful") 44 | return None 45 | 46 | segments_json = segments.model_dump_json() 47 | 48 | # DICTIONARY: SENTENCE --> SEGMENT 49 | segment_dict = {} 50 | for i, s in enumerate(segments.segmentation): 51 | for sentence_num in s.sentences: 52 | segment_dict[sentence_num] = i 53 | 54 | if level==1: 55 | # ENTITY ABSTRACTION 56 | numbered_segment_text = "\n\n".join(["[" + str(i)+ "] " + seg for i, seg in enumerate(segmented_text)]) 57 | decontextualized_segments = decontextualize(gpt_model, 58 | numbered_segment_text, 59 | len(segmented_text), 60 | max_tries) 61 | if decontextualized_segments is None: 62 | print("Entity abstraction was unsuccessful.") 63 | return None 64 | 65 | decontextualized_segments_json = decontextualized_segments.model_dump_json() 66 | 67 | # QUD GENERATION - Level 1 68 | quds = [(generate_quds(gpt_model, seg.para, max_tries)).model_dump_json() 69 | for seg in decontextualized_segments.decontextualized_paragraphs] 70 | if quds is None: 71 | return None 72 | else: 73 | decontextualized_segments_json = None 74 | 75 | # QUD GENERATION - Level 0 76 | 
quds = [(generate_quds(gpt_model, seg, max_tries)).model_dump_json() for seg in segmented_text]
77 |         if quds is None:
78 |             return None
79 | 
80 |     segment_qud_dict, qud_segment_dict = _get_qud_dict(quds)
81 |     qg_output_item = {"segments": segments_json,
82 |                     "segment_dict": segment_dict,
83 |                     "entity_abstracted_segments": decontextualized_segments_json,
84 |                     "quds": quds,
85 |                     "segment_qud_dict": segment_qud_dict,
86 |                     "qud_segment_dict": qud_segment_dict}
87 | 
88 |     return qg_output_item
--------------------------------------------------------------------------------
/diversity/template.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List, Optional
3 | from .functions import extract_patterns
4 | from typing import Dict, Iterable, List, Optional
5 | 
6 | 
7 | def template_rate(
8 |     data: List[str],
9 |     templates: Optional[Dict[str, Iterable[str]]] = None,
10 |     shard_size: int = 500,
11 | 
12 | ) -> float:
13 |     """
14 |     Calculates the template rate (fraction of texts in a corpus that contain at least 1 template)
15 |     for a set of documents (corpus-level), following https://arxiv.org/abs/2407.00211.
16 | 
17 |     Args:
18 |         data (List[str]): A list of strings to score.
19 |         templates (dict, optional): Dictionary containing the templates extracted from the corpus. Defaults to None.
20 |         shard_size (int, optional): Size of regex shards to compile. Defaults to 500.
21 | 
22 |     Returns:
23 |         float: Template rate, a value between 0 and 1 indicating the fraction of texts that contain at least one template.
24 |     """
25 |     if not data: return 0.0
26 | 
27 |     if templates is None:
28 |         # get the templates if not passed in
29 |         templates = extract_patterns(data)
30 | 
31 |     matched_text = _gather_substrings(templates)
32 | 
33 |     if not matched_text: return 0.0
34 | 
35 |     regexes = _compile_regex_shards(matched_text, shard_size=shard_size)
36 |     match = sum(1 for doc in data if _has_any(doc, regexes))
37 | 
38 |     return match / len(data)
39 | 
40 | 
41 | def templates_per_token(
42 |     data: List[str],
43 |     templates: Optional[Dict[str, Iterable[str]]] = None,
44 |     shard_size: int = 500,
45 | ) -> List[float]:
46 |     """
47 |     Calculates the templates-per-token rate from https://arxiv.org/abs/2407.00211.
48 | 
49 |     Args:
50 |         data (List[str]): A list of strings to score.
51 |         templates (dict, optional): Dictionary containing the templates extracted from the corpus. Defaults to None.
52 |         shard_size (int, optional): Size of regex shards to compile. Defaults to 500.
53 | 
54 | 
55 |     Returns:
56 |         List[float]: List of templates-per-token rates for each document in the corpus.
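    Example (illustrative; each returned value is the number of template occurrences in a
    document divided by its whitespace token count):
        >>> docs = ["thank you for reaching out to us today",
        ...         "thank you for reaching out to our support team"]
        >>> patterns = extract_patterns(docs, n=5)
        >>> templates_per_token(docs, templates=patterns)  # doctest: +SKIP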
57 | """ 58 | if not data: 59 | return [] 60 | 61 | # Build templates if not provided 62 | if templates is None: 63 | templates = extract_patterns(data) 64 | 65 | substrings = _gather_substrings(templates) 66 | if not substrings: 67 | return [0.0] * len(data) 68 | 69 | # Use lookahead shards to count overlapping occurrences 70 | shards = _compile_regex_shards(substrings, shard_size, overlap=True) 71 | 72 | # Compute per-doc TPT 73 | tpt: List[float] = [] 74 | for doc in data: 75 | word_count = len(doc.split()) 76 | if word_count == 0: 77 | tpt.append(0.0) 78 | continue 79 | 80 | occ = 0 81 | for rx in shards: 82 | occ += sum(1 for _ in rx.finditer(doc)) # each match = one occurrence start 83 | tpt.append(occ / word_count) 84 | 85 | return tpt 86 | 87 | 88 | def _compile_regex_shards( 89 | substrings: List[str], 90 | shard_size: int = 500, 91 | *, 92 | overlap: bool = False, 93 | ) -> List[re.Pattern]: 94 | """ 95 | Reusable shard compiler. 96 | - overlap=False: plain alternation (fast existence tests). 97 | - overlap=True: lookahead alternation for counting overlapping matches. 98 | """ 99 | regs: List[re.Pattern] = [] 100 | 101 | for i in range(0, len(substrings), shard_size): 102 | chunk = substrings[i:i + shard_size] 103 | 104 | if not chunk: 105 | continue 106 | 107 | alt = "|".join(map(re.escape, chunk)) 108 | pat = f"(?=(?:{alt}))" if overlap else f"(?:{alt})" 109 | 110 | regs.append(re.compile(pat)) 111 | 112 | return regs 113 | 114 | 115 | def _has_any( 116 | text: str, 117 | regexes: List[re.Pattern] 118 | ) -> bool: 119 | for rx in regexes: 120 | # faster search 121 | if rx.search(text): 122 | return True 123 | return False 124 | 125 | 126 | def _gather_substrings( 127 | templates: Dict[str, Iterable[str]] 128 | ) -> List[str]: 129 | """ 130 | Gathers all substrings from the templates dictionary. 131 | 132 | Args: 133 | templates (Dict[str, Iterable[str]]): Dictionary of templates with their corresponding text matches. 134 | 135 | Returns: 136 | List[str]: List of unique substrings extracted from the templates. 137 | """ 138 | # get the flattened values from the extracted patterns 139 | matched_text = set() 140 | 141 | for v in templates.values(): 142 | matched_text.update(v) 143 | 144 | return list(matched_text) 145 | -------------------------------------------------------------------------------- /diversity/compute_all_metrics.py: -------------------------------------------------------------------------------- 1 | """ 2 | Comprehensive diversity metrics computation function. 3 | This module computes all lexical diversity metrics plus embedding metrics for a given corpus of text. 
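Example (illustrative):

    from diversity import compute_all_metrics

    corpus = ["first synthetic document ...", "second synthetic document ...", "third one ..."]
    results = compute_all_metrics(corpus, output_format="markdown", verbose=False)
    print(results["compression_ratio_gzip"], results["ngram_diversity"])
    print(results["formatted_table"])  # only present for output_format="markdown" or "latex"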
4 | """ 5 | 6 | from typing import List, Optional, Dict, Any 7 | import logging 8 | from .compression import compression_ratio 9 | from .homogenization import homogenization_score 10 | from .ngram_diversity import ngram_diversity_score 11 | from .self_repetition import self_repetition_score 12 | from .embedding import remote_clique, chamfer_dist 13 | from .template import template_rate, templates_per_token 14 | from .functions import extract_patterns 15 | 16 | 17 | def compute_all_metrics( 18 | corpus: List[str], 19 | output_format: str = "dict", 20 | embedding_model: Optional[str] = "Qwen/Qwen3-Embedding-0.6B", 21 | homogenization_measure: str = "rougel", 22 | compression_algorithm: str = "gzip", 23 | ngram_n: int = 4, 24 | self_repetition_n: int = 4, 25 | template_shard_size: int = 500, 26 | verbose: bool = True, 27 | batch_size: int = 64 28 | ) -> Dict[str, Any]: 29 | """ 30 | Computes all available diversity metrics for a corpus of text. 31 | 32 | Args: 33 | corpus (List[str]): List of text documents to analyze 34 | output_format (str): Format for output - "dict", "markdown", or "latex" 35 | embedding_model (str): Model to use for embedding-based metrics 36 | homogenization_measure (str): Measure for homogenization score ("rougel", "bertscore", "bleu") 37 | compression_algorithm (str): Algorithm for compression ratio ("gzip", "xz") 38 | ngram_n (int): Maximum n-gram size for n-gram diversity 39 | self_repetition_n (int): N-gram size for self-repetition score 40 | template_shard_size (int): Shard size for template processing 41 | verbose (bool): Whether to show progress messages 42 | batch_size (int): Batch size for embedding computations 43 | 44 | Returns: 45 | Dict[str, Any]: Dictionary containing all computed metrics and formatted table if requested 46 | """ 47 | 48 | if verbose: 49 | print("Computing diversity metrics for corpus...") 50 | print(f"Corpus size: {len(corpus)} documents") 51 | 52 | results = {} 53 | 54 | # Compression-based metrics 55 | if verbose: 56 | print("Computing compression ratio...") 57 | results["compression_ratio_gzip"] = compression_ratio( 58 | corpus, algorithm="gzip", verbose=False 59 | ) 60 | 61 | if compression_algorithm == "xz": 62 | results["compression_ratio_xz"] = compression_ratio( 63 | corpus, algorithm="xz", verbose=False 64 | ) 65 | 66 | # Homogenization score 67 | if verbose: 68 | print(f"Computing homogenization score using {homogenization_measure}...") 69 | results[f"homogenization_score_{homogenization_measure}"] = homogenization_score( 70 | corpus, measure=homogenization_measure, verbose=verbose, batch_size=batch_size 71 | ) 72 | 73 | # N-gram diversity 74 | if verbose: 75 | print(f"Computing n-gram diversity (n={ngram_n})...") 76 | results["ngram_diversity"] = ngram_diversity_score(corpus, num_n=ngram_n) 77 | 78 | # Self-repetition score 79 | if verbose: 80 | print(f"Computing self-repetition score (n={self_repetition_n})...") 81 | results["self_repetition_score"] = self_repetition_score( 82 | corpus, n=self_repetition_n, verbose=verbose 83 | ) 84 | 85 | # Embedding-based metrics 86 | if verbose: 87 | print(f"Computing embedding-based metrics using {embedding_model}...") 88 | 89 | try: 90 | results["remote_clique_score"] = remote_clique( 91 | corpus, model=embedding_model, verbose=verbose, batch_size=batch_size 92 | ) 93 | results["chamfer_distance"] = chamfer_dist( 94 | corpus, model=embedding_model, verbose=verbose, batch_size=batch_size 95 | ) 96 | except Exception as e: 97 | if verbose: 98 | print(f"⚠️ Warning: Could not compute 
embedding metrics - {e}") 99 | results["remote_clique_score"] = None 100 | results["chamfer_distance"] = None 101 | 102 | # Template-based metrics 103 | if verbose: 104 | print("Extracting patterns for template metrics...") 105 | 106 | try: 107 | patterns = extract_patterns(corpus) 108 | 109 | if verbose: 110 | print("Computing template rate...") 111 | results["template_rate"] = template_rate( 112 | corpus, templates=patterns, shard_size=template_shard_size 113 | ) 114 | 115 | if verbose: 116 | print("Computing templates per token...") 117 | tpt_scores = templates_per_token( 118 | corpus, templates=patterns, shard_size=template_shard_size 119 | ) 120 | results["avg_templates_per_token"] = sum(tpt_scores) / len(tpt_scores) if tpt_scores else 0.0 121 | results["templates_per_token_scores"] = tpt_scores 122 | 123 | except Exception as e: 124 | if verbose: 125 | print(f"Warning: Could not compute template metrics - {e}") 126 | results["template_rate"] = None 127 | results["avg_templates_per_token"] = None 128 | results["templates_per_token_scores"] = None 129 | 130 | if verbose: 131 | print("All metrics computed successfully!") 132 | 133 | # Format output based on requested format 134 | if output_format.lower() == "markdown": 135 | results["formatted_table"] = _format_markdown_table(results) 136 | elif output_format.lower() == "latex": 137 | results["formatted_table"] = _format_latex_table(results) 138 | 139 | return results 140 | 141 | 142 | def _format_markdown_table(results: Dict[str, Any]) -> str: 143 | """Format results as a markdown table.""" 144 | 145 | table = "# Diversity Metrics Results\n\n" 146 | table += "| Metric | Value |\n" 147 | table += "|--------|-------|\n" 148 | 149 | for metric, value in results.items(): 150 | if metric in ["formatted_table", "templates_per_token_scores"]: 151 | continue 152 | 153 | if value is None: 154 | value_str = "N/A" 155 | elif isinstance(value, float): 156 | value_str = f"{value:.3f}" 157 | else: 158 | value_str = str(value) 159 | 160 | table += f"| {metric.replace('_', ' ').title()} | {value_str} |\n" 161 | 162 | return table 163 | 164 | 165 | def _format_latex_table(results: Dict[str, Any]) -> str: 166 | """Format results as a LaTeX table. 
No extra LaTeX packages (e.g., booktabs) are required."""
167 | 
168 |     table = "\\begin{table}[htbp]\n"
169 |     table += "\\centering\n"
170 |     table += "\\caption{Diversity Metrics Results}\n"
171 |     table += "\\begin{tabular}{lc}\n"
172 |     table += "\\hline\n"
173 |     table += "\\textbf{Metric} & \\textbf{Value} \\\\\n"
174 |     table += "\\hline\n"
175 | 
176 |     for metric, value in results.items():
177 |         if metric in ["formatted_table", "templates_per_token_scores"]:
178 |             continue
179 | 
180 |         if value is None:
181 |             value_str = "N/A"
182 |         elif isinstance(value, float):
183 |             value_str = f"{value:.3f}"
184 |         else:
185 |             value_str = str(value)
186 | 
187 |         metric_name = metric.replace('_', ' ').title()
188 |         table += f"{metric_name} & {value_str} \\\\\n"
189 |     table += "\\hline\n"
190 | 
191 |     table += "\\end{tabular}\n"
192 |     table += "\\end{table}\n"
193 | 
194 |     return table
195 | 
--------------------------------------------------------------------------------
/diversity/qudsim.py:
--------------------------------------------------------------------------------
1 | from .qudsim_modules import number_text, get_quds, align
2 | from .utils import openai
3 | from tqdm import tqdm
4 | import itertools
5 | import json
6 | import yaml
7 | import os
8 | 
9 | class Document:
10 |     def __init__(self, document):
11 |         self.document = document
12 | 
13 |     def preprocess_document(self):
14 | 
15 |         numbered_text, number_sentence_dict = number_text(self.document)
16 | 
17 |         if not numbered_text or not number_sentence_dict:
18 |             print("Could not preprocess document: ", self.document)
19 |             self.numbered_text = None
20 |             self.number_sentence_dict = None
21 |             return False
22 | 
23 |         self.numbered_text = numbered_text
24 |         self.number_sentence_dict = number_sentence_dict
25 |         return True
26 | 
27 |     def generate_quds(self, gpt_model, config):
28 |         try:
29 |             level = config['level']
30 |             max_tries = config['max_tries']
31 |         except (KeyError, TypeError):
32 |             print("Failed to parse configurations")
33 |             return False
34 | 
35 |         qg_item = get_quds(gpt_model, self.numbered_text, self.number_sentence_dict, level, max_tries)
36 | 
37 |         if qg_item is None:
38 |             print("Could not segment, abstract or generate quds")
39 |             self.segments = None
40 |             self.entity_abstracted_segments = None
41 |             self.quds = None
42 |             self.segment_qud_dict = None
43 |             self.qud_segment_dict = None
44 |             return False
45 |         else:
46 |             self.segments = qg_item['segments']
47 |             self.entity_abstracted_segments = qg_item['entity_abstracted_segments']
48 |             self.quds = qg_item['quds']
49 |             self.segment_qud_dict = qg_item['segment_qud_dict']
50 |             self.qud_segment_dict = qg_item['qud_segment_dict']
51 |             return True
52 | 
53 | class AlignmentPair:
54 |     def __init__(self, document1: Document, document2: Document):
55 |         self.source_document = document1
56 |         self.target_document = document2
57 | 
58 |     def align_documents(self, gpt_model, config):
59 |         try:
60 |             threshold = config['threshold']
61 |             max_tries = config['max_tries']
62 |         except (KeyError, TypeError):
63 |             print("Failed to parse configurations")
64 |             return
65 | 
66 |         num_source_segments = len(self.source_document.segment_qud_dict)
67 |         num_target_segments = len(self.target_document.segment_qud_dict)
68 |         num_source_sentences = len(self.source_document.number_sentence_dict)
69 |         num_target_sentences = len(self.target_document.number_sentence_dict)
70 |         source_segments = self.source_document.segments
71 |         target_segments = self.target_document.segments
72 | 
73 |         source_qud_answers, target_qud_answers, harmonic_mean_scores, aligned_segments = align(gpt_model, 
74 | 
self.source_document.quds, 75 | self.target_document.numbered_text, 76 | self.target_document.quds, 77 | self.source_document.numbered_text, 78 | num_source_segments, 79 | num_target_segments, 80 | self.source_document.segment_qud_dict, 81 | self.target_document.segment_qud_dict, 82 | self.source_document.segments, 83 | self.target_document.segments, 84 | num_source_sentences, 85 | num_target_sentences, 86 | threshold, 87 | max_tries) 88 | 89 | self.source_qud_answers = source_qud_answers 90 | self.target_qud_answers = target_qud_answers 91 | self.harmonic_mean_scores = harmonic_mean_scores.tolist() 92 | self.aligned_segments = aligned_segments.tolist() 93 | 94 | aligned_segment_text = [] 95 | for i, src in enumerate(aligned_segments): 96 | for j, tgt in enumerate(src): 97 | if tgt>0: 98 | # alignment exists 99 | source_sentences = eval(source_segments)['segmentation'][i]['sentences'] 100 | try: 101 | source_text = [self.source_document.number_sentence_dict[str(num)] for num in source_sentences] 102 | except: 103 | try: 104 | source_text = [self.source_document.number_sentence_dict[num] for num in source_sentences] 105 | except Exception as e: 106 | print(e) 107 | 108 | target_sentences = eval(target_segments)['segmentation'][j]['sentences'] 109 | try: 110 | target_text = [self.target_document.number_sentence_dict[str(num)] for num in target_sentences] 111 | except: 112 | try: 113 | target_text = [self.target_document.number_sentence_dict[num] for num in target_sentences] 114 | except Exception as e: 115 | print(e) 116 | 117 | aligned_segment_text.append((" ".join(source_text), " ".join(target_text))) 118 | 119 | self.aligned_segment_text = aligned_segment_text 120 | 121 | 122 | def _compile_documents(documents: list[str], qg_gpt_model: openai.GPT, config): 123 | document_list = [] 124 | for document in tqdm(documents, total=len(documents), desc="Generating QUDs"): 125 | document_obj = Document(document=document) 126 | 127 | preprocessing_status = document_obj.preprocess_document() 128 | if not preprocessing_status: 129 | continue 130 | 131 | qud_generation_status = document_obj.generate_quds(gpt_model=qg_gpt_model, config=config) 132 | if not qud_generation_status: 133 | continue 134 | 135 | document_list.append(document_obj) 136 | 137 | return document_list 138 | 139 | def _custom_serializer(obj): 140 | if hasattr(obj, '__dict__'): 141 | return obj.__dict__ 142 | raise TypeError(f"Object of type {obj.__class__.__name__} is not JSON serializable") 143 | 144 | 145 | def qudsim(documents: list[str], key=None, config_file=None): 146 | """ 147 | Args: 148 | documents (list[str]): a list of texts to be aligned (all combinations of pairs will be computed) 149 | key (str): OpenAI Key 150 | config_file (str): a .yaml or .yml file that contains the necessary configurations (see config.yaml for the default config) 151 | 152 | Returns: 153 | 154 | 155 | """ 156 | 157 | if not config_file: 158 | config_file = os.path.join(os.path.dirname(__file__), '../config.yaml') 159 | 160 | with open(config_file, 'r') as file: 161 | configs = yaml.safe_load(file) 162 | 163 | try: 164 | qg_gpt_model_name = configs['qg_gpt_model'] 165 | qa_gpt_model_name = configs['qa_gpt_model'] 166 | level = configs['level'] 167 | threshold = configs['threshold'] 168 | max_tries = configs['max_tries'] 169 | except: 170 | print("Failed to parse configurations") 171 | return 172 | 173 | 174 | 175 | qg_gpt_model = openai.GPT(qg_gpt_model_name, key=key) 176 | qa_gpt_model = openai.GPT(qa_gpt_model_name, key=key) 177 | 178 | # level 
of abstraction of QUDs, with 0 being highly specific and 1 being abstractive 179 | 180 | if level!=0 and level!=1: 181 | print("Levels 0 and 1 are supported, 0 being specific and 1 being abstract. Value passed was an unsupported level.") 182 | return 183 | 184 | if threshold < 0 or threshold > 1: 185 | print("Threshold value is outside the valid range (0,1).") 186 | return 187 | 188 | if max_tries<=0: 189 | print("Maximum number of attempts to successfully align pairs must be at least 1.") 190 | return 191 | 192 | if len(documents)<2: 193 | print("At least two documents must be provided.") 194 | return 195 | 196 | # create document objects (one Document per document) 197 | document_list = _compile_documents(documents=documents, qg_gpt_model=qg_gpt_model, config=configs) 198 | 199 | if len(document_list)<2: 200 | print("At least two documents must successfully generate QUDs.") 201 | return 202 | 203 | # make pairs from the list of Document objects 204 | pair_combinations = list(itertools.combinations(document_list, 2)) 205 | alignment_pairs = [] 206 | for doc1, doc2 in tqdm(pair_combinations, total=len(pair_combinations), desc='Aligning Document Pairs'): 207 | alignment_pair = AlignmentPair(document1=doc1, document2=doc2) 208 | alignment_pair.align_documents(gpt_model=qa_gpt_model, config=configs) 209 | alignment_pairs.append(alignment_pair) 210 | 211 | 212 | json_str = json.dumps(alignment_pairs, default=_custom_serializer) 213 | return json_str 214 | -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 
40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. 
You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
203 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # diversity
2 | [![PyPI version](https://img.shields.io/pypi/v/diversity.svg)](https://pypi.org/project/diversity/)
3 | [![License](https://img.shields.io/badge/license-Apache%202.0-blue.svg)](LICENSE)
4 | [![ArXiv](https://img.shields.io/badge/arXiv-2403.00553-b31b1b.svg)](https://arxiv.org/abs/2403.00553)
5 | 
6 | ### **A Python toolkit for measuring diversity in text.**
7 | 
8 | ---
9 | 
10 | ## Table of Contents
11 | - [Installation](#installation)
12 | - [Quick Start](#quick-start)
13 | - [Lexical Diversity Measures](#lexical-diversity-measures)
14 |     - [`compression_ratio`](#compression_ratiotexts-algorithmgzip)
15 |     - [`homogenization_score`](#homogenization_scoretexts-measurerougel)
16 |     - [`ngram_diversity_score`](#ngram_diversity_scoretexts-num_n4)
17 |     - [`self_repetition_score`](#self_repetition_scoretexts-n4)
18 | - [Syntactic Diversity Measures](#syntactic-diversity-measures)
19 |     - [`extract_patterns`](#extract_patternstext-n5-top_n100)
20 |     - [`match_patterns`](#match_patternstext-patterns)
21 |     - [`template_rate`](#template_ratedata-templatesnone-shard_size500)
22 |     - [`templates_per_token`](#templates_per_tokendata-templatesnone-shard_size500)
23 | - [Embedding-Based Diversity Measures](#embedding-based-diversity-measures)
24 |     - [`remote_clique`](#remote_cliquedata-modelqwenqwen3-embedding-06b-verbosetrue-batch_size64)
25 |     - [`chamfer_dist`](#chamfer_distdata-modelqwenqwen3-embedding-06b-verbosetrue-batch_size64)
26 | - [QUDSim (Question Under Discussion Similarity)](#qudsim-question-under-discussion-similarity)
27 |     - [`qudsim`](#qudsimdocuments-keynone-config_filenone)
28 | - [Citations](#citations)
29 | - [Requirements](#requirements)
30 | - [License](#license)
31 | - [Contributing](#contributing)
32 | 
33 | ---
34 | 
35 | ## Installation
36 | 
37 | Install via pip:
38 | 
39 | ```bash
40 | pip install diversity
41 | ```
42 | 
43 | Or from source:
44 | 
45 | ```bash
46 | git clone https://github.com/cshaib/diversity.git
47 | cd diversity
48 | pip install .
49 | ```
50 | 
51 | ----------
52 | 
53 | ## Quick Start
54 | 
55 | The function `compute_all_metrics` returns a dictionary containing all of the diversity metrics described in the sections below, and can optionally include a Markdown- or LaTeX-formatted results table (`output_format="markdown"` or `"latex"`).
56 | 
57 | ```python
58 | from diversity import compute_all_metrics
59 | import json
60 | 
61 | texts = [
62 |     "The quick brown fox jumps over the lazy dog.",
63 |     "The quick brown fox jumps over the lazy dog again.",
64 |     "Suddenly, the quick brown fox leaps swiftly over the sleeping dog."
65 | ]
66 | 
67 | # Compute metrics
68 | results = compute_all_metrics(corpus=texts)
69 | 
70 | # Remove the list of per-document scores for cleaner dict output
71 | clean_results = {k: v for k, v in results.items()
72 |                  if k != "templates_per_token_scores"}
73 | output_content = json.dumps(clean_results, indent=2)
74 | 
75 | with open('diversity_metrics.json', 'w', encoding='utf-8') as f:
76 |     f.write(output_content)
77 | ```
78 | 
79 | ### Lexical Diversity Measures
80 | 
81 | We provide implementations for Compression Ratio, Homogenization Score, n-gram Diversity Score, and Self-Repetition Score:
82 | 
83 | ```python
84 | from diversity import (
85 |     compression_ratio,
86 |     homogenization_score,
87 |     ngram_diversity_score, self_repetition_score,
88 | )
89 | 
90 | texts = [
91 |     "The quick brown fox jumps over the lazy dog.",
92 |     "The quick brown fox jumps over the lazy dog again.",
93 |     "Suddenly, the quick brown fox leaps swiftly over the sleeping dog."
94 | ]
95 | 
96 | # Compression ratio
97 | cr = compression_ratio(texts, algorithm='gzip')
98 | print(f"Compression Ratio: {cr:.4f}")
99 | 
100 | # Homogenization score (Self-BLEU)
101 | hs = homogenization_score(texts, measure='bleu')
102 | print(f"Homogenization (Self-BLEU): {hs:.4f}")
103 | 
104 | # N-gram diversity
105 | ngd = ngram_diversity_score(texts, num_n=3)
106 | print(f"3-gram Diversity: {ngd:.4f}")
107 | 
108 | # Self-repetition score
109 | srs = self_repetition_score(texts)
110 | print(f"Self-repetition score: {srs:.4f}")
111 | ```
112 | #### `compression_ratio(texts, algorithm='gzip')`
113 | 
114 | - **Parameters:**
115 |     - `texts` (list): List of text strings
116 |     - `algorithm` (str): Compression algorithm ('gzip' or 'xz')
117 | - **Returns:** Float, higher = more repetitive
118 | 
119 | #### `homogenization_score(texts, measure='rougel')`
120 | 
121 | - **Parameters:**
122 |     - `texts` (list): List of text strings
123 |     - `measure` (str): Scoring measure ('rougel', 'bertscore', 'bleu')
124 | - **Returns:** Float, higher = more homogeneous
125 | 
126 | #### `ngram_diversity_score(texts, num_n=4)`
127 | 
128 | - **Parameters:**
129 |     - `texts` (list): List of text strings
130 |     - `num_n` (int): Maximum n-gram size
131 | - **Returns:** Float, higher = more diverse
132 | 
133 | #### `self_repetition_score(texts, n=4)`
134 | 
135 | - **Parameters:**
136 |     - `texts` (list): List of text strings
137 | - **Returns:** Float, higher = more repetition across documents
138 | ----------
139 | 
140 | ### Syntactic Diversity Measures
141 | 
142 | We also provide functions for extracting and analyzing Part-of-Speech (POS) patterns to identify repetitive syntactic structures in your text:
143 | 
144 | ```python
145 | from diversity import (
146 |     extract_patterns,
147 |     match_patterns,
148 |     template_rate,
149 |     templates_per_token
150 | )
151 | 
152 | texts = [
153 |     "The quick brown fox jumps over the lazy dog.",
154 |     "The quick brown fox jumps over the lazy dog again.",
155 |     "Suddenly, the quick brown fox leaps swiftly over the sleeping dog."
156 | ]
157 | 
158 | # POS pattern extraction
159 | patterns = extract_patterns(texts, n=4, top_n=5)
160 | print("Top POS patterns:", patterns)
161 | # Example output: {'DT JJ JJ NN': {'the quick brown fox', ...}, ...}
162 | 
163 | # Match patterns in a single text
164 | matches = match_patterns(texts[2], patterns)
165 | print("Patterns in 3rd sentence:", matches)
166 | # Example output: [('DT JJ JJ NN', 'the quick brown fox'), ...]
167 | 
168 | # Template rate (fraction of documents containing at least one template)
169 | tr = template_rate(texts, templates=patterns)
170 | print("Template Rate:", tr)
171 | 
172 | # Templates per token (matches normalized by text length, one score per document)
173 | tpt = templates_per_token(texts, templates=patterns)
174 | print("Templates per Token:", tpt)
175 | ```
176 | 
177 | #### `extract_patterns(text, n=5, top_n=100)`
178 | 
179 | - **text (list of str):** Documents to extract syntactic patterns from.
180 | 
181 | - **n (int):** N-gram size for POS pattern extraction (default: `5`).
182 | 
183 | - **top_n (int):** Number of most frequent patterns to keep (default: `100`).
184 | 
185 | - **Returns:** `dict` — dictionary mapping POS patterns (e.g., `"DT JJ NN NN"`) to sets of text spans that match the patterns
186 | 
187 | 
188 | #### `match_patterns(text, patterns)`
189 | 
190 | - **text (str):** Input text to search for patterns.
191 | 
192 | - **patterns (dict):** Dictionary of patterns and their text matches as returned by `extract_patterns`.
193 | 
194 | - **Returns:** `list[tuple]` — list of `(pattern, text)` pairs showing which syntactic patterns appear in the input and the exact spans that match
195 | 
196 | 
197 | #### `template_rate(data, templates=None, shard_size=500)`
198 | 
199 | - **data (list of str):** Documents to score.
200 | 
201 | - **templates (dict, optional):** Dictionary of templates extracted from the corpus. If `None`, templates are computed using `extract_patterns`.
202 | 
203 | - **shard_size (int):** Number of regex patterns to compile per shard (default: `500`).
204 | 
205 | - **Returns:** `float` — fraction of documents in the corpus that contain at least one template (higher = more templated, lower = more original).
206 | 
207 | 
208 | #### `templates_per_token(data, templates=None, shard_size=500)`
209 | 
210 | - **data (list of str):** Documents to score.
211 | 
212 | - **templates (dict, optional):** Dictionary of templates extracted from the corpus. If `None`, templates are computed using `extract_patterns`.
213 | 
214 | - **shard_size (int):** Number of regex patterns to compile per shard (default: `500`).
215 | 
216 | - **Returns:** `list[float]` — per-document ratio of template matches to tokens (higher = more templated per word, lower = more diverse writing).
217 | 
218 | ----------
219 | 
220 | ### Embedding-Based Diversity Measures
221 | 
222 | You can also measure semantic diversity using *embedding*-based similarity. These scores compute distances between document embeddings to quantify how spread out or clustered the texts are:
223 | 
224 | ```python
225 | from diversity.embedding import remote_clique, chamfer_dist
226 | 
227 | texts = [
228 |     "The quick brown fox jumps over the lazy dog.",
229 |     "A swift auburn fox vaulted a sleeping canine.",
230 |     "I brewed coffee and read the paper."
231 | ] 232 | 233 | # Remote Clique Score 234 | rc = remote_clique(texts, model="Qwen/Qwen3-Embedding-0.6B") 235 | print(f"Remote Clique: {rc:.3f}") 236 | 237 | # Chamfer Distance 238 | cd = chamfer_dist(texts, model="Qwen/Qwen3-Embedding-0.6B") 239 | print(f"Chamfer Distance: {cd:.3f}") 240 | ``` 241 | #### `remote_clique(data, model='Qwen/Qwen3-Embedding-0.6B', verbose=True, batch_size=64)` 242 | 243 | - **data (list of str):** Documents to score. 244 | 245 | - **model (str):** HuggingFace/Sentence-Transformers embedding model to use (default: `"Qwen/Qwen3-Embedding-0.6B"`). 246 | 247 | - **verbose (bool):** Whether to show a progress bar during encoding (default: `True`). 248 | 249 | - **batch_size (int):** Batch size for embedding (default: `64`). 250 | 251 | - **Returns:** `float` — average mean pairwise cosine distance between documents (higher = more spread out / diverse). 252 | 253 | 254 | #### `chamfer_dist(data, model='Qwen/Qwen3-Embedding-0.6B', verbose=True, batch_size=64)` 255 | 256 | - **data (list of str):** Documents to score. 257 | 258 | - **model (str):** HuggingFace/Sentence-Transformers embedding model to use (default: `"Qwen/Qwen3-Embedding-0.6B"`). 259 | 260 | - **verbose (bool):** Whether to show a progress bar during encoding (default: `True`). 261 | 262 | - **batch_size (int):** Batch size for embedding (default: `64`). 263 | 264 | - **Returns:** `float` — average minimum pairwise cosine distance (sensitive to near-duplicates; higher = less redundancy). 265 | 266 | ---------- 267 | 268 | ### QUDSim (Question Under Discussion Similarity) 269 | 270 | QUDSim aligns document segments based on Questions Under Discussion (QUDs) --- implicit questions that segments of text address ([QUDsim: Quantifying Discourse Similarities in LLM-Generated Text](https://arxiv.org/abs/2504.09373)). 271 | 272 | This function requires OpenAI API access. 273 | 274 | ```python 275 | from diversity import qudsim 276 | 277 | # Two documents about the same topic 278 | document1 = "In the heart of ancient Macedonia, Philip II ascended to the throne in 359 BC..." 279 | document2 = "The sun beat down on the rough-hewn hills of ancient Macedonia..." 
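# Optional (illustrative): qudsim reads its settings (model names, QUD level,
# alignment threshold, max_tries) from a YAML config; pass config_file to
# override the packaged config.yaml. The filename below is hypothetical.
# alignment = qudsim([document1, document2], key=key, config_file="my_config.yaml")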
280 | 
281 | # Requires OpenAI API key
282 | import os, json
283 | key = os.environ.get('OPENAI_API_KEY')  # or your API key
284 | 
285 | # Generate QUD-based alignment
286 | alignment = qudsim([document1, document2], key=key)
287 | 
288 | # Access alignment results (qudsim returns a JSON string)
289 | results = json.loads(alignment)[0]  # First document pair
290 | 
291 | # View aligned segments
292 | for source_text, target_text in results['aligned_segment_text']:
293 |     print(f"Source: {source_text[:100]}...")
294 |     print(f"Target: {target_text[:100]}...")
295 |     print("---")
296 | 
297 | # View alignment scores (harmonic mean scores matrix)
298 | scores = results['harmonic_mean_scores']
299 | print(f"Alignment scores shape: {len(scores)}x{len(scores[0])}")
300 | 
301 | # Other available fields:
302 | # - results['source_qud_answers']: answers to the source document's QUDs
303 | # - results['target_qud_answers']: answers to the target document's QUDs
304 | # - results['aligned_segments']: segment-pair alignment matrix (nonzero entries mark aligned pairs)
305 | ```
306 | 
307 | #### `qudsim(documents, key=None, config_file=None)`
308 | 
309 | - **Parameters:**
310 |     - `documents` (list): List of texts to align (all pairwise combinations are aligned)
311 |     - `key` (str): OpenAI API key used for QUD generation and answering
312 |     - `config_file` (str, optional): Path to a `.yaml`/`.yml` configuration file; if omitted, the packaged `config.yaml` defaults are used
313 |         - Config keys: the QUD-generation and QUD-answering model names, the QUD abstraction `level` (0 or 1), the alignment `threshold` (between 0 and 1), and `max_tries`
314 | - **Returns:** A JSON string with one record per document pair: the QUD answers, the harmonic-mean score matrix, the segment alignment matrix, and the aligned segment text
315 | ----------
316 | 
317 | ## Citation(s)
318 | 
319 | If you use this package, please cite:
320 | 
321 | ```bibtex
322 | @misc{shaib2025standardizingmeasurementtextdiversity,
323 |       title={Standardizing the Measurement of Text Diversity: A Tool and a Comparative Analysis of Scores},
324 |       author={Chantal Shaib and Joe Barrow and Jiuding Sun and Alexa F. Siu and Byron C. Wallace and Ani Nenkova},
325 |       year={2025},
326 |       eprint={2403.00553},
327 |       archivePrefix={arXiv},
328 |       primaryClass={cs.CL},
329 |       url={https://arxiv.org/abs/2403.00553},
330 | }
331 | ```
332 | 
333 | If you use QUDSim, please **also** cite:
334 | 
335 | ```bibtex
336 | @inproceedings{
337 | namuduri2025qudsim,
338 | title={{QUD}sim: Quantifying Discourse Similarities in {LLM}-Generated Text},
339 | author={Ramya Namuduri and Yating Wu and Anshun Asher Zheng and Manya Wadhwa and Greg Durrett and Junyi Jessy Li},
340 | booktitle={Second Conference on Language Modeling},
341 | year={2025},
342 | url={https://openreview.net/forum?id=zFz1BJu211}
343 | }
344 | ```
345 | 
346 | ----------
347 | 
348 | ## Requirements
349 | 
350 | - Python 3.10-3.12
351 | - Core dependencies:
352 |   - `numpy`
353 |   - `nltk`
354 |   - `scikit-learn`
355 | - For embedding-based metrics:
356 |   - `sentence-transformers`
357 |   - `torch`
358 | - For QUDSim:
359 |   - `openai`
360 |   - `tqdm`
361 | 
362 | ----------
363 | 
364 | ## License
365 | 
366 | This package is released under the **Apache License 2.0**.
367 | 
368 | ----------
369 | 
370 | ## Contributing
371 | 
372 | Contributions are welcome!
373 | Please open an issue or submit a pull request on GitHub.
374 | 
375 | ----------
376 | 
--------------------------------------------------------------------------------