├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── chunking_evaluation
    ├── __init__.py
    ├── chunking
    │   ├── __init__.py
    │   ├── base_chunker.py
    │   ├── cluster_semantic_chunker.py
    │   ├── fixed_token_chunker.py
    │   ├── kamradt_modified_chunker.py
    │   ├── llm_semantic_chunker.py
    │   └── recursive_token_chunker.py
    ├── evaluation_framework
    │   ├── __init__.py
    │   ├── base_evaluation.py
    │   ├── general_evaluation.py
    │   ├── general_evaluation_data
    │   │   ├── corpora
    │   │   │   ├── chatlogs.md
    │   │   │   ├── finance.md
    │   │   │   ├── pubmed.md
    │   │   │   ├── state_of_the_union.md
    │   │   │   └── wikitexts.md
    │   │   ├── questions_db
    │   │   │   ├── 633a2ec9-d034-4db6-acda-0c784ceaa32b
    │   │   │   │   ├── data_level0.bin
    │   │   │   │   ├── header.bin
    │   │   │   │   ├── length.bin
    │   │   │   │   └── link_lists.bin
    │   │   │   ├── bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2
    │   │   │   │   ├── data_level0.bin
    │   │   │   │   ├── header.bin
    │   │   │   │   ├── length.bin
    │   │   │   │   └── link_lists.bin
    │   │   │   ├── chroma.sqlite3
    │   │   │   └── daae47eb-a4bf-41ec-b4e7-d7f902773aeb
    │   │   │   │   ├── data_level0.bin
    │   │   │   │   ├── header.bin
    │   │   │   │   ├── length.bin
    │   │   │   │   └── link_lists.bin
    │   │   └── questions_df.csv
    │   ├── prompts
    │   │   ├── question_maker_approx_system.txt
    │   │   ├── question_maker_approx_user.txt
    │   │   ├── question_maker_system.txt
    │   │   └── question_maker_user.txt
    │   └── synthetic_evaluation.py
    └── utils.py
└── setup.py


/.gitignore:
--------------------------------------------------------------------------------
1 | # Ignore egg-info directory
2 | *.egg-info
3 | __pycache__/
4 | **/__pycache__/


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 Brandon Smith
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include chunking_evaluation/evaluation_framework/general_evaluation_data *
2 | recursive-include chunking_evaluation/evaluation_framework/prompts *
3 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Chunking Evaluation
  2 | 
  3 | This package, developed as part of our research detailed in the [Chroma Technical Report](https://research.trychroma.com/evaluating-chunking), provides tools for text chunking and evaluation. It allows users to compare different chunking methods and includes implementations of several novel chunking strategies.
  4 | 
  5 | ## Features
  6 | 
  7 | - **Compare Chunking Methods**: Evaluate and compare various popular chunking strategies.
  8 | - **Novel Chunking Methods**: Implementations of new chunking methods such as `ClusterSemanticChunker` and `LLMChunker`.
  9 | - **Evaluation Framework**: Tools to generate domain-specific datasets and evaluate retrieval quality in the context of AI applications.
 10 | 
 11 | ## Quick Start
 12 | 
 13 | You can immediately test the package via [Google Colab](https://colab.research.google.com/drive/1J5ALtDf0_RrswRz2fktjFVeFxe2jbXuJ?usp=sharing).
 14 | 
 15 | ## Installation
 16 | 
 17 | You can install the package directly from GitHub:
 18 | 
 19 | ```bash
 20 | pip install git+https://github.com/brandonstarxel/chunking_evaluation.git
 21 | ```
 22 | 
 23 | 
 24 | # Evaluating Your Own Custom Chunker
 25 | This example shows how to implement your own chunking logic and evaluate its performance.
 26 | ```python
 27 | from chunking_evaluation import BaseChunker, GeneralEvaluation
 28 | from chromadb.utils import embedding_functions
 29 | 
 30 | # Define a custom chunking class
 31 | class CustomChunker(BaseChunker):
 32 |     def split_text(self, text):
 33 |         # Custom chunking logic
 34 |         return [text[i:i+1200] for i in range(0, len(text), 1200)]
 35 | 
 36 | # Instantiate the custom chunker and evaluation
 37 | chunker = CustomChunker()
 38 | evaluation = GeneralEvaluation()
 39 | 
 40 | # Choose embedding function
 41 | default_ef = embedding_functions.OpenAIEmbeddingFunction(
 42 |     api_key="OPENAI_API_KEY",
 43 |     model_name="text-embedding-3-large"
 44 | )
 45 | 
 46 | # Evaluate the chunker
 47 | results = evaluation.run(chunker, default_ef)
 48 | 
 49 | print(results)
 50 | # {'iou_mean': 0.17715979570301696, 'iou_std': 0.10619791407460026, 
 51 | # 'recall_mean': 0.8091207841640163, 'recall_std': 0.3792297991952294}
 52 | ```
 53 | 
 54 | # Evaluating a Custom Embedding Function
 55 | ```python
 56 | from chromadb import Documents, EmbeddingFunction, Embeddings
 57 | 
 58 | class MyEmbeddingFunction(EmbeddingFunction):
 59 |     def __call__(self, input: Documents) -> Embeddings:
 60 |         # embed the documents somehow
 61 |         return embeddings
 62 | 
 63 | # Instantiate instance of ef
 64 | default_ef = MyEmbeddingFunction()
 65 | 
 66 | # Evaluate the embedding function with a chunker
 67 | results = evaluation.run(chunker, default_ef)
 68 | ```
 69 | 
 70 | # Usage and Evaluation of ClusterSemanticChunker
 71 | This example demonstrates how to use our ClusterSemanticChunker and how you can evaluate it yourself.
 72 | ```python
 73 | from chunking_evaluation import BaseChunker, GeneralEvaluation
 74 | from chunking_evaluation.chunking import ClusterSemanticChunker
 75 | from chromadb.utils import embedding_functions
 76 | 
 77 | # Instantiate evaluation
 78 | evaluation = GeneralEvaluation()
 79 | 
 80 | # Choose embedding function
 81 | default_ef = embedding_functions.OpenAIEmbeddingFunction(
 82 |     api_key="OPENAI_API_KEY",
 83 |     model_name="text-embedding-3-large"
 84 | )
 85 | 
 86 | # Instantiate chunker and run the evaluation
 87 | chunker = ClusterSemanticChunker(default_ef, max_chunk_size=400)
 88 | results = evaluation.run(chunker, default_ef)
 89 | 
 90 | print(results)
 91 | # {'iou_mean': 0.18255175232840098, 'iou_std': 0.12773219595465307, 
 92 | # 'recall_mean': 0.8973469551927365, 'recall_std': 0.29042203879923994}
 93 | ```
 94 | 
 95 | ## Synthetic Dataset Pipeline for Domain Specific Evaluation
 96 | 
 97 | Here are the steps you can take to develop a sythetic dataset based off your own corpora for domain specific evaluation.
 98 | 
 99 | 1. **Initialize the Environment**:
100 | 
101 |     ```python
102 |     from chunking_evaluation import SyntheticEvaluation
103 |     
104 |     # Specify the corpora paths and output CSV file
105 |     corpora_paths = [
106 |         'path/to/chatlogs.txt',
107 |         'path/to/finance.txt',
108 |         # Add more corpora files as needed
109 |     ]
110 |     queries_csv_path = 'generated_queries_excerpts.csv'
111 |     
112 |     # Initialize the evaluation
113 |     evaluation = SyntheticEvaluation(corpora_paths, queries_csv_path, openai_api_key="OPENAI_API_KEY")
114 |     ```
115 | 
116 | 2. **Generate Queries and Excerpts**:
117 | 
118 |     ```python
119 |     # Generate queries and excerpts, and save to CSV
120 |     evaluation.generate_queries_and_excerpts()
121 |     ```
122 | 
123 | 3. **Apply Filters**:
124 | 
125 |     ```python
126 |     # Apply filter to remove queries with poor excerpts
127 |     evaluation.filter_poor_excerpts(threshold=0.36)
128 |     
129 |     # Apply filter to remove duplicates
130 |     evaluation.filter_duplicates(threshold=0.6)
131 |     ```
132 | 
133 | 4. **Run the Evaluation**:
134 | 
135 |     ```python
136 |     from chunking_evaluation import BaseChunker
137 | 
138 |     # Define a custom chunking class
139 |     class CustomChunker(BaseChunker):
140 |         def split_text(self, text):
141 |             # Custom chunking logic
142 |             return [text[i:i+1200] for i in range(0, len(text), 1200)]
143 | 
144 |     # Instantiate the custom chunker
145 |     chunker = CustomChunker()
146 | 
147 |     # Run the evaluation on the filtered data
148 |     results = evaluation.run(chunker)
149 |     print("Evaluation Results:", results)
150 |     ```
151 | 
152 | 2. **Optional: If generation is unable to generate queries try approximate excerpts**
153 | 
154 |     ```python
155 |     # Generate queries and excerpts, and save to CSV
156 |     evaluation.generate_queries_and_excerpts(approximate_excerpts=True)
157 |     ```
158 | ## Package Dependancies:
159 | The following will be installed along with the package:
160 | - tiktoken
161 | - fuzzywuzzy
162 | - pandas
163 | - numpy
164 | - tqdm
165 | - chromadb
166 | - python-Levenshtein
167 | - openai
168 | - anthropic
169 | - attrs
170 | 
171 | ## Citation
172 | 
173 | If you use this package in your research, please cite our technical report:
174 | ```
175 | @techreport{smith2024evaluating,
176 |   title = {Evaluating Chunking Strategies for Retrieval},
177 |   author = {Smith, Brandon and Troynikov, Anton},
178 |   year = {2024},
179 |   month = {July},
180 |   institution = {Chroma},
181 |   url = {https://research.trychroma.com/evaluating-chunking},
182 | }
183 | ```
184 | 
185 | ## Contributions
186 | We welcome contributions and are excited you'd like to get involved! 
187 | Make sure your pull request goes to the dev branch. We will test it and then later merge it to main.
188 | 


--------------------------------------------------------------------------------
/chunking_evaluation/__init__.py:
--------------------------------------------------------------------------------
 1 | from .chunking.base_chunker import BaseChunker
 2 | from .evaluation_framework.general_evaluation import GeneralEvaluation
 3 | from .evaluation_framework.synthetic_evaluation import SyntheticEvaluation
 4 | from .utils import *
 5 | 
 6 | __all__ = [
 7 |     'BaseChunker',
 8 |     'GeneralEvaluation',
 9 |     'SyntheticEvaluation',
10 | ]


--------------------------------------------------------------------------------
/chunking_evaluation/chunking/__init__.py:
--------------------------------------------------------------------------------
1 | from .fixed_token_chunker import FixedTokenChunker
2 | from .recursive_token_chunker import RecursiveTokenChunker
3 | from .cluster_semantic_chunker import ClusterSemanticChunker
4 | from .llm_semantic_chunker import LLMSemanticChunker
5 | from .kamradt_modified_chunker import KamradtModifiedChunker
6 | 
7 | # __all__ = ['ClusterSemanticChunker', 'LLMSemanticChunker']
8 | __all__ = ['ClusterSemanticChunker', 'LLMSemanticChunker', 'FixedTokenChunker', 'RecursiveTokenChunker', 'KamradtModifiedChunker']


--------------------------------------------------------------------------------
/chunking_evaluation/chunking/base_chunker.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List
3 | 
4 | class BaseChunker(ABC):
5 |     @abstractmethod
6 |     def split_text(self, text: str) -> List[str]:
7 |         pass
8 | 


--------------------------------------------------------------------------------
/chunking_evaluation/chunking/cluster_semantic_chunker.py:
--------------------------------------------------------------------------------
 1 | from .base_chunker import BaseChunker
 2 | from typing import List
 3 | 
 4 | import numpy as np
 5 | import tiktoken
 6 | from chunking_evaluation.chunking import RecursiveTokenChunker
 7 | 
 8 | from chunking_evaluation.utils import get_openai_embedding_function, openai_token_count
 9 | 
10 | class ClusterSemanticChunker(BaseChunker):
11 |     def __init__(self, embedding_function=None, max_chunk_size=400, min_chunk_size=50, length_function=openai_token_count):
12 |         self.splitter = RecursiveTokenChunker(
13 |             chunk_size=min_chunk_size,
14 |             chunk_overlap=0,
15 |             length_function=openai_token_count,
16 |             separators = ["\n\n", "\n", ".", "?", "!", " ", ""]
17 |             )
18 |         
19 |         if embedding_function is None:
20 |             embedding_function = get_openai_embedding_function()
21 |         self._chunk_size = max_chunk_size
22 |         self.max_cluster = max_chunk_size//min_chunk_size
23 |         self.embedding_function = embedding_function
24 |         
25 |     def _get_similarity_matrix(self, embedding_function, sentences):
26 |         BATCH_SIZE = 500
27 |         N = len(sentences)
28 |         embedding_matrix = None
29 | 
30 |         for i in range(0, N, BATCH_SIZE):
31 |             batch_sentences = sentences[i:i+BATCH_SIZE]
32 |             embeddings = embedding_function(batch_sentences)
33 | 
34 |             # Convert embeddings list of lists to numpy array
35 |             batch_embedding_matrix = np.array(embeddings)
36 | 
37 |             # Append the batch embedding matrix to the main embedding matrix
38 |             if embedding_matrix is None:
39 |                 embedding_matrix = batch_embedding_matrix
40 |             else:
41 |                 embedding_matrix = np.concatenate((embedding_matrix, batch_embedding_matrix), axis=0)
42 | 
43 |         similarity_matrix = np.dot(embedding_matrix, embedding_matrix.T)
44 | 
45 |         return similarity_matrix
46 | 
47 |     def _calculate_reward(self, matrix, start, end):
48 |         sub_matrix = matrix[start:end+1, start:end+1]
49 |         return np.sum(sub_matrix)
50 | 
51 |     def _optimal_segmentation(self, matrix, max_cluster_size, window_size=3):
52 |         mean_value = np.mean(matrix[np.triu_indices(matrix.shape[0], k=1)])
53 |         matrix = matrix - mean_value  # Normalize the matrix
54 |         np.fill_diagonal(matrix, 0)  # Set diagonal to 1 to avoid trivial solutions
55 | 
56 |         n = matrix.shape[0]
57 |         dp = np.zeros(n)
58 |         segmentation = np.zeros(n, dtype=int)
59 | 
60 |         for i in range(n):
61 |             for size in range(1, max_cluster_size + 1):
62 |                 if i - size + 1 >= 0:
63 |                     # local_density = calculate_local_density(matrix, i, window_size)
64 |                     reward = self._calculate_reward(matrix, i - size + 1, i)
65 |                     # Adjust reward based on local density
66 |                     adjusted_reward = reward
67 |                     if i - size >= 0:
68 |                         adjusted_reward += dp[i - size]
69 |                     if adjusted_reward > dp[i]:
70 |                         dp[i] = adjusted_reward
71 |                         segmentation[i] = i - size + 1
72 | 
73 |         clusters = []
74 |         i = n - 1
75 |         while i >= 0:
76 |             start = segmentation[i]
77 |             clusters.append((start, i))
78 |             i = start - 1
79 | 
80 |         clusters.reverse()
81 |         return clusters
82 |         
83 |     def split_text(self, text: str) -> List[str]:
84 |         sentences = self.splitter.split_text(text)
85 | 
86 |         similarity_matrix = self._get_similarity_matrix(self.embedding_function, sentences)
87 | 
88 |         clusters = self._optimal_segmentation(similarity_matrix, max_cluster_size=self.max_cluster)
89 | 
90 |         docs = [' '.join(sentences[start:end+1]) for start, end in clusters]
91 | 
92 |         return docs
93 | 


--------------------------------------------------------------------------------
/chunking_evaluation/chunking/fixed_token_chunker.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # This script is adapted from the LangChain package, developed by LangChain AI.
  3 | # Original code can be found at: https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/base.py
  4 | # License: MIT License
  5 | 
  6 | from abc import ABC, abstractmethod
  7 | from enum import Enum
  8 | import logging
  9 | from typing import (
 10 |     AbstractSet,
 11 |     Any,
 12 |     Callable,
 13 |     Collection,
 14 |     Iterable,
 15 |     List,
 16 |     Literal,
 17 |     Optional,
 18 |     Sequence,
 19 |     Type,
 20 |     TypeVar,
 21 |     Union,
 22 | )
 23 | from .base_chunker import BaseChunker
 24 | 
 25 | 
 26 | from attr import dataclass
 27 | 
 28 | logger = logging.getLogger(__name__)
 29 | 
 30 | TS = TypeVar("TS", bound="TextSplitter")
 31 | class TextSplitter(BaseChunker, ABC):
 32 |     """Interface for splitting text into chunks."""
 33 | 
 34 |     def __init__(
 35 |         self,
 36 |         chunk_size: int = 4000,
 37 |         chunk_overlap: int = 200,
 38 |         length_function: Callable[[str], int] = len,
 39 |         keep_separator: bool = False,
 40 |         add_start_index: bool = False,
 41 |         strip_whitespace: bool = True,
 42 |     ) -> None:
 43 |         """Create a new TextSplitter.
 44 | 
 45 |         Args:
 46 |             chunk_size: Maximum size of chunks to return
 47 |             chunk_overlap: Overlap in characters between chunks
 48 |             length_function: Function that measures the length of given chunks
 49 |             keep_separator: Whether to keep the separator in the chunks
 50 |             add_start_index: If `True`, includes chunk's start index in metadata
 51 |             strip_whitespace: If `True`, strips whitespace from the start and end of
 52 |                               every document
 53 |         """
 54 |         if chunk_overlap > chunk_size:
 55 |             raise ValueError(
 56 |                 f"Got a larger chunk overlap ({chunk_overlap}) than chunk size "
 57 |                 f"({chunk_size}), should be smaller."
 58 |             )
 59 |         self._chunk_size = chunk_size
 60 |         self._chunk_overlap = chunk_overlap
 61 |         self._length_function = length_function
 62 |         self._keep_separator = keep_separator
 63 |         self._add_start_index = add_start_index
 64 |         self._strip_whitespace = strip_whitespace
 65 | 
 66 |     @abstractmethod
 67 |     def split_text(self, text: str) -> List[str]:
 68 |         """Split text into multiple components."""
 69 | 
 70 |     def _join_docs(self, docs: List[str], separator: str) -> Optional[str]:
 71 |         text = separator.join(docs)
 72 |         if self._strip_whitespace:
 73 |             text = text.strip()
 74 |         if text == "":
 75 |             return None
 76 |         else:
 77 |             return text
 78 | 
 79 |     def _merge_splits(self, splits: Iterable[str], separator: str) -> List[str]:
 80 |         # We now want to combine these smaller pieces into medium size
 81 |         # chunks to send to the LLM.
 82 |         separator_len = self._length_function(separator)
 83 | 
 84 |         docs = []
 85 |         current_doc: List[str] = []
 86 |         total = 0
 87 |         for d in splits:
 88 |             _len = self._length_function(d)
 89 |             if (
 90 |                 total + _len + (separator_len if len(current_doc) > 0 else 0)
 91 |                 > self._chunk_size
 92 |             ):
 93 |                 if total > self._chunk_size:
 94 |                     logger.warning(
 95 |                         f"Created a chunk of size {total}, "
 96 |                         f"which is longer than the specified {self._chunk_size}"
 97 |                     )
 98 |                 if len(current_doc) > 0:
 99 |                     doc = self._join_docs(current_doc, separator)
100 |                     if doc is not None:
101 |                         docs.append(doc)
102 |                     # Keep on popping if:
103 |                     # - we have a larger chunk than in the chunk overlap
104 |                     # - or if we still have any chunks and the length is long
105 |                     while total > self._chunk_overlap or (
106 |                         total + _len + (separator_len if len(current_doc) > 0 else 0)
107 |                         > self._chunk_size
108 |                         and total > 0
109 |                     ):
110 |                         total -= self._length_function(current_doc[0]) + (
111 |                             separator_len if len(current_doc) > 1 else 0
112 |                         )
113 |                         current_doc = current_doc[1:]
114 |             current_doc.append(d)
115 |             total += _len + (separator_len if len(current_doc) > 1 else 0)
116 |         doc = self._join_docs(current_doc, separator)
117 |         if doc is not None:
118 |             docs.append(doc)
119 |         return docs
120 | 
121 |     # @classmethod
122 |     # def from_huggingface_tokenizer(cls, tokenizer: Any, **kwargs: Any) -> TextSplitter:
123 |     #     """Text splitter that uses HuggingFace tokenizer to count length."""
124 |     #     try:
125 |     #         from transformers import PreTrainedTokenizerBase
126 | 
127 |     #         if not isinstance(tokenizer, PreTrainedTokenizerBase):
128 |     #             raise ValueError(
129 |     #                 "Tokenizer received was not an instance of PreTrainedTokenizerBase"
130 |     #             )
131 | 
132 |     #         def _huggingface_tokenizer_length(text: str) -> int:
133 |     #             return len(tokenizer.encode(text))
134 | 
135 |     #     except ImportError:
136 |     #         raise ValueError(
137 |     #             "Could not import transformers python package. "
138 |     #             "Please install it with `pip install transformers`."
139 |     #         )
140 |     #     return cls(length_function=_huggingface_tokenizer_length, **kwargs)
141 | 
142 |     @classmethod
143 |     def from_tiktoken_encoder(
144 |         cls: Type[TS],
145 |         encoding_name: str = "gpt2",
146 |         model_name: Optional[str] = None,
147 |         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
148 |         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
149 |         **kwargs: Any,
150 |     ) -> TS:
151 |         """Text splitter that uses tiktoken encoder to count length."""
152 |         try:
153 |             import tiktoken
154 |         except ImportError:
155 |             raise ImportError(
156 |                 "Could not import tiktoken python package. "
157 |                 "This is needed in order to calculate max_tokens_for_prompt. "
158 |                 "Please install it with `pip install tiktoken`."
159 |             )
160 | 
161 |         if model_name is not None:
162 |             enc = tiktoken.encoding_for_model(model_name)
163 |         else:
164 |             enc = tiktoken.get_encoding(encoding_name)
165 | 
166 |         def _tiktoken_encoder(text: str) -> int:
167 |             return len(
168 |                 enc.encode(
169 |                     text,
170 |                     allowed_special=allowed_special,
171 |                     disallowed_special=disallowed_special,
172 |                 )
173 |             )
174 | 
175 |         if issubclass(cls, FixedTokenChunker):
176 |             extra_kwargs = {
177 |                 "encoding_name": encoding_name,
178 |                 "model_name": model_name,
179 |                 "allowed_special": allowed_special,
180 |                 "disallowed_special": disallowed_special,
181 |             }
182 |             kwargs = {**kwargs, **extra_kwargs}
183 | 
184 |         return cls(length_function=_tiktoken_encoder, **kwargs)
185 |     
186 | class FixedTokenChunker(TextSplitter):
187 |     """Splitting text to tokens using model tokenizer."""
188 | 
189 |     def __init__(
190 |         self,
191 |         encoding_name: str = "cl100k_base",
192 |         model_name: Optional[str] = None,
193 |         chunk_size: int = 4000,
194 |         chunk_overlap: int = 200,
195 |         allowed_special: Union[Literal["all"], AbstractSet[str]] = set(),
196 |         disallowed_special: Union[Literal["all"], Collection[str]] = "all",
197 |         **kwargs: Any,
198 |     ) -> None:
199 |         """Create a new TextSplitter."""
200 |         super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, **kwargs)
201 |         try:
202 |             import tiktoken
203 |         except ImportError:
204 |             raise ImportError(
205 |                 "Could not import tiktoken python package. "
206 |                 "This is needed in order to for FixedTokenChunker. "
207 |                 "Please install it with `pip install tiktoken`."
208 |             )
209 | 
210 |         if model_name is not None:
211 |             enc = tiktoken.encoding_for_model(model_name)
212 |         else:
213 |             enc = tiktoken.get_encoding(encoding_name)
214 |         self._tokenizer = enc
215 |         self._allowed_special = allowed_special
216 |         self._disallowed_special = disallowed_special
217 | 
218 |     def split_text(self, text: str) -> List[str]:
219 |         def _encode(_text: str) -> List[int]:
220 |             return self._tokenizer.encode(
221 |                 _text,
222 |                 allowed_special=self._allowed_special,
223 |                 disallowed_special=self._disallowed_special,
224 |             )
225 | 
226 |         tokenizer = Tokenizer(
227 |             chunk_overlap=self._chunk_overlap,
228 |             tokens_per_chunk=self._chunk_size,
229 |             decode=self._tokenizer.decode,
230 |             encode=_encode,
231 |         )
232 | 
233 |         return split_text_on_tokens(text=text, tokenizer=tokenizer)
234 | 
235 | @dataclass(frozen=True)
236 | class Tokenizer:
237 |     """Tokenizer data class."""
238 | 
239 |     chunk_overlap: int
240 |     """Overlap in tokens between chunks"""
241 |     tokens_per_chunk: int
242 |     """Maximum number of tokens per chunk"""
243 |     decode: Callable[[List[int]], str]
244 |     """ Function to decode a list of token ids to a string"""
245 |     encode: Callable[[str], List[int]]
246 |     """ Function to encode a string to a list of token ids"""
247 | 
248 | 
249 | def split_text_on_tokens(*, text: str, tokenizer: Tokenizer) -> List[str]:
250 |     """Split incoming text and return chunks using tokenizer."""
251 |     splits: List[str] = []
252 |     input_ids = tokenizer.encode(text)
253 |     start_idx = 0
254 |     cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
255 |     chunk_ids = input_ids[start_idx:cur_idx]
256 |     while start_idx < len(input_ids):
257 |         splits.append(tokenizer.decode(chunk_ids))
258 |         if cur_idx == len(input_ids):
259 |             break
260 |         start_idx += tokenizer.tokens_per_chunk - tokenizer.chunk_overlap
261 |         cur_idx = min(start_idx + tokenizer.tokens_per_chunk, len(input_ids))
262 |         chunk_ids = input_ids[start_idx:cur_idx]
263 |     return splits
264 | 


--------------------------------------------------------------------------------
/chunking_evaluation/chunking/kamradt_modified_chunker.py:
--------------------------------------------------------------------------------
  1 | # This script is adapted from the Greg Kamradt's notebook on chunking.
  2 | # Original code can be found at: https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
  3 | 
  4 | from typing import Optional
  5 | from .base_chunker import BaseChunker
  6 | from .recursive_token_chunker import RecursiveTokenChunker
  7 | from chunking_evaluation.utils import openai_token_count, get_openai_embedding_function
  8 | from chromadb.api.types import (
  9 |     Embeddable,
 10 |     EmbeddingFunction,
 11 | )
 12 | 
 13 | import numpy as np
 14 | 
 15 | class KamradtModifiedChunker(BaseChunker):
 16 |     """
 17 |     A chunker that splits text into chunks of approximately a specified average size based on semantic similarity.
 18 | 
 19 |     This was adapted from Greg Kamradt's notebook on chunking but with the modification of including an average chunk size parameter. The original code can be found at: https://github.com/FullStackRetrieval-com/RetrievalTutorials/blob/main/tutorials/LevelsOfTextSplitting/5_Levels_Of_Text_Splitting.ipynb
 20 | 
 21 |     This class extends the functionality of the `BaseChunker` by incorporating a method to combine sentences based on a buffer size, calculate cosine distances between combined sentences, and perform a binary search on similarity thresholds to achieve chunks of desired average size.
 22 | 
 23 |     Attributes:
 24 |         avg_chunk_size (int): The desired average chunk size in terms of token count. Default is 400.
 25 |         min_chunk_size (int): The minimum chunk size in terms of token count. Default is 50.
 26 |         embedding_function (EmbeddingFunction[Embeddable], optional): A function that converts text to embeddings. Default is the OpenAI embedding function.
 27 |         length_function (function): A function that calculates the number of tokens in a text. Default is `openai_token_count`.
 28 | 
 29 |     Methods:
 30 |         combine_sentences(sentences, buffer_size=1):
 31 |             Combines sentences with a specified buffer size to create context-rich sentence groups.
 32 | 
 33 |         calculate_cosine_distances(sentences):
 34 |             Calculates cosine distances between combined sentences using their embeddings.
 35 | 
 36 |         split_text(text):
 37 |             Splits the input text into chunks based on the calculated cosine distances and the specified average chunk size.
 38 | 
 39 |     Example:
 40 |         chunker = KamradtModifiedChunker(avg_chunk_size=300)
 41 |         text = "Your text to be chunked."
 42 |         chunks = chunker.split_text(text)
 43 |     """
 44 |     def __init__(
 45 |         self, 
 46 |         avg_chunk_size:int=400, 
 47 |         min_chunk_size:int=50, 
 48 |         embedding_function: Optional[EmbeddingFunction[Embeddable]] = None, 
 49 |         length_function=openai_token_count
 50 |         ):
 51 |         """
 52 |         Initializes the KamradtModifiedChunker with the specified parameters.
 53 | 
 54 |         Args:
 55 |             avg_chunk_size (int, optional): The desired average chunk size in tokens. Defaults to 400.
 56 |             min_chunk_size (int, optional): The minimum chunk size in tokens. Defaults to 50.
 57 |             embedding_function (EmbeddingFunction[Embeddable], optional): A function to obtain embeddings for text. Defaults to OpenAI's embedding function if not provided.
 58 |             length_function (function, optional): A function to calculate token length of a text. Defaults to `openai_token_count`.
 59 |         """
 60 |         
 61 |         
 62 |         self.splitter = RecursiveTokenChunker(
 63 |             chunk_size=min_chunk_size,
 64 |             chunk_overlap=0,
 65 |             length_function=length_function
 66 |             )
 67 |         
 68 |         self.avg_chunk_size = avg_chunk_size
 69 |         if embedding_function is None:
 70 |             embedding_function = get_openai_embedding_function()
 71 |         self.embedding_function = embedding_function
 72 |         self.length_function = length_function
 73 | 
 74 |     def combine_sentences(self, sentences, buffer_size=1):
 75 |         # Go through each sentence dict
 76 |         for i in range(len(sentences)):
 77 | 
 78 |             # Create a string that will hold the sentences which are joined
 79 |             combined_sentence = ''
 80 | 
 81 |             # Add sentences before the current one, based on the buffer size.
 82 |             for j in range(i - buffer_size, i):
 83 |                 # Check if the index j is not negative (to avoid index out of range like on the first one)
 84 |                 if j >= 0:
 85 |                     # Add the sentence at index j to the combined_sentence string
 86 |                     combined_sentence += sentences[j]['sentence'] + ' '
 87 | 
 88 |             # Add the current sentence
 89 |             combined_sentence += sentences[i]['sentence']
 90 | 
 91 |             # Add sentences after the current one, based on the buffer size
 92 |             for j in range(i + 1, i + 1 + buffer_size):
 93 |                 # Check if the index j is within the range of the sentences list
 94 |                 if j < len(sentences):
 95 |                     # Add the sentence at index j to the combined_sentence string
 96 |                     combined_sentence += ' ' + sentences[j]['sentence']
 97 | 
 98 |             # Then add the whole thing to your dict
 99 |             # Store the combined sentence in the current sentence dict
100 |             sentences[i]['combined_sentence'] = combined_sentence
101 | 
102 |         return sentences
103 | 
104 |     def calculate_cosine_distances(self, sentences):
105 |         BATCH_SIZE = 500
106 |         distances = []
107 |         embedding_matrix = None
108 |         for i in range(0, len(sentences), BATCH_SIZE):
109 |             batch_sentences = sentences[i:i+BATCH_SIZE]
110 |             batch_sentences = [sentence['combined_sentence'] for sentence in batch_sentences]
111 |             embeddings = self.embedding_function(batch_sentences)
112 | 
113 |             # Convert embeddings list of lists to numpy array
114 |             batch_embedding_matrix = np.array(embeddings)
115 | 
116 |             # Append the batch embedding matrix to the main embedding matrix
117 |             if embedding_matrix is None:
118 |                 embedding_matrix = batch_embedding_matrix
119 |             else:
120 |                 embedding_matrix = np.concatenate((embedding_matrix, batch_embedding_matrix), axis=0)
121 | 
122 |         # Normalize each vector to be a unit vector
123 |         norms = np.linalg.norm(embedding_matrix, axis=1, keepdims=True)
124 |         embedding_matrix = embedding_matrix / norms
125 | 
126 |         similarity_matrix = np.dot(embedding_matrix, embedding_matrix.T)
127 |         
128 |         for i in range(len(sentences) - 1):
129 |             # Calculate cosine similarity
130 |             similarity = similarity_matrix[i, i + 1]
131 |             
132 |             # Convert to cosine distance
133 |             distance = 1 - similarity
134 | 
135 |             # Append cosine distance to the list
136 |             distances.append(distance)
137 | 
138 |             # Store distance in the dictionary
139 |             sentences[i]['distance_to_next'] = distance
140 | 
141 |         # Optionally handle the last sentence
142 |         # sentences[-1]['distance_to_next'] = None  # or a default value
143 | 
144 |         return distances, sentences
145 | 
146 |     def split_text(self, text):
147 |         """
148 |         Splits the input text into chunks of approximately the specified average size based on semantic similarity.
149 | 
150 |         Args:
151 |             text (str): The input text to be split into chunks.
152 | 
153 |         Returns:
154 |             list of str: The list of text chunks.
155 |         """
156 |                 
157 |         sentences_strips = self.splitter.split_text(text)
158 | 
159 |         sentences = [{'sentence': x, 'index' : i} for i, x in enumerate(sentences_strips)]
160 | 
161 |         sentences = self.combine_sentences(sentences, 3)
162 | 
163 |         combined_sentences = [x['combined_sentence'] for x in sentences]
164 | 
165 |         distances, sentences = self.calculate_cosine_distances(sentences)
166 | 
167 |         total_tokens = sum(self.length_function(sentence['sentence']) for sentence in sentences)
168 |         avg_chunk_size = self.avg_chunk_size
169 |         number_of_cuts = total_tokens // avg_chunk_size
170 | 
171 |         # Define threshold limits
172 |         lower_limit = 0.0
173 |         upper_limit = 1.0
174 | 
175 |         # Convert distances to numpy array
176 |         distances_np = np.array(distances)
177 | 
178 |         # Binary search for threshold
179 |         while upper_limit - lower_limit > 1e-6:
180 |             threshold = (upper_limit + lower_limit) / 2.0
181 |             num_points_above_threshold = np.sum(distances_np > threshold)
182 |             
183 |             if num_points_above_threshold > number_of_cuts:
184 |                 lower_limit = threshold
185 |             else:
186 |                 upper_limit = threshold
187 | 
188 |         indices_above_thresh = [i for i, x in enumerate(distances) if x > threshold] 
189 |         
190 |         # Initialize the start index
191 |         start_index = 0
192 | 
193 |         # Create a list to hold the grouped sentences
194 |         chunks = []
195 | 
196 |         # Iterate through the breakpoints to slice the sentences
197 |         for index in indices_above_thresh:
198 |             # The end index is the current breakpoint
199 |             end_index = index
200 | 
201 |             # Slice the sentence_dicts from the current start index to the end index
202 |             group = sentences[start_index:end_index + 1]
203 |             combined_text = ' '.join([d['sentence'] for d in group])
204 |             chunks.append(combined_text)
205 |             
206 |             # Update the start index for the next group
207 |             start_index = index + 1
208 | 
209 |         # The last group, if any sentences remain
210 |         if start_index < len(sentences):
211 |             combined_text = ' '.join([d['sentence'] for d in sentences[start_index:]])
212 |             chunks.append(combined_text)
213 | 
214 |         return chunks


--------------------------------------------------------------------------------
/chunking_evaluation/chunking/llm_semantic_chunker.py:
--------------------------------------------------------------------------------
  1 | from .base_chunker import BaseChunker
  2 | from chunking_evaluation.utils import openai_token_count
  3 | from chunking_evaluation.chunking import RecursiveTokenChunker
  4 | import anthropic
  5 | import os
  6 | import backoff
  7 | from tqdm import tqdm
  8 | 
  9 | class AnthropicClient:
 10 |     def __init__(self, model_name, api_key=None):
 11 |         self.client = anthropic.Anthropic(api_key=api_key)
 12 |         self.model_name = model_name
 13 | 
 14 |     @backoff.on_exception(backoff.expo, Exception, max_tries=3)
 15 |     def create_message(self, system_prompt, messages, max_tokens=1000, temperature=1.0):
 16 |         try:
 17 |             message = self.client.messages.create(
 18 |                 model=self.model_name,
 19 |                 max_tokens=max_tokens,
 20 |                 temperature=temperature,
 21 |                 system=system_prompt,
 22 |                 messages=messages
 23 |             )
 24 |             return message.content[0].text
 25 |         except Exception as e:
 26 |             print(f"Error occurred: {e}, retrying...")
 27 |             raise e
 28 |         
 29 | class OpenAIClient:
 30 |     def __init__(self, model_name, api_key=None):
 31 |         from openai import OpenAI
 32 |         self.client = OpenAI(api_key=api_key)
 33 |         self.model_name = model_name
 34 | 
 35 |     @backoff.on_exception(backoff.expo, Exception, max_tries=3)
 36 |     def create_message(self, system_prompt, messages, max_tokens=1000, temperature=1.0):
 37 |         try:
 38 |             gpt_messages = [
 39 |                 {"role": "system", "content": system_prompt}
 40 |             ] + messages
 41 | 
 42 |             completion = self.client.chat.completions.create(
 43 |                 model=self.model_name,
 44 |                 max_tokens=max_tokens,
 45 |                 messages=gpt_messages,
 46 |                 temperature=temperature
 47 |             )
 48 | 
 49 |             return completion.choices[0].message.content
 50 |         except Exception as e:
 51 |             print(f"Error occurred: {e}, retrying...")
 52 |             raise e
 53 | 
 54 | 
 55 | class LLMSemanticChunker(BaseChunker):
 56 |     """
 57 |     LLMSemanticChunker is a class designed to split text into thematically consistent sections based on suggestions from a Language Model (LLM).
 58 |     Users can choose between OpenAI and Anthropic as the LLM provider.
 59 | 
 60 |     Args:
 61 |         organisation (str): The LLM provider to use. Options are "openai" (default) or "anthropic".
 62 |         api_key (str, optional): The API key for the chosen LLM provider. If not provided, the default environment key will be used.
 63 |         model_name (str, optional): The specific model to use. Defaults to "gpt-4o" for OpenAI and "claude-3-5-sonnet-20240620" for Anthropic. 
 64 |                                     Users can specify a different model by providing this argument.
 65 |     """
 66 |     def __init__(self, organisation:str="openai", api_key:str=None, model_name:str=None):
 67 |         if organisation == "openai":
 68 |             if model_name is None:
 69 |                 model_name = "gpt-4o"
 70 |             self.client = OpenAIClient(model_name, api_key=api_key)
 71 |         elif organisation == "anthropic":
 72 |             if model_name is None:
 73 |                 model_name = "claude-3-5-sonnet-20240620"
 74 |             self.client = AnthropicClient(model_name, api_key=api_key)
 75 |         else:
 76 |             raise ValueError("Invalid organisation. Please choose either 'openai' or 'anthropic'.")
 77 | 
 78 |         self.splitter = RecursiveTokenChunker(
 79 |             chunk_size=50,
 80 |             chunk_overlap=0,
 81 |             length_function=openai_token_count
 82 |             )
 83 | 
 84 |     def get_prompt(self, chunked_input, current_chunk=0, invalid_response=None):
 85 |         messages = [
 86 |             {
 87 |                 "role": "system", 
 88 |                 "content": (
 89 |                     "You are an assistant specialized in splitting text into thematically consistent sections. "
 90 |                     "The text has been divided into chunks, each marked with <|start_chunk_X|> and <|end_chunk_X|> tags, where X is the chunk number. "
 91 |                     "Your task is to identify the points where splits should occur, such that consecutive chunks of similar themes stay together. "
 92 |                     "Respond with a list of chunk IDs where you believe a split should be made. For example, if chunks 1 and 2 belong together but chunk 3 starts a new topic, you would suggest a split after chunk 2. THE CHUNKS MUST BE IN ASCENDING ORDER."
 93 |                     "Your response should be in the form: 'split_after: 3, 5'."
 94 |                 )
 95 |             },
 96 |             {
 97 |                 "role": "user", 
 98 |                 "content": (
 99 |                     "CHUNKED_TEXT: " + chunked_input + "\n\n"
100 |                     "Respond only with the IDs of the chunks where you believe a split should occur. YOU MUST RESPOND WITH AT LEAST ONE SPLIT. THESE SPLITS MUST BE IN ASCENDING ORDER AND EQUAL OR LARGER THAN: " + str(current_chunk)+"." + (f"\n\The previous response of {invalid_response} was invalid. DO NOT REPEAT THIS ARRAY OF NUMBERS. Please try again." if invalid_response else "")
101 |                 )
102 |             },
103 |         ]
104 |         return messages
105 | 
106 |     def split_text(self, text):
107 |         import re
108 | 
109 |         chunks = self.splitter.split_text(text)
110 | 
111 |         split_indices = []
112 | 
113 |         short_cut = len(split_indices) > 0
114 | 
115 |         from tqdm import tqdm
116 | 
117 |         current_chunk = 0
118 | 
119 |         with tqdm(total=len(chunks), desc="Processing chunks") as pbar:
120 |             while True and not short_cut:
121 |                 if current_chunk >= len(chunks) - 4:
122 |                     break
123 | 
124 |                 token_count = 0
125 | 
126 |                 chunked_input = ''
127 | 
128 |                 for i in range(current_chunk, len(chunks)):
129 |                     token_count += openai_token_count(chunks[i])
130 |                     chunked_input += f"<|start_chunk_{i+1}|>{chunks[i]}<|end_chunk_{i+1}|>"
131 |                     if token_count > 800:
132 |                         break
133 | 
134 |                 messages = self.get_prompt(chunked_input, current_chunk)
135 |                 while True:
136 |                     result_string = self.client.create_message(messages[0]['content'], messages[1:], max_tokens=200, temperature=0.2)
137 |                     # Use regular expression to find all numbers in the string
138 |                     split_after_line = [line for line in result_string.split('\n') if 'split_after:' in line][0]
139 |                     numbers = re.findall(r'\d+', split_after_line)
140 |                     # Convert the found numbers to integers
141 |                     numbers = list(map(int, numbers))
142 | 
143 |                     # print(numbers)
144 | 
145 |                     # Check if the numbers are in ascending order and are equal to or larger than current_chunk
146 |                     if not (numbers != sorted(numbers) or any(number < current_chunk for number in numbers)):
147 |                         break
148 |                     else:
149 |                         messages = self.get_prompt(chunked_input, current_chunk, numbers)
150 |                         print("Response: ", result_string)
151 |                         print("Invalid response. Please try again.")
152 | 
153 |                 split_indices.extend(numbers)
154 | 
155 |                 current_chunk = numbers[-1]
156 | 
157 |                 if len(numbers) == 0:
158 |                     break
159 | 
160 |                 pbar.update(current_chunk - pbar.n)
161 | 
162 |         pbar.close()
163 | 
164 |         chunks_to_split_after = [i - 1 for i in split_indices]
165 | 
166 |         docs = []
167 |         current_chunk = ''
168 |         for i, chunk in enumerate(chunks):
169 |             current_chunk += chunk + ' '
170 |             if i in chunks_to_split_after:
171 |                 docs.append(current_chunk.strip())
172 |                 current_chunk = ''
173 |         if current_chunk:
174 |             docs.append(current_chunk.strip())
175 | 
176 |         return docs


--------------------------------------------------------------------------------
/chunking_evaluation/chunking/recursive_token_chunker.py:
--------------------------------------------------------------------------------
  1 | 
  2 | # This script is adapted from the LangChain package, developed by LangChain AI.
  3 | # Original code can be found at: https://github.com/langchain-ai/langchain/blob/master/libs/text-splitters/langchain_text_splitters/character.py
  4 | # License: MIT License
  5 | 
  6 | from typing import Any, List, Optional
  7 | from .base_chunker import BaseChunker
  8 | from chunking_evaluation.utils import Language
  9 | from .fixed_token_chunker import TextSplitter
 10 | import re
 11 | 
 12 | def _split_text_with_regex(
 13 |     text: str, separator: str, keep_separator: bool
 14 | ) -> List[str]:
 15 |     # Now that we have the separator, split the text
 16 |     if separator:
 17 |         if keep_separator:
 18 |             # The parentheses in the pattern keep the delimiters in the result.
 19 |             _splits = re.split(f"({separator})", text)
 20 |             splits = [_splits[i] + _splits[i + 1] for i in range(1, len(_splits), 2)]
 21 |             if len(_splits) % 2 == 0:
 22 |                 splits += _splits[-1:]
 23 |             splits = [_splits[0]] + splits
 24 |         else:
 25 |             splits = re.split(separator, text)
 26 |     else:
 27 |         splits = list(text)
 28 |     return [s for s in splits if s != ""]
 29 | 
 30 | class RecursiveTokenChunker(TextSplitter):
 31 |     """Splitting text by recursively look at characters.
 32 | 
 33 |     Recursively tries to split by different characters to find one
 34 |     that works.
 35 |     """
 36 | 
 37 |     def __init__(
 38 |         self,
 39 |         chunk_size: int = 4000,
 40 |         chunk_overlap: int = 200,
 41 |         separators: Optional[List[str]] = None,
 42 |         keep_separator: bool = True,
 43 |         is_separator_regex: bool = False,
 44 |         **kwargs: Any,
 45 |     ) -> None:
 46 |         """Create a new TextSplitter."""
 47 |         super().__init__(chunk_size=chunk_size, chunk_overlap=chunk_overlap, keep_separator=keep_separator, **kwargs)
 48 |         self._separators = separators or ["\n\n", "\n", ".", "?", "!", " ", ""]
 49 |         self._is_separator_regex = is_separator_regex
 50 | 
 51 |     def _split_text(self, text: str, separators: List[str]) -> List[str]:
 52 |         """Split incoming text and return chunks."""
 53 |         final_chunks = []
 54 |         # Get appropriate separator to use
 55 |         separator = separators[-1]
 56 |         new_separators = []
 57 |         for i, _s in enumerate(separators):
 58 |             _separator = _s if self._is_separator_regex else re.escape(_s)
 59 |             if _s == "":
 60 |                 separator = _s
 61 |                 break
 62 |             if re.search(_separator, text):
 63 |                 separator = _s
 64 |                 new_separators = separators[i + 1 :]
 65 |                 break
 66 | 
 67 |         _separator = separator if self._is_separator_regex else re.escape(separator)
 68 |         splits = _split_text_with_regex(text, _separator, self._keep_separator)
 69 | 
 70 |         # Now go merging things, recursively splitting longer texts.
 71 |         _good_splits = []
 72 |         _separator = "" if self._keep_separator else separator
 73 |         for s in splits:
 74 |             if self._length_function(s) < self._chunk_size:
 75 |                 _good_splits.append(s)
 76 |             else:
 77 |                 if _good_splits:
 78 |                     merged_text = self._merge_splits(_good_splits, _separator)
 79 |                     final_chunks.extend(merged_text)
 80 |                     _good_splits = []
 81 |                 if not new_separators:
 82 |                     final_chunks.append(s)
 83 |                 else:
 84 |                     other_info = self._split_text(s, new_separators)
 85 |                     final_chunks.extend(other_info)
 86 |         if _good_splits:
 87 |             merged_text = self._merge_splits(_good_splits, _separator)
 88 |             final_chunks.extend(merged_text)
 89 |         return final_chunks
 90 | 
 91 |     def split_text(self, text: str) -> List[str]:
 92 |         return self._split_text(text, self._separators)
 93 | 
 94 |     # @classmethod
 95 |     # def from_language(
 96 |     #     cls, language: Language, **kwargs: Any
 97 |     # ) -> RecursiveCharacterTextSplitter:
 98 |     #     separators = cls.get_separators_for_language(language)
 99 |     #     return cls(separators=separators, is_separator_regex=True, **kwargs)
100 | 
101 |     @staticmethod
102 |     def get_separators_for_language(language: Language) -> List[str]:
103 |         if language == Language.CPP:
104 |             return [
105 |                 # Split along class definitions
106 |                 "\nclass ",
107 |                 # Split along function definitions
108 |                 "\nvoid ",
109 |                 "\nint ",
110 |                 "\nfloat ",
111 |                 "\ndouble ",
112 |                 # Split along control flow statements
113 |                 "\nif ",
114 |                 "\nfor ",
115 |                 "\nwhile ",
116 |                 "\nswitch ",
117 |                 "\ncase ",
118 |                 # Split by the normal type of lines
119 |                 "\n\n",
120 |                 "\n",
121 |                 " ",
122 |                 "",
123 |             ]
124 |         elif language == Language.GO:
125 |             return [
126 |                 # Split along function definitions
127 |                 "\nfunc ",
128 |                 "\nvar ",
129 |                 "\nconst ",
130 |                 "\ntype ",
131 |                 # Split along control flow statements
132 |                 "\nif ",
133 |                 "\nfor ",
134 |                 "\nswitch ",
135 |                 "\ncase ",
136 |                 # Split by the normal type of lines
137 |                 "\n\n",
138 |                 "\n",
139 |                 " ",
140 |                 "",
141 |             ]
142 |         elif language == Language.JAVA:
143 |             return [
144 |                 # Split along class definitions
145 |                 "\nclass ",
146 |                 # Split along method definitions
147 |                 "\npublic ",
148 |                 "\nprotected ",
149 |                 "\nprivate ",
150 |                 "\nstatic ",
151 |                 # Split along control flow statements
152 |                 "\nif ",
153 |                 "\nfor ",
154 |                 "\nwhile ",
155 |                 "\nswitch ",
156 |                 "\ncase ",
157 |                 # Split by the normal type of lines
158 |                 "\n\n",
159 |                 "\n",
160 |                 " ",
161 |                 "",
162 |             ]
163 |         elif language == Language.KOTLIN:
164 |             return [
165 |                 # Split along class definitions
166 |                 "\nclass ",
167 |                 # Split along method definitions
168 |                 "\npublic ",
169 |                 "\nprotected ",
170 |                 "\nprivate ",
171 |                 "\ninternal ",
172 |                 "\ncompanion ",
173 |                 "\nfun ",
174 |                 "\nval ",
175 |                 "\nvar ",
176 |                 # Split along control flow statements
177 |                 "\nif ",
178 |                 "\nfor ",
179 |                 "\nwhile ",
180 |                 "\nwhen ",
181 |                 "\ncase ",
182 |                 "\nelse ",
183 |                 # Split by the normal type of lines
184 |                 "\n\n",
185 |                 "\n",
186 |                 " ",
187 |                 "",
188 |             ]
189 |         elif language == Language.JS:
190 |             return [
191 |                 # Split along function definitions
192 |                 "\nfunction ",
193 |                 "\nconst ",
194 |                 "\nlet ",
195 |                 "\nvar ",
196 |                 "\nclass ",
197 |                 # Split along control flow statements
198 |                 "\nif ",
199 |                 "\nfor ",
200 |                 "\nwhile ",
201 |                 "\nswitch ",
202 |                 "\ncase ",
203 |                 "\ndefault ",
204 |                 # Split by the normal type of lines
205 |                 "\n\n",
206 |                 "\n",
207 |                 " ",
208 |                 "",
209 |             ]
210 |         elif language == Language.TS:
211 |             return [
212 |                 "\nenum ",
213 |                 "\ninterface ",
214 |                 "\nnamespace ",
215 |                 "\ntype ",
216 |                 # Split along class definitions
217 |                 "\nclass ",
218 |                 # Split along function definitions
219 |                 "\nfunction ",
220 |                 "\nconst ",
221 |                 "\nlet ",
222 |                 "\nvar ",
223 |                 # Split along control flow statements
224 |                 "\nif ",
225 |                 "\nfor ",
226 |                 "\nwhile ",
227 |                 "\nswitch ",
228 |                 "\ncase ",
229 |                 "\ndefault ",
230 |                 # Split by the normal type of lines
231 |                 "\n\n",
232 |                 "\n",
233 |                 " ",
234 |                 "",
235 |             ]
236 |         elif language == Language.PHP:
237 |             return [
238 |                 # Split along function definitions
239 |                 "\nfunction ",
240 |                 # Split along class definitions
241 |                 "\nclass ",
242 |                 # Split along control flow statements
243 |                 "\nif ",
244 |                 "\nforeach ",
245 |                 "\nwhile ",
246 |                 "\ndo ",
247 |                 "\nswitch ",
248 |                 "\ncase ",
249 |                 # Split by the normal type of lines
250 |                 "\n\n",
251 |                 "\n",
252 |                 " ",
253 |                 "",
254 |             ]
255 |         elif language == Language.PROTO:
256 |             return [
257 |                 # Split along message definitions
258 |                 "\nmessage ",
259 |                 # Split along service definitions
260 |                 "\nservice ",
261 |                 # Split along enum definitions
262 |                 "\nenum ",
263 |                 # Split along option definitions
264 |                 "\noption ",
265 |                 # Split along import statements
266 |                 "\nimport ",
267 |                 # Split along syntax declarations
268 |                 "\nsyntax ",
269 |                 # Split by the normal type of lines
270 |                 "\n\n",
271 |                 "\n",
272 |                 " ",
273 |                 "",
274 |             ]
275 |         elif language == Language.PYTHON:
276 |             return [
277 |                 # First, try to split along class definitions
278 |                 "\nclass ",
279 |                 "\ndef ",
280 |                 "\n\tdef ",
281 |                 # Now split by the normal type of lines
282 |                 "\n\n",
283 |                 "\n",
284 |                 " ",
285 |                 "",
286 |             ]
287 |         elif language == Language.RST:
288 |             return [
289 |                 # Split along section titles
290 |                 "\n=+\n",
291 |                 "\n-+\n",
292 |                 "\n\\*+\n",
293 |                 # Split along directive markers
294 |                 "\n\n.. *\n\n",
295 |                 # Split by the normal type of lines
296 |                 "\n\n",
297 |                 "\n",
298 |                 " ",
299 |                 "",
300 |             ]
301 |         elif language == Language.RUBY:
302 |             return [
303 |                 # Split along method definitions
304 |                 "\ndef ",
305 |                 "\nclass ",
306 |                 # Split along control flow statements
307 |                 "\nif ",
308 |                 "\nunless ",
309 |                 "\nwhile ",
310 |                 "\nfor ",
311 |                 "\ndo ",
312 |                 "\nbegin ",
313 |                 "\nrescue ",
314 |                 # Split by the normal type of lines
315 |                 "\n\n",
316 |                 "\n",
317 |                 " ",
318 |                 "",
319 |             ]
320 |         elif language == Language.RUST:
321 |             return [
322 |                 # Split along function definitions
323 |                 "\nfn ",
324 |                 "\nconst ",
325 |                 "\nlet ",
326 |                 # Split along control flow statements
327 |                 "\nif ",
328 |                 "\nwhile ",
329 |                 "\nfor ",
330 |                 "\nloop ",
331 |                 "\nmatch ",
332 |                 "\nconst ",
333 |                 # Split by the normal type of lines
334 |                 "\n\n",
335 |                 "\n",
336 |                 " ",
337 |                 "",
338 |             ]
339 |         elif language == Language.SCALA:
340 |             return [
341 |                 # Split along class definitions
342 |                 "\nclass ",
343 |                 "\nobject ",
344 |                 # Split along method definitions
345 |                 "\ndef ",
346 |                 "\nval ",
347 |                 "\nvar ",
348 |                 # Split along control flow statements
349 |                 "\nif ",
350 |                 "\nfor ",
351 |                 "\nwhile ",
352 |                 "\nmatch ",
353 |                 "\ncase ",
354 |                 # Split by the normal type of lines
355 |                 "\n\n",
356 |                 "\n",
357 |                 " ",
358 |                 "",
359 |             ]
360 |         elif language == Language.SWIFT:
361 |             return [
362 |                 # Split along function definitions
363 |                 "\nfunc ",
364 |                 # Split along class definitions
365 |                 "\nclass ",
366 |                 "\nstruct ",
367 |                 "\nenum ",
368 |                 # Split along control flow statements
369 |                 "\nif ",
370 |                 "\nfor ",
371 |                 "\nwhile ",
372 |                 "\ndo ",
373 |                 "\nswitch ",
374 |                 "\ncase ",
375 |                 # Split by the normal type of lines
376 |                 "\n\n",
377 |                 "\n",
378 |                 " ",
379 |                 "",
380 |             ]
381 |         elif language == Language.MARKDOWN:
382 |             return [
383 |                 # First, try to split along Markdown headings (starting with level 2)
384 |                 "\n#{1,6} ",
385 |                 # Note the alternative syntax for headings (below) is not handled here
386 |                 # Heading level 2
387 |                 # ---------------
388 |                 # End of code block
389 |                 "```\n",
390 |                 # Horizontal lines
391 |                 "\n\\*\\*\\*+\n",
392 |                 "\n---+\n",
393 |                 "\n___+\n",
394 |                 # Note that this splitter doesn't handle horizontal lines defined
395 |                 # by *three or more* of ***, ---, or ___, but this is not handled
396 |                 "\n\n",
397 |                 "\n",
398 |                 " ",
399 |                 "",
400 |             ]
401 |         elif language == Language.LATEX:
402 |             return [
403 |                 # First, try to split along Latex sections
404 |                 "\n\\\\chapter{",
405 |                 "\n\\\\section{",
406 |                 "\n\\\\subsection{",
407 |                 "\n\\\\subsubsection{",
408 |                 # Now split by environments
409 |                 "\n\\\\begin{enumerate}",
410 |                 "\n\\\\begin{itemize}",
411 |                 "\n\\\\begin{description}",
412 |                 "\n\\\\begin{list}",
413 |                 "\n\\\\begin{quote}",
414 |                 "\n\\\\begin{quotation}",
415 |                 "\n\\\\begin{verse}",
416 |                 "\n\\\\begin{verbatim}",
417 |                 # Now split by math environments
418 |                 "\n\\\begin{align}",
419 |                 "$$",
420 |                 "$",
421 |                 # Now split by the normal type of lines
422 |                 " ",
423 |                 "",
424 |             ]
425 |         elif language == Language.HTML:
426 |             return [
427 |                 # First, try to split along HTML tags
428 |                 "<body",
429 |                 "<div",
430 |                 "<p",
431 |                 "<br",
432 |                 "<li",
433 |                 "<h1",
434 |                 "<h2",
435 |                 "<h3",
436 |                 "<h4",
437 |                 "<h5",
438 |                 "<h6",
439 |                 "<span",
440 |                 "<table",
441 |                 "<tr",
442 |                 "<td",
443 |                 "<th",
444 |                 "<ul",
445 |                 "<ol",
446 |                 "<header",
447 |                 "<footer",
448 |                 "<nav",
449 |                 # Head
450 |                 "<head",
451 |                 "<style",
452 |                 "<script",
453 |                 "<meta",
454 |                 "<title",
455 |                 "",
456 |             ]
457 |         elif language == Language.CSHARP:
458 |             return [
459 |                 "\ninterface ",
460 |                 "\nenum ",
461 |                 "\nimplements ",
462 |                 "\ndelegate ",
463 |                 "\nevent ",
464 |                 # Split along class definitions
465 |                 "\nclass ",
466 |                 "\nabstract ",
467 |                 # Split along method definitions
468 |                 "\npublic ",
469 |                 "\nprotected ",
470 |                 "\nprivate ",
471 |                 "\nstatic ",
472 |                 "\nreturn ",
473 |                 # Split along control flow statements
474 |                 "\nif ",
475 |                 "\ncontinue ",
476 |                 "\nfor ",
477 |                 "\nforeach ",
478 |                 "\nwhile ",
479 |                 "\nswitch ",
480 |                 "\nbreak ",
481 |                 "\ncase ",
482 |                 "\nelse ",
483 |                 # Split by exceptions
484 |                 "\ntry ",
485 |                 "\nthrow ",
486 |                 "\nfinally ",
487 |                 "\ncatch ",
488 |                 # Split by the normal type of lines
489 |                 "\n\n",
490 |                 "\n",
491 |                 " ",
492 |                 "",
493 |             ]
494 |         elif language == Language.SOL:
495 |             return [
496 |                 # Split along compiler information definitions
497 |                 "\npragma ",
498 |                 "\nusing ",
499 |                 # Split along contract definitions
500 |                 "\ncontract ",
501 |                 "\ninterface ",
502 |                 "\nlibrary ",
503 |                 # Split along method definitions
504 |                 "\nconstructor ",
505 |                 "\ntype ",
506 |                 "\nfunction ",
507 |                 "\nevent ",
508 |                 "\nmodifier ",
509 |                 "\nerror ",
510 |                 "\nstruct ",
511 |                 "\nenum ",
512 |                 # Split along control flow statements
513 |                 "\nif ",
514 |                 "\nfor ",
515 |                 "\nwhile ",
516 |                 "\ndo while ",
517 |                 "\nassembly ",
518 |                 # Split by the normal type of lines
519 |                 "\n\n",
520 |                 "\n",
521 |                 " ",
522 |                 "",
523 |             ]
524 |         elif language == Language.COBOL:
525 |             return [
526 |                 # Split along divisions
527 |                 "\nIDENTIFICATION DIVISION.",
528 |                 "\nENVIRONMENT DIVISION.",
529 |                 "\nDATA DIVISION.",
530 |                 "\nPROCEDURE DIVISION.",
531 |                 # Split along sections within DATA DIVISION
532 |                 "\nWORKING-STORAGE SECTION.",
533 |                 "\nLINKAGE SECTION.",
534 |                 "\nFILE SECTION.",
535 |                 # Split along sections within PROCEDURE DIVISION
536 |                 "\nINPUT-OUTPUT SECTION.",
537 |                 # Split along paragraphs and common statements
538 |                 "\nOPEN ",
539 |                 "\nCLOSE ",
540 |                 "\nREAD ",
541 |                 "\nWRITE ",
542 |                 "\nIF ",
543 |                 "\nELSE ",
544 |                 "\nMOVE ",
545 |                 "\nPERFORM ",
546 |                 "\nUNTIL ",
547 |                 "\nVARYING ",
548 |                 "\nACCEPT ",
549 |                 "\nDISPLAY ",
550 |                 "\nSTOP RUN.",
551 |                 # Split by the normal type of lines
552 |                 "\n",
553 |                 " ",
554 |                 "",
555 |             ]
556 | 
557 |         else:
558 |             raise ValueError(
559 |                 f"Language {language} is not supported! "
560 |                 f"Please choose from {list(Language)}"
561 |             )
562 | 


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/__init__.py


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/base_evaluation.py:
--------------------------------------------------------------------------------
  1 | from typing import Callable
  2 | from chunking_evaluation.utils import rigorous_document_search, get_openai_embedding_function
  3 | import chromadb.utils.embedding_functions as embedding_functions
  4 | import os
  5 | import pandas as pd
  6 | import json
  7 | import chromadb
  8 | import numpy as np
  9 | from typing import List
 10 | from importlib import resources
 11 | 
 12 | def sum_of_ranges(ranges):
 13 |     return sum(end - start for start, end in ranges)
 14 | 
 15 | def union_ranges(ranges):
 16 |     # Sort ranges based on the starting index
 17 |     sorted_ranges = sorted(ranges, key=lambda x: x[0])
 18 |     
 19 |     # Initialize with the first range
 20 |     merged_ranges = [sorted_ranges[0]]
 21 |     
 22 |     for current_start, current_end in sorted_ranges[1:]:
 23 |         last_start, last_end = merged_ranges[-1]
 24 |         
 25 |         # Check if the current range overlaps or is contiguous with the last range in the merged list
 26 |         if current_start <= last_end:
 27 |             # Merge the two ranges
 28 |             merged_ranges[-1] = (last_start, max(last_end, current_end))
 29 |         else:
 30 |             # No overlap, add the current range as new
 31 |             merged_ranges.append((current_start, current_end))
 32 |     
 33 |     return merged_ranges
 34 | 
 35 | def intersect_two_ranges(range1, range2):
 36 |     # Unpack the ranges
 37 |     start1, end1 = range1
 38 |     start2, end2 = range2
 39 |     
 40 |     # Calculate the maximum of the starting indices and the minimum of the ending indices
 41 |     intersect_start = max(start1, start2)
 42 |     intersect_end = min(end1, end2)
 43 |     
 44 |     # Check if the intersection is valid (the start is less than or equal to the end)
 45 |     if intersect_start <= intersect_end:
 46 |         return (intersect_start, intersect_end)
 47 |     else:
 48 |         return None  # Return an None if there is no intersection
 49 |     
 50 | # Define the difference function
 51 | def difference(ranges, target):
 52 |     """
 53 |     Takes a set of ranges and a target range, and returns the difference.
 54 |     
 55 |     Args:
 56 |     - ranges (list of tuples): A list of tuples representing ranges. Each tuple is (a, b) where a <= b.
 57 |     - target (tuple): A tuple representing a target range (c, d) where c <= d.
 58 |     
 59 |     Returns:
 60 |     - List of tuples representing ranges after removing the segments that overlap with the target range.
 61 |     """
 62 |     result = []
 63 |     target_start, target_end = target
 64 | 
 65 |     for start, end in ranges:
 66 |         if end < target_start or start > target_end:
 67 |             # No overlap
 68 |             result.append((start, end))
 69 |         elif start < target_start and end > target_end:
 70 |             # Target is a subset of this range, split it into two ranges
 71 |             result.append((start, target_start))
 72 |             result.append((target_end, end))
 73 |         elif start < target_start:
 74 |             # Overlap at the start
 75 |             result.append((start, target_start))
 76 |         elif end > target_end:
 77 |             # Overlap at the end
 78 |             result.append((target_end, end))
 79 |         # Else, this range is fully contained by the target, and is thus removed
 80 | 
 81 |     return result
 82 | 
 83 | def find_target_in_document(document, target):
 84 |     start_index = document.find(target)
 85 |     if start_index == -1:
 86 |         return None
 87 |     end_index = start_index + len(target)
 88 |     return start_index, end_index
 89 | 
 90 | class BaseEvaluation:
 91 |     def __init__(self, questions_csv_path: str, chroma_db_path=None, corpora_id_paths=None):
 92 |         self.corpora_id_paths = corpora_id_paths
 93 | 
 94 |         self.questions_csv_path = questions_csv_path
 95 | 
 96 |         self.corpus_list = []
 97 | 
 98 |         self._load_questions_df()
 99 | 
100 |         # self.questions_df = pd.read_csv(questions_csv_path)
101 |         # self.questions_df['references'] = self.questions_df['references'].apply(json.loads)
102 | 
103 |         if chroma_db_path is not None:
104 |             self.chroma_client = chromadb.PersistentClient(path=chroma_db_path)
105 |         else:
106 |             self.chroma_client = chromadb.Client()
107 | 
108 |         self.is_general = False
109 | 
110 |     def _load_questions_df(self):
111 |         if os.path.exists(self.questions_csv_path):
112 |             self.questions_df = pd.read_csv(self.questions_csv_path)
113 |             self.questions_df['references'] = self.questions_df['references'].apply(json.loads)
114 |         else:
115 |             self.questions_df = pd.DataFrame(columns=['question', 'references', 'corpus_id'])
116 |         
117 |         self.corpus_list = self.questions_df['corpus_id'].unique().tolist()
118 | 
119 |     def _get_chunks_and_metadata(self, splitter):
120 |         # Warning: metadata will be incorrect if a chunk is repeated since we use .find() to find the start index. 
121 |         # This isn't pratically an issue for chunks over 1000 characters.
122 |         documents = []
123 |         metadatas = []
124 |         for corpus_id in self.corpus_list:
125 |             corpus_path = corpus_id
126 |             if self.corpora_id_paths is not None:
127 |                 corpus_path = self.corpora_id_paths[corpus_id]
128 |     
129 |             # Check the operating system and use UTF-8 encoding on Windows
130 |             # This prevents UnicodeDecodeError when reading files with non-ASCII characters
131 |             import platform
132 |             if platform.system() == 'Windows':
133 |                 with open(corpus_path, 'r', encoding='utf-8') as file:
134 |                     corpus = file.read()
135 |             else:
136 |                 # Use default encoding on other systems
137 |                 with open(corpus_path, 'r') as file:
138 |                     corpus = file.read()
139 |     
140 |             current_documents = splitter.split_text(corpus)
141 |             current_metadatas = []
142 |             for document in current_documents:
143 |                 try:
144 |                     _, start_index, end_index = rigorous_document_search(corpus, document)
145 |                 except:
146 |                     print(f"Error in finding {document} in {corpus_id}")
147 |                     raise Exception(f"Error in finding {document} in {corpus_id}")
148 |                 current_metadatas.append({"start_index": start_index, "end_index": end_index, "corpus_id": corpus_id})
149 |             documents.extend(current_documents)
150 |             metadatas.extend(current_metadatas)
151 |         return documents, metadatas
152 | 
153 |     def _full_precision_score(self, chunk_metadatas):
154 |         ioc_scores = []
155 |         recall_scores = []
156 | 
157 |         highlighted_chunks_count = []
158 | 
159 |         for index, row in self.questions_df.iterrows():
160 |             # Unpack question and references
161 |             # question, references = question_references
162 |             question = row['question']
163 |             references = row['references']
164 |             corpus_id = row['corpus_id']
165 | 
166 |             ioc_score = 0
167 |             numerator_sets = []
168 |             denominator_chunks_sets = []
169 |             unused_highlights = [(x['start_index'], x['end_index']) for x in references]
170 | 
171 |             highlighted_chunk_count = 0
172 | 
173 |             for metadata in chunk_metadatas:
174 |                 # Unpack chunk start and end indices
175 |                 chunk_start, chunk_end, chunk_corpus_id = metadata['start_index'], metadata['end_index'], metadata['corpus_id']
176 | 
177 |                 if chunk_corpus_id != corpus_id:
178 |                     continue
179 |                 
180 |                 contains_highlight = False
181 | 
182 |                 for ref_obj in references:
183 |                     reference = ref_obj['content']
184 |                     ref_start, ref_end = int(ref_obj['start_index']), int(ref_obj['end_index'])
185 |                     # Calculate intersection between chunk and reference
186 |                     intersection = intersect_two_ranges((chunk_start, chunk_end), (ref_start, ref_end))
187 |                     
188 |                     if intersection is not None:
189 |                         contains_highlight = True
190 | 
191 |                         # Remove intersection from unused highlights
192 |                         unused_highlights = difference(unused_highlights, intersection)
193 | 
194 |                         # Add intersection to numerator sets
195 |                         numerator_sets = union_ranges([intersection] + numerator_sets)
196 |                         
197 |                         # Add chunk to denominator sets
198 |                         denominator_chunks_sets = union_ranges([(chunk_start, chunk_end)] + denominator_chunks_sets)
199 |             
200 |                 if contains_highlight:
201 |                     highlighted_chunk_count += 1
202 |                 
203 |             highlighted_chunks_count.append(highlighted_chunk_count)
204 | 
205 |             # Combine unused highlights and chunks for final denominator
206 |             denominator_sets = union_ranges(denominator_chunks_sets + unused_highlights)
207 |             
208 |             # Calculate ioc_score if there are numerator sets
209 |             if numerator_sets:
210 |                 ioc_score = sum_of_ranges(numerator_sets) / sum_of_ranges(denominator_sets)
211 |             
212 |             ioc_scores.append(ioc_score)
213 | 
214 |             recall_score = 1 - (sum_of_ranges(unused_highlights) / sum_of_ranges([(x['start_index'], x['end_index']) for x in references]))
215 |             recall_scores.append(recall_score)
216 | 
217 |         return ioc_scores, highlighted_chunks_count
218 | 
219 |     def _scores_from_dataset_and_retrievals(self, question_metadatas, highlighted_chunks_count):
220 |         iou_scores = []
221 |         recall_scores = []
222 |         precision_scores = []
223 |         for (index, row), highlighted_chunk_count, metadatas in zip(self.questions_df.iterrows(), highlighted_chunks_count, question_metadatas):
224 |             # Unpack question and references
225 |             # question, references = question_references
226 |             question = row['question']
227 |             references = row['references']
228 |             corpus_id = row['corpus_id']
229 | 
230 |             numerator_sets = []
231 |             denominator_chunks_sets = []
232 |             unused_highlights = [(x['start_index'], x['end_index']) for x in references]
233 | 
234 |             for metadata in metadatas[:highlighted_chunk_count]:
235 |                 # Unpack chunk start and end indices
236 |                 chunk_start, chunk_end, chunk_corpus_id = metadata['start_index'], metadata['end_index'], metadata['corpus_id']
237 | 
238 |                 if chunk_corpus_id != corpus_id:
239 |                     continue
240 |                 
241 |                 # for reference, ref_start, ref_end in references:
242 |                 for ref_obj in references:
243 |                     reference = ref_obj['content']
244 |                     ref_start, ref_end = int(ref_obj['start_index']), int(ref_obj['end_index'])
245 |                     
246 |                     # Calculate intersection between chunk and reference
247 |                     intersection = intersect_two_ranges((chunk_start, chunk_end), (ref_start, ref_end))
248 |                     
249 |                     if intersection is not None:
250 |                         # Remove intersection from unused highlights
251 |                         unused_highlights = difference(unused_highlights, intersection)
252 | 
253 |                         # Add intersection to numerator sets
254 |                         numerator_sets = union_ranges([intersection] + numerator_sets)
255 |                         
256 |                         # Add chunk to denominator sets
257 |                         denominator_chunks_sets = union_ranges([(chunk_start, chunk_end)] + denominator_chunks_sets)
258 |             
259 | 
260 |             if numerator_sets:
261 |                 numerator_value = sum_of_ranges(numerator_sets)
262 |             else:
263 |                 numerator_value = 0
264 | 
265 |             recall_denominator = sum_of_ranges([(x['start_index'], x['end_index']) for x in references])
266 |             precision_denominator = sum_of_ranges([(x['start_index'], x['end_index']) for x in metadatas[:highlighted_chunk_count]])
267 |             iou_denominator = precision_denominator + sum_of_ranges(unused_highlights)
268 | 
269 |             recall_score = numerator_value / recall_denominator
270 |             recall_scores.append(recall_score)
271 | 
272 |             precision_score = numerator_value / precision_denominator
273 |             precision_scores.append(precision_score)
274 | 
275 |             iou_score = numerator_value / iou_denominator
276 |             iou_scores.append(iou_score)
277 | 
278 |         return iou_scores, recall_scores, precision_scores
279 | 
280 |     def _chunker_to_collection(self, chunker, embedding_function, chroma_db_path:str = None, collection_name:str = None):
281 |         collection = None
282 | 
283 |         if chroma_db_path is not None:
284 |             try:
285 |                 chunk_client = chromadb.PersistentClient(path=chroma_db_path)
286 |                 collection = chunk_client.create_collection(collection_name, embedding_function=embedding_function, metadata={"hnsw:search_ef":50})
287 |                 print("Created collection: ", collection_name)
288 |             except Exception as e:
289 |                 print("Failed to create collection: ", e)
290 |                 pass
291 |                 # This shouldn't throw but for whatever reason, if it does we will default to below.
292 | 
293 |         collection_name = "auto_chunk"
294 |         if collection is None:
295 |             try:
296 |                 self.chroma_client.delete_collection(collection_name)
297 |             except ValueError as e:
298 |                 pass
299 |             collection = self.chroma_client.create_collection(collection_name, embedding_function=embedding_function, metadata={"hnsw:search_ef":50})
300 | 
301 |         docs, metas = self._get_chunks_and_metadata(chunker)
302 | 
303 |         BATCH_SIZE = 500
304 |         for i in range(0, len(docs), BATCH_SIZE):
305 |             batch_docs = docs[i:i+BATCH_SIZE]
306 |             batch_metas = metas[i:i+BATCH_SIZE]
307 |             batch_ids = [str(i) for i in range(i, i+len(batch_docs))]
308 |             collection.add(
309 |                 documents=batch_docs,
310 |                 metadatas=batch_metas,
311 |                 ids=batch_ids
312 |             )
313 | 
314 |             # print("Documents: ", batch_docs)
315 |             # print("Metadatas: ", batch_metas)
316 | 
317 |         return collection
318 |     
319 |     def _convert_question_references_to_json(self):
320 |         def safe_json_loads(row):
321 |             try:
322 |                 return json.loads(row)
323 |             except:
324 |                 pass
325 | 
326 |         self.questions_df['references'] = self.questions_df['references'].apply(safe_json_loads)
327 | 
328 | 
329 |     def run(self, chunker, embedding_function=None, retrieve:int = 5, db_to_save_chunks: str = None):
330 |         """
331 |         This function runs the evaluation over the provided chunker.
332 | 
333 |         Parameters:
334 |         chunker: The chunker to evaluate.
335 |         embedding_function: The embedding function to use for calculating the nearest neighbours during the retrieval step. If not provided, the default OpenAI embedding function is used.
336 |         retrieve: The number of chunks to retrieve per question. If set to -1, the function will retrieve the minimum number of chunks that contain excerpts for a given query. This is typically around 1 to 3 but can vary by question. By setting a specific value for retrieve, this number is fixed for all queries.
337 |         """
338 |         self._load_questions_df()
339 |         if embedding_function is None:
340 |             embedding_function = get_openai_embedding_function()
341 | 
342 |         collection = None
343 |         if db_to_save_chunks is not None:
344 |             chunk_size = chunker._chunk_size if hasattr(chunker, '_chunk_size') else "0"
345 |             chunk_overlap = chunker._chunk_overlap if hasattr(chunker, '_chunk_overlap') else "0"
346 |             embedding_function_name = embedding_function.__class__.__name__
347 |             if embedding_function_name == "SentenceTransformerEmbeddingFunction":
348 |                 embedding_function_name = "SentEmbFunc"
349 |             collection_name = embedding_function_name + '_' + chunker.__class__.__name__ + '_' + str(int(chunk_size)) + '_' + str(int(chunk_overlap))
350 |             try:
351 |                 chunk_client = chromadb.PersistentClient(path=db_to_save_chunks)
352 |                 collection = chunk_client.get_collection(collection_name, embedding_function=embedding_function)
353 |             except Exception as e:
354 |                 # Get collection throws if the collection does not exist. We will create it below if it does not exist.
355 |                 collection = self._chunker_to_collection(chunker, embedding_function, chroma_db_path=db_to_save_chunks, collection_name=collection_name)
356 | 
357 |         if collection is None:
358 |             collection = self._chunker_to_collection(chunker, embedding_function)
359 | 
360 |         question_collection = None
361 | 
362 |         if self.is_general:
363 |             with resources.as_file(resources.files('chunking_evaluation.evaluation_framework') / 'general_evaluation_data') as general_benchmark_path:
364 |                 questions_client = chromadb.PersistentClient(path=os.path.join(general_benchmark_path, 'questions_db'))
365 |                 if embedding_function.__class__.__name__ == "OpenAIEmbeddingFunction":
366 |                     try:
367 |                         if embedding_function._model_name == "text-embedding-3-large":
368 |                             question_collection = questions_client.get_collection("auto_questions_openai_large", embedding_function=embedding_function)
369 |                         elif embedding_function._model_name == "text-embedding-3-small":
370 |                             question_collection = questions_client.get_collection("auto_questions_openai_small", embedding_function=embedding_function)
371 |                     except Exception as e:
372 |                         print("Warning: Failed to use the frozen embeddings originally used in the paper. As a result, this package will now generate a new set of embeddings. The change should be minimal and only come from the noise floor of OpenAI's embedding function. The error: ", e)
373 |                 elif embedding_function.__class__.__name__ == "SentenceTransformerEmbeddingFunction":
374 |                     try:
375 |                         question_collection = questions_client.get_collection("auto_questions_sentence_transformer", embedding_function=embedding_function)
376 |                     except:
377 |                         print("Warning: Failed to use the frozen embeddings originally used in the paper. As a result, this package will now generate a new set of embeddings. The change should be minimal and only come from the noise floor of SentenceTransformer's embedding function. The error: ", e)
378 |         
379 |         if not self.is_general or question_collection is None:
380 |             # if self.is_general:
381 |             #     print("FAILED TO LOAD GENERAL EVALUATION")
382 |             try:
383 |                 self.chroma_client.delete_collection("auto_questions")
384 |             except ValueError as e:
385 |                 pass
386 |             question_collection = self.chroma_client.create_collection("auto_questions", embedding_function=embedding_function, metadata={"hnsw:search_ef":50})
387 |             question_collection.add(
388 |                 documents=self.questions_df['question'].tolist(),
389 |                 metadatas=[{"corpus_id": x} for x in self.questions_df['corpus_id'].tolist()],
390 |                 ids=[str(i) for i in self.questions_df.index]
391 |             )
392 |         
393 |         question_db = question_collection.get(include=['embeddings'])
394 | 
395 |         # Convert ids to integers for sorting
396 |         question_db['ids'] = [int(id) for id in question_db['ids']]
397 |         # Sort both ids and embeddings based on ids
398 |         _, sorted_embeddings = zip(*sorted(zip(question_db['ids'], question_db['embeddings'])))
399 | 
400 |         # Sort questions_df in ascending order
401 |         self.questions_df = self.questions_df.sort_index()
402 | 
403 |         brute_iou_scores, highlighted_chunks_count = self._full_precision_score(collection.get()['metadatas'])
404 | 
405 |         if retrieve == -1:
406 |             maximum_n = min(20, max(highlighted_chunks_count))
407 |         else:
408 |             highlighted_chunks_count = [retrieve] * len(highlighted_chunks_count)
409 |             maximum_n = retrieve
410 | 
411 |         # arr_bytes = np.array(list(sorted_embeddings)).tobytes()
412 |         # print("Hash: ", hashlib.md5(arr_bytes).hexdigest())
413 | 
414 |         # Retrieve the documents based on sorted embeddings
415 |         retrievals = collection.query(query_embeddings=list(sorted_embeddings), n_results=maximum_n)
416 | 
417 |         iou_scores, recall_scores, precision_scores = self._scores_from_dataset_and_retrievals(retrievals['metadatas'], highlighted_chunks_count)
418 | 
419 | 
420 |         corpora_scores = {
421 | 
422 |         }
423 |         for index, row in self.questions_df.iterrows():
424 |             if row['corpus_id'] not in corpora_scores:
425 |                 corpora_scores[row['corpus_id']] = {
426 |                     "precision_omega_scores": [],
427 |                     "iou_scores": [],
428 |                     "recall_scores": [],
429 |                     "precision_scores": []
430 |                 }
431 |             
432 |             corpora_scores[row['corpus_id']]['precision_omega_scores'].append(brute_iou_scores[index])
433 |             corpora_scores[row['corpus_id']]['iou_scores'].append(iou_scores[index])
434 |             corpora_scores[row['corpus_id']]['recall_scores'].append(recall_scores[index])
435 |             corpora_scores[row['corpus_id']]['precision_scores'].append(precision_scores[index])
436 | 
437 | 
438 |         brute_iou_mean = np.mean(brute_iou_scores)
439 |         brute_iou_std = np.std(brute_iou_scores)
440 | 
441 |         recall_mean = np.mean(recall_scores)
442 |         recall_std = np.std(recall_scores)
443 | 
444 |         iou_mean = np.mean(iou_scores)
445 |         iou_std = np.std(iou_scores)
446 | 
447 |         precision_mean = np.mean(precision_scores)
448 |         precision_std = np.std(precision_scores)
449 | 
450 |         # print("Recall scores: ", recall_scores)
451 |         # print("Precision scores: ", precision_scores)
452 |         # print("Recall Mean: ", recall_mean)
453 |         # print("Precision Mean: ", precision_mean)
454 | 
455 |         return {
456 |             "corpora_scores": corpora_scores,
457 |             "iou_mean": iou_mean,
458 |             "iou_std": iou_std,
459 |             "recall_mean": recall_mean,
460 |             "recall_std": recall_std,
461 |             "precision_omega_mean": brute_iou_mean,
462 |             "precision_omega_std": brute_iou_std,
463 |             "precision_mean": precision_mean,
464 |             "precision_std": precision_std
465 |         }
466 | 


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation.py:
--------------------------------------------------------------------------------
 1 | from .base_evaluation import BaseEvaluation
 2 | from importlib import resources
 3 | from pathlib import Path
 4 | 
 5 | class GeneralEvaluation(BaseEvaluation):
 6 |     def __init__(self, chroma_db_path=None):
 7 |         with resources.as_file(resources.files('chunking_evaluation.evaluation_framework') / 'general_evaluation_data') as general_benchmark_path:
 8 |             self.general_benchmark_path = general_benchmark_path
 9 |             questions_df_path = self.general_benchmark_path / 'questions_df.csv'
10 | 
11 |             corpora_folder_path = self.general_benchmark_path / 'corpora'
12 |             corpora_filenames = [f for f in corpora_folder_path.iterdir() if f.is_file()]
13 | 
14 |             corpora_id_paths = {
15 |                 f.stem: str(f) for f in corpora_filenames
16 |             }
17 | 
18 |             super().__init__(str(questions_df_path), chroma_db_path=chroma_db_path, corpora_id_paths=corpora_id_paths)
19 | 
20 |             self.is_general = True
21 | 


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/chatlogs.md:
--------------------------------------------------------------------------------
1 | [{'content': "These instructions apply to section-based themes (Responsive 6.0+, Retina 4.0+, Parallax 3.0+ Turbo 2.0+, Mobilia 5.0+). What theme version am I using?\nOn your Collections pages & Featured Collections sections, you can easily show the secondary image of a product on hover by enabling one of the theme's built-in settings!\nYour Collection pages & Featured Collections sections will now display the secondary product image just by hovering over that product image thumbnail.\nDoes this feature apply to all sections of the theme or just specific ones as listed in the text material?", 'role': 'user'}, {'content': 'This feature only applies to Collection pages and Featured Collections sections of the section-based themes listed in the text material.', 'role': 'assistant'}, {'content': 'Can you guide me through the process of enabling the secondary image hover feature on my Collection pages and Featured Collections sections?', 'role': 'user'}, {'content': "Sure, here are the steps to enable the secondary image hover feature on your Collection pages and Featured Collections sections:\n\n1. Log in to your Shopify account and go to your Online Store.\n2. Click on Customize theme for the section-based theme you are using.\n3. Navigate to the Collection pages or Featured Collections section where you want to enable the secondary image hover feature.\n4. With the section open, click on the section's setting (gear) icon in the top-left corner.\n5. In the settings panel that appears, look for an option labeled 'Image display' or 'Image hover'.\n6. If available, select 'Show secondary image on hover'.\n7. Save the changes and preview the Collection/Featured Collection page to see the effect.\n\nIf you're having trouble finding the setting, the best thing to do is refer to your theme's documentation, since the location and labeling of settings can vary between themes.", 'role': 'assistant'}, {'content': 'Can you provide me with a link to the documentation for my theme?', 'role': 'user'}, {'content': "I don't have access to your store's theme information. However, you can usually find the documentation for your theme by going to the shopify theme store, finding your theme and clicking on the 'support' link located in the bottom right corner of the page. Alternatively, you can do a google search for the name of your theme followed by 'documentation' or 'user guide'.", 'role': 'assistant'}, {'content': 'Can you confirm if this feature also works for the Quick Shop section of my theme?', 'role': 'user'}, {'content': "The secondary image hover feature may or may not work for your Quick Shop section, depending on the configuration of your theme. Some themes include this feature in the Quick Shop section by default, while others may require additional customization. To check if this feature is available for the Quick Shop section of your theme, follow these steps:\n\n1. Go to the Quick Shop section where you would like to enable the feature. 2. Click on the Quick Shop settings icon (gear icon) and look for 'Image display' or 'Image hover'. 3. If available, select 'Show secondary image on hover'. 4. Save the changes. If this option is not available in your Quick Shop section settings, you may need to reach out to your theme developer for assistance with customizing your Quick Shop section to include this feature.", 'role': 'assistant'}]
2 | 
3 | [{'content': 'Which famous landmarks should I visit in London, beyond the usual ones?', 'role': 'user'}, {'content': "1. Leadenhall Market - a beautiful indoor market with stunning Victorian architecture, also used as a filming location in the Harry Potter films.\n\n2. St. Dunstan in the East - a ruined church in the middle of the city that has been turned into a beautiful public garden.\n\n3. The Monument - a 202-foot-tall column commemorating the Great Fire of London, with a staircase leading to a viewing platform offering great views of the city.\n\n4. The Camden Town Markets - an eclectic collection of markets offering food, fashion, and vintage items, plus live music and street performers.\n\n5. Novelist's House - the former home of Charles Dickens, now a museum dedicated to his life and works.\n\n6. The Old Operating Theatre - a museum housed in the oldest surviving operating theatre in Europe, with exhibits on the history of surgery and medical practices.\n\n7. The Churchill War Rooms - an underground complex used as a British government command centre during World War II, now a museum dedicated to Winston Churchill and his leadership during the war.\n\n8. The Brunel Museum - a museum dedicated to the life and works of engineer Isambard Kingdom Brunel, with exhibitions on his revolutionary designs for bridges, ships, and tunnels.\n\n9. The Clink Prison Museum - a museum in the former site of the notorious medieval prison, with exhibits on the history of crime and punishment in London.\n\n10. The Royal Observatory Greenwich - an observatory and museum dedicated to astronomy and navigation, with exhibits on the history of timekeeping and space exploration.", 'role': 'assistant'}, {'content': "Hmm, those are interesting suggestions, but I've already seen so many touristy landmarks in London. Is there something more offbeat that you can recommend? Something that locals might know about?", 'role': 'user'}, {'content': "Absolutely! Here are some offbeat and lesser-known things to see in London that locals might recommend:\n\n1. God's Own Junkyard - a neon wonderland filled with vintage and new neon signs.\n\n2. Hornsey Town Hall Arts Centre - a grade II listed building with a rich history and home to many art exhibitions and events.\n\n3. Clapham Grand - a historic Victorian theatre that has been transformed into a music venue, cinema and a comedy club.\n\n4. Gordon Museum of Pathology - the pathology museum of King's College in London that holds a collection of over 8,000 medical specimens.\n\n5. John Soane's Museum - the former home of wealthy architect John Soane is a museum with eclectic collections of paintings, sculptures, and curiosities.\n\n6. The Viktor Wynd Museum of Curiosities - a small museum filled with strange and unusual objects, taxidermy, and rare books.\n\n7. The Cross Bones Graveyard - a medieval graveyard that became known as an outcast cemetery with a history connected to the sex industry, now a memorial garden.\n\n8. Highgate Cemetery - a beautiful Victorian graveyard known for its Gothic architecture and where many famous people, including Karl Marx, are buried.\n\n9. Dalston Eastern Curve Garden - a beautiful community garden near the Dalston Junction station that is run by volunteers and has regular events.\n\n10. The Geffrye Museum of the Home - a museum that explores the history of domestic interiors in England, with a beautiful garden and regular market events.\n\nThere are many other hidden gems in London, and a quick Google search for ‘offbeat things in London’ will bring up many blogs and resources with more options.", 'role': 'assistant'}, {'content': "Wow, these offbeat suggestions are really interesting. I had no idea there were so many unique things to see in London. Thanks for the recommendations, you're much more helpful than I expected. Can you tell me more about the Viktor Wynd Museum of Curiosities? It sounds like my kind of place.", 'role': 'user'}, {'content': "Sure! The Viktor Wynd Museum of Curiosities, Fine Art & Natural History is a small museum located in Hackney, East London. The museum, founded by collector Viktor Wynd is a place where you can find taxidermy, books, anatomical models, and other curiosities. The museum is famous for its collection of shrunken heads, dodo bones, and the famous mermaid. It is also notable for its dimly lit, atmospheric decor, and its bar which serves cocktails and allows visitors to take drinks into the museum. The museum is open daily except for Mondays and Tuesdays, and it also frequently hosts events and exhibitions, so be sure to check the museum's website to see what's on.", 'role': 'assistant'}, {'content': "Woah, the Viktor Wynd Museum sounds like a wacky and fun place to visit. I don't think I've ever seen a shrunken head before. I wonder what kind of cocktails they serve there... Maybe I can convince some friends to check it out with me!", 'role': 'user'}, {'content': 'Yes, the museum is definitely one of the more unusual places to visit in London! The cocktail menu at Viktor Wynd\'s bar is just as eclectic as the museum\'s collection, featuring brightly colored cocktails inspired by the exhibits, such as the "Shrunken Head" or "Mermaid Tail" cocktails. The museum and bar are on the smaller side, so it would make for a unique and intimate outing with friends. If you\'re looking for something different and memorable to do in London, the Viktor Wynd Museum is definitely worth checking out!', 'role': 'assistant'}, {'content': "Whoa, a bar in a museum?! That's wild. I'm definitely going to have to try one of those cocktails. Do you think they have anything with a shrunken head as a garnish? That would be a trip.", 'role': 'user'}, {'content': "I am not sure if the Viktor Wynd Museum of Curiosities serves cocktails with an actual shrunken head as a garnish. However, they might have some creative cocktails that incorporate a shrunken head replica or something similar as an element in the drink. Nevertheless, the bar offers a unique setting and a cocktail menu that includes some unusual ingredients so it's definitely worth checking out! It's likely to be an eccentric and memorable experience, and I'm sure you'll have a great time there with your friends.", 'role': 'assistant'}, {'content': 'I can\'t wait to see that mermaid at the Viktor Wynd Museum. I wonder if it\'s real or just a replica. Either way, it\'s bound to be a fascinating item to behold. And I\'ll make sure to order that "Mermaid Tail" cocktail while I\'m there!', 'role': 'user'}, {'content': 'Yes, the mermaid at the Viktor Wynd Museum of Curiosities is definitely one of its most famous exhibits. It\'s actually a recreation made from the skeleton of a monkey and the tail of a fish, so it\'s not a real mermaid of course, but it\'s still an intriguing piece. The museum has many other curiosities to discover as well, and the atmosphere there is unlike anything you\'ll find in other museums. And don\'t forget to try that "Mermaid Tail" cocktail - it\'s the perfect accompaniment to the museum\'s quirky vibe!', 'role': 'assistant'}]
4 | 
5 | [{'content': "Write a comprehensive blog post of at least 1000 words about the top 10 most eco-friendly cities in the world and their renewable energy initiatives. Use a formal and informative tone, and include statistics, case studies, and expert opinions to support your claims. Make sure to cover various aspects of sustainability, such as public transportation, waste management, green buildings, and carbon emissions reduction. Additionally, provide specific examples of innovative renewable energy projects implemented in each city and assess their impact on the environment and the local community. Provide credible sources and links to relevant articles, reports, and websites to enhance the article's credibility and appeal to the readers.", 'role': 'user'}, {'content': "In the recent years, there has been a growing awareness of the necessity of developing sustainable cities to combat the detrimental effects of climate change. Cities are responsible for 75% of the world's carbon emissions, but they also have the potential to be leaders in renewable energy and green initiatives. Many cities are taking action to reduce their carbon footprint, and ten cities have gone above and beyond in terms of developing eco-friendly initiatives. In this post, we will take a look at the top ten most eco-friendly cities in the world and their renewable energy initiatives.\n\n1. Reykjavik, Iceland\nReykjavik is the capital of Iceland, and it is considered one of the most environmentally friendly cities in the world. The city is powered almost entirely by renewable energy, thanks to Iceland's abundant geothermal and hydroelectric power sources. About 90% of Reykjavik's population is connected to district heating, which is generated through geothermal power. The city also has an extensive public transportation system that relies on electric buses and taxis.\n\nOne of the most innovative renewable energy projects in Reykjavik is the CarbFix project, which involves capturing carbon dioxide emissions and storing them in underground basaltic rock formations. This initiative not only reduces carbon emissions, but it also creates a new type of renewable geothermal energy source. This project has reduced Reykjavik’s carbon footprint significantly and is gaining recognition worldwide.\n\n2. Copenhagen, Denmark\nCopenhagen is another city that has earned its reputation as a sustainable city. The city's ambitious goal is to be carbon-neutral by 2025, and it has already made significant progress in reducing its carbon emissions. Copenhagen has an extensive bicycle infrastructure, and biking is the most popular mode of transportation in the city. The city also has a robust public transportation system, with most buses running on biogas or electricity.\n\nCopenhagen is also home to several innovative energy-generating projects. The city’s Amager Bakke waste-to-energy plant is an excellent example of how to tackle waste management while generating clean energy. The plant incinerates waste, and the heat generated is converted into electricity, which is then supplied to the grid. The plant also features a recreational space with a ski slope, hiking trails, and even a climbing wall, making it more than just an energy-generating facility.\n\n3. Stockholm, Sweden\nStockholm is another city that has made significant progress in becoming more environmentally friendly. The city has a 100% renewable energy target by 2040, and it is well on its way towards achieving that goal. Like Copenhagen, Stockholm has an extensive bicycle infrastructure, with bikes being the most popular mode of transportation. The city also has an efficient public transportation system, which runs on renewable energy.\n\nStockholm has several innovative projects in the renewable energy sector. One such project is the Värtahamnen district cooling plant, which uses seawater to cool buildings. The cold water is pumped from deep-sea depth and is circulated through the city's cooling system. This system is significantly more energy-efficient than traditional air conditioning methods, and it has reduced Stockholm's carbon emissions significantly.\n\n4. Vancouver, Canada\nVancouver is a city known for its natural beauty, and it is also known for its commitment to sustainability. The city aims to be the greenest city in the world by 2020, and it has made significant progress towards that goal. The city has an impressive public transportation system, which includes buses, trains, and ferries, all of which run on renewable energy.\n\nVancouver has been investing in renewable energy initiatives and the green economy, creating new jobs while reducing carbon emissions. It is home to Canada’s largest biomass energy plant, which converts wood waste and sawdust into electricity, reducing the use of fossil fuels. Additionally, the city has launched an ambitious program called Renewable City Strategy, which aims to see the city run on 100% renewable energy before 2050.\n\n5. Portland, USA\nPortland is a city that has long been known for its eco-friendliness, and it has made significant strides in reducing its carbon emissions. The city aims to reduce its carbon emissions by 80% by 2050, and it has implemented several sustainable initiatives. Portland's public transportation system is extensive, and the city has invested heavily in bike lanes and bike-sharing programs, making cycling a popular option for getting around.\n\n Portland is also home to several innovative energy projects. One such project is the Covanta Portland waste-to-energy facility, which converts waste into clean energy. The project generates enough electricity to power more than 27,000 households and reduces the amount of waste sent to landfills significantly.\n\n6. San Francisco, USA\nSan Francisco is a city that has always been at the forefront of sustainability initiatives, and it aims to be a zero-waste city by 2020. The city has an impressive public transportation system, which includes buses, trains, and cable cars, all of which are powered by renewable energy sources. Additionally, San Francisco is home to the largest municipal solar installation in", 'role': 'assistant'}, {'content': 'This is a really informative post about the top eco-friendly cities and their renewable energy initiatives. I would like to know more about their waste management strategies since it is also an important aspect of sustainability. Can you add some more information on that?', 'role': 'user'}, {'content': 'Absolutely! Waste management is an important aspect of sustainability as it helps cities reduce their carbon footprint and ensure that resources are not wasted. Here is some information on the waste management strategies that these cities have implemented:\n\n7. Oslo, Norway\nOslo is the capital of Norway and is known for its impressive waste management program. The city is well on its way to becoming a zero-waste city, with over 60% of its waste being recycled or reused. Oslo\'s waste management program involves separating organic waste into biowaste, which is then converted into biogas and biofertilizer. Additionally, the city has implemented a "pay as you throw" policy, which has encouraged residents to reduce their waste output.\n\n8. Zurich, Switzerland\nZurich has a strong reputation for being environmentally friendly, and its waste management program is no exception. The city has a Zero Waste Strategy, which aims to have no waste sent to landfills by 2025. Zurich has an extensive recycling program, and residents are encouraged to separate their waste into different categories, including paper, glass, metal, and plastic. Organic waste is also collected separately and composted, which is then used to fertilize public parks and gardens.\n\n9. Adelaide, Australia\nAdelaide is a city that has made significant progress in reducing its carbon emissions, and its waste management program is contributing to that goal. The city has implemented a system where household waste is sorted into three different bins: one for organic waste, one for recyclable materials, and one for general waste. The organic waste is processed into compost, which is then used to fertilize public parks and gardens, while recyclable materials are sent to recycling centers.\n\n10. Singapore\nSingapore is a city-state that has made waves with its innovative waste management program. The city is known for its high population density, which makes waste management a significant challenge. Singapore has implemented a "3R" approach - reduce, reuse, and recycle - to reduce its waste output. The city also has a waste-to-energy plant, which converts waste into electricity that is then supplied to the grid.\n\nIn conclusion, all the top ten most eco-friendly cities in the world have implemented various strategies for waste management. By focusing on sustainable waste management strategies, these cities have been able to reduce their carbon emissions, protect the environment, and promote sustainability.', 'role': 'assistant'}, {'content': 'Thank you for the additional information about waste management strategies in these eco-friendly cities. I was also wondering if you could provide more details on their green building initiatives, such as the use of sustainable materials and designs to reduce energy consumption and carbon emissions.', 'role': 'user'}, {'content': 'Certainly! Green buildings are an important aspect of sustainability, as they can significantly reduce energy consumption and help lower carbon emissions. Here are some examples of green building initiatives in these eco-friendly cities:\n\n1. Reykjavik, Iceland\nReykjavik has implemented several programs to promote sustainable building practices. The city has a "Green Steps" program, which encourages residents and businesses to adopt sustainable practices. Reykjavik has also mandated that all new buildings must meet strict energy efficiency standards. Buildings are designed to maximize natural light and use sustainable building materials wherever possible.\n\n2. Copenhagen, Denmark\nCopenhagen has implemented several green building initiatives, including the use of sustainable materials and designs to reduce energy consumption. The city has a program called "Green Roofs and Walls," which promotes the use of vegetation on buildings to improve energy efficiency and reduce the urban heat island effect. Additionally, Copenhagen has implemented a district heating program, which uses waste heat from power plants to heat buildings, reducing energy consumption.\n\n3. Stockholm, Sweden\nStockholm has implemented several programs to promote green building practices, including the use of sustainable materials and designs. The city has adopted strict energy efficiency standards for new buildings, and it promotes the use of green roofs and walls to improve energy efficiency and reduce the urban heat island effect. Stockholm also encourages the use of solar panels and other renewable energy sources in buildings.\n\n4. Vancouver, Canada\nVancouver has implemented several programs to promote sustainable building practices, including the use of sustainable materials and designs. The city has adopted strict green building codes, and it encourages the use of energy-efficient windows, insulation, and lighting. Vancouver also has a program that encourages the use of green roofs and walls to improve energy efficiency and reduce the urban heat island effect.\n\n5. Portland, USA\nPortland is known for its sustainable building practices, and the city has implemented several green building initiatives. The city has implemented a green building policy, which encourages the use of sustainable materials and designs to reduce energy consumption. Additionally, Portland has a program called "Ecoroofs," which promotes the use of vegetation on roofs to improve energy efficiency and reduce the urban heat island effect.\n\n6. San Francisco, USA\nSan Francisco has implemented several programs to promote green building practices, including the use of sustainable materials and designs. The city has adopted strict energy efficiency standards for new buildings, and it promotes the use of green roofs and walls to improve energy efficiency and reduce the urban heat island effect. Additionally, San Francisco has a program called Zero Net Energy Buildings, which encourages the construction of buildings that generate as much energy as they use.\n\nIn conclusion, all the top ten most eco-friendly cities in the world have implemented programs to promote sustainable building practices. By using sustainable materials and designs, these cities have significantly reduced energy consumption and lowered their carbon emissions.', 'role': 'assistant'}, {'content': "I'm impressed by the efforts these eco-friendly cities have made to promote sustainability. However, I'm curious about the challenges these cities might have faced during the implementation of their renewable energy and sustainability initiatives. Can you provide me with more details on those challenges and how the cities overcame them?", 'role': 'user'}, {'content': "Certainly! While these ecologically friendly cities have been successful in implementing renewable energy and sustainability initiatives, they have also faced their fair share of challenges. Here are some examples of challenges faced by these cities and how they overcame them:\n\n1. Reykjavik, Iceland\nDespite its extensive use of renewable energy, Reykjavik faced the challenge of balancing its energy-intensive industries with sustainability goals. The city overcame this challenge by implementing a carbon capture project, keeping pace with the emission-intensive industries, while still reducing the carbon footprint. The CarbFix project, which I described earlier, proved to be a game-changer, making Reykjavik a city that effectively captured and utilized carbon dioxide. 2. Copenhagen, Denmark\nCopenhagen faced the challenge of encouraging car drivers to adopt more eco-friendly modes of transportation. The city overcame this challenge by incentivizing bike commuting, providing public transportations, and by raising awareness on the positive impacts of sustainable travel. The city was able to increase the number of bike riders to account for approximately 50% of commuters, contributing to the significant reduction of carbon emissions. 3. Stockholm, Sweden\nStockholm faced the challenge of finding a way to sustain green infrastructure projects despite the cost, particularly in a city where both land and property values are relatively high. The city overcame this challenge by cooperating with businesses and local community organizations to help defray the costs of infrastructure projects. By doing so, Stockholm effectively secured long-term funding for infrastructure projects that are vital for sustainability. 4. Vancouver, Canada\nVancouver faced the challenge of reducing carbon emissions while still maintaining economic growth. The city overcame this challenge by promoting the development of sustainable industries, while at the same time fostering an eco-friendly environment for businesses. By doing so, local businesses were encouraged to adopt more sustainable practices and were recognized for their efforts in promoting sustainability. 5. Portland, USA\nPortland faced the challenge of dealing with a lack of federal support for sustainability initiatives, particularly during the early stages of their implementation. The city overcame this challenge by forging partnerships with local businesses and organizations to promote sustainability initiatives. By embracing public-private partnerships and working closely with the city's commercial sector, Portland was able to overcome a lack of federal support while still promoting sustainable practices. 6. San Francisco, USA\nSan Francisco faced the challenge of competing for resources with other nearby communities, given its geographically small size. The city overcame this challenge by adopting a collaborative approach with other nearby communities, encouraging them to adopt eco-friendly practices. By doing this, San Francisco was able to leverage the resources of neighboring communities to promote sustainability initiatives throughout the region. In conclusion, while these eco-friendly cities have faced their share of challenges in promoting sustainability, they have successfully overcome them. Through collaborating with businesses and community organizations, promoting sustainable practices, and embracing partnerships, these cities have successfully implemented different renewable energy and green initiatives, promoting a more sustainable future.", 'role': 'assistant'}]
6 | 
7 | [{'content': 'De León, previewing the speech he will give today, said he will highlight his Senate Bill 535, which directs a quarter of the proceeds from the Greenhouse Gas Reduction Fund to projects that benefit disadvantaged communities.\nOn Thursday, de León nodded approvingly as a string of leading scientists and religious leaders gathered for hours of weedy policy discussions on the impacts of climate change, including gloomy predictions on mortality attributable to air pollution.\nSEIU HEADS TO THE BAR: Employees of the State Bar of California represented by SEIU are planning a picket line today at the bar building in Los Angeles to protest the latest contract offer. What is the reason for SEIU employees planning a picket line at the State Bar of California building in Los Angeles?', 'role': 'user'}, {'content': 'The reason for SEIU employees planning a picket line at the State Bar of California building in Los Angeles is to protest the latest contract offer.', 'role': 'assistant'}, {'content': 'Can you provide more details about Senate Bill 535 and how it will benefit disadvantaged communities?', 'role': 'user'}, {'content': "Sure, Senate Bill 535 is a California law that directs at least 25% of the money generated through the state's Greenhouse Gas Reduction Fund towards projects that benefit disadvantaged communities. The bill is designed to ensure that the benefits of California's climate policies, which include cap-and-trade, are felt most heavily by communities that have been disproportionately impacted by pollution and environmental degradation.\n\nThe funds generated through the bill are invested in projects that promote clean energy, reduce pollution, and support sustainable communities. This includes initiatives like public transportation, energy efficiency retrofits for low-income homes, and urban forestry projects.\n\nProponents of the bill argue that it helps to level the playing field for low-income communities and communities of color, which have historically faced greater exposure to environmental hazards and pollution. By directing funds towards these communities, the bill aims to create more equitable and sustainable outcomes for all Californians.", 'role': 'assistant'}]
8 | 
9 | [{'content': 'Write an essay that evaluates the positive and negative influence of social media on personal relationships, citing specific examples and research to support your claims. Analyze the ways in which social media usage affects communication patterns, trust levels, and emotional connections in romantic partnerships, friendships, and family relationships. Consider the role of social comparison, cyberbullying, and privacy concerns in shaping the impact of social media use on personal relationships. Finally, provide recommendations for how individuals can cultivate healthy and meaningful relationships in the age of social media.', 'role': 'user'}, {'content': 'Social media is a ubiquitous presence in modern society and has revolutionized the way people communicate with one another. While there are undoubtedly positive aspects of social media usage, there are also significant negative consequences that impact personal relationships. In this essay, I will evaluate the positive and negative influence of social media on personal relationships, citing specific examples and research to support my claims. I will analyze the ways in which social media usage affects communication patterns, trust levels, and emotional connections in romantic partnerships, friendships, and family relationships. I will also consider the role that social comparison, cyberbullying, and privacy concerns play in shaping the impact of social media use on personal relationships. Finally, I will provide recommendations for how individuals can cultivate healthy and meaningful relationships in the age of social media.\n\nOne of the positive aspects of social media is that it allows people to communicate with one another more easily and frequently than ever before. Individuals can maintain long-distance relationships with friends and family members, interact with people who share similar interests or backgrounds, and cultivate new friendships with people from all over the world. Social media platforms like Facebook and Instagram provide an easy way for users to share their lives with others, which can enhance emotional connections and feelings of intimacy in relationships. For example, a recent study found that social media use was positively associated with self-disclosure, which is a key factor in building and maintaining close relationships (Chen & Lee, 2013).\n\nHowever, social media also has negative consequences that can impact personal relationships. One of the most significant concerns is the impact of social comparison, which can lead to feelings of envy, insecurity, and dissatisfaction. Studies have shown that frequent social media use is associated with higher levels of depressive symptoms, and that exposure to idealized and curated images of others can negatively impact self-esteem (Vogel et al., 2014). Additionally, social media can provide a platform for cyberbullying, which can cause emotional distress, anxiety, and even suicide. Cyberbullying can come from strangers, but it can also come from people who are known to the victim, such as peers or romantic partners. A recent survey found that one in four teenagers has experienced cyberbullying, and that it can lead to decreased self-esteem and social isolation (Hinduja & Patchin, 2018).\n\nMoreover, the overreliance on social media for communication can also have negative effects on personal relationships. In romantic partnerships, excessive social media use has been associated with lower levels of trust and relationship satisfaction (Muise et al., 2009). Social media usage can lead to misinterpretation of messages, and the lack of nonverbal cues can make it difficult to understand the true meaning of a conversation. In families, the overuse of social media can hinder face-to-face communication and physical interaction, which are essential components of healthy relationships.\n\nBased on these findings, individuals should take steps to limit the negative impact of social media on their personal relationships. One way to do so is to cultivate meaningful offline interactions with friends and family members. Taking a break from social media can also be beneficial, particularly if it leads to more in-person communication and interaction. Additionally, individuals can be mindful of the content they post on social media, and strive to create realistic and authentic portrayals of their lives. Finally, it is essential to educate oneself on the dangers of cyberbullying and to take active steps to prevent it.\n\nIn conclusion, social media has both positive and negative influences on personal relationships. While social media can enhance communication and provide a platform for building new relationships, it can also result in social comparison, cyberbullying, and a lack of trust in personal relationships. By taking steps to limit the negative effects of social media exposure and cultivating healthy, in-person relationships, individuals can create meaningful and fulfilling connections with others while still enjoying the benefits of social media use.', 'role': 'assistant'}, {'content': 'Can you add more information on the impact of social media on family relationships? Also, can you suggest some practical tips on how to balance social media use and face-to-face communication with our loved ones?', 'role': 'user'}, {'content': 'Sure, here are some additional details on the impact of social media on family relationships. Social media has transformed the way we interact with our families, with both positive and negative implications. While it can provide a platform to stay in touch with family members living far away or separated by distance, it can also hinder close relationships by replacing face-to-face communication with online interactions. Social media has been found to decrease the amount of quality time families spend together physically, which can affect the emotional connections between family members. For example, studies have found that children who spend more time on social media have lower relationship satisfaction with their parents, while parents who spend more time on social media have lower relationship satisfaction with their children (Kross et al., 2020). Moreover, the content shared on social media can sometimes lead to misunderstandings and conflicts in family relationships. Parents may share pictures or posts related to their child without their consent, which can result in embarrassment or frustration. Similarly, siblings may share information about their siblings without their permission, which can strain relationships between siblings. That being said, there are practical tips to balance social media use and face-to-face communication with our loved ones. The first step is to set boundaries and establish times when social media will be avoided or limited. For example, putting aside an hour or two a day to disconnect from social media and spend time with family can make a big difference in fostering healthy relationships. Family members can also establish a "no-phone" policy during meal times or during important family events such as holidays or vacations. Seeking out opportunities to engage in shared activities can also strengthen family bonds. Activities such as cooking, hiking, or game nights can create opportunities for meaningful interactions and laughter, which can foster stronger emotional connections between family members. Finally, finding ways to use social media to enhance family relationships is important. Tools such as video chat or group messaging platforms can allow family members to stay in touch and share their lives with one another while also maintaining meaningful connections. Overall, balancing social media use with face-to-face communication and shared activities can help individuals cultivate healthy and meaningful family relationships while still enjoying the benefits of social media use.', 'role': 'assistant'}, {'content': 'Can you give some examples of "shared activities" that families can do together to strengthen their relationships?', 'role': 'user'}, {'content': "Sure! Here are a few examples of shared activities that families can do together to strengthen their relationships:\n\n1. Cooking and baking: Preparing meals together is not only a practical activity that can help families save time and money, but it also provides an opportunity for sharing stories, tips, and traditions. Cooking and baking can be a fun way for family members to connect and create memories while enjoying tasty treats. 2. Outdoor activities: Hiking, cycling, or camping are all great ways to get outdoors and enjoy nature. Family members can take turns planning and organizing trips or activities, and encourage each other to try new things. Outdoor activities also provide an opportunity to disconnect from technology and spend quality time together. 3. Game nights: Board games, card games, or video games can be a fun way for family members to engage in friendly competition and create shared experiences. Game nights can be an opportunity to learn new games or revisit old favorites, and can be enjoyed by family members of all ages. 4. Movie nights: Watching movies together allows family members to bond over shared interests and experiences. Family members can take turns selecting movies, and share their thoughts and opinions during or after the screening. Movie nights can also be an opportunity to introduce younger family members to classic or new movies that they may not have seen before. 5. Volunteer work: Participating in volunteer work as a family can provide a sense of purpose and fulfillment, while also strengthening relationships. Family members can choose a cause or organization that they feel passionate about, and work together towards a common goal. Volunteering is also a great way to instill values of empathy and compassion in younger family members. Overall, shared activities should be fun, engaging, and flexible enough to accommodate everyone's interests and schedules. By finding ways to spend time together as a family, and creating meaningful connections, individuals can create a positive and healthy family environment.", 'role': 'assistant'}, {'content': 'Can you give some more examples of outdoor activities that families can enjoy together?', 'role': 'user'}, {'content': 'Sure, here are some more examples of outdoor activities that families can enjoy together:\n\n1. Picnics: Going on a picnic is a great way to spend time together outdoors while enjoying a meal or snack. Families can pack a basket of their favorite foods 


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/state_of_the_union.md:
--------------------------------------------------------------------------------
  1 | Good evening. Good evening. If I were smart, I’d go home now.
  2 | 
  3 | Mr. Speaker, Madam Vice President, members of Congress, my fellow Americans.
  4 | 
  5 | In January 1941, Franklin Roosevelt came to this chamber to speak to the nation. And he said, “I address you at a moment unprecedented in the history of the Union”. Hitler was on the march. War was raging in Europe.
  6 | 
  7 | President Roosevelt’s purpose was to wake up Congress and alert the American people that this was no ordinary time. Freedom and democracy were under assault in the world.
  8 | 
  9 | Tonight, I come to the same chamber to address the nation. Now it’s we who face an unprecedented moment in the history of the Union.
 10 | 
 11 | And, yes, my purpose tonight is to wake up the Congress and alert the American people that this is no ordinary moment either. Not since President Lincoln and the Civil War have freedom and democracy been under assault at home as they are today.
 12 | 
 13 | What makes our moment rare is that freedom and democracy are under attack at — both at home and overseas at the very same time.
 14 | 
 15 | Overseas, Putin of Russia is on the march, invading Ukraine and sowing chaos throughout Europe and beyond.
 16 | 
 17 | If anybody in this room thinks Putin will stop at Ukraine, I assure you: He will not.
 18 | 
 19 | But Ukraine — Ukraine can stop Putin. Ukraine can stop Putin if we stand with Ukraine and provide the weapons that it needs to defend itself.
 20 | 
 21 | That is all — that is all Ukraine is asking. They’re not asking for American soldiers. In fact, there are no American soldiers at war in Ukraine, and I’m determined to keep it that way.
 22 | 
 23 | But now assistance to Ukraine is being blocked by those who want to walk away from our world leadership.
 24 | 
 25 | It wasn’t long ago when a Republican president named Ronald Reagan thundered, “Mr. Gorbachev, tear down this wall.”
 26 | 
 27 | Now — now my predecessor, a former Republican president, tells Putin, quote, “Do whatever the hell you want.”
 28 | 
 29 | AUDIENCE: Booo —
 30 | 
 31 | THE PRESIDENT: That’s a quote.
 32 | 
 33 | A former president actually said that — bowing down to a Russian leader. I think it’s outrageous, it’s dangerous, and it’s unacceptable.
 34 | 
 35 | America is a founding member of NATO, the military alliance of democratic nations created after World War Two prevent — to prevent war and keep the peace.
 36 | 
 37 | And today, we’ve made NATO stronger than ever. We welcomed Finland to the Alliance last year. And just this morning, Sweden officially joined, and their minister is here tonight. Stand up. Welcome. Welcome, welcome, welcome. And they know how to fight.
 38 | 
 39 | Mr. Prime Minister, welcome to NATO, the strongest military alliance the world has ever seen.
 40 | 
 41 | I say this to Congress: We have to stand up to Putin. Send me a bipartisan national security bill. History is literally watching. History is watching.
 42 | 
 43 | If the United States walks away, it will put Ukraine at risk. Europe is at risk. The free world will be at risk, emboldening others to do what they wish to do us harm.
 44 | 
 45 | My message to President Putin, who I’ve known for a long time, is simple: We will not walk away. We will not bow down. I will not bow down.
 46 | 
 47 | In a literal sense, history is watching. History is watching — just like history watched three years ago on January 6th when insurrectionists stormed this very Capitol and placed a dagger to the throat of American democracy.
 48 | 
 49 | Many of you were here on that darkest of days. We all saw with our own eyes the insurrectionists were not patriots. They had come to stop the peaceful transfer of power, to overturn the will of the people.
 50 | 
 51 | January 6th lies about the 2020 election and the plots to steal the election posed a great — gravest threat to U.S. democracy since the Civil War.
 52 | 
 53 | But they failed. America stood — America stood strong and democracy prevailed. We must be honest: The threat to democracy must be defended (defeated).
 54 | 
 55 | My predecessor and some of you here seek to bury the truth about January 6th. I will not do that.
 56 | 
 57 | This is a moment to speak the truth and to bury the lies. Here’s the simple truth: You can’t love your country only when you win.
 58 | 
 59 | As I’ve done ever since being elected to office, I ask all of you, without regard to party, to join together and defend democracy. Remember your oath of office to defend against all threats foreign and domestic.
 60 | 
 61 | Respect — respect free and fair elections, restore trust in our institutions, and make clear political violence has absolutely no place — no place in America. Zero place.
 62 | 
 63 | Again, it’s not — it’s not hyperbole to suggest history is watching. They’re watching. Your children and grandchildren will read about this day and what we do.
 64 | 
 65 | History is watching another assault on freedom. Joining us tolight (tonight) is Latorya Beasley, a social worker from Birmingham, Alabama.
 66 | 
 67 | Fourteen months ago — fourteen months ago, she and her husband welcomed a baby girl thanks to the miracle of IVF. She scheduled treatments to have that second child, but the Alabama Supreme Court shut down IVF treatments across the state, unleashed by a Supreme Court decision overturning Roe v. Wade. She was told her dream would have to wait.
 68 | 
 69 | What her family had gone through should never have happened. Unless Congress acts, it could happen again.
 70 | 
 71 | So, tonight, let’s stand up for families like hers. To my friends across the aisle — don’t keep this waiting any longer. Guarantee the right to IVF. Guarantee it nationwide.
 72 | 
 73 | Like most Americans, I believe Roe v. Wade got it right.
 74 | 
 75 | I thank Vice President Harris for being an incredible leader defending reproductive freedom and so much more. Thank you.
 76 | 
 77 | My predecessor came to office determined to see Roe v. Wade overturned. He’s the reason it was overturned, and he brags about it. Look at the chaos that has resulted.
 78 | 
 79 | Joining us tonight is Kate Cox, a wife and mother from Dallas. She’d become pregnant again and had a fetus with a fatal condition. Her doctor told Kate that her own life and her ability to have future in the fil- — children in the future were at risk if she didn’t act. Because Texas law banned her ability to act, Kate and her husband had to leave the state to get what she needed.
 80 | 
 81 | What her family had gone through should have never happened as well. But it’s happening to too many others.
 82 | 
 83 | There are state laws banning the freedom to choose, criminalizing doctors, forcing survivors of rape and incest to leave their states to get the treatment they need.
 84 | 
 85 | Many of you in this chamber and my predecessor are promising to pass a national ban on reproductive freedom.
 86 | 
 87 | AUDIENCE: Booo —
 88 | 
 89 | THE PRESIDENT: My God, what freedom else would you take away?
 90 | 
 91 | Look, its decision to overturn Roe v. Wade, the Supreme Court majority wrote the following — and with all due respect, Justices — “Women are not without electoral — electoral power” — excuse me — “electoral or political power.”
 92 | 
 93 | You’re about to realize just how much you were right about that.
 94 | 
 95 | Clearly — clearly, those bragging about overturning Roe v. Wade have no clue about the power of women.
 96 | 
 97 | But they found out. When reproductive freedom was on the ballot, we won in 2022 and 2023. And we’ll win again in 2024.
 98 | 
 99 | If you — if you, the American people, send me a Congress that supports the right to choose, I promise you I will restore Roe v. Wade as the law of the land again.
100 | 
101 | Folks, America cannot go back.
102 | 
103 | I am here to- — tonight to show what I believe is the way forward, because I know how far we’ve come.
104 | 
105 | Four years ago next week, before I came to office, the country was hit by the worst pandemic and the worst economic crisis in a century.
106 | 
107 | Remember the fear, record losses?
108 | 
109 | Remember the spikes in crime and the murder rate? A raging virus that took more than 1 million American lives of loved ones, millions left behind.
110 | 
111 | A mental health crisis of isolation and loneliness.
112 | 
113 | A president, my predecessor, failed in the most basic presidential duty that he owes to American people: the duty to care.
114 | 
115 | AUDIENCE MEMBER: Lies!
116 | 
117 | THE PRESIDENT: I think that’s unforgivable.
118 | 
119 | I came to office determined to get us through one of the toughest periods in the nation’s history. We have.
120 | 
121 | It doesn’t make new, but in a — news — in a thousand cities and towns, the American people are writing the greatest comeback story never told.
122 | 
123 | So, let’s tell the story here — tell it here and now.
124 | 
125 | America’s comeback is building a future of American possibilities; building an economy from the middle out and the bottom up, not the top down; investing in all of America, in all Americans to make every- — sure everyone has a fair shot and we leave no one — no one behind.
126 | 
127 | The pandemic no longer controls our lives. The vaccine that saved us from COVID is — are now being used to beat cancer.
128 | 
129 | Turning setback into comeback. That’s what America does. That’s what America does.
130 | 
131 | Folks, I inherited an economy that was on the brink. Now, our economy is literally the envy of the world.
132 | 
133 | Fifteen million new jobs in just three years. A record. A record.
134 | 
135 | Unemployment at 50-year lows.
136 | 
137 | A record 16 million Americans are starting small businesses, and each one is a literal act of hope, with historic job growth and small-business growth for Black and Hispanics and Asian Americans. Eight hundred thousand new manufacturing jobs in America and counting.
138 | 
139 | Where is it written we can’t be the manufacturing capital of the world? We are and we will.
140 | 
141 | More people have health insurance today — more people have health insurance today than ever before.
142 | 
143 | The racial wealth gap is the smallest it’s been in 20 years.
144 | 
145 | Wages keep going up. Inflation keeps coming down. Inflation has dropped from 9 percent to 3 percent — the lowest in the world and tending (trending) lower.
146 | 
147 | The landing is and will be soft. And now, instead of aporting — importing foreign products and exporting American jobs, we’re exporting American products and creating American jobs right here in America, where they belong.
148 | 
149 | And it takes time, but the American people are beginning to feel it. Consumer studies show consumer confidence is soaring.
150 | 
151 | “Buy America” has been the law of the land since the 1930s. Past administrations, including my predecessor — including some Democrats, as well, in the past — failed to buy American. Not anymore.
152 | 
153 | On my watch, federal projects that you fund — like helping build American roads, bridges, and highways — will be made with American products and built by American workers creating good-paying American jobs.
154 | 
155 | And thanks to our CHIPS and Science Act the United States is investing more in research and development than ever before. During the pandemic, a shortage of semiconductors, chips that drove up the price of everything from cell phones to automobiles — and, by the way, we invented those chips right here in America.
156 | 
157 | Well, instead of having to import them, instead of — private companies are now investing billions of dollars to build new chip factories here in America creating tens of thousands of jobs, many of those jobs paying $100,000 a year and don’t require a college degree.
158 | 
159 | In fact, my policies have attracted $650 billion in private-sector investment in clean energy, advanced manufacturing, creating tens of thousands of jobs here in America.
160 | 
161 | And thanks — and thanks to our Bipartisan Infrastructure Law, 46,000 new projects have been announced all across your communities.
162 | 
163 | And, by the way, I noticed some of you who’ve strongly voted against it are there cheering on that money coming in. And I like it. I’m with you. I’m with you.
164 | 
165 | And if any of you don’t want that money in your district, just let me know.
166 | 
167 | Modernizing our roads and bridges, ports and airports, public transit systems. Removing poi- — poisonous lead pipes so every child can drink clean water without risk of brain damage.
168 | 
169 | Providing affordable — affordable high-speed Internet for every American, no matter where you live — urban, suburban, or rural communities in red states and blue states.
170 | 
171 | Record investments in Tribal communities.
172 | 
173 | Because of my investment in family farms — because I invested in family farms — led by my Secretary of Agriculture, who knows more about this anybody I know — we’re better able to stay in the family for the — those farms so their — and their children and grandchildren won’t have to leave — leave home to make a living. It’s transformative.
174 | 
175 | The great comeback story is Belvidere, Illinois. Home to an auto plant for nearly 60 years. Before I came to office, the plant was on its way to shutting down. Thousands of workers feared for their livelihoods. Hope was fading.
176 | 
177 | Then, I was elected to office, and we raised Belvidere repeatedly with auto companies, knowing unions would make all the difference. The UAW worked like hell to keep the plant open and get these jobs back. And together, we succeeded.
178 | 
179 | Instead of auto factories shutting down, auto factories are reopening and a new state-of-the-art battery factory is being built to power those cars there at the same.
180 | 
181 | To the folks — to the folks of Belvidere, I’d say: Instead of your town being left behind, your community is moving forward again. Because instead of watching auto ja- — jobs of the future go overseas, 4,000 union jobs with higher wages are building a future in Belvidere right here in America.
182 | 
183 | Here tonight is UAW President Shawn Fain, a great friend and a great labor leader. Shawn, where are you? Stand up.
184 | 
185 | And — and Dawn — and Dawn Simms, a third-generation worker — UAW worker at Belvidere.
186 | 
187 | Shawn, I was proud to be the first President to stand in the picket line. And today, Dawn has a good job in her hometown, providing stability for her family and pride and dignity as well.
188 | 
189 | Showing once again Wall Street didn’t build America. They’re not bad guys. They didn’t build it, though. The middle class built the country, and unions built the middle class.
190 | 
191 | I say to the American people: When America gets knocked down, we get back up. We keep going. That’s America. That’s you, the American people.
192 | 
193 | It’s because of you America is coming back. It’s because of you our future is brighter. It’s because of you that tonight we can proudly say the state of our Union is strong and getting stronger.
194 | 
195 | AUDIENCE: Four more years! Four more years! Four more years!
196 | 
197 | THE PRESIDENT: Tonight — tonight, I want to talk about the future of possibilities that we can build together — a future where the days of trickle-down economics are over and the wealthy and the biggest corporations no longer get the — all the tax breaks.
198 | 
199 | And, by the way, I understand corporations. I come from a state that has more corporations invested than every one of your states in the state — the United States combined. And I represented it for 36 years. I’m not anti-corporation.
200 | 
201 | But I grew up in a home where trickle-down economics didn’t put much on my dad’s kitchen table. That’s why I’m determined to turn things around so the middle class does well. When they do well, the poor have a way up and the wealthy still do very well. We all do well.
202 | 
203 | And there’s more to do to make sure you’re feeling the benefits of all we’re doing.
204 | 
205 | Americans pay more for prescription drugs than anywhere in the world. It’s wrong, and I’m ending it.
206 | 
207 | With a law that I proposed and signed — and not one of your Republican buddies work- — voted for it — we finally beat Big Pharma.
208 | 
209 | Instead of paying $400 a month or thereabouts for insulin with diabetes — and it only costs 10 bucks to make — they only get paid $35 a month now and still make a healthy profit.
210 | 
211 | And I want to — and what to do next, I want to cap the cost of insulin at $35 a month for every American who needs it — everyone.
212 | 
213 | For years, people have talked about it. But finally, we got it done and gave Medicare the power to negotiate lower prices on prescription drugs, just like the VA is able to do for veterans.
214 | 
215 | That’s not just saving seniors money. It’s saving taxpayers money. We cut the federal deficit by $160 billion because Medicare will no longer have to pay those exorbitant prices to Big Pharma.
216 | 
217 | This year, Medicare is negotiating lower prices for some of the costliest drugs on the market that treat everything from heart disease to arthritis. It’s now time to go further and give Medicare the power to negotiate lower prices for 500 different drugs over the next decade.
218 | 
219 | They’re making a lot of money, guys. And they’ll still be extremely profitable. It will not only save lives; it will save taxpayers another $200 billion.
220 | 
221 | Starting next year, the same law caps total prescription drug costs for seniors on Medicare at $200 — at $2,000 a year, even for expensive cancer drugs that cost $10-, $12-, $15,000. Now I want to cap prescription drug costs at $2,000 a year for everyone.
222 | 
223 | Folks, I’m going to get in trouble for saying that, but any of you want to get in Air Force One with me and fly to Toronto, Berlin, Moscow — I mean, excuse me. Well, even Moscow, probably. And bring your prescription with you, and I promise you, I’ll get it for you for 40 percent the cost you’re paying now. Same company, same drug, same place.
224 | 
225 | Folks, the Affordable Care Act — the old “Obamacare” is still a very big deal.
226 | 
227 | Over 100 million of you can no longer be denied health insurance because of a preexisting condition. But my predecessor and many in this chamber want to take the — that prescription drug away by repealing Affordable Care Act.
228 | 
229 | AUDIENCE: Booo —
230 | 
231 | THE PRESIDENT: I’m not going to let that happen. We stopped you 50 times before, and we’ll stop you again.
232 | 
233 | In fact, I’m not only protecting it, I’m expanding it. The — we enacted tax credits of $800 per person per year (to) reduce healthcare costs for millions of working families. That tax credit expires next year. I want to make that savings permanent.
234 | 
235 | To state the obvious: Women are more than half of our population, but research on women’s health has always been underfunded.
236 | 
237 | That’s why we’re launching the first-ever White House Initiative on Women’s Health Research, led by Jill — doing an incredible job as First Lady — to pa- — to pass my plan for $12 billion to transform women’s health research and benefit millions of lives all across America.
238 | 
239 | I know the cost of housing is so important to you. Inflation keeps coming down. Mortgage rates will come down as well, and the Fed acknowledges that.
240 | 
241 | But I’m not waiting. I want to provide an annual tax credit that will give Americans $400 a month for the next two years as mortgage rates come down to put toward their mortgages when they buy their first home or trade up for a little more space. That’s for two years.
242 | 
243 | And my administration is also eliminating title insurance (fees) on federally backed mortgages. When you refinance your home, you can save $1,000 or more as a consequence.
244 | 
245 | For millions of renters, we’re cracking down on big landlords who use antitrust law —using antitrust — who break antitrust laws by price-fixing and driving up rents.
246 | 
247 | We’ve cut red tape so builders can get federally financing, which is already helping build a record 1.7 million new house u- — housing units nationwide.
248 | 
249 | Now pass — now pass (my plan) and build and renovate 2 million affordable homes and bring those rents down.
250 | 
251 | To remain the strongest economy in the world, we need to have the best education system in the world. And I, like I suspect all of you, want to give a child — every child a good start by providing access to preschool for three- and four-years-old.
252 | 
253 | You know, I think I pointed out last year — I think I pointed out last year that children coming from broken homes where there’s no books, they’re not read to, they’re not spoken to very often start school — kindergarten or first grade hearing — having heard a million fewer words spoken.
254 | 
255 | Well, studies show that children who go to preschool are nearly 50 percent more likely to finish high school and go on to earn a two- and four-year degree no matter what their background is.
256 | 
257 | I met a year and a half ago with the leaders of the Business Roundtable. They were mad that I was ever — angry — I — well, they were discussing why I wanted to spend money on education.
258 | 
259 | I pointed out to them: As Vice President, I met with over 8- — I think it was 182 of those folks — don’t hold me to the exact number — and I asked them what they need most — the CEOs. And you’ve had the same experience on both sides of the aisle. They say, “A better-educated workforce,” right?
260 | 
261 | So, I looked at them. And I say, “I come from Delaware. DuPont used to be the eighth-largest corporation in the world. And every new enter- — enterprise they bought, they educated the workforce to that enterprise. But none of you do that anymore. Why are you angry with me providing you the opportunity for the best-educated workforce in the world?”
262 | 
263 | And they all looked at me and said, “I think you’re right.”
264 | 
265 | I want to expand high-quality tutoring and summer learning to see that every child learns to read by third grade.
266 | 
267 | I’m also connecting local businesses and high schools so students get hands-on experience and a path to a good-paying job whether or not they go to college.
268 | 
269 | And I want to make sure that college is more affordable. Let’s continue increasing the Pell Grants to working- and middle-class families and increase record investments in HBCUs and minority-serving institutions, including Hispanic institutions.
270 | 
271 | When I was told I couldn’t universally just change the way in which we did — dealt with student loans, I fixed two student loan programs that already existed to reduce the burden of student debt for nearly 4 million Americans, including nurses, firefighters and others in public service.
272 | 
273 | Like Keenan Jones, a public educator in Minnesota, who’s here with us tonight. Keenan, where are you? Keenan, thank you.
274 | 
275 | He’s educated hundreds of students so they can go to college. Now he’s able to help, after debt forgiveness, get his own daughter to college.
276 | 
277 | And, folks, look, such relief is good for the economy because folks are now able to buy a home, start a business, start a family.
278 | 
279 | And while we’re at it, I want to give public school teachers a raise.
280 | 
281 | And, by the way, the first couple of years, we cut the deficit.
282 | 
283 | Now let me speak to the question of fundamental fairness for all Americans. I’ve been delivering real results in fiscally responsible ways. We’ve already cut the federal deficit —we’ve already cut the federal deficit by over $1 trillion.
284 | 
285 | I signed a bipartisan deal to cut another trillion dollars in the next decade.
286 | 
287 | It’s my goal to cut the federal deficit another $3 trillion by making big corporations and the very wealthy finally beginning to pay their fair share.
288 | 
289 | Look, I’m a capitalist. If you want to make or can make a million or millions of bucks, that’s great. Just pay your fair share in taxes.
290 | 
291 | A fair tax code is how we invest in things that make this country great: healthcare, education, defense, and so much more.
292 | 
293 | But here’s the deal. The last administration enacted a $2 trillion tax cut overwhelmingly benefit the top 1 percent — the very wealthy —
294 | 
295 | AUDIENCE: Booo —
296 | 
297 | THE PRESIDENT: — and the biggest corporations — and exploded the federal deficit.
298 | 
299 | They added more to the national debt than any presidential term in American history. Check the numbers.
300 | 
301 | Folks at home, does anybody really think the tax code is fair?
302 | 
303 | AUDIENCE: No!
304 | 
305 | THE PRESIDENT: Do you really think the wealthy and big corporations need another $2 trillion tax break?
306 | 
307 | AUDIENCE: No!
308 | 
309 | THE PRESIDENT: I sure don’t. I’m going to keep fighting like hell to make it fair. Under my plan, nobody earning less than $400,000 a year will pay an additional penny in federal taxes nobody — not one penny. And they haven’t yet.
310 | 
311 | In fact, the Child Tax Credit I passed during the pandemic cut taxes for millions of working families and cut child poverty in half.
312 | 
313 | Restore that Child Tax Credit. No child should go hungry in this country.
314 | 
315 | The way to make the tax code fair is to make big corporations and the very wealthy begin to pay their share. Remember in 2020, 55 of the biggest companies in America made $40 billion and paid zero in federal income tax. Zero.
316 | 
317 | AUDIENCE: Booo —
318 | 
319 | THE PRESIDENT: Not anymore.
320 | 
321 | Thanks to the law I wrote and we signed, big companies now have to pay a minimum of 15 percent. But that’s still less than working people pay in federal taxes.
322 | 
323 | It’s time to raise the corporate minimum tax to at least 21 percent so every big corporation finally begins to pay their fair share.
324 | 
325 | I also want to end tax breaks for Big Pharma, Big Oil, private jets, massive executive pay when it was only supposed to be a million bal- — a million dollars that could be deducted. They can pay them $20 million if they want, but deduct a million.
326 | 
327 | End it now.
328 | 
329 | You know, there are 1,000 billionaires in America. You know what the average federal tax is for those billionaires?
330 | 
331 | AUDIENCE MEMBER: Zero!
332 | 
333 | THE PRESIDENT: No.
334 | 
335 | They’re making great sacrifices — 8.2 percent.
336 | 
337 | AUDIENCE: Booo —
338 | 
339 | THE PRESIDENT: That’s far less than the vast majority of Americans pay.
340 | 
341 | No billionaire should pay a lower federal tax rate than a teacher, a sanitation worker, or a nurse.
342 | 
343 | I proposed a minimum tax for billionaires of 25 percent — just 25 percent. You know what that would raise? That would raise $500 billion over the next 10 years.
344 | 
345 | And imagine what that could do for America. Imagine a future with affordable childcare, millions of families can get what they need to go to work to help grow the economy.
346 | 
347 | Imagine a future with paid leave, because no one should have to choose between working and taking care of their sick family member.
348 | 
349 | Imagine — imagine a future with home care and eldercare, and people living with disabilities so they can stay in their homes and family caregivers can finally get the pay they deserve.
350 | 
351 | Tonight, let’s all agree once again to stand up for seniors.
352 | 
353 | Many of my friends on the other side of the aisle want to put Social Security on the chopping block.
354 | 
355 | If anyone here tries to cut Social Security or Medicare or raise the retirement age, I will stop you.
356 | 
357 | The working people — the working people who built this country pay more into Social Security than millionaires and billionaires do. It’s not fair.
358 | 
359 | We have two ways to go. Republicans can cut Social Security and give more tax breaks to the wealthy. I will —
360 | 
361 | AUDIENCE MEMBER: (Inaudible.)
362 | 
363 | THE PRESIDENT: That’s the proposal. Oh, no? You guys don’t want another $2 trillion tax cut?
364 | 
365 | AUDIENCE MEMBER: Liar!
366 | 
367 | THE PRESIDENT: I kind of thought that’s what your plan was. Well, that’s good to hear. You’re not going to cut another $2 trillion for the super-wealthy? That’s good to hear.
368 | 
369 | I’ll protect and strengthen Social Security and make the wealthy pay their fair share.
370 | 
371 | Look, too many corporations raise prices to pad their profits, charging more and more for less and less.
372 | 
373 | That’s why we’re cracking down on corporations that engage in price gouging and deceptive pricing, from food to healthcare to housing.
374 | 
375 | In fact, the snack companies think you won’t notice if they change the size of the bag and put a hell of a lot fewer — same — same size bag -— put fewer chips in it. No, I’m not joking. It’s called “shrink-flation.”
376 | 
377 | Pass Bobby Casey’s bill and stop this. I really mean it.
378 | 
379 | You probably all saw that commercial on Snickers bars. And you get — you get charged the same amount, and you got about, I don’t know, 10 percent fewer Snickers in it.
380 | 
381 | Look, I’m also getting rid of junk fees — those hidden fees — at the end of your bill that are there without your knowledge. My administration announced we’re cutting credit card late fees from $32 to $8.
382 | 
383 | Banks and credit card companies are allowed to charge what it costs them to in- — to instigate the collection. And that’s more — a hell of a lot like $8 than 30-some dollars.
384 | 
385 | But they don’t like it. The credit card companies don’t like it, but I’m saving American families $20 billion a year with all of the junk fees I’m eliminating.
386 | 
387 | Folks at home, that’s why the banks are so mad. It’s $20 billion in profit.
388 | 
389 | I’m not stopping there.
390 | 
391 | My administration has proposed rules to make cable, travel, utilities, and online ticket sellers tell you the total price up front so there are no surprises.
392 | 
393 | It matters. It matters.
394 | 
395 | And so does this. In November, my team began serious negotiations with a bipartisan group of senators. The result was a bipartisan bill with the toughest set of border security reforms we’ve ever seen.
396 | 
397 | AUDIENCE: Booo —
398 | 
399 | THE PRESIDENT: Oh, you don’t think so?
400 | 
401 | AUDIENCE: Booo —
402 | 
403 | THE PRESIDENT: Oh, you don’t like that bill — huh? — that conservatives got together and said was a good bill? I’ll be darned. That’s amazing.
404 | 
405 | That bipartisan bill would hire 1,500 more security agents and officers, 100 more immigration judges to help tackle the backload of 2 million cases, 4,300 more asylum officers, and new policies so they can resolve cases in six months instead of six years now. What are you against?
406 | 
407 | One hundred more high-tech drug detection machines to significantly increase the ability to screen and stop vehicles smuggling fentanyl into America that’s killing thousands of children.
408 | 
409 | This bill would save lives and bring order to the border.
410 | 
411 | It would also give me and any new president new emergency authority to temporarily shut down the border when the number of migrants at the border is overwhelming.
412 | 
413 | The Border Patrol union has endorsed this bill.
414 | 
415 | (Cross-talk.)
416 | 
417 | The federal Chamber of Commerce has — yeah, yeah. You’re saying “no.” Look at the facts. I know — I know you know how to read.
418 | 
419 | I believe that given the opportunity — for — a majority in the House and Senate would endorse the bill as well — a majority right now.
420 | 
421 | AUDIENCE MEMBER: Yes!
422 | 
423 | THE PRESIDENT: But unfortunately, politics have derailed this bill so far.
424 | 
425 | I’m told my predecessor called members of Congress in the Senate to demand they block the bill. He feels political win — he viewed it as a — it would be a political win for me and a political loser for him. It’s not about him. It’s not about me. I’d be a winner — not really. I —
426 | 
427 | REPRESENTATIVE GREENE: What about Laken Riley?
428 | 
429 | (Cross-talk.)
430 | 
431 | AUDIENCE: Booo —
432 | 
433 | REPRESENTATIVE GREENE: Say her name!
434 | 
435 | THE PRESIDENT: (The President holds up a pin reading “Say Her Name, Laken Riley.”) Lanken — Lanken (Laken) Riley, an innocent young woman who was killed.
436 | 
437 | REPRESENTATIVE GREENE: By an illegal!
438 | 
439 | THE PRESIDENT: By an illegal. That’s right. But how many of thousands of people are being killed by legals?
440 | 
441 | (Cross-talk.)
442 | 
443 | To her parents, I say: My heart goes out to you. Having lost children myself, I understand.
444 | 
445 | But, look, if we change the dynamic at the border — people pay people — people pay these smugglers 8,000 bucks to get across the border because they know if they get by — if they get by and let into the country, it’s six to eight years before they have a hearing. And it’s worth the — taking the chance of the $8,000.
446 | 
447 | (Cross-talk.)
448 | 
449 | But — but if it’s only six mon- — six weeks, the idea is it’s highly unlikely that people will pay that money and come all that way knowing that they’ll be — able to be kicked out quickly.
450 | 
451 | Folks, I would respectfully su- — suggest to my friend in — my Republican friends owe it to the American people. Get this bill done. We need to act now.
452 | 
453 | AUDIENCE: Get it done! Get it done! Get it done!
454 | 
455 | THE PRESIDENT: And if my predecessor is watching: Instead of paying (playing) politics and pressuring members of Congress to block the bill, join me in telling the Congress to pass it.
456 | 
457 | We can do it together.
458 | 
459 | But that’s what he apparently — here’s what he will not do.
460 | 
461 | I will not demonize immigrants, saying they are “poison in the blood of our country.”
462 | 
463 | I will not separate families.
464 | 
465 | I will not ban people because of their faith.
466 | 
467 | Unlike my predecessor, on my first day in office, I introduced a comprehensive bill to fix our immigration system. Take a look at it. It has all these and more: secure the border, provide a pathway to citizenship for DREAMers, and so much more.
468 | 
469 | But unlike my predecessor, I know who we are as Americans. We’re the only nation in the world with a heart and soul that draws from old and new.
470 | 
471 | Home to Native Americans whose ancestors have been here for thousands of years. Home to people of every pla- — from every place on Earth.
472 | 
473 | They came freely. Some came in chains. Some came when famine struck, like my ancestral family in Ireland. Some to flee persecution, to chase dreams that are impossible anywhere but here in America.
474 | 
475 | That’s America. And we all come from somewhere, but we’re all Americans.
476 | 
477 | Look, folks, we have a simple choice: We can fight about fixing the border or we can fix it. I’m ready to fix it. Send me the border bill now.
478 | 
479 | AUDIENCE: Fix it! Fix it! Fix it!
480 | 
481 | THE PRESIDENT: A transformational his- — moment in history happened 58 — 59 years ago today in Selma, Alabama. Hundreds of foot soldiers for justice marched across the Edmund Pettus Bridge, named after the Grand Dragon of the Ku Klux Klan, to claim their fundamental right to vote.
482 | 
483 | They were beaten. They were bloodied and left for dead. Our late friend and former colleague John Lewis was on that march. We miss him.
484 | 
485 | But joining us tonight are other marchers, both in the gallery and on the floor, including Bettie Mae Fikes, known as the “Voice of Selma.”
486 | 
487 | The daughter of gospel singers and preachers, she sang songs of prayer and protest on that Bloody Sunday to help shake the nation’s conscience.
488 | 
489 | Five months later, the Voting Rights Act passed and was signed into law.
490 | 
491 | Thank you. Thank you, thank you, thank you.
492 | 
493 | But 59 years later, there are forces taking us back in time: voter suppression, election subversion, unlimited dark money, extreme gerrymandering.
494 | 
495 | John Lewis was a great friend to many of us here. But if you truly want to honor him and all the heroes who marched with him, then it’s time to do more than talk.
496 | 
497 | Pass the Freedom to Vote Act, the John Lewis Voting Right(s) Act.
498 | 
499 | And stop — stop denying another core value of America: our diversity across American life. Banning books is wrong. Instead of erasing history, let’s make history.
500 | 
501 | I want to protect fundamental rights.
502 | 
503 | Pass the Equality Act.
504 | 
505 | And my message to transgender Americans: I have your back.
506 | 
507 | Pass the PRO Act for workers’ rights.
508 | 
509 | Raise the federal minimum wage, because every worker has the right to a decent living more than eig- — seven bucks an hour.
510 | 
511 | We’re also making history by confronting the climate crisis, not denying it. I don’t think any of you think there’s no longer a climate crisis. At least, I hope you don’t.
512 | 
513 | I’m taking the most significant action ever on climate in the history of the world.
514 | 
515 | I’m cutting our carbon emissions in half by 2030; creating tens of thousands of clean energy jobs, like the IBEW workers building and installing 500,000 electric vehicle charging stations; conserving 30 percent of America’s lands and waters by 2030; and taking action on environmental justice — fence-line communities smothered by the legacy of pollution.
516 | 
517 | And patterned after the Peace Corps and AmericaCorps (AmeriCorps), I launched the Climate Corps to put 20,000 young people to work in the forefront of our clean energy future. I’ll triple that number in a decade.
518 | 
519 | To state the obvious, all Americans deserve the freedom to be safe. And America is safer today than when I took office.
520 | 
521 | The year before I took office, murder rates went up 30 percent.
522 | 
523 | MR. NIKOUI: Remember Abbey Gate!
524 | 
525 | THE PRESIDENT: Thirty percent, they went up —
526 | 
527 | MR. NIKOUI: United States Marines! Kareem Mae’Lee Nikoui!
528 | 
529 | THE PRESIDENT: — the biggest increase in history.
530 | 
531 | MR. NIKOUI: (Inaudible.)
532 | 
533 | THE PRESIDENT: It was then, through no — through my American Rescue Plan — which every American (Republican) voted against, I might add — we made the largest investment in public safety ever.
534 | 
535 | Last year, the murder rate saw the sharpest decrease in history. Violent crime fell to one of its lowest levels in more than 50 years.
536 | 
537 | But we have more to do. We have to help cities invest in more community police officers, more mental health workers, more community violence intervention.
538 | 
539 | Give communities the tools to crack down on gun crime, retail crime, and carjacking. Keep building trust, as I’ve been doing, by taking executive action on police reform and calling for it to be the law of the land.
540 | 
541 | Directing my Cabinet to review the federal classification of marijuana and expunging thousands of convictions for the mere possession, because no one should be jailed for simply using or have it on their record.
542 | 
543 | Take on crimes of domestic violence. I’m ramping up the federal enforcement of the Violence Against Women Act that I proudly wrote when I was a senator so we can finally — finally end the scourge against women in America.
544 | 
545 | There are other kinds of violence I want to stop.
546 | 
547 | With us tonight is Jasmine, whose nine-year-old sister Jackie was murdered with 21 classmates and teachers in her elementary school in Uvalde, Texas.
548 | 
549 | Very soon after that happened, Jill and I went to Uvalde for a couple days. We spent hours and hours with each of the families. We heard their message so everyone in this room, in this chamber could hear the same message.
550 | 
551 | The constant refrain — and I was there for hours, meeting with every family. They said, “Do something.” “Do something.”
552 | 
553 | Well, I did do something by establishing the first-ever Office of Gun Violence Prevention in the White House, that the Vice President is leading the charge. Thank you for doing it.
554 | 
555 | Meanwhile — meanwhile, my predecessor told the NRA he’s proud he did nothing on guns when he was President.
556 | 
557 | AUDIENCE: Booo —
558 | 
559 | THE PRESIDENT: After another shooting in Iowa recently, he said — when asked what to do about it, he said, just “get over it.” That was his quote. Just “get over it.”
560 | 
561 | I say stop it. Stop it, stop it, stop it.
562 | 
563 | I’m proud we beat the NRA when I signed the most significant gun safety law in nearly 30 years because of this Congress. We now must beat the NRA again.
564 | 
565 | I’m demanding a ban on assault weapons and high-capacity magazines. Pass universal background checks.
566 | 
567 | None of this — none of this — I taught the Second Amendment for 12 years. None of this violates the Second Amendment or vilifies responsible gun owners.
568 | 
569 | (Cross-talk.)
570 | 
571 | You know, as we manage challenges at home, we’re also managing crises abroad, including in the Middle East.
572 | 
573 | I know the last five months have been gut-wrenching for so many people — for the Israeli people, for the Palestinian people, and so many here in America.
574 | 
575 | This crisis began on October 7th with a massacre by a terrorist group called Hamas, as you all know. One thousand two hundred innocent people — women and girls, men and boys — slaughtered after enduring sexual violence. The deadliest day of the — for the Jewish people since the Holocaust. And 250 hostages taken.
576 | 
577 | Here in this chamber tonight are families whose loved ones are still being held by Hamas. I pledge to all the families that we will not rest until we bring every one of your loved ones home.
578 | 
579 | We also — we will also work around the clock to bring home Evan and Paul — Americans being unjustly detained by the Russians — and others around the world.
580 | 
581 | Israel has a right to go after Hamas. Hamas ended this conflict by releasing the hostages, laying down arms — could end it by — by releasing the hostages, laying down arms, and s- — surrendering those responsible for October 7th.
582 | 
583 | But Israel has a h- — excuse me. Israel has a added burden because Hamas hides and operates among the civilian population like cowards — under hospitals, daycare centers, and all the like.
584 | 
585 | Israel also has a fundamental responsibility, though, to protect innocent civilians in Gaza.
586 | 
587 | This war has taken a greater toll on innocent civilians than all previous wars in Gaza combined. More than 30,000 Palestinians have been killed —
588 | 
589 | AUDIENCE MEMBER: Says who?
590 | 
591 | THE PRESIDENT: — most of whom are not Hamas. Thousands and thousands of innocents — women and children. Girls and boys also orphaned.
592 | 
593 | Nearly 2 million more Palestinians under bombardment or displacement. Homes destroyed, neighborhoods in rubble, cities in ruin. Families without food, water, medicine.
594 | 
595 | It’s heartbreaking.
596 | 
597 | I’ve been working non-stop to establish an immediate ceasefire that would last for six weeks to get all the prisoners released — all the hostages released and to get the hostages home and to ease the intolerable an- — humanitarian crisis and build toward an enduring — a more — something more enduring.
598 | 
599 | The United States has been leading international efforts to get more humanitarian assistance into Gaza. Tonight, I’m directing the U.S. military to lead an emergency mission to establish a temporary pier in the Mediterranean on the coast of Gaza that can receive large shipments carrying food, water, medicine, and temporary shelters.
600 | 
601 | No U.S. boots will be on the ground.
602 | 
603 | A temporary pier will enable a massive increase in the amount of humanitarian assistance getting into Gaza every day.
604 | 
605 | And Israel must also do its part. Israel must allow more aid into Gaza and ensure humanitarian workers aren’t caught in the crossfire.
606 | 
607 | And they’re announcing they’re going to — they’re going to ca- — have a crossing in Northern Gaza.
608 | 
609 | To the leadership of Israel, I say this: Humanitarian assistance cannot be a secondary consideration or a bargaining chip. Protecting and saving innocent lives has to be a priority.
610 | 
611 | As we look to the future, the only real solution to the situation is a two-state solution over time.
612 | 
613 | And I say this as a lifelong supporter of Israel, my entire career. No one has a stronger record with Israel than I do. I challenge any of you here. I’m the only American president to visit Israel in wartime.
614 | 
615 | But there is no other path that guarantees Israel’s security and democracy. There is no other path that guarantees Pa- — that Palestinians can live in peace with po- — with peace and dignity.
616 | 
617 | And there is no other path that guarantees peace between Israel and all of its neighbors — including Saudi Arabia, with whom I’m talking.
618 | 
619 | Creating stability in the Middle East also means containing the threat posed by Iran. That’s why I built a coalition of more than a dozen countries to defend international shipping and freedom of navigation in the Red Sea.
620 | 
621 | I’ve ordered strikes to degrade the Houthi capability and defend U.S. forces in the region.
622 | 
623 | As Commander-in-Chief, I will not hesitate to direct further measures to protect our people and our military personnel.
624 | 
625 | For years, I’ve heard many of my Republican and Democratic friends say that China is on the rise and America is falling behind. They’ve got it backwards. I’ve been saying it for over four years, even when I wasn’t president.
626 | 
627 | America is rising. We have the best economy in the world. And since I’ve come to office, our GTB (GDP) is up, our trade deficit with China is down to the lowest point in over a decade.
628 | 
629 | And we’re standing up against China’s unfair economic practices.
630 | 
631 | We’re standing up for peace and stability across the Taiwan Straits.
632 | 
633 | I’ve revitalized our partnership and alliance in the Pacific: India, Australia, Japan, South Korea, the Pacific Islands. I’ve made sure that the most advanced American technologies can’t be used in China — not allowing to trade them there.
634 | 
635 | Frankly, for all his tough talk on China, it never occurred to my predecessor to do any of that.
636 | 
637 | I want competition with China, not conflict. And we’re in a stronger position to win the conflict (competition) of the 21st century against China than anyone else for that matter — than at any time as well.
638 | 
639 | Here at home, I’ve signed over 400 bipartisan bills. But there’s more to pass my Unity Agenda.
640 | 
641 | Strengthen penalties on fentanyl trafficking. You don’t want to do that, huh?
642 | 
643 | Pass bipartisan privacy legislation to protect our children online.
644 | 
645 | Harness — harness the promise of AI to protect us from peril. Ban AI voice impersonations and more.
646 | 
647 | And keep our truly sacred obligation to train and equip those we send into harm’s way and care for them and their families when they come home and when they don’t.
648 | 
649 | That’s why, with the strong support and help of Denis and the VA, I signed the PACT Act — one of the most significant laws ever, helping millions of veterans exposed to toxins who now are battling more than 100 different cancers. Many of them don’t come home, but we owe them and their families support.
650 | 
651 | And we owe it to ourselves to keep supporting our new health research agency called ARPA-H — and remind us — to remind us that we can do big things, like end cancer as we know it. And we will.
652 | 
653 | Let me close with this.
654 | 
655 | THE PRESIDENT: Yay!
656 | 
657 | I know you don’t want to hear anymore, Lindsey, but I got to say a few more things.
658 | 
659 | I know I may not look like it, but I’ve been around a while. When you get to be my age, certain things become clearer than ever.
660 | 
661 | I know the American story. Again and again, I’ve seen the contest between competing forces in the battle for the soul of our nation, between those who want to pull America back to the past and those who want to move America into the future.
662 | 
663 | My lifetime has taught me to embrace freedom and democracy, a future based on core values that have defined America — honesty, decency, dignity, and equality; to respect everyone; to give everyone a fair shot; to give hate no safe harbor.
664 | 
665 | Now, other people my age see it differently. The American story of resentment, revenge, and retribution.
666 | 
667 | That’s not me. I was born amid World War Two, when America stood for the freedom of the world. I grew up in Scranton, Pennsylvania, and Claymont, Delaware, among working-class people who built this country.
668 | 
669 | I watched in horror as two of my heroes — like many of you did — Dr. King and Bobby Kennedy, were assassinated. And their legacies inspired me to pur- — pursue a car- — a career in service.
670 | 
671 | I left a law firm and became a public defender because my city of Wilmington was the only city in America occupied by the National Guard after Dr. King was assassinated because of the riots. And I became a county councilman almost by accident.
672 | 
673 | I got elected to the United States Senate when I had no intention of running, at age 29.
674 | 
675 | Then vice president to our first Black president. Now a president to the first woman vice president.
676 | 
677 | In my career, I’ve been told I was too young. By the way, they didn’t let me on the Senate elevators for votes sometimes. They — not a joke.
678 | 
679 | And I’ve been told I am too old.
680 | 
681 | Whether young or old, I’ve always been known — I’ve always known what endures. I’ve known our North Star. The very idea of America is that we’re all created equal, deserves to be treated equally throughout our lives.
682 | 
683 | We’ve never fully lived up to that idea, but we’ve never walked away from it either. And I won’t walk away from it now.
684 | 
685 | I’m optimistic. I really am. I’m optimistic, Nancy.
686 | 
687 | AUDIENCE: Four more years! Four more years! Four more years!
688 | 
689 | THE PRESIDENT: My fellow Americans, the issue facing our nation isn’t how old we are; it’s how old are our ideas.
690 | 
691 | Hate, anger, revenge, retribution are the oldest of ideas. But you can’t lead America with ancient ideas that only take us back. To lead America, the land of possibilities, you need a vision for the future and what can and should be done.
692 | 
693 | Tonight, you’ve heard mine.
694 | 
695 | I see a future where (we’re) defending democracy, you don’t diminish it.
696 | 
697 | I see a future where we restore the right to choose and protect our freedoms, not take them away.
698 | 
699 | I see a future where the middle class has — finally has a fair shot and the wealthy have to pay their fair share in taxes.
700 | 
701 | I see a future where we save the planet from the climate crisis and our country from gun violence.
702 | 
703 | Above all, I see a future for all Americans. I see a country for all Americans. And I will always be President for all Americans because I believe in America. I believe in you, the American people. You’re the reason we’ve never been more optimistic about our future than I am now.
704 | 
705 | So, let’s build the future together. Let’s remember who we are.
706 | 
707 | We are the United States of America. And there is nothing — nothing beyond our capacity when we act together.
708 | 
709 | God bless you all. And may God protect our troops. Thank you, thank you, thank you.


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/header.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/header.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/length.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/length.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/link_lists.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/link_lists.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/data_level0.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/data_level0.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/header.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/header.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/length.bin:
--------------------------------------------------------------------------------
1 |                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/link_lists.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/link_lists.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/chroma.sqlite3:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/chroma.sqlite3


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/header.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/header.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/length.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/length.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/link_lists.bin:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/d451fc4cf56e417b755994b4ca5212fd5057c0d2/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/link_lists.bin


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/prompts/question_maker_approx_system.txt:
--------------------------------------------------------------------------------
 1 | You are an agent that generates questions from provided text. Your job is to generate a question and provide the relevant sections from the text as references.
 2 | 
 3 | Instructions:
 4 | 1. For each provided text, generate a question that can be answered solely by the facts in the text.
 5 | 2. Extract all significant facts that answer the generated question.
 6 | 3. Format the response in JSON format with two fields:
 7 |    - 'question': A question directly related to these facts, ensuring it can only be answered using the references provided.
 8 |    - 'references': A list of JSON objects with the following fields: 'content': the text section that answers the question, 'start_chunk': the index of the start chunk, 'end_chunk': the index of the end chunk. These are inclusive indices.
 9 | 
10 | Notes: 
11 | Make the question more specific.
12 | Do not ask a question about multiple topics. 
13 | Do not ask a question with over 5 references.
14 | 
15 | Example:
16 | 
17 | Text: "<start_chunk_0>Experiment A: The temperature control test showed that at higher temperatures, the reaction rate inc<end_chunk_0><start_chunk_1>reased significantly, resulting in quicker product formation. However, at extremely high temperature<end_chunk_1><start_chunk_2>s, the reaction yield decreased due to the degradation of reactants.
18 | 
19 | Experiment B: The pH sensitivi<end_chunk_2><start_chunk_3>ty test revealed that the reaction is highly dependent on acidity, with optimal results at a pH of 7<end_chunk_3><start_chunk_4>. Deviating from this pH level in either direction led to a substantial drop in yield.
20 | 
21 | Experiment C<end_chunk_4><start_chunk_5>: In the enzyme activity assay, it was found that the presence of a specific enzyme accelerated the <end_chunk_5><start_chunk_6>reaction by a factor of 3. The absence of the enzyme, however, led to a sluggish reaction with an ex<end_chunk_6><start_chunk_7>tended completion time.
22 | 
23 | Experiment D: The light exposure trial demonstrated that UV light stimulate<end_chunk_7><start_chunk_8>d the reaction, making it complete in half the time compared to the absence of light. Conversely, pr<end_chunk_8><start_chunk_9>olonged light exposure led to unwanted side reactions that contaminated the final product.
24 | <end_chunk_9>"
25 | 
26 | Response: {
27 |   'oath': "I will not use the word 'and' in the question unless it is part of a proper noun. I will also make sure the question is concise.",
28 |   'question': 'What experiments were done in this paper?',
29 |   'references': [{
30 |     'content': 'Experiment A: The temperature control test showed that at higher temperatures, the reaction rate increased significantly, resulting in quicker product formation.',
31 |     'start_chunk': 0,
32 |     'end_chunk': 1,
33 |   }, {
34 |     'content': 'Experiment B: The pH sensitivity test revealed that the reaction is highly dependent on acidity, with optimal results at a pH of 7.',
35 |     'start_chunk': 2,
36 |     'end_chunk': 3,
37 |   }, {
38 |     'content': 'Experiment C: In the enzyme activity assay, it was found that the presence of a specific enzyme accelerated the reaction by a factor of 3.',
39 |     'start_chunk': 4,
40 |     'end_chunk': 6,
41 |   }, {
42 |     'content': 'Experiment D: The light exposure trial demonstrated that UV light stimulated the reaction, making it complete in half the time compared to the absence of light.',
43 |     'start_chunk': 7,
44 |     'end_chunk': 8,
45 |   }]
46 | }
47 | 
48 | DO NOT USE THE WORD 'and' IN THE QUESTION UNLESS IT IS PART OF A PROPER NOUN. YOU MUST INCLUDE THE OATH ABOVE IN YOUR RESPONSE.
49 | YOU MUST ALSO NOT REPEAT A QUESTION THAT HAS ALREADY BEEN USED.


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/prompts/question_maker_approx_user.txt:
--------------------------------------------------------------------------------
1 | Text: {document}
2 | 
3 | The following questions have already been used. Do not repeat them: {prev_questions_str}
4 | 
5 | Do not repeat the above questions. Make your next question unique. Respond with references and a question in JSON. The references must contain the exact text that answers the question and the start_chunk and end_chunk. DO NOT USE THE WORD 'and' IN THE QUESTION UNLESS IT IS PART OF A PROPER NOUN.


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/prompts/question_maker_system.txt:
--------------------------------------------------------------------------------
 1 | You are an agent that generates questions from provided text. Your job is to generate a question and provide the relevant sections from the text as references.
 2 | 
 3 | Instructions:
 4 | 1. For each provided text, generate a question that can be answered solely by the facts in the text.
 5 | 2. Extract all significant facts that answer the generated question.
 6 | 3. Format the response in JSON format with two fields:
 7 |    - 'question': A question directly related to these facts, ensuring it can only be answered using the references provided.
 8 |    - 'references': A list of all text sections that answer the generated question. These must be exact copies from the original text and should be whole sentences where possible.
 9 | 
10 | Notes: 
11 | Make the question more specific.
12 | Do not ask a question about multiple topics. 
13 | Do not ask a question with over 5 references.
14 | 
15 | Example:
16 | 
17 | Text: "Experiment A: The temperature control test showed that at higher temperatures, the reaction rate increased significantly, resulting in quicker product formation. However, at extremely high temperatures, the reaction yield decreased due to the degradation of reactants.
18 | 
19 | Experiment B: The pH sensitivity test revealed that the reaction is highly dependent on acidity, with optimal results at a pH of 7. Deviating from this pH level in either direction led to a substantial drop in yield.
20 | 
21 | Experiment C: In the enzyme activity assay, it was found that the presence of a specific enzyme accelerated the reaction by a factor of 3. The absence of the enzyme, however, led to a sluggish reaction with an extended completion time.
22 | 
23 | Experiment D: The light exposure trial demonstrated that UV light stimulated the reaction, making it complete in half the time compared to the absence of light. Conversely, prolonged light exposure led to unwanted side reactions that contaminated the final product."
24 | 
25 | Response: {
26 |   'oath': "I will not use the word 'and' in the question unless it is part of a proper noun. I will also make sure the question is concise.",
27 |   'question': 'What experiments were done in this paper?',
28 |   'references': ['Experiment A: The temperature control test showed that at higher temperatures, the reaction rate increased significantly, resulting in quicker product formation.', 'Experiment B: The pH sensitivity test revealed that the reaction is highly dependent on acidity, with optimal results at a pH of 7.', 'Experiment C: In the enzyme activity assay, it was found that the presence of a specific enzyme accelerated the reaction by a factor of 3.', 'Experiment D: The light exposure trial demonstrated that UV light stimulated the reaction, making it complete in half the time compared to the absence of light.']
29 | }
30 | 
31 | DO NOT USE THE WORD 'and' IN THE QUESTION UNLESS IT IS PART OF A PROPER NOUN. YOU MUST INCLUDE THE OATH ABOVE IN YOUR RESPONSE.
32 | YOU MUST ALSO NOT REPEAT A QUESTION THAT HAS ALREADY BEEN USED.


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/prompts/question_maker_user.txt:
--------------------------------------------------------------------------------
1 | Text: {document}
2 | 
3 | The following questions have already been used. Do not repeat them: {prev_questions_str}
4 | 
5 | Do not repeat the above questions. Make your next question unique. Respond with references and a question in JSON. DO NOT USE THE WORD 'and' IN THE QUESTION UNLESS IT IS PART OF A PROPER NOUN.


--------------------------------------------------------------------------------
/chunking_evaluation/evaluation_framework/synthetic_evaluation.py:
--------------------------------------------------------------------------------
  1 | from typing import List
  2 | import os
  3 | import json
  4 | import random
  5 | 
  6 | from chunking_evaluation.utils import rigorous_document_search
  7 | from .base_evaluation import BaseEvaluation
  8 | 
  9 | import pandas as pd
 10 | import numpy as np
 11 | from openai import OpenAI
 12 | from importlib import resources
 13 | 
 14 | class SyntheticEvaluation(BaseEvaluation):
 15 |     def __init__(self, corpora_paths: List[str], queries_csv_path: str, chroma_db_path:str = None, openai_api_key=None):
 16 |         super().__init__(questions_csv_path=queries_csv_path, chroma_db_path=chroma_db_path)
 17 |         self.corpora_paths = corpora_paths
 18 |         self.questions_csv_path = queries_csv_path
 19 |         self.client = OpenAI(api_key=openai_api_key)
 20 | 
 21 |         self.synth_questions_df = None
 22 | 
 23 |         with resources.as_file(resources.files('chunking_evaluation.evaluation_framework') / 'prompts') as prompt_path:
 24 |             with open(os.path.join(prompt_path, 'question_maker_system.txt'), 'r') as f:
 25 |                 self.question_maker_system_prompt = f.read()
 26 | 
 27 |             with open(os.path.join(prompt_path, 'question_maker_approx_system.txt'), 'r') as f:
 28 |                 self.question_maker_approx_system_prompt = f.read()
 29 |             
 30 |             with open(os.path.join(prompt_path, 'question_maker_user.txt'), 'r') as f:
 31 |                 self.question_maker_user_prompt = f.read()
 32 | 
 33 |             with open(os.path.join(prompt_path, 'question_maker_approx_user.txt'), 'r') as f:
 34 |                 self.question_maker_approx_user_prompt = f.read()
 35 | 
 36 |     def _save_questions_df(self):
 37 |         self.synth_questions_df.to_csv(self.questions_csv_path, index=False)
 38 | 
 39 |     def _tag_text(self, text):
 40 |         chunk_length = 100
 41 |         chunks = []
 42 |         tag_indexes = [0]
 43 |         start = 0
 44 |         while start < len(text):
 45 |             end = start + chunk_length
 46 |             chunk = text[start:end]
 47 |             if end < len(text):
 48 |                 # Find the last space within the chunk to avoid splitting a word
 49 |                 space_index = chunk.rfind(' ')
 50 |                 if space_index != -1:
 51 |                     end = start + space_index + 1  # Include the space in the chunk
 52 |                     chunk = text[start:end]
 53 |             chunks.append(chunk)
 54 |             tag_indexes.append(end)
 55 |             start = end  # Move start to end to continue splitting
 56 | 
 57 |         tagged_text = ""
 58 |         for i, chunk in enumerate(chunks):
 59 |             tagged_text += f"<start_chunk_{i}>" + chunk + f"<end_chunk_{i}>"
 60 | 
 61 |         return tagged_text, tag_indexes
 62 | 
 63 |     def _extract_question_and_approx_references(self, corpus, document_length=4000, prev_questions=[]):
 64 |         if len(corpus) > document_length:
 65 |             start_index = random.randint(0, len(corpus) - document_length)
 66 |             document = corpus[start_index : start_index + document_length]
 67 |         else:
 68 |             start_index = 0
 69 |             document = corpus
 70 |         
 71 |         if prev_questions is not None:
 72 |             if len(prev_questions) > 20:
 73 |                 questions_sample = random.sample(prev_questions, 20)
 74 |                 prev_questions_str = '\n'.join(questions_sample)
 75 |             else:
 76 |                 prev_questions_str = '\n'.join(prev_questions)
 77 |         else:
 78 |             prev_questions_str = ""
 79 | 
 80 |         tagged_text, tag_indexes = self._tag_text(document)
 81 | 
 82 |         completion = self.client.chat.completions.create(
 83 |             model="gpt-4-turbo",
 84 |             response_format={ "type": "json_object" },
 85 |             max_tokens=600,
 86 |             messages=[
 87 |                 {"role": "system", "content": self.question_maker_approx_system_prompt},
 88 |                 {"role": "user", "content": self.question_maker_approx_user_prompt.replace("{document}", tagged_text).replace("{prev_questions_str}", prev_questions_str)}
 89 |             ]
 90 |         )
 91 |         
 92 |         json_response = json.loads(completion.choices[0].message.content)
 93 |         
 94 |         try:
 95 |             text_references = json_response['references']
 96 |         except KeyError:
 97 |             raise ValueError("The response does not contain a 'references' field.")
 98 |         try:
 99 |             question = json_response['question']
100 |         except KeyError:
101 |             raise ValueError("The response does not contain a 'question' field.")
102 | 
103 |         references = []
104 |         for reference in text_references:
105 |             reference_keys = list(reference.keys())
106 | 
107 |             if len(reference_keys) != 3:
108 |                 raise ValueError(f"Each reference must have exactly 3 keys: 'content', 'start_chunk', and 'end_chunk'. Got keys: {reference_keys}")
109 | 
110 |             if 'start_chunk' not in reference_keys or 'end_chunk' not in reference_keys:
111 |                 raise ValueError("Each reference must contain 'start_chunk' and 'end_chunk' keys.")
112 | 
113 |             if 'end_chunk' not in reference_keys:
114 |                 reference_keys.remove('content')
115 |                 reference_keys.remove('start_chunk')
116 |                 end_chunk_key = reference_keys[0]
117 |                 end_index = start_index + tag_indexes[reference[end_chunk_key]+1]
118 |             else:
119 |                 end_index = start_index + tag_indexes[reference['end_chunk']+1]
120 | 
121 |             start_index = start_index + tag_indexes[reference['start_chunk']]
122 |             references.append((corpus[start_index:end_index], start_index, end_index))
123 |         
124 |         return question, references
125 | 
126 |     def _extract_question_and_references(self, corpus, document_length=4000, prev_questions=[]):
127 |         if len(corpus) > document_length:
128 |             start_index = random.randint(0, len(corpus) - document_length)
129 |             document = corpus[start_index : start_index + document_length]
130 |         else:
131 |             document = corpus
132 |         
133 |         if prev_questions is not None:
134 |             if len(prev_questions) > 20:
135 |                 questions_sample = random.sample(prev_questions, 20)
136 |                 prev_questions_str = '\n'.join(questions_sample)
137 |             else:
138 |                 prev_questions_str = '\n'.join(prev_questions)
139 |         else:
140 |             prev_questions_str = ""
141 | 
142 |         completion = self.client.chat.completions.create(
143 |             model="gpt-4-turbo",
144 |             response_format={ "type": "json_object" },
145 |             max_tokens=600,
146 |             messages=[
147 |                 {"role": "system", "content": self.question_maker_system_prompt},
148 |                 {"role": "user", "content": self.question_maker_user_prompt.replace("{document}", document).replace("{prev_questions_str}", prev_questions_str)}
149 |             ]
150 |         )
151 |         
152 |         json_response = json.loads(completion.choices[0].message.content)
153 |         
154 |         try:
155 |             text_references = json_response['references']
156 |         except KeyError:
157 |             raise ValueError("The response does not contain a 'references' field.")
158 |         try:
159 |             question = json_response['question']
160 |         except KeyError:
161 |             raise ValueError("The response does not contain a 'question' field.")
162 | 
163 |         references = []
164 |         for reference in text_references:
165 |             if not isinstance(reference, str):
166 |                 raise ValueError(f"Expected reference to be of type str, but got {type(reference).__name__}")
167 |             target = rigorous_document_search(corpus, reference)
168 |             if target is not None:
169 |                 reference, start_index, end_index = target
170 |                 references.append((reference, start_index, end_index))
171 |             else:
172 |                 raise ValueError(f"No match found in the document for the given reference.\nReference: {reference}")
173 |         
174 |         return question, references
175 | 
176 |     def _generate_corpus_questions(self, corpus_id, approx=False, n=5):
177 |         with open(corpus_id, 'r') as file:
178 |             corpus = file.read()
179 | 
180 |         i = 0
181 |         while i < n:
182 |             while True:
183 |                 try:
184 |                     print(f"Trying Query {i}")
185 |                     questions_list = self.synth_questions_df[self.synth_questions_df['corpus_id'] == corpus_id]['question'].tolist()
186 |                     if approx:
187 |                         question, references = self._extract_question_and_approx_references(corpus, 4000, questions_list)
188 |                     else:
189 |                         question, references = self._extract_question_and_references(corpus, 4000, questions_list)
190 |                     if len(references) > 5:
191 |                         raise ValueError("The number of references exceeds 5.")
192 |                     
193 |                     references = [{'content': ref[0], 'start_index': ref[1], 'end_index': ref[2]} for ref in references]
194 |                     new_question = {
195 |                         'question': question,
196 |                         'references': json.dumps(references),
197 |                         'corpus_id': corpus_id
198 |                     }
199 | 
200 |                     new_df = pd.DataFrame([new_question])
201 |                     self.synth_questions_df = pd.concat([self.synth_questions_df, new_df], ignore_index=True)
202 |                     self._save_questions_df()
203 | 
204 |                     break
205 |                 except (ValueError, json.JSONDecodeError) as e:
206 |                     print(f"Error occurred: {e}")
207 |                     continue
208 |             i += 1
209 | 
210 |     def _get_synth_questions_df(self):
211 |         if os.path.exists(self.questions_csv_path):
212 |             synth_questions_df = pd.read_csv(self.questions_csv_path)
213 |         else:
214 |             synth_questions_df = pd.DataFrame(columns=['question', 'references', 'corpus_id'])
215 |         return synth_questions_df
216 | 
217 |     def generate_queries_and_excerpts(self, approximate_excerpts=False, num_rounds = -1, queries_per_corpus = 5):
218 |         self.synth_questions_df = self._get_synth_questions_df()
219 | 
220 |         rounds = 0
221 |         while num_rounds == -1 or rounds < num_rounds:
222 |             for corpus_id in self.corpora_paths:
223 |                 self._generate_corpus_questions(corpus_id, approx=approximate_excerpts, n=queries_per_corpus)
224 |             rounds += 1
225 | 
226 |     def _get_sim(self, target, references):
227 |         response = self.client.embeddings.create(
228 |             input=[target]+references,
229 |             model="text-embedding-3-large"
230 |         )
231 |         nparray1 = np.array(response.data[0].embedding)
232 | 
233 |         full_sim = []
234 |         for i in range(1, len(response.data)):
235 |             nparray2 = np.array(response.data[i].embedding)
236 |             cosine_similarity = np.dot(nparray1, nparray2) / (np.linalg.norm(nparray1) * np.linalg.norm(nparray2))
237 |             full_sim.append(cosine_similarity)
238 |     
239 |         return full_sim
240 | 
241 |     def _corpus_filter_poor_highlights(self, corpus_id, synth_questions_df, threshold):
242 |         corpus_questions_df = synth_questions_df[synth_questions_df['corpus_id'] == corpus_id]
243 | 
244 |         def edit_row(row):
245 |             question = row['question']
246 |             references = [ref['content'] for ref in row['references']]
247 |             similarity_scores = self._get_sim(question, references)
248 |             worst_ref_score = min(similarity_scores)
249 |             row['worst_ref_score'] = worst_ref_score
250 |             return row
251 | 
252 |         # Apply the function to each row
253 |         corpus_questions_df = corpus_questions_df.apply(edit_row, axis=1)
254 | 
255 |         count_before = len(corpus_questions_df)
256 | 
257 |         corpus_questions_df = corpus_questions_df[corpus_questions_df['worst_ref_score'] >= threshold]
258 |         corpus_questions_df = corpus_questions_df.drop(columns=['worst_ref_score'])
259 | 
260 |         count_after = len(corpus_questions_df)
261 | 
262 |         print(f"Corpus: {corpus_id} - Removed {count_before - count_after} .")
263 | 
264 |         corpus_questions_df['references'] = corpus_questions_df['references'].apply(json.dumps)
265 | 
266 |         full_questions_df = pd.read_csv(self.questions_csv_path)
267 |         full_questions_df = full_questions_df[full_questions_df['corpus_id'] != corpus_id]
268 | 
269 |         full_questions_df = pd.concat([full_questions_df, corpus_questions_df], ignore_index=True)
270 |         # Drop the columns 'fixed', 'worst_ref_score' and 'diff_score' if they exist
271 |         for col in ['fixed', 'worst_ref_score', 'diff_score']:
272 |             if col in full_questions_df.columns:
273 |                 full_questions_df = full_questions_df.drop(columns=col)
274 | 
275 |         full_questions_df.to_csv(self.questions_csv_path, index=False)
276 | 
277 | 
278 |     def filter_poor_excerpts(self, threshold=0.36, corpora_subset=[]):
279 |         if os.path.exists(self.questions_csv_path):
280 |             synth_questions_df = pd.read_csv(self.questions_csv_path)
281 |             if len(synth_questions_df) > 0:
282 |                 synth_questions_df['references'] = synth_questions_df['references'].apply(json.loads)
283 |                 corpus_list = synth_questions_df['corpus_id'].unique().tolist()
284 |                 if corpora_subset:
285 |                     corpus_list = [c for c in corpus_list if c in corpora_subset]
286 |                 for corpus_id in corpus_list:
287 |                     self._corpus_filter_poor_highlights(corpus_id, synth_questions_df, threshold)
288 | 
289 |     def _corpus_filter_duplicates(self, corpus_id, synth_questions_df, threshold):
290 |         corpus_questions_df = synth_questions_df[synth_questions_df['corpus_id'] == corpus_id].copy()
291 | 
292 |         count_before = len(corpus_questions_df)
293 | 
294 |         corpus_questions_df.drop_duplicates(subset='question', keep='first', inplace=True)
295 | 
296 |         questions = corpus_questions_df['question'].tolist()
297 | 
298 |         response = self.client.embeddings.create(
299 |             input=questions,
300 |             model="text-embedding-3-large"
301 |         )
302 | 
303 |         embeddings_matrix = np.array([data.embedding for data in response.data])
304 | 
305 |         dot_product_matrix = np.dot(embeddings_matrix, embeddings_matrix.T)
306 | 
307 |         # Create a list of tuples containing the index pairs and their similarity
308 |         similarity_pairs = [(i, j, dot_product_matrix[i][j]) for i in range(len(dot_product_matrix)) for j in range(i+1, len(dot_product_matrix))]
309 | 
310 |         # Sort the list of tuples based on the similarity in descending order
311 |         similarity_pairs.sort(key=lambda x: x[2], reverse=True)
312 | 
313 |         similarity_scores = np.array([x[2] for x in similarity_pairs])
314 | 
315 |         most_similars = (dot_product_matrix - np.eye(dot_product_matrix.shape[0])).max(axis=1)
316 | 
317 |         def filter_vectors(sim_matrix, threshold):
318 |             n = sim_matrix.shape[0]  # Number of vectors
319 |             remaining = np.ones(n, dtype=bool)  # Initialize all vectors as remaining
320 | 
321 |             for i in range(n):
322 |                 if remaining[i] == 1:  # Only check for vectors that are still remaining
323 |                     for j in range(i+1, n):
324 |                         if remaining[j] == 1 and sim_matrix[i, j] > threshold:
325 |                             remaining[j] = 0  # Remove vector j because it's too similar to vector i
326 |             
327 |             return remaining
328 | 
329 |         rows_to_keep = filter_vectors(dot_product_matrix, threshold)
330 | 
331 |         corpus_questions_df = corpus_questions_df[rows_to_keep]
332 | 
333 |         count_after = len(corpus_questions_df)
334 | 
335 |         print(f"Corpus: {corpus_id} - Removed {count_before - count_after} .")
336 | 
337 | 
338 |         corpus_questions_df['references'] = corpus_questions_df['references'].apply(json.dumps)
339 | 
340 |         full_questions_df = pd.read_csv(self.questions_csv_path)
341 |         full_questions_df = full_questions_df[full_questions_df['corpus_id'] != corpus_id]
342 | 
343 |         full_questions_df = pd.concat([full_questions_df, corpus_questions_df], ignore_index=True)
344 |         # Drop the columns 'fixed', 'worst_ref_score' and 'diff_score' if they exist
345 |         for col in ['fixed', 'worst_ref_score', 'diff_score']:
346 |             if col in full_questions_df.columns:
347 |                 full_questions_df = full_questions_df.drop(columns=col)
348 | 
349 |         full_questions_df.to_csv(self.questions_csv_path, index=False)
350 | 
351 | 
352 | 
353 |     def filter_duplicates(self, threshold=0.78, corpora_subset=[]):
354 |         if os.path.exists(self.questions_csv_path):
355 |             synth_questions_df = pd.read_csv(self.questions_csv_path)
356 |             if len(synth_questions_df) > 0:
357 |                 synth_questions_df['references'] = synth_questions_df['references'].apply(json.loads)
358 |                 corpus_list = synth_questions_df['corpus_id'].unique().tolist()
359 |                 if corpora_subset:
360 |                     corpus_list = [c for c in corpus_list if c in corpora_subset]
361 |                 for corpus_id in corpus_list:
362 |                     self._corpus_filter_duplicates(corpus_id, synth_questions_df, threshold)
363 | 
364 | 
365 |     def question_ref_filter(self):
366 |         self.synth_questions_df = self._get_synth_questions_df()


--------------------------------------------------------------------------------
/chunking_evaluation/utils.py:
--------------------------------------------------------------------------------
  1 | from enum import Enum
  2 | import re
  3 | from fuzzywuzzy import fuzz
  4 | from fuzzywuzzy import process
  5 | import os
  6 | from chromadb.utils import embedding_functions
  7 | import tiktoken
  8 | 
  9 | def find_query_despite_whitespace(document, query):
 10 | 
 11 |     # Normalize spaces and newlines in the query
 12 |     normalized_query = re.sub(r'\s+', ' ', query).strip()
 13 |     
 14 |     # Create a regex pattern from the normalized query to match any whitespace characters between words
 15 |     pattern = r'\s*'.join(re.escape(word) for word in normalized_query.split())
 16 |     
 17 |     # Compile the regex to ignore case and search for it in the document
 18 |     regex = re.compile(pattern, re.IGNORECASE)
 19 |     match = regex.search(document)
 20 |     
 21 |     if match:
 22 |         return document[match.start(): match.end()], match.start(), match.end()
 23 |     else:
 24 |         return None
 25 | 
 26 | def rigorous_document_search(document: str, target: str):
 27 |     """
 28 |     This function performs a rigorous search of a target string within a document. 
 29 |     It handles issues related to whitespace, changes in grammar, and other minor text alterations.
 30 |     The function first checks for an exact match of the target in the document. 
 31 |     If no exact match is found, it performs a raw search that accounts for variations in whitespace.
 32 |     If the raw search also fails, it splits the document into sentences and uses fuzzy matching 
 33 |     to find the sentence that best matches the target.
 34 |     
 35 |     Args:
 36 |         document (str): The document in which to search for the target.
 37 |         target (str): The string to search for within the document.
 38 | 
 39 |     Returns:
 40 |         tuple: A tuple containing the best match found in the document, its start index, and its end index.
 41 |         If no match is found, returns None.
 42 |     """
 43 |     if target.endswith('.'):
 44 |         target = target[:-1]
 45 |     
 46 |     if target in document:
 47 |         start_index = document.find(target)
 48 |         end_index = start_index + len(target)
 49 |         return target, start_index, end_index
 50 |     else:
 51 |         raw_search = find_query_despite_whitespace(document, target)
 52 |         if raw_search is not None:
 53 |             return raw_search
 54 | 
 55 |     # Split the text into sentences
 56 |     sentences = re.split(r'[.!?]\s*|\n', document)
 57 | 
 58 |     # Find the sentence that matches the query best
 59 |     best_match = process.extractOne(target, sentences, scorer=fuzz.token_sort_ratio)
 60 | 
 61 |     if best_match[1] < 98:
 62 |         return None
 63 |     
 64 |     reference = best_match[0]
 65 | 
 66 |     start_index = document.find(reference)
 67 |     end_index = start_index + len(reference)
 68 | 
 69 |     return reference, start_index, end_index
 70 | 
 71 | def get_openai_embedding_function():
 72 |     openai_api_key = os.getenv('OPENAI_API_KEY')
 73 |     if openai_api_key is None:
 74 |         raise ValueError("You need to set an embedding function or set an OPENAI_API_KEY environment variable.")
 75 |     embedding_function = embedding_functions.OpenAIEmbeddingFunction(
 76 |         api_key=os.getenv('OPENAI_API_KEY'),
 77 |         model_name="text-embedding-3-large"
 78 |     )
 79 |     return embedding_function
 80 | 
 81 | # Count the number of tokens in each page_content
 82 | def openai_token_count(string: str) -> int:
 83 |     """Returns the number of tokens in a text string."""
 84 |     encoding = tiktoken.get_encoding("cl100k_base")
 85 |     num_tokens = len(encoding.encode(string, disallowed_special=()))
 86 |     return num_tokens
 87 | 
 88 | class Language(str, Enum):
 89 |     """Enum of the programming languages."""
 90 | 
 91 |     CPP = "cpp"
 92 |     GO = "go"
 93 |     JAVA = "java"
 94 |     KOTLIN = "kotlin"
 95 |     JS = "js"
 96 |     TS = "ts"
 97 |     PHP = "php"
 98 |     PROTO = "proto"
 99 |     PYTHON = "python"
100 |     RST = "rst"
101 |     RUBY = "ruby"
102 |     RUST = "rust"
103 |     SCALA = "scala"
104 |     SWIFT = "swift"
105 |     MARKDOWN = "markdown"
106 |     LATEX = "latex"
107 |     HTML = "html"
108 |     SOL = "sol"
109 |     CSHARP = "csharp"
110 |     COBOL = "cobol"
111 |     C = "c"
112 |     LUA = "lua"
113 |     PERL = "perl"


--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
 1 | from setuptools import setup, find_packages
 2 | 
 3 | setup(
 4 |     name="chunking_evaluation",
 5 |     version="0.1.0",
 6 |     packages=find_packages(),
 7 |     install_requires=[
 8 |         "tiktoken",
 9 |         "fuzzywuzzy",
10 |         "pandas",
11 |         "numpy",
12 |         "tqdm",
13 |         "chromadb",
14 |         "python-Levenshtein",
15 |         "openai",
16 |         "anthropic",
17 |         "attrs"
18 |     ],
19 |     author="Brandon A. Smith",
20 |     author_email="brandonsmithpmpuk@gmail.com",
21 |     description="A package to evaluate multiple chunking methods. It also provides two new chunking methods.",
22 |     long_description=open("README.md").read(),
23 |     long_description_content_type="text/markdown",
24 |     url="https://github.com/yourusername/chunking_evaluation",
25 |     classifiers=[
26 |         "Programming Language :: Python :: 3",
27 |         "License :: OSI Approved :: MIT License",
28 |         "Operating System :: OS Independent",
29 |     ],
30 |     include_package_data=True,
31 |     package_data={
32 |         'chunking_evaluation': ['evaluation_framework/general_evaluation_data/**/*', 'evaluation_framework/prompts/**/*']
33 |     },
34 |     python_requires='>=3.6',
35 | )
36 | 


--------------------------------------------------------------------------------