├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── chunking_evaluation ├── __init__.py ├── chunking │ ├── __init__.py │ ├── base_chunker.py │ ├── cluster_semantic_chunker.py │ ├── fixed_token_chunker.py │ ├── kamradt_modified_chunker.py │ ├── llm_semantic_chunker.py │ └── recursive_token_chunker.py ├── evaluation_framework │ ├── __init__.py │ ├── base_evaluation.py │ ├── general_evaluation.py │ ├── general_evaluation_data │ │ ├── corpora │ │ │ ├── chatlogs.md │ │ │ ├── finance.md │ │ │ ├── pubmed.md │ │ │ ├── state_of_the_union.md │ │ │ └── wikitexts.md │ │ ├── questions_db │ │ │ ├── 633a2ec9-d034-4db6-acda-0c784ceaa32b │ │ │ │ ├── data_level0.bin │ │ │ │ ├── header.bin │ │ │ │ ├── length.bin │ │ │ │ └── link_lists.bin │ │ │ ├── bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2 │ │ │ │ ├── data_level0.bin │ │ │ │ ├── header.bin │ │ │ │ ├── length.bin │ │ │ │ └── link_lists.bin │ │ │ ├── chroma.sqlite3 │ │ │ └── daae47eb-a4bf-41ec-b4e7-d7f902773aeb │ │ │ │ ├── data_level0.bin │ │ │ │ ├── header.bin │ │ │ │ ├── length.bin │ │ │ │ └── link_lists.bin │ │ └── questions_df.csv │ ├── prompts │ │ ├── question_maker_approx_system.txt │ │ ├── question_maker_approx_user.txt │ │ ├── question_maker_system.txt │ │ └── question_maker_user.txt │ └── synthetic_evaluation.py └── utils.py └── setup.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/LICENSE -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/MANIFEST.in -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/README.md -------------------------------------------------------------------------------- /chunking_evaluation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/__init__.py -------------------------------------------------------------------------------- /chunking_evaluation/chunking/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/chunking/__init__.py -------------------------------------------------------------------------------- /chunking_evaluation/chunking/base_chunker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/chunking/base_chunker.py -------------------------------------------------------------------------------- /chunking_evaluation/chunking/cluster_semantic_chunker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/chunking/cluster_semantic_chunker.py -------------------------------------------------------------------------------- /chunking_evaluation/chunking/fixed_token_chunker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/chunking/fixed_token_chunker.py -------------------------------------------------------------------------------- /chunking_evaluation/chunking/kamradt_modified_chunker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/chunking/kamradt_modified_chunker.py -------------------------------------------------------------------------------- /chunking_evaluation/chunking/llm_semantic_chunker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/chunking/llm_semantic_chunker.py -------------------------------------------------------------------------------- /chunking_evaluation/chunking/recursive_token_chunker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/chunking/recursive_token_chunker.py -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/base_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/base_evaluation.py -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation.py -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/chatlogs.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/chatlogs.md -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/finance.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/finance.md -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/pubmed.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/pubmed.md -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/state_of_the_union.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/state_of_the_union.md -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/wikitexts.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/corpora/wikitexts.md -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/data_level0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/data_level0.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/header.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/header.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/length.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/length.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/633a2ec9-d034-4db6-acda-0c784ceaa32b/link_lists.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/data_level0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/data_level0.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/header.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/header.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/length.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/length.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/bfc1cdb1-8697-49a8-a1ae-a1459d98f1a2/link_lists.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/chroma.sqlite3: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/chroma.sqlite3 -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/data_level0.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/data_level0.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/header.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/header.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/length.bin: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/length.bin -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_db/daae47eb-a4bf-41ec-b4e7-d7f902773aeb/link_lists.bin: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/general_evaluation_data/questions_df.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/general_evaluation_data/questions_df.csv -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/prompts/question_maker_approx_system.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/prompts/question_maker_approx_system.txt -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/prompts/question_maker_approx_user.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/prompts/question_maker_approx_user.txt -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/prompts/question_maker_system.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/prompts/question_maker_system.txt -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/prompts/question_maker_user.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/prompts/question_maker_user.txt -------------------------------------------------------------------------------- /chunking_evaluation/evaluation_framework/synthetic_evaluation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/evaluation_framework/synthetic_evaluation.py -------------------------------------------------------------------------------- /chunking_evaluation/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/chunking_evaluation/utils.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/brandonstarxel/chunking_evaluation/HEAD/setup.py --------------------------------------------------------------------------------