├── .gitignore
├── LICENSE
├── LLMChunkizer.ipynb
├── LLMChunkizerLib
│   ├── __init__.py
│   └── chunkizer.py
├── README.md
├── docs
│   └── WEB_Article_Efficient_Document_Chunking_Using_LLMs_Unlocking_Knowledge_One_Block_at_a_Time_by_Carlo_Peron_Oct_2024_TowardsDataScience.pdf
└── setup.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | share/python-wheels/
24 | *.egg-info/
25 | .installed.cfg
26 | *.egg
27 | MANIFEST
28 | tests/
29 | z_test.py
30 |
31 | # PyInstaller
32 | # Usually these files are written by a python script from a template
33 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
34 | *.manifest
35 | *.spec
36 |
37 | # Installer logs
38 | pip-log.txt
39 | pip-delete-this-directory.txt
40 |
41 | # Unit test / coverage reports
42 | htmlcov/
43 | .tox/
44 | .nox/
45 | .coverage
46 | .coverage.*
47 | .cache
48 | nosetests.xml
49 | coverage.xml
50 | *.cover
51 | *.py,cover
52 | .hypothesis/
53 | .pytest_cache/
54 | cover/
55 |
56 | # Translations
57 | *.mo
58 | *.pot
59 |
60 | # Django stuff:
61 | *.log
62 | local_settings.py
63 | db.sqlite3
64 | db.sqlite3-journal
65 |
66 | # Flask stuff:
67 | instance/
68 | .webassets-cache
69 |
70 | # Scrapy stuff:
71 | .scrapy
72 |
73 | # Sphinx documentation
74 | docs/_build/
75 |
76 | # PyBuilder
77 | .pybuilder/
78 | target/
79 |
80 | # Jupyter Notebook
81 | .ipynb_checkpoints
82 |
83 | # IPython
84 | profile_default/
85 | ipython_config.py
86 |
87 | # pyenv
88 | # For a library or package, you might want to ignore these files since the code is
89 | # intended to run in multiple environments; otherwise, check them in:
90 | # .python-version
91 |
92 | # pipenv
93 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
94 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
95 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
96 | # install all needed dependencies.
97 | #Pipfile.lock
98 |
99 | # poetry
100 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
101 | # This is especially recommended for binary packages to ensure reproducibility, and is more
102 | # commonly ignored for libraries.
103 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
104 | #poetry.lock
105 |
106 | # pdm
107 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
108 | #pdm.lock
109 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
110 | # in version control.
111 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
112 | .pdm.toml
113 | .pdm-python
114 | .pdm-build/
115 |
116 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
117 | __pypackages__/
118 |
119 | # Celery stuff
120 | celerybeat-schedule
121 | celerybeat.pid
122 |
123 | # SageMath parsed files
124 | *.sage.py
125 |
126 | # Environments
127 | .env
128 | .venv
129 | env/
130 | venv/
131 | ENV/
132 | env.bak/
133 | venv.bak/
134 |
135 | # Spyder project settings
136 | .spyderproject
137 | .spyproject
138 |
139 | # Rope project settings
140 | .ropeproject
141 |
142 | # mkdocs documentation
143 | /site
144 |
145 | # mypy
146 | .mypy_cache/
147 | .dmypy.json
148 | dmypy.json
149 |
150 | # Pyre type checker
151 | .pyre/
152 |
153 | # pytype static type analyzer
154 | .pytype/
155 |
156 | # Cython debug symbols
157 | cython_debug/
158 |
159 | # PyCharm
160 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
161 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
162 | # and can be added to the global gitignore or merged into this file. For a more nuclear
163 | # option (not recommended) you can uncomment the following to ignore the entire idea folder.
164 | #.idea/
165 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 peronc
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/LLMChunkizer.ipynb:
--------------------------------------------------------------------------------
1 | {
2 |  "cells": [
3 |   {
4 |    "cell_type": "markdown",
5 |    "metadata": {},
6 |    "source": [
7 |     "# Large Language Model Chunkizer\n",
8 |     "## Introduction\n",
9 |     "In this notebook, I demonstrate how __LLMChunkizerLib__ leverages a Large Language Model (LLM) to split text (even from large documents) into coherent chunks that preserve the same concept or idea."
10 |    ]
11 |   },
12 |   {
13 |    "cell_type": "markdown",
14 |    "metadata": {},
15 |    "source": [
16 |     "## Import Library\n",
17 |     "This notebook leverages LangChain and an OpenAI model deployed on Azure.\n",
18 |     "\n",
19 |     "First, we import the necessary standard libraries, including os, langchain, and dotenv.\n",
20 |     "\n",
21 |     "Next, we import my llm_chunkizer class, which provides several static methods essential for splitting documents."
22 |    ]
23 |   },
24 |   {
25 |    "cell_type": "code",
26 |    "execution_count": 1,
27 |    "metadata": {},
28 |    "outputs": [],
29 |    "source": [
30 |     "import os\n",
31 |     "from langchain_openai.chat_models.azure import AzureChatOpenAI\n",
32 |     "from dotenv import load_dotenv\n",
33 |     "from LLMChunkizerLib.chunkizer import llm_chunkizer"
34 |    ]
35 |   },
36 |   {
37 |    "cell_type": "markdown",
38 |    "metadata": {},
39 |    "source": [
40 |     "## Setting variables\n",
41 |     "Next, we load the environment variables required to use Azure OpenAI."
42 |    ]
43 |   },
44 |   {
45 |    "cell_type": "code",
46 |    "execution_count": 2,
47 |    "metadata": {},
48 |    "outputs": [],
49 |    "source": [
50 |     "load_dotenv()\n",
51 |     "azure_deployment = os.getenv(\"AZURE_DEPLOYMENT\")\n",
52 |     "temperature = float(os.getenv(\"TEMPERATURE\"))\n",
53 |     "api_key = os.getenv(\"AZURE_OPENAI_API_KEY\")\n",
54 |     "endpoint = os.getenv(\"AZURE_OPENAI_ENDPOINT\")\n",
55 |     "api_version = os.getenv(\"API_VERSION\")"
56 |    ]
57 |   },
58 |   {
59 |    "cell_type": "markdown",
60 |    "metadata": {},
61 |    "source": [
62 |     "## Define the document\n",
63 |     "In a real-world scenario, I obtain paragraphs from a 30-page Word document. However, to simplify this example, I will create a list containing four paragraphs from __Around the World in Eighty Days__."
64 |    ]
65 |   },
66 |   {
67 |    "cell_type": "code",
68 |    "execution_count": 3,
69 |    "metadata": {},
70 |    "outputs": [],
71 |    "source": [
72 |     "documents = [ \n",
73 |     " \"\"\"On October 2, 1872, Phileas Fogg, an English gentleman, left London for an extraordinary journey. \n",
74 |     "\tHe had wagered that he could circumnavigate the globe in just eighty days. \n",
75 |     "\tFogg was a man of strict habits and a very methodical life; everything was planned down to the smallest detail, and nothing was left to chance.\n",
76 |     "\tHe departed London on a train to Dover, then crossed the Channel by ship. His journey took him through many countries, \n",
77 |     "\tincluding France, India, Japan, and America. At each stop, he encountered various people and faced countless adventures, but his determination never wavered.\"\"\",\n",
78 |     "\n",
79 |     " \"\"\"However, time was his enemy, and any delay risked losing the bet. With the help of his faithful servant Passepartout, Fogg had to face \n",
80 |     "\tunexpected obstacles and dangerous situations.\"\"\",\n",
81 |     "\t\"\"\"Yet, each time, his cunning and indomitable spirit guided him to victory, while the world watched in disbelief.\"\"\",\n",
82 |     "\n",
83 |     " \"\"\"With one final effort, Fogg and Passepartout reached London just in time to prove that they had completed their journey in less than eighty days. \n",
84 |     "\tThis extraordinary adventurer not only won the bet but also discovered that the true treasure was the friendship and experiences he had accumulated along the way.\"\"\"\n",
85 |     "]"
86 |    ]
87 |   },
88 |   {
89 |    "cell_type": "markdown",
90 |    "metadata": {},
91 |    "source": [
92 |     "## Initialize the LLM\n",
93 |     "Now I create an Azure OpenAI GPT-4o client using the variables loaded above."
" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 4, 99 | "metadata": {}, 100 | "outputs": [], 101 | "source": [ 102 | "# Initialize the LLM\n", 103 | "llm = AzureChatOpenAI(api_key=api_key, azure_endpoint=endpoint, azure_deployment=azure_deployment, api_version=api_version,temperature=temperature)" 104 | ] 105 | }, 106 | { 107 | "cell_type": "markdown", 108 | "metadata": {}, 109 | "source": [ 110 | "##Block creation\n", 111 | "Now, I need to take the paragraphs and transform them into blocks of text, each with a maximum size of 200 tokens. The block size is arbitrary, and in a real-world scenario, I typically consider block sizes ranging from 3,000 to 5,000 tokens." 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 7, 117 | "metadata": {}, 118 | "outputs": [], 119 | "source": [ 120 | "refined_blocks = llm_chunkizer.split_document_into_blocks(documents, 200)" 121 | ] 122 | }, 123 | { 124 | "cell_type": "markdown", 125 | "metadata": {}, 126 | "source": [ 127 | "Print blocks" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": {}, 134 | "outputs": [], 135 | "source": [ 136 | "for idx, block in enumerate(refined_blocks):\n", 137 | " if (block.strip() != ''):\n", 138 | " print(f\"{idx}: {block}\")" 139 | ] 140 | }, 141 | { 142 | "cell_type": "markdown", 143 | "metadata": {}, 144 | "source": [ 145 | "#Chunkize blocks\n", 146 | "\n", 147 | "In the chunk_text_with_llm function, I split the block into chunks and address the potential issue of adjacent paragraphs that convey the same idea but were initially separated into distinct blocks.\n", 148 | "This is important because splitting related information can lead to a loss of context and negatively affect the understanding of the content when processed by the model.\n", 149 | "To mitigate this, I take the last two chunks (if they exist) derived from the current block and append them to the beginning of the next block before analyzing it.\n", 150 | "This ensures that related concepts are kept together, preserving their context and improving the overall coherence of the information. This process is repeated for all remaining blocks. " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": null, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "final_chunks = llm_chunkizer.chunk_text_with_llm(llm, refined_blocks)\n", 160 | "for idx, chunk in enumerate(final_chunks):\n", 161 | " if (chunk.strip() != ''):\n", 162 | " print(f\"{idx}: {chunk}\")" 163 | ] 164 | }, 165 | { 166 | "cell_type": "markdown", 167 | "metadata": {}, 168 | "source": [ 169 | "You can see how the database has been split into six distinct chunks.\n", 170 | "\n", 171 | "0: On October 2, 1872, Phileas Fogg, an English gentleman, left London for an extraordinary journey. He had wagered that he could circumnavigate the globe in just eighty days. Fogg was a man of strict habits and a very methodical life; everything was planned down to the smallest detail, and nothing was left to chance. \n", 172 | "1: He departed London on a train to Dover, then crossed the Channel by ship. His journey took him through many countries, including France, India, Japan, and America. At each stop, he encountered various people and faced countless adventures, but his determination never wavered. \n", 173 | "2: However, time was his enemy, and any delay risked losing the bet. 
174 |     "3: Yet, each time, his cunning and indomitable spirit guided him to victory, while the world watched in disbelief.\n",
175 |     "4: With one final effort, Fogg and Passepartout reached London just in time to prove that they had completed their journey in less than eighty days.\n",
176 |     "5: This extraordinary adventurer not only won the bet but also discovered that the true treasure was the friendship and experiences he had accumulated along the way."
177 |    ]
178 |   },
179 |   {
180 |    "cell_type": "markdown",
181 |    "metadata": {},
182 |    "source": [
183 |     "Let's see what happens when I split the original document into larger blocks."
184 |    ]
185 |   },
186 |   {
187 |    "cell_type": "code",
188 |    "execution_count": null,
189 |    "metadata": {},
190 |    "outputs": [],
191 |    "source": [
192 |     "refined_blocks = llm_chunkizer.split_document_into_blocks(documents, 1000)\n",
193 |     "\n",
194 |     "final_chunks = llm_chunkizer.chunk_text_with_llm(llm, refined_blocks)\n",
195 |     "for idx, chunk in enumerate(final_chunks):\n",
196 |     "    if (chunk.strip() != ''):\n",
197 |     "        print(f\"{idx}: {chunk}\")"
198 |    ]
199 |   },
200 |   {
201 |    "cell_type": "markdown",
202 |    "metadata": {},
203 |    "source": [
204 |     "With a larger block size, the system generates 4 chunks instead of 6. This behavior is expected, as the prompt responsible for dividing the text into chunks analyzed a larger portion of text at once and was able to create fewer chunks by using more text to represent a single concept.\n",
205 |     "\n",
206 |     "0: On October 2, 1872, Phileas Fogg, an English gentleman, left London for an extraordinary journey. He had wagered that he could circumnavigate the globe in just eighty days. Fogg was a man of strict habits and a very methodical life; everything was planned down to the smallest detail, and nothing was left to chance. \n",
207 |     "1: He departed London on a train to Dover, then crossed the Channel by ship. His journey took him through many countries, including France, India, Japan, and America. At each stop, he encountered various people and faced countless adventures, but his determination never wavered.\n",
208 |     "2: However, time was his enemy, and any delay risked losing the bet. With the help of his faithful servant Passepartout, Fogg had to face unexpected obstacles and dangerous situations. Yet, each time, his cunning and indomitable spirit guided him to victory, while the world watched in disbelief.\n",
209 |     "3: With one final effort, Fogg and Passepartout reached London just in time to prove that they had completed their journey in less than eighty days. This extraordinary adventurer not only won the bet but also discovered that the true treasure was the friendship and experiences he had accumulated along the way.\n"
210 |    ]
211 |   },
212 |   {
213 |    "cell_type": "markdown",
214 |    "metadata": {},
215 |    "source": [
216 |     "# Final Thoughts\n",
217 |     "Ultimately, it's important to perform multiple chunking attempts, varying the block size passed to the chunkizer each time, and to review the results after each attempt to determine which block size best suits the desired outcome."
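218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "markdown",
222 |    "metadata": {},
223 |    "source": [
224 |     "As a rough sketch (this cell is illustrative and not part of the library), the comparison suggested above can be automated by looping over several block sizes, reusing the `documents` and `llm` objects defined earlier in this notebook."
225 |    ]
226 |   },
227 |   {
228 |    "cell_type": "code",
229 |    "execution_count": null,
230 |    "metadata": {},
231 |    "outputs": [],
232 |    "source": [
233 |     "# Illustrative sketch: compare several block sizes and inspect the resulting chunk counts.\n",
234 |     "# Assumes `documents` and `llm` from the cells above are still in scope.\n",
235 |     "for size in [200, 500, 1000]:\n",
236 |     "    blocks = llm_chunkizer.split_document_into_blocks(documents, size)\n",
237 |     "    chunks = llm_chunkizer.chunk_text_with_llm(llm, blocks)\n",
238 |     "    print(f\"block size {size}: {len(blocks)} blocks -> {len(chunks)} chunks\")"
239 |    ]
240 |   }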
" 218 | ] 219 | } 220 | ], 221 | "metadata": { 222 | "kernelspec": { 223 | "display_name": "Python 3", 224 | "language": "python", 225 | "name": "python3" 226 | }, 227 | "language_info": { 228 | "codemirror_mode": { 229 | "name": "ipython", 230 | "version": 3 231 | }, 232 | "file_extension": ".py", 233 | "mimetype": "text/x-python", 234 | "name": "python", 235 | "nbconvert_exporter": "python", 236 | "pygments_lexer": "ipython3", 237 | "version": "3.12.2" 238 | } 239 | }, 240 | "nbformat": 4, 241 | "nbformat_minor": 2 242 | } 243 | -------------------------------------------------------------------------------- /LLMChunkizerLib/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/peronc/LLMChunkizer/03e8b25fa2dbcec3bddd29f27776bb5d11074595/LLMChunkizerLib/__init__.py -------------------------------------------------------------------------------- /LLMChunkizerLib/chunkizer.py: -------------------------------------------------------------------------------- 1 | from langchain_openai.chat_models.azure import AzureChatOpenAI 2 | import tiktoken 3 | 4 | class llm_chunkizer: 5 | @staticmethod 6 | # Function to estimate token count for a text block 7 | def estimate_token_count(text: str, tokenizer_model: str = "gpt-4"): 8 | """ 9 | Given a text we estimate token need 10 | Args: 11 | text (str): text for which to estimate needed tokens 12 | tokenizer_model (str, optional): tokenizer to use for estimation . Defaults to "gpt-4". 13 | """ 14 | encoding = tiktoken.encoding_for_model(tokenizer_model) 15 | return len(encoding.encode(text)) 16 | 17 | @staticmethod 18 | # Function to split document into large blocks of a specific token size 19 | def split_document_into_blocks(paragraphs, block_token_limit: int = 5000): 20 | """ 21 | The main idea is to extract text from a document as paragraphs or sections from the original content. 22 | Then, we create a list of text blocks, each formatted to fit within the token limit allowed 23 | by the LLM for prompts (in this case, defined by block_token_limit). 24 | Args: 25 | paragraphs: an array of strings that represent the paragraph of a document 26 | block_token_limit (int, optional): max block size in token. Defaults to 5000. 27 | """ 28 | blocks = [] 29 | current_block = "" 30 | current_token_count = 0 31 | 32 | for paragraph in paragraphs: 33 | paragraph_token_count = llm_chunkizer.estimate_token_count(paragraph) 34 | # If adding this paragraph exceeds the token limit, finalize the current block 35 | if current_token_count + paragraph_token_count > block_token_limit: 36 | blocks.append(current_block.strip()) 37 | # Start new block with current paragraph 38 | current_block = paragraph 39 | current_token_count = paragraph_token_count 40 | else: 41 | current_block += paragraph.strip() + "\n" 42 | current_token_count += paragraph_token_count 43 | 44 | # Add any remaining text as the final block 45 | if current_block: 46 | blocks.append(current_block.strip()) 47 | 48 | return blocks 49 | 50 | @staticmethod 51 | # Function to chunk text using LLM 52 | def chunk_text_with_llm(llm: AzureChatOpenAI, blocks): 53 | """ 54 | In this function we take an array with blocks of text and ask the LLM to divide them into self-consistent chunks 55 | using a prompt that ask to keep toghether text that express a concrete idea. 56 | 57 | Each block used for input, was created by putting together paragraphs extracted from a text document. 
59 |         When grouping paragraphs into blocks, we tried to respect the maximum token length allowed for the prompt,
60 |         without considering that the content at the end of one block and the beginning of the next
61 |         might need to be kept together because they deal with a single topic.
62 |
63 |         We address the potential issue of adjacent paragraphs that convey the same idea being split into separate blocks.
64 |         This is important because splitting related information can lead to a loss of context and negatively affect
65 |         the understanding of the content when processed by the model.
66 |         To mitigate this, we take the last two chunks (if they exist) from the current block and append them
67 |         to the beginning of the next block before analyzing it.
68 |         This ensures that related concepts are kept together, preserving their context and improving the overall
69 |         coherence of the information.
70 |         This process is repeated for all remaining blocks.
71 |         Args:
72 |             llm (AzureChatOpenAI): Azure OpenAI model
73 |             blocks (list[str]): a list of blocks to split into consistent chunks
74 |
75 |         Returns:
76 |             list[str]: a list of chunks, each expressing a single idea
77 |         """
78 |         final_chunks = []
79 |         last_chunk = ""
80 |         last_chunk_2 = ""
81 |         last_chunk_1 = ""
82 |
83 |         for block in blocks:
84 |             text = last_chunk + "\n" + block
85 |             prompt = [
86 |                 {"role": "system", "content": "You are an assistant that helps divide documents into logical chunks based on complete ideas."},
87 |                 {"role": "user", "content": f"Please split the following text into logical chunks, using '!-!-!-!-!-!-!-!-!-!-!' to separate them. \n\n{text}"}
88 |             ]
89 |             # Invoke the LLM with the prompt
90 |             response = llm.invoke(prompt)
91 |             text_to_split = response.content
92 |             # Split the text on '!-!-!-!-!-!-!-!-!-!-!'; each element of split_array is a self-consistent chunk
93 |             split_array = text_to_split.split('!-!-!-!-!-!-!-!-!-!-!')
94 |
95 |             # Retain the last two chunks of this block and carry them over to the next one
96 |             last_chunk_1 = split_array.pop() if split_array else ""
97 |             last_chunk_2 = split_array.pop() if split_array else ""
98 |             last_chunk = last_chunk_2 + "\n" + last_chunk_1
99 |
100 |             final_chunks.extend(split_array)
101 |
102 |         # Append the two chunks carried over from the final block
103 |         if last_chunk_2:
104 |             final_chunks.append(last_chunk_2)
105 |         if last_chunk_1:
106 |             final_chunks.append(last_chunk_1)
107 |         return final_chunks
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Large Language Model Chunkizer
2 | In this repository we use Azure OpenAI GPT-4o to split a document into coherent chunks based on the concept of an "idea".
3 |
4 | When splitting blocks into chunks, we tackle the potential problem of adjacent paragraphs that express the same idea but are initially separated into distinct blocks.
5 |
6 | This is important because splitting related information can lead to a loss of context and negatively affect the understanding of the content when processed by the model.
7 |
8 | To mitigate this, we take the last two chunks (if they exist) derived from the current block and append them to the beginning of the next block before analyzing it.
9 |
10 | This ensures that related concepts are kept together, preserving their context and improving the overall coherence of the information.
11 |
12 | This process is repeated for all remaining blocks.
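13 |
14 | As a minimal, illustrative sketch (mirroring the notebook: the environment-variable names and the 3,000-token block size are assumptions to adapt to your own Azure setup), the library can be used like this:
15 |
16 | ```python
17 | import os
18 | from dotenv import load_dotenv
19 | from langchain_openai.chat_models.azure import AzureChatOpenAI
20 | from LLMChunkizerLib.chunkizer import llm_chunkizer
21 |
22 | load_dotenv()
23 |
24 | # The variable names below mirror the notebook; adapt them to your Azure OpenAI resource.
25 | llm = AzureChatOpenAI(
26 |     api_key=os.getenv("AZURE_OPENAI_API_KEY"),
27 |     azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
28 |     azure_deployment=os.getenv("AZURE_DEPLOYMENT"),
29 |     api_version=os.getenv("API_VERSION"),
30 |     temperature=0,  # illustrative choice; the notebook reads TEMPERATURE from the environment
31 | )
32 |
33 | paragraphs = ["First paragraph of your document...", "Second paragraph..."]
34 |
35 | # Group paragraphs into token-limited blocks, then let the LLM split them by idea.
36 | blocks = llm_chunkizer.split_document_into_blocks(paragraphs, block_token_limit=3000)
37 | chunks = llm_chunkizer.chunk_text_with_llm(llm, blocks)
38 | print(chunks)
39 | ```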
40 |
41 | See an example in [LLMChunkizer.ipynb](LLMChunkizer.ipynb)
42 |
43 | See the library in [LLMChunkizerLib](LLMChunkizerLib/)
44 |
45 | ---
46 |
47 | ## Article on Towards Data Science
48 | In my latest article published on Towards Data Science, I explore how large language models (LLMs) can revolutionize the way we segment and analyze documents.
49 |
50 | This technique, known as "document chunking," is essential for:
51 | - Enhancing information retrieval accuracy,
52 | - Managing large documents more efficiently,
53 | - Unlocking knowledge from fragmented blocks of text.
54 |
55 | If you're interested in learning how to apply these techniques to optimize document processing, check out my full article on Towards Data Science here: [Efficient Document Chunking Using LLMs: Unlocking Knowledge One Block at a Time](https://medium.com/@peronc79/355717a88c5c?sk=1cc4e46c40708d5057d54da391035cfa) 🚀
56 |
57 | Here you can find a PDF copy of the [article](docs/WEB_Article_Efficient_Document_Chunking_Using_LLMs_Unlocking_Knowledge_One_Block_at_a_Time_by_Carlo_Peron_Oct_2024_TowardsDataScience.pdf)
58 |
--------------------------------------------------------------------------------
/docs/WEB_Article_Efficient_Document_Chunking_Using_LLMs_Unlocking_Knowledge_One_Block_at_a_Time_by_Carlo_Peron_Oct_2024_TowardsDataScience.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/peronc/LLMChunkizer/03e8b25fa2dbcec3bddd29f27776bb5d11074595/docs/WEB_Article_Efficient_Document_Chunking_Using_LLMs_Unlocking_Knowledge_One_Block_at_a_Time_by_Carlo_Peron_Oct_2024_TowardsDataScience.pdf
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | # setup.py
2 | from setuptools import setup, find_packages
3 |
4 | setup(
5 |     name="LLMChunkizer",
6 |     version="0.1",
7 |     packages=find_packages(),
8 |     install_requires=["langchain-openai", "tiktoken"],  # Dependencies imported by LLMChunkizerLib
9 |     author="Peron Carlo",
10 |     author_email="peronc79@gmail.com",
11 |     description="A small Python library that uses an LLM to split documents into idea-based chunks",
12 |     url="https://github.com/peronc/LLMChunkizer",  # Optional: link to your GitHub
13 | )
14 |
--------------------------------------------------------------------------------