├── dl-curriculum.pdf
├── length_based.py
├── text_structure_based.py
├── python_code_splitting.py
├── markdown_splitting.py
└── semantic_meaning_based.py


/dl-curriculum.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/rohanmistry231/langchain-text-splitters/main/dl-curriculum.pdf


--------------------------------------------------------------------------------
/length_based.py:
--------------------------------------------------------------------------------
 1 | from langchain.text_splitter import CharacterTextSplitter
 2 | from langchain_community.document_loaders import PyPDFLoader
 3 | 
 4 | loader = PyPDFLoader('dl-curriculum.pdf')
 5 | 
 6 | docs = loader.load()
 7 | 
 8 | splitter = CharacterTextSplitter(
 9 |     chunk_size=200,
10 |     chunk_overlap=0,
11 |     separator=''
12 | )
13 | 
14 | result = splitter.split_documents(docs)
15 | 
16 | print(result[1].page_content)


--------------------------------------------------------------------------------
/text_structure_based.py:
--------------------------------------------------------------------------------
 1 | from langchain.text_splitter import RecursiveCharacterTextSplitter
 2 | 
 3 | text = """
 4 | Space exploration has led to incredible scientific discoveries. From landing on the Moon to exploring Mars, humanity continues to push the boundaries of what’s possible beyond our planet.
 5 | 
 6 | These missions have not only expanded our knowledge of the universe but have also contributed to advancements in technology here on Earth. Satellite communications, GPS, and even certain medical imaging techniques trace their roots back to innovations driven by space programs.
 7 | """
 8 | 
 9 | # Initialize the splitter
10 | splitter = RecursiveCharacterTextSplitter(
11 |     chunk_size=500,
12 |     chunk_overlap=0,
13 | )
14 | 
15 | # Perform the split
16 | chunks = splitter.split_text(text)
17 | 
18 | print(len(chunks))
19 | print(chunks)


--------------------------------------------------------------------------------
/python_code_splitting.py:
--------------------------------------------------------------------------------
 1 | from langchain.text_splitter import RecursiveCharacterTextSplitter,Language
 2 | 
 3 | text = """
 4 | class Student:
 5 |     def __init__(self, name, age, grade):
 6 |         self.name = name
 7 |         self.age = age
 8 |         self.grade = grade  # Grade is a float (like 8.5 or 9.2)
 9 | 
10 |     def get_details(self):
11 |         return self.name"
12 | 
13 |     def is_passing(self):
14 |         return self.grade >= 6.0
15 | 
16 | 
17 | # Example usage
18 | student1 = Student("Aarav", 20, 8.2)
19 | print(student1.get_details())
20 | 
21 | if student1.is_passing():
22 |     print("The student is passing.")
23 | else:
24 |     print("The student is not passing.")
25 | 
26 | """
27 | 
28 | # Initialize the splitter
29 | splitter = RecursiveCharacterTextSplitter.from_language(
30 |     language=Language.PYTHON,
31 |     chunk_size=300,
32 |     chunk_overlap=0,
33 | )
34 | 
35 | # Perform the split
36 | chunks = splitter.split_text(text)
37 | 
38 | print(len(chunks))
39 | print(chunks[1])


--------------------------------------------------------------------------------
/markdown_splitting.py:
--------------------------------------------------------------------------------
 1 | from langchain.text_splitter import RecursiveCharacterTextSplitter,Language
 2 | 
 3 | text = """
 4 | # Project Name: Smart Student Tracker
 5 | 
 6 | A simple Python-based project to manage and track student data, including their grades, age, and academic status.
 7 | 
 8 | 
 9 | ## Features
10 | 
11 | - Add new students with relevant info
12 | - View student details
13 | - Check if a student is passing
14 | - Easily extendable class-based design
15 | 
16 | 
17 | ## 🛠 Tech Stack
18 | 
19 | - Python 3.10+
20 | - No external dependencies
21 | 
22 | 
23 | ## Getting Started
24 | 
25 | 1. Clone the repo  
26 |    ```bash
27 |    git clone https://github.com/your-username/student-tracker.git
28 | 
29 | """
30 | 
31 | # Initialize the splitter
32 | splitter = RecursiveCharacterTextSplitter.from_language(
33 |     language=Language.MARKDOWN,
34 |     chunk_size=200,
35 |     chunk_overlap=0,
36 | )
37 | 
38 | # Perform the split
39 | chunks = splitter.split_text(text)
40 | 
41 | print(len(chunks))
42 | print(chunks[0])


--------------------------------------------------------------------------------
/semantic_meaning_based.py:
--------------------------------------------------------------------------------
 1 | from langchain_experimental.text_splitter import SemanticChunker
 2 | from langchain_openai.embeddings import OpenAIEmbeddings
 3 | from dotenv import load_dotenv
 4 | 
 5 | load_dotenv()
 6 | 
 7 | text_splitter = SemanticChunker(
 8 |     OpenAIEmbeddings(), breakpoint_threshold_type="standard_deviation",
 9 |     breakpoint_threshold_amount=3
10 | )
11 | 
12 | sample = """
13 | Farmers were working hard in the fields, preparing the soil and planting seeds for the next season. The sun was bright, and the air smelled of earth and fresh grass. The Indian Premier League (IPL) is the biggest cricket league in the world. People all over the world watch the matches and cheer for their favourite teams.
14 | 
15 | 
16 | Terrorism is a big danger to peace and safety. It causes harm to people and creates fear in cities and villages. When such attacks happen, they leave behind pain and sadness. To fight terrorism, we need strong laws, alert security forces, and support from people who care about peace and safety.
17 | """
18 | 
19 | docs = text_splitter.create_documents([sample])
20 | print(len(docs))
21 | print(docs)
22 | 
23 | 
24 | 


--------------------------------------------------------------------------------