├── app
│   ├── __init__.py
│   ├── api
│   │   ├── __init__.py
│   │   └── routes.py
│   ├── models
│   │   ├── __init__.py
│   │   └── schemas.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── embeddings.py
│   │   └── preprocessor.py
│   ├── config.py
│   └── main.py
├── tests
│   ├── __init__.py
│   ├── test_preprocessor.py
│   ├── test_embeddings.py
│   └── test_vietnamese_processing.py
├── requirements.txt
├── .env.example
├── docker-compose.yml
├── .dockerignore
├── .gitignore
├── Dockerfile
├── DOCKER.md
├── README.md
├── plan.md
├── run_test.py
└── docs
    └── vietnamese-rag-research.md
/app/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/app/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/app/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/app/services/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi>=0.68.0
2 | uvicorn>=0.15.0
3 | underthesea>=1.3.0
4 | sentence-transformers>=2.2.0
5 | numpy>=1.21.0
6 | pydantic>=1.8.0
7 | python-dotenv>=0.19.0
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Application settings
2 | DEBUG=False
3 | 
4 | # Server settings
5 | HOST=0.0.0.0
6 | PORT=8000
7 | 
8 | # Model settings
9 | EMBEDDING_MODEL=bkai-foundation-models/vietnamese-bi-encoder
10 | MAX_TOKEN_LIMIT=128
11 | DEFAULT_CHUNK_SIZE=110
12 | DEFAULT_CHUNK_OVERLAP=20
13 | DEFAULT_TOP_K=5
14 | 
15 | # Cache settings
16 | ENABLE_CACHE=True
17 | CACHE_SIZE=1000
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | 
3 | services:
4 |   app:
5 |     build:
6 |       context: .
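      # PORT is read from .env (copy .env.example) and defaults to 8000. It is
      # passed twice on purpose: as a build arg below (consumed by `ARG PORT`
      # in the Dockerfile, so EXPOSE matches) and as a runtime environment
      # variable, so the exposed, published, and uvicorn ports all agree.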
7 | args: 8 | - PORT=${PORT:-8000} 9 | container_name: vietnamese-rag-app 10 | ports: 11 | - "${PORT:-8000}:${PORT:-8000}" 12 | environment: 13 | - PORT=${PORT:-8000} 14 | volumes: 15 | - huggingface_cache:/app/.cache/huggingface 16 | restart: unless-stopped 17 | 18 | volumes: 19 | huggingface_cache: 20 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | .gitignore 4 | .github 5 | 6 | # Python 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | *.so 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | .pytest_cache/ 28 | .coverage 29 | htmlcov/ 30 | .tox/ 31 | .nox/ 32 | .hypothesis/ 33 | .coverage.* 34 | 35 | # Virtual Environment 36 | venv/ 37 | .venv/ 38 | ENV/ 39 | env/ 40 | 41 | # IDE 42 | .idea/ 43 | .vscode/ 44 | *.swp 45 | *.swo 46 | .DS_Store 47 | 48 | # Docker 49 | .dockerignore 50 | Dockerfile 51 | docker-compose.yml 52 | 53 | # Project specific 54 | .cache/ 55 | models/ 56 | *.log 57 | tasks/ 58 | scripts/ 59 | .env.example 60 | README.md 61 | LICENSE 62 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | # Load environment variables from .env file 5 | load_dotenv(override=True) 6 | 7 | # Application settings 8 | APP_NAME = "Vietnamese RAG" 9 | DEBUG = os.getenv("DEBUG", "False").lower() in ("true", "1", "t") 10 | API_PREFIX = "/api" 11 | 12 | # Model settings 13 | EMBEDDING_MODEL = "bkai-foundation-models/vietnamese-bi-encoder" 14 | MAX_TOKEN_LIMIT = 128 15 | DEFAULT_CHUNK_SIZE = 110 # Safe margin below MAX_TOKEN_LIMIT 16 | DEFAULT_CHUNK_OVERLAP = 20 17 | DEFAULT_TOP_K = 5 18 | 19 | # Server settings 20 | HOST = os.getenv("HOST", "0.0.0.0") 21 | PORT = int(os.getenv("PORT", "8000")) 22 | 23 | # Cache settings 24 | ENABLE_CACHE = os.getenv("ENABLE_CACHE", "True").lower() in ("true", "1", "t") 25 | CACHE_SIZE = int(os.getenv("CACHE_SIZE", "1000")) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | env/ 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | venv/ 25 | ENV/ 26 | .env 27 | 28 | # IDE 29 | .idea/ 30 | .vscode/ 31 | *.swp 32 | *.swo 33 | 34 | # Logs 35 | logs/ 36 | *.log 37 | 38 | # OS 39 | .DS_Store 40 | .DS_Store? 41 | ._* 42 | .Spotlight-V100 43 | .Trashes 44 | ehthumbs.db 45 | Thumbs.db 46 | 47 | # Added by Claude Task Master 48 | logs 49 | npm-debug.log* 50 | yarn-debug.log* 51 | yarn-error.log* 52 | dev-debug.log 53 | # Dependency directories 54 | node_modules/ 55 | # Environment variables 56 | # Editor directories and files 57 | .idea 58 | .vscode 59 | *.suo 60 | *.ntvs* 61 | *.njsproj 62 | *.sln 63 | *.sw? 
64 | # OS specific
65 | # Task files
66 | tasks.json
67 | tasks/
68 | .cursor/*
69 | .roo/*
70 | .winds*
71 | .roo*
72 | .task*
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 | 
3 | # Set environment variables
4 | ENV PYTHONDONTWRITEBYTECODE=1 \
5 |     PYTHONUNBUFFERED=1 \
6 |     LANG=C.UTF-8 \
7 |     LC_ALL=C.UTF-8 \
8 |     TZ=Asia/Ho_Chi_Minh \
9 |     HF_HOME=/app/.cache/huggingface
10 | 
11 | # Set working directory
12 | WORKDIR /app
13 | 
14 | # Install system dependencies
15 | RUN apt-get update && apt-get install -y --no-install-recommends \
16 |     build-essential \
17 |     git \
18 |     && apt-get clean \
19 |     && rm -rf /var/lib/apt/lists/*
20 | 
21 | # Copy requirements file
22 | COPY requirements.txt .
23 | 
24 | # Install Python dependencies
25 | RUN pip install --no-cache-dir --upgrade pip && \
26 |     pip install --no-cache-dir -r requirements.txt
27 | 
28 | # Copy application code
29 | COPY . .
30 | 
31 | # Create model cache directory
32 | RUN mkdir -p /app/.cache/huggingface
33 | 
34 | # Expose the configured port (build arg PORT, defaulting to 8000)
35 | ARG PORT=8000
36 | ENV PORT=${PORT}
37 | 
38 | EXPOSE ${PORT}
39 | 
40 | # Command to run the application
41 | CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
42 | 
--------------------------------------------------------------------------------
/DOCKER.md:
--------------------------------------------------------------------------------
1 | # Docker Implementation for Vietnamese RAG Application
2 | 
3 | This document provides instructions for building, running, and configuring the Vietnamese RAG application using Docker.
4 | 
5 | ## Prerequisites
6 | 
7 | - [Docker](https://docs.docker.com/get-docker/)
8 | - [Docker Compose](https://docs.docker.com/compose/install/)
9 | 
10 | ## Quick Start
11 | 
12 | 1. Clone the repository:
13 |    ```bash
14 |    git clone https://github.com/yourusername/n8n-rag-vn.git
15 |    cd n8n-rag-vn
16 |    ```
17 | 
18 | 2. Create a `.env` file with your configuration (or copy from `.env.example`):
19 |    ```bash
20 |    cp .env.example .env
21 |    ```
22 | 
23 | 3. Customize the port in the `.env` file if needed:
24 |    ```
25 |    PORT=24600  # Change this to your desired port
26 |    ```
27 | 
28 | 4. Build and start the application using Docker Compose:
29 |    ```bash
30 |    docker compose up -d
31 |    ```
32 | 
33 | 5. Access the API at http://localhost:${PORT}/docs (where ${PORT} is the port specified in your .env file, default is 8000)
34 | 
35 | ## Configuration
36 | 
37 | ### Environment Variables
38 | 
39 | The Docker container can be configured using the `PORT` environment variable in the `.env` file.
40 | 
41 | ### Volumes
42 | 
43 | The Docker container uses a volume to persist the downloaded models:
44 | 
45 | - `huggingface_cache`: Persistent storage for downloaded models
46 | 
47 | ## Troubleshooting
48 | 
49 | If you encounter issues with the Docker implementation, try the following:
50 | 
51 | 1. Check the container logs:
52 |    ```bash
53 |    docker logs vietnamese-rag-app
54 |    ```
55 | 
56 | 2. Ensure the PORT environment variable is correctly set in your `.env` file.
57 | 
58 | 3. If the container fails to start, try rebuilding the image:
59 |    ```bash
60 |    docker compose build --no-cache
61 |    docker compose up -d
62 |    ```
63 | 
64 | 
65 | 
--------------------------------------------------------------------------------
/app/main.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from fastapi import FastAPI, Request
3 | from fastapi.middleware.cors import CORSMiddleware
4 | from fastapi.responses import JSONResponse
5 | import time
6 | 
7 | from app.api.routes import router as api_router
8 | from app.config import APP_NAME, API_PREFIX, DEBUG
9 | 
10 | # Configure logging
11 | logging.basicConfig(
12 |     level=logging.DEBUG if DEBUG else logging.INFO,
13 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
14 | )
15 | logger = logging.getLogger(__name__)
16 | 
17 | # Create FastAPI app
18 | app = FastAPI(
19 |     title=APP_NAME,
20 |     description="API for Vietnamese RAG (Retrieval Augmented Generation)",
21 |     version="1.0.0",
22 |     debug=DEBUG,
23 | )
24 | 
25 | # Add CORS middleware
26 | app.add_middleware(
27 |     CORSMiddleware,
28 |     allow_origins=["*"],  # In production, restrict this to specific origins
29 |     allow_credentials=True,
30 |     allow_methods=["*"],
31 |     allow_headers=["*"],
32 | )
33 | 
34 | # Add API router
35 | app.include_router(api_router, prefix=API_PREFIX)
36 | 
37 | 
38 | # Add middleware for request timing
39 | @app.middleware("http")
40 | async def add_process_time_header(request: Request, call_next):
41 |     start_time = time.time()
42 |     response = await call_next(request)
43 |     process_time = time.time() - start_time
44 |     response.headers["X-Process-Time"] = str(process_time)
45 |     return response
46 | 
47 | 
48 | # Root endpoint
49 | @app.get("/")
50 | async def root():
51 |     return {
52 |         "app": APP_NAME,
53 |         "version": "1.0.0",
54 |         "status": "active",
55 |         "api_docs": "/docs",
56 |     }
57 | 
58 | 
59 | # Health check endpoint
60 | @app.get("/health")
61 | async def health_check():
62 |     return {"status": "healthy"}
63 | 
64 | 
65 | # Global exception handler
66 | @app.exception_handler(Exception)
67 | async def global_exception_handler(request: Request, exc: Exception):
68 |     logger.error(f"Unhandled exception: {str(exc)}", exc_info=True)
69 |     return JSONResponse(
70 |         status_code=500,
71 |         content={"detail": "An unexpected error occurred, please try again later"},
72 |     )
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     import uvicorn
77 |     from app.config import HOST, PORT
78 | 
79 |     logger.info(f"Starting {APP_NAME} server on {HOST}:{PORT}")
80 |     uvicorn.run("app.main:app", host=HOST, port=PORT, reload=DEBUG)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Vietnamese RAG Implementation
2 | 
3 | A Retrieval Augmented Generation (RAG) system with text processing and embeddings specialized for the Vietnamese language.
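
For a quick feel of the API once the server is running (see Setup below), here is a minimal session. The request fields mirror `ProcessingRequest` and `QueryRequest` in `app/models/schemas.py`; the port assumes the default configuration:

```bash
# Chunk and embed a document (the response is one embedding per chunk)
curl -X POST http://localhost:8000/api/process \
  -H "Content-Type: application/json" \
  -d '{"text": "Xử lý ngôn ngữ tự nhiên cho tiếng Việt.", "chunk_size": 110, "chunk_overlap": 20}'

# Find the stored chunks most similar to a query
curl -X POST http://localhost:8000/api/query \
  -H "Content-Type: application/json" \
  -d '{"query_text": "xử lý ngôn ngữ", "top_k": 5}'
```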
4 | 
5 | ## Features
6 | 
7 | - Text normalization using `underthesea`
8 | - Sentence segmentation
9 | - Word segmentation with domain-specific fixed words (optional)
10 | - Smart chunking strategy with configurable chunk size and overlap (default: 110 tokens with 20 token overlap)
11 | - Embedding generation using `bkai-foundation-models/vietnamese-bi-encoder`
12 | - API for processing documents and querying similar chunks
13 | - Caching for embeddings (optional, enabled by default)
14 | - Input validation to ensure chunk size and overlap constraints
15 | 
16 | ## Setup
17 | 
18 | 1. Clone the repository
19 | 2. Install dependencies:
20 |    ```
21 |    pip install -r requirements.txt
22 |    ```
23 | 3. Run the application:
24 |    ```
25 |    uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
26 |    ```
27 |    (Note: `--host` and `--port` are optional; without them the uvicorn CLI defaults to `127.0.0.1:8000`. The `HOST` and `PORT` values in `app/config.py` are used when the app is started with `python -m app.main`.)
28 | 
29 | ## API Endpoints
30 | 
31 | - `POST /api/process`: Process text documents into chunks and embeddings. Takes a `ProcessingRequest` as input, allowing specification of `chunk_size` and `chunk_overlap`. Returns a list of `EmbeddingResponse`.
32 | - `POST /api/query`: Find similar chunks for a given query text. Takes a `QueryRequest` and returns a `QueryResponse`.
33 | - `POST /api/normalize`: Normalize the text, then segment it into sentences and words. Takes a `ProcessingRequest` and returns a `NormalizationResponse`.
34 | - `GET /api/status`: Get server status.
35 | - `GET /health`: Health check endpoint.
36 | - `GET /`: Root endpoint with basic application information.
37 | 
38 | ## Configuration
39 | 
40 | Configuration options are managed in `app/config.py`. `DEBUG`, `HOST`, `PORT`, `ENABLE_CACHE`, and `CACHE_SIZE` can be overridden via environment variables; the model and chunking settings are currently hardcoded in `app/config.py`, so the matching entries in `.env.example` are not read yet:
41 | 
42 | - `DEBUG`: Enable debug mode (default: `False`)
43 | - `EMBEDDING_MODEL`: The SentenceTransformer model to use (default: `bkai-foundation-models/vietnamese-bi-encoder`)
44 | - `MAX_TOKEN_LIMIT`: Maximum number of tokens per chunk (default: 128)
45 | - `DEFAULT_CHUNK_SIZE`: Default chunk size in tokens (default: 110)
46 | - `DEFAULT_CHUNK_OVERLAP`: Default chunk overlap in tokens (default: 20)
47 | - `DEFAULT_TOP_K`: Default number of top matches to return for a query (default: 5)
48 | - `ENABLE_CACHE`: Enable embedding caching (default: `True`)
49 | - `CACHE_SIZE`: Maximum size of the embedding cache (default: 1000)
50 | - `HOST`: Host address (default: `0.0.0.0`)
51 | - `PORT`: Port number (default: 8000)
52 | 
53 | ## License
54 | 
55 | [MIT License](LICENSE)
--------------------------------------------------------------------------------
/tests/test_preprocessor.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import unittest
4 | 
5 | # Add the project root to the Python path
6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7 | 
8 | from app.services.preprocessor import TextPreprocessor
9 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, MAX_TOKEN_LIMIT
10 | 
11 | 
12 | class TestTextPreprocessor(unittest.TestCase):
13 |     @classmethod
14 |     def setUpClass(cls):
15 |         cls.preprocessor = TextPreprocessor()
16 |         cls.sample_text = (
17 |             "Trí tuệ nhân tạo (AI) đang cách mạng hóa ngành công nghệ. "
18 |             "COVID-19 đã thúc đẩy quá trình chuyển đổi số. "
19 |             "Các mô hình NLP như BERT và GPT-3 đạt được nhiều tiến bộ đáng kể."
20 |         )
21 | 
22 |     def test_normalization(self):
23 |         normalized = self.preprocessor.normalize_text(" Đây là ví dụ ")
24 |         print("test_normalization", normalized)
25 |         self.assertEqual(normalized, "Đây là ví dụ")
26 | 
27 |     def test_word_segmentation(self):
28 |         tokens = self.preprocessor.segment_words(self.sample_text)
29 |         print("test_word_segmentation", tokens)
30 |         self.assertIn("AI", tokens)
31 |         self.assertIn("COVID-19", tokens)
32 |         self.assertIn("NLP", tokens)
33 | 
34 |     def test_token_counting(self):
35 |         count = self.preprocessor.count_tokens(self.sample_text)
36 |         print("test_token_counting", count)
37 |         self.assertGreater(count, 15)
38 |         self.assertLess(count, 50)
39 | 
40 |     def test_chunk_creation(self):
41 |         chunks = self.preprocessor.process_text(self.sample_text)
42 |         print("test_chunk_creation", chunks)
43 |         self.assertGreater(len(chunks), 0)
44 | 
45 |         # Test chunk sizes
46 |         for chunk in chunks:
47 |             self.assertLessEqual(
48 |                 chunk["metadata"]["token_count"],
49 |                 MAX_TOKEN_LIMIT,
50 |                 "Chunk exceeds maximum token limit"
51 |             )
52 | 
53 |     def test_overlap_handling(self):
54 |         chunks = self.preprocessor.process_text(
55 |             self.sample_text,
56 |             chunk_size=15,
57 |             chunk_overlap=3
58 |         )
59 |         print("test_overlap_handling", chunks)
60 |         # Verify overlap between consecutive chunks
61 |         for i in range(1, len(chunks)):
62 |             prev_words = set(self.preprocessor.segment_words(chunks[i-1]["text"]))
63 |             current_words = set(self.preprocessor.segment_words(chunks[i]["text"]))
64 |             overlap = prev_words & current_words
65 |             self.assertGreaterEqual(len(overlap), 2)
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     unittest.main()
--------------------------------------------------------------------------------
/tests/test_embeddings.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import unittest
4 | import numpy as np
5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6 | 
7 | from app.services.embeddings import EmbeddingService
8 | from app.services.preprocessor import TextPreprocessor
9 | from app.config import EMBEDDING_MODEL, MAX_TOKEN_LIMIT
10 | 
11 | class TestEmbeddingService(unittest.TestCase):
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         cls.preprocessor = TextPreprocessor()
15 |         cls.embedding_service = EmbeddingService(preprocessor=cls.preprocessor)
16 | 
17 |         # Process sample text through preprocessor
18 |         sample_text = (
19 |             "Transformer là kiến trúc mạng neural tiên tiến. "
20 |             "BERT và GPT sử dụng kiến trúc này để xử lý ngôn ngữ tự nhiên. "
21 |             "Các mô hình AI hiện đại đạt được kết quả ấn tượng trong NLP."
22 | ) 23 | cls.chunks = cls.preprocessor.process_text(sample_text) 24 | 25 | def test_embedding_generation(self): 26 | # Test single embedding 27 | embedding = self.embedding_service.get_embedding("ví dụ về embedding") 28 | print("test_embedding_generation", embedding) 29 | print("test_embedding_generation length", len(embedding)) 30 | print("test_embedding_generation model_dim", self.embedding_service.model_dim) 31 | self.assertEqual(len(embedding), self.embedding_service.model_dim) 32 | 33 | # Test batch embeddings 34 | embeddings = self.embedding_service.get_embeddings_batch(["text 1", "text 2"]) 35 | self.assertEqual(len(embeddings), 2) 36 | 37 | def test_chunk_embedding(self): 38 | embedded_chunks = self.embedding_service.embed_chunks(self.chunks) 39 | print("test_chunk_embedding", embedded_chunks) 40 | self.assertEqual(len(embedded_chunks), len(self.chunks)) 41 | 42 | for chunk in embedded_chunks: 43 | self.assertIn("embedding", chunk) 44 | self.assertEqual(len(chunk["embedding"]), self.embedding_service.model_dim) 45 | 46 | def test_token_limit_enforcement(self): 47 | # Create long text that exceeds token limit 48 | long_text = " ".join(["token"] * (MAX_TOKEN_LIMIT + 10)) 49 | 50 | with self.assertRaises(ValueError): 51 | self.embedding_service.get_embedding(long_text) 52 | 53 | with self.assertRaises(ValueError): 54 | self.embedding_service.get_embeddings_batch([long_text]) 55 | 56 | def test_similarity_search(self): 57 | # Embed sample chunks 58 | embedded_chunks = self.embedding_service.embed_chunks(self.chunks) 59 | 60 | # Perform similarity search 61 | query = "kiến trúc transformer" 62 | results = self.embedding_service.similarity_search( 63 | query=query, 64 | embeddings=[c["embedding"] for c in embedded_chunks], 65 | texts=[c["text"] for c in embedded_chunks], 66 | metadata=[c["metadata"] for c in embedded_chunks] 67 | ) 68 | print("test_similarity_search", results) 69 | self.assertGreater(len(results), 0) 70 | self.assertIn("transformer", results[0]["text"].lower()) 71 | 72 | if __name__ == "__main__": 73 | unittest.main() -------------------------------------------------------------------------------- /app/models/schemas.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Dict, Any 2 | from pydantic import BaseModel, Field, validator 3 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, DEFAULT_TOP_K, MAX_TOKEN_LIMIT 4 | import uuid 5 | 6 | 7 | class ProcessingRequest(BaseModel): 8 | """Request model for processing text into chunks and embeddings.""" 9 | text: str = Field(..., description="The text to be processed") 10 | chunk_size: Optional[int] = Field(DEFAULT_CHUNK_SIZE, description="Target size of each chunk in tokens") 11 | chunk_overlap: Optional[int] = Field(DEFAULT_CHUNK_OVERLAP, description="Number of tokens to overlap between chunks") 12 | file_id: Optional[str] = Field(None, description="ID of the file being processed") 13 | file_title: Optional[str] = Field(None, description="Title of the file being processed") 14 | 15 | @validator('chunk_size') 16 | def validate_chunk_size(cls, v): 17 | if v > MAX_TOKEN_LIMIT: 18 | raise ValueError(f"chunk_size cannot exceed {MAX_TOKEN_LIMIT} tokens") 19 | if v <= 0: 20 | raise ValueError("chunk_size must be positive") 21 | return v 22 | 23 | @validator('chunk_overlap') 24 | def validate_chunk_overlap(cls, v, values): 25 | if 'chunk_size' in values and v >= values['chunk_size']: 26 | raise ValueError("chunk_overlap must be less than 
chunk_size") 27 | if v < 0: 28 | raise ValueError("chunk_overlap cannot be negative") 29 | return v 30 | 31 | 32 | class TextChunk(BaseModel): 33 | """Model representing a chunk of text with its embedding.""" 34 | text: str 35 | embedding: List[float] = None 36 | metadata: Dict[str, Any] = Field(default_factory=dict) 37 | 38 | 39 | class ChunkData(BaseModel): 40 | """Model representing a processed chunk with its metadata.""" 41 | id: str = Field(default_factory=lambda: str(uuid.uuid4())) 42 | content: str 43 | embedding: List[float] 44 | metadata: Dict[str, Any] = Field(default_factory=dict) 45 | 46 | 47 | class ProcessingResponse(BaseModel): 48 | """Response model for processed text chunks and embeddings.""" 49 | chunks: List[ChunkData] 50 | total_chunks: int 51 | 52 | class EmbeddingResponse(BaseModel): 53 | """Response model for embedding text.""" 54 | embedding: List[float] 55 | 56 | 57 | class QueryRequest(BaseModel): 58 | """Request model for querying similar chunks.""" 59 | query_text: str = Field(..., description="The query text to find similar chunks for") 60 | top_k: Optional[int] = Field(DEFAULT_TOP_K, description="Number of top matches to return") 61 | 62 | @validator('top_k') 63 | def validate_top_k(cls, v): 64 | if v <= 0: 65 | raise ValueError("top_k must be positive") 66 | return v 67 | 68 | 69 | class QueryMatch(BaseModel): 70 | """Model representing a matched chunk with similarity score.""" 71 | id: str = Field(default_factory=lambda: str(uuid.uuid4())) 72 | text: str 73 | score: float 74 | metadata: Dict[str, Any] = Field(default_factory=dict) 75 | 76 | 77 | class QueryResponse(BaseModel): 78 | """Response model for query results.""" 79 | matches: List[QueryMatch] 80 | total_matches: int 81 | 82 | 83 | class NormalizationResponse(BaseModel): 84 | # normalized_text: str 85 | # sentences: List[str] 86 | segmented_sentences: List[List[str]] 87 | 88 | class BaseResponse(BaseModel): 89 | status: str 90 | message: str 91 | -------------------------------------------------------------------------------- /plan.md: -------------------------------------------------------------------------------- 1 | I'll update the plan.md with our refined understanding of token limits and chunking strategy, without including the code implementations: 2 | 3 | ```markdown 4 | # Vietnamese RAG Implementation Plan 5 | 6 | ## 1. Project Structure (unchanged) 7 | ``` 8 | vietnamese-rag/ 9 | ├── app/ 10 | │ ├── __init__.py 11 | │ ├── main.py # FastAPI application entry point 12 | │ ├── config.py # Configuration settings 13 | │ ├── models/ 14 | │ │ ├── __init__.py 15 | │ │ └── schemas.py # Pydantic models 16 | │ ├── services/ 17 | │ │ ├── __init__.py 18 | │ │ ├── preprocessor.py # Text preprocessing pipeline 19 | │ │ └── embeddings.py # Embedding generation 20 | │ └── api/ 21 | │ ├── __init__.py 22 | │ └── routes.py # API endpoints 23 | ├── tests/ 24 | │ └── __init__.py 25 | ├── requirements.txt 26 | └── README.md 27 | ``` 28 | 29 | ## 2. Core Components: 30 | 31 | ### A. Data Models 32 | - ProcessingRequest with configurable chunk parameters 33 | - chunk_size: default 110 tokens (safety margin below 128) 34 | - chunk_overlap: fixed at 20 tokens 35 | - ProcessingResponse with chunks and embeddings 36 | - QueryRequest with configurable top_k 37 | - QueryResponse with matches and scores 38 | 39 | ### B. 
40 | Key Features:
41 | - Text normalization using underthesea
42 | - Sentence segmentation
43 | - Word segmentation with domain-specific fixed words
44 | - Chunking strategy:
45 |   - Target chunk size: 110 tokens (safety margin below 128)
46 |   - Fixed overlap: 20 tokens
47 |   - Special handling for long sentences
48 |   - Validation to ensure chunks never exceed 128 tokens
49 | 
50 | ### C. Embedding Service using sentence-transformers
51 | Key Features:
52 | - Uses bkai-foundation-models/vietnamese-bi-encoder
53 | - Input validation for 128 token limit
54 | - Batch processing for efficiency
55 | - Caching for frequent queries
56 | - CPU optimization for VPS deployment
57 | 
58 | ## 3. Implementation Requirements:
59 | ```
60 | fastapi>=0.68.0
61 | uvicorn>=0.15.0
62 | underthesea>=1.3.0
63 | sentence-transformers>=2.2.0
64 | numpy>=1.21.0
65 | pydantic>=1.8.0
66 | python-dotenv>=0.19.0
67 | ```
68 | 
69 | ## 4. Implementation Steps:
70 | 
71 | 1. **Setup and Configuration**:
72 |    - Virtual environment setup
73 |    - Dependency installation
74 |    - Configuration management
75 |    - Logging setup
76 | 
77 | 2. **Preprocessing Pipeline Implementation**:
78 |    - Text normalization with underthesea
79 |    - Word segmentation with fixed_words list
80 |    - Implement chunking strategy:
81 |      - Maintain 110 token chunk size
82 |      - Ensure 20 token overlap
83 |      - Validate against 128 token limit
84 |    - Add error handling and validation
85 | 
86 | 3. **Embedding Service Implementation**:
87 |    - Setup sentence-transformers
88 |    - Implement token limit validation
89 |    - Add caching system
90 |    - Optimize batch processing
91 |    - Add error handling
92 | 
93 | 4. **API Development**:
94 |    - FastAPI endpoints implementation
95 |    - Request/response validation
96 |    - Error handling
97 |    - API documentation
98 |    - Rate limiting for VPS
99 | 
100 | 5. **Testing and Optimization**:
101 |    - Unit tests for components
102 |    - Integration tests
103 |    - Token limit validation tests
104 |    - Chunking strategy tests
105 |    - Performance testing
106 |    - Memory optimization
107 | 
108 | ## 5. VPS Deployment Considerations:
109 | 
110 | 1. **Resource Management**:
111 |    - Memory monitoring for embedding model
112 |    - Batch size optimization
113 |    - Request queuing system
114 |    - Comprehensive logging
115 | 
116 | 2. **Performance Optimization**:
117 |    - Embedding cache implementation
118 |    - Chunk size optimization
119 |    - Resource cleanup
120 |    - Health check system
121 | 
122 | 3. **Scaling Strategy**:
123 |    - Horizontal scaling plan
124 |    - Load balancing setup
125 |    - Monitoring system
126 |    - Backup procedures
127 | 
128 | ## 6. Token Management Strategy:
129 | 
130 | 1. **Chunking Rules**:
131 |    - Maximum chunk size: 110 tokens (safety margin)
132 |    - Fixed overlap: 20 tokens
133 |    - Never exceed 128 token limit
134 | 
135 | 2. **Validation Layers**:
136 |    - Preprocessor validation
137 |    - Embedding service validation
138 |    - API request validation
139 | 
140 | 3. **Error Handling**:
141 |    - Token limit exceeded errors
142 |    - Invalid input handling
143 |    - Chunk size violations
144 | 
145 | 4. **Monitoring**:
146 |    - Token usage tracking
147 |    - Chunk size distribution
148 |    - Embedding generation times
149 |    - Cache hit rates
150 | 
151 | 
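As a sanity check on the chunking rules above, here is a minimal sketch of the stride arithmetic. It is illustrative only: the real `process_text` in `app/services/preprocessor.py` also respects sentence boundaries, so actual chunk counts vary.

```python
def approx_chunk_count(n_tokens: int, chunk_size: int = 110, chunk_overlap: int = 20) -> int:
    """Rough number of chunks for a text of n_tokens segmented tokens."""
    if n_tokens <= chunk_size:
        return 1
    stride = chunk_size - chunk_overlap  # each extra chunk adds 90 new tokens
    return 1 + -(-(n_tokens - chunk_size) // stride)  # ceiling division

# e.g. a 300-token document: 1 + ceil((300 - 110) / 90) = 4 chunks
print(approx_chunk_count(300))
```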
152 | 
--------------------------------------------------------------------------------
/tests/test_vietnamese_processing.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from app.services.preprocessor import TextPreprocessor
3 | from app.services.embeddings import EmbeddingService
4 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP
5 | 
6 | 
7 | class TestVietnameseProcessing(unittest.TestCase):
8 |     @classmethod
9 |     def setUpClass(cls):
10 |         # Initialize preprocessor and embedding service once for all tests
11 |         cls.preprocessor = TextPreprocessor()
12 |         cls.embedding_service = EmbeddingService(preprocessor=cls.preprocessor)
13 | 
14 |         # Sample Vietnamese text
15 |         cls.sample_text = """
16 |         Xử lý ngôn ngữ tự nhiên (NLP) là một lĩnh vực nghiên cứu quan trọng trong trí tuệ nhân tạo.
17 |         Đối với tiếng Việt, việc xử lý ngôn ngữ tự nhiên có những thách thức riêng do đặc điểm ngôn ngữ.
18 |         Tiếng Việt là một ngôn ngữ đơn lập, có thanh điệu, và một từ có thể bao gồm nhiều âm tiết.
19 |         Phân đoạn từ trong tiếng Việt là một bước quan trọng để xử lý văn bản tiếng Việt chính xác.
20 | 
21 |         Hệ thống RAG (Retrieval Augmented Generation) cho tiếng Việt cần phải giải quyết các vấn đề như:
22 |         1. Chuẩn hóa văn bản tiếng Việt
23 |         2. Phân đoạn câu và từ chính xác
24 |         3. Xử lý các từ đặc biệt như COVID-19, AI, NLP
25 |         4. Tính toán độ tương đồng ngữ nghĩa giữa các đoạn văn bản
26 | 
27 |         Việc chia nhỏ văn bản thành các đoạn có kích thước phù hợp (chunking) cũng đòi hỏi phải
28 |         hiểu cấu trúc ngữ pháp tiếng Việt để không chia cắt thông tin ở những vị trí không phù hợp.
29 |         """
30 | 
31 |     def test_token_counting(self):
32 |         """Test that token counting works correctly for Vietnamese text."""
33 |         # Check token count for the sample text
34 |         token_count_preprocessor = self.preprocessor.count_tokens(self.sample_text)
35 |         token_count_embedding = self.preprocessor.count_tokens(self.sample_text)  # Now using the same method
36 | 
37 |         # Both methods should give the same result
38 |         self.assertEqual(token_count_preprocessor, token_count_embedding)
39 | 
40 |         # Check that multi-syllable Vietnamese words are counted correctly
41 |         text = "xử lý ngôn ngữ tự nhiên"
42 |         segmented_tokens = self.preprocessor.segment_words(text)
43 |         token_count = len(segmented_tokens)
44 |         # In Vietnamese "xử lý", "ngôn ngữ", "tự nhiên" should be 3 tokens
45 |         self.assertEqual(token_count, 3)  # Should count as 3 tokens, not 6
46 | 
47 |         # Check with fixed words
48 |         text = "COVID-19 và AI trong NLP"
49 |         segmented_tokens = self.preprocessor.segment_words(text)
50 |         token_count = len(segmented_tokens)
51 |         self.assertEqual(token_count, 5)  # COVID-19, và, AI, trong, NLP
52 | 
53 |     def test_chunking_with_default_parameters(self):
54 |         """Test chunking with default parameters (110 tokens, 20 token overlap)."""
55 |         chunks = self.preprocessor.process_text(
56 |             text=self.sample_text,
57 |             chunk_size=DEFAULT_CHUNK_SIZE,
58 |             chunk_overlap=DEFAULT_CHUNK_OVERLAP
59 |         )
60 | 
61 |         # Verify we got some chunks
62 |         self.assertGreater(len(chunks), 0)
63 | 
64 |         # Verify chunk sizes don't exceed the limit
65 |         for chunk in chunks:
66 |             token_count = chunk["metadata"]["token_count"]
67 |             self.assertLessEqual(token_count, DEFAULT_CHUNK_SIZE)
68 | 
69 |         # Check for overlap between consecutive chunks
70 |         if len(chunks) > 1:
71 |             first_chunk_text = chunks[0]["text"]
72 |             second_chunk_text = chunks[1]["text"]
73 | 
74 |             # There should be some overlap between chunks
75 |             # Extract the last few words from first chunk
76 |             first_chunk_words = set(first_chunk_text.split())
77 |             next_words = set(second_chunk_text.split())
78 |             overlap_words = first_chunk_words.intersection(next_words)
79 | 
80 |             # Should find some overlapping words
81 |             self.assertTrue(
82 |                 len(overlap_words) > 0,
83 |                 "No overlap found between consecutive chunks"
84 |             )
85 | 
86 |     def test_fixed_words_preservation(self):
87 |         """Test that domain-specific fixed words are preserved during segmentation."""
88 |         # Create text with fixed words
89 |         text = "COVID-19 đang là vấn đề toàn cầu, AI và NLP giúp nghiên cứu nhanh hơn."
90 | 
91 |         # Process the text
92 |         segmented_tokens = self.preprocessor.segment_words(text)
93 | 
94 |         # Fixed words should be preserved as is (check they exist in the token list)
95 |         self.assertTrue("COVID-19" in segmented_tokens)
96 |         self.assertTrue("AI" in segmented_tokens)
97 |         self.assertTrue("NLP" in segmented_tokens)
98 | 
99 |     def test_embedding_generation(self):
100 |         """Test embedding generation for Vietnamese text."""
101 |         # Create a small chunk of Vietnamese text
102 |         text = "Xử lý ngôn ngữ tự nhiên cho tiếng Việt."
103 | 
104 |         # Generate embedding
105 |         embedding = self.embedding_service.get_embedding(text)
106 | 
107 |         # Check embedding dimensions
108 |         self.assertEqual(len(embedding), self.embedding_service.model_dim)
109 | 
110 |         # Check that all values are floats
111 |         self.assertTrue(all(isinstance(value, float) for value in embedding))
112 | 
113 | 
114 | if __name__ == "__main__":
115 |     unittest.main()
--------------------------------------------------------------------------------
/run_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import json
4 | import logging
5 | from app.services.preprocessor import TextPreprocessor
6 | from app.services.embeddings import EmbeddingService
7 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, MAX_TOKEN_LIMIT
8 | 
9 | # Configure logging
10 | logging.basicConfig(
11 |     level=logging.INFO,
12 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
13 | )
14 | logger = logging.getLogger(__name__)
15 | 
16 | def main():
17 |     """Run a simple test of the Vietnamese RAG system."""
18 |     logger.info("Testing Vietnamese RAG preprocessing and embedding...")
19 | 
20 |     # Sample Vietnamese text for testing - longer sample to demonstrate chunking
21 |     sample_text = """
22 |     Xử lý ngôn ngữ tự nhiên (NLP) là một lĩnh vực nghiên cứu quan trọng trong trí tuệ nhân tạo.
23 |     Đối với tiếng Việt, việc xử lý ngôn ngữ tự nhiên có những thách thức riêng do đặc điểm ngôn ngữ.
24 |     Tiếng Việt là một ngôn ngữ đơn lập, có thanh điệu, và một từ có thể bao gồm nhiều âm tiết.
25 |     Phân đoạn từ trong tiếng Việt là một bước quan trọng để xử lý văn bản tiếng Việt chính xác.
26 | 
27 |     Hệ thống RAG (Retrieval Augmented Generation) cho tiếng Việt cần phải giải quyết các vấn đề như:
28 |     1. Chuẩn hóa văn bản tiếng Việt
29 |     2. Phân đoạn câu và từ chính xác
30 |     3. Xử lý các từ đặc biệt như COVID-19, AI, NLP
31 |     4. Tính toán độ tương đồng ngữ nghĩa giữa các đoạn văn bản
32 | 
33 |     Việc chia nhỏ văn bản thành các đoạn có kích thước phù hợp (chunking) cũng đòi hỏi phải
34 |     hiểu cấu trúc ngữ pháp tiếng Việt để không chia cắt thông tin ở những vị trí không phù hợp.
35 |     Khi các đoạn văn bản đã được chia nhỏ, chúng sẽ được chuyển đổi thành các vector embedding
36 |     để có thể tìm kiếm ngữ nghĩa hiệu quả.
37 | 38 | Underthesea là một thư viện xử lý ngôn ngữ tự nhiên cho tiếng Việt được phát triển bởi nhóm 39 | nghiên cứu Underthesea. Thư viện này cung cấp nhiều công cụ hữu ích như phân đoạn từ (word segmentation), 40 | phân đoạn câu (sentence segmentation), chuẩn hóa văn bản (text normalization), và nhiều chức năng khác. 41 | 42 | Khi phát triển hệ thống RAG cho tiếng Việt, chúng ta cần đảm bảo rằng các đoạn văn bản được chia nhỏ 43 | có kích thước không quá 128 token để phù hợp với giới hạn của mô hình embedding. Đồng thời, các đoạn 44 | văn bản cần có sự chồng lấp (overlap) để đảm bảo tính liên tục của ngữ nghĩa. 45 | """ 46 | 47 | # Initialize services 48 | logger.info("Initializing services...") 49 | preprocessor = TextPreprocessor() 50 | embedding_service = EmbeddingService(preprocessor=preprocessor) 51 | 52 | # Step 1: Normalize text 53 | logger.info("Step 1: Normalizing text...") 54 | normalized_text = preprocessor.normalize_text(sample_text) 55 | print(f"\nNormalized text (first 200 chars):\n{normalized_text[:200]}...\n") 56 | 57 | # Step 2: Count tokens 58 | logger.info("Step 2: Counting tokens...") 59 | token_count = preprocessor.count_tokens(normalized_text) 60 | print(f"Total token count: {token_count}\n") 61 | 62 | # Step 3: Process text into chunks 63 | logger.info(f"Step 3: Processing text into chunks (size={DEFAULT_CHUNK_SIZE}, overlap={DEFAULT_CHUNK_OVERLAP})...") 64 | chunks = preprocessor.process_text( 65 | text=normalized_text, 66 | chunk_size=DEFAULT_CHUNK_SIZE, 67 | chunk_overlap=DEFAULT_CHUNK_OVERLAP 68 | ) 69 | 70 | print(f"Created {len(chunks)} chunks\n") 71 | 72 | # Print detailed chunk information 73 | print("=== CHUNK DETAILS ===") 74 | for i, chunk in enumerate(chunks): 75 | print(f"Chunk {i+1} ({chunk['metadata']['token_count']} tokens):") 76 | print(f"Text: {chunk['text'][:100]}...\n") 77 | 78 | # Show visualization of chunk coverage 79 | if i < len(chunks) - 1: 80 | # Find overlap with next chunk 81 | current_words = set(chunk['text'].split()) 82 | next_words = set(chunks[i+1]['text'].split()) 83 | overlap_words = current_words.intersection(next_words) 84 | 85 | print(f"Overlap with next chunk: {len(overlap_words)} words") 86 | 87 | # Show some overlap words 88 | if overlap_words: 89 | print(f"Sample overlap words: {list(overlap_words)[:5]}") 90 | print("-" * 80) 91 | 92 | # Step 4: Generate embeddings 93 | logger.info("Step 4: Generating embeddings...") 94 | chunks_with_embeddings = embedding_service.embed_chunks(chunks) 95 | 96 | # Print embedding dimensions 97 | embedding_dim = len(chunks_with_embeddings[0]['embedding']) 98 | print(f"\nEmbedding dimensions: {embedding_dim}\n") 99 | 100 | # Step 5: Demonstrate the RAG query process 101 | logger.info("Step 5: Testing similarity search...") 102 | query = "Xử lý ngôn ngữ tự nhiên tiếng Việt" 103 | print(f"\nQuery: \"{query}\"\n") 104 | 105 | # Segment and count tokens in the query 106 | segmented_query = preprocessor.segment_words(query) 107 | query_tokens = len(segmented_query) # Now just use len since segment_words returns a list 108 | print(f"Segmented query: {' '.join(segmented_query)}") 109 | print(f"Query token count: {query_tokens}\n") 110 | 111 | # Check token limit 112 | if query_tokens > MAX_TOKEN_LIMIT: 113 | print(f"WARNING: Query exceeds max token limit ({query_tokens} > {MAX_TOKEN_LIMIT})") 114 | print("Would need to be chunked for production use.") 115 | 116 | try: 117 | # Perform similarity search 118 | embeddings = [chunk["embedding"] for chunk in chunks_with_embeddings] 119 | texts = 
[chunk["text"] for chunk in chunks_with_embeddings] 120 | metadata = [chunk["metadata"] for chunk in chunks_with_embeddings] 121 | 122 | matches = embedding_service.similarity_search( 123 | query=query, 124 | embeddings=embeddings, 125 | texts=texts, 126 | metadata=metadata, 127 | top_k=2 128 | ) 129 | 130 | # Display search results 131 | print("=== SEARCH RESULTS ===") 132 | for i, match in enumerate(matches): 133 | print(f"Match {i+1} (score: {match['score']:.4f}):") 134 | print(f"Text: {match['text'][:150]}...\n") 135 | print(f"Token count: {match['metadata']['token_count']}") 136 | print("-" * 80) 137 | except ValueError as e: 138 | print(f"ERROR: {str(e)}") 139 | 140 | logger.info("Test completed successfully!") 141 | return 0 142 | 143 | if __name__ == "__main__": 144 | sys.exit(main()) -------------------------------------------------------------------------------- /app/services/embeddings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict, Any, Optional, Union 3 | import numpy as np 4 | from functools import lru_cache 5 | from sentence_transformers import SentenceTransformer 6 | 7 | from app.config import ( 8 | EMBEDDING_MODEL, 9 | MAX_TOKEN_LIMIT, 10 | ENABLE_CACHE, 11 | CACHE_SIZE 12 | ) 13 | from app.services.preprocessor import TextPreprocessor 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class EmbeddingService: 19 | """Service for generating embeddings for text using sentence-transformers.""" 20 | 21 | def __init__(self, model_name: str = EMBEDDING_MODEL, preprocessor=None): 22 | """ 23 | Initialize the embedding service. 24 | 25 | Args: 26 | model_name: Name of the sentence-transformers model to use 27 | preprocessor: Optional TextPreprocessor instance 28 | """ 29 | logger.info(f"Loading embedding model: {model_name}") 30 | self.model = SentenceTransformer(model_name) 31 | self.model_dim = self.model.get_sentence_embedding_dimension() 32 | logger.info(f"Model loaded. Embedding dimension: {self.model_dim}") 33 | 34 | # Use provided preprocessor or create one 35 | self.preprocessor = preprocessor or TextPreprocessor() 36 | 37 | # Set up caching if enabled 38 | if ENABLE_CACHE: 39 | self.get_embedding = lru_cache(maxsize=CACHE_SIZE)(self._get_embedding) 40 | else: 41 | self.get_embedding = self._get_embedding 42 | 43 | def _get_embedding(self, text: str) -> List[float]: 44 | """ 45 | Generate embedding for a text string. 46 | 47 | Args: 48 | text: Text to generate embedding for 49 | 50 | Returns: 51 | List of floats representing the embedding vector 52 | """ 53 | if not text or not isinstance(text, str): 54 | logger.warning("Empty or invalid text provided for embedding generation") 55 | return [0.0] * self.model_dim 56 | 57 | # Use preprocessor for token counting only 58 | token_count = self.preprocessor.count_tokens(text) 59 | 60 | # Check against token limit 61 | if token_count > MAX_TOKEN_LIMIT: 62 | logger.error( 63 | f"Text exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT}). " 64 | f"Please chunk your text before encoding." 
65 | ) 66 | raise ValueError(f"Text exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})") 67 | 68 | try: 69 | # Directly encode the text string 70 | embedding = self.model.encode(text).tolist() 71 | return embedding 72 | except Exception as e: 73 | logger.error(f"Error generating embedding: {str(e)}") 74 | return [0.0] * self.model_dim 75 | 76 | def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]: 77 | """ 78 | Generate embeddings for a batch of texts. 79 | 80 | Args: 81 | texts: List of texts to generate embeddings for 82 | 83 | Returns: 84 | List of embedding vectors 85 | """ 86 | if not texts: 87 | return [] 88 | 89 | # Validate texts are within token limit 90 | for i, text in enumerate(texts): 91 | if not text or not isinstance(text, str): 92 | logger.warning(f"Empty or invalid text at index {i}") 93 | continue 94 | 95 | # Check token count 96 | token_count = self.preprocessor.count_tokens(text) 97 | if token_count > MAX_TOKEN_LIMIT: 98 | logger.error( 99 | f"Text at index {i} exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT}). " 100 | f"Please chunk your text before encoding." 101 | ) 102 | raise ValueError(f"Text at index {i} exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})") 103 | 104 | try: 105 | # Let the model handle the batch encoding directly 106 | embeddings = self.model.encode(texts).tolist() 107 | return embeddings 108 | except Exception as e: 109 | logger.error(f"Error generating batch embeddings: {str(e)}") 110 | return [[0.0] * self.model_dim] * len(texts) 111 | 112 | def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 113 | """ 114 | Generate embeddings for a list of text chunks. 115 | 116 | Args: 117 | chunks: List of chunk dictionaries with text and metadata 118 | 119 | Returns: 120 | List of chunk dictionaries with added embeddings 121 | """ 122 | if not chunks: 123 | return [] 124 | 125 | # Extract texts from chunks 126 | texts = [chunk["text"] for chunk in chunks] 127 | 128 | # Generate embeddings 129 | embeddings = self.get_embeddings_batch(texts) 130 | 131 | # Add embeddings to chunks 132 | result_chunks = [] 133 | for chunk, embedding in zip(chunks, embeddings): 134 | chunk_with_embedding = chunk.copy() 135 | chunk_with_embedding["embedding"] = embedding 136 | result_chunks.append(chunk_with_embedding) 137 | 138 | return result_chunks 139 | 140 | def similarity_search( 141 | self, 142 | query: str, 143 | embeddings: List[List[float]], 144 | texts: List[str], 145 | metadata: Optional[List[Dict[str, Any]]] = None, 146 | top_k: int = 5 147 | ) -> List[Dict[str, Any]]: 148 | """ 149 | Find the most similar texts to a query. 
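        Scores are cosine similarities: the dot product of the query embedding
        with each candidate embedding, divided by the product of their norms,
        so values lie in [-1, 1] and higher means more similar.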
150 | 151 | Args: 152 | query: Query text 153 | embeddings: List of embedding vectors to search 154 | texts: List of texts corresponding to the embeddings 155 | metadata: Optional list of metadata for each text 156 | top_k: Number of top matches to return 157 | 158 | Returns: 159 | List of matches with text, score, and metadata 160 | """ 161 | if not query or not embeddings or not texts: 162 | return [] 163 | 164 | if metadata is None: 165 | metadata = [{} for _ in range(len(texts))] 166 | 167 | # Generate query embedding 168 | query_embedding = self.get_embedding(query) 169 | 170 | # Convert to numpy arrays for efficient computation 171 | query_embedding_np = np.array(query_embedding) 172 | embeddings_np = np.array(embeddings) 173 | 174 | # Compute cosine similarity 175 | similarity_scores = np.dot(embeddings_np, query_embedding_np) / ( 176 | np.linalg.norm(embeddings_np, axis=1) * np.linalg.norm(query_embedding_np) 177 | ) 178 | 179 | # Get top-k indices 180 | if top_k > len(texts): 181 | top_k = len(texts) 182 | 183 | top_indices = np.argsort(similarity_scores)[-top_k:][::-1] 184 | 185 | # Prepare results 186 | results = [] 187 | for idx in top_indices: 188 | results.append({ 189 | "text": texts[idx], 190 | "score": float(similarity_scores[idx]), 191 | "metadata": metadata[idx] 192 | }) 193 | 194 | return results -------------------------------------------------------------------------------- /app/api/routes.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, HTTPException, Depends 2 | from typing import List 3 | import logging 4 | from functools import lru_cache 5 | 6 | from app.models.schemas import ( 7 | ProcessingRequest, 8 | ProcessingResponse, 9 | QueryRequest, 10 | QueryResponse, 11 | TextChunk, 12 | ChunkData, 13 | QueryMatch, 14 | NormalizationResponse, 15 | BaseResponse, 16 | EmbeddingResponse 17 | ) 18 | from app.services.preprocessor import TextPreprocessor 19 | from app.services.embeddings import EmbeddingService 20 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, DEFAULT_TOP_K, MAX_TOKEN_LIMIT 21 | 22 | router = APIRouter() 23 | logger = logging.getLogger(__name__) 24 | 25 | # Updated dependency injection with caching 26 | @lru_cache 27 | def get_preprocessor(): 28 | return TextPreprocessor() 29 | 30 | @lru_cache 31 | def get_embedding_service(preprocessor: TextPreprocessor = Depends(get_preprocessor)): 32 | return EmbeddingService(preprocessor=preprocessor) 33 | 34 | # In-memory storage for chunks and embeddings 35 | # In a production system, this would be replaced by a vector database 36 | stored_chunks = [] 37 | 38 | 39 | @router.get("/status") 40 | async def get_status(): 41 | return BaseResponse( 42 | status="ok", 43 | message="Server is running" 44 | ) 45 | 46 | @router.post("/process", response_model=List[EmbeddingResponse]) 47 | async def process_text( 48 | request: ProcessingRequest, 49 | preprocessor: TextPreprocessor = Depends(get_preprocessor), 50 | embedding_service: EmbeddingService = Depends(get_embedding_service) 51 | ) -> List[EmbeddingResponse]: 52 | """ 53 | Process text into chunks and generate embeddings. 54 | 55 | This endpoint: 56 | 1. Normalizes the Vietnamese text using underthesea's text_normalize 57 | 2. Segments the text into sentences using underthesea's sent_tokenize 58 | 3. Segments words with underthesea's word_tokenize and fixed-word preservation 59 | 4. 
Chunks the text with target size of 110 tokens and 20 token overlap 60 | (or as specified in the request) 61 | 5. Generates embeddings for each chunk using vietnamese-bi-encoder 62 | 63 | Each chunk is guaranteed to: 64 | - Not exceed 128 tokens (MAX_TOKEN_LIMIT) 65 | - Have proper overlap with adjacent chunks 66 | - Preserve sentence boundaries when possible 67 | """ 68 | try: 69 | logger.info(f"Processing text of length {len(request.text)} with chunk_size={request.chunk_size}, chunk_overlap={request.chunk_overlap}") 70 | 71 | # Validate chunk parameters 72 | chunk_size = request.chunk_size or DEFAULT_CHUNK_SIZE 73 | chunk_overlap = request.chunk_overlap or DEFAULT_CHUNK_OVERLAP 74 | 75 | if chunk_size > MAX_TOKEN_LIMIT: 76 | logger.warning(f"Requested chunk_size {chunk_size} exceeds MAX_TOKEN_LIMIT {MAX_TOKEN_LIMIT}") 77 | chunk_size = MAX_TOKEN_LIMIT 78 | 79 | if chunk_overlap >= chunk_size: 80 | logger.warning(f"Requested chunk_overlap {chunk_overlap} is too large") 81 | chunk_overlap = chunk_size - 1 82 | 83 | # Process text into chunks 84 | chunks = preprocessor.process_text( 85 | text=request.text, 86 | chunk_size=chunk_size, 87 | chunk_overlap=chunk_overlap 88 | ) 89 | 90 | logger.info(f"Text processed into {len(chunks)} chunks") 91 | 92 | # Log token counts for debugging 93 | token_counts = [chunk["metadata"]["token_count"] for chunk in chunks] 94 | logger.debug(f"Token counts per chunk: {token_counts}") 95 | logger.debug(f"Min tokens: {min(token_counts) if token_counts else 0}, Max tokens: {max(token_counts) if token_counts else 0}") 96 | 97 | # Calculate average chunk size 98 | avg_chunk_size = sum(token_counts) / len(token_counts) if token_counts else 0 99 | logger.info(f"Average chunk size: {avg_chunk_size:.1f} tokens") 100 | 101 | # Verify no chunks exceed the token limit 102 | for i, chunk in enumerate(chunks): 103 | token_count = chunk["metadata"]["token_count"] 104 | if token_count > MAX_TOKEN_LIMIT: 105 | logger.error(f"Chunk {i} exceeds token limit: {token_count} > {MAX_TOKEN_LIMIT}") 106 | raise HTTPException( 107 | status_code=400, 108 | detail=f"Chunk {i} exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})" 109 | ) 110 | 111 | # Generate embeddings for chunks 112 | try: 113 | chunks_with_embeddings = embedding_service.embed_chunks(chunks) 114 | except ValueError as e: 115 | # Catch token limit errors from embedding service 116 | logger.error(f"Token limit error during embedding: {str(e)}") 117 | raise HTTPException(status_code=400, detail=str(e)) 118 | 119 | # Add file metadata to each chunk 120 | for chunk in chunks_with_embeddings: 121 | if "metadata" not in chunk: 122 | chunk["metadata"] = {} 123 | 124 | if request.file_id: 125 | chunk["metadata"]["file_id"] = request.file_id 126 | 127 | if request.file_title: 128 | chunk["metadata"]["file_title"] = request.file_title 129 | 130 | # Store chunks and embeddings in memory 131 | global stored_chunks 132 | stored_chunks = chunks_with_embeddings 133 | 134 | # Convert to response model 135 | chunk_objects = [] 136 | for chunk in chunks_with_embeddings: 137 | chunk_objects.append(EmbeddingResponse( 138 | embedding=chunk["embedding"] 139 | )) 140 | 141 | return chunk_objects 142 | 143 | except HTTPException: 144 | raise 145 | except ValueError as e: 146 | logger.error(f"Value error processing text: {str(e)}") 147 | raise HTTPException(status_code=400, detail=str(e)) 148 | except Exception as e: 149 | logger.error(f"Error processing text: {str(e)}", exc_info=True) 150 | raise HTTPException(status_code=500, 
detail=f"Error processing text: {str(e)}") 151 | 152 | 153 | @router.post("/query", response_model=QueryResponse) 154 | async def query_similar( 155 | request: QueryRequest, 156 | preprocessor: TextPreprocessor = Depends(get_preprocessor), 157 | embedding_service: EmbeddingService = Depends(get_embedding_service) 158 | ) -> QueryResponse: 159 | """ 160 | Find chunks similar to the query text. 161 | """ 162 | try: 163 | global stored_chunks 164 | 165 | if not stored_chunks: 166 | raise HTTPException(status_code=400, detail="No chunks available. Process text first.") 167 | 168 | logger.info(f"Querying with '{request.query_text}', top_k={request.top_k}") 169 | 170 | # Verify query doesn't exceed token limit 171 | tokens = preprocessor.segment_words(request.query_text) 172 | token_count = len(tokens) 173 | if token_count > MAX_TOKEN_LIMIT: 174 | logger.error(f"Query exceeds token limit: {token_count} > {MAX_TOKEN_LIMIT}") 175 | raise HTTPException( 176 | status_code=400, 177 | detail=f"Query exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})" 178 | ) 179 | 180 | # Extract embeddings and texts from stored chunks 181 | embeddings = [chunk["embedding"] for chunk in stored_chunks] 182 | texts = [chunk["text"] for chunk in stored_chunks] 183 | metadata = [chunk["metadata"] for chunk in stored_chunks] 184 | 185 | # Perform similarity search 186 | top_k = request.top_k or DEFAULT_TOP_K 187 | try: 188 | matches = embedding_service.similarity_search( 189 | query=request.query_text, 190 | embeddings=embeddings, 191 | texts=texts, 192 | metadata=metadata, 193 | top_k=top_k 194 | ) 195 | except ValueError as e: 196 | # Catch token limit errors from embedding service 197 | logger.error(f"Token limit error during similarity search: {str(e)}") 198 | raise HTTPException(status_code=400, detail=str(e)) 199 | 200 | # Convert to response model 201 | match_objects = [] 202 | for match in matches: 203 | match_objects.append(QueryMatch( 204 | text=match["text"], 205 | score=match["score"], 206 | metadata=match["metadata"] 207 | )) 208 | 209 | logger.info(f"Found {len(match_objects)} matching chunks") 210 | 211 | return QueryResponse( 212 | matches=match_objects, 213 | total_matches=len(match_objects) 214 | ) 215 | 216 | except HTTPException: 217 | raise 218 | except ValueError as e: 219 | logger.error(f"Value error querying similar chunks: {str(e)}") 220 | raise HTTPException(status_code=400, detail=str(e)) 221 | except Exception as e: 222 | logger.error(f"Error querying similar chunks: {str(e)}", exc_info=True) 223 | raise HTTPException(status_code=500, detail=f"Error querying similar chunks: {str(e)}") 224 | 225 | 226 | # New endpoint for text normalization 227 | @router.post("/normalize", response_model=NormalizationResponse) 228 | async def normalize_text( 229 | request: ProcessingRequest, # Reuse the existing ProcessingRequest schema 230 | preprocessor: TextPreprocessor = Depends(get_preprocessor), 231 | ) -> NormalizationResponse: 232 | """ 233 | Normalize, sentence segment, and word segment the input text. 234 | 235 | This endpoint: 236 | 1. Normalizes Vietnamese text. 237 | 2. Segments the text into sentences. 238 | 3. Segments the sentences into words. 239 | """ 240 | try: 241 | logger.info(f"Normalizing text of length {len(request.text)}") 242 | 243 | # 1. Normalize text 244 | normalized_text = preprocessor.normalize_text(request.text) 245 | 246 | # 2. Segment sentences 247 | sentences = preprocessor.segment_sentences(normalized_text) 248 | 249 | # 3. 
Segment words (preserving sentence structure) 250 | segmented_sentences = [] 251 | for sentence in sentences: 252 | segmented_sentences.append(preprocessor.segment_words(sentence)) 253 | 254 | return NormalizationResponse( 255 | # normalized_text=normalized_text, 256 | # sentences=sentences, 257 | segmented_sentences=segmented_sentences 258 | ) 259 | 260 | except Exception as e: 261 | logger.error(f"Error normalizing text: {str(e)}", exc_info=True) 262 | raise HTTPException(status_code=500, detail=f"Error normalizing text: {str(e)}") -------------------------------------------------------------------------------- /app/services/preprocessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Dict, Any, Optional, Tuple 3 | import logging 4 | from underthesea import word_tokenize, sent_tokenize, text_normalize 5 | from app.config import MAX_TOKEN_LIMIT, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | # List of domain-specific fixed words that shouldn't be segmented 10 | FIXED_WORDS = [ 11 | "COVID-19", 12 | "AI", 13 | "NLP", 14 | "RAG", 15 | # Add more domain-specific terms here 16 | ] 17 | 18 | 19 | class TextPreprocessor: 20 | def __init__(self, fixed_words: Optional[List[str]] = None): 21 | """ 22 | Initialize the text preprocessor. 23 | 24 | Args: 25 | fixed_words: List of domain-specific fixed words that shouldn't be segmented 26 | """ 27 | self.fixed_words = fixed_words or FIXED_WORDS 28 | # Compile a regex pattern for fixed words to avoid segmentation 29 | self.fixed_words_pattern = self._compile_fixed_words_pattern() 30 | 31 | def _compile_fixed_words_pattern(self) -> re.Pattern: 32 | """Compile a regex pattern for fixed words to avoid segmentation.""" 33 | if not self.fixed_words: 34 | return re.compile(r"^$") # Empty pattern 35 | 36 | # Escape special characters and join with OR 37 | escaped_words = [re.escape(word) for word in self.fixed_words] 38 | pattern = r"\b(" + "|".join(escaped_words) + r")\b" 39 | return re.compile(pattern, re.IGNORECASE) 40 | 41 | def normalize_text(self, text: str) -> str: 42 | """ 43 | Normalize text using underthesea's text_normalize and clean whitespace. 44 | 45 | Args: 46 | text: Raw text to normalize 47 | 48 | Returns: 49 | Normalized text 50 | """ 51 | if not text or not isinstance(text, str): 52 | return "" 53 | 54 | # Use underthesea's text normalization 55 | text = text_normalize(text) 56 | 57 | # Replace multiple spaces with a single space 58 | text = re.sub(r'\s+', ' ', text) 59 | text = text.strip() 60 | 61 | return text 62 | 63 | def segment_words(self, text: str) -> List[str]: 64 | """ 65 | Segment Vietnamese text into words while preserving fixed words. 
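        Fixed words (e.g. COVID-19, AI, NLP) are first swapped for placeholder
        tokens so that word_tokenize cannot split them, then restored in the
        resulting token list.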
66 | 67 | Args: 68 | text: Text to segment 69 | 70 | Returns: 71 | List of segmented words 72 | """ 73 | if not text: 74 | return [] 75 | 76 | # Extract fixed words and replace with placeholders 77 | placeholders = {} 78 | def replace_with_placeholder(match): 79 | word = match.group(0) 80 | placeholder = f"__FIXED_WORD_{len(placeholders)}__" 81 | placeholders[placeholder] = word 82 | return placeholder 83 | 84 | text_with_placeholders = self.fixed_words_pattern.sub(replace_with_placeholder, text) 85 | 86 | # Segment words - by default, word_tokenize returns a list of tokens 87 | segmented_tokens = word_tokenize(text_with_placeholders) 88 | 89 | # Restore fixed words in the tokens 90 | if placeholders: 91 | # If word_tokenize returned a list, process each token 92 | if isinstance(segmented_tokens, list): 93 | for i, token in enumerate(segmented_tokens): 94 | for placeholder, word in placeholders.items(): 95 | if placeholder in token: 96 | segmented_tokens[i] = token.replace(placeholder, word) 97 | # If it returned a string (in case of format="text"), process the string 98 | elif isinstance(segmented_tokens, str): 99 | for placeholder, word in placeholders.items(): 100 | segmented_tokens = segmented_tokens.replace(placeholder, word) 101 | # Convert to list by splitting on spaces 102 | segmented_tokens = segmented_tokens.split() 103 | 104 | return segmented_tokens 105 | 106 | def segment_sentences(self, text: str) -> List[str]: 107 | """ 108 | Split text into sentences using underthesea's sent_tokenize. 109 | 110 | Args: 111 | text: Text to split into sentences 112 | 113 | Returns: 114 | List of sentences 115 | """ 116 | if not text: 117 | return [] 118 | 119 | sentences = sent_tokenize(text) 120 | return sentences 121 | 122 | def count_tokens(self, text: str) -> int: 123 | """ 124 | Count tokens in Vietnamese text accurately by first segmenting words. 125 | 126 | Args: 127 | text: Text to count tokens for 128 | 129 | Returns: 130 | Accurate token count for Vietnamese text 131 | """ 132 | if not text: 133 | return 0 134 | 135 | # Simply get the list of tokens and count them 136 | tokens = self.segment_words(text) 137 | return len(tokens) 138 | 139 | def process_text(self, text: str, chunk_size: int = DEFAULT_CHUNK_SIZE, 140 | chunk_overlap: int = DEFAULT_CHUNK_OVERLAP) -> List[Dict[str, Any]]: 141 | """ 142 | Main method to process text into chunks with proper overlap. 143 | 144 | Args: 145 | text: Raw text to process 146 | chunk_size: Target size of each chunk in tokens (default: 110) 147 | chunk_overlap: Number of tokens to overlap between chunks (default: 20) 148 | 149 | Returns: 150 | List of chunk dictionaries, each with text and metadata 151 | """ 152 | # Normalize text 153 | normalized_text = self.normalize_text(text) 154 | 155 | # Segment into sentences 156 | sentences = self.segment_sentences(normalized_text) 157 | 158 | # Create chunks with proper overlap 159 | chunks = self._create_chunks_from_sentences(sentences, chunk_size, chunk_overlap) 160 | 161 | # Validate and adjust chunk sizes if needed 162 | chunks = self._validate_chunk_sizes(chunks) 163 | 164 | return chunks 165 | 166 | def _create_chunks_from_sentences(self, sentences: List[str], 167 | chunk_size: int, chunk_overlap: int) -> List[Dict[str, Any]]: 168 | """ 169 | Create chunks from a list of sentences with proper overlap. 
170 | 171 | Args: 172 | sentences: List of sentences to process 173 | chunk_size: Max token count for each chunk 174 | chunk_overlap: Number of tokens to overlap between chunks 175 | 176 | Returns: 177 | List of chunk dictionaries 178 | """ 179 | chunks = [] 180 | current_chunk_tokens = [] 181 | current_size = 0 182 | 183 | for sentence in sentences: 184 | # Get tokens for this sentence 185 | sentence_tokens = self.segment_words(sentence) 186 | sentence_token_count = len(sentence_tokens) 187 | 188 | if sentence_token_count > chunk_size: 189 | # Handle long sentences separately 190 | self._handle_long_sentence(chunks, current_chunk_tokens, current_size, 191 | sentence_tokens, chunk_size) 192 | # Reset current chunk tracking 193 | current_chunk_tokens = [] 194 | current_size = 0 195 | continue 196 | 197 | if current_size + sentence_token_count > chunk_size: 198 | # Finish current chunk and start new one with overlap 199 | chunks.append(self._create_chunk(current_chunk_tokens, current_size)) 200 | # Create overlap for next chunk 201 | current_chunk_tokens, current_size = self._create_overlap( 202 | current_chunk_tokens, current_size, chunk_overlap) 203 | 204 | # Add sentence to current chunk 205 | current_chunk_tokens.extend(sentence_tokens) 206 | current_size += sentence_token_count 207 | 208 | # Add the final chunk if not empty 209 | if current_chunk_tokens: 210 | chunks.append(self._create_chunk(current_chunk_tokens, current_size)) 211 | 212 | return chunks 213 | 214 | def _handle_long_sentence(self, chunks: List[Dict[str, Any]], 215 | current_chunk_tokens: List[str], current_size: int, 216 | sentence_tokens: List[str], chunk_size: int) -> None: 217 | """ 218 | Handle sentences that are longer than the chunk size. 219 | 220 | Args: 221 | chunks: List of chunks to append to 222 | current_chunk_tokens: Tokens in the current chunk 223 | current_size: Size of current chunk in tokens 224 | sentence_tokens: Tokens of the long sentence 225 | chunk_size: Maximum chunk size 226 | """ 227 | # First save any existing chunk 228 | if current_chunk_tokens: 229 | chunks.append(self._create_chunk(current_chunk_tokens, current_size)) 230 | 231 | # Split long sentence into parts 232 | current_part = [] 233 | current_part_tokens = 0 234 | 235 | for token in sentence_tokens: 236 | if current_part_tokens + 1 <= chunk_size: 237 | current_part.append(token) 238 | current_part_tokens += 1 239 | else: 240 | # Save current part and start a new one 241 | chunks.append(self._create_chunk(current_part, current_part_tokens)) 242 | current_part = [token] 243 | current_part_tokens = 1 244 | 245 | # Save any remaining part 246 | if current_part: 247 | chunks.append(self._create_chunk(current_part, current_part_tokens)) 248 | 249 | def _create_overlap(self, tokens: List[str], size: int, 250 | overlap_size: int) -> Tuple[List[str], int]: 251 | """ 252 | Create overlap for the next chunk. 
253 | 
254 |         Args:
255 |             tokens: Tokens from the previous chunk
256 |             size: Size of the previous chunk
257 |             overlap_size: Number of tokens to overlap
258 | 
259 |         Returns:
260 |             Tuple of (overlap_tokens, overlap_size)
261 |         """
262 |         overlap_size = min(overlap_size, size)
263 |         if overlap_size <= 0:
264 |             return [], 0
265 | 
266 |         # Take tokens from the end for overlap
267 |         tokens_to_keep = []
268 |         tokens_kept = 0
269 | 
270 |         for token in reversed(tokens):
271 |             if tokens_kept < overlap_size:
272 |                 tokens_to_keep.insert(0, token)
273 |                 tokens_kept += 1
274 |             else:
275 |                 break
276 | 
277 |         return tokens_to_keep, tokens_kept
278 | 
279 |     def _create_chunk(self, tokens: List[str], token_count: int) -> Dict[str, Any]:
280 |         """
281 |         Create a chunk dictionary from tokens.
282 | 
283 |         Args:
284 |             tokens: List of tokens to include in the chunk
285 |             token_count: Number of tokens
286 | 
287 |         Returns:
288 |             Chunk dictionary with text and metadata
289 |         """
290 |         return {
291 |             "text": " ".join(tokens),
292 |             "metadata": {
293 |                 "token_count": token_count
294 |             }
295 |         }
296 | 
297 |     def _validate_chunk_sizes(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
298 |         """
299 |         Validate that all chunks are within the token limit.
300 | 
301 |         Args:
302 |             chunks: List of chunks to validate
303 | 
304 |         Returns:
305 |             List of validated/adjusted chunks
306 |         """
307 |         validated_chunks = []
308 | 
309 |         for chunk in chunks:
310 |             token_count = chunk["metadata"]["token_count"]
311 | 
312 |             if token_count > MAX_TOKEN_LIMIT:
313 |                 # Re-tokenize and truncate if necessary
314 |                 text = chunk["text"]
315 |                 tokens = self.segment_words(text)[:MAX_TOKEN_LIMIT]
316 | 
317 |                 validated_chunks.append({
318 |                     "text": " ".join(tokens),
319 |                     "metadata": {
320 |                         "token_count": len(tokens),
321 |                         "truncated": True
322 |                     }
323 |                 })
324 |             else:
325 |                 validated_chunks.append(chunk)
326 | 
327 |         return validated_chunks
--------------------------------------------------------------------------------
/docs/vietnamese-rag-research.md:
--------------------------------------------------------------------------------
1 | A Comprehensive Text Processing Strategy for a Vietnamese RAG Application Using Underthesea and the BKAI Bi-Encoder
2 | Introduction: Setting the Stage for Vietnamese RAG with Underthesea and the BKAI Bi-Encoder
3 | Retrieval-Augmented Generation (RAG) represents a significant advancement in the field of natural language processing, particularly for languages like Vietnamese, for which extensive, high-quality training datasets for large language models are less abundant than for English. RAG enhances the capabilities of language models by enabling them to access and incorporate information from external knowledge sources during the text generation process. This approach is especially beneficial for knowledge-intensive tasks in Vietnamese, allowing for more accurate, contextually relevant, and informative responses grounded in factual data.
4 | The success of any RAG application is heavily dependent on the effectiveness of the text preprocessing stage. For a language like Vietnamese, which possesses distinctive linguistic characteristics such as its analytic structure, in which multi-syllable words are written as separate space-delimited syllables, and its tonal system, meticulous preprocessing is paramount. Proper handling of text before it is fed into the retrieval and generation components ensures that the underlying models can accurately understand and process the information.
This includes tasks like standardizing the text, segmenting it into meaningful units, and preparing it for embedding. 5 | In the context of Vietnamese natural language processing, the underthesea NLP toolkit stands out as a robust and versatile open-source Python library 1. This toolkit offers a wide array of functionalities specifically designed for Vietnamese, including word segmentation, text normalization, part-of-speech tagging, and named entity recognition. Its active development and widespread adoption within the Vietnamese NLP community make it a reliable and valuable resource for developers 1. 6 | For the crucial task of generating dense vector embeddings, the bkai-foundation-models/vietnamese-bi-encoder has been chosen 4. This model is specifically trained on Vietnamese text and excels at mapping sentences and paragraphs into a high-dimensional semantic space. These embeddings are essential for enabling efficient similarity search and retrieval of relevant documents from the knowledge base. The model's training on a diverse Vietnamese dataset, which includes translated versions of MS MARCO and SQuAD, as well as a significant portion of the Legal Text Retrieval Zalo 2021 challenge dataset, suggests its strong capabilities in understanding the semantic nuances of Vietnamese text across various domains 4. This broad training makes it potentially well-suited for handling data originating from Google Docs and Sheets, which can contain diverse content. 7 | This report aims to provide a comprehensive and tailored text preprocessing strategy for the user's Vietnamese RAG application. The strategy will focus on effectively utilizing the functionalities offered by the underthesea toolkit while strictly adhering to the constraints imposed by the vietnamese-bi-encoder, particularly its maximum input sequence length. The ultimate goal is to ensure optimal performance of the RAG application by preparing the Vietnamese text data in the most suitable manner for embedding and retrieval. 8 | Understanding the Constraints and Requirements 9 | A critical aspect of developing an effective text preprocessing strategy is a thorough understanding of the limitations and requirements of the chosen embedding model. In this case, the bkai-foundation-models/vietnamese-bi-encoder imposes specific constraints that must be carefully considered. 10 | The vietnamese-bi-encoder has a strict maximum input sequence length of 128 tokens 4. This limitation has significant implications for the RAG application. Any text input that exceeds this token count will likely be truncated by the model. Truncation can lead to a loss of crucial context, especially if important information resides towards the end of a long document or sentence. Consequently, the quality of the generated embeddings might be compromised, and the accuracy of the retrieval process could be negatively affected. Therefore, a robust strategy for breaking down the source documents into smaller, manageable chunks, each adhering to this token limit, is essential. This will likely involve sentence segmentation followed by further chunking if necessary. 11 | Furthermore, the vietnamese-bi-encoder requires the input Vietnamese text to be pre-segmented into individual words 4. Unlike English, where spaces typically delineate word boundaries, Vietnamese often requires additional processing to accurately identify words. Word segmentation is a fundamental step for the embedding model to correctly interpret the semantic units within the text. 
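As a rough pre-flight check that respects both constraints, incoming text can be word-segmented first and the resulting tokens counted before anything is sent to the encoder. The sketch below assumes underthesea is installed and mirrors the model card's 128 figure in a local constant; note that the word count only approximates the encoder's own subword token count, so keeping a safety margin below the hard limit is advisable.

```python
# Sketch: word-segment first, then check the token budget.
# The word count is only a proxy for the model's subword count,
# so real pipelines should keep a margin below the hard limit.
from underthesea import word_tokenize

MAX_TOKEN_LIMIT = 128  # hard input limit from the vietnamese-bi-encoder model card

def fits_token_limit(text: str, limit: int = MAX_TOKEN_LIMIT) -> bool:
    tokens = word_tokenize(text)  # returns a list of words by default
    return len(tokens) <= limit
```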
Without proper segmentation, the model might treat syllables or parts of compound words as independent entities, leading to inaccurate vector representations that do not capture the true meaning of the text. This requirement directly necessitates the use of a Vietnamese word segmenter, making the underthesea.word_tokenize function a core component of the preprocessing pipeline. 12 | Finally, the user's documents will originate from Google Docs and Google Sheets. This specifies the initial stage of the preprocessing workflow: data extraction. Strategies must be implemented to programmatically access and retrieve the textual content from these platforms in a format that can be readily processed by the subsequent NLP steps. Google Docs, being a word processing application, might contain rich text formatting, while Google Sheets store data in a tabular format. The extraction methods must be able to handle these different structures and retrieve the relevant text content for the RAG application. 13 | Data Extraction and Initial Handling 14 | The first step in the preprocessing pipeline involves extracting the text data from its source, which, in this case, are Google Docs and Google Sheets. Utilizing the respective Google APIs provides the most reliable and efficient way to accomplish this programmatically. 15 | For Google Docs, the Google Docs API (Application Programming Interface) allows developers to interact with documents stored in Google Drive 7. To use this API, it is necessary to set up a project in the Google Cloud Console, enable the Google Docs API for that project, configure the OAuth consent screen to manage user authorization, and obtain the necessary credentials for authentication 8. Once these steps are completed, the API can be used to retrieve the content of a Google Doc given its unique document ID. The API offers options to extract the document content in various formats, including plain text 7. For a RAG application focused on semantic understanding, extracting the plain text content is generally preferred to avoid processing any formatting information that might not contribute to the meaning. The google-api-python-client library provides a convenient way to interact with the Google Docs API using Python 7. A basic workflow involves authenticating the client using the obtained credentials and then making a request to the API to retrieve the document content based on its ID. It is important to implement proper error handling to manage potential issues during API calls and to be mindful of the API's usage quotas to prevent service disruptions. 16 | Similarly, for Google Sheets, the Google Sheets API enables programmatic access to spreadsheet data 13. The setup process in the Google Cloud Console is analogous to that of the Google Docs API: enabling the Google Sheets API and obtaining the necessary authentication credentials, which can include API keys or OAuth 2.0 credentials 13. To extract data, the API requires specifying the ID of the target spreadsheet and the range of cells or sheets to retrieve 13. Google Sheets store data in a structured, tabular format, so the extraction process might yield data in a grid-like structure. Depending on the specific information needed for the RAG application, this tabular data might need to be flattened or processed to extract the relevant textual content. The google-api-python-client library also supports interaction with the Google Sheets API in Python 17. 
The process involves authenticating and then making requests to the API to read values from the specified spreadsheet and range. Handling different data types within the spreadsheet cells and appropriately structuring the extracted text for further processing will be important considerations.
17 | After successfully extracting the text content from both Google Docs and Google Sheets, an initial data cleaning and preparation phase is recommended. This might involve addressing character encoding issues to ensure consistency across all documents. Removing any irrelevant metadata or boilerplate text that might have been extracted along with the main content, such as headers, footers, or tables of contents, can also be beneficial. Furthermore, converting the extracted text to a standard encoding format, such as UTF-8, will help prevent issues in the subsequent NLP processing steps. These initial cleaning tasks ensure that the data is in a consistent and suitable format for the more advanced preprocessing stages.
18 | Vietnamese Text Normalization using Underthesea
19 | Text normalization plays a crucial role in Vietnamese NLP by addressing the inherent variations that can occur in written text 19. These variations can stem from typos, inconsistencies in the use of diacritics (tone marks), different input methods, and the presence of special characters. The goal of normalization is to reduce these inconsistencies, ensuring that semantically equivalent words are represented in a uniform manner. This standardization is vital for improving the accuracy and effectiveness of downstream NLP tasks, including word segmentation and the generation of meaningful embeddings.
20 | Common normalization tasks for Vietnamese text include diacritic standardization, which ensures a consistent representation of the five tone marks as well as the unmarked base tone; spelling correction, which identifies and rectifies common misspellings and typographical errors; case handling, where lowercasing can reduce vocabulary size and improve matching, although the case sensitivity of the vietnamese-bi-encoder should be verified first; and punctuation handling, which might involve standardizing or removing punctuation marks based on the requirements of the embedding model and the specific RAG tasks 1.
21 | The underthesea library provides a convenient and effective tool for addressing many of these normalization needs through its text_normalize function 1. This function is specifically designed to handle common Vietnamese text normalization tasks, including correcting diacritic errors and standardizing spelling. For example, as demonstrated in the library's documentation, the input string "Ðảm baỏ chất lựơng phòng thí nghịêm hoá học" is transformed into the normalized form "Đảm bảo chất lượng phòng thí nghiệm hóa học" 1. This function can be readily integrated into the preprocessing pipeline as an initial normalization step.
22 | Beyond text_normalize, the underthesea toolkit might offer other functionalities that can further aid in the normalization process 26. Exploring the library's documentation 25 for modules that handle special characters or case conversion could be valuable. Depending on the specific characteristics of the data extracted from Google Docs and Sheets, there might also be a need for more advanced or custom normalization rules.
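Before turning to custom rules, the baseline call is a one-liner; the sketch below wraps text_normalize together with one illustrative whitespace rule of the kind discussed next. The regex is an assumption to be tailored to the actual data, while the example string and its normalized form come from the library documentation cited above.

```python
# Sketch: baseline normalization plus one illustrative custom rule.
import re
from underthesea import text_normalize

def normalize_vietnamese(text: str) -> str:
    text = text_normalize(text)                    # fix diacritic/spelling variants
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)   # drop stray space before punctuation
    return re.sub(r"\s+", " ", text).strip()       # collapse runs of whitespace

# normalize_vietnamese("Ðảm baỏ chất lựơng phòng thí nghịêm hoá học")
# -> "Đảm bảo chất lượng phòng thí nghiệm hóa học"
```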
Such custom rules could be implemented using regular expressions 21 to address specific patterns of errors or variations that are prevalent in the dataset. For instance, if the data contains inconsistencies in spacing around punctuation, regular expressions can be used to standardize this. The choice of normalization techniques should be guided by the specific types of variations observed in the data and the requirements of the subsequent processing steps.
23 | Vietnamese Word Segmentation using Underthesea
24 | Accurate word segmentation is a fundamental prerequisite for the vietnamese-bi-encoder to effectively process and understand Vietnamese text 4. The quality of the word segmentation directly influences the accuracy of the embeddings generated by the model. If words are not correctly identified and segmented, the embedding model might learn flawed representations, leading to poor semantic matching and retrieval performance in the RAG application.
25 | The underthesea library provides the word_tokenize function, which is specifically designed for segmenting Vietnamese text into individual words 1. This function takes a string of Vietnamese text as input and returns a list of segmented words by default. It also offers a format="text" option that returns a single string in which the syllables of each multi-syllable word are joined by underscores and the words themselves are separated by spaces 1. For example, the sentence "Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò" is segmented into the list ['Chàng trai', '9X', 'Quảng Trị', 'khởi nghiệp', 'từ', 'nấm sò'] 1.
26 | A particularly useful feature of word_tokenize is the fixed_words parameter 1. This allows the user to provide a list of multi-word expressions or named entities that should be treated as single tokens during the segmentation process. This is crucial for preserving the meaning of phrases like "Viện Nghiên Cứu" (Research Institute) or "học máy" (machine learning), which should not be split into their constituent words for accurate semantic representation 1. By identifying and providing such fixed words relevant to the domain of the Google Docs and Sheets data, the accuracy of word segmentation can be significantly improved.
27 | Vietnamese word segmentation can be challenging due to the presence of compound words and multi-word expressions 19. These linguistic units often convey a meaning that is more than the sum of their individual components. While underthesea is trained on Vietnamese corpora and incorporates rules and statistical models to handle many of these cases, achieving perfect segmentation can be difficult. For specialized domains or with specific technical terminology present in the user's documents, it might be necessary to refine the segmentation by providing custom lists of fixed_words or by exploring more advanced techniques if the default performance of underthesea is insufficient.
28 | Research suggests that underthesea employs a hybrid approach to word segmentation, likely combining rule-based methods with statistical models trained on Vietnamese text data 1. These methods often involve identifying potential word boundaries based on linguistic rules and then using statistical language models to disambiguate between different possible segmentations. Techniques like maximal matching, where the longest possible valid words are identified, are also likely employed 28.
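The fixed_words mechanism is easy to exercise directly. In the sketch below, the second sentence is invented for illustration, and the exact output may vary by underthesea version:

```python
# Sketch: default segmentation vs. segmentation with protected domain terms.
from underthesea import word_tokenize

print(word_tokenize("Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò"))
# ['Chàng trai', '9X', 'Quảng Trị', 'khởi nghiệp', 'từ', 'nấm sò']

print(word_tokenize("Viện Nghiên Cứu ứng dụng học máy",
                    format="text",
                    fixed_words=["Viện Nghiên Cứu", "học máy"]))
# e.g. 'Viện_Nghiên_Cứu ứng_dụng học_máy' (the protected terms stay whole)
```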
While a deep dive into the exact algorithms used by underthesea might require examining the library's source code or related research papers, understanding the general principles behind its operation can be helpful for troubleshooting and for deciding if further customization or alternative tools might be necessary. 29 | Strategies for Handling the Maximum Sequence Length Constraint 30 | Given the strict maximum sequence length of 128 tokens for the vietnamese-bi-encoder, effective strategies for handling potentially long input texts are crucial. The first step in this process is typically sentence segmentation. 31 | Underthesea provides the sent_tokenize function for dividing a block of Vietnamese text into individual sentences 1. This function analyzes the text and identifies sentence boundaries based on punctuation marks and other linguistic cues. For example, it can split a text containing multiple sentences into a list where each element is a sentence 1. Sentence segmentation is a logical first step as it breaks down large documents into smaller, semantically coherent units. The subsequent word segmentation and embedding processes can then be applied to these individual sentences. However, it is important to recognize that even after sentence segmentation, some sentences, particularly in technical or legal documents, can still be quite lengthy and might exceed the 128-token limit after being segmented into words 2. Therefore, further chunking might be necessary. 32 | Text chunking involves breaking down longer sequences of text into smaller segments. Several chunking strategies are commonly used in RAG applications 37. One straightforward approach is fixed-size chunking with overlap 37. This method involves dividing the word-segmented text into chunks, each containing a predefined maximum number of tokens (considerably less than 128 to account for variations in sentence length). To maintain context across these chunks, a certain number of tokens from the end of one chunk are repeated at the beginning of the next chunk, creating an overlap. The key parameters for this strategy are the chunk_size (the maximum number of tokens per chunk) and the chunk_overlap (the number of overlapping tokens). These parameters need to be carefully chosen and might require experimentation to find the optimal balance between context retention and redundancy 37. 33 | Another strategy is recursive character text splitting 37. This more adaptive method involves splitting the text using a hierarchy of separators, such as paragraphs, sentences, and words, in a recursive manner until the chunks meet the desired size criteria. This approach attempts to respect the natural semantic boundaries within the text, resulting in more contextually coherent chunks compared to fixed-size splitting. While underthesea might not offer a direct implementation of recursive splitting, the logic can be implemented programmatically, or libraries like LangChain, which are often used in conjunction with tools like underthesea, can be employed for this purpose 40. 34 | Finally, semantic chunking involves splitting text based on the semantic similarity of its content 38. This approach aims to group together text segments that are closely related in meaning. Implementing semantic chunking typically involves using embeddings themselves to identify natural breaks in the semantic flow of the text. 
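A minimal embedding-guided sketch follows, under the assumptions that the input sentences have already been word-segmented as the bi-encoder requires and that the 0.5 similarity threshold is merely a starting point to tune:

```python
# Sketch: start a new chunk wherever the cosine similarity between
# neighbouring sentence embeddings drops below a threshold.
from typing import List
import numpy as np
from sentence_transformers import SentenceTransformer

def semantic_chunks(sentences: List[str], threshold: float = 0.5) -> List[str]:
    if not sentences:
        return []
    model = SentenceTransformer("bkai-foundation-models/vietnamese-bi-encoder")
    vectors = model.encode(sentences)
    chunks, current = [], [sentences[0]]
    for prev, cur, sentence in zip(vectors, vectors[1:], sentences[1:]):
        similarity = float(np.dot(prev, cur) /
                           (np.linalg.norm(prev) * np.linalg.norm(cur)))
        if similarity < threshold:  # semantic break: close the current chunk
            chunks.append(" ".join(current))
            current = []
        current.append(sentence)
    chunks.append(" ".join(current))
    return chunks
```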
While this method has the potential to create the most contextually relevant chunks for retrieval, it is also generally more complex to implement and might require using the vietnamese-bi-encoder or another embedding model to guide the chunking process. 35 | Proposed Preprocessing Pipeline for Your Vietnamese RAG Application 36 | Based on the analysis of the requirements and available tools, a comprehensive preprocessing pipeline for the Vietnamese RAG application can be outlined as follows: 37 | Data Extraction: The process begins with extracting the raw text content from the Google Docs and Google Sheets using their respective APIs. This will involve setting up the necessary Google Cloud project, enabling the APIs, handling authentication, and writing code (potentially in Python using the google-api-python-client library) to retrieve the text content based on the document and spreadsheet IDs. 38 | Initial Cleaning: After extraction, perform basic cleaning operations on the text data. This might include handling character encoding issues, removing irrelevant metadata or boilerplate text, and standardizing the text format (e.g., to UTF-8). 39 | Text Normalization: Apply the underthesea.text_normalize function to the cleaned text to correct diacritic errors and standardize spelling. Depending on the specific characteristics of the data, additional custom normalization steps using regular expressions or other techniques might be necessary. 40 | Sentence Segmentation: Use the underthesea.sent_tokenize function to divide the normalized text into individual sentences. This provides an initial level of granularity for further processing. 41 | Word Segmentation: Employ the underthesea.word_tokenize function to segment each sentence into a sequence of words. Consider creating and using a list of fixed_words for any known multi-word expressions or named entities relevant to the content of the Google Docs and Sheets. 42 | Chunking: Implement a suitable chunking strategy to ensure that each text chunk, after word segmentation, does not exceed the 128-token limit of the vietnamese-bi-encoder. A reasonable starting point could be fixed-size chunking with an overlap. For example, aim for a chunk size of around 100-110 tokens after word segmentation and an overlap of 20-30 tokens. Alternatively, explore implementing recursive character text splitting, using sentence boundaries as a primary separator. The choice of strategy and parameters should be guided by experimentation and evaluation. 43 | The optimal chunking strategy might depend on the specific nature and structure of the content in the Google Docs and Sheets. For highly structured documents, recursive splitting that respects headings or other structural elements could be beneficial. For simpler, more uniform text, fixed-size chunking might be sufficient. It is strongly recommended to experiment with different chunking strategies and parameter settings on a representative sample of the data and to evaluate the performance of the RAG application to determine the most effective approach for the user's specific needs. 44 | Key Considerations and Recommendations 45 | Several key considerations and recommendations should guide the implementation of the proposed preprocessing strategy. 46 | The choices made during each step of the preprocessing pipeline will have a direct impact on the quality of the embeddings generated by the vietnamese-bi-encoder and, ultimately, on the retrieval performance of the RAG application. 
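One of the most consequential of those choices is the chunking configuration. To make it concrete, here is a minimal token-window sketch using the 110/20 defaults suggested above; the repository's TextPreprocessor implements a fuller, sentence-aware variant of the same idea.

```python
# Sketch: fixed-size chunking with overlap over word-segmented tokens.
from typing import List

def chunk_tokens(tokens: List[str], chunk_size: int = 110,
                 chunk_overlap: int = 20) -> List[List[str]]:
    if chunk_size <= chunk_overlap:
        raise ValueError("chunk_size must exceed chunk_overlap")
    chunks, start = [], 0
    while start < len(tokens):
        chunks.append(tokens[start:start + chunk_size])
        if start + chunk_size >= len(tokens):
            break
        start += chunk_size - chunk_overlap  # step back to create the overlap
    return chunks
```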
Therefore, each step should be carefully considered and potentially optimized through experimentation. 47 | When using fixed-size chunking, determining the optimal chunk size and overlap is crucial. Starting with a chunk size that leaves a safety margin below the 128-token limit (e.g., aiming for around 100-110 tokens after word segmentation) and an overlap of 20-30 tokens is a reasonable approach. These values should then be iteratively adjusted based on the evaluation of the RAG application's performance. Too small a chunk size might lead to a loss of necessary context, while too large a size risks exceeding the model's input limit. Similarly, insufficient overlap might break semantic connections between chunks, whereas excessive overlap can introduce unnecessary redundancy and increase processing time. 48 | It is also important to consider the potential for out-of-vocabulary (OOV) words after word segmentation. These are words that are not present in the vocabulary of the vietnamese-bi-encoder. OOV words might not be well-represented by the embedding model, potentially affecting retrieval accuracy if these words are semantically significant. Investigating whether the bi-encoder uses subword tokenization could provide insights into how it handles OOV words. Further normalization or exploring techniques like stemming (with caution, as aggressive stemming might not align with the embedding model's training) could be considered. In cases where OOV words pose a significant problem, fine-tuning the embedding model on the user's specific vocabulary might be an option, although this is a more advanced step. 49 | Finally, it is paramount to emphasize the importance of experimentation and rigorous evaluation. The optimal preprocessing strategy and chunking parameters are likely to be specific to the user's data and the nature of their queries. Therefore, it is recommended to test different approaches on a representative subset of the Google Docs and Sheets data and to evaluate the performance of the RAG application using relevant metrics. This iterative process of experimentation and evaluation will be key to finding the most effective preprocessing configuration for achieving the desired results. 50 | Conclusion: Towards an Effective Vietnamese RAG Application 51 | In conclusion, developing an effective Vietnamese Retrieval-Augmented Generation (RAG) application using underthesea and the bkai-foundation-models/vietnamese-bi-encoder requires a well-designed and carefully implemented text preprocessing strategy. This strategy must encompass thorough data extraction from Google Docs and Sheets, comprehensive text normalization and accurate word segmentation using the underthesea toolkit, and effective chunking techniques to adhere to the embedding model's 128-token limit. 52 | Throughout the preprocessing pipeline, it is crucial to consider the specific characteristics of the user's data and the requirements of the chosen embedding model. Experimentation and evaluation are essential to optimize the various preprocessing steps and to find the most suitable configuration for the RAG application. By following the proposed strategies and continuously refining them based on empirical results, the user can build a robust and high-performing Vietnamese RAG application capable of leveraging external knowledge for enhanced language model capabilities. 53 | Works cited 54 | 1. Underthesea - Vietnamese NLP Toolkit - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/underthesea 55 | 2. 
undertheseanlp - Hugging Face, accessed March 15, 2025, https://huggingface.co/undertheseanlp 56 | 3. underthesea - PyPI, accessed March 15, 2025, https://pypi.org/project/underthesea/ 57 | 4. Bkai-foundation-models - Find Top AI Models on Hugging Face - AIModels.fyi, accessed March 15, 2025, https://www.aimodels.fyi/creators/huggingFace/bkai-foundation-models 58 | 5. vietnamese-bi-encoder | AI Model Details - AIModels.fyi, accessed March 15, 2025, https://www.aimodels.fyi/models/huggingFace/vietnamese-bi-encoder-bkai-foundation-models 59 | 6. Vietnamese Bi Encoder · Models - Dataloop, accessed March 15, 2025, https://dataloop.ai/library/model/bkai-foundation-models_vietnamese-bi-encoder/ 60 | 7. Extract the text from a document with Docs API - Google for Developers, accessed March 15, 2025, https://developers.google.com/docs/api/samples/extract-text 61 | 8. How to Get Document Texts with the Google Docs API in Python | Endgrate, accessed March 15, 2025, https://endgrate.com/blog/how-to-get-document-texts-with-the-google-docs-api-in-python 62 | 9. Using the Google Docs API to Get Document Texts (with Javascript examples) - Endgrate, accessed March 15, 2025, https://endgrate.com/blog/using-the-google-docs-api-to-get-document-texts-(with-javascript-examples) 63 | 10. Google Docs API samples, accessed March 15, 2025, https://developers.google.com/docs/api/samples 64 | 11. Reading text from Gdoc for feeding to ChatGPT - Help - Pipedream, accessed March 15, 2025, https://pipedream.com/community/t/reading-text-from-gdoc-for-feeding-to-chatgpt/7087 65 | 12. Python quickstart | Google Docs, accessed March 15, 2025, https://developers.google.com/docs/api/quickstart/python 66 | 13. Extracting data from Google Sheets via API - Sharperlight, accessed March 15, 2025, https://www.sharperlight.com/advanced/2022/04/06/accessing-the-google-sheets-api-via-sharperlight-query-builder/ 67 | 14. Extract data from smart chips in your Google Sheets - Google Docs Editors Help, accessed March 15, 2025, https://support.google.com/docs/answer/13524011?hl=en 68 | 15. Basic reading | Google Sheets, accessed March 15, 2025, https://developers.google.com/sheets/api/samples/reading 69 | 16. Read and Write Data in Google Sheets using Python and the Google Sheets API, accessed March 15, 2025, https://aryanirani123.medium.com/read-and-write-data-in-google-sheets-using-python-and-the-google-sheets-api-6e206a242f20 70 | 17. Python quickstart | Google Sheets, accessed March 15, 2025, https://developers.google.com/sheets/api/quickstart/python 71 | 18. Accessing Google Sheet Data with Python: A Practical Guide using the Google Sheets API, accessed March 15, 2025, https://medium.com/@techworldthink/accessing-google-sheet-data-with-python-a-practical-guide-using-the-google-sheets-api-dc57759d387a 72 | 19. Underthesea Vietnamese NLP Toolkit | Restackio, accessed March 15, 2025, https://www.restack.io/p/vietnamese-nlp-tools-answer-underthesea-cat-ai 73 | 20. Vietnamese Text Recognition Dataset | Restackio, accessed March 15, 2025, https://www.restack.io/p/vietnamese-nlp-tools-answer-text-recognition-dataset-cat-ai 74 | 21. Vietnamese Sentiment Analysis - Kaggle, accessed March 15, 2025, https://www.kaggle.com/code/tonibui3107/vietnamese-sentiment-analysis 75 | 22. Nlp Cho Tiếng Việt - Vietnamese Nlp Tools | Restackio, accessed March 15, 2025, https://www.restack.io/p/vietnamese-nlp-tools-answer-nlp-cho-tieng-viet-cat-ai 76 | 23. 
NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py at main - GitHub, accessed March 15, 2025, https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize.py 77 | 24. Chuẩn hóa văn bản - ProtonX, accessed March 15, 2025, https://protonx.io/courses/66487737f91fdc001a81ce3a/topics/665057c88c287e0019bb800b 78 | 25. Underthesea documentation — Under The Sea 1.1.9 documentation, accessed March 15, 2025, https://underthesea.readthedocs.io/ 79 | 26. Underthesea v6.6.0 [Latest Version] - Colab, accessed March 15, 2025, https://colab.research.google.com/drive/1gD8dSMSE_uNacW4qJ-NSnvRT85xo9ZY2 80 | 27. Vietnamese NLP Toolkit — Under The Sea 1.1.9 ... - Underthesea, accessed March 15, 2025, https://underthesea.readthedocs.io/en/latest/readme.html 81 | 28. A Hybrid Approach to Word Segmentation of Vietnamese Texts - ResearchGate, accessed March 15, 2025, https://www.researchgate.net/publication/29616221_A_Hybrid_Approach_to_Word_Segmentation_of_Vietnamese_Texts 82 | 29. Is word segmentation necessary for Vietnamese sentiment classification? - arXiv, accessed March 15, 2025, https://arxiv.org/pdf/2301.00418 83 | 30. NLP-Vietnamese-progress/tasks/word_segmentation.md at master - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/NLP-Vietnamese-progress/blob/master/tasks/word_segmentation.md 84 | 31. A Large-Scale Benchmark for Vietnamese Sentence Paraphrases - arXiv, accessed March 15, 2025, https://arxiv.org/html/2502.07188v1 85 | 32. NLP Benchmarking popular Vietnamese tokenizer - Huy Bik's Blog, accessed March 15, 2025, https://huybik.github.io/Word-Tokenizer-Benchmark/ 86 | 33. undertheseanlp/sent_tokenize: Vietnamese Sentence ... - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/sent_tokenize 87 | 34. sentence-segmentation · GitHub Topics, accessed March 15, 2025, https://github.com/topics/sentence-segmentation?o=desc&s=forks 88 | 35. underthesea/tox.ini at main - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/underthesea/blob/main/tox.ini 89 | 36. AttributeError: module 'pytest' has no attribute 'mark' · Issue #303 · undertheseanlp/underthesea - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/underthesea/issues/303 90 | 37. 7 Chunking Strategies in RAG You Need To Know - F22 Labs, accessed March 15, 2025, https://www.f22labs.com/blogs/7-chunking-strategies-in-rag-you-need-to-know/ 91 | 38. 11 Chunking Strategies for RAG — Simplified & Visualized | by Mastering LLM (Large Language Model), accessed March 15, 2025, https://masteringllm.medium.com/11-chunking-strategies-for-rag-simplified-visualized-df0dbec8e373 92 | 39. A Guide to Chunking Strategies for Retrieval Augmented Generation (RAG) - Sagacify, accessed March 15, 2025, https://www.sagacify.com/news/a-guide-to-chunking-strategies-for-retrieval-augmented-generation-rag 93 | 40. Five Levels of Chunking Strategies in RAG| Notes from Greg's Video | by Anurag Mishra, accessed March 15, 2025, https://medium.com/@anuragmishra_27746/five-levels-of-chunking-strategies-in-rag-notes-from-gregs-video-7b735895694d 94 | 41. Effective Chunking Strategies for RAG - Cohere Documentation, accessed March 15, 2025, https://docs.cohere.com/v2/page/chunking-strategies 95 | 42. Simple Chunking Strategies for RAG Applications (Part 1) | by kirouane Ayoub | Medium, accessed March 15, 2025, https://medium.com/@ayoubkirouane3/simple-chunking-strategies-for-rag-applications-part-1-d56903b167c5 96 | 43. 
Chunking strategies for RAG tutorial using Granite - IBM, accessed March 15, 2025, https://www.ibm.com/think/tutorials/chunking-strategies-for-rag-with-langchain-watsonx-ai 97 | --------------------------------------------------------------------------------