├── app
│   ├── __init__.py
│   ├── api
│   │   ├── __init__.py
│   │   └── routes.py
│   ├── models
│   │   ├── __init__.py
│   │   └── schemas.py
│   ├── services
│   │   ├── __init__.py
│   │   ├── embeddings.py
│   │   └── preprocessor.py
│   ├── config.py
│   └── main.py
├── tests
│   ├── __init__.py
│   ├── test_preprocessor.py
│   ├── test_embeddings.py
│   └── test_vietnamese_processing.py
├── requirements.txt
├── .env.example
├── docker-compose.yml
├── .dockerignore
├── .gitignore
├── Dockerfile
├── DOCKER.md
├── README.md
├── plan.md
├── run_test.py
└── docs
    └── vietnamese-rag-research.md
/app/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/tests/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/app/api/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/app/models/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/app/services/__init__.py:
--------------------------------------------------------------------------------
1 | 
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | fastapi>=0.68.0
2 | uvicorn>=0.15.0
3 | underthesea>=1.3.0
4 | sentence-transformers>=2.2.0
5 | numpy>=1.21.0
6 | pydantic>=1.8.0
7 | python-dotenv>=0.19.0
--------------------------------------------------------------------------------
/.env.example:
--------------------------------------------------------------------------------
1 | # Application settings
2 | DEBUG=False
3 | 
4 | # Server settings
5 | HOST=0.0.0.0
6 | PORT=8000
7 | 
8 | # Model settings
9 | EMBEDDING_MODEL=bkai-foundation-models/vietnamese-bi-encoder
10 | MAX_TOKEN_LIMIT=128
11 | DEFAULT_CHUNK_SIZE=110
12 | DEFAULT_CHUNK_OVERLAP=20
13 | DEFAULT_TOP_K=5
14 | 
15 | # Cache settings
16 | ENABLE_CACHE=True
17 | CACHE_SIZE=1000
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | version: '3'
2 | 
3 | services:
4 |   app:
5 |     build:
6 |       context: .
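      # PORT is read from .env (copy .env.example) and defaults to 8000. It is
      # passed twice on purpose: as a build arg below (consumed by `ARG PORT`
      # in the Dockerfile, so EXPOSE matches) and as a runtime environment
      # variable, so the exposed, published, and uvicorn ports all agree.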
7 | args: 8 | - PORT=${PORT:-8000} 9 | container_name: vietnamese-rag-app 10 | ports: 11 | - "${PORT:-8000}:${PORT:-8000}" 12 | environment: 13 | - PORT=${PORT:-8000} 14 | volumes: 15 | - huggingface_cache:/app/.cache/huggingface 16 | restart: unless-stopped 17 | 18 | volumes: 19 | huggingface_cache: 20 | -------------------------------------------------------------------------------- /.dockerignore: -------------------------------------------------------------------------------- 1 | # Git 2 | .git 3 | .gitignore 4 | .github 5 | 6 | # Python 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | *.so 11 | .Python 12 | env/ 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | .pytest_cache/ 28 | .coverage 29 | htmlcov/ 30 | .tox/ 31 | .nox/ 32 | .hypothesis/ 33 | .coverage.* 34 | 35 | # Virtual Environment 36 | venv/ 37 | .venv/ 38 | ENV/ 39 | env/ 40 | 41 | # IDE 42 | .idea/ 43 | .vscode/ 44 | *.swp 45 | *.swo 46 | .DS_Store 47 | 48 | # Docker 49 | .dockerignore 50 | Dockerfile 51 | docker-compose.yml 52 | 53 | # Project specific 54 | .cache/ 55 | models/ 56 | *.log 57 | tasks/ 58 | scripts/ 59 | .env.example 60 | README.md 61 | LICENSE 62 | -------------------------------------------------------------------------------- /app/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | # Load environment variables from .env file 5 | load_dotenv(override=True) 6 | 7 | # Application settings 8 | APP_NAME = "Vietnamese RAG" 9 | DEBUG = os.getenv("DEBUG", "False").lower() in ("true", "1", "t") 10 | API_PREFIX = "/api" 11 | 12 | # Model settings 13 | EMBEDDING_MODEL = "bkai-foundation-models/vietnamese-bi-encoder" 14 | MAX_TOKEN_LIMIT = 128 15 | DEFAULT_CHUNK_SIZE = 110 # Safe margin below MAX_TOKEN_LIMIT 16 | DEFAULT_CHUNK_OVERLAP = 20 17 | DEFAULT_TOP_K = 5 18 | 19 | # Server settings 20 | HOST = os.getenv("HOST", "0.0.0.0") 21 | PORT = int(os.getenv("PORT", "8000")) 22 | 23 | # Cache settings 24 | ENABLE_CACHE = os.getenv("ENABLE_CACHE", "True").lower() in ("true", "1", "t") 25 | CACHE_SIZE = int(os.getenv("CACHE_SIZE", "1000")) -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Python 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | *.so 6 | .Python 7 | env/ 8 | build/ 9 | develop-eggs/ 10 | dist/ 11 | downloads/ 12 | eggs/ 13 | .eggs/ 14 | lib/ 15 | lib64/ 16 | parts/ 17 | sdist/ 18 | var/ 19 | *.egg-info/ 20 | .installed.cfg 21 | *.egg 22 | 23 | # Virtual Environment 24 | venv/ 25 | ENV/ 26 | .env 27 | 28 | # IDE 29 | .idea/ 30 | .vscode/ 31 | *.swp 32 | *.swo 33 | 34 | # Logs 35 | logs/ 36 | *.log 37 | 38 | # OS 39 | .DS_Store 40 | .DS_Store? 41 | ._* 42 | .Spotlight-V100 43 | .Trashes 44 | ehthumbs.db 45 | Thumbs.db 46 | 47 | # Added by Claude Task Master 48 | logs 49 | npm-debug.log* 50 | yarn-debug.log* 51 | yarn-error.log* 52 | dev-debug.log 53 | # Dependency directories 54 | node_modules/ 55 | # Environment variables 56 | # Editor directories and files 57 | .idea 58 | .vscode 59 | *.suo 60 | *.ntvs* 61 | *.njsproj 62 | *.sln 63 | *.sw? 
64 | # OS specific
65 | # Task files
66 | tasks.json
67 | tasks/
68 | .cursor/*
69 | .roo/*
70 | .winds*
71 | .roo*
72 | .task*
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | FROM python:3.11-slim
2 | 
3 | # Set environment variables
4 | ENV PYTHONDONTWRITEBYTECODE=1 \
5 |     PYTHONUNBUFFERED=1 \
6 |     LANG=C.UTF-8 \
7 |     LC_ALL=C.UTF-8 \
8 |     TZ=Asia/Ho_Chi_Minh \
9 |     HF_HOME=/app/.cache/huggingface
10 | 
11 | # Set working directory
12 | WORKDIR /app
13 | 
14 | # Install system dependencies
15 | RUN apt-get update && apt-get install -y --no-install-recommends \
16 |     build-essential \
17 |     git \
18 |     && apt-get clean \
19 |     && rm -rf /var/lib/apt/lists/*
20 | 
21 | # Copy requirements file
22 | COPY requirements.txt .
23 | 
24 | # Install Python dependencies
25 | RUN pip install --no-cache-dir --upgrade pip && \
26 |     pip install --no-cache-dir -r requirements.txt
27 | 
28 | # Copy application code
29 | COPY . .
30 | 
31 | # Create model cache directory
32 | RUN mkdir -p /app/.cache/huggingface
33 | 
34 | # Expose the configured port (build arg PORT, defaulting to 8000)
35 | ARG PORT=8000
36 | ENV PORT=${PORT}
37 | 
38 | EXPOSE ${PORT}
39 | 
40 | # Command to run the application
41 | CMD ["sh", "-c", "uvicorn app.main:app --host 0.0.0.0 --port ${PORT}"]
42 | 
--------------------------------------------------------------------------------
/DOCKER.md:
--------------------------------------------------------------------------------
1 | # Docker Implementation for Vietnamese RAG Application
2 | 
3 | This document provides instructions for building, running, and configuring the Vietnamese RAG application using Docker.
4 | 
5 | ## Prerequisites
6 | 
7 | - [Docker](https://docs.docker.com/get-docker/)
8 | - [Docker Compose](https://docs.docker.com/compose/install/)
9 | 
10 | ## Quick Start
11 | 
12 | 1. Clone the repository:
13 |    ```bash
14 |    git clone https://github.com/yourusername/n8n-rag-vn.git
15 |    cd n8n-rag-vn
16 |    ```
17 | 
18 | 2. Create a `.env` file with your configuration (or copy from `.env.example`):
19 |    ```bash
20 |    cp .env.example .env
21 |    ```
22 | 
23 | 3. Customize the port in the `.env` file if needed:
24 |    ```
25 |    PORT=24600  # Change this to your desired port
26 |    ```
27 | 
28 | 4. Build and start the application using Docker Compose:
29 |    ```bash
30 |    docker compose up -d
31 |    ```
32 | 
33 | 5. Access the API at http://localhost:${PORT}/docs (where ${PORT} is the port specified in your .env file, default is 8000)
34 | 
35 | ## Configuration
36 | 
37 | ### Environment Variables
38 | 
39 | The Docker container can be configured using the `PORT` environment variable in the `.env` file.
40 | 
41 | ### Volumes
42 | 
43 | The Docker container uses a volume to persist the downloaded models:
44 | 
45 | - `huggingface_cache`: Persistent storage for downloaded models
46 | 
47 | ## Troubleshooting
48 | 
49 | If you encounter issues with the Docker implementation, try the following:
50 | 
51 | 1. Check the container logs:
52 |    ```bash
53 |    docker logs vietnamese-rag-app
54 |    ```
55 | 
56 | 2. Ensure the PORT environment variable is correctly set in your `.env` file.
57 | 
58 | 3. If the container fails to start, try rebuilding the image:
59 |    ```bash
60 |    docker compose build --no-cache
61 |    docker compose up -d
62 |    ```
63 | 
64 | 
65 | 
--------------------------------------------------------------------------------
/app/main.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from fastapi import FastAPI, Request
3 | from fastapi.middleware.cors import CORSMiddleware
4 | from fastapi.responses import JSONResponse
5 | import time
6 | 
7 | from app.api.routes import router as api_router
8 | from app.config import APP_NAME, API_PREFIX, DEBUG
9 | 
10 | # Configure logging
11 | logging.basicConfig(
12 |     level=logging.DEBUG if DEBUG else logging.INFO,
13 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
14 | )
15 | logger = logging.getLogger(__name__)
16 | 
17 | # Create FastAPI app
18 | app = FastAPI(
19 |     title=APP_NAME,
20 |     description="API for Vietnamese RAG (Retrieval Augmented Generation)",
21 |     version="1.0.0",
22 |     debug=DEBUG,
23 | )
24 | 
25 | # Add CORS middleware
26 | app.add_middleware(
27 |     CORSMiddleware,
28 |     allow_origins=["*"],  # In production, restrict this to specific origins
29 |     allow_credentials=True,
30 |     allow_methods=["*"],
31 |     allow_headers=["*"],
32 | )
33 | 
34 | # Add API router
35 | app.include_router(api_router, prefix=API_PREFIX)
36 | 
37 | 
38 | # Add middleware for request timing
39 | @app.middleware("http")
40 | async def add_process_time_header(request: Request, call_next):
41 |     start_time = time.time()
42 |     response = await call_next(request)
43 |     process_time = time.time() - start_time
44 |     response.headers["X-Process-Time"] = str(process_time)
45 |     return response
46 | 
47 | 
48 | # Root endpoint
49 | @app.get("/")
50 | async def root():
51 |     return {
52 |         "app": APP_NAME,
53 |         "version": "1.0.0",
54 |         "status": "active",
55 |         "api_docs": "/docs",
56 |     }
57 | 
58 | 
59 | # Health check endpoint
60 | @app.get("/health")
61 | async def health_check():
62 |     return {"status": "healthy"}
63 | 
64 | 
65 | # Global exception handler
66 | @app.exception_handler(Exception)
67 | async def global_exception_handler(request: Request, exc: Exception):
68 |     logger.error(f"Unhandled exception: {str(exc)}", exc_info=True)
69 |     return JSONResponse(
70 |         status_code=500,
71 |         content={"detail": "An unexpected error occurred, please try again later"},
72 |     )
73 | 
74 | 
75 | if __name__ == "__main__":
76 |     import uvicorn
77 |     from app.config import HOST, PORT
78 | 
79 |     logger.info(f"Starting {APP_NAME} server on {HOST}:{PORT}")
80 |     uvicorn.run("app.main:app", host=HOST, port=PORT, reload=DEBUG)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Vietnamese RAG Implementation
2 | 
3 | A Retrieval Augmented Generation (RAG) system with text processing and embeddings specialized for the Vietnamese language.
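
For a quick feel of the API once the server is running (see Setup below), here is a minimal session. The request fields mirror `ProcessingRequest` and `QueryRequest` in `app/models/schemas.py`; the port assumes the default configuration:

```bash
# Chunk and embed a document (the response is one embedding per chunk)
curl -X POST http://localhost:8000/api/process \
  -H "Content-Type: application/json" \
  -d '{"text": "Xử lý ngôn ngữ tự nhiên cho tiếng Việt.", "chunk_size": 110, "chunk_overlap": 20}'

# Find the stored chunks most similar to a query
curl -X POST http://localhost:8000/api/query \
  -H "Content-Type: application/json" \
  -d '{"query_text": "xử lý ngôn ngữ", "top_k": 5}'
```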
4 | 
5 | ## Features
6 | 
7 | - Text normalization using `underthesea`
8 | - Sentence segmentation
9 | - Word segmentation with domain-specific fixed words (optional)
10 | - Smart chunking strategy with configurable chunk size and overlap (default: 110 tokens with 20 token overlap)
11 | - Embedding generation using `bkai-foundation-models/vietnamese-bi-encoder`
12 | - API for processing documents and querying similar chunks
13 | - Caching for embeddings (optional, enabled by default)
14 | - Input validation to ensure chunk size and overlap constraints
15 | 
16 | ## Setup
17 | 
18 | 1. Clone the repository
19 | 2. Install dependencies:
20 |    ```
21 |    pip install -r requirements.txt
22 |    ```
23 | 3. Run the application:
24 |    ```
25 |    uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
26 |    ```
27 |    (Note: `--host` and `--port` are optional; without them the uvicorn CLI defaults to `127.0.0.1:8000`. The `HOST` and `PORT` values in `app/config.py` are used when the app is started with `python -m app.main`.)
28 | 
29 | ## API Endpoints
30 | 
31 | - `POST /api/process`: Process text documents into chunks and embeddings. Takes a `ProcessingRequest` as input, allowing specification of `chunk_size` and `chunk_overlap`. Returns a list of `EmbeddingResponse`.
32 | - `POST /api/query`: Find similar chunks for a given query text. Takes a `QueryRequest` and returns a `QueryResponse`.
33 | - `POST /api/normalize`: Normalize the text, then segment it into sentences and words. Takes a `ProcessingRequest` and returns a `NormalizationResponse`.
34 | - `GET /api/status`: Get server status.
35 | - `GET /health`: Health check endpoint.
36 | - `GET /`: Root endpoint with basic application information.
37 | 
38 | ## Configuration
39 | 
40 | Configuration options are managed in `app/config.py`. `DEBUG`, `HOST`, `PORT`, `ENABLE_CACHE`, and `CACHE_SIZE` can be overridden via environment variables; the model and chunking settings are currently hardcoded in `app/config.py`, so the matching entries in `.env.example` are not read yet:
41 | 
42 | - `DEBUG`: Enable debug mode (default: `False`)
43 | - `EMBEDDING_MODEL`: The SentenceTransformer model to use (default: `bkai-foundation-models/vietnamese-bi-encoder`)
44 | - `MAX_TOKEN_LIMIT`: Maximum number of tokens per chunk (default: 128)
45 | - `DEFAULT_CHUNK_SIZE`: Default chunk size in tokens (default: 110)
46 | - `DEFAULT_CHUNK_OVERLAP`: Default chunk overlap in tokens (default: 20)
47 | - `DEFAULT_TOP_K`: Default number of top matches to return for a query (default: 5)
48 | - `ENABLE_CACHE`: Enable embedding caching (default: `True`)
49 | - `CACHE_SIZE`: Maximum size of the embedding cache (default: 1000)
50 | - `HOST`: Host address (default: `0.0.0.0`)
51 | - `PORT`: Port number (default: 8000)
52 | 
53 | ## License
54 | 
55 | [MIT License](LICENSE)
--------------------------------------------------------------------------------
/tests/test_preprocessor.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import unittest
4 | 
5 | # Add the project root to the Python path
6 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
7 | 
8 | from app.services.preprocessor import TextPreprocessor
9 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, MAX_TOKEN_LIMIT
10 | 
11 | 
12 | class TestTextPreprocessor(unittest.TestCase):
13 |     @classmethod
14 |     def setUpClass(cls):
15 |         cls.preprocessor = TextPreprocessor()
16 |         cls.sample_text = (
17 |             "Trí tuệ nhân tạo (AI) đang cách mạng hóa ngành công nghệ. "
18 |             "COVID-19 đã thúc đẩy quá trình chuyển đổi số. "
19 |             "Các mô hình NLP như BERT và GPT-3 đạt được nhiều tiến bộ đáng kể."
20 |         )
21 | 
22 |     def test_normalization(self):
23 |         normalized = self.preprocessor.normalize_text(" Đây là ví dụ ")
24 |         print("test_normalization", normalized)
25 |         self.assertEqual(normalized, "Đây là ví dụ")
26 | 
27 |     def test_word_segmentation(self):
28 |         tokens = self.preprocessor.segment_words(self.sample_text)
29 |         print("test_word_segmentation", tokens)
30 |         self.assertIn("AI", tokens)
31 |         self.assertIn("COVID-19", tokens)
32 |         self.assertIn("NLP", tokens)
33 | 
34 |     def test_token_counting(self):
35 |         count = self.preprocessor.count_tokens(self.sample_text)
36 |         print("test_token_counting", count)
37 |         self.assertGreater(count, 15)
38 |         self.assertLess(count, 50)
39 | 
40 |     def test_chunk_creation(self):
41 |         chunks = self.preprocessor.process_text(self.sample_text)
42 |         print("test_chunk_creation", chunks)
43 |         self.assertGreater(len(chunks), 0)
44 | 
45 |         # Test chunk sizes
46 |         for chunk in chunks:
47 |             self.assertLessEqual(
48 |                 chunk["metadata"]["token_count"],
49 |                 MAX_TOKEN_LIMIT,
50 |                 "Chunk exceeds maximum token limit"
51 |             )
52 | 
53 |     def test_overlap_handling(self):
54 |         chunks = self.preprocessor.process_text(
55 |             self.sample_text,
56 |             chunk_size=15,
57 |             chunk_overlap=3
58 |         )
59 |         print("test_overlap_handling", chunks)
60 |         # Verify overlap between consecutive chunks
61 |         for i in range(1, len(chunks)):
62 |             prev_words = set(self.preprocessor.segment_words(chunks[i-1]["text"]))
63 |             current_words = set(self.preprocessor.segment_words(chunks[i]["text"]))
64 |             overlap = prev_words & current_words
65 |             self.assertGreaterEqual(len(overlap), 2)
66 | 
67 | 
68 | if __name__ == "__main__":
69 |     unittest.main()
--------------------------------------------------------------------------------
/tests/test_embeddings.py:
--------------------------------------------------------------------------------
1 | import sys
2 | import os
3 | import unittest
4 | import numpy as np
5 | sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
6 | 
7 | from app.services.embeddings import EmbeddingService
8 | from app.services.preprocessor import TextPreprocessor
9 | from app.config import EMBEDDING_MODEL, MAX_TOKEN_LIMIT
10 | 
11 | class TestEmbeddingService(unittest.TestCase):
12 |     @classmethod
13 |     def setUpClass(cls):
14 |         cls.preprocessor = TextPreprocessor()
15 |         cls.embedding_service = EmbeddingService(preprocessor=cls.preprocessor)
16 | 
17 |         # Process sample text through preprocessor
18 |         sample_text = (
19 |             "Transformer là kiến trúc mạng neural tiên tiến. "
20 |             "BERT và GPT sử dụng kiến trúc này để xử lý ngôn ngữ tự nhiên. "
21 |             "Các mô hình AI hiện đại đạt được kết quả ấn tượng trong NLP."
22 | ) 23 | cls.chunks = cls.preprocessor.process_text(sample_text) 24 | 25 | def test_embedding_generation(self): 26 | # Test single embedding 27 | embedding = self.embedding_service.get_embedding("ví dụ về embedding") 28 | print("test_embedding_generation", embedding) 29 | print("test_embedding_generation length", len(embedding)) 30 | print("test_embedding_generation model_dim", self.embedding_service.model_dim) 31 | self.assertEqual(len(embedding), self.embedding_service.model_dim) 32 | 33 | # Test batch embeddings 34 | embeddings = self.embedding_service.get_embeddings_batch(["text 1", "text 2"]) 35 | self.assertEqual(len(embeddings), 2) 36 | 37 | def test_chunk_embedding(self): 38 | embedded_chunks = self.embedding_service.embed_chunks(self.chunks) 39 | print("test_chunk_embedding", embedded_chunks) 40 | self.assertEqual(len(embedded_chunks), len(self.chunks)) 41 | 42 | for chunk in embedded_chunks: 43 | self.assertIn("embedding", chunk) 44 | self.assertEqual(len(chunk["embedding"]), self.embedding_service.model_dim) 45 | 46 | def test_token_limit_enforcement(self): 47 | # Create long text that exceeds token limit 48 | long_text = " ".join(["token"] * (MAX_TOKEN_LIMIT + 10)) 49 | 50 | with self.assertRaises(ValueError): 51 | self.embedding_service.get_embedding(long_text) 52 | 53 | with self.assertRaises(ValueError): 54 | self.embedding_service.get_embeddings_batch([long_text]) 55 | 56 | def test_similarity_search(self): 57 | # Embed sample chunks 58 | embedded_chunks = self.embedding_service.embed_chunks(self.chunks) 59 | 60 | # Perform similarity search 61 | query = "kiến trúc transformer" 62 | results = self.embedding_service.similarity_search( 63 | query=query, 64 | embeddings=[c["embedding"] for c in embedded_chunks], 65 | texts=[c["text"] for c in embedded_chunks], 66 | metadata=[c["metadata"] for c in embedded_chunks] 67 | ) 68 | print("test_similarity_search", results) 69 | self.assertGreater(len(results), 0) 70 | self.assertIn("transformer", results[0]["text"].lower()) 71 | 72 | if __name__ == "__main__": 73 | unittest.main() -------------------------------------------------------------------------------- /app/models/schemas.py: -------------------------------------------------------------------------------- 1 | from typing import List, Optional, Dict, Any 2 | from pydantic import BaseModel, Field, validator 3 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, DEFAULT_TOP_K, MAX_TOKEN_LIMIT 4 | import uuid 5 | 6 | 7 | class ProcessingRequest(BaseModel): 8 | """Request model for processing text into chunks and embeddings.""" 9 | text: str = Field(..., description="The text to be processed") 10 | chunk_size: Optional[int] = Field(DEFAULT_CHUNK_SIZE, description="Target size of each chunk in tokens") 11 | chunk_overlap: Optional[int] = Field(DEFAULT_CHUNK_OVERLAP, description="Number of tokens to overlap between chunks") 12 | file_id: Optional[str] = Field(None, description="ID of the file being processed") 13 | file_title: Optional[str] = Field(None, description="Title of the file being processed") 14 | 15 | @validator('chunk_size') 16 | def validate_chunk_size(cls, v): 17 | if v > MAX_TOKEN_LIMIT: 18 | raise ValueError(f"chunk_size cannot exceed {MAX_TOKEN_LIMIT} tokens") 19 | if v <= 0: 20 | raise ValueError("chunk_size must be positive") 21 | return v 22 | 23 | @validator('chunk_overlap') 24 | def validate_chunk_overlap(cls, v, values): 25 | if 'chunk_size' in values and v >= values['chunk_size']: 26 | raise ValueError("chunk_overlap must be less than 
chunk_size") 27 | if v < 0: 28 | raise ValueError("chunk_overlap cannot be negative") 29 | return v 30 | 31 | 32 | class TextChunk(BaseModel): 33 | """Model representing a chunk of text with its embedding.""" 34 | text: str 35 | embedding: List[float] = None 36 | metadata: Dict[str, Any] = Field(default_factory=dict) 37 | 38 | 39 | class ChunkData(BaseModel): 40 | """Model representing a processed chunk with its metadata.""" 41 | id: str = Field(default_factory=lambda: str(uuid.uuid4())) 42 | content: str 43 | embedding: List[float] 44 | metadata: Dict[str, Any] = Field(default_factory=dict) 45 | 46 | 47 | class ProcessingResponse(BaseModel): 48 | """Response model for processed text chunks and embeddings.""" 49 | chunks: List[ChunkData] 50 | total_chunks: int 51 | 52 | class EmbeddingResponse(BaseModel): 53 | """Response model for embedding text.""" 54 | embedding: List[float] 55 | 56 | 57 | class QueryRequest(BaseModel): 58 | """Request model for querying similar chunks.""" 59 | query_text: str = Field(..., description="The query text to find similar chunks for") 60 | top_k: Optional[int] = Field(DEFAULT_TOP_K, description="Number of top matches to return") 61 | 62 | @validator('top_k') 63 | def validate_top_k(cls, v): 64 | if v <= 0: 65 | raise ValueError("top_k must be positive") 66 | return v 67 | 68 | 69 | class QueryMatch(BaseModel): 70 | """Model representing a matched chunk with similarity score.""" 71 | id: str = Field(default_factory=lambda: str(uuid.uuid4())) 72 | text: str 73 | score: float 74 | metadata: Dict[str, Any] = Field(default_factory=dict) 75 | 76 | 77 | class QueryResponse(BaseModel): 78 | """Response model for query results.""" 79 | matches: List[QueryMatch] 80 | total_matches: int 81 | 82 | 83 | class NormalizationResponse(BaseModel): 84 | # normalized_text: str 85 | # sentences: List[str] 86 | segmented_sentences: List[List[str]] 87 | 88 | class BaseResponse(BaseModel): 89 | status: str 90 | message: str 91 | -------------------------------------------------------------------------------- /plan.md: -------------------------------------------------------------------------------- 1 | I'll update the plan.md with our refined understanding of token limits and chunking strategy, without including the code implementations: 2 | 3 | ```markdown 4 | # Vietnamese RAG Implementation Plan 5 | 6 | ## 1. Project Structure (unchanged) 7 | ``` 8 | vietnamese-rag/ 9 | ├── app/ 10 | │ ├── __init__.py 11 | │ ├── main.py # FastAPI application entry point 12 | │ ├── config.py # Configuration settings 13 | │ ├── models/ 14 | │ │ ├── __init__.py 15 | │ │ └── schemas.py # Pydantic models 16 | │ ├── services/ 17 | │ │ ├── __init__.py 18 | │ │ ├── preprocessor.py # Text preprocessing pipeline 19 | │ │ └── embeddings.py # Embedding generation 20 | │ └── api/ 21 | │ ├── __init__.py 22 | │ └── routes.py # API endpoints 23 | ├── tests/ 24 | │ └── __init__.py 25 | ├── requirements.txt 26 | └── README.md 27 | ``` 28 | 29 | ## 2. Core Components: 30 | 31 | ### A. Data Models 32 | - ProcessingRequest with configurable chunk parameters 33 | - chunk_size: default 110 tokens (safety margin below 128) 34 | - chunk_overlap: fixed at 20 tokens 35 | - ProcessingResponse with chunks and embeddings 36 | - QueryRequest with configurable top_k 37 | - QueryResponse with matches and scores 38 | 39 | ### B. 
40 | Key Features:
41 | - Text normalization using underthesea
42 | - Sentence segmentation
43 | - Word segmentation with domain-specific fixed words
44 | - Chunking strategy:
45 |   - Target chunk size: 110 tokens (safety margin below 128)
46 |   - Fixed overlap: 20 tokens
47 |   - Special handling for long sentences
48 |   - Validation to ensure chunks never exceed 128 tokens
49 | 
50 | ### C. Embedding Service using sentence-transformers
51 | Key Features:
52 | - Uses bkai-foundation-models/vietnamese-bi-encoder
53 | - Input validation for 128 token limit
54 | - Batch processing for efficiency
55 | - Caching for frequent queries
56 | - CPU optimization for VPS deployment
57 | 
58 | ## 3. Implementation Requirements:
59 | ```
60 | fastapi>=0.68.0
61 | uvicorn>=0.15.0
62 | underthesea>=1.3.0
63 | sentence-transformers>=2.2.0
64 | numpy>=1.21.0
65 | pydantic>=1.8.0
66 | python-dotenv>=0.19.0
67 | ```
68 | 
69 | ## 4. Implementation Steps:
70 | 
71 | 1. **Setup and Configuration**:
72 |    - Virtual environment setup
73 |    - Dependency installation
74 |    - Configuration management
75 |    - Logging setup
76 | 
77 | 2. **Preprocessing Pipeline Implementation**:
78 |    - Text normalization with underthesea
79 |    - Word segmentation with fixed_words list
80 |    - Implement chunking strategy:
81 |      - Maintain 110 token chunk size
82 |      - Ensure 20 token overlap
83 |      - Validate against 128 token limit
84 |    - Add error handling and validation
85 | 
86 | 3. **Embedding Service Implementation**:
87 |    - Setup sentence-transformers
88 |    - Implement token limit validation
89 |    - Add caching system
90 |    - Optimize batch processing
91 |    - Add error handling
92 | 
93 | 4. **API Development**:
94 |    - FastAPI endpoints implementation
95 |    - Request/response validation
96 |    - Error handling
97 |    - API documentation
98 |    - Rate limiting for VPS
99 | 
100 | 5. **Testing and Optimization**:
101 |    - Unit tests for components
102 |    - Integration tests
103 |    - Token limit validation tests
104 |    - Chunking strategy tests
105 |    - Performance testing
106 |    - Memory optimization
107 | 
108 | ## 5. VPS Deployment Considerations:
109 | 
110 | 1. **Resource Management**:
111 |    - Memory monitoring for embedding model
112 |    - Batch size optimization
113 |    - Request queuing system
114 |    - Comprehensive logging
115 | 
116 | 2. **Performance Optimization**:
117 |    - Embedding cache implementation
118 |    - Chunk size optimization
119 |    - Resource cleanup
120 |    - Health check system
121 | 
122 | 3. **Scaling Strategy**:
123 |    - Horizontal scaling plan
124 |    - Load balancing setup
125 |    - Monitoring system
126 |    - Backup procedures
127 | 
128 | ## 6. Token Management Strategy:
129 | 
130 | 1. **Chunking Rules**:
131 |    - Maximum chunk size: 110 tokens (safety margin)
132 |    - Fixed overlap: 20 tokens
133 |    - Never exceed 128 token limit
134 | 
135 | 2. **Validation Layers**:
136 |    - Preprocessor validation
137 |    - Embedding service validation
138 |    - API request validation
139 | 
140 | 3. **Error Handling**:
141 |    - Token limit exceeded errors
142 |    - Invalid input handling
143 |    - Chunk size violations
144 | 
145 | 4. **Monitoring**:
146 |    - Token usage tracking
147 |    - Chunk size distribution
148 |    - Embedding generation times
149 |    - Cache hit rates
150 | 
151 | 
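As a sanity check on the chunking rules above, here is a minimal sketch of the stride arithmetic. It is illustrative only: the real `process_text` in `app/services/preprocessor.py` also respects sentence boundaries, so actual chunk counts vary.

```python
def approx_chunk_count(n_tokens: int, chunk_size: int = 110, chunk_overlap: int = 20) -> int:
    """Rough number of chunks for a text of n_tokens segmented tokens."""
    if n_tokens <= chunk_size:
        return 1
    stride = chunk_size - chunk_overlap  # each extra chunk adds 90 new tokens
    return 1 + -(-(n_tokens - chunk_size) // stride)  # ceiling division

# e.g. a 300-token document: 1 + ceil((300 - 110) / 90) = 4 chunks
print(approx_chunk_count(300))
```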
152 | 
--------------------------------------------------------------------------------
/tests/test_vietnamese_processing.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from app.services.preprocessor import TextPreprocessor
3 | from app.services.embeddings import EmbeddingService
4 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP
5 | 
6 | 
7 | class TestVietnameseProcessing(unittest.TestCase):
8 |     @classmethod
9 |     def setUpClass(cls):
10 |         # Initialize preprocessor and embedding service once for all tests
11 |         cls.preprocessor = TextPreprocessor()
12 |         cls.embedding_service = EmbeddingService(preprocessor=cls.preprocessor)
13 | 
14 |         # Sample Vietnamese text
15 |         cls.sample_text = """
16 |         Xử lý ngôn ngữ tự nhiên (NLP) là một lĩnh vực nghiên cứu quan trọng trong trí tuệ nhân tạo.
17 |         Đối với tiếng Việt, việc xử lý ngôn ngữ tự nhiên có những thách thức riêng do đặc điểm ngôn ngữ.
18 |         Tiếng Việt là một ngôn ngữ đơn lập, có thanh điệu, và một từ có thể bao gồm nhiều âm tiết.
19 |         Phân đoạn từ trong tiếng Việt là một bước quan trọng để xử lý văn bản tiếng Việt chính xác.
20 | 
21 |         Hệ thống RAG (Retrieval Augmented Generation) cho tiếng Việt cần phải giải quyết các vấn đề như:
22 |         1. Chuẩn hóa văn bản tiếng Việt
23 |         2. Phân đoạn câu và từ chính xác
24 |         3. Xử lý các từ đặc biệt như COVID-19, AI, NLP
25 |         4. Tính toán độ tương đồng ngữ nghĩa giữa các đoạn văn bản
26 | 
27 |         Việc chia nhỏ văn bản thành các đoạn có kích thước phù hợp (chunking) cũng đòi hỏi phải
28 |         hiểu cấu trúc ngữ pháp tiếng Việt để không chia cắt thông tin ở những vị trí không phù hợp.
29 |         """
30 | 
31 |     def test_token_counting(self):
32 |         """Test that token counting works correctly for Vietnamese text."""
33 |         # Check token count for the sample text
34 |         token_count_preprocessor = self.preprocessor.count_tokens(self.sample_text)
35 |         token_count_embedding = self.preprocessor.count_tokens(self.sample_text)  # Now using the same method
36 | 
37 |         # Both methods should give the same result
38 |         self.assertEqual(token_count_preprocessor, token_count_embedding)
39 | 
40 |         # Check that multi-syllable Vietnamese words are counted correctly
41 |         text = "xử lý ngôn ngữ tự nhiên"
42 |         segmented_tokens = self.preprocessor.segment_words(text)
43 |         token_count = len(segmented_tokens)
44 |         # In Vietnamese "xử lý", "ngôn ngữ", "tự nhiên" should be 3 tokens
45 |         self.assertEqual(token_count, 3)  # Should count as 3 tokens, not 6
46 | 
47 |         # Check with fixed words
48 |         text = "COVID-19 và AI trong NLP"
49 |         segmented_tokens = self.preprocessor.segment_words(text)
50 |         token_count = len(segmented_tokens)
51 |         self.assertEqual(token_count, 5)  # COVID-19, và, AI, trong, NLP
52 | 
53 |     def test_chunking_with_default_parameters(self):
54 |         """Test chunking with default parameters (110 tokens, 20 token overlap)."""
55 |         chunks = self.preprocessor.process_text(
56 |             text=self.sample_text,
57 |             chunk_size=DEFAULT_CHUNK_SIZE,
58 |             chunk_overlap=DEFAULT_CHUNK_OVERLAP
59 |         )
60 | 
61 |         # Verify we got some chunks
62 |         self.assertGreater(len(chunks), 0)
63 | 
64 |         # Verify chunk sizes don't exceed the limit
65 |         for chunk in chunks:
66 |             token_count = chunk["metadata"]["token_count"]
67 |             self.assertLessEqual(token_count, DEFAULT_CHUNK_SIZE)
68 | 
69 |         # Check for overlap between consecutive chunks
70 |         if len(chunks) > 1:
71 |             first_chunk_text = chunks[0]["text"]
72 |             second_chunk_text = chunks[1]["text"]
73 | 
74 |             # There should be some overlap between chunks
75 |             # Extract the last few words from first chunk
76 |             first_chunk_words = set(first_chunk_text.split())
77 |             next_words = set(second_chunk_text.split())
78 |             overlap_words = first_chunk_words.intersection(next_words)
79 | 
80 |             # Should find some overlapping words
81 |             self.assertTrue(
82 |                 len(overlap_words) > 0,
83 |                 "No overlap found between consecutive chunks"
84 |             )
85 | 
86 |     def test_fixed_words_preservation(self):
87 |         """Test that domain-specific fixed words are preserved during segmentation."""
88 |         # Create text with fixed words
89 |         text = "COVID-19 đang là vấn đề toàn cầu, AI và NLP giúp nghiên cứu nhanh hơn."
90 | 
91 |         # Process the text
92 |         segmented_tokens = self.preprocessor.segment_words(text)
93 | 
94 |         # Fixed words should be preserved as is (check they exist in the token list)
95 |         self.assertTrue("COVID-19" in segmented_tokens)
96 |         self.assertTrue("AI" in segmented_tokens)
97 |         self.assertTrue("NLP" in segmented_tokens)
98 | 
99 |     def test_embedding_generation(self):
100 |         """Test embedding generation for Vietnamese text."""
101 |         # Create a small chunk of Vietnamese text
102 |         text = "Xử lý ngôn ngữ tự nhiên cho tiếng Việt."
103 | 
104 |         # Generate embedding
105 |         embedding = self.embedding_service.get_embedding(text)
106 | 
107 |         # Check embedding dimensions
108 |         self.assertEqual(len(embedding), self.embedding_service.model_dim)
109 | 
110 |         # Check that all values are floats
111 |         self.assertTrue(all(isinstance(value, float) for value in embedding))
112 | 
113 | 
114 | if __name__ == "__main__":
115 |     unittest.main()
--------------------------------------------------------------------------------
/run_test.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | import sys
3 | import json
4 | import logging
5 | from app.services.preprocessor import TextPreprocessor
6 | from app.services.embeddings import EmbeddingService
7 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, MAX_TOKEN_LIMIT
8 | 
9 | # Configure logging
10 | logging.basicConfig(
11 |     level=logging.INFO,
12 |     format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
13 | )
14 | logger = logging.getLogger(__name__)
15 | 
16 | def main():
17 |     """Run a simple test of the Vietnamese RAG system."""
18 |     logger.info("Testing Vietnamese RAG preprocessing and embedding...")
19 | 
20 |     # Sample Vietnamese text for testing - longer sample to demonstrate chunking
21 |     sample_text = """
22 |     Xử lý ngôn ngữ tự nhiên (NLP) là một lĩnh vực nghiên cứu quan trọng trong trí tuệ nhân tạo.
23 |     Đối với tiếng Việt, việc xử lý ngôn ngữ tự nhiên có những thách thức riêng do đặc điểm ngôn ngữ.
24 |     Tiếng Việt là một ngôn ngữ đơn lập, có thanh điệu, và một từ có thể bao gồm nhiều âm tiết.
25 |     Phân đoạn từ trong tiếng Việt là một bước quan trọng để xử lý văn bản tiếng Việt chính xác.
26 | 
27 |     Hệ thống RAG (Retrieval Augmented Generation) cho tiếng Việt cần phải giải quyết các vấn đề như:
28 |     1. Chuẩn hóa văn bản tiếng Việt
29 |     2. Phân đoạn câu và từ chính xác
30 |     3. Xử lý các từ đặc biệt như COVID-19, AI, NLP
31 |     4. Tính toán độ tương đồng ngữ nghĩa giữa các đoạn văn bản
32 | 
33 |     Việc chia nhỏ văn bản thành các đoạn có kích thước phù hợp (chunking) cũng đòi hỏi phải
34 |     hiểu cấu trúc ngữ pháp tiếng Việt để không chia cắt thông tin ở những vị trí không phù hợp.
35 |     Khi các đoạn văn bản đã được chia nhỏ, chúng sẽ được chuyển đổi thành các vector embedding
36 |     để có thể tìm kiếm ngữ nghĩa hiệu quả.
37 | 38 | Underthesea là một thư viện xử lý ngôn ngữ tự nhiên cho tiếng Việt được phát triển bởi nhóm 39 | nghiên cứu Underthesea. Thư viện này cung cấp nhiều công cụ hữu ích như phân đoạn từ (word segmentation), 40 | phân đoạn câu (sentence segmentation), chuẩn hóa văn bản (text normalization), và nhiều chức năng khác. 41 | 42 | Khi phát triển hệ thống RAG cho tiếng Việt, chúng ta cần đảm bảo rằng các đoạn văn bản được chia nhỏ 43 | có kích thước không quá 128 token để phù hợp với giới hạn của mô hình embedding. Đồng thời, các đoạn 44 | văn bản cần có sự chồng lấp (overlap) để đảm bảo tính liên tục của ngữ nghĩa. 45 | """ 46 | 47 | # Initialize services 48 | logger.info("Initializing services...") 49 | preprocessor = TextPreprocessor() 50 | embedding_service = EmbeddingService(preprocessor=preprocessor) 51 | 52 | # Step 1: Normalize text 53 | logger.info("Step 1: Normalizing text...") 54 | normalized_text = preprocessor.normalize_text(sample_text) 55 | print(f"\nNormalized text (first 200 chars):\n{normalized_text[:200]}...\n") 56 | 57 | # Step 2: Count tokens 58 | logger.info("Step 2: Counting tokens...") 59 | token_count = preprocessor.count_tokens(normalized_text) 60 | print(f"Total token count: {token_count}\n") 61 | 62 | # Step 3: Process text into chunks 63 | logger.info(f"Step 3: Processing text into chunks (size={DEFAULT_CHUNK_SIZE}, overlap={DEFAULT_CHUNK_OVERLAP})...") 64 | chunks = preprocessor.process_text( 65 | text=normalized_text, 66 | chunk_size=DEFAULT_CHUNK_SIZE, 67 | chunk_overlap=DEFAULT_CHUNK_OVERLAP 68 | ) 69 | 70 | print(f"Created {len(chunks)} chunks\n") 71 | 72 | # Print detailed chunk information 73 | print("=== CHUNK DETAILS ===") 74 | for i, chunk in enumerate(chunks): 75 | print(f"Chunk {i+1} ({chunk['metadata']['token_count']} tokens):") 76 | print(f"Text: {chunk['text'][:100]}...\n") 77 | 78 | # Show visualization of chunk coverage 79 | if i < len(chunks) - 1: 80 | # Find overlap with next chunk 81 | current_words = set(chunk['text'].split()) 82 | next_words = set(chunks[i+1]['text'].split()) 83 | overlap_words = current_words.intersection(next_words) 84 | 85 | print(f"Overlap with next chunk: {len(overlap_words)} words") 86 | 87 | # Show some overlap words 88 | if overlap_words: 89 | print(f"Sample overlap words: {list(overlap_words)[:5]}") 90 | print("-" * 80) 91 | 92 | # Step 4: Generate embeddings 93 | logger.info("Step 4: Generating embeddings...") 94 | chunks_with_embeddings = embedding_service.embed_chunks(chunks) 95 | 96 | # Print embedding dimensions 97 | embedding_dim = len(chunks_with_embeddings[0]['embedding']) 98 | print(f"\nEmbedding dimensions: {embedding_dim}\n") 99 | 100 | # Step 5: Demonstrate the RAG query process 101 | logger.info("Step 5: Testing similarity search...") 102 | query = "Xử lý ngôn ngữ tự nhiên tiếng Việt" 103 | print(f"\nQuery: \"{query}\"\n") 104 | 105 | # Segment and count tokens in the query 106 | segmented_query = preprocessor.segment_words(query) 107 | query_tokens = len(segmented_query) # Now just use len since segment_words returns a list 108 | print(f"Segmented query: {' '.join(segmented_query)}") 109 | print(f"Query token count: {query_tokens}\n") 110 | 111 | # Check token limit 112 | if query_tokens > MAX_TOKEN_LIMIT: 113 | print(f"WARNING: Query exceeds max token limit ({query_tokens} > {MAX_TOKEN_LIMIT})") 114 | print("Would need to be chunked for production use.") 115 | 116 | try: 117 | # Perform similarity search 118 | embeddings = [chunk["embedding"] for chunk in chunks_with_embeddings] 119 | texts = 
[chunk["text"] for chunk in chunks_with_embeddings] 120 | metadata = [chunk["metadata"] for chunk in chunks_with_embeddings] 121 | 122 | matches = embedding_service.similarity_search( 123 | query=query, 124 | embeddings=embeddings, 125 | texts=texts, 126 | metadata=metadata, 127 | top_k=2 128 | ) 129 | 130 | # Display search results 131 | print("=== SEARCH RESULTS ===") 132 | for i, match in enumerate(matches): 133 | print(f"Match {i+1} (score: {match['score']:.4f}):") 134 | print(f"Text: {match['text'][:150]}...\n") 135 | print(f"Token count: {match['metadata']['token_count']}") 136 | print("-" * 80) 137 | except ValueError as e: 138 | print(f"ERROR: {str(e)}") 139 | 140 | logger.info("Test completed successfully!") 141 | return 0 142 | 143 | if __name__ == "__main__": 144 | sys.exit(main()) -------------------------------------------------------------------------------- /app/services/embeddings.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List, Dict, Any, Optional, Union 3 | import numpy as np 4 | from functools import lru_cache 5 | from sentence_transformers import SentenceTransformer 6 | 7 | from app.config import ( 8 | EMBEDDING_MODEL, 9 | MAX_TOKEN_LIMIT, 10 | ENABLE_CACHE, 11 | CACHE_SIZE 12 | ) 13 | from app.services.preprocessor import TextPreprocessor 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | 18 | class EmbeddingService: 19 | """Service for generating embeddings for text using sentence-transformers.""" 20 | 21 | def __init__(self, model_name: str = EMBEDDING_MODEL, preprocessor=None): 22 | """ 23 | Initialize the embedding service. 24 | 25 | Args: 26 | model_name: Name of the sentence-transformers model to use 27 | preprocessor: Optional TextPreprocessor instance 28 | """ 29 | logger.info(f"Loading embedding model: {model_name}") 30 | self.model = SentenceTransformer(model_name) 31 | self.model_dim = self.model.get_sentence_embedding_dimension() 32 | logger.info(f"Model loaded. Embedding dimension: {self.model_dim}") 33 | 34 | # Use provided preprocessor or create one 35 | self.preprocessor = preprocessor or TextPreprocessor() 36 | 37 | # Set up caching if enabled 38 | if ENABLE_CACHE: 39 | self.get_embedding = lru_cache(maxsize=CACHE_SIZE)(self._get_embedding) 40 | else: 41 | self.get_embedding = self._get_embedding 42 | 43 | def _get_embedding(self, text: str) -> List[float]: 44 | """ 45 | Generate embedding for a text string. 46 | 47 | Args: 48 | text: Text to generate embedding for 49 | 50 | Returns: 51 | List of floats representing the embedding vector 52 | """ 53 | if not text or not isinstance(text, str): 54 | logger.warning("Empty or invalid text provided for embedding generation") 55 | return [0.0] * self.model_dim 56 | 57 | # Use preprocessor for token counting only 58 | token_count = self.preprocessor.count_tokens(text) 59 | 60 | # Check against token limit 61 | if token_count > MAX_TOKEN_LIMIT: 62 | logger.error( 63 | f"Text exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT}). " 64 | f"Please chunk your text before encoding." 
65 | ) 66 | raise ValueError(f"Text exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})") 67 | 68 | try: 69 | # Directly encode the text string 70 | embedding = self.model.encode(text).tolist() 71 | return embedding 72 | except Exception as e: 73 | logger.error(f"Error generating embedding: {str(e)}") 74 | return [0.0] * self.model_dim 75 | 76 | def get_embeddings_batch(self, texts: List[str]) -> List[List[float]]: 77 | """ 78 | Generate embeddings for a batch of texts. 79 | 80 | Args: 81 | texts: List of texts to generate embeddings for 82 | 83 | Returns: 84 | List of embedding vectors 85 | """ 86 | if not texts: 87 | return [] 88 | 89 | # Validate texts are within token limit 90 | for i, text in enumerate(texts): 91 | if not text or not isinstance(text, str): 92 | logger.warning(f"Empty or invalid text at index {i}") 93 | continue 94 | 95 | # Check token count 96 | token_count = self.preprocessor.count_tokens(text) 97 | if token_count > MAX_TOKEN_LIMIT: 98 | logger.error( 99 | f"Text at index {i} exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT}). " 100 | f"Please chunk your text before encoding." 101 | ) 102 | raise ValueError(f"Text at index {i} exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})") 103 | 104 | try: 105 | # Let the model handle the batch encoding directly 106 | embeddings = self.model.encode(texts).tolist() 107 | return embeddings 108 | except Exception as e: 109 | logger.error(f"Error generating batch embeddings: {str(e)}") 110 | return [[0.0] * self.model_dim] * len(texts) 111 | 112 | def embed_chunks(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]: 113 | """ 114 | Generate embeddings for a list of text chunks. 115 | 116 | Args: 117 | chunks: List of chunk dictionaries with text and metadata 118 | 119 | Returns: 120 | List of chunk dictionaries with added embeddings 121 | """ 122 | if not chunks: 123 | return [] 124 | 125 | # Extract texts from chunks 126 | texts = [chunk["text"] for chunk in chunks] 127 | 128 | # Generate embeddings 129 | embeddings = self.get_embeddings_batch(texts) 130 | 131 | # Add embeddings to chunks 132 | result_chunks = [] 133 | for chunk, embedding in zip(chunks, embeddings): 134 | chunk_with_embedding = chunk.copy() 135 | chunk_with_embedding["embedding"] = embedding 136 | result_chunks.append(chunk_with_embedding) 137 | 138 | return result_chunks 139 | 140 | def similarity_search( 141 | self, 142 | query: str, 143 | embeddings: List[List[float]], 144 | texts: List[str], 145 | metadata: Optional[List[Dict[str, Any]]] = None, 146 | top_k: int = 5 147 | ) -> List[Dict[str, Any]]: 148 | """ 149 | Find the most similar texts to a query. 
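        Scores are cosine similarities: the dot product of the query embedding
        with each candidate embedding, divided by the product of their norms,
        so values lie in [-1, 1] and higher means more similar.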
150 | 151 | Args: 152 | query: Query text 153 | embeddings: List of embedding vectors to search 154 | texts: List of texts corresponding to the embeddings 155 | metadata: Optional list of metadata for each text 156 | top_k: Number of top matches to return 157 | 158 | Returns: 159 | List of matches with text, score, and metadata 160 | """ 161 | if not query or not embeddings or not texts: 162 | return [] 163 | 164 | if metadata is None: 165 | metadata = [{} for _ in range(len(texts))] 166 | 167 | # Generate query embedding 168 | query_embedding = self.get_embedding(query) 169 | 170 | # Convert to numpy arrays for efficient computation 171 | query_embedding_np = np.array(query_embedding) 172 | embeddings_np = np.array(embeddings) 173 | 174 | # Compute cosine similarity 175 | similarity_scores = np.dot(embeddings_np, query_embedding_np) / ( 176 | np.linalg.norm(embeddings_np, axis=1) * np.linalg.norm(query_embedding_np) 177 | ) 178 | 179 | # Get top-k indices 180 | if top_k > len(texts): 181 | top_k = len(texts) 182 | 183 | top_indices = np.argsort(similarity_scores)[-top_k:][::-1] 184 | 185 | # Prepare results 186 | results = [] 187 | for idx in top_indices: 188 | results.append({ 189 | "text": texts[idx], 190 | "score": float(similarity_scores[idx]), 191 | "metadata": metadata[idx] 192 | }) 193 | 194 | return results -------------------------------------------------------------------------------- /app/api/routes.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, HTTPException, Depends 2 | from typing import List 3 | import logging 4 | from functools import lru_cache 5 | 6 | from app.models.schemas import ( 7 | ProcessingRequest, 8 | ProcessingResponse, 9 | QueryRequest, 10 | QueryResponse, 11 | TextChunk, 12 | ChunkData, 13 | QueryMatch, 14 | NormalizationResponse, 15 | BaseResponse, 16 | EmbeddingResponse 17 | ) 18 | from app.services.preprocessor import TextPreprocessor 19 | from app.services.embeddings import EmbeddingService 20 | from app.config import DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP, DEFAULT_TOP_K, MAX_TOKEN_LIMIT 21 | 22 | router = APIRouter() 23 | logger = logging.getLogger(__name__) 24 | 25 | # Updated dependency injection with caching 26 | @lru_cache 27 | def get_preprocessor(): 28 | return TextPreprocessor() 29 | 30 | @lru_cache 31 | def get_embedding_service(preprocessor: TextPreprocessor = Depends(get_preprocessor)): 32 | return EmbeddingService(preprocessor=preprocessor) 33 | 34 | # In-memory storage for chunks and embeddings 35 | # In a production system, this would be replaced by a vector database 36 | stored_chunks = [] 37 | 38 | 39 | @router.get("/status") 40 | async def get_status(): 41 | return BaseResponse( 42 | status="ok", 43 | message="Server is running" 44 | ) 45 | 46 | @router.post("/process", response_model=List[EmbeddingResponse]) 47 | async def process_text( 48 | request: ProcessingRequest, 49 | preprocessor: TextPreprocessor = Depends(get_preprocessor), 50 | embedding_service: EmbeddingService = Depends(get_embedding_service) 51 | ) -> List[EmbeddingResponse]: 52 | """ 53 | Process text into chunks and generate embeddings. 54 | 55 | This endpoint: 56 | 1. Normalizes the Vietnamese text using underthesea's text_normalize 57 | 2. Segments the text into sentences using underthesea's sent_tokenize 58 | 3. Segments words with underthesea's word_tokenize and fixed-word preservation 59 | 4. 
Chunks the text with target size of 110 tokens and 20 token overlap 60 | (or as specified in the request) 61 | 5. Generates embeddings for each chunk using vietnamese-bi-encoder 62 | 63 | Each chunk is guaranteed to: 64 | - Not exceed 128 tokens (MAX_TOKEN_LIMIT) 65 | - Have proper overlap with adjacent chunks 66 | - Preserve sentence boundaries when possible 67 | """ 68 | try: 69 | logger.info(f"Processing text of length {len(request.text)} with chunk_size={request.chunk_size}, chunk_overlap={request.chunk_overlap}") 70 | 71 | # Validate chunk parameters 72 | chunk_size = request.chunk_size or DEFAULT_CHUNK_SIZE 73 | chunk_overlap = request.chunk_overlap or DEFAULT_CHUNK_OVERLAP 74 | 75 | if chunk_size > MAX_TOKEN_LIMIT: 76 | logger.warning(f"Requested chunk_size {chunk_size} exceeds MAX_TOKEN_LIMIT {MAX_TOKEN_LIMIT}") 77 | chunk_size = MAX_TOKEN_LIMIT 78 | 79 | if chunk_overlap >= chunk_size: 80 | logger.warning(f"Requested chunk_overlap {chunk_overlap} is too large") 81 | chunk_overlap = chunk_size - 1 82 | 83 | # Process text into chunks 84 | chunks = preprocessor.process_text( 85 | text=request.text, 86 | chunk_size=chunk_size, 87 | chunk_overlap=chunk_overlap 88 | ) 89 | 90 | logger.info(f"Text processed into {len(chunks)} chunks") 91 | 92 | # Log token counts for debugging 93 | token_counts = [chunk["metadata"]["token_count"] for chunk in chunks] 94 | logger.debug(f"Token counts per chunk: {token_counts}") 95 | logger.debug(f"Min tokens: {min(token_counts) if token_counts else 0}, Max tokens: {max(token_counts) if token_counts else 0}") 96 | 97 | # Calculate average chunk size 98 | avg_chunk_size = sum(token_counts) / len(token_counts) if token_counts else 0 99 | logger.info(f"Average chunk size: {avg_chunk_size:.1f} tokens") 100 | 101 | # Verify no chunks exceed the token limit 102 | for i, chunk in enumerate(chunks): 103 | token_count = chunk["metadata"]["token_count"] 104 | if token_count > MAX_TOKEN_LIMIT: 105 | logger.error(f"Chunk {i} exceeds token limit: {token_count} > {MAX_TOKEN_LIMIT}") 106 | raise HTTPException( 107 | status_code=400, 108 | detail=f"Chunk {i} exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})" 109 | ) 110 | 111 | # Generate embeddings for chunks 112 | try: 113 | chunks_with_embeddings = embedding_service.embed_chunks(chunks) 114 | except ValueError as e: 115 | # Catch token limit errors from embedding service 116 | logger.error(f"Token limit error during embedding: {str(e)}") 117 | raise HTTPException(status_code=400, detail=str(e)) 118 | 119 | # Add file metadata to each chunk 120 | for chunk in chunks_with_embeddings: 121 | if "metadata" not in chunk: 122 | chunk["metadata"] = {} 123 | 124 | if request.file_id: 125 | chunk["metadata"]["file_id"] = request.file_id 126 | 127 | if request.file_title: 128 | chunk["metadata"]["file_title"] = request.file_title 129 | 130 | # Store chunks and embeddings in memory 131 | global stored_chunks 132 | stored_chunks = chunks_with_embeddings 133 | 134 | # Convert to response model 135 | chunk_objects = [] 136 | for chunk in chunks_with_embeddings: 137 | chunk_objects.append(EmbeddingResponse( 138 | embedding=chunk["embedding"] 139 | )) 140 | 141 | return chunk_objects 142 | 143 | except HTTPException: 144 | raise 145 | except ValueError as e: 146 | logger.error(f"Value error processing text: {str(e)}") 147 | raise HTTPException(status_code=400, detail=str(e)) 148 | except Exception as e: 149 | logger.error(f"Error processing text: {str(e)}", exc_info=True) 150 | raise HTTPException(status_code=500, 
detail=f"Error processing text: {str(e)}") 151 | 152 | 153 | @router.post("/query", response_model=QueryResponse) 154 | async def query_similar( 155 | request: QueryRequest, 156 | preprocessor: TextPreprocessor = Depends(get_preprocessor), 157 | embedding_service: EmbeddingService = Depends(get_embedding_service) 158 | ) -> QueryResponse: 159 | """ 160 | Find chunks similar to the query text. 161 | """ 162 | try: 163 | global stored_chunks 164 | 165 | if not stored_chunks: 166 | raise HTTPException(status_code=400, detail="No chunks available. Process text first.") 167 | 168 | logger.info(f"Querying with '{request.query_text}', top_k={request.top_k}") 169 | 170 | # Verify query doesn't exceed token limit 171 | tokens = preprocessor.segment_words(request.query_text) 172 | token_count = len(tokens) 173 | if token_count > MAX_TOKEN_LIMIT: 174 | logger.error(f"Query exceeds token limit: {token_count} > {MAX_TOKEN_LIMIT}") 175 | raise HTTPException( 176 | status_code=400, 177 | detail=f"Query exceeds max token limit ({token_count} > {MAX_TOKEN_LIMIT})" 178 | ) 179 | 180 | # Extract embeddings and texts from stored chunks 181 | embeddings = [chunk["embedding"] for chunk in stored_chunks] 182 | texts = [chunk["text"] for chunk in stored_chunks] 183 | metadata = [chunk["metadata"] for chunk in stored_chunks] 184 | 185 | # Perform similarity search 186 | top_k = request.top_k or DEFAULT_TOP_K 187 | try: 188 | matches = embedding_service.similarity_search( 189 | query=request.query_text, 190 | embeddings=embeddings, 191 | texts=texts, 192 | metadata=metadata, 193 | top_k=top_k 194 | ) 195 | except ValueError as e: 196 | # Catch token limit errors from embedding service 197 | logger.error(f"Token limit error during similarity search: {str(e)}") 198 | raise HTTPException(status_code=400, detail=str(e)) 199 | 200 | # Convert to response model 201 | match_objects = [] 202 | for match in matches: 203 | match_objects.append(QueryMatch( 204 | text=match["text"], 205 | score=match["score"], 206 | metadata=match["metadata"] 207 | )) 208 | 209 | logger.info(f"Found {len(match_objects)} matching chunks") 210 | 211 | return QueryResponse( 212 | matches=match_objects, 213 | total_matches=len(match_objects) 214 | ) 215 | 216 | except HTTPException: 217 | raise 218 | except ValueError as e: 219 | logger.error(f"Value error querying similar chunks: {str(e)}") 220 | raise HTTPException(status_code=400, detail=str(e)) 221 | except Exception as e: 222 | logger.error(f"Error querying similar chunks: {str(e)}", exc_info=True) 223 | raise HTTPException(status_code=500, detail=f"Error querying similar chunks: {str(e)}") 224 | 225 | 226 | # New endpoint for text normalization 227 | @router.post("/normalize", response_model=NormalizationResponse) 228 | async def normalize_text( 229 | request: ProcessingRequest, # Reuse the existing ProcessingRequest schema 230 | preprocessor: TextPreprocessor = Depends(get_preprocessor), 231 | ) -> NormalizationResponse: 232 | """ 233 | Normalize, sentence segment, and word segment the input text. 234 | 235 | This endpoint: 236 | 1. Normalizes Vietnamese text. 237 | 2. Segments the text into sentences. 238 | 3. Segments the sentences into words. 239 | """ 240 | try: 241 | logger.info(f"Normalizing text of length {len(request.text)}") 242 | 243 | # 1. Normalize text 244 | normalized_text = preprocessor.normalize_text(request.text) 245 | 246 | # 2. Segment sentences 247 | sentences = preprocessor.segment_sentences(normalized_text) 248 | 249 | # 3. 
Segment words (preserving sentence structure) 250 | segmented_sentences = [] 251 | for sentence in sentences: 252 | segmented_sentences.append(preprocessor.segment_words(sentence)) 253 | 254 | return NormalizationResponse( 255 | # normalized_text=normalized_text, 256 | # sentences=sentences, 257 | segmented_sentences=segmented_sentences 258 | ) 259 | 260 | except Exception as e: 261 | logger.error(f"Error normalizing text: {str(e)}", exc_info=True) 262 | raise HTTPException(status_code=500, detail=f"Error normalizing text: {str(e)}") -------------------------------------------------------------------------------- /app/services/preprocessor.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import List, Dict, Any, Optional, Tuple 3 | import logging 4 | from underthesea import word_tokenize, sent_tokenize, text_normalize 5 | from app.config import MAX_TOKEN_LIMIT, DEFAULT_CHUNK_SIZE, DEFAULT_CHUNK_OVERLAP 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | # List of domain-specific fixed words that shouldn't be segmented 10 | FIXED_WORDS = [ 11 | "COVID-19", 12 | "AI", 13 | "NLP", 14 | "RAG", 15 | # Add more domain-specific terms here 16 | ] 17 | 18 | 19 | class TextPreprocessor: 20 | def __init__(self, fixed_words: Optional[List[str]] = None): 21 | """ 22 | Initialize the text preprocessor. 23 | 24 | Args: 25 | fixed_words: List of domain-specific fixed words that shouldn't be segmented 26 | """ 27 | self.fixed_words = fixed_words or FIXED_WORDS 28 | # Compile a regex pattern for fixed words to avoid segmentation 29 | self.fixed_words_pattern = self._compile_fixed_words_pattern() 30 | 31 | def _compile_fixed_words_pattern(self) -> re.Pattern: 32 | """Compile a regex pattern for fixed words to avoid segmentation.""" 33 | if not self.fixed_words: 34 | return re.compile(r"^$") # Empty pattern 35 | 36 | # Escape special characters and join with OR 37 | escaped_words = [re.escape(word) for word in self.fixed_words] 38 | pattern = r"\b(" + "|".join(escaped_words) + r")\b" 39 | return re.compile(pattern, re.IGNORECASE) 40 | 41 | def normalize_text(self, text: str) -> str: 42 | """ 43 | Normalize text using underthesea's text_normalize and clean whitespace. 44 | 45 | Args: 46 | text: Raw text to normalize 47 | 48 | Returns: 49 | Normalized text 50 | """ 51 | if not text or not isinstance(text, str): 52 | return "" 53 | 54 | # Use underthesea's text normalization 55 | text = text_normalize(text) 56 | 57 | # Replace multiple spaces with a single space 58 | text = re.sub(r'\s+', ' ', text) 59 | text = text.strip() 60 | 61 | return text 62 | 63 | def segment_words(self, text: str) -> List[str]: 64 | """ 65 | Segment Vietnamese text into words while preserving fixed words. 
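        Fixed words (e.g. COVID-19, AI, NLP) are first swapped for placeholder
        tokens so that word_tokenize cannot split them, then restored in the
        resulting token list.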
66 | 67 | Args: 68 | text: Text to segment 69 | 70 | Returns: 71 | List of segmented words 72 | """ 73 | if not text: 74 | return [] 75 | 76 | # Extract fixed words and replace with placeholders 77 | placeholders = {} 78 | def replace_with_placeholder(match): 79 | word = match.group(0) 80 | placeholder = f"__FIXED_WORD_{len(placeholders)}__" 81 | placeholders[placeholder] = word 82 | return placeholder 83 | 84 | text_with_placeholders = self.fixed_words_pattern.sub(replace_with_placeholder, text) 85 | 86 | # Segment words - by default, word_tokenize returns a list of tokens 87 | segmented_tokens = word_tokenize(text_with_placeholders) 88 | 89 | # Restore fixed words in the tokens 90 | if placeholders: 91 | # If word_tokenize returned a list, process each token 92 | if isinstance(segmented_tokens, list): 93 | for i, token in enumerate(segmented_tokens): 94 | for placeholder, word in placeholders.items(): 95 | if placeholder in token: 96 | segmented_tokens[i] = token.replace(placeholder, word) 97 | # If it returned a string (in case of format="text"), process the string 98 | elif isinstance(segmented_tokens, str): 99 | for placeholder, word in placeholders.items(): 100 | segmented_tokens = segmented_tokens.replace(placeholder, word) 101 | # Convert to list by splitting on spaces 102 | segmented_tokens = segmented_tokens.split() 103 | 104 | return segmented_tokens 105 | 106 | def segment_sentences(self, text: str) -> List[str]: 107 | """ 108 | Split text into sentences using underthesea's sent_tokenize. 109 | 110 | Args: 111 | text: Text to split into sentences 112 | 113 | Returns: 114 | List of sentences 115 | """ 116 | if not text: 117 | return [] 118 | 119 | sentences = sent_tokenize(text) 120 | return sentences 121 | 122 | def count_tokens(self, text: str) -> int: 123 | """ 124 | Count tokens in Vietnamese text accurately by first segmenting words. 125 | 126 | Args: 127 | text: Text to count tokens for 128 | 129 | Returns: 130 | Accurate token count for Vietnamese text 131 | """ 132 | if not text: 133 | return 0 134 | 135 | # Simply get the list of tokens and count them 136 | tokens = self.segment_words(text) 137 | return len(tokens) 138 | 139 | def process_text(self, text: str, chunk_size: int = DEFAULT_CHUNK_SIZE, 140 | chunk_overlap: int = DEFAULT_CHUNK_OVERLAP) -> List[Dict[str, Any]]: 141 | """ 142 | Main method to process text into chunks with proper overlap. 143 | 144 | Args: 145 | text: Raw text to process 146 | chunk_size: Target size of each chunk in tokens (default: 110) 147 | chunk_overlap: Number of tokens to overlap between chunks (default: 20) 148 | 149 | Returns: 150 | List of chunk dictionaries, each with text and metadata 151 | """ 152 | # Normalize text 153 | normalized_text = self.normalize_text(text) 154 | 155 | # Segment into sentences 156 | sentences = self.segment_sentences(normalized_text) 157 | 158 | # Create chunks with proper overlap 159 | chunks = self._create_chunks_from_sentences(sentences, chunk_size, chunk_overlap) 160 | 161 | # Validate and adjust chunk sizes if needed 162 | chunks = self._validate_chunk_sizes(chunks) 163 | 164 | return chunks 165 | 166 | def _create_chunks_from_sentences(self, sentences: List[str], 167 | chunk_size: int, chunk_overlap: int) -> List[Dict[str, Any]]: 168 | """ 169 | Create chunks from a list of sentences with proper overlap. 
170 | 171 | Args: 172 | sentences: List of sentences to process 173 | chunk_size: Max token count for each chunk 174 | chunk_overlap: Number of tokens to overlap between chunks 175 | 176 | Returns: 177 | List of chunk dictionaries 178 | """ 179 | chunks = [] 180 | current_chunk_tokens = [] 181 | current_size = 0 182 | 183 | for sentence in sentences: 184 | # Get tokens for this sentence 185 | sentence_tokens = self.segment_words(sentence) 186 | sentence_token_count = len(sentence_tokens) 187 | 188 | if sentence_token_count > chunk_size: 189 | # Handle long sentences separately 190 | self._handle_long_sentence(chunks, current_chunk_tokens, current_size, 191 | sentence_tokens, chunk_size) 192 | # Reset current chunk tracking 193 | current_chunk_tokens = [] 194 | current_size = 0 195 | continue 196 | 197 | if current_size + sentence_token_count > chunk_size: 198 | # Finish current chunk and start new one with overlap 199 | chunks.append(self._create_chunk(current_chunk_tokens, current_size)) 200 | # Create overlap for next chunk 201 | current_chunk_tokens, current_size = self._create_overlap( 202 | current_chunk_tokens, current_size, chunk_overlap) 203 | 204 | # Add sentence to current chunk 205 | current_chunk_tokens.extend(sentence_tokens) 206 | current_size += sentence_token_count 207 | 208 | # Add the final chunk if not empty 209 | if current_chunk_tokens: 210 | chunks.append(self._create_chunk(current_chunk_tokens, current_size)) 211 | 212 | return chunks 213 | 214 | def _handle_long_sentence(self, chunks: List[Dict[str, Any]], 215 | current_chunk_tokens: List[str], current_size: int, 216 | sentence_tokens: List[str], chunk_size: int) -> None: 217 | """ 218 | Handle sentences that are longer than the chunk size. 219 | 220 | Args: 221 | chunks: List of chunks to append to 222 | current_chunk_tokens: Tokens in the current chunk 223 | current_size: Size of current chunk in tokens 224 | sentence_tokens: Tokens of the long sentence 225 | chunk_size: Maximum chunk size 226 | """ 227 | # First save any existing chunk 228 | if current_chunk_tokens: 229 | chunks.append(self._create_chunk(current_chunk_tokens, current_size)) 230 | 231 | # Split long sentence into parts 232 | current_part = [] 233 | current_part_tokens = 0 234 | 235 | for token in sentence_tokens: 236 | if current_part_tokens + 1 <= chunk_size: 237 | current_part.append(token) 238 | current_part_tokens += 1 239 | else: 240 | # Save current part and start a new one 241 | chunks.append(self._create_chunk(current_part, current_part_tokens)) 242 | current_part = [token] 243 | current_part_tokens = 1 244 | 245 | # Save any remaining part 246 | if current_part: 247 | chunks.append(self._create_chunk(current_part, current_part_tokens)) 248 | 249 | def _create_overlap(self, tokens: List[str], size: int, 250 | overlap_size: int) -> Tuple[List[str], int]: 251 | """ 252 | Create overlap for the next chunk. 
253 | 
254 |         Args:
255 |             tokens: Tokens from the previous chunk
256 |             size: Size of the previous chunk
257 |             overlap_size: Number of tokens to overlap
258 | 
259 |         Returns:
260 |             Tuple of (overlap_tokens, overlap_size)
261 |         """
262 |         overlap_size = min(overlap_size, size)
263 |         if overlap_size <= 0:
264 |             return [], 0
265 | 
266 |         # Take tokens from the end for overlap
267 |         tokens_to_keep = []
268 |         tokens_kept = 0
269 | 
270 |         for token in reversed(tokens):
271 |             if tokens_kept < overlap_size:
272 |                 tokens_to_keep.insert(0, token)
273 |                 tokens_kept += 1
274 |             else:
275 |                 break
276 | 
277 |         return tokens_to_keep, tokens_kept
278 | 
279 |     def _create_chunk(self, tokens: List[str], token_count: int) -> Dict[str, Any]:
280 |         """
281 |         Create a chunk dictionary from tokens.
282 | 
283 |         Args:
284 |             tokens: List of tokens to include in the chunk
285 |             token_count: Number of tokens
286 | 
287 |         Returns:
288 |             Chunk dictionary with text and metadata
289 |         """
290 |         return {
291 |             "text": " ".join(tokens),
292 |             "metadata": {
293 |                 "token_count": token_count
294 |             }
295 |         }
296 | 
297 |     def _validate_chunk_sizes(self, chunks: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
298 |         """
299 |         Validate that all chunks are within the token limit.
300 | 
301 |         Args:
302 |             chunks: List of chunks to validate
303 | 
304 |         Returns:
305 |             List of validated/adjusted chunks
306 |         """
307 |         validated_chunks = []
308 | 
309 |         for chunk in chunks:
310 |             token_count = chunk["metadata"]["token_count"]
311 | 
312 |             if token_count > MAX_TOKEN_LIMIT:
313 |                 # Re-tokenize and truncate if necessary
314 |                 text = chunk["text"]
315 |                 tokens = self.segment_words(text)[:MAX_TOKEN_LIMIT]
316 | 
317 |                 validated_chunks.append({
318 |                     "text": " ".join(tokens),
319 |                     "metadata": {
320 |                         "token_count": len(tokens),
321 |                         "truncated": True
322 |                     }
323 |                 })
324 |             else:
325 |                 validated_chunks.append(chunk)
326 | 
327 |         return validated_chunks
--------------------------------------------------------------------------------
/docs/vietnamese-rag-research.md:
--------------------------------------------------------------------------------
1 | A Comprehensive Text Processing Strategy for a Vietnamese RAG Application Using Underthesea and the BKAI Bi-Encoder
2 | Introduction: Setting the Stage for Vietnamese RAG with Underthesea and the BKAI Bi-Encoder
3 | Retrieval-Augmented Generation (RAG) represents a significant advancement in the field of natural language processing, particularly for languages like Vietnamese, for which extensive, high-quality training datasets for large language models are less abundant than for English. RAG enhances the capabilities of language models by enabling them to access and incorporate information from external knowledge sources during the text generation process. This approach is especially beneficial for knowledge-intensive tasks in Vietnamese, allowing for more accurate, contextually relevant, and informative responses grounded in factual data.
4 | The success of any RAG application is heavily dependent on the effectiveness of the text preprocessing stage. For a language like Vietnamese, which possesses distinctive linguistic characteristics such as its analytic structure, in which multi-syllable words are written as separate space-delimited syllables, and its tonal system, meticulous preprocessing is paramount. Proper handling of text before it is fed into the retrieval and generation components ensures that the underlying models can accurately understand and process the information.
This includes tasks like standardizing the text, segmenting it into meaningful units, and preparing it for embedding. 5 | In the context of Vietnamese natural language processing, the underthesea NLP toolkit stands out as a robust and versatile open-source Python library 1. This toolkit offers a wide array of functionalities specifically designed for Vietnamese, including word segmentation, text normalization, part-of-speech tagging, and named entity recognition. Its active development and widespread adoption within the Vietnamese NLP community make it a reliable and valuable resource for developers 1. 6 | For the crucial task of generating dense vector embeddings, the bkai-foundation-models/vietnamese-bi-encoder has been chosen 4. This model is specifically trained on Vietnamese text and excels at mapping sentences and paragraphs into a high-dimensional semantic space. These embeddings are essential for enabling efficient similarity search and retrieval of relevant documents from the knowledge base. The model's training on a diverse Vietnamese dataset, which includes translated versions of MS MARCO and SQuAD, as well as a significant portion of the Legal Text Retrieval Zalo 2021 challenge dataset, suggests its strong capabilities in understanding the semantic nuances of Vietnamese text across various domains 4. This broad training makes it potentially well-suited for handling data originating from Google Docs and Sheets, which can contain diverse content. 7 | This report aims to provide a comprehensive and tailored text preprocessing strategy for the user's Vietnamese RAG application. The strategy will focus on effectively utilizing the functionalities offered by the underthesea toolkit while strictly adhering to the constraints imposed by the vietnamese-bi-encoder, particularly its maximum input sequence length. The ultimate goal is to ensure optimal performance of the RAG application by preparing the Vietnamese text data in the most suitable manner for embedding and retrieval. 8 | Understanding the Constraints and Requirements 9 | A critical aspect of developing an effective text preprocessing strategy is a thorough understanding of the limitations and requirements of the chosen embedding model. In this case, the bkai-foundation-models/vietnamese-bi-encoder imposes specific constraints that must be carefully considered. 10 | The vietnamese-bi-encoder has a strict maximum input sequence length of 128 tokens 4. This limitation has significant implications for the RAG application. Any text input that exceeds this token count will likely be truncated by the model. Truncation can lead to a loss of crucial context, especially if important information resides towards the end of a long document or sentence. Consequently, the quality of the generated embeddings might be compromised, and the accuracy of the retrieval process could be negatively affected. Therefore, a robust strategy for breaking down the source documents into smaller, manageable chunks, each adhering to this token limit, is essential. This will likely involve sentence segmentation followed by further chunking if necessary. 11 | Furthermore, the vietnamese-bi-encoder requires the input Vietnamese text to be pre-segmented into individual words 4. Unlike English, where spaces typically delineate word boundaries, Vietnamese often requires additional processing to accurately identify words. Word segmentation is a fundamental step for the embedding model to correctly interpret the semantic units within the text. 
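As a rough pre-flight check that respects both constraints, incoming text can be word-segmented first and the resulting tokens counted before anything is sent to the encoder. The sketch below assumes underthesea is installed and mirrors the model card's 128 figure in a local constant; note that the word count only approximates the encoder's own subword token count, so keeping a safety margin below the hard limit is advisable.

```python
# Sketch: word-segment first, then check the token budget.
# The word count is only a proxy for the model's subword count,
# so real pipelines should keep a margin below the hard limit.
from underthesea import word_tokenize

MAX_TOKEN_LIMIT = 128  # hard input limit from the vietnamese-bi-encoder model card

def fits_token_limit(text: str, limit: int = MAX_TOKEN_LIMIT) -> bool:
    tokens = word_tokenize(text)  # returns a list of words by default
    return len(tokens) <= limit
```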
Without proper segmentation, the model might treat syllables or parts of compound words as independent entities, leading to inaccurate vector representations that do not capture the true meaning of the text. This requirement directly necessitates the use of a Vietnamese word segmenter, making the underthesea.word_tokenize function a core component of the preprocessing pipeline. 12 | Finally, the user's documents will originate from Google Docs and Google Sheets. This specifies the initial stage of the preprocessing workflow: data extraction. Strategies must be implemented to programmatically access and retrieve the textual content from these platforms in a format that can be readily processed by the subsequent NLP steps. Google Docs, being a word processing application, might contain rich text formatting, while Google Sheets store data in a tabular format. The extraction methods must be able to handle these different structures and retrieve the relevant text content for the RAG application. 13 | Data Extraction and Initial Handling 14 | The first step in the preprocessing pipeline involves extracting the text data from its source, which, in this case, are Google Docs and Google Sheets. Utilizing the respective Google APIs provides the most reliable and efficient way to accomplish this programmatically. 15 | For Google Docs, the Google Docs API (Application Programming Interface) allows developers to interact with documents stored in Google Drive 7. To use this API, it is necessary to set up a project in the Google Cloud Console, enable the Google Docs API for that project, configure the OAuth consent screen to manage user authorization, and obtain the necessary credentials for authentication 8. Once these steps are completed, the API can be used to retrieve the content of a Google Doc given its unique document ID. The API offers options to extract the document content in various formats, including plain text 7. For a RAG application focused on semantic understanding, extracting the plain text content is generally preferred to avoid processing any formatting information that might not contribute to the meaning. The google-api-python-client library provides a convenient way to interact with the Google Docs API using Python 7. A basic workflow involves authenticating the client using the obtained credentials and then making a request to the API to retrieve the document content based on its ID. It is important to implement proper error handling to manage potential issues during API calls and to be mindful of the API's usage quotas to prevent service disruptions. 16 | Similarly, for Google Sheets, the Google Sheets API enables programmatic access to spreadsheet data 13. The setup process in the Google Cloud Console is analogous to that of the Google Docs API: enabling the Google Sheets API and obtaining the necessary authentication credentials, which can include API keys or OAuth 2.0 credentials 13. To extract data, the API requires specifying the ID of the target spreadsheet and the range of cells or sheets to retrieve 13. Google Sheets store data in a structured, tabular format, so the extraction process might yield data in a grid-like structure. Depending on the specific information needed for the RAG application, this tabular data might need to be flattened or processed to extract the relevant textual content. The google-api-python-client library also supports interaction with the Google Sheets API in Python 17. 
The process involves authenticating and then making requests to the API to read values from the specified spreadsheet and range. Handling different data types within the spreadsheet cells and appropriately structuring the extracted text for further processing will be important considerations.
17 | After successfully extracting the text content from both Google Docs and Google Sheets, an initial data cleaning and preparation phase is recommended. This might involve addressing character encoding issues to ensure consistency across all documents. Removing any irrelevant metadata or boilerplate text that might have been extracted along with the main content, such as headers, footers, or tables of contents, can also be beneficial. Furthermore, converting the extracted text to a standard encoding format, such as UTF-8, will help prevent issues in the subsequent NLP processing steps. These initial cleaning tasks ensure that the data is in a consistent and suitable format for the more advanced preprocessing stages.
18 | Vietnamese Text Normalization using Underthesea
19 | Text normalization plays a crucial role in Vietnamese NLP by addressing the inherent variations that can occur in written text 19. These variations can stem from typos, inconsistencies in the use of diacritics (tone marks), different input methods, and the presence of special characters. The goal of normalization is to reduce these inconsistencies, ensuring that semantically equivalent words are represented in a uniform manner. This standardization is vital for improving the accuracy and effectiveness of downstream NLP tasks, including word segmentation and the generation of meaningful embeddings.
20 | Common normalization tasks for Vietnamese text include diacritic standardization, which ensures a consistent representation of the five tone marks as well as the unmarked base tone; spelling correction, which identifies and rectifies common misspellings and typographical errors; case handling, where lowercasing can reduce vocabulary size and improve matching, although the case sensitivity of the vietnamese-bi-encoder should be verified first; and punctuation handling, which might involve standardizing or removing punctuation marks based on the requirements of the embedding model and the specific RAG tasks 1.
21 | The underthesea library provides a convenient and effective tool for addressing many of these normalization needs through its text_normalize function 1. This function is specifically designed to handle common Vietnamese text normalization tasks, including correcting diacritic errors and standardizing spelling. For example, as demonstrated in the library's documentation, the input string "Ðảm baỏ chất lựơng phòng thí nghịêm hoá học" is transformed into the normalized form "Đảm bảo chất lượng phòng thí nghiệm hóa học" 1. This function can be readily integrated into the preprocessing pipeline as an initial normalization step.
22 | Beyond text_normalize, the underthesea toolkit might offer other functionalities that can further aid in the normalization process 26. Exploring the library's documentation 25 for modules that handle special characters or case conversion could be valuable. Depending on the specific characteristics of the data extracted from Google Docs and Sheets, there might also be a need for more advanced or custom normalization rules.
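Before turning to custom rules, the baseline call is a one-liner; the sketch below wraps text_normalize together with one illustrative whitespace rule of the kind discussed next. The regex is an assumption to be tailored to the actual data, while the example string and its normalized form come from the library documentation cited above.

```python
# Sketch: baseline normalization plus one illustrative custom rule.
import re
from underthesea import text_normalize

def normalize_vietnamese(text: str) -> str:
    text = text_normalize(text)                    # fix diacritic/spelling variants
    text = re.sub(r"\s+([.,;:!?])", r"\1", text)   # drop stray space before punctuation
    return re.sub(r"\s+", " ", text).strip()       # collapse runs of whitespace

# normalize_vietnamese("Ðảm baỏ chất lựơng phòng thí nghịêm hoá học")
# -> "Đảm bảo chất lượng phòng thí nghiệm hóa học"
```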
Such custom rules could be implemented using regular expressions 21 to address specific patterns of errors or variations that are prevalent in the dataset. For instance, if the data contains inconsistencies in spacing around punctuation, regular expressions can be used to standardize this. The choice of normalization techniques should be guided by the specific types of variations observed in the data and the requirements of the subsequent processing steps.
23 | Vietnamese Word Segmentation using Underthesea
24 | Accurate word segmentation is a fundamental prerequisite for the vietnamese-bi-encoder to effectively process and understand Vietnamese text 4. The quality of the word segmentation directly influences the accuracy of the embeddings generated by the model. If words are not correctly identified and segmented, the embedding model might learn flawed representations, leading to poor semantic matching and retrieval performance in the RAG application.
25 | The underthesea library provides the word_tokenize function, which is specifically designed for segmenting Vietnamese text into individual words 1. This function takes a string of Vietnamese text as input and returns a list of segmented words by default. It also offers a format="text" option that returns a single string in which the syllables of each multi-syllable word are joined by underscores and the words themselves are separated by spaces 1. For example, the sentence "Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò" is segmented into the list ['Chàng trai', '9X', 'Quảng Trị', 'khởi nghiệp', 'từ', 'nấm sò'] 1.
26 | A particularly useful feature of word_tokenize is the fixed_words parameter 1. This allows the user to provide a list of multi-word expressions or named entities that should be treated as single tokens during the segmentation process. This is crucial for preserving the meaning of phrases like "Viện Nghiên Cứu" (Research Institute) or "học máy" (machine learning), which should not be split into their constituent words for accurate semantic representation 1. By identifying and providing such fixed words relevant to the domain of the Google Docs and Sheets data, the accuracy of word segmentation can be significantly improved.
27 | Vietnamese word segmentation can be challenging due to the presence of compound words and multi-word expressions 19. These linguistic units often convey a meaning that is more than the sum of their individual components. While underthesea is trained on Vietnamese corpora and incorporates rules and statistical models to handle many of these cases, achieving perfect segmentation can be difficult. For specialized domains or with specific technical terminology present in the user's documents, it might be necessary to refine the segmentation by providing custom lists of fixed_words or by exploring more advanced techniques if the default performance of underthesea is insufficient.
28 | Research suggests that underthesea employs a hybrid approach to word segmentation, likely combining rule-based methods with statistical models trained on Vietnamese text data 1. These methods often involve identifying potential word boundaries based on linguistic rules and then using statistical language models to disambiguate between different possible segmentations. Techniques like maximal matching, where the longest possible valid words are identified, are also likely employed 28.
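The fixed_words mechanism is easy to exercise directly. In the sketch below, the second sentence is invented for illustration, and the exact output may vary by underthesea version:

```python
# Sketch: default segmentation vs. segmentation with protected domain terms.
from underthesea import word_tokenize

print(word_tokenize("Chàng trai 9X Quảng Trị khởi nghiệp từ nấm sò"))
# ['Chàng trai', '9X', 'Quảng Trị', 'khởi nghiệp', 'từ', 'nấm sò']

print(word_tokenize("Viện Nghiên Cứu ứng dụng học máy",
                    format="text",
                    fixed_words=["Viện Nghiên Cứu", "học máy"]))
# e.g. 'Viện_Nghiên_Cứu ứng_dụng học_máy' (the protected terms stay whole)
```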
While a deep dive into the exact algorithms used by underthesea might require examining the library's source code or related research papers, understanding the general principles behind its operation can be helpful for troubleshooting and for deciding if further customization or alternative tools might be necessary. 29 | Strategies for Handling the Maximum Sequence Length Constraint 30 | Given the strict maximum sequence length of 128 tokens for the vietnamese-bi-encoder, effective strategies for handling potentially long input texts are crucial. The first step in this process is typically sentence segmentation. 31 | Underthesea provides the sent_tokenize function for dividing a block of Vietnamese text into individual sentences 1. This function analyzes the text and identifies sentence boundaries based on punctuation marks and other linguistic cues. For example, it can split a text containing multiple sentences into a list where each element is a sentence 1. Sentence segmentation is a logical first step as it breaks down large documents into smaller, semantically coherent units. The subsequent word segmentation and embedding processes can then be applied to these individual sentences. However, it is important to recognize that even after sentence segmentation, some sentences, particularly in technical or legal documents, can still be quite lengthy and might exceed the 128-token limit after being segmented into words 2. Therefore, further chunking might be necessary. 32 | Text chunking involves breaking down longer sequences of text into smaller segments. Several chunking strategies are commonly used in RAG applications 37. One straightforward approach is fixed-size chunking with overlap 37. This method involves dividing the word-segmented text into chunks, each containing a predefined maximum number of tokens (considerably less than 128 to account for variations in sentence length). To maintain context across these chunks, a certain number of tokens from the end of one chunk are repeated at the beginning of the next chunk, creating an overlap. The key parameters for this strategy are the chunk_size (the maximum number of tokens per chunk) and the chunk_overlap (the number of overlapping tokens). These parameters need to be carefully chosen and might require experimentation to find the optimal balance between context retention and redundancy 37. 33 | Another strategy is recursive character text splitting 37. This more adaptive method involves splitting the text using a hierarchy of separators, such as paragraphs, sentences, and words, in a recursive manner until the chunks meet the desired size criteria. This approach attempts to respect the natural semantic boundaries within the text, resulting in more contextually coherent chunks compared to fixed-size splitting. While underthesea might not offer a direct implementation of recursive splitting, the logic can be implemented programmatically, or libraries like LangChain, which are often used in conjunction with tools like underthesea, can be employed for this purpose 40. 34 | Finally, semantic chunking involves splitting text based on the semantic similarity of its content 38. This approach aims to group together text segments that are closely related in meaning. Implementing semantic chunking typically involves using embeddings themselves to identify natural breaks in the semantic flow of the text. 
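A minimal embedding-guided sketch follows, under the assumptions that the input sentences have already been word-segmented as the bi-encoder requires and that the 0.5 similarity threshold is merely a starting point to tune:

```python
# Sketch: start a new chunk wherever the cosine similarity between
# neighbouring sentence embeddings drops below a threshold.
from typing import List
import numpy as np
from sentence_transformers import SentenceTransformer

def semantic_chunks(sentences: List[str], threshold: float = 0.5) -> List[str]:
    if not sentences:
        return []
    model = SentenceTransformer("bkai-foundation-models/vietnamese-bi-encoder")
    vectors = model.encode(sentences)
    chunks, current = [], [sentences[0]]
    for prev, cur, sentence in zip(vectors, vectors[1:], sentences[1:]):
        similarity = float(np.dot(prev, cur) /
                           (np.linalg.norm(prev) * np.linalg.norm(cur)))
        if similarity < threshold:  # semantic break: close the current chunk
            chunks.append(" ".join(current))
            current = []
        current.append(sentence)
    chunks.append(" ".join(current))
    return chunks
```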
While this method has the potential to create the most contextually relevant chunks for retrieval, it is also generally more complex to implement and might require using the vietnamese-bi-encoder or another embedding model to guide the chunking process. 35 | Proposed Preprocessing Pipeline for Your Vietnamese RAG Application 36 | Based on the analysis of the requirements and available tools, a comprehensive preprocessing pipeline for the Vietnamese RAG application can be outlined as follows: 37 | Data Extraction: The process begins with extracting the raw text content from the Google Docs and Google Sheets using their respective APIs. This will involve setting up the necessary Google Cloud project, enabling the APIs, handling authentication, and writing code (potentially in Python using the google-api-python-client library) to retrieve the text content based on the document and spreadsheet IDs. 38 | Initial Cleaning: After extraction, perform basic cleaning operations on the text data. This might include handling character encoding issues, removing irrelevant metadata or boilerplate text, and standardizing the text format (e.g., to UTF-8). 39 | Text Normalization: Apply the underthesea.text_normalize function to the cleaned text to correct diacritic errors and standardize spelling. Depending on the specific characteristics of the data, additional custom normalization steps using regular expressions or other techniques might be necessary. 40 | Sentence Segmentation: Use the underthesea.sent_tokenize function to divide the normalized text into individual sentences. This provides an initial level of granularity for further processing. 41 | Word Segmentation: Employ the underthesea.word_tokenize function to segment each sentence into a sequence of words. Consider creating and using a list of fixed_words for any known multi-word expressions or named entities relevant to the content of the Google Docs and Sheets. 42 | Chunking: Implement a suitable chunking strategy to ensure that each text chunk, after word segmentation, does not exceed the 128-token limit of the vietnamese-bi-encoder. A reasonable starting point could be fixed-size chunking with an overlap. For example, aim for a chunk size of around 100-110 tokens after word segmentation and an overlap of 20-30 tokens. Alternatively, explore implementing recursive character text splitting, using sentence boundaries as a primary separator. The choice of strategy and parameters should be guided by experimentation and evaluation. 43 | The optimal chunking strategy might depend on the specific nature and structure of the content in the Google Docs and Sheets. For highly structured documents, recursive splitting that respects headings or other structural elements could be beneficial. For simpler, more uniform text, fixed-size chunking might be sufficient. It is strongly recommended to experiment with different chunking strategies and parameter settings on a representative sample of the data and to evaluate the performance of the RAG application to determine the most effective approach for the user's specific needs. 44 | Key Considerations and Recommendations 45 | Several key considerations and recommendations should guide the implementation of the proposed preprocessing strategy. 46 | The choices made during each step of the preprocessing pipeline will have a direct impact on the quality of the embeddings generated by the vietnamese-bi-encoder and, ultimately, on the retrieval performance of the RAG application. 
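One of the most consequential of those choices is the chunking configuration. To make it concrete, here is a minimal token-window sketch using the 110/20 defaults suggested above; the repository's TextPreprocessor implements a fuller, sentence-aware variant of the same idea.

```python
# Sketch: fixed-size chunking with overlap over word-segmented tokens.
from typing import List

def chunk_tokens(tokens: List[str], chunk_size: int = 110,
                 chunk_overlap: int = 20) -> List[List[str]]:
    if chunk_size <= chunk_overlap:
        raise ValueError("chunk_size must exceed chunk_overlap")
    chunks, start = [], 0
    while start < len(tokens):
        chunks.append(tokens[start:start + chunk_size])
        if start + chunk_size >= len(tokens):
            break
        start += chunk_size - chunk_overlap  # step back to create the overlap
    return chunks
```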
Therefore, each step should be carefully considered and potentially optimized through experimentation. 47 | When using fixed-size chunking, determining the optimal chunk size and overlap is crucial. Starting with a chunk size that leaves a safety margin below the 128-token limit (e.g., aiming for around 100-110 tokens after word segmentation) and an overlap of 20-30 tokens is a reasonable approach. These values should then be iteratively adjusted based on the evaluation of the RAG application's performance. Too small a chunk size might lead to a loss of necessary context, while too large a size risks exceeding the model's input limit. Similarly, insufficient overlap might break semantic connections between chunks, whereas excessive overlap can introduce unnecessary redundancy and increase processing time. 48 | It is also important to consider the potential for out-of-vocabulary (OOV) words after word segmentation. These are words that are not present in the vocabulary of the vietnamese-bi-encoder. OOV words might not be well-represented by the embedding model, potentially affecting retrieval accuracy if these words are semantically significant. Investigating whether the bi-encoder uses subword tokenization could provide insights into how it handles OOV words. Further normalization or exploring techniques like stemming (with caution, as aggressive stemming might not align with the embedding model's training) could be considered. In cases where OOV words pose a significant problem, fine-tuning the embedding model on the user's specific vocabulary might be an option, although this is a more advanced step. 49 | Finally, it is paramount to emphasize the importance of experimentation and rigorous evaluation. The optimal preprocessing strategy and chunking parameters are likely to be specific to the user's data and the nature of their queries. Therefore, it is recommended to test different approaches on a representative subset of the Google Docs and Sheets data and to evaluate the performance of the RAG application using relevant metrics. This iterative process of experimentation and evaluation will be key to finding the most effective preprocessing configuration for achieving the desired results. 50 | Conclusion: Towards an Effective Vietnamese RAG Application 51 | In conclusion, developing an effective Vietnamese Retrieval-Augmented Generation (RAG) application using underthesea and the bkai-foundation-models/vietnamese-bi-encoder requires a well-designed and carefully implemented text preprocessing strategy. This strategy must encompass thorough data extraction from Google Docs and Sheets, comprehensive text normalization and accurate word segmentation using the underthesea toolkit, and effective chunking techniques to adhere to the embedding model's 128-token limit. 52 | Throughout the preprocessing pipeline, it is crucial to consider the specific characteristics of the user's data and the requirements of the chosen embedding model. Experimentation and evaluation are essential to optimize the various preprocessing steps and to find the most suitable configuration for the RAG application. By following the proposed strategies and continuously refining them based on empirical results, the user can build a robust and high-performing Vietnamese RAG application capable of leveraging external knowledge for enhanced language model capabilities. 53 | Works cited 54 | 1. Underthesea - Vietnamese NLP Toolkit - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/underthesea 55 | 2. 
undertheseanlp - Hugging Face, accessed March 15, 2025, https://huggingface.co/undertheseanlp 56 | 3. underthesea - PyPI, accessed March 15, 2025, https://pypi.org/project/underthesea/ 57 | 4. Bkai-foundation-models - Find Top AI Models on Hugging Face - AIModels.fyi, accessed March 15, 2025, https://www.aimodels.fyi/creators/huggingFace/bkai-foundation-models 58 | 5. vietnamese-bi-encoder | AI Model Details - AIModels.fyi, accessed March 15, 2025, https://www.aimodels.fyi/models/huggingFace/vietnamese-bi-encoder-bkai-foundation-models 59 | 6. Vietnamese Bi Encoder · Models - Dataloop, accessed March 15, 2025, https://dataloop.ai/library/model/bkai-foundation-models_vietnamese-bi-encoder/ 60 | 7. Extract the text from a document with Docs API - Google for Developers, accessed March 15, 2025, https://developers.google.com/docs/api/samples/extract-text 61 | 8. How to Get Document Texts with the Google Docs API in Python | Endgrate, accessed March 15, 2025, https://endgrate.com/blog/how-to-get-document-texts-with-the-google-docs-api-in-python 62 | 9. Using the Google Docs API to Get Document Texts (with Javascript examples) - Endgrate, accessed March 15, 2025, https://endgrate.com/blog/using-the-google-docs-api-to-get-document-texts-(with-javascript-examples) 63 | 10. Google Docs API samples, accessed March 15, 2025, https://developers.google.com/docs/api/samples 64 | 11. Reading text from Gdoc for feeding to ChatGPT - Help - Pipedream, accessed March 15, 2025, https://pipedream.com/community/t/reading-text-from-gdoc-for-feeding-to-chatgpt/7087 65 | 12. Python quickstart | Google Docs, accessed March 15, 2025, https://developers.google.com/docs/api/quickstart/python 66 | 13. Extracting data from Google Sheets via API - Sharperlight, accessed March 15, 2025, https://www.sharperlight.com/advanced/2022/04/06/accessing-the-google-sheets-api-via-sharperlight-query-builder/ 67 | 14. Extract data from smart chips in your Google Sheets - Google Docs Editors Help, accessed March 15, 2025, https://support.google.com/docs/answer/13524011?hl=en 68 | 15. Basic reading | Google Sheets, accessed March 15, 2025, https://developers.google.com/sheets/api/samples/reading 69 | 16. Read and Write Data in Google Sheets using Python and the Google Sheets API, accessed March 15, 2025, https://aryanirani123.medium.com/read-and-write-data-in-google-sheets-using-python-and-the-google-sheets-api-6e206a242f20 70 | 17. Python quickstart | Google Sheets, accessed March 15, 2025, https://developers.google.com/sheets/api/quickstart/python 71 | 18. Accessing Google Sheet Data with Python: A Practical Guide using the Google Sheets API, accessed March 15, 2025, https://medium.com/@techworldthink/accessing-google-sheet-data-with-python-a-practical-guide-using-the-google-sheets-api-dc57759d387a 72 | 19. Underthesea Vietnamese NLP Toolkit | Restackio, accessed March 15, 2025, https://www.restack.io/p/vietnamese-nlp-tools-answer-underthesea-cat-ai 73 | 20. Vietnamese Text Recognition Dataset | Restackio, accessed March 15, 2025, https://www.restack.io/p/vietnamese-nlp-tools-answer-text-recognition-dataset-cat-ai 74 | 21. Vietnamese Sentiment Analysis - Kaggle, accessed March 15, 2025, https://www.kaggle.com/code/tonibui3107/vietnamese-sentiment-analysis 75 | 22. Nlp Cho Tiếng Việt - Vietnamese Nlp Tools | Restackio, accessed March 15, 2025, https://www.restack.io/p/vietnamese-nlp-tools-answer-nlp-cho-tieng-viet-cat-ai 76 | 23. 
NeMo-text-processing/nemo_text_processing/text_normalization/normalize.py at main - GitHub, accessed March 15, 2025, https://github.com/NVIDIA/NeMo-text-processing/blob/main/nemo_text_processing/text_normalization/normalize.py 77 | 24. Chuẩn hóa văn bản - ProtonX, accessed March 15, 2025, https://protonx.io/courses/66487737f91fdc001a81ce3a/topics/665057c88c287e0019bb800b 78 | 25. Underthesea documentation — Under The Sea 1.1.9 documentation, accessed March 15, 2025, https://underthesea.readthedocs.io/ 79 | 26. Underthesea v6.6.0 [Latest Version] - Colab, accessed March 15, 2025, https://colab.research.google.com/drive/1gD8dSMSE_uNacW4qJ-NSnvRT85xo9ZY2 80 | 27. Vietnamese NLP Toolkit — Under The Sea 1.1.9 ... - Underthesea, accessed March 15, 2025, https://underthesea.readthedocs.io/en/latest/readme.html 81 | 28. A Hybrid Approach to Word Segmentation of Vietnamese Texts - ResearchGate, accessed March 15, 2025, https://www.researchgate.net/publication/29616221_A_Hybrid_Approach_to_Word_Segmentation_of_Vietnamese_Texts 82 | 29. Is word segmentation necessary for Vietnamese sentiment classification? - arXiv, accessed March 15, 2025, https://arxiv.org/pdf/2301.00418 83 | 30. NLP-Vietnamese-progress/tasks/word_segmentation.md at master - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/NLP-Vietnamese-progress/blob/master/tasks/word_segmentation.md 84 | 31. A Large-Scale Benchmark for Vietnamese Sentence Paraphrases - arXiv, accessed March 15, 2025, https://arxiv.org/html/2502.07188v1 85 | 32. NLP Benchmarking popular Vietnamese tokenizer - Huy Bik's Blog, accessed March 15, 2025, https://huybik.github.io/Word-Tokenizer-Benchmark/ 86 | 33. undertheseanlp/sent_tokenize: Vietnamese Sentence ... - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/sent_tokenize 87 | 34. sentence-segmentation · GitHub Topics, accessed March 15, 2025, https://github.com/topics/sentence-segmentation?o=desc&s=forks 88 | 35. underthesea/tox.ini at main - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/underthesea/blob/main/tox.ini 89 | 36. AttributeError: module 'pytest' has no attribute 'mark' · Issue #303 · undertheseanlp/underthesea - GitHub, accessed March 15, 2025, https://github.com/undertheseanlp/underthesea/issues/303 90 | 37. 7 Chunking Strategies in RAG You Need To Know - F22 Labs, accessed March 15, 2025, https://www.f22labs.com/blogs/7-chunking-strategies-in-rag-you-need-to-know/ 91 | 38. 11 Chunking Strategies for RAG — Simplified & Visualized | by Mastering LLM (Large Language Model), accessed March 15, 2025, https://masteringllm.medium.com/11-chunking-strategies-for-rag-simplified-visualized-df0dbec8e373 92 | 39. A Guide to Chunking Strategies for Retrieval Augmented Generation (RAG) - Sagacify, accessed March 15, 2025, https://www.sagacify.com/news/a-guide-to-chunking-strategies-for-retrieval-augmented-generation-rag 93 | 40. Five Levels of Chunking Strategies in RAG| Notes from Greg's Video | by Anurag Mishra, accessed March 15, 2025, https://medium.com/@anuragmishra_27746/five-levels-of-chunking-strategies-in-rag-notes-from-gregs-video-7b735895694d 94 | 41. Effective Chunking Strategies for RAG - Cohere Documentation, accessed March 15, 2025, https://docs.cohere.com/v2/page/chunking-strategies 95 | 42. Simple Chunking Strategies for RAG Applications (Part 1) | by kirouane Ayoub | Medium, accessed March 15, 2025, https://medium.com/@ayoubkirouane3/simple-chunking-strategies-for-rag-applications-part-1-d56903b167c5 96 | 43. 
Chunking strategies for RAG tutorial using Granite - IBM, accessed March 15, 2025, https://www.ibm.com/think/tutorials/chunking-strategies-for-rag-with-langchain-watsonx-ai 97 | --------------------------------------------------------------------------------