├── requirements.txt ├── LICENSE ├── README.md └── ckip_processor.py /requirements.txt: -------------------------------------------------------------------------------- 1 | # Core dependencies 2 | torch>=2.0.0 3 | transformers>=4.34.0 4 | langchain>=0.0.350 5 | langchain-openai>=0.0.5 6 | faiss-cpu>=1.7.4 7 | openai>=1.3.0 8 | 9 | # CKIP Related 10 | ckip-transformers>=0.3.4 11 | tokenizers>=0.15.0 12 | 13 | # Document Processing 14 | PyMuPDF>=1.23.6 15 | pdfplumber>=0.10.3 16 | pytesseract>=0.3.10 17 | Pillow>=10.1.0 18 | 19 | # Data Processing and Utilities 20 | numpy>=1.24.0 21 | pandas>=2.1.0 22 | tqdm>=4.66.1 23 | python-dotenv>=1.0.0 24 | pydantic>=2.5.0 25 | msgpack>=1.0.7 26 | 27 | # BM25 28 | rank_bm25>=0.2.2 29 | 30 | # Logging and Monitoring 31 | # (no entry needed: `logging` is part of the Python standard library and must not be pip-installed) 32 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 lzrong0203 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # 金融文件檢索增強生成系統 2 | 3 | 本專案是透過與 Claude (Anthropic) AI 助理的協作開發完成。Claude 協助了系統架構設計、程式碼實作、錯誤處理邏輯以及文檔撰寫等關鍵部分。這種人機協作的方式讓我們能夠快速開發並改進強化系統功能與效能。 4 | 5 | ## 協作亮點 6 | 7 | - 系統架構:與 Claude 共同設計模組化架構 8 | - 程式碼實作:結合人類專業知識與 AI 最佳實踐 9 | - 錯誤處理:完善的異常處理機制 10 | - 文檔撰寫:清晰的文檔和註解說明 11 | - 持續優化:反覆討論和改進系統設計 12 | 13 | ## 主要特點 14 | 15 | - 結合向量搜索和 BM25 的混合檢索 16 | - 針對繁體中文優化的 CKIP 文本處理 17 | - 強化的快取和批次處理 18 | - 多模態文件處理(PDF、文本) 19 | - 針對金融和保險領域的專業優化 20 | - 詳細的錯誤分析和日誌記錄 21 | 22 | ## 系統需求 23 | 24 | - Python 3.8+ 25 | - PyTorch 26 | - transformers 27 | - langchain 28 | - CKIP Transformers 29 | - FAISS 30 | - OpenAI API 金鑰 31 | 32 | ## 安裝方式 33 | 34 | ```bash 35 | pip install -r requirements.txt 36 | ``` 37 | 38 | 設置環境變數: 39 | ```bash 40 | export OPENAI_API_KEY=你的金鑰 41 | ``` 42 | 43 | ## 使用方式 44 | 45 | 基本用法: 46 | ```bash 47 | python complete_rag_openai.py \ 48 | --question_path questions.json \ 49 | --source_path documents/ \ 50 | --output_path results.json \ 51 | --retrieval_mode hybrid 52 | ``` 53 | 54 | 重要參數: 55 | - `retrieval_mode`: vector | bm25 | hybrid 56 | - `device`: GPU 設備 ID(-1 表示使用 CPU) 57 | - `chunk_size`: 文本分塊大小 58 | - `chunk_overlap`: 分塊重疊度 59 | 60 | ## 配置說明 61 | 62 | 系統可通過 `RAGConfig` 配置以下參數: 63 | 64 | - 文本處理(分塊大小、重疊度) 65 | - 模型選擇(CKIP、OpenAI) 66 | - 硬體使用(GPU/CPU) 67 | - 快取行為 68 | - 處理批次大小 69 | 70 | ## 系統架構 71 | 72 | 主要組件: 73 | 74 | 1. 文件處理器 75 | - PDF 提取 76 | - 文本正規化 77 | - 快取管理 78 | 79 | 2. CKIP 處理器 80 | - 中文斷詞 81 | - 文本分割 82 | - 批次處理 83 | 84 | 3. 查詢處理器 85 | - 查詢擴展 86 | - 模式匹配 87 | - 結果重排序 88 | 89 | 4. 
向量存儲管理器 90 | - FAISS 整合 91 | - 嵌入管理 92 | - 存儲驗證 93 | 94 | ## 錯誤分析 95 | 96 | 包含全面的錯誤分析功能: 97 | 98 | - 類別特定模式分析 99 | - 詞彙重疊分析 100 | - 上下文評估 101 | - 詳細日誌記錄 102 | 103 | ## License 104 | 105 | MIT License 106 | 107 | ## 參與貢獻 108 | 109 | 歡迎提交 Issue 和 Pull Request。請確保測試通過並符合專案程式碼風格。 110 | 111 | ## 開發團隊 112 | 113 | - 人類開發者 Steve:負責專案規劃、需求分析、錯誤分析、程式實作和系統整合 114 | - Claude (Anthropic):協助系統設計、程式實作和文檔撰寫 115 | - README 由 Claude (Anthropic) 協助生成 116 | 117 | ## 鳴謝 118 | 119 | 特別感謝 Claude (Anthropic) 在本專案開發過程中提供的寶貴建議和協助。這種人機協作的開發模式展現了 AI 輔助開發的潛力。 120 | -------------------------------------------------------------------------------- /ckip_processor.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import logging 3 | import multiprocessing 4 | import os 5 | import pickle 6 | import re 7 | import threading 8 | from concurrent.futures import ProcessPoolExecutor 9 | from dataclasses import dataclass 10 | from datetime import datetime 11 | from functools import lru_cache 12 | from pathlib import Path 13 | from typing import List, Optional, Any 14 | 15 | import msgpack 16 | import torch 17 | import transformers 18 | from ckip_transformers.nlp import CkipWordSegmenter 19 | from tqdm import tqdm 20 | 21 | # 設置 transformers 的日誌級別 22 | transformers.logging.set_verbosity_error() 23 | os.environ['TOKENIZERS_PARALLELISM'] = 'false' 24 | 25 | # 設置日誌 26 | logging.basicConfig( 27 | level=logging.INFO, 28 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 29 | handlers=[ 30 | logging.FileHandler(f'ckip_processor_{datetime.now().strftime("%Y%m%d")}.log'), 31 | logging.StreamHandler() 32 | ] 33 | ) 34 | logger = logging.getLogger(__name__) 35 | 36 | 37 | class CKIPProcessorError(Exception): 38 | """CKIP處理器基礎異常類""" 39 | pass 40 | 41 | 42 | class ModelInitializationError(CKIPProcessorError): 43 | """模型初始化異常""" 44 | pass 45 | 46 | 47 | class ProcessingError(CKIPProcessorError): 48 | """文本處理異常""" 49 | pass 50 | 51 | 52 | @dataclass 53 
@dataclass
class ProcessingMetrics:
    """Counters and timestamps describing one ``segment_parallel`` run."""

    total_texts: int  # number of non-blank texts submitted to the run
    processed_texts: int  # texts actually sent through the segmenter
    cache_hits: int  # texts answered from the segmentation cache
    processing_errors: int  # texts whose segmentation raised
    start_time: float  # POSIX timestamp taken when the run started
    end_time: Optional[float] = None  # POSIX timestamp, None while running

    @property
    def processing_time(self) -> Optional[float]:
        """Return the elapsed seconds, or ``None`` if the run has not ended."""
        # Compare with None explicitly: a timestamp of 0.0 is falsy but valid.
        if self.end_time is None:
            return None
        return self.end_time - self.start_time

    @property
    def cache_hit_rate(self) -> float:
        """Return ``cache_hits / total_texts`` (0.0 when nothing was submitted)."""
        return self.cache_hits / self.total_texts if self.total_texts > 0 else 0.0

    @property
    def error_rate(self) -> float:
        """Return ``processing_errors / total_texts`` (0.0 when nothing was submitted)."""
        return self.processing_errors / self.total_texts if self.total_texts > 0 else 0.0


# NOTE: an earlier pickle-only ``CacheManager`` used to be defined here.  It was
# shadowed by the ``CacheManager`` defined further down (the name is resolved
# when ``CKIPProcessor.__init__`` runs), and its int-keyed API did not match
# how ``CKIPProcessor`` calls the cache, so only the later class is kept.


class CKIPProcessor:
    """Singleton wrapper around a CKIP word segmenter with a persistent cache.

    ``__new__`` always hands back the same instance and ``__init__`` returns
    early once that instance is initialised, so constructor arguments passed
    after the first construction are ignored.
    """

    _instance = None

    # Model names accepted as-is; anything else falls back to 'bert-base'.
    _SUPPORTED_MODELS = ('bert-base', 'bert-tiny', 'albert-base', 'albert-tiny')

    def __new__(cls, *args: Any, **kwargs: Any) -> 'CKIPProcessor':
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
            self,
            cache_dir: str = "ckip_cache",
            batch_size: int = 32,
            num_processes: Optional[int] = None,
            model_name: str = "bert-base",
            device: int = 0,
            use_delim: bool = True,
            max_retries: int = 3
    ) -> None:
        """Initialise the processor (only the first call has any effect).

        Args:
            cache_dir: Directory holding the segmentation cache.
            batch_size: Number of texts sent to the segmenter per call.
            num_processes: Worker count recorded on the instance; defaults to
                ``cpu_count() - 1`` (at least 1).  Segmentation itself runs
                in-process, see ``segment_parallel``.
            model_name: One of 'bert-base', 'bert-tiny', 'albert-base',
                'albert-tiny'.
            device: GPU device id; a negative value selects the CPU.
            use_delim: Forwarded to the segmenter as ``use_delim``.
            max_retries: How many times model loading is attempted.

        Raises:
            ModelInitializationError: If the model cannot be loaded.
        """
        if hasattr(self, 'initialized'):
            return

        self.model_name = model_name
        self.batch_size = batch_size
        self.cache_dir = Path(cache_dir)
        self.num_processes = num_processes or max(1, multiprocessing.cpu_count() - 1)
        self.device = device
        self.use_delim = use_delim
        self.max_retries = max_retries

        # Fall back to the CPU when a GPU was requested but CUDA is missing.
        if self.device >= 0 and not torch.cuda.is_available():
            logger.warning("CUDA not available. Falling back to CPU.")
            self.device = -1

        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_manager = CacheManager(self.cache_dir)

        self._initialize_model()

        self.initialized = True
        logger.info(
            f"Initialized CKIP Processor with {self.num_processes} processes "
            f"on {'GPU:' + str(self.device) if self.device >= 0 else 'CPU'}"
        )

    def _purge_model_caches(self) -> None:
        """Best-effort removal of possibly corrupted CKIP model downloads.

        Only the project-local ``hub`` directory and the CKIP model folders of
        the Hugging Face hub cache are removed; other cached models are left
        alone.
        """
        import shutil

        candidates = [self.cache_dir / "hub"]
        # NOTE(review): assumes the default hub cache location and the
        # ``models--<org>--<name>`` layout; a custom HF_HOME is not handled.
        hf_hub = Path.home() / ".cache" / "huggingface" / "hub"
        if hf_hub.is_dir():
            candidates.extend(hf_hub.glob("models--ckiplab--*"))

        for cache_path in candidates:
            if not cache_path.exists():
                continue
            try:
                shutil.rmtree(str(cache_path))
            except OSError as e:
                logger.warning(f"Failed to clean cache at {cache_path}: {e}")

    def _initialize_model(self) -> None:
        """Load and smoke-test the CKIP word segmenter, retrying on failure.

        Raises:
            ModelInitializationError: If every attempt fails.
        """
        if self.model_name in self._SUPPORTED_MODELS:
            model_name = self.model_name
        else:
            logger.warning(
                f"Unknown CKIP model '{self.model_name}', falling back to 'bert-base'")
            model_name = 'bert-base'

        # Always try at least once, even if max_retries was given as <= 0.
        attempts = max(1, self.max_retries)
        for attempt in range(1, attempts + 1):
            try:
                self.ws_driver = CkipWordSegmenter(model=model_name, device=self.device)

                # Smoke test: one sentence must come back as a list of tokens.
                test_result = self.ws_driver(["測試句子"])
                if not test_result or not isinstance(test_result[0], list):
                    raise ValueError("Model validation failed")
            except Exception as e:
                if attempt == attempts:
                    raise ModelInitializationError(
                        f"Failed to initialize model after {attempts} attempts: {e}") from e
                logger.warning(f"Attempt {attempt} failed: {e}")
                # A broken download is a likely cause; clear it before retrying
                # instead of wiping the user's caches on every start-up.
                self._purge_model_caches()
            else:
                logger.info(f"Successfully initialized CKIP model: {model_name}")
                return

    def _process_batch(self, texts: List[str], metrics: ProcessingMetrics) -> List[str]:
        """Segment one batch, consulting and updating the cache.

        Args:
            texts: Texts of a single batch.
            metrics: Run counters, updated in place.

        Returns:
            One string per input, aligned with ``texts``: the space-joined
            tokens, ``""`` for blank input, or the raw text if segmentation
            failed.
        """
        results: List[str] = [""] * len(texts)
        uncached_texts: List[str] = []
        uncached_indices: List[int] = []

        for i, text in enumerate(texts):
            if not text.strip():
                continue

            cached_result = self.cache_manager.get(text)
            if cached_result is not None:
                results[i] = cached_result
                metrics.cache_hits += 1
            else:
                uncached_texts.append(text)
                uncached_indices.append(i)

        if uncached_texts:
            try:
                segmented = self.ws_driver(uncached_texts, use_delim=self.use_delim)

                # ``i`` is a position in ``texts``; the matching source text is
                # taken from the zip rather than by indexing ``uncached_texts``
                # with it.
                for i, text, tokens in zip(uncached_indices, uncached_texts, segmented):
                    joined = " ".join(tokens)
                    results[i] = joined
                    self.cache_manager.set(text, joined)

                metrics.processed_texts += len(uncached_texts)

            except Exception as e:
                logger.error(f"Error in word segmentation: {e}")
                metrics.processing_errors += len(uncached_texts)
                # Degrade gracefully: hand back the unsegmented text.
                for i, text in zip(uncached_indices, uncached_texts):
                    results[i] = text

        # Persist occasionally so a crash does not lose the whole run.
        if metrics.processed_texts % 100 == 0:
            self.cache_manager.save_cache()

        return results

    def segment_parallel(self, texts: List[str]) -> List[str]:
        """Segment many texts in batches of ``batch_size``.

        Blank or empty texts are dropped before processing, so the result can
        be shorter than ``texts``.

        Batches run sequentially in this process on both GPU and CPU.  The
        segmenter and the cache (which owns a lock) cannot be pickled into
        worker processes, and counters or cache entries updated in a child
        process would never reach the parent.

        Args:
            texts: Texts to segment.

        Returns:
            Space-joined tokens for every non-blank input, in input order.

        Raises:
            ProcessingError: If batching or processing fails unexpectedly.
        """
        if not texts:
            return []

        texts = [text for text in texts if text and text.strip()]
        if not texts:
            return []

        metrics = ProcessingMetrics(
            total_texts=len(texts),
            processed_texts=0,
            cache_hits=0,
            processing_errors=0,
            start_time=datetime.now().timestamp()
        )

        try:
            batches = [
                texts[i:i + self.batch_size]
                for i in range(0, len(texts), self.batch_size)
            ]

            # Keep the progress bar the CPU path has always shown.
            if self.device >= 0:
                batch_iter = batches
            else:
                batch_iter = tqdm(batches, desc="Processing batches on CPU")

            results: List[str] = []
            for batch in batch_iter:
                results.extend(self._process_batch(batch, metrics))

            metrics.end_time = datetime.now().timestamp()
            logger.debug(
                f"Processing completed: {metrics.processed_texts}/{metrics.total_texts} texts processed, "
                f"cache hit rate: {metrics.cache_hit_rate:.2%}, "
                f"error rate: {metrics.error_rate:.2%}, "
                f"processing time: {metrics.processing_time:.2f}s"
            )
            return results

        except Exception as e:
            logger.error(f"Error in parallel segmentation: {e}")
            metrics.end_time = datetime.now().timestamp()
            metrics.processing_errors = len(texts)
            raise ProcessingError(f"Failed to process texts: {e}") from e

        finally:
            # Always flush the cache, whether the run succeeded or not.
            self.cache_manager.save_cache()

    def __del__(self) -> None:
        """Flush the cache when the processor is garbage-collected."""
        if hasattr(self, 'cache_manager'):
            self.cache_manager.save_cache()


# These imports sit mid-module in the original file; they are kept here because
# the classes below depend on them.
from langchain.text_splitter import RecursiveCharacterTextSplitter
from typing import List, Any, Optional, Dict, Tuple
import logging

logger = logging.getLogger(__name__)


@dataclass
class CacheStats:
    """Running statistics of the segmentation cache."""

    total_hits: int = 0
    total_misses: int = 0
    total_items: int = 0
    last_cleanup: Optional[datetime] = None  # time of the last eviction, if any

    @property
    def hit_rate(self) -> float:
        """Return hits / (hits + misses), or 0.0 before any lookup."""
        total = self.total_hits + self.total_misses
        return self.total_hits / total if total > 0 else 0.0


class CacheManager:
    """Disk-backed cache mapping the MD5 of a text to its segmentation.

    Lookups and updates are serialised with a lock.  The cache is stored with
    msgpack; files that msgpack cannot read are retried with pickle.
    """

    MAX_ITEMS = 100000  # eviction is triggered once the cache grows past this
    KEEP_ITEMS = 50000  # number of newest entries kept by an eviction

    def __init__(self, cache_dir: Path):
        """Create the cache directory if needed and load any existing cache.

        Args:
            cache_dir: Directory that holds ``segmentation_cache.pkl``.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_file = self.cache_dir / "segmentation_cache.pkl"
        self.stats_file = self.cache_dir / "cache_stats.json"
        self.cache: Dict[str, str] = {}
        self.modified = False
        self.stats = CacheStats()
        self._lock = threading.Lock()

        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._load_cache()

    def get_cache_key(self, text: str) -> str:
        """Return the hex MD5 digest of ``text`` (a stable key, not a security measure)."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def get(self, text: str) -> Optional[str]:
        """Return the cached segmentation of ``text``, or ``None`` on a miss.

        Deliberately not memoised: memoising would pin a miss (``None``) and
        hide a value stored later through ``set``.
        """
        key = self.get_cache_key(text)
        with self._lock:
            result = self.cache.get(key)
            if result is not None:
                self.stats.total_hits += 1
            else:
                self.stats.total_misses += 1
            return result

    def set(self, text: str, value: str) -> None:
        """Store ``value`` as the segmentation of ``text``."""
        key = self.get_cache_key(text)
        with self._lock:
            # Re-insert so an overwritten entry counts as the newest one.
            self.cache.pop(key, None)
            self.cache[key] = value
            self.stats.total_items = len(self.cache)
            self.modified = True

            if self.stats.total_items > self.MAX_ITEMS:
                # The lock is already held: call the unlocked helper, because
                # taking a non-reentrant lock again would deadlock.
                self._evict_locked()

    def _load_cache(self) -> None:
        """Load the cache file if it exists; start empty on any failure."""
        if not self.cache_file.exists():
            return

        try:
            with self.cache_file.open('rb') as f:
                try:
                    loaded = msgpack.unpack(f)
                except Exception:
                    # Retry with pickle for files msgpack cannot read.
                    # SECURITY: pickle can execute arbitrary code; only load
                    # cache files this application wrote itself.
                    f.seek(0)
                    loaded = pickle.load(f)

            if not isinstance(loaded, dict):
                raise ValueError("cache file does not contain a mapping")

            self.cache = loaded
            self.stats.total_items = len(self.cache)
            logger.info(f"Loaded {len(self.cache)} items from cache")

        except Exception as e:
            logger.warning(f"Failed to load cache: {e}. Starting with empty cache.")
            self.cache = {}

    def save_cache(self) -> None:
        """Write the cache to disk if it changed since the last save.

        The data goes to a temporary file that then replaces the real one, so
        a failed write leaves the previous cache file in place.  Errors are
        logged, not raised.
        """
        with self._lock:
            if not self.modified:
                return

            temp_file = self.cache_file.with_suffix('.tmp')
            try:
                with temp_file.open('wb') as f:
                    msgpack.pack(self.cache, f)

                temp_file.replace(self.cache_file)
                self.modified = False
                logger.info(f"Saved {len(self.cache)} items to cache")

            except Exception as e:
                logger.error(f"Failed to save cache: {e}")
                if temp_file.exists():
                    temp_file.unlink()

    def _evict_locked(self) -> None:
        """Trim the cache to the newest ``KEEP_ITEMS`` entries.

        The caller must hold ``self._lock``.  Dicts keep insertion order, so
        the tail of ``items()`` holds the most recently stored entries.
        """
        if len(self.cache) > self.KEEP_ITEMS:
            self.cache = dict(list(self.cache.items())[-self.KEEP_ITEMS:])
            self.stats.total_items = len(self.cache)
            self.stats.last_cleanup = datetime.now()
            self.modified = True
            logger.info(f"Cleaned up cache, remaining items: {len(self.cache)}")

    def _cleanup_cache(self) -> None:
        """Evict old entries, taking the lock first."""
        with self._lock:
            self._evict_locked()


class CKIPEnhancedTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive splitter that keeps clauses and financial lines intact.

    Spans matching the protect patterns are emitted as chunks of their own
    (split further only when longer than the chunk size); the text between
    them goes through the ordinary recursive splitter.
    """

    _TRAILING_CLAUSE_RE = re.compile(r'第[一二三四五六七八九十]+條\s*$')
    _LEADING_PUNCT_RE = re.compile(r'^[,。;:]')

    def __init__(
            self,
            ckip_processor: 'CKIPProcessor',
            chunk_size: int = 800,
            chunk_overlap: int = 400,
            category: Optional[str] = None,
            **kwargs: Any,
    ) -> None:
        """Configure separators and category-specific protect patterns.

        Args:
            ckip_processor: Processor stored on the instance as
                ``self.ckip_processor``.
            chunk_size: Maximum chunk length in characters.
            chunk_overlap: Overlap between chunks made by the base splitter.
            category: ``'finance'`` adds financial-statement patterns; any
                other value uses the common patterns only.
            **kwargs: Forwarded to ``RecursiveCharacterTextSplitter``.
        """
        # Ordered from coarsest to finest boundary.
        separators = [
            "\n\n",  # paragraph break
            "。\n",  # full stop followed by a newline
            "。",  # full stop
            ";",  # semicolon
            ",",  # comma
            " ",  # space
            ""  # character level
        ]

        self.protect_patterns = self._get_protect_patterns(category)

        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            is_separator_regex=True,
            **kwargs
        )
        self.ckip_processor = ckip_processor

    def _get_protect_patterns(self, category: Optional[str]) -> List[str]:
        """Return the regex patterns whose matches must stay together."""
        common_patterns = [
            # A whole clause: from its heading up to the next heading or the end.
            r"第[一二三四五六七八九十]+條[\s\S]*?(?=第[一二三四五六七八九十]+條|$)"
        ]

        if category == 'finance':
            finance_patterns = [
                # Balance-sheet header
                r"合併資產負債表[\s\S]*?單位:新台幣仟元",
                # Individual statement rows
                r"現金及約當現金.*?(?=\n)",
                r"資產總計.*?(?=\n)",
                # Amount with an optional percentage
                r"\$\s*[\d,]+\s*\d+%?\s*(?=\n)",
                # Reporting date in the ROC calendar
                r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
            ]
            return common_patterns + finance_patterns

        return common_patterns

    def _find_protected_spans(self, text: str) -> List[Tuple[int, int]]:
        """Return sorted ``(start, end)`` spans to protect, with overlaps merged.

        Different patterns can match overlapping regions (for example a
        statement row inside a clause).  Merging keeps the same text from
        being emitted twice.  Spans that merely touch stay separate, so
        consecutive clauses remain individual chunks.
        """
        spans = sorted(
            (match.start(), match.end())
            for pattern in self.protect_patterns
            for match in re.finditer(pattern, text)
            if match.end() > match.start()
        )

        merged: List[Tuple[int, int]] = []
        for start, end in spans:
            if merged and start < merged[-1][1]:
                merged[-1] = (merged[-1][0], max(merged[-1][1], end))
            else:
                merged.append((start, end))
        return merged

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into chunks while keeping protected spans intact.

        On any unexpected error the base splitter's result is returned.
        """
        try:
            spans = self._find_protected_spans(text)

            if not spans:
                chunks = super().split_text(text)
            else:
                chunks = []
                cursor = 0
                for start, end in spans:
                    # Unprotected text before this span.
                    gap = text[cursor:start]
                    if gap.strip():
                        chunks.extend(super().split_text(gap))

                    content = text[start:end]
                    if len(content) > self._chunk_size:
                        chunks.extend(self._split_long_protected_content(content))
                    else:
                        chunks.append(content)
                    cursor = end

                # Unprotected text after the last span.
                tail = text[cursor:]
                if tail.strip():
                    chunks.extend(super().split_text(tail))

            return self._postprocess_chunks(chunks)

        except Exception as e:
            logger.error(f"Error in enhanced splitting: {e}")
            return super().split_text(text)

    def _split_long_protected_content(self, content: str) -> List[str]:
        """Split a protected span that is longer than the chunk size."""
        if re.match(r"第[一二三四五六七八九十]+條", content):
            # Clause text: break at sentence ends.
            return self._split_by_sentences(content)
        if re.search(r"[0-9,]+(?:元|%)", content):
            # Figures: break at line ends so each row keeps its numbers.
            return self._split_preserving_numbers(content)
        return self._split_by_delimiters(content)

    def _pack_pieces(self, pieces: List[str]) -> List[str]:
        """Greedily join consecutive pieces into chunks of at most the chunk size.

        A single piece longer than the chunk size becomes a chunk of its own.
        """
        chunks: List[str] = []
        current: List[str] = []
        current_length = 0

        for piece in pieces:
            if not piece:
                continue
            if current and current_length + len(piece) > self._chunk_size:
                chunks.append(''.join(current))
                current = []
                current_length = 0
            current.append(piece)
            current_length += len(piece)

        if current:
            chunks.append(''.join(current))
        return chunks

    def _split_by_sentences(self, text: str) -> List[str]:
        """Split at sentence-ending punctuation and pack sentences into chunks."""
        # The capturing group keeps each delimiter as its own list element.
        parts = re.split(r'(。|;)', text)
        sentences = [
            parts[i] + (parts[i + 1] if i + 1 < len(parts) else '')
            for i in range(0, len(parts), 2)
        ]
        return self._pack_pieces(sentences)

    def _split_preserving_numbers(self, text: str) -> List[str]:
        """Split at line ends only, so no line of figures is cut in two."""
        return self._pack_pieces(text.splitlines(keepends=True))

    def _split_by_delimiters(self, text: str) -> List[str]:
        """Split generic content with the base recursive splitter."""
        return super().split_text(text)

    def _postprocess_chunks(self, chunks: List[str]) -> List[str]:
        """Clean chunks up without losing any text.

        * Blank chunks are dropped.
        * A clause heading left dangling at the end of a chunk is moved to the
          start of the next chunk rather than discarding the chunk.
        * A chunk starting with punctuation is appended to the previous chunk.
        """
        processed_chunks: List[str] = []
        pending_heading = ""

        for raw_chunk in chunks:
            chunk = raw_chunk.strip()
            if not chunk:
                continue

            if pending_heading:
                # With overlap the next chunk may already start with the heading.
                if not chunk.startswith(pending_heading):
                    chunk = pending_heading + chunk
                pending_heading = ""

            trailing = self._TRAILING_CLAUSE_RE.search(chunk)
            if trailing:
                pending_heading = trailing.group().strip()
                chunk = chunk[:trailing.start()].rstrip()
                if not chunk:
                    continue

            if self._LEADING_PUNCT_RE.search(chunk) and processed_chunks:
                processed_chunks[-1] += chunk
                continue

            processed_chunks.append(chunk)

        # No following chunk to take the heading: keep it instead of losing it.
        if pending_heading:
            if processed_chunks:
                processed_chunks[-1] += pending_heading
            else:
                processed_chunks.append(pending_heading)

        return processed_chunks