├── requirements.txt
├── LICENSE
├── README.md
└── ckip_processor.py


/requirements.txt:
--------------------------------------------------------------------------------
 1 | # Core dependencies
 2 | torch>=2.0.0
 3 | transformers>=4.34.0
 4 | langchain>=0.0.350
 5 | langchain-openai>=0.0.5
 6 | faiss-cpu>=1.7.4
 7 | openai>=1.3.0
 8 | 
 9 | # CKIP Related
10 | ckip-transformers>=0.3.4
11 | tokenizers>=0.15.0
12 | 
13 | # Document Processing
14 | PyMuPDF>=1.23.6
15 | pdfplumber>=0.10.3
16 | pytesseract>=0.3.10
17 | Pillow>=10.1.0
18 | 
19 | # Data Processing and Utilities
20 | numpy>=1.24.0
21 | pandas>=2.1.0
22 | tqdm>=4.66.1
23 | python-dotenv>=1.0.0
24 | pydantic>=2.5.0
25 | msgpack>=1.0.7
26 | 
27 | # BM25
28 | rank_bm25>=0.2.2
29 | 
30 | # Logging and Monitoring
 31 | # logging is part of the Python standard library; no package needs to be installed
32 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2024 lzrong0203
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 金融文件檢索增強生成系統
  2 | 
  3 | 本專案是透過與 Claude (Anthropic) AI 助理的協作開發完成。Claude 協助了系統架構設計、程式碼實作、錯誤處理邏輯以及文檔撰寫等關鍵部分。這種人機協作的方式讓我們能夠快速開發並改進強化系統功能與效能。
  4 | 
  5 | ## 協作亮點
  6 | 
  7 | - 系統架構：與 Claude 共同設計模組化架構
  8 | - 程式碼實作：結合人類專業知識與 AI 最佳實踐
  9 | - 錯誤處理：完善的異常處理機制
 10 | - 文檔撰寫：清晰的文檔和註解說明
 11 | - 持續優化：反覆討論和改進系統設計
 12 | 
 13 | ## 主要特點
 14 | 
 15 | - 結合向量搜索和 BM25 的混合檢索
 16 | - 針對繁體中文優化的 CKIP 文本處理
 17 | - 強化的快取和批次處理
 18 | - 多模態文件處理（PDF、文本）
 19 | - 針對金融和保險領域的專業優化
 20 | - 詳細的錯誤分析和日誌記錄
 21 | 
 22 | ## 系統需求
 23 | 
 24 | - Python 3.9+（requirements.txt 中的 pandas>=2.1.0 不支援 Python 3.8）
 25 | - PyTorch 
 26 | - transformers
 27 | - langchain
 28 | - CKIP Transformers
 29 | - FAISS
 30 | - OpenAI API 金鑰
 31 | 
 32 | ## 安裝方式
 33 | 
 34 | ```bash
 35 | pip install -r requirements.txt
 36 | ```
 37 | 
 38 | 設置環境變數：
 39 | ```bash
 40 | export OPENAI_API_KEY=你的金鑰
 41 | ```
 42 | 
 43 | ## 使用方式
 44 | 
 45 | 基本用法：
 46 | ```bash
 47 | python complete_rag_openai.py \
 48 |   --question_path questions.json \
 49 |   --source_path documents/ \
 50 |   --output_path results.json \
 51 |   --retrieval_mode hybrid
 52 | ```
 53 | 
 54 | 重要參數：
 55 | - `retrieval_mode`: vector | bm25 | hybrid
 56 | - `device`: GPU 設備 ID（-1 表示使用 CPU）
 57 | - `chunk_size`: 文本分塊大小
 58 | - `chunk_overlap`: 分塊重疊度
 59 | 
 60 | ## 配置說明
 61 | 
 62 | 系統可通過 `RAGConfig` 配置以下參數：
 63 | 
 64 | - 文本處理（分塊大小、重疊度）
 65 | - 模型選擇（CKIP、OpenAI）
 66 | - 硬體使用（GPU/CPU）
 67 | - 快取行為
 68 | - 處理批次大小
 69 | 
 70 | ## 系統架構
 71 | 
 72 | 主要組件：
 73 | 
 74 | 1. 文件處理器
 75 |    - PDF 提取
 76 |    - 文本正規化
 77 |    - 快取管理
 78 |   
 79 | 2. CKIP 處理器
 80 |    - 中文斷詞
 81 |    - 文本分割
 82 |    - 批次處理
 83 | 
 84 | 3. 查詢處理器
 85 |    - 查詢擴展
 86 |    - 模式匹配
 87 |    - 結果重排序
 88 | 
 89 | 4. 向量存儲管理器
 90 |    - FAISS 整合
 91 |    - 嵌入管理
 92 |    - 存儲驗證
 93 | 
 94 | ## 錯誤分析
 95 | 
 96 | 包含全面的錯誤分析功能：
 97 | 
 98 | - 類別特定模式分析
 99 | - 詞彙重疊分析
100 | - 上下文評估
101 | - 詳細日誌記錄
102 | 
103 | ## License
104 | 
105 | MIT License
106 | 
107 | ## 參與貢獻
108 | 
109 | 歡迎提交 Issue 和 Pull Request。請確保測試通過並符合專案程式碼風格。
110 | 
111 | ## 開發團隊
112 | 
113 | - 人類開發者 Steve：負責專案規劃、需求分析、錯誤分析、程式實作和系統整合
114 | - Claude (Anthropic)：協助系統設計、程式實作和文檔撰寫
115 | - README 由 Claude (Anthropic) 協助生成
116 | 
117 | ## 鳴謝
118 | 
119 | 特別感謝 Claude (Anthropic) 在本專案開發過程中提供的寶貴建議和協助。這種人機協作的開發模式展現了 AI 輔助開發的潛力。
120 | 


--------------------------------------------------------------------------------
/ckip_processor.py:
--------------------------------------------------------------------------------
  1 | import hashlib
  2 | import logging
  3 | import multiprocessing
  4 | import os
  5 | import pickle
  6 | import re
  7 | import threading
  8 | from concurrent.futures import ProcessPoolExecutor
  9 | from dataclasses import dataclass
 10 | from datetime import datetime
 11 | from functools import lru_cache
 12 | from pathlib import Path
 13 | from typing import List, Optional, Any
 14 | 
 15 | import msgpack
 16 | import torch
 17 | import transformers
 18 | from ckip_transformers.nlp import CkipWordSegmenter
 19 | from tqdm import tqdm
 20 | 
 21 | # 設置 transformers 的日誌級別
 22 | transformers.logging.set_verbosity_error()
 23 | os.environ['TOKENIZERS_PARALLELISM'] = 'false'
 24 | 
 25 | # 設置日誌
 26 | logging.basicConfig(
 27 |     level=logging.INFO,
 28 |     format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
 29 |     handlers=[
 30 |         logging.FileHandler(f'ckip_processor_{datetime.now().strftime("%Y%m%d")}.log'),
 31 |         logging.StreamHandler()
 32 |     ]
 33 | )
 34 | logger = logging.getLogger(__name__)
 35 | 
 36 | 
 37 | class CKIPProcessorError(Exception):
 38 |     """CKIP處理器基礎異常類"""
 39 |     pass
 40 | 
 41 | 
 42 | class ModelInitializationError(CKIPProcessorError):
 43 |     """模型初始化異常"""
 44 |     pass
 45 | 
 46 | 
 47 | class ProcessingError(CKIPProcessorError):
 48 |     """文本處理異常"""
 49 |     pass
 50 | 
 51 | 
 52 | @dataclass
 53 | class ProcessingMetrics:
 54 |     """處理指標數據類"""
 55 |     total_texts: int
 56 |     processed_texts: int
 57 |     cache_hits: int
 58 |     processing_errors: int
 59 |     start_time: float
 60 |     end_time: Optional[float] = None
 61 | 
 62 |     @property
 63 |     def processing_time(self) -> Optional[float]:
 64 |         if self.end_time:
 65 |             return self.end_time - self.start_time
 66 |         return None
 67 | 
 68 |     @property
 69 |     def cache_hit_rate(self) -> float:
 70 |         return self.cache_hits / self.total_texts if self.total_texts > 0 else 0
 71 | 
 72 |     @property
 73 |     def error_rate(self) -> float:
 74 |         return self.processing_errors / self.total_texts if self.total_texts > 0 else 0
 75 | 
 76 | 
 77 | class CacheManager:
 78 |     """緩存管理器"""
 79 | 
 80 |     def __init__(self, cache_dir: Path):
 81 |         self.cache_dir = cache_dir
 82 |         self.cache_file = cache_dir / "segmentation_cache.pkl"
 83 |         self.cache: Dict[int, str] = {}
 84 |         self.modified = False
 85 |         self._load_cache()
 86 | 
 87 |     def _load_cache(self) -> None:
 88 |         """載入緩存"""
 89 |         if self.cache_file.exists():
 90 |             try:
 91 |                 with self.cache_file.open('rb') as f:
 92 |                     self.cache = pickle.load(f)
 93 |                 logger.info(f"Loaded {len(self.cache)} items from cache")
 94 |             except Exception as e:
 95 |                 logger.warning(f"Failed to load cache: {e}. Starting with empty cache.")
 96 |                 self.cache = {}
 97 | 
 98 |     def save_cache(self) -> None:
 99 |         """安全地保存緩存"""
100 |         if not self.modified:
101 |             return
102 | 
103 |         temp_file = self.cache_file.with_suffix('.tmp')
104 |         try:
105 |             with temp_file.open('wb') as f:
106 |                 pickle.dump(self.cache, f)
107 |             temp_file.replace(self.cache_file)
108 |             self.modified = False
109 |             logger.info(f"Saved {len(self.cache)} items to cache")
110 |         except Exception as e:
111 |             logger.error(f"Failed to save cache: {e}")
112 |             if temp_file.exists():
113 |                 temp_file.unlink()
114 | 
115 |     def get(self, key: int) -> Optional[str]:
116 |         """獲取緩存項"""
117 |         return self.cache.get(key)
118 | 
119 |     def set(self, key: int, value: str) -> None:
120 |         """設置緩存項"""
121 |         self.cache[key] = value
122 |         self.modified = True
123 | 
124 | 
class CKIPProcessor:
    """Singleton wrapper around a CKIP word segmenter with result caching.

    Only one instance exists per process: constructing the class again returns
    the same object, and the arguments of later constructions are ignored.
    """
    _instance = None

    def __new__(cls, *args: Any, **kwargs: Any) -> 'CKIPProcessor':
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(
            self,
            cache_dir: str = "ckip_cache",
            batch_size: int = 32,
            num_processes: Optional[int] = None,
            model_name: str = "bert-base",
            device: int = 0,
            use_delim: bool = True,
            max_retries: int = 3
    ) -> None:
        """
        Initialize the CKIP processor.

        Args:
            cache_dir: Directory holding the segmentation cache.
            batch_size: Number of texts passed to the model per call.
            num_processes: Recorded for configuration/logging. Segmentation
                runs in the current process (see ``segment_parallel``).
            model_name: CKIP model name.
            device: GPU device id; a negative value selects the CPU.
            use_delim: Forwarded to the segmenter as ``use_delim``.
            max_retries: Maximum number of model initialization attempts.

        Raises:
            ModelInitializationError: If the model cannot be initialized.
        """
        # The singleton is configured once; repeated construction is a no-op.
        if hasattr(self, 'initialized'):
            return

        self.model_name = model_name
        self.batch_size = batch_size
        self.cache_dir = Path(cache_dir)
        self.num_processes = num_processes or max(1, multiprocessing.cpu_count() - 1)
        self.device = device
        self.use_delim = use_delim
        self.max_retries = max_retries

        # Fall back to the CPU when a GPU was requested but CUDA is unavailable.
        if self.device >= 0 and not torch.cuda.is_available():
            logger.warning("CUDA not available. Falling back to CPU.")
            self.device = -1

        # Create the cache directory and the cache manager.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.cache_manager = CacheManager(self.cache_dir)

        # Load and validate the model.
        self._initialize_model()

        self.initialized = True
        logger.info(
            f"Initialized CKIP Processor with {self.num_processes} processes "
            f"on {'GPU:' + str(self.device) if self.device >= 0 else 'CPU'}"
        )

    def _initialize_model(self) -> None:
        """Create the word segmenter and verify it with a short test sentence.

        Raises:
            ModelInitializationError: If every attempt fails.
        """
        # Only the project-local "hub" directory is cleared. The user-wide
        # ~/.cache/huggingface and ~/.cache/torch directories are shared with
        # every other project on the machine and must never be deleted here.
        local_hub = self.cache_dir / "hub"
        if local_hub.exists():
            try:
                import shutil
                shutil.rmtree(str(local_hub))
            except Exception as e:
                logger.warning(f"Failed to clean cache at {local_hub}: {e}")

        # Unknown model names fall back to 'bert-base'.
        supported_models = ('bert-base', 'bert-tiny', 'albert-base', 'albert-tiny')
        if self.model_name in supported_models:
            model_name = self.model_name
        else:
            logger.warning(
                f"Unknown model name {self.model_name!r}; falling back to 'bert-base'"
            )
            model_name = 'bert-base'

        last_error: Optional[Exception] = None
        for attempt in range(self.max_retries):
            try:
                driver = CkipWordSegmenter(model=model_name, device=self.device)

                # Validate the model: one input sentence must yield a token list.
                test_result = driver(["測試句子"])
                if not test_result or not isinstance(test_result[0], list):
                    raise ValueError("Model validation failed")
            except Exception as e:
                last_error = e
                logger.warning(f"Attempt {attempt + 1} failed: {e}")
                continue

            self.ws_driver = driver
            logger.info(f"Successfully initialized CKIP model: {model_name}")
            return

        raise ModelInitializationError(
            f"Failed to initialize model after {self.max_retries} attempts: {last_error}"
        ) from last_error

    def _process_batch(self, texts: List[str], metrics: 'ProcessingMetrics') -> List[str]:
        """Segment one batch of texts, consulting the cache first.

        Args:
            texts: Texts of a single batch.
            metrics: Metrics object whose counters are updated in place.

        Returns:
            One result per input text, in input order. Blank texts yield "";
            when the model call fails, the affected texts are returned unchanged.
        """
        results: List[str] = [""] * len(texts)
        pending_positions: List[int] = []
        pending_texts: List[str] = []

        # Cache lookup; blank texts keep the "" placeholder.
        for position, text in enumerate(texts):
            if not text.strip():
                continue

            cached_result = self.cache_manager.get(text)
            if cached_result is not None:
                results[position] = cached_result
                metrics.cache_hits += 1
            else:
                pending_positions.append(position)
                pending_texts.append(text)

        # Segment the texts that were not cached.
        if pending_texts:
            try:
                segmented = self.ws_driver(pending_texts, use_delim=self.use_delim)

                # Pair each result with its own source text. A batch position
                # must not be used to index the (shorter) list of pending texts.
                for position, text, tokens in zip(pending_positions, pending_texts, segmented):
                    joined = " ".join(tokens)
                    results[position] = joined
                    self.cache_manager.set(text, joined)

                metrics.processed_texts += len(pending_texts)

            except Exception as e:
                logger.error(f"Error in word segmentation: {e}")
                metrics.processing_errors += len(pending_texts)
                # Best effort: hand back the unsegmented originals.
                for position, text in zip(pending_positions, pending_texts):
                    results[position] = text

        # Occasional checkpoint of the cache to disk.
        if metrics.processed_texts % 100 == 0:
            self.cache_manager.save_cache()

        return results

    def segment_parallel(self, texts: List[str]) -> List[str]:
        """Segment many texts batch by batch.

        Empty and whitespace-only texts are dropped before processing, so the
        result can be shorter than the input.

        All batches run in the current process on both GPU and CPU. Submitting
        the bound ``_process_batch`` method to a process pool requires pickling
        the whole processor, which fails because the cache manager holds a
        lock; cache and metric updates made in worker processes would also be
        lost.

        Args:
            texts: Texts to segment.

        Returns:
            Segmented texts with tokens joined by single spaces.

        Raises:
            ProcessingError: If processing fails unexpectedly.
        """
        if not texts:
            return []

        # Drop empty texts.
        texts = [text for text in texts if text and text.strip()]
        if not texts:
            return []

        metrics = ProcessingMetrics(
            total_texts=len(texts),
            processed_texts=0,
            cache_hits=0,
            processing_errors=0,
            start_time=datetime.now().timestamp()
        )

        try:
            batches = [
                texts[i:i + self.batch_size]
                for i in range(0, len(texts), self.batch_size)
            ]

            # A progress bar is shown on the CPU path only.
            if self.device >= 0:
                batch_iter = batches
            else:
                batch_iter = tqdm(batches, desc="Processing batches on CPU")

            results: List[str] = []
            for batch in batch_iter:
                results.extend(self._process_batch(batch, metrics))

            metrics.end_time = datetime.now().timestamp()
            logger.debug(
                f"Processing completed: {metrics.processed_texts}/{metrics.total_texts} "
                f"texts processed, {metrics.cache_hits} cache hits, "
                f"{metrics.processing_errors} errors, "
                f"{metrics.end_time - metrics.start_time:.2f}s"
            )

            return results

        except Exception as e:
            logger.error(f"Error in parallel segmentation: {e}")
            metrics.end_time = datetime.now().timestamp()
            metrics.processing_errors = len(texts)
            raise ProcessingError(f"Failed to process texts: {e}") from e

        finally:
            # Always persist the cache, whether the run succeeded or not.
            self.cache_manager.save_cache()

    def __del__(self) -> None:
        """Persist the cache when the processor is garbage-collected."""
        if hasattr(self, 'cache_manager'):
            try:
                self.cache_manager.save_cache()
            except Exception:
                # During interpreter shutdown the facilities needed for
                # saving may already be gone; nothing useful can be done.
                pass
350 | 
351 | 
352 | from langchain.text_splitter import RecursiveCharacterTextSplitter
353 | from typing import List, Any, Optional, Dict
354 | import logging
355 | 
356 | logger = logging.getLogger(__name__)
357 | 
358 | 
@dataclass
class CacheStats:
    """Running counters describing cache usage."""
    total_hits: int = 0
    total_misses: int = 0
    total_items: int = 0
    last_cleanup: Optional[datetime] = None

    @property
    def hit_rate(self) -> float:
        """Hits divided by all lookups; 0 when no lookup has been recorded."""
        lookups = self.total_hits + self.total_misses
        if lookups > 0:
            return self.total_hits / lookups
        return 0
371 | 
372 | 
class CacheManager:
    """Thread-safe segmentation cache persisted with msgpack.

    Entries are keyed by the MD5 digest of the source text. When the number
    of entries exceeds ``max_items``, the oldest entries are evicted until
    only the ``keep_items`` most recently inserted ones remain.
    """

    def __init__(self, cache_dir: Path, max_items: int = 100000, keep_items: int = 50000):
        """
        Args:
            cache_dir: Directory holding the cache file; created if missing.
            max_items: Entry count above which a cleanup is triggered.
            keep_items: Number of most recent entries kept by a cleanup.
        """
        self.cache_dir = Path(cache_dir)
        self.cache_file = self.cache_dir / "segmentation_cache.pkl"
        self.stats_file = self.cache_dir / "cache_stats.json"
        self.cache: Dict[str, str] = {}
        self.modified = False
        self.stats = CacheStats()
        self.max_items = max_items
        self.keep_items = keep_items
        # Re-entrant lock: set() triggers _cleanup_cache() while already
        # holding the lock, which would deadlock with a plain Lock.
        self._lock = threading.RLock()

        # Make sure the directory exists before trying to load from it.
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self._load_cache()

    def get_cache_key(self, text: str) -> str:
        """Return a stable cache key: the MD5 hex digest of the UTF-8 text."""
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    def get(self, text: str) -> Optional[str]:
        """Return the cached result for ``text``, or ``None`` on a miss.

        The lookup is deliberately not memoized: a memoized miss would keep
        returning ``None`` even after ``set`` has stored a value for the text.
        """
        with self._lock:
            result = self.cache.get(self.get_cache_key(text))
            if result is not None:
                self.stats.total_hits += 1
            else:
                self.stats.total_misses += 1
            return result

    def set(self, text: str, value: str) -> None:
        """Store ``value`` as the result for ``text``."""
        with self._lock:
            self.cache[self.get_cache_key(text)] = value
            # Use the real size so overwriting a key does not inflate the count.
            self.stats.total_items = len(self.cache)
            self.modified = True

            # Evict old entries once the cache has grown past the threshold.
            if self.stats.total_items > self.max_items:
                self._cleanup_cache()

    def _load_cache(self) -> None:
        """Load the cache file if it exists; start empty on any failure."""
        if not self.cache_file.exists():
            return

        try:
            with self.cache_file.open('rb') as f:
                try:
                    loaded = msgpack.unpack(f)
                except Exception:
                    # Not msgpack: fall back to the pickle format.
                    # NOTE(security): pickle can execute arbitrary code while
                    # loading; only use cache files from a trusted location.
                    f.seek(0)
                    loaded = pickle.load(f)

            if not isinstance(loaded, dict):
                raise ValueError(f"unexpected cache content of type {type(loaded).__name__}")

            self.cache = loaded
            self.stats.total_items = len(self.cache)
            logger.info(f"Loaded {len(self.cache)} items from cache")

        except Exception as e:
            logger.warning(f"Failed to load cache: {e}. Starting with empty cache.")
            self.cache = {}

    def save_cache(self) -> None:
        """Write the cache to disk via a temporary file, if it has changed."""
        with self._lock:
            if not self.modified:
                return

            temp_file = self.cache_file.with_suffix('.tmp')
            try:
                with temp_file.open('wb') as f:
                    msgpack.pack(self.cache, f)

                # Swap the finished temporary file into place.
                temp_file.replace(self.cache_file)
                self.modified = False
                logger.info(f"Saved {len(self.cache)} items to cache")

            except Exception as e:
                logger.error(f"Failed to save cache: {e}")
                if temp_file.exists():
                    temp_file.unlink()

    def _cleanup_cache(self) -> None:
        """Evict the oldest entries, keeping the ``keep_items`` newest ones."""
        with self._lock:
            keep = max(0, self.keep_items)
            if len(self.cache) > keep:
                # Dicts preserve insertion order, so the tail of the item list
                # holds the most recently inserted entries.
                items = list(self.cache.items())
                self.cache = dict(items[len(items) - keep:])
                self.stats.total_items = len(self.cache)
                self.stats.last_cleanup = datetime.now()
                self.modified = True
                logger.info(f"Cleaned up cache, remaining items: {len(self.cache)}")
470 | 
471 | 
class CKIPEnhancedTextSplitter(RecursiveCharacterTextSplitter):
    """Recursive text splitter that keeps protected passages together.

    Passages matching the protect patterns (complete clauses, plus financial
    statement fragments for the 'finance' category) are emitted as their own
    chunks; the text between them is split by the base class.
    """

    def __init__(
            self,
            ckip_processor: 'CKIPProcessor',
            chunk_size: int = 800,
            chunk_overlap: int = 400,
            category: Optional[str] = None,
            **kwargs: Any,
    ) -> None:
        """
        Args:
            ckip_processor: CKIP processor instance, stored on the splitter.
            chunk_size: Maximum chunk length passed to the base splitter.
            chunk_overlap: Overlap between chunks passed to the base splitter.
            category: Document category; 'finance' enables extra patterns.
            **kwargs: Forwarded to the base class constructor.
        """
        # Separators tried by the base class, from coarsest to finest.
        separators = [
            "\n\n",  # paragraph break
            "。\n",  # full stop followed by a line break
            "。",  # full stop
            "；",  # semicolon
            "，",  # comma
            " ",  # space
            ""  # character level
        ]

        # Protect patterns depend on the document category.
        self.protect_patterns = self._get_protect_patterns(category)

        super().__init__(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=separators,
            is_separator_regex=True,
            **kwargs
        )
        self.ckip_processor = ckip_processor

    def _get_protect_patterns(self, category: Optional[str]) -> List[str]:
        """Return the regex patterns whose matches must be kept together."""
        common_patterns = [
            # A complete clause: from its heading up to the next heading or the end.
            r"第[一二三四五六七八九十]+條[\s\S]*?(?=第[一二三四五六七八九十]+條|$)"
        ]

        if category == 'finance':
            finance_patterns = [
                # Balance sheet header
                r"合併資產負債表[\s\S]*?單位：新台幣仟元",
                # Financial line items
                r"現金及約當現金.*?(?=\n)",
                r"資產總計.*?(?=\n)",
                # Amount lines
                r"\$\s*[\d,]+\s*\d+%?\s*(?=\n)",
                # Reporting date
                r"民國\s*\d+\s*年\s*\d+\s*月\s*\d+\s*日"
            ]
            return common_patterns + finance_patterns

        return common_patterns

    def _find_protected_spans(self, text: str) -> List[List[int]]:
        """Return sorted, non-overlapping [start, end] spans of protected text."""
        spans = sorted(
            (match.start(), match.end())
            for pattern in self.protect_patterns
            for match in re.finditer(pattern, text)
            if match.end() > match.start()
        )

        # Merge overlapping matches (e.g. a date inside a clause) so that no
        # part of the text is emitted twice. Spans that merely touch stay
        # separate, which keeps consecutive clauses apart.
        merged: List[List[int]] = []
        for start, end in spans:
            if merged and start < merged[-1][1]:
                merged[-1][1] = max(merged[-1][1], end)
            else:
                merged.append([start, end])
        return merged

    def split_text(self, text: str) -> List[str]:
        """Split ``text`` into chunks while keeping protected passages intact.

        Falls back to the base class splitter if the enhanced logic raises.
        """
        try:
            # 1. Locate the protected passages.
            spans = self._find_protected_spans(text)

            if spans:
                # 2. Walk the text, alternating between ordinary and protected parts.
                chunks: List[str] = []
                cursor = 0

                for start, end in spans:
                    # Ordinary text in front of the protected passage.
                    interim_text = text[cursor:start]
                    if interim_text.strip():
                        chunks.extend(super().split_text(interim_text))

                    content = text[start:end]
                    if len(content) > self._chunk_size:
                        # Too long for one chunk: split it conservatively.
                        chunks.extend(self._split_long_protected_content(content))
                    else:
                        chunks.append(content)

                    cursor = end

                # Ordinary text after the last protected passage.
                remaining = text[cursor:]
                if remaining.strip():
                    chunks.extend(super().split_text(remaining))
            else:
                # Nothing to protect: plain recursive splitting.
                chunks = super().split_text(text)

            # 3. Clean up and validate the chunks.
            return self._postprocess_chunks(chunks)

        except Exception as e:
            logger.error(f"Error in enhanced splitting: {e}")
            return super().split_text(text)

    def _split_long_protected_content(self, content: str) -> List[str]:
        """Split a protected passage that is longer than the chunk size."""
        if re.match(r"第[一二三四五六七八九十]+條", content):
            # Clauses are split on sentence boundaries.
            return self._split_by_sentences(content)
        elif re.search(r"[0-9,]+(?:元|%)", content):
            # Content with amounts or percentages is split on line boundaries.
            return self._split_preserving_numbers(content)
        else:
            # Everything else is split on punctuation and line breaks.
            return self._split_by_delimiters(content)

    def _pack_pieces(self, pieces: List[str]) -> List[str]:
        """Greedily join consecutive pieces into chunks of at most the chunk size.

        A single piece longer than the chunk size becomes a chunk of its own.
        """
        chunks: List[str] = []
        current: List[str] = []
        current_length = 0

        for piece in pieces:
            if current and current_length + len(piece) > self._chunk_size:
                chunks.append(''.join(current))
                current = []
                current_length = 0

            current.append(piece)
            current_length += len(piece)

        if current:
            chunks.append(''.join(current))

        return chunks

    def _split_by_sentences(self, text: str) -> List[str]:
        """Split on sentence boundaries (。 and ；), keeping sentences whole."""
        # The capturing group makes re.split return text and delimiters in turn.
        parts = re.split(r'(。|；)', text)
        sentences = []
        for i in range(0, len(parts), 2):
            sentence = parts[i]
            if i + 1 < len(parts):
                sentence += parts[i + 1]  # re-attach the delimiter
            sentences.append(sentence)
        return self._pack_pieces(sentences)

    def _split_preserving_numbers(self, text: str) -> List[str]:
        """Split on line boundaries so that a figure stays on its own line.

        Text without line breaks is split on sentence boundaries instead.
        """
        lines = text.splitlines(keepends=True)
        if len(lines) <= 1:
            return self._split_by_sentences(text)
        return self._pack_pieces(lines)

    def _split_by_delimiters(self, text: str) -> List[str]:
        """Split after 。 ； ， or a line break, keeping the delimiter attached."""
        pieces = re.findall(r'[^。；，\n]*[。；，\n]|[^。；，\n]+', text)
        return self._pack_pieces(pieces)

    def _postprocess_chunks(self, chunks: List[str]) -> List[str]:
        """Strip chunks, drop empty ones and repair awkward chunk boundaries."""
        trailing_heading = re.compile(r'第[一二三四五六七八九十]+條\s*$')
        processed_chunks: List[str] = []
        carried_heading = ""

        for chunk in chunks:
            chunk = chunk.strip()
            if not chunk:
                continue

            # Put a heading cut off the previous chunk in front of this one,
            # unless this chunk already starts with it.
            if carried_heading:
                if not chunk.startswith(carried_heading):
                    chunk = carried_heading + chunk
                carried_heading = ""

            # A clause heading must not dangle at the end of a chunk: move it
            # to the next chunk and keep the rest of this chunk's text.
            match = trailing_heading.search(chunk)
            if match:
                carried_heading = match.group().strip()
                chunk = chunk[:match.start()].strip()
                if not chunk:
                    continue

            # A chunk starting with punctuation continues the previous chunk.
            if processed_chunks and re.search(r'^[，。；：]', chunk):
                processed_chunks[-1] += chunk
                continue

            processed_chunks.append(chunk)

        # No chunk followed the last heading; keep it rather than lose it.
        if carried_heading:
            processed_chunks.append(carried_heading)

        return processed_chunks
637 | 


--------------------------------------------------------------------------------