├── maru_lang
├── py.typed
├── api
│   ├── __init__.py
│   └── endpoints
│   │   ├── __init__.py
│   │   ├── user_group.py
│   │   └── auth.py
├── core
│   ├── __init__.py
│   ├── relation_db
│   │   ├── models
│   │   │   ├── __init__.py
│   │   │   ├── chat.py
│   │   │   ├── auth.py
│   │   │   └── documents.py
│   │   ├── __init__.py
│   │   └── connection.py
│   └── vector_db
│   │   ├── __init__.py
│   │   ├── factory.py
│   │   ├── retrieve_document.py
│   │   └── base.py
├── commands
│   ├── __init__.py
│   ├── transfer.py
│   └── tree.py
├── pipelines
│   ├── __init__.py
│   ├── chat
│   │   └── __init__.py
│   ├── ingest
│   │   └── __init__.py
│   └── base.py
├── schemas
│   ├── __init__.py
│   ├── chat.py
│   ├── auth.py
│   └── ingest.py
├── services
│   ├── __init__.py
│   ├── admin.py
│   ├── chat.py
│   └── ingest.py
├── dependencies
│   ├── __init__.py
│   ├── llm.py
│   ├── chat.py
│   ├── ingest.py
│   ├── email.py
│   └── auth.py
├── pluggable
│   ├── embedders
│   │   ├── __init__.py
│   │   └── manager.py
│   ├── rerankers
│   │   ├── __init__.py
│   │   └── manager.py
│   ├── llms
│   │   └── __init__.py
│   ├── retrievers
│   │   └── __init__.py
│   ├── __init__.py
│   ├── agents
│   │   ├── __init__.py
│   │   ├── builtin
│   │   │   ├── __init__.py
│   │   │   └── intent_extractor.py
│   │   └── agent_factory.py
│   ├── models
│   │   ├── chunker.py
│   │   ├── embedder.py
│   │   ├── reranker.py
│   │   ├── __init__.py
│   │   ├── llm.py
│   │   └── loader.py
│   ├── configs
│   │   ├── __init__.py
│   │   ├── rag_loader.py
│   │   ├── chunker_config.py
│   │   ├── embedder_config.py
│   │   ├── loader_config.py
│   │   └── reranker_config.py
│   ├── chunkers
│   │   ├── base.py
│   │   ├── sentence.py
│   │   └── paragraph.py
│   └── loaders
│   │   ├── txt_parser.py
│   │   ├── markdown_parser.py
│   │   ├── base.py
│   │   ├── pdf_parser.py
│   │   ├── json_parser.py
│   │   ├── docx_parser.py
│   │   ├── html_parser.py
│   │   ├── yaml_parser.py
│   │   ├── xlsx_parser.py
│   │   ├── csv_parser.py
│   │   ├── pptx_parser.py
│   │   └── xml_parser.py
├── enums
│   ├── chat.py
│   ├── agents.py
│   ├── documents.py
│   ├── configs.py
│   ├── auth.py
│   └── __init__.py
├── templates
│   ├── yaml
│   │   ├── embedder_config.yaml
│   │   ├── openai.yaml
│   │   ├── local.yaml
│   │   ├── chunker_config.yaml
│   │   ├── agents
│   │   │   ├── mcps
│   │   │   │   └── agents_firecrawl_mcp.yaml
│   │   │   ├── agents_calculator.yaml
│   │   │   └── builtin
│   │   │   │   ├── agents_knowledge_search.yaml
│   │   │   │   ├── agents_intent_extractor.yaml
│   │   │   │   ├── agents_keyword_extractor.yaml
│   │   │   │   ├── agents_group_classifier.yaml
│   │   │   │   └── agents_response.yaml
│   │   ├── reranker_config.yaml
│   │   ├── loader_config.yaml
│   │   ├── agents_build_selector.yaml
│   │   ├── rag_config.yaml
│   │   ├── llm_reranker.yaml
│   │   └── system_config.yaml
│   └── python
│   │   ├── main.py
│   │   ├── calculator_agent.py
│   │   └── custom_parser.py
├── __init__.py
├── models
│   ├── ingest.py
│   ├── configs
│   │   ├── __init__.py
│   │   └── group.py
│   ├── vector_db.py
│   └── agents.py
├── utils
│   ├── __init__.py
│   ├── distribution.py
│   ├── document.py
│   └── security.py
└── configs
│   └── __init__.py
├── LICENSE
└── pyproject.toml

/maru_lang/py.typed:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/api/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/core/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/commands/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/pipelines/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/schemas/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/services/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/api/endpoints/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/dependencies/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/__init__.py:
--------------------------------------------------------------------------------
from .auth import *
from .documents import *
from .chat import *

--------------------------------------------------------------------------------
/maru_lang/pipelines/chat/__init__.py:
--------------------------------------------------------------------------------
"""
Chat Pipeline
"""
from maru_lang.pipelines.chat.pipeline import ChatPipeline

__all__ = ["ChatPipeline"]

--------------------------------------------------------------------------------
/maru_lang/pipelines/ingest/__init__.py:
--------------------------------------------------------------------------------
"""
Ingest Pipeline
"""
from maru_lang.pipelines.ingest.pipeline import IngestPipeline, IngestResult

__all__ = ["IngestPipeline", "IngestResult"]

--------------------------------------------------------------------------------
/maru_lang/core/vector_db/__init__.py:
--------------------------------------------------------------------------------
from .base import VectorDB
from .chroma import ChromaVectorDB
from .retrieve_document import RetrieveDocument

__all__ = ["VectorDB", "ChromaVectorDB", "RetrieveDocument"]
--------------------------------------------------------------------------------
/maru_lang/pluggable/embedders/__init__.py:
--------------------------------------------------------------------------------
"""Embedder for handling embedding models."""

from .manager import (
    Embedder,
    get_embedder,
)

__all__ = [
    "Embedder",
    "get_embedder",
]

--------------------------------------------------------------------------------
/maru_lang/pluggable/rerankers/__init__.py:
--------------------------------------------------------------------------------
"""Reranker for handling reranking models."""

from .manager import (
    Reranker,
    get_reranker,
)

__all__ = [
    "Reranker",
    "get_reranker",
]

--------------------------------------------------------------------------------
/maru_lang/pluggable/llms/__init__.py:
--------------------------------------------------------------------------------
# LLM client
from .client import LLMServerClient

# LLM server manager
from .server_manager import LLMServerManager

__all__ = [
    "LLMServerClient",
    "LLMServerManager"
]

--------------------------------------------------------------------------------
/maru_lang/pluggable/retrievers/__init__.py:
--------------------------------------------------------------------------------
"""Retriever for handling search operations."""

from .manager import (
    Retriever,
    get_retriever,
    RetriveMethod,
)

__all__ = [
    "Retriever",
    "get_retriever",
    "RetriveMethod",
]

--------------------------------------------------------------------------------
/maru_lang/enums/chat.py:
--------------------------------------------------------------------------------
from enum import Enum


class ChatProcessStep(str, Enum):
    """Chat processing steps"""
    START = "start"
    AGENT_SELECTION = "agent_selection"
    AGENT_EXECUTION = "agent_execution"
    ANSWER_GENERATION = "answer_generation"
    COMPLETED = "completed"
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/embedder_config.yaml:
--------------------------------------------------------------------------------
# Embedder Configuration
# Configure embedding models and device preferences

# Default embedding model for all document groups
# Can be overridden per-group in rag_config.yaml
default_model: BAAI/bge-m3

# Device selection (null => auto-detect: cuda > mps > cpu)
device: null

--------------------------------------------------------------------------------
/maru_lang/enums/agents.py:
--------------------------------------------------------------------------------
"""
Agent-related enums
"""
from enum import Enum


class LLMFallbackStrategy(Enum):
    """LLM fallback strategies when the specified LLM server is not available"""
    ANY_AVAILABLE = "any_available"  # Use any available LLM server
    ERROR = "error"                  # Raise an error and stop execution
--------------------------------------------------------------------------------
/maru_lang/__init__.py:
--------------------------------------------------------------------------------
"""MaruLang - Advanced AI Agent Framework with RAG and multi-agent system"""

__version__ = "0.0.0"

from maru_lang.app import MaruLangApp, default_app

# FastAPI app instance
app = default_app.get_fastapi_app()

__all__ = [
    "MaruLangApp",
    "default_app",
    "app",
    "__version__",
]
--------------------------------------------------------------------------------
/maru_lang/enums/documents.py:
--------------------------------------------------------------------------------
from enum import IntEnum


class PermissionAction(IntEnum):
    READ = 1
    WRITE = 2
    MANAGE = 3  # management permissions (sync, base_path changes, etc.)


class DocumentStatus(IntEnum):
    PROCESSING = 1  # processing (awaiting parsing/chunking/embedding)
    ACTIVE = 2      # active (embedding complete, searchable)
    INACTIVE = 3    # inactive (not searchable)

--------------------------------------------------------------------------------
/maru_lang/enums/configs.py:
--------------------------------------------------------------------------------
"""
Configuration type enums
"""
from enum import Enum


class ConfigType(Enum):
    """Configuration types"""
    LLMS = "llms"
    RAGS = "rags"  # RAG settings (retriever + groups)
    AGENTS = "agents"
    LOADERS = "loaders"
    CHUNKERS = "chunkers"
    EMBEDDERS = "embedders"
    RERANKERS = "rerankers"
--------------------------------------------------------------------------------
/maru_lang/pluggable/__init__.py:
--------------------------------------------------------------------------------
"""
Pluggable components for extensibility

This package contains all pluggable/extensible components:
- loaders: File parsers (txt, pdf, docx, etc.)
- chunkers: Text chunking strategies (paragraph, sentence, etc.)
- embedders: Embedding models management
- rerankers: Result reranking models
- configs: Configuration loaders for pluggable components
- models: Data models for configurations
"""

--------------------------------------------------------------------------------
/maru_lang/enums/auth.py:
--------------------------------------------------------------------------------
from __future__ import annotations
from enum import Enum


class UserRoleCode(Enum):
    # Role codes created by default; users can define additional ones.
    EDITOR = 'editor'
    ADMIN = 'admin'

    @classmethod
    def is_valid_role(cls, role_name: str) -> bool:
        try:
            cls(role_name)
            return True
        except ValueError as e:
            print(e)
            return False

--------------------------------------------------------------------------------
/maru_lang/enums/__init__.py:
--------------------------------------------------------------------------------
"""
Enums for the LLM Chatbot application
"""
from .agents import LLMFallbackStrategy
from .auth import UserRoleCode
from .chat import ChatProcessStep
from .configs import ConfigType
from .documents import PermissionAction, DocumentStatus

__all__ = [
    "LLMFallbackStrategy",
    "UserRoleCode",
    "ChatProcessStep",
    "ConfigType",
    "PermissionAction",
    "DocumentStatus",
]
--------------------------------------------------------------------------------
/maru_lang/models/ingest.py:
--------------------------------------------------------------------------------
from dataclasses import dataclass
from typing import Optional


@dataclass(frozen=True)
class PipelineConfig:
    model_name: str
    model_dim: int
    normalize_ver: str
    pooling: str
    lang_hint: Optional[str] = None
    pipeline_version: Optional[str] = None  # recorded as metadata


@dataclass(frozen=True)
class ChunkInput:
    number: int  # page/paragraph/slot index
    content: str
    meta: Optional[dict] = None

--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/__init__.py:
--------------------------------------------------------------------------------
"""
Agent components for the chatbot system
"""
from .base import BaseAgent
from .agent_selector import AgentSelector
from .agent_executor import AgentExecutor
from .agent_factory import AgentFactory
from .mcp_client_agent import MCPClientAgent

__all__ = [
    # Core components
    "BaseAgent",
    "AgentSelector",
    "AgentExecutor",
    "AgentFactory",
    # Individual agents
    "MCPClientAgent",
]

--------------------------------------------------------------------------------
/maru_lang/pluggable/models/chunker.py:
--------------------------------------------------------------------------------
"""
Chunker configuration models
"""
from dataclasses import dataclass, field
from typing import Dict, Any


@dataclass
class ChunkerConfig:
    """
    Chunker configuration

    Constructor parameters for each chunker
    """
    # Mapping of chunker name -> constructor parameters
    # e.g. {"paragraph": {"max_chunk_size": 500}}
    chunkers: Dict[str, Dict[str, Any]] = field(default_factory=dict)

    # Configuration metadata
    source_path: str = ""
    is_override: bool = False

--------------------------------------------------------------------------------
/maru_lang/templates/yaml/openai.yaml:
--------------------------------------------------------------------------------
# OpenAI API basic template

name: openai
description: "OpenAI API"
url: https://api.openai.com
model_name: gpt-4o-mini
api_key: ${OPENAI_API_KEY}
timeout: 30
enabled: true

chat_completions_path: /v1/chat/completions
health_check_endpoint: /v1/models

headers:
  Content-Type: application/json

config:
  temperature: 0.7
  max_tokens: 2000
  top_p: 1.0

retry:
  max_attempts: 3
  backoff_factor: 2
  max_delay: 60

log_level: INFO
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/embedder.py:
--------------------------------------------------------------------------------
"""
Embedder configuration models
"""
from dataclasses import dataclass
from typing import Optional


@dataclass
class EmbedderConfig:
    """
    Embedder configuration

    Embedding model and device settings
    """
    # Default embedding model (applies to every document group)
    # Can be overridden per group in rag_config.yaml
    default_model: str = "BAAI/bge-m3"

    # Device (None => auto-select: cuda > mps > cpu)
    device: Optional[str] = None

    # Configuration metadata
    source_path: str = ""
    is_override: bool = False

--------------------------------------------------------------------------------
/maru_lang/templates/yaml/local.yaml:
--------------------------------------------------------------------------------
# OpenAI-Compatible Local LLM Template

name: local-llm
description: "Local OpenAI-Compatible Server"
url: http://localhost:8000
model_name: meta-llama/Llama-2-7b-chat-hf
api_key: ""
timeout: 60
enabled: true

chat_completions_path: /v1/chat/completions
health_check_endpoint: /health

headers:
  Content-Type: application/json

config:
  temperature: 0.7
  max_tokens: 2048
  top_p: 0.95
  stream: false

retry:
  max_attempts: 3
  backoff_factor: 2
  max_delay: 60

log_level: INFO
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/__init__.py:
--------------------------------------------------------------------------------
"""Configuration loaders for pluggable components"""

from .llm_config import LLMConfigLoader
from .agent_config import AgentConfigLoader
from .loader_config import LoaderConfigLoader
from .chunker_config import ChunkerConfigLoader
from .embedder_config import EmbedderConfigLoader
from .reranker_config import RerankerConfigLoader
from .rag_loader import RagConfigLoader

__all__ = [
    "LLMConfigLoader",
    "AgentConfigLoader",
    "LoaderConfigLoader",
    "ChunkerConfigLoader",
    "EmbedderConfigLoader",
    "RerankerConfigLoader",
    "RagConfigLoader",
]

--------------------------------------------------------------------------------
/maru_lang/schemas/chat.py:
--------------------------------------------------------------------------------
from typing import Optional
from datetime import datetime
from pydantic import BaseModel, Field, field_validator
from maru_lang.core.vector_db.retrieve_document import RetrieveDocument


class ChatRequest(BaseModel):
    content: str
    session_start_time: Optional[datetime] = Field(
        default=None,
        description="Session start time")


class ChatResponse(BaseModel):
    answer: str
    references: list[RetrieveDocument]


class ConversationResponse(BaseModel):
    id: int
    question: str
    answer: str
    created_at: datetime

--------------------------------------------------------------------------------
/maru_lang/utils/__init__.py:
--------------------------------------------------------------------------------
"""
Unified utility module.

This package exposes shared utility functions used across the project.

Submodules:
- security: Security and encryption utilities (JWT, AES, etc.)

"""


# Security utilities
from .security import (
    generate_anonymized_key,
    create_jwt_token,
    decode_token,
    get_key_spec,
    aes256_decrypt,
    aes256_encrypt
)

__all__ = [
    # Security helpers
    "generate_anonymized_key",
    "create_jwt_token",
    "decode_token",
    "get_key_spec",
    "aes256_decrypt",
    "aes256_encrypt"
]
--------------------------------------------------------------------------------
/maru_lang/dependencies/llm.py:
--------------------------------------------------------------------------------
from maru_lang.pluggable.llms import LLMServerClient, LLMServerManager


_llm_manager = None


async def get_llm_manager() -> LLMServerManager:
    """Return the LLMServerManager instance."""
    global _llm_manager
    if _llm_manager is None:
        _llm_manager = LLMServerManager()
        # Initialize the servers if they have not been initialized yet
        if not _llm_manager.all_servers:
            await _llm_manager.initialize_servers()

    return _llm_manager


async def get_llm() -> LLMServerClient | None:
    """Return one of the currently active LLM servers."""
    manager = await get_llm_manager()
    return await manager.get_active_server()
--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/base.py:
--------------------------------------------------------------------------------
from abc import ABC, abstractmethod
from typing import List
from maru_lang.models.ingest import ChunkInput


class BaseChunker(ABC):
    """Base interface for text chunking strategies"""

    # Chunker identification
    name: str = "base_chunker"
    description: str = "Default chunking strategy"

    @abstractmethod
    def chunk(self, text: str) -> List[ChunkInput]:
        """Convert the full text into a list of ChunkInput objects"""
        pass

    def get_metadata(self) -> dict:
        """Return chunker metadata"""
        return {
            "chunker_name": self.name,
            "chunker_description": self.description,
        }

--------------------------------------------------------------------------------
/maru_lang/pluggable/models/reranker.py:
--------------------------------------------------------------------------------
"""
Reranker configuration models
"""
from dataclasses import dataclass
from typing import Optional, Literal


@dataclass
class RerankerConfig:
    """Reranker configuration - reranker model and usage settings"""
    enabled: bool = True
    method: Literal["model", "agent"] = "model"

    # Method: "model" - embedding-model-based reranking
    default_model: str = "BAAI/bge-reranker-v2-m3"

    # Method: "agent" - agent-based reranking (LLM, etc.)
    agent_name: Optional[str] = None

    # Maximum number of results to return after reranking (None => use the original k)
    top_k: Optional[int] = 5

    source_path: str = ""
    is_override: bool = False

--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/builtin/__init__.py:
--------------------------------------------------------------------------------
"""
Builtin agents - core system agents
These agents are not customizable by users and are part of the core system
"""
from maru_lang.pluggable.agents.builtin.group_classifier import GroupClassifierAgent
from maru_lang.pluggable.agents.builtin.intent_extractor import IntentExtractorAgent
from maru_lang.pluggable.agents.builtin.keyword_extractor import KeywordExtractorAgent
from maru_lang.pluggable.agents.builtin.response_agent import ResponseAgent
from maru_lang.pluggable.agents.builtin.knowledge_search import KnowledgeSearchAgent

__all__ = [
    "GroupClassifierAgent",
    "IntentExtractorAgent",
    "KeywordExtractorAgent",
    "ResponseAgent",
    "KnowledgeSearchAgent",
]

--------------------------------------------------------------------------------
/maru_lang/models/configs/__init__.py:
--------------------------------------------------------------------------------
"""
Configuration models for the LLM Chatbot application

Note: Most config models have been moved to pluggable.models.
This module now only contains the Group configuration, which is not pluggable.
"""
from .group import GroupConfig, GroupsConfig

# Import pluggable models for backward compatibility
from maru_lang.pluggable.models import (
    LLMConfig,
    AgentConfig,
    LoaderConfig,
    ExtensionMapping,
    ChunkerConfig,
    EmbedderConfig,
    RerankerConfig,
)

__all__ = [
    "LLMConfig",
    "GroupConfig",
    "GroupsConfig",
    "AgentConfig",
    "LoaderConfig",
    "ExtensionMapping",
    "ChunkerConfig",
    "EmbedderConfig",
    "RerankerConfig",
]
--------------------------------------------------------------------------------
/maru_lang/schemas/auth.py:
--------------------------------------------------------------------------------
from pydantic import BaseModel
from typing import List, Optional


class SignUpRequest(BaseModel):
    email: str


class LogoutRequest(BaseModel):
    device_id: str


class VerifyCodeRequest(BaseModel):
    device_id: str
    email: str
    code: str


class UserResponse(BaseModel):
    id: int
    email: str
    name: Optional[str] = None

    class Config:
        from_attributes = True


class UserGroupResponse(BaseModel):
    id: int
    name: str
    manager: Optional[UserResponse] = None
    created_at: Optional[str] = None

    class Config:
        from_attributes = True


class UserGroupsResponse(BaseModel):
    groups: List[UserGroupResponse]
    total: int

--------------------------------------------------------------------------------
/maru_lang/pluggable/models/__init__.py:
--------------------------------------------------------------------------------
"""Configuration data models for pluggable components"""

from .llm import LLMConfig
from .agent import AgentConfig
from .loader import LoaderConfig, ExtensionMapping
from .chunker import ChunkerConfig
from .embedder import EmbedderConfig
from .reranker import RerankerConfig
from .rag import (
    RagConfig,
    RetrieverConfig,
    GroupRagConfig,
    QueryTypeWeights,
    FallbackLogicConfig,
    GroupComponents,
)

__all__ = [
    "LLMConfig",
    "AgentConfig",
    "LoaderConfig",
    "ExtensionMapping",
    "ChunkerConfig",
    "EmbedderConfig",
    "RerankerConfig",
    "RagConfig",
    "RetrieverConfig",
    "GroupRagConfig",
    "QueryTypeWeights",
    "FallbackLogicConfig",
    "GroupComponents",
]

--------------------------------------------------------------------------------
/maru_lang/services/admin.py:
--------------------------------------------------------------------------------
"""
Admin user management service
"""
from maru_lang.core.relation_db.models.auth import User


ADMIN_EMAIL = "admin@maru.local"
ADMIN_NAME = "Admin"


async def get_or_create_admin_user() -> User:
    """
    Fetch the admin user, creating it if it does not exist.
    CLI commands run as the admin user by default.

    Returns:
        Admin User instance
    """
    admin_user = await User.get_or_none(email=ADMIN_EMAIL)

    if admin_user is None:
        admin_user = await User.create(
            email=ADMIN_EMAIL,
            name=ADMIN_NAME,
        )

    return admin_user


async def ensure_admin_user() -> User:
    """
    Ensure the admin user exists and return it.
    Called during DB initialization.

    Returns:
        Admin User instance
    """
    return await get_or_create_admin_user()

--------------------------------------------------------------------------------
/maru_lang/templates/python/main.py:
--------------------------------------------------------------------------------
"""
Custom MaruLang Application
This file was generated by the maru install command.
"""
from maru_lang import MaruLangApp

# Create your custom MaruLang instance
app = MaruLangApp(
    title="My MaruLang App",
    version="1.0.0",
    description="Custom MaruLang Application"
)

# You can customize the app here
# For example:
# - Add custom startup events
# - Add custom routes
# - Add middleware
# - Configure CORS settings

@app.on_event("startup")
async def custom_startup():
    """Custom startup event"""
    print("🚀 Custom MaruLang app started!")

# Optional: Add custom routes
# @app.get("/custom-health")
# async def custom_health_check():
#     return {"status": "healthy", "custom": True}

# The app instance will be imported by the serve command
# Usage: maru serve --app-module main:app
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/chunker_config.yaml:
--------------------------------------------------------------------------------
# Chunker Configuration
# Configure constructor parameters for each chunker.
#
# Available built-in chunkers:
#   - paragraph: chunk by paragraph (split on blank lines)
#   - sentence: chunk by sentence and merge when needed
#   - fixed_size: chunk by fixed size (supports overlap)
#
# Add custom chunkers under the chunkers/ directory.

# Chunker-specific settings
chunkers:
  # Paragraph-based chunker
  paragraph:
    max_chunk_size: 2000

  # Sentence-based chunker
  sentence:
    max_chunk_size: 500

  # Fixed-size chunker
  fixed_size:
    chunk_size: 1000
    overlap: 200

# Example for custom chunkers:
# chunkers:
#   header:  # Markdown header-based chunker
#     max_level: 3
#
#   page:  # PDF page-based chunker
#     combine_small_pages: true
#     min_page_chars: 100

--------------------------------------------------------------------------------
/maru_lang/models/configs/group.py:
--------------------------------------------------------------------------------
"""
Group configuration models
"""
from dataclasses import dataclass, field
from typing import Dict, Any, List


@dataclass
class GroupConfig:
    """Group configuration for chatbot categorization"""
    name: str
    description: str = ""
    force_rag: bool = False
    permissions: List[str] = field(default_factory=list)
    prompts: List[str] = field(default_factory=list)
    priority: str = "normal"  # high, normal, low
    weight: float = 1.0
    settings: Dict[str, Any] = field(default_factory=dict)
    source_path: str = ""
    is_override: bool = False


@dataclass
class GroupsConfig:
    """Complete groups configuration including priorities"""
    group_priorities: Dict[str, Any] = field(default_factory=dict)
    groups: Dict[str, GroupConfig] = field(default_factory=dict)
    tool_choice_reason: Dict[str, str] = field(default_factory=dict)
    source_path: str = ""
    is_override: bool = False
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2025 ML2

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/chat.py:
--------------------------------------------------------------------------------
from tortoise.models import Model
from tortoise import fields


class Conversation(Model):
    id = fields.IntField(pk=True)
    user = fields.ForeignKeyField(
        "models.User", related_name="conversations", on_delete=fields.OnDelete.CASCADE)
    question = fields.TextField()  # user question
    enhanced_question = fields.TextField(null=True)  # expanded question
    answer = fields.TextField()  # AI answer
    metadata = fields.JSONField(default={})  # API call info, token counts, processing time, etc.
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "conversation"


class ConversationReference(Model):
    id = fields.IntField(pk=True)
    conversation = fields.ForeignKeyField(
        "models.Conversation",
        related_name="references",
        on_delete=fields.OnDelete.CASCADE)
    document = fields.ForeignKeyField(
        "models.Document",
        related_name="conversation_references",
        on_delete=fields.OnDelete.CASCADE)
    score = fields.FloatField()  # search relevance score

--------------------------------------------------------------------------------
/maru_lang/pluggable/models/llm.py:
--------------------------------------------------------------------------------
"""
LLM configuration models
"""
from dataclasses import dataclass, field
from typing import Dict, Any, Optional


@dataclass
class LLMConfig:
    """LLM server configuration"""
    name: str
    url: str
    model_name: str = ""
    description: str = ""
    api_key: Optional[str] = None
    timeout: float = 30.0
    enabled: bool = True
    max_retries: int = 3
    health_check_endpoint: str = "/health"
    chat_completions_path: str = "/v1/chat/completions"
    headers: Dict[str, str] = field(default_factory=dict)
    config: Dict[str, Any] = field(default_factory=dict)
    health_check: Dict[str, Any] = field(default_factory=dict)
    cost_tracking: Dict[str, Any] = field(default_factory=dict)
    limits: Dict[str, Any] = field(default_factory=dict)
    retry: Dict[str, Any] = field(default_factory=dict)
    log_level: str = "INFO"
    source_path: str = ""
    is_override: bool = False

    def __post_init__(self):
        """Resolve environment variable references in api_key (e.g. ${OPENAI_API_KEY})"""
        if self.api_key and self.api_key.startswith('${') and self.api_key.endswith('}'):
            import os
            env_var = self.api_key[2:-1]
            self.api_key = os.getenv(env_var)
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/loader.py:
--------------------------------------------------------------------------------
"""
Loader configuration models
"""
from dataclasses import dataclass, field
from typing import Dict, Optional


@dataclass
class ExtensionMapping:
    """Maps a file extension to a loader and a chunker"""
    loader: str   # name of the loader (parser) to use
    chunker: str  # name of the chunker to use


@dataclass
class LoaderConfig:
    """
    Loader configuration

    Configures which loader (parser) and chunker to use per file extension
    """
    # Default loader/chunker (used when no extension mapping exists)
    # If default_loader is None, only registered extensions are processed (whitelist mode)
    default_loader: Optional[str] = None
    default_chunker: Optional[str] = "paragraph"

    # Extension -> {loader, chunker} mapping
    # e.g. {".pdf": {"loader": "pdf", "chunker": "paragraph"}}
    extensions: Dict[str, ExtensionMapping] = field(default_factory=dict)

    # Configuration metadata
    source_path: str = ""
    is_override: bool = False

    def __post_init__(self):
        """Post-process configuration"""
        # Convert extensions entries from plain dicts to ExtensionMapping
        new_extensions = {}
        for ext, mapping in self.extensions.items():
            if isinstance(mapping, dict):
                new_extensions[ext] = ExtensionMapping(**mapping)
            else:
                new_extensions[ext] = mapping
        self.extensions = new_extensions

--------------------------------------------------------------------------------
/maru_lang/core/relation_db/__init__.py:
--------------------------------------------------------------------------------
from .connection import get_register_orm, orm_context


# def get_tortoise_orm():
#     """
#     Get Tortoise ORM configuration lazily.
#
#     This function is called by Aerich when needed, avoiding issues with
#     configuration loading at import time.
#     """
#     from maru_lang.configs.system_config import get_system_config
#
#     config = get_system_config()
#     if not config:
#         raise RuntimeError(
#             "System configuration not found. Please run 'maru install' first."
#         )
#
#     return {
#         "connections": {"default": config.database.get_database_url()},
#         "apps": {
#             "models": {
#                 "models": ["maru_lang.models", "aerich.models"],
#                 "default_connection": "default",
#             },
#         },
#         "use_tz": True,
#     }


# Tortoise ORM configuration for Aerich
# This is evaluated lazily - only accessed when needed by Aerich commands
# try:
#     TORTOISE_ORM = get_tortoise_orm()
# except RuntimeError:
#     # If config not available at import time, set to None
#     # It will be initialized later when needed
#     TORTOISE_ORM = None

__all__ = [
    "get_register_orm",
    "orm_context",
    # "TORTOISE_ORM",
    # "get_tortoise_orm",
]

--------------------------------------------------------------------------------
/maru_lang/schemas/ingest.py:
--------------------------------------------------------------------------------
from typing import List, Optional
from datetime import datetime
from pydantic import BaseModel, Field


class FileInfo(BaseModel):
    """Individual file information for sync check"""
    fileName: str = Field(..., description="File name")
    createdAt: datetime = Field(..., description="File creation time")
    relativePath: str = Field(..., description="Relative path (project_folder/path/file_name)")
    size: int = Field(..., description="File size (bytes)")


class SyncCheckRequest(BaseModel):
    """Request for checking which files need to be uploaded"""
    folderPath: str = Field(..., description="Project folder name")
    files: List[FileInfo] = Field(..., description="File info for every file in the folder")
    description: Optional[str] = Field(None, description="DocumentGroup description")


class SyncCheckResponse(BaseModel):
    """Response for sync check"""
    filesToUpload: List[str] = Field(..., description="relativePath list of files that need uploading")
    totalFiles: int = Field(..., description="Total number of files")
    message: str = Field(..., description="Status message")


class SyncUploadResponse(BaseModel):
    """Response for batch upload"""
    success: bool = Field(..., description="Whether the upload succeeded")
    message: str = Field(..., description="Status message (e.g. 'batch 1/4 uploaded')")
    uploadedCount: int = Field(..., description="Number of files uploaded")
    errors: Optional[List[str]] = Field(default=None, description="Error messages, if any")

--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/txt_parser.py:
--------------------------------------------------------------------------------
"""Plain text file parser."""

from pathlib import Path
from .base import BaseParser, ParseResult


class TxtParser(BaseParser):
    """Plain text file parser"""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read a text file and return its content.

        Args:
            file_path: Path of the text file to parse

        Returns:
            ParseResult: Parsed text and metadata
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            metadata = {
                'file_type': 'text',
                'encoding': 'utf-8',
                'file_size': file_path.stat().st_size,
            }

            return ParseResult(content=content, metadata=metadata)

        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 encoding error: {file_path}") from e
        except Exception as e:
            raise ValueError(f"Failed to read file: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Check whether the text file extension is supported"""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Supported text file extensions"""
        return ['.txt', '.text', '.log']

--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/mcps/agents_firecrawl_mcp.yaml:
--------------------------------------------------------------------------------
name: firecrawl_mcp
description: "Firecrawl MCP - Web scraping, crawling, and discovery. If crawling or searching is required, it finds information on the web."
type: mcp_client
enabled: false
version: "1.0.0"

# Agent Tags
tags:
  - mcp
  - firecrawl
  - tools
  - files

# LLM Server Settings (Optimized for MCP tool usage)
target_llm_config:
  server_name: "openai"  # Model suitable for tool usage

  override_params:
    temperature: 0.1  # Low temperature for accuracy in tool calls
    max_tokens: 2000
    top_p: 0.8

fallback_strategy: "error"  # MCP agents require an LLM

# MCP Server Connection Settings
mcp_config:
  # Firecrawl MCP server configuration
  transport: "stdio"
  command: "npx"
  args: ["-y", "firecrawl-mcp"]
  env:
    FIRECRAWL_API_KEY: ${FIRECRAWL_API_KEY}
  timeout: 30

# Prompt Settings
prompts:
  system_prompt: |
    You are an agent that uses web scraping, crawling, and search tools.
    Perform web scraping, crawling, and search according to the user's requests.

    Available tools:
    - web_scraping: Web scraping
    - crawling: Crawling
    - searching: Search
  user_prompt_template: |
    {question}

# Agent Execution Settings
config:
  timeout: 60  # Web requests can take time
  retry_count: 2
  max_context_length: 8000  # Page contents can be long
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/markdown_parser.py:
--------------------------------------------------------------------------------
"""Markdown file parser."""

from pathlib import Path
from .base import BaseParser, ParseResult


class MarkdownParser(BaseParser):
    """Markdown file parser"""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read a Markdown file and return its content.
        (Additional processing such as HTML conversion can be added later.)

        Args:
            file_path: Path of the Markdown file to parse

        Returns:
            ParseResult: Parsed text and metadata
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            metadata = {
                'file_type': 'markdown',
                'encoding': 'utf-8',
                'file_size': file_path.stat().st_size,
            }

            return ParseResult(content=content, metadata=metadata)

        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 encoding error: {file_path}") from e
        except Exception as e:
            raise ValueError(f"Failed to read file: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Check whether the Markdown file extension is supported"""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Supported Markdown file extensions"""
        return ['.md', '.markdown', '.mdown', '.mkd']

--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/sentence.py:
--------------------------------------------------------------------------------
import re
from typing import List
from maru_lang.models.ingest import ChunkInput
from .base import BaseChunker


class SentenceChunker(BaseChunker):
    """Chunk by sentence (split on periods/question/exclamation marks) with a maximum size limit"""

    name = "sentence"
    description = "Chunk by sentence and merge up to the maximum chunk size"

    def __init__(self, max_chunk_size: int = 500):
        self.max_chunk_size = max_chunk_size

    def chunk(self, text: str) -> List[ChunkInput]:
        # Sentence-ending pattern for Korean/English text
        sentence_pattern = r'[.!?]+[\s\n]+'
        sentences = [s.strip() for s in re.split(sentence_pattern, text) if s.strip()]

        chunks = []
        current_chunk = []
        current_size = 0
        chunk_num = 1

        for sentence in sentences:
            sentence_len = len(sentence)

            if current_size + sentence_len > self.max_chunk_size and current_chunk:
                # Flush the current chunk
                chunks.append(ChunkInput(
                    number=chunk_num,
                    content=' '.join(current_chunk)
                ))
                chunk_num += 1
                current_chunk = []
                current_size = 0

            current_chunk.append(sentence)
            current_size += sentence_len

        # Final chunk
        if current_chunk:
            chunks.append(ChunkInput(
                number=chunk_num,
                content=' '.join(current_chunk)
            ))

        return chunks

--------------------------------------------------------------------------------
/maru_lang/core/relation_db/connection.py:
--------------------------------------------------------------------------------
from tortoise import Tortoise
from tortoise.contrib.fastapi import RegisterTortoise
from functools import partial
from maru_lang.configs.system_config import get_system_config
from contextlib import asynccontextmanager
from typing import Awaitable, Callable
import asyncio


def run_with_orm_context(coro: Callable[..., Awaitable], *args, **kwargs):
    async def runner():
        async with orm_context():
            return await coro(*args, **kwargs)
    return asyncio.run(runner())


def get_register_orm():
    config = get_system_config()
    # Use partial to return a preconfigured RegisterTortoise
    return partial(
        RegisterTortoise,
        generate_schemas=True,
        add_exception_handlers=True,
        db_url=config.database.get_database_url(),
        modules={"models": [
            "maru_lang.core.relation_db.models", "aerich.models"]},
        use_tz=True,
    )


@asynccontextmanager
async def orm_context():
    config = get_system_config()

    await Tortoise.init(
        db_url=config.database.get_database_url(),
        modules={"models": [
            "maru_lang.core.relation_db.models", "aerich.models"]},
        use_tz=True,
    )
    await Tortoise.generate_schemas()

    # Automatically create the admin user
    from maru_lang.services.admin import ensure_admin_user
    await ensure_admin_user()

    try:
        yield
    finally:
        await Tortoise.close_connections()
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/base.py:
--------------------------------------------------------------------------------
"""Base parser interface for document parsing."""

from abc import ABC, abstractmethod
from pathlib import Path
from typing import Optional
from dataclasses import dataclass


@dataclass
class ParseResult:
    """Data class holding a parse result"""
    content: str
    metadata: Optional[dict] = None

    def __post_init__(self):
        if self.metadata is None:
            self.metadata = {}


class BaseParser(ABC):
    """Base interface for document parsing"""

    @property
    def default_chunker_name(self) -> Optional[str]:
        """
        Default chunker name for this parser

        Returns:
            Optional[str]: chunker name (None => use the global default chunker)
        """
        return None  # default: None (use the global default chunker)

    @abstractmethod
    def parse(self, file_path: Path) -> ParseResult:
        """
        Parse a file and extract its text content.

        Args:
            file_path: Path of the file to parse

        Returns:
            ParseResult: Parsed text and metadata

        Raises:
            ValueError: If the file cannot be read or parsed
            FileNotFoundError: If the file does not exist
        """
        pass

    @abstractmethod
    def supports(self, file_path: Path) -> bool:
        """
        Check whether this parser supports the given file.

        Args:
            file_path: File path to check

        Returns:
            bool: Whether the file is supported
        """
        pass

    @property
    @abstractmethod
    def supported_extensions(self) -> list[str]:
        """
        List of file extensions this parser supports

        Returns:
            list[str]: Supported extensions (e.g. ['.txt', '.text'])
        """
        pass

--------------------------------------------------------------------------------
/maru_lang/templates/yaml/reranker_config.yaml:
--------------------------------------------------------------------------------
# Reranker Configuration
# Configure reranker usage and behavior

# Enable or disable the reranker
enabled: true

# Reranking method
# - "model": embedding-model-based reranking (fast, low cost)
# - "agent": agent-based reranking (LLM-powered, higher accuracy)
method: agent

# ============================================================
# Method: "agent" - agent-based reranking (LLM)
# ============================================================

# Agent name (define in agent_config.yaml)
agent_name: llm_reranker

# ============================================================
# Method: "model" - embedding-model-based reranking
# ============================================================

# Default reranker model (only used when method: model)
default_model: BAAI/bge-reranker-v2-m3

# Maximum number of documents to return after reranking
# - If not set (or null), returns the same number as the original search results
# - Use this to retrieve more documents (k=20) and rerank to top-k (top_k=5) for better quality
top_k: 5

# Example: Using LLM-based reranking (default)
#   1. The llm_reranker agent is already included in rerankers/
#   2. Make sure to register it in configs/agent_config.yaml (already done if using default)
#   3. Set method: agent and agent_name: llm_reranker (already set above)
#
# Example: Switching to model-based reranking
#   1. Set method: model
#   2. Uncomment and configure default_model
#   3. Comment out agent_name
#
# Example: Creating a custom reranker agent
#   1. Create my_reranker.py in configs/rerankers/ (inherits BaseAgent)
#   2. Create my_reranker.yaml in configs/rerankers/ with prompts and tools
#   3. Register in configs/agent_config.yaml:
#        my_reranker:
#          type: custom
#          file: rerankers/my_reranker.py
#          config: rerankers/my_reranker.yaml
#   4. Set agent_name: my_reranker

--------------------------------------------------------------------------------
/maru_lang/configs/__init__.py:
--------------------------------------------------------------------------------
"""
Unified configuration management system

Note: Most config loaders have been moved to pluggable.configs.
This module provides backward-compatibility imports and manages non-pluggable configs.
"""
from .base import DefaultConfigLoader
from .manager import ConfigManager, get_config_manager
from .diff_checker import check_config_differences, ConfigDiffChecker

# Import pluggable configs for backward compatibility
from maru_lang.pluggable.configs import (
    LLMConfigLoader,
    AgentConfigLoader,
    LoaderConfigLoader,
    ChunkerConfigLoader,
    EmbedderConfigLoader,
    RerankerConfigLoader,
    RagConfigLoader,
)

# Import models for convenience
from maru_lang.models.configs import (
    LLMConfig,
    GroupConfig,
    GroupsConfig,
    AgentConfig,
    LoaderConfig,
    ChunkerConfig,
    EmbedderConfig,
    RerankerConfig,
)

# Import RAG models
from maru_lang.pluggable.models import (
    RagConfig,
    RetrieverConfig,
    GroupRagConfig,
)

__all__ = [
    # Base
    'DefaultConfigLoader',

    # RAG (replaces Group)
    'RagConfig',
    'RetrieverConfig',
    'GroupRagConfig',
    'RagConfigLoader',

    # Backward compatibility - Group (deprecated, use RAG instead)
    'GroupConfig',
    'GroupsConfig',

    # Pluggable configs (re-exported for convenience)
    'LLMConfig',
    'LLMConfigLoader',
    'AgentConfig',
    'AgentConfigLoader',
    'LoaderConfig',
    'LoaderConfigLoader',
    'ChunkerConfig',
    'ChunkerConfigLoader',
    'EmbedderConfig',
    'EmbedderConfigLoader',
    'RerankerConfig',
    'RerankerConfigLoader',

    # Config Manager
    'ConfigManager',
    'get_config_manager',

    # Config Diff Checker
    'check_config_differences',
    'ConfigDiffChecker',
]

--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/pdf_parser.py:
--------------------------------------------------------------------------------
"""PDF file parser."""

from pathlib import Path
from .base import BaseParser, ParseResult


class PDFParser(BaseParser):
    """PDF file parser (uses PyPDF2 or pdfplumber)"""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Extract text from a PDF file.

        Args:
            file_path: Path of the PDF file to parse

        Returns:
            ParseResult: Parsed text and metadata
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            # Uses PyPDF2 (can be switched to pdfplumber later)
            try:
                import PyPDF2
            except ImportError:
                raise ImportError(
                    "PyPDF2 is not installed. Install it with 'pip install PyPDF2'."
                )

            with open(file_path, 'rb') as f:
                pdf_reader = PyPDF2.PdfReader(f)
                num_pages = len(pdf_reader.pages)

                # Extract text from every page
                text_parts = []
                for page_num in range(num_pages):
                    page = pdf_reader.pages[page_num]
                    text_parts.append(page.extract_text())

            content = '\n\n'.join(text_parts)

            metadata = {
                'file_type': 'pdf',
                'num_pages': num_pages,
                'file_size': file_path.stat().st_size,
            }

            return ParseResult(content=content, metadata=metadata)

        except Exception as e:
            raise ValueError(f"PDF parsing failed: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Check whether the PDF file extension is supported"""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Supported PDF file extensions"""
        return ['.pdf']

--------------------------------------------------------------------------------
/maru_lang/templates/yaml/loader_config.yaml:
--------------------------------------------------------------------------------
# Loader Configuration
# Map loaders (parsers) and chunkers by file extension.

# Default loader/chunker when no extension mapping is provided
# 💡 Tip: Comment out default_loader to ONLY process registered extensions
# (automatically ignores .DS_Store, .git, etc.)
# default_loader: txt
default_chunker: paragraph

# Extension-specific mappings (declare special cases only)
extensions:
  # Text formats
  .txt:
    loader: txt
    chunker: paragraph

  .md:
    loader: markdown
    chunker: paragraph

  .markdown:
    loader: markdown
    chunker: paragraph

  # Document formats
  .pdf:
    loader: pdf
    chunker: paragraph

  .docx:
    loader: docx
    chunker: paragraph

  .pptx:
    loader: pptx
    chunker: paragraph

  .xlsx:
    loader: xlsx
    chunker: paragraph

  .xlsm:
    loader: xlsx
    chunker: paragraph

  # Web formats
  .html:
    loader: html
    chunker: paragraph

  .htm:
    loader: html
    chunker: paragraph

  .xhtml:
    loader: xml
    chunker: paragraph

  # Data formats
  .json:
    loader: json
    chunker: paragraph

  .jsonl:
    loader: json
    chunker: paragraph

  .yaml:
    loader: yaml
    chunker: paragraph

  .yml:
    loader: yaml
    chunker: paragraph

  .xml:
    loader: xml
    chunker: paragraph

  .csv:
    loader: csv
    chunker: paragraph

  .tsv:
    loader: csv
    chunker: paragraph

  # Code formats
  .py:
    loader: txt
    chunker: paragraph

  .js:
    loader: txt
    chunker: paragraph

  .ts:
    loader: txt
    chunker: paragraph

--------------------------------------------------------------------------------
/maru_lang/dependencies/chat.py:
--------------------------------------------------------------------------------
"""
Chat Pipeline dependency
"""
from typing import Optional
from maru_lang.pluggable.agents.agent_executor import AgentExecutor
from maru_lang.pluggable.agents.agent_factory import AgentFactory
from maru_lang.pluggable.agents.agent_selector import AgentSelector
from maru_lang.pipelines.chat import ChatPipeline


class ChatPipelineManager:
    """Singleton manager for ChatPipeline instance"""
    _instance: Optional[ChatPipeline] = None
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/agents_calculator.yaml:
--------------------------------------------------------------------------------
1 | # Calculator Agent (Demo Only) – Purposefully incorrect but overconfident answers
2 | # Demo agent; do NOT use in production!
3 | 
4 | name: calculator_demo
5 | description: "An overconfident calculator that proudly returns incorrect answers (demo only)"
6 | type: custom
7 | enabled: false
8 | version: "0.1.0"
9 | 
10 | # Priority (higher values are selected first)
11 | priority: 5
12 | 
13 | # Agent selection criteria
14 | selection_criteria:
15 |   keywords:
16 |     - "calculate"
17 |     - "addition"
18 |     - "subtraction"
19 |     - "multiplication"
20 |     - "division"
21 |     - "conversion"
22 |   patterns:
23 |     - '.*[0-9]+.*[+\-*/].*[0-9]+.*'  # Expression pattern (single quotes required)
24 |     - ".*square.*"
25 |     - ".*convert.*"
26 | 
27 | # LLM configuration
28 | target_llm_config:
29 |   server_name: "openai"
30 |   override_params:
31 |     temperature: 0.9  # High temperature for “creative” arithmetic
32 |     max_tokens: 200
33 |     timeout: 8.0
34 | 
35 | fallback_strategy: "any_available"
36 | 
37 | # Prompt configuration
38 | prompts:
39 |   system_prompt: |
40 |     You are the world’s greatest mathematician—valedictorian at MIT and a former NASA researcher.
41 | 
42 |     Important rules:
43 |     1. Always provide intentionally incorrect answers (roughly 20-50% error).
44 |     2. Respond with absolute confidence.
45 |     3. Use exaggerated phrases such as “This is 100% certain” or “An algorithm worthy of a Nobel Prize.”
46 |     4. Example outputs:
47 |        - 25 + 37 ≈ 73 (actual: 62)
48 |        - 100 * 2 ≈ 250 (actual: 200)
49 |        - 10 - 3 ≈ 5 (actual: 7)
50 | 
51 |     Respond with unwavering confidence and make sure your answer is wrong.
52 | 
53 |   user_prompt_template: |
54 |     Question: {question}
55 | 
56 | # Implementation (custom Python file in templates)
57 | implementation: templates.python.calculator_agent.CalculatorAgent
58 | 
59 | # Example usage
60 | examples:
61 |   - "What’s 25 plus 37?"
62 |   - "Calculate the square root of 100."
63 |   - "Convert 5 km to miles."
64 |   - "Convert 30°C to Fahrenheit."
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/json_parser.py:
--------------------------------------------------------------------------------
1 | """JSON file parser."""
2 | 
3 | import json
4 | from pathlib import Path
5 | from .base import BaseParser, ParseResult
6 | 
7 | 
8 | class JSONParser(BaseParser):
9 |     """JSON file parser"""
10 | 
11 |     def parse(self, file_path: Path) -> ParseResult:
12 |         """
13 |         Read a JSON file and convert it into formatted text.
14 | 
15 |         Args:
16 |             file_path: Path of the JSON file to parse
17 | 
18 |         Returns:
19 |             ParseResult: Parsed text and metadata
20 |         """
21 |         if not file_path.exists():
22 |             raise FileNotFoundError(f"File not found: {file_path}")
23 | 
24 |         try:
25 |             with open(file_path, 'r', encoding='utf-8') as f:
26 |                 if file_path.suffix.lower() == '.jsonl':
27 |                     # JSON Lines: one JSON document per line; json.load would fail here
28 |                     data = [json.loads(line) for line in f if line.strip()]
29 |                 else:
30 |                     data = json.load(f)
31 | 
32 |             # Pretty-print the JSON for readability
33 |             content = json.dumps(data, indent=2, ensure_ascii=False)
34 | 
35 |             metadata = {
36 |                 'file_type': 'json',
37 |                 'encoding': 'utf-8',
38 |                 'file_size': file_path.stat().st_size,
39 |             }
40 | 
41 |             # Add structure information
42 |             if isinstance(data, dict):
43 |                 metadata['structure'] = 'object'
44 |                 metadata['num_keys'] = len(data)
45 |             elif isinstance(data, list):
46 |                 metadata['structure'] = 'array'
47 |                 metadata['num_items'] = len(data)
48 | 
49 |             return ParseResult(content=content, metadata=metadata)
50 | 
51 |         except json.JSONDecodeError as e:
52 |             raise ValueError(f"JSON parsing failed: {file_path} - {str(e)}") from e
53 |         except UnicodeDecodeError as e:
54 |             raise ValueError(f"UTF-8 encoding error: {file_path}") from e
55 |         except Exception as e:
56 |             raise ValueError(f"Failed to read file: {file_path}") from e
57 | 
58 |     def supports(self, file_path: Path) -> bool:
59 |         """Check whether the file extension is a supported JSON extension."""
60 |         return file_path.suffix.lower() in self.supported_extensions
61 | 
62 |     @property
63 |     def supported_extensions(self) -> list[str]:
64 |         """Supported JSON file extensions."""
65 |         return ['.json', '.jsonl']
66 | 
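A minimal usage sketch for the parser above; the sample path is hypothetical:

# Illustrative only: parse a JSON file and inspect the result.
from pathlib import Path

from maru_lang.pluggable.loaders.json_parser import JSONParser

parser = JSONParser()
path = Path("data/example.json")  # hypothetical file

if parser.supports(path):
    result = parser.parse(path)          # raises ValueError on malformed JSON
    print(result.metadata["structure"])  # 'object' or 'array'
    print(result.content[:200])          # pretty-printed JSON text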
--------------------------------------------------------------------------------
/maru_lang/dependencies/ingest.py:
--------------------------------------------------------------------------------
1 | """
2 | Ingest Pipeline dependency
3 | """
4 | from pathlib import Path
5 | from typing import Optional
6 | from maru_lang.pipelines.ingest.pipeline import IngestPipeline
7 | from maru_lang.core.vector_db.factory import get_vector_db
8 | from maru_lang.models.vector_db import get_vector_db_config_from_settings
9 | from maru_lang.configs.system_config import get_system_config
10 | from maru_lang.configs import get_config_manager
11 | 
12 | config = get_system_config()
13 | 
14 | 
15 | def create_ingest_pipeline(
16 |     upload_path: Path,
17 |     group_name: str,
18 |     manager_id: int,
19 |     re_embed: bool = False,
20 |     all_files_list: Optional[list] = None,
21 |     description: Optional[str] = None,
22 | ) -> IngestPipeline:
23 |     """
24 |     Create an IngestPipeline instance for file ingestion.
25 | 
26 |     Args:
27 |         upload_path: Path to the uploaded files directory
28 |         group_name: Document group name (usually the folder name)
29 |         manager_id: User ID who manages this group
30 |         re_embed: Whether to re-embed existing documents
31 |         all_files_list: Complete list of all file paths (for batch-upload deletion detection)
32 |         description: DocumentGroup description (only for the root group)
33 | 
34 |     Returns:
35 |         IngestPipeline instance
36 |     """
37 |     # Get the VectorDB config using the proper conversion function
38 |     vdb_config = get_vector_db_config_from_settings()
39 | 
40 |     # Create the IngestPipeline with a virtual_path.
41 |     # Use group_name as the virtual_path to avoid re-embedding when the temp directory changes.
42 |     # virtual_path: virtual path stored in the DB (actual files are read from upload_path)
43 |     pipeline = IngestPipeline(
44 |         path=upload_path,  # Used for actual file operations (temporary directory)
45 |         group_name=group_name,
46 |         vdb_config=vdb_config,
47 |         manager_id=manager_id,
48 |         max_batch_size_mb=1000,  # 1GB batch size
49 |         re_embed=re_embed,
50 |         virtual_path=Path(group_name),  # Virtual path stored in the DB
51 |         all_files_list=all_files_list,  # Full file list (for batch-upload deletion detection)
52 |         description=description,  # DocumentGroup description (stored only on the root group)
53 |     )
54 | 
55 |     return pipeline
56 | 
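create_ingest_pipeline only builds the pipeline, so a caller still has to drive it. The sketch below assumes IngestPipeline follows the BasePipeline.run() contract shown later in /maru_lang/pipelines/base.py; the paths and IDs are made up:

# Illustrative driver for an IngestPipeline (assumed to follow BasePipeline.run()).
import asyncio
from pathlib import Path

from maru_lang.dependencies.ingest import create_ingest_pipeline
from maru_lang.pipelines.base import PipelineComplete


async def ingest_folder() -> None:
    pipeline = create_ingest_pipeline(
        upload_path=Path("/tmp/upload_batch"),  # hypothetical temp directory
        group_name="handbook",                  # hypothetical group
        manager_id=1,
    )
    async for item in pipeline.run():
        if isinstance(item, PipelineComplete):
            print("done:", item.data)
        else:
            print(f"[{item.message_type.value}] {item.message}")


asyncio.run(ingest_folder())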
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/docx_parser.py:
--------------------------------------------------------------------------------
1 | """Microsoft Word document parser."""
2 | 
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 | 
6 | 
7 | class DocxParser(BaseParser):
8 |     """Microsoft Word document parser (uses python-docx)"""
9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         Extract text from a DOCX file.
13 | 
14 |         Args:
15 |             file_path: Path of the DOCX file to parse
16 | 
17 |         Returns:
18 |             ParseResult: Parsed text and metadata
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"File not found: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 from docx import Document
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "python-docx is not installed. Install it with 'pip install python-docx'."
29 |                 )
30 | 
31 |             doc = Document(file_path)
32 | 
33 |             # Extract text from every paragraph
34 |             paragraphs = [para.text for para in doc.paragraphs]
35 |             content = '\n'.join(paragraphs)
36 | 
37 |             # Also extract text from tables (optional)
38 |             tables_text = []
39 |             for table in doc.tables:
40 |                 for row in table.rows:
41 |                     row_text = ' | '.join(cell.text for cell in row.cells)
42 |                     tables_text.append(row_text)
43 | 
44 |             if tables_text:
45 |                 content += '\n\n' + '\n'.join(tables_text)
46 | 
47 |             metadata = {
48 |                 'file_type': 'docx',
49 |                 'num_paragraphs': len(paragraphs),
50 |                 'num_tables': len(doc.tables),
51 |                 'file_size': file_path.stat().st_size,
52 |             }
53 | 
54 |             return ParseResult(content=content, metadata=metadata)
55 | 
56 |         except Exception as e:
57 |             raise ValueError(f"DOCX parsing failed: {file_path}") from e
58 | 
59 |     def supports(self, file_path: Path) -> bool:
60 |         """Check whether the file extension is a supported DOCX extension."""
61 |         return file_path.suffix.lower() in self.supported_extensions
62 | 
63 |     @property
64 |     def supported_extensions(self) -> list[str]:
65 |         """Supported Word document extensions."""
66 |         return ['.docx']
67 | 
--------------------------------------------------------------------------------
/maru_lang/core/vector_db/factory.py:
--------------------------------------------------------------------------------
1 | """
2 | VectorDB factory - creates VectorDB instances
3 | """
4 | from typing import Optional
5 | from maru_lang.core.vector_db.base import VectorDB
6 | from maru_lang.models.vector_db import (
7 |     BaseVectorDBConfig,
8 |     ChromaDBConfig,
9 |     MilvusConfig,
10 |     PineconeConfig,
11 |     get_vector_db_config_from_settings,
12 | )
13 | 
14 | 
15 | def get_vector_db(config: Optional[BaseVectorDBConfig] = None) -> VectorDB:
16 |     """
17 |     Create a VectorDB instance.
18 | 
19 |     Args:
20 |         config: VectorDB configuration (if None, one is created automatically from vector_db.type in system_config.yaml)
21 | 
22 |     Returns:
23 |         VectorDB: VectorDB instance
24 | 
25 |     Raises:
26 |         ValueError: If the VectorDB type is not supported
27 | 
28 |     Examples:
29 |         # Created automatically from vector_db.type in system_config.yaml
30 |         vdb = get_vector_db()  # ChromaDB if type is 'chroma', Milvus if 'milvus'
31 | 
32 |         # Create a custom ChromaDB
33 |         config = ChromaDBConfig(
34 |             persist_dir="/path/to/chromadb",
35 |             collection_name="my_collection",
36 |         )
37 |         vdb = get_vector_db(config)
38 |     """
39 |     # If no config is given, pick the appropriate type from system_config automatically
40 |     if config is None:
41 |         config = get_vector_db_config_from_settings()
42 | 
43 |     # ChromaDB
44 |     if isinstance(config, ChromaDBConfig):
45 |         from maru_lang.core.vector_db.chroma import ChromaVectorDB
46 |         return ChromaVectorDB(
47 |             persist_dir=config.persist_dir,
48 |             collection_name=config.collection_name,
49 |         )
50 | 
51 |     # Milvus
52 |     elif isinstance(config, MilvusConfig):
53 |         from maru_lang.core.vector_db.milvus import MilvusVectorDB
54 |         return MilvusVectorDB(
55 |             host=config.host,
56 |             port=config.port,
57 |             user=config.user,
58 |             password=config.password,
59 |             collection_name=config.collection_name,
60 |         )
61 | 
62 |     # Pinecone (future extension)
63 |     elif isinstance(config, PineconeConfig):
64 |         # from maru_lang.core.vector_db.pinecone import PineconeVectorDB
65 |         # return PineconeVectorDB(...)
66 |         raise NotImplementedError("Pinecone support is not yet implemented")
67 | 
68 |     else:
69 |         raise ValueError(f"Unsupported VectorDB config type: {type(config)}")
70 | 
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/rag_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | RAG configuration loader
3 | """
4 | from pathlib import Path
5 | from typing import Dict, Any, Optional, List
6 | from maru_lang.configs.base import DefaultConfigLoader
7 | from maru_lang.pluggable.models import RagConfig, GroupRagConfig
8 | from maru_lang.enums.configs import ConfigType
9 | 
10 | 
11 | class RagConfigLoader(DefaultConfigLoader[RagConfig]):
12 |     """Loader for RAG configurations"""
13 | 
14 |     def __init__(self):
15 |         super().__init__(ConfigType.RAGS)
16 |         # Override directories - rag_config.yaml lives in the maru_app root
17 |         self.base_dir = Path(__file__).parent / "rags"  # Base config location (empty by default)
18 |         self.user_dir = Path.cwd() / "maru_app"  # User config in the maru_app root
19 |         # Flattened view of all groups
20 |         self.all_groups: Dict[str, GroupRagConfig] = {}
21 | 
22 |     def parse_config(self, data: Dict[str, Any], source_path: str, is_user: bool) -> Optional[RagConfig]:
23 |         """Parse RAG configuration data"""
24 |         try:
25 |             # Use RagConfig.from_dict for parsing
26 |             rag_config = RagConfig.from_dict(data, source_path, is_user)
27 | 
28 |             # Store groups in the flattened view
29 |             for group_name, group_config in rag_config.groups.items():
30 |                 self.all_groups[group_name] = group_config
31 | 
32 |             return rag_config
33 |         except Exception as e:
34 |             import logging
35 |             logging.error(f"Failed to parse RAG config: {e}")
36 |             return None
37 | 
38 |     def get_config_name(self, config: RagConfig) -> str:
39 |         """Get the name of a RAG configuration"""
40 |         # Use the filename without its extension as the name
41 |         return Path(config.source_path).stem
42 | 
43 |     def validate_config(self, data: Dict[str, Any]) -> bool:
44 |         """Validate RAG configuration data"""
45 |         # RAG config can be more flexible
46 |         return isinstance(data, dict)
47 | 
48 |     def get_group(self, name: str) -> Optional[GroupRagConfig]:
49 |         """Get a specific group configuration"""
50 |         return self.all_groups.get(name)
51 | 
52 |     def reload(self) -> Dict[str, RagConfig]:
53 |         """Reload all configurations"""
54 |         self.all_groups = {}  # Clear flattened groups
55 |         return super().reload()
56 | 
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/html_parser.py:
--------------------------------------------------------------------------------
1 | """HTML file parser."""
2 | 
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 | 
6 | 
7 | class HTMLParser(BaseParser):
8 |     """HTML file parser (uses BeautifulSoup)"""
9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         Extract text from an HTML file.
13 | 
14 |         Args:
15 |             file_path: Path of the HTML file to parse
16 | 
17 |         Returns:
18 |             ParseResult: Parsed text and metadata
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"File not found: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 from bs4 import BeautifulSoup
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "beautifulsoup4 is not installed. Install it with 'pip install beautifulsoup4'."
29 |                 )
30 | 
31 |             with open(file_path, 'r', encoding='utf-8') as f:
32 |                 html_content = f.read()
33 | 
34 |             soup = BeautifulSoup(html_content, 'html.parser')
35 | 
36 |             # Remove script and style tags
37 |             for script in soup(['script', 'style']):
38 |                 script.decompose()
39 | 
40 |             # Extract text
41 |             text = soup.get_text(separator='\n', strip=True)
42 | 
43 |             # Collapse consecutive blank lines
44 |             lines = [line.strip() for line in text.split('\n')]
45 |             content = '\n'.join(line for line in lines if line)
46 | 
47 |             metadata = {
48 |                 'file_type': 'html',
49 |                 'encoding': 'utf-8',
50 |                 'file_size': file_path.stat().st_size,
51 |             }
52 | 
53 |             # Extract extra information from meta tags (optional)
54 |             if soup.title:
55 |                 metadata['title'] = soup.title.string
56 | 
57 |             return ParseResult(content=content, metadata=metadata)
58 | 
59 |         except UnicodeDecodeError as e:
60 |             raise ValueError(f"UTF-8 encoding error: {file_path}") from e
61 |         except Exception as e:
62 |             raise ValueError(f"HTML parsing failed: {file_path}") from e
63 | 
64 |     def supports(self, file_path: Path) -> bool:
65 |         """Check whether the file extension is a supported HTML extension."""
66 |         return file_path.suffix.lower() in self.supported_extensions
67 | 
68 |     @property
69 |     def supported_extensions(self) -> list[str]:
70 |         """Supported HTML file extensions."""
71 |         return ['.html', '.htm', '.xhtml']
72 | 
--------------------------------------------------------------------------------
/maru_lang/templates/python/calculator_agent.py:
--------------------------------------------------------------------------------
1 | """
2 | Overconfident Calculator Agent – gleefully wrong answers with absolute confidence.
3 | Demo agent only – do NOT use in production!
4 | """
5 | 
6 | from typing import Dict, Any, Optional
7 | from maru_lang.pluggable.agents.base import BaseAgent
8 | from maru_lang.models.agents import AgentResult
9 | 
10 | 
11 | class CalculatorAgent(BaseAgent):
12 |     """An unapologetically confident (but incorrect) calculator agent."""
13 | 
14 |     def __init__(self, **kwargs):
15 |         super().__init__(**kwargs)
16 | 
17 |     async def _setup(self) -> None:
18 |         """Agent-specific initialization logic"""
19 |         # No special setup needed for this agent
20 |         pass
21 | 
22 |     async def execute(self, **kwargs) -> AgentResult:
23 |         """Run the agent using the LLM to craft a (wrong) response."""
24 |         question = kwargs.get('question', '')
25 | 
26 |         try:
27 |             # Load prompts from the YAML configuration
28 |             prompts = self.config.prompts
29 |             system_prompt = prompts.system_prompt if prompts.system_prompt else ""
30 |             user_prompt_template = prompts.user_prompt_template if prompts.user_prompt_template else ""
31 | 
32 |             # Fill in the template with the user question
33 |             if user_prompt_template:
34 |                 user_prompt = user_prompt_template.format(question=question)
35 |             else:
36 |                 user_prompt = question
37 | 
38 |             override_params = self.get_override_params()
39 | 
40 |             # request_with_fallback automatically tries alternate LLMs if one fails
41 |             response = await self.request_with_fallback(
42 |                 user_prompt=user_prompt,
43 |                 system_prompt=system_prompt,
44 |                 **override_params,
45 |             )
46 | 
47 |             return AgentResult(
48 |                 success=True,
49 |                 result=response,  # Main response text
50 |                 data={},
51 |                 error=None,
52 |                 metadata={"confidence": "200%", "accuracy": "1%"}
53 |             )
54 | 
55 |         except Exception as e:
56 |             # Report failure when an error occurs
57 |             return AgentResult(
58 |                 success=False,
59 |                 result="",
60 |                 data=None,
61 |                 error=str(e),
62 |                 metadata={"confidence": "0%", "accuracy": "0%"}
63 |             )
64 | 
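The agent YAML points at this class through a dotted path (implementation: templates.python.calculator_agent.CalculatorAgent). How AgentFactory resolves that path is not shown in this excerpt; the common importlib-based approach is sketched here as an assumption, not as the factory's actual internals:

# Assumed resolution strategy for the YAML 'implementation' field.
import importlib


def resolve_implementation(dotted_path: str) -> type:
    """Split 'pkg.module.ClassName' and import the class it names."""
    module_path, class_name = dotted_path.rsplit(".", 1)
    module = importlib.import_module(module_path)
    return getattr(module, class_name)


AgentClass = resolve_implementation("templates.python.calculator_agent.CalculatorAgent")
agent = AgentClass()  # in practice, kwargs such as the parsed YAML config would be passed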
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/yaml_parser.py:
--------------------------------------------------------------------------------
1 | """YAML file parser."""
2 | 
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 | 
6 | 
7 | class YAMLParser(BaseParser):
8 |     """YAML file parser"""
9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         Read a YAML file and convert it into formatted text.
13 | 
14 |         Args:
15 |             file_path: Path of the YAML file to parse
16 | 
17 |         Returns:
18 |             ParseResult: Parsed text and metadata
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"File not found: {file_path}")
22 | 
23 |         # Import outside the main try-block: if pyyaml were imported inside it,
24 |         # the 'except yaml.YAMLError' clause below would raise a NameError
25 |         # whenever the import failed.
26 |         try:
27 |             import yaml
28 |         except ImportError:
29 |             raise ImportError(
30 |                 "pyyaml is not installed. Install it with 'pip install pyyaml'."
31 |             )
32 | 
33 |         try:
34 |             with open(file_path, 'r', encoding='utf-8') as f:
35 |                 data = yaml.safe_load(f)
36 | 
37 |             # Pretty-print the YAML for readability
38 |             content = yaml.dump(
39 |                 data,
40 |                 allow_unicode=True,
41 |                 default_flow_style=False,
42 |                 sort_keys=False,
43 |                 indent=2,
44 |             )
45 | 
46 |             metadata = {
47 |                 'file_type': 'yaml',
48 |                 'encoding': 'utf-8',
49 |                 'file_size': file_path.stat().st_size,
50 |             }
51 | 
52 |             # Add structure information
53 |             if isinstance(data, dict):
54 |                 metadata['structure'] = 'mapping'
55 |                 metadata['num_keys'] = len(data)
56 |             elif isinstance(data, list):
57 |                 metadata['structure'] = 'sequence'
58 |                 metadata['num_items'] = len(data)
59 | 
60 |             return ParseResult(content=content, metadata=metadata)
61 | 
62 |         except yaml.YAMLError as e:
63 |             raise ValueError(f"YAML parsing failed: {file_path} - {str(e)}") from e
64 |         except UnicodeDecodeError as e:
65 |             raise ValueError(f"UTF-8 encoding error: {file_path}") from e
66 |         except Exception as e:
67 |             raise ValueError(f"Failed to read file: {file_path}") from e
68 | 
69 |     def supports(self, file_path: Path) -> bool:
70 |         """Check whether the file extension is a supported YAML extension."""
71 |         return file_path.suffix.lower() in self.supported_extensions
72 | 
73 |     @property
74 |     def supported_extensions(self) -> list[str]:
75 |         """Supported YAML file extensions."""
76 |         return ['.yaml', '.yml']
77 | 
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_knowledge_search.yaml:
--------------------------------------------------------------------------------
1 | # Knowledge Search Agent Configuration
2 | # Supports general internal document search, knowledge bases, and supplemental web search
3 | 
4 | name: knowledge_search
5 | description: "Searches and synthesizes information from all registered internal documents and knowledge bases."
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 | 
10 | # Priority (higher numbers are selected first)
11 | priority: 15
12 | 
13 | # Agent selection criteria
14 | selection_criteria:
15 |   keywords:
16 |     - "internal document"
17 |     - "document search"
18 |     - "materials"
19 |     - "guide"
20 |     - "regulation"
21 |     - "manual"
22 |     - "knowledge"
23 |     - "search"
24 |     - "documentation"  # Adds the generic "document" keyword
25 |   patterns:
26 |     - ".*internal.*document.*"
27 |     - ".*guide.*"
28 |     - ".*manual.*"
29 |     - ".*knowledge.*base.*"
30 |     - ".*find.*materials.*"
31 |     - ".*documentation.*"  # Adds the generic "document" pattern
32 | 
33 | # LLM configuration
34 | target_llm_config:
35 |   server_name: "openai"
36 |   override_params:
37 |     temperature: 0.2  # Lower temperature for precise answers
38 |     max_tokens: 3000
39 | 
40 | fallback_strategy: "any_available"
41 | 
42 | # Prompt configuration
43 | prompts:
44 |   system_prompt: |
45 |     You are an internal knowledge search specialist.
46 |     Answer user questions by following this process:
47 | 
48 |     1. Search registered internal documents and knowledge bases first.
49 |     2. Combine all information to deliver an accurate and complete answer.
50 | 
51 |     Key capabilities:
52 |     - Retrieve internal documentation, guides, and materials
53 |     - Clearly label sources (internal documents vs. external websites)
54 |     - Prioritize trustworthy information
55 | 
56 |   user_prompt_template: |
57 |     Question: {question}
58 | 
59 |     Internal search results:
60 |     {internal_context}
61 | 
62 |     Chat history:
63 |     {chat_history}
64 | 
65 |     Using the information above, provide a comprehensive answer to the question.
66 |     Clearly indicate whether each piece of information comes from internal or external sources.
67 |     If a previous question is relevant to the search, take it into account when building the query.
68 | 
69 | # Implementation (builtin agent)
70 | implementation: builtin.knowledge_search.KnowledgeSearchAgent
71 | 
72 | # Agent configuration
73 | config:
74 |   timeout: 60
75 |   retry_count: 2
76 |   max_context_length: 12000
77 | 
--------------------------------------------------------------------------------
/maru_lang/services/chat.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List, Optional
2 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
3 | from maru_lang.core.relation_db.models.chat import Conversation, ConversationReference
4 | from maru_lang.core.relation_db.models.documents import Document
5 | from maru_lang.core.relation_db.models.auth import User
6 | from tortoise.queryset import QuerySet
7 | from datetime import datetime
8 | from datetime import timezone
9 | 
10 | 
11 | def fetch_conversation_queryset_by_user(
12 |     user: User,
13 | ) -> QuerySet[Conversation]:
14 |     return Conversation.filter(
15 |         user=user,
16 |     ).order_by('-created_at')
17 | 
18 | 
19 | async def fetch_conversation_by_user_and_date(
20 |     user: User,
21 |     start_date: Optional[datetime] = None,
22 |     limit: int = 3,
23 | ) -> List[Conversation] | None:
24 |     """
25 |     Fetch conversations by user and date range.
26 | 
27 |     Args:
28 |         user: User object
29 |         start_date: Start date for filtering conversations (defaults to the current UTC time)
30 |         limit: Maximum number of conversations to return
31 | 
32 |     Returns:
33 |         List of Conversation objects or None
34 |     """
35 |     # Compute the default here: a datetime.now() default argument would be
36 |     # evaluated once at import time rather than at every call.
37 |     if start_date is None:
38 |         start_date = datetime.now(timezone.utc)
39 | 
40 |     conversations = await Conversation.filter(
41 |         user=user,
42 |         created_at__gte=start_date,
43 |     ).order_by(
44 |         'created_at'
45 |     ).limit(limit).all()
46 | 
47 |     return conversations if conversations else None
48 | 
49 | 
50 | async def create_conversation(
51 |     user: User,
52 |     question: str,
53 |     answer: str,
54 |     references: list[RetrieveDocument],
55 |     enhanced_question: str | None = None,
56 | ):
57 |     conversation = await Conversation.create(
58 |         user=user,
59 |         question=question,
60 |         answer=answer,
61 |         enhanced_question=enhanced_question,
62 |     )
63 | 
64 |     # Use a set to avoid creating duplicate references
65 |     seen_doc_ids = set()
66 | 
67 |     for reference in references:
68 |         # Extract document_id from metadata
69 |         doc_id = reference.metadata.get("document_id")
70 |         if not doc_id or doc_id in seen_doc_ids:
71 |             continue
72 | 
73 |         # TODO FIX: propagate the real retrieval score instead of 0
74 |         score = 0
75 |         # Ensure the document still exists
76 |         document = await Document.get_or_none(id=doc_id)
77 |         if document:
78 |             await ConversationReference.create(
79 |                 conversation=conversation,
80 |                 document=document,
81 |                 score=score,
82 |             )
83 |             seen_doc_ids.add(doc_id)
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_intent_extractor.yaml:
--------------------------------------------------------------------------------
1 | # Intent Extractor Agent Configuration
2 | # Extracts the user's intent and rewrites it into a search-ready query
3 | 
4 | name: intent_extractor
5 | description: "Analyzes conversation context to rephrase user intent into a document search query"
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 | 
10 | # Priority (used primarily as an internal helper)
11 | priority: 90
12 | 
13 | # LLM configuration
14 | target_llm_config:
15 |   server_name: "openai"
16 |   override_params:
17 |     temperature: 0.1  # Consistent intent extraction
18 |     max_tokens: 1024  # Generate a well-formed question
19 |     timeout: 12.0
20 | 
21 | fallback_strategy: "any_available"
22 | 
23 | # Prompt configuration
24 | prompts:
25 |   system_prompt: |
26 |     You are an expert at identifying the true intent behind conversations between the user and the assistant.
27 | 
28 |     Core responsibilities:
29 |     - Understand the user's real intent by considering the full context.
30 |     - Treat the most recent message as the primary source of new information.
31 |     - Rewrite the identified intent into a search-optimized question.
32 |     - Produce clear and concise questions in Korean.
33 | 
34 |     Principles:
35 |     1. Analyze the overall conversation flow and context.
36 |     2. Identify the key information the user truly wants to know.
37 |     3. Convert the intent into a concrete question suited for document retrieval.
38 |     4. Remove unnecessary rhetorical phrases or emotional language.
39 |     5. Include searchable keywords and concepts.
40 | 
41 |   user_prompt_template: |
42 |     Analyze the conversation context and the latest message, determine the user's intent, and rewrite it as a search-optimized question.
43 | 
44 | 
45 |     {history_text}
46 | 
47 | 
48 | 
49 |     {question}
50 | 
51 | 
52 |     Produce a Korean question that reflects the user's true intent and is ready for document search.
53 |     Return only the rewritten question with no additional commentary.
54 | 
55 | # Implementation (builtin agent)
56 | implementation: builtin.intent_extractor.IntentExtractorAgent
57 | 
58 | # Agent configuration
59 | config:
60 |   timeout: 12
61 |   retry_count: 1
62 | 
63 |   # Intent extraction options
64 |   extraction_config:
65 |     preserve_key_terms: true  # Preserve key terminology
66 |     remove_emotions: true  # Remove emotional expressions
67 |     optimize_for_search: true  # Optimize for search queries
68 | 
--------------------------------------------------------------------------------
/maru_lang/pipelines/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Base Pipeline - abstract base class for all pipelines.
3 | Streams progress freely through an async queue.
4 | """
5 | import asyncio
6 | from abc import ABC, abstractmethod
7 | from dataclasses import dataclass
8 | from enum import Enum
9 | from typing import AsyncGenerator, Any, Optional
10 | 
11 | 
12 | class MessageType(str, Enum):
13 |     """Message type"""
14 |     INFO = "info"
15 |     ERROR = "error"
16 |     WARNING = "warning"
17 | 
18 | 
19 | @dataclass
20 | class PipelineMessage:
21 |     """Pipeline progress message"""
22 |     message_type: MessageType
23 |     message: str
24 |     data: Any = None
25 | 
26 |     @classmethod
27 |     def info(cls, message: str, data: Any = None):
28 |         """Create an INFO message"""
29 |         return cls(message_type=MessageType.INFO, message=message, data=data)
30 | 
31 |     @classmethod
32 |     def error(cls, message: str, data: Any = None):
33 |         """Create an ERROR message"""
34 |         return cls(message_type=MessageType.ERROR, message=message, data=data)
35 | 
36 |     @classmethod
37 |     def warning(cls, message: str, data: Any = None):
38 |         """Create a WARNING message"""
39 |         return cls(message_type=MessageType.WARNING, message=message, data=data)
40 | 
41 | 
42 | @dataclass
43 | class PipelineComplete:
44 |     """Pipeline completion signal"""
45 |     data: Any = None  # Final result
46 | 
47 | 
48 | class BasePipeline(ABC):
49 |     """Base class for all pipelines - async queue based streaming"""
50 | 
51 |     def __init__(self):
52 |         self.queue: asyncio.Queue = asyncio.Queue()
53 | 
54 |     async def run(self) -> AsyncGenerator[Any, None]:
55 |         """
56 |         Run the pipeline (queue-based streaming).
57 | 
58 |         Executes process() in the background and yields
59 |         messages as they are taken from the queue.
60 | 
61 |         Yields:
62 |             PipelineMessage | PipelineComplete: progress message or completion signal
63 |         """
64 |         # Run process() in the background
65 |         task = asyncio.create_task(self.process())
66 | 
67 |         try:
68 |             while True:
69 |                 item = await self.queue.get()
70 | 
71 |                 # Check for the completion signal
72 |                 if isinstance(item, PipelineComplete):
73 |                     yield item
74 |                     break
75 | 
76 |                 yield item
77 |         finally:
78 |             # Wait for process() to finish
79 |             await task
80 | 
81 |     @abstractmethod
82 |     async def process(self):
83 |         """
84 |         Main pipeline logic (implemented by subclasses).
85 | 
86 |         Report progress with self.queue.put() and always finish
87 |         by putting a PipelineComplete() on the queue.
88 | 
89 |         Example:
90 |             await self.queue.put(PipelineMessage.info("Starting..."))
91 |             # ... processing ...
92 |             await self.queue.put(PipelineComplete(data=result))
93 |         """
94 |         pass
95 | 
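To make the queue contract concrete, here is a minimal subclass and driver; the pipeline name and messages are invented for illustration:

# Minimal BasePipeline subclass illustrating the process()/run() contract.
import asyncio

from maru_lang.pipelines.base import BasePipeline, PipelineComplete, PipelineMessage


class CountingPipeline(BasePipeline):
    async def process(self):
        for i in range(3):
            await self.queue.put(PipelineMessage.info(f"step {i}"))
        # Always finish with the completion signal, or run() never exits.
        await self.queue.put(PipelineComplete(data="finished"))


async def main() -> None:
    async for item in CountingPipeline().run():
        if isinstance(item, PipelineComplete):
            print("result:", item.data)
        else:
            print(item.message)


asyncio.run(main())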
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents_build_selector.yaml:
--------------------------------------------------------------------------------
1 | # Agent Selector Build Configuration
2 | # Configure how the agent selector behaves.
3 | # The available_agents list will be populated dynamically from loaded agents.
4 | 
5 | # Agent selector prompt configuration
6 | system_prompt: |
7 |   You are an agent selector. Analyze the user's question and pick the most appropriate specialists.
8 | 
9 |   Below is a list of available agents. Choose the ones that best fit the user's request.
10 |   Available agents:
11 |   {agent_descriptions}
12 | 
13 |   Guidelines:
14 |   - Select only the agents that are necessary to answer the question.
15 |   - Consider execution order when multiple agents are required.
16 |   - Clearly explain the reasoning behind your choices.
17 | 
18 |   Important: use agent names directly in execution_order.
19 |   Example: ["tech_prophet", "calculator"] (valid)
20 |   Example: ["use_tech_prophet", "use_calculator"] (invalid)
21 | 
22 | # User prompt template
23 | user_prompt: |
24 |   Question: {question}
25 | 
26 |   Conversation History:
27 |   {history_text}
28 | 
29 |   Prioritize the user's latest question when selecting agents. Reference the conversation history only when it helps.
30 |   The most recent message usually contains the most relevant context. Use that information to choose the right specialists.
31 |   If the history is not useful, focus solely on the question.
32 | 
33 | # LLM parameters
34 | parameters:
35 |   temperature: 0.1  # Low temperature for consistent selections
36 |   timeout: 15.0  # Timeout in seconds
37 |   max_tokens: 1000  # Maximum response tokens
38 | 
39 | # Selection policy (optional)
40 | selection_policy:
41 |   max_agents: 3  # Maximum number of agents to select
42 |   require_reasoning: true  # Require reasoning for selections
43 |   allow_parallel: true  # Allow parallel execution when possible
44 | 
45 | # Agent-specific overrides (optional)
46 | # Disable individual agents after they are auto-discovered
47 | agent_overrides:
48 |   # document_search: false  # Example: disable the built-in RAG agent
49 |   # custom_agent: false  # Example: disable a custom agent
50 | 
51 | # Fallback behavior when no agents are selected
52 | fallback_config:
53 |   # Options: 'llm_generate' (LLM responds directly) or 'static_message'
54 |   mode: "llm_generate"
55 | 
56 |   # Message to send when using static_message mode
57 |   static_message: "How can I help? Please provide more details so I can assist you."
58 | 
59 |   # Settings used when mode is 'llm_generate'
60 |   llm_generate:
61 |     system_prompt: |
62 |       You are a helpful AI assistant.
63 |       Provide clear and accurate answers to the user's question.
64 |     temperature: 0.7
65 |     max_tokens: 500
66 | 
--------------------------------------------------------------------------------
/maru_lang/utils/distribution.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Dict
2 | 
3 | def allocate_by_weight(
4 |     groups_with_weights: List[Tuple[str, float]],
5 |     max_results: int,
6 |     ensure_min_one: bool = True,
7 |     include_zero_weight_groups: bool = True,
8 | ) -> Dict[str, int]:
9 |     if max_results <= 0 or not groups_with_weights:
10 |         return {}
11 | 
12 |     # Sanitize weights (negative -> 0)
13 |     safe = [(g, (w if (isinstance(w, (int, float)) and w > 0) else 0.0))
14 |             for g, w in groups_with_weights]
15 | 
16 |     all_groups = [g for g, _ in safe]
17 |     allocations: Dict[str, int] = {g: 0 for g in all_groups}
18 | 
19 |     # Only groups with w > 0 take part in the allocation
20 |     pos = [(g, w) for g, w in safe if w > 0.0]
21 |     if not pos:
22 |         # If every weight is 0, return all zeros
23 |         return allocations if include_zero_weight_groups else {}
24 | 
25 |     remaining = max_results
26 | 
27 |     # Step 1) Guarantee at least one slot per group
28 |     if ensure_min_one:
29 |         if len(pos) <= max_results:
30 |             for g, _ in pos:
31 |                 allocations[g] = 1
32 |             remaining -= len(pos)
33 |         else:
34 |             # More positive groups than max_results: one slot each for the top weights only
35 |             top = sorted(pos, key=lambda x: x[1], reverse=True)[:max_results]
36 |             for g, _ in top:
37 |                 allocations[g] = 1
38 |             remaining = 0
39 | 
40 |     if remaining == 0:
41 |         return allocations if include_zero_weight_groups else {g: allocations[g] for g, _ in pos}
42 | 
43 |     # Step 2) Largest Remainder for the rest
44 |     total_weight = sum(w for _, w in pos)
45 |     if total_weight <= 0:
46 |         # Defensive: if the weight sum is 0, hand out remaining slots one by one in weight order
47 |         for g, _ in sorted(pos, key=lambda x: x[1], reverse=True):
48 |             if remaining <= 0:
49 |                 break
50 |             allocations[g] += 1
51 |             remaining -= 1
52 |         return allocations if include_zero_weight_groups else {g: allocations[g] for g, _ in pos}
53 | 
54 |     quotas = []
55 |     for g, w in pos:
56 |         q = remaining * (w / total_weight)
57 |         base = int(q)
58 |         frac = q - base
59 |         quotas.append((g, w, base, frac))
60 | 
61 |     used = sum(base for _, _, base, _ in quotas)
62 |     for g, _, base, _ in quotas:
63 |         allocations[g] += base
64 | 
65 |     left = remaining - used
66 |     if left > 0:
67 |         # Tie-break: frac DESC, weight DESC, name ASC
68 |         quotas_sorted = sorted(
69 |             quotas,
70 |             key=lambda t: (-t[3], -t[1], t[0])
71 |         )
72 |         for i in range(left):
73 |             g = quotas_sorted[i][0]
74 |             allocations[g] += 1
75 | 
76 |     return allocations if include_zero_weight_groups else {g: allocations[g] for g, _ in pos}
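A worked example of the allocation: with weights 0.5/0.3/0.2 and ten slots, each group first gets its guaranteed slot, the remaining seven are split by quota (3.5/2.1/1.4), and the leftover seat goes to the largest fractional remainder:

from maru_lang.utils.distribution import allocate_by_weight

alloc = allocate_by_weight(
    [("a", 0.5), ("b", 0.3), ("c", 0.2)],
    max_results=10,
)
print(alloc)  # {'a': 5, 'b': 3, 'c': 2}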
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_keyword_extractor.yaml:
--------------------------------------------------------------------------------
1 | # Keyword Extractor Agent Configuration
2 | # Extracts keywords optimized for BM25 search
3 | 
4 | name: keyword_extractor
5 | description: "Extracts core keywords optimized for BM25 retrieval"
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 | 
10 | # Priority (used primarily as an internal helper)
11 | priority: 85
12 | 
13 | # LLM configuration
14 | target_llm_config:
15 |   server_name: "openai"
16 |   override_params:
17 |     temperature: 0.1  # Keep output consistent
18 |     max_tokens: 50  # Short responses containing keywords only
19 |     timeout: 8.0  # Fast turnaround
20 | 
21 | fallback_strategy: "any_available"
22 | 
23 | # Prompt configuration
24 | prompts:
25 |   system_prompt: |
26 |     You are an expert at extracting keywords optimized for BM25 search.
27 | 
28 |     Core responsibilities:
29 |     - Extract only the most important nouns and concepts from the question.
30 |     - Select keywords that work well with the BM25 algorithm.
31 |     - Remove stopwords and terms with little value.
32 |     - Consider synonyms and closely related phrases.
33 | 
34 |     Principles:
35 |     1. Focus on the most critical nouns and key concepts.
36 |     2. Remove all stopwords (particles, endings, interrogatives, etc.).
37 |     3. Include synonyms or related terms when helpful.
38 |     4. Compress the output into 3–7 essential keywords.
39 |     5. Separate keywords with spaces.
40 |     6. Prioritize specific terms that improve search quality.
41 | 
42 |   user_prompt_template: |
43 |     Extract the BM25-optimized core keywords for the following question.
44 | 
45 |     Question: {question}
46 | 
47 |     Return only the most effective keywords for BM25 search, separated by spaces.
48 |     Provide the keywords only—no additional commentary.
49 | 
50 |     Example:
51 |     - Question: "How do I apply for vacation at our company?"
52 |     - Keywords: "company vacation application procedure process"
53 | 
54 | # Implementation (builtin agent)
55 | implementation: builtin.keyword_extractor.KeywordExtractorAgent
56 | 
57 | # Agent configuration
58 | config:
59 |   timeout: 8
60 |   retry_count: 1
61 | 
62 |   # Keyword extraction options
63 |   extraction_config:
64 |     min_keywords: 3  # Minimum number of keywords
65 |     max_keywords: 7  # Maximum number of keywords
66 |     filter_stopwords: true  # Filter stopwords
67 |     include_synonyms: true  # Consider synonyms
68 | 
69 |     # Stopword list (Korean)
70 |     stopwords:
71 |       - "어떻게"
72 |       - "무엇"
73 |       - "언제"
74 |       - "어디서"
75 |       - "왜"
76 |       - "어떤"
77 |       - "이것"
78 |       - "그것"
79 |       - "저것"
80 |       - "되나요"
81 |       - "인가요"
82 |       - "습니까"
83 |       - "하나요"
--------------------------------------------------------------------------------
/maru_lang/commands/transfer.py:
--------------------------------------------------------------------------------
1 | """
2 | Transfer command: hand over DocumentGroup manager rights
3 | """
4 | import typer
5 | from maru_lang.core.relation_db.models.documents import DocumentGroup
6 | from maru_lang.core.relation_db.models.auth import User
7 | 
8 | 
9 | async def transfer_function(
10 |     group_name: str,
11 |     new_manager_email: str,
12 |     force: bool = False,
13 | ):
14 |     """
15 |     Transfer a DocumentGroup's manager role to another user.
16 | 
17 |     Args:
18 |         group_name: Name of the DocumentGroup whose manager should change
19 |         new_manager_email: Email address of the new manager
20 |         force: Transfer without confirmation
21 |     """
22 |     # ========== 1. Check the DocumentGroup ==========
23 |     typer.echo("\n" + "=" * 50)
24 |     typer.secho("🔄 DocumentGroup manager transfer", fg=typer.colors.CYAN, bold=True)
25 |     typer.echo("=" * 50)
26 | 
27 |     group = await DocumentGroup.get_or_none(name=group_name).prefetch_related("manager")
28 |     if not group:
29 |         typer.secho(
30 |             f"❌ DocumentGroup '{group_name}' could not be found.",
31 |             fg=typer.colors.RED,
32 |         )
33 |         raise typer.Exit(1)
34 | 
35 |     # ========== 2. Check the new manager ==========
36 |     new_manager = await User.get_or_none(email=new_manager_email)
37 |     if not new_manager:
38 |         typer.secho(
39 |             f"❌ User '{new_manager_email}' could not be found.",
40 |             fg=typer.colors.RED,
41 |         )
42 |         raise typer.Exit(1)
43 | 
44 |     # ========== 3. Print current manager info ==========
45 |     current_manager = group.manager
46 |     if current_manager:
47 |         typer.echo(f"\nCurrent manager: {current_manager.name} ({current_manager.email})")
48 |     else:
49 |         typer.echo("\nCurrent manager: none")
50 | 
51 |     typer.echo(f"New manager: {new_manager.name} ({new_manager.email})")
52 | 
53 |     # The user is already the manager of this group
54 |     if current_manager and current_manager.id == new_manager.id:
55 |         typer.secho(
56 |             f"\n⚠️ '{new_manager_email}' is already the manager of this group.",
57 |             fg=typer.colors.YELLOW,
58 |         )
59 |         raise typer.Exit(0)
60 | 
61 |     # ========== 4. Confirm ==========
62 |     if not force:
63 |         typer.echo("\n" + "=" * 50)
64 |         confirm = typer.confirm(
65 |             f"\nChange the manager of '{group_name}' to '{new_manager_email}'?"
66 |         )
67 |         if not confirm:
68 |             typer.secho("\n❌ Transfer cancelled.", fg=typer.colors.RED)
69 |             raise typer.Exit(0)
70 | 
71 |     # ========== 5. Change the manager ==========
72 |     group.manager = new_manager
73 |     await group.save()
74 | 
75 |     # ========== Done ==========
76 |     typer.echo("\n" + "=" * 50)
77 |     typer.secho("✅ Manager transfer complete!", fg=typer.colors.GREEN, bold=True)
78 |     typer.echo("=" * 50)
79 |     typer.echo(f"DocumentGroup: {group_name}")
80 |     typer.echo(f"New manager: {new_manager.name} ({new_manager.email})")
81 |     typer.echo()
82 | 
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/rag_config.yaml:
--------------------------------------------------------------------------------
1 | # RAG Configuration
2 | # Configure retriever defaults and per-group RAG settings
3 | 
4 | # ============================================================
5 | # Retriever global settings
6 | # ============================================================
7 | retriever:
8 |   # Default search parameters
9 |   default_k: 20  # Number of search results to return
10 |   default_method: "vector"  # Choose among vector, bm25, or ensemble
11 | 
12 |   # Query-type weights
13 |   # - cosine_weight: weight of vector similarity search
14 |   # - bm25_weight: weight of BM25 keyword search
15 |   query_type_weights:
16 |     factual:  # Fact-based question (e.g., "What is ...")
17 |       cosine_weight: 0.2
18 |       bm25_weight: 0.8
19 |     procedural:  # How-to question (e.g., "How do I ...")
20 |       cosine_weight: 0.8
21 |       bm25_weight: 0.2
22 |     analytical:  # Comparative/analytical question (e.g., "Compare A and B")
23 |       cosine_weight: 0.5
24 |       bm25_weight: 0.5
25 | 
26 |   # Representative queries for automatic query-type classification
27 |   # Used to determine which query type the user's question resembles
28 |   representative_queries:
29 |     factual: "What is ..."
30 |     procedural: "How to ..."
31 |     analytical: "Compare A and B"
32 | 
33 |   # Fallback logic configuration
34 |   # Applied when the query type cannot be determined reliably
35 |   fallback_logic:
36 |     similarity_threshold: 0.3  # Use fallback when similarity drops below this value
37 |     short_query_length: 2  # Short query threshold (word count)
38 |     long_query_length: 6  # Long query threshold (word count)
39 |     weights:
40 |       short_query:  # Short query (≤ 2 words)
41 |         cosine_weight: 0.3
42 |         bm25_weight: 0.7
43 |       medium_query:  # Medium-length query (3–5 words)
44 |         cosine_weight: 0.5
45 |         bm25_weight: 0.5
46 |       long_query:  # Long query (≥ 6 words)
47 |         cosine_weight: 0.7
48 |         bm25_weight: 0.3
49 | 
50 | # ============================================================
51 | # Group-specific RAG configuration
52 | # ============================================================
53 | groups:
54 |   # Example: Python documentation group
55 |   # python_docs:
56 |   #   description: "Official Python documentation and tutorials"
57 |   #
58 |   #   # Optional overrides for pluggable components (per-group customization)
59 |   #   components:
60 |   #     loader: "markdown"  # Loader to use for this group
61 |   #     chunker: "sentence"  # Chunker to use for this group
62 |   #     embedding_model: "BAAI/bge-m3"  # Embedding model to use for this group
63 | 
64 |   # Example: General documents group
65 |   # general_docs:
66 |   #   description: "General information and documents"
67 | 
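The weight tables above imply a score-blending step in the retriever. The retriever implementation is not part of this excerpt, so the following is only a sketch of how such weights are typically applied; the function name and normalization strategy are assumptions:

# Assumed ensemble scoring: blend vector-similarity and BM25 scores using
# the per-query-type weights from rag_config.yaml.
def blend_scores(
    cosine_scores: dict[str, float],
    bm25_scores: dict[str, float],
    cosine_weight: float,
    bm25_weight: float,
) -> dict[str, float]:
    """Weighted sum over the union of candidate chunk IDs."""
    ids = set(cosine_scores) | set(bm25_scores)
    return {
        doc_id: cosine_weight * cosine_scores.get(doc_id, 0.0)
        + bm25_weight * bm25_scores.get(doc_id, 0.0)
        for doc_id in ids
    }


# 'factual' weights from the config: cosine 0.2, bm25 0.8
ranked = blend_scores({"c1": 0.9, "c2": 0.4}, {"c1": 0.2, "c3": 0.7}, 0.2, 0.8)
print(sorted(ranked.items(), key=lambda kv: kv[1], reverse=True))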
--------------------------------------------------------------------------------
/maru_lang/core/vector_db/retrieve_document.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 | from datetime import datetime
3 | from pydantic import BaseModel, Field, computed_field
4 | 
5 | 
6 | class RetrieveDocument(BaseModel):
7 |     id: str
8 |     page_content: str
9 |     metadata: dict[str, Any] = Field(default_factory=dict)
10 | 
11 |     @computed_field
12 |     @property
13 |     def source(self) -> str:
14 |         return self.metadata.get("document_name", "unknown source")
15 | 
16 |     @computed_field
17 |     @property
18 |     def code(self) -> str:
19 |         return self.metadata.get("document_code", "unknown")
20 | 
21 |     @computed_field
22 |     @property
23 |     def page(self) -> int:
24 |         return self.metadata.get("number", 1)
25 | 
26 |     def __repr__(self):
27 |         preview = self.page_content[:60].replace("\n", " ")
28 |         if len(self.page_content) > 60:
29 |             preview += "..."
30 |         return f"RetrieveDocument(id='{self.id}', page_content='{preview}', metadata={self.metadata})"
31 | 
32 |     def to_dict(self) -> dict:
33 |         return self.model_dump()
34 | 
35 |     def to_reference_response(self) -> dict:
36 |         """Convert into the ReferenceResponse shape."""
37 |         return {
38 |             "source": self.source,
39 |             "code": self.code,
40 |             "page": self.page,
41 |             "page_content": self.page_content,
42 |             "metadata": self.metadata
43 |         }
44 | 
45 |     def pretty(self) -> str:
46 |         """Render a user-friendly formatted string."""
47 |         preview = self.page_content.strip().replace("\n", " ")
48 |         if len(preview) > 50:
49 |             preview = preview[:50] + "..."
50 | 
51 |         filtered_meta = {
52 |             k: v for k, v in self.metadata.items()
53 |             if v not in (None, "", [], {}, "null", "None")
54 |         }
55 | 
56 |         meta_lines = "\n".join(
57 |             f"   {k}: {v}" for k, v in filtered_meta.items())
58 | 
59 |         return (
60 |             f"\n🧩 RetrieveDocument(id={self.id})\n"
61 |             f"📄 Content Preview: {preview}\n"
62 |             f"📎 Metadata:\n{meta_lines}\n"
63 |         )
64 | 
65 |     @staticmethod
66 |     def sort_by_date(documents: list['RetrieveDocument']) -> list['RetrieveDocument']:
67 |         """Sort documents by date (newest first)."""
68 | 
69 |         def parse_date(date_str: str) -> datetime:
70 |             try:
71 |                 return datetime.strptime(date_str, "%Y%m%d")
72 |             except (ValueError, TypeError):
73 |                 return datetime.min
74 | 
75 |         def get_document_date(doc: 'RetrieveDocument') -> datetime:
76 |             update_date = doc.metadata.get("UpdateDate", "")
77 |             creation_date = doc.metadata.get("CreationDate", "")
78 | 
79 |             if update_date:
80 |                 return parse_date(update_date)
81 |             elif creation_date:
82 |                 return parse_date(creation_date)
83 |             return datetime.min
84 | 
85 |         return sorted(documents, key=get_document_date, reverse=True)
86 | 
--------------------------------------------------------------------------------
/maru_lang/utils/document.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import hashlib
3 | import time
4 | import random
5 | import uuid
6 | 
7 | 
8 | def new_ulid() -> str:
9 |     """
10 |     Generate a time-sortable identifier.
11 | 
12 |     - Use ``uuid.uuid7`` when available (Python 3.14+).
13 |     - Otherwise, fall back to a ULID-style implementation.
14 |     """
15 |     # Detect uuid7 support at runtime
16 |     if hasattr(uuid, 'uuid7'):
17 |         return str(uuid.uuid7())
18 | 
19 |     # ULID fallback implementation
20 |     # Format: 26 characters (10 timestamp + 16 randomness)
21 |     timestamp_ms = int(time.time() * 1000)
22 |     randomness = random.getrandbits(80)
23 | 
24 |     # Crockford's Base32 alphabet
25 |     alphabet = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
26 | 
27 |     # Encode the timestamp (10 characters)
28 |     ts_encoded = ""
29 |     ts = timestamp_ms
30 |     for _ in range(10):
31 |         ts_encoded = alphabet[ts & 0x1F] + ts_encoded
32 |         ts >>= 5
33 | 
34 |     # Encode the randomness (16 characters)
35 |     rand_encoded = ""
36 |     rand = randomness
37 |     for _ in range(16):
38 |         rand_encoded = alphabet[rand & 0x1F] + rand_encoded
39 |         rand >>= 5
40 | 
41 |     return ts_encoded + rand_encoded
42 | 
43 | 
44 | def canonicalize_text(s: str) -> str:
45 |     return " ".join((s or "").split()).lower()
46 | 
47 | 
48 | def make_source_fingerprint_for_file(file_path: str, size: int, mtime_ns: int) -> str:
49 |     """
50 |     Generate a fingerprint that captures changes to file contents and location.
51 | 
52 |     Args:
53 |         file_path: Full file path (used to distinguish same files in different locations).
54 |         size: File size in bytes.
55 |         mtime_ns: Modification time in nanoseconds.
56 | 
57 |     Returns:
58 |         str: 32-character SHA256 hash.
59 | 
60 |     Note:
61 |         The full file path is included to allow the same file to exist in different
62 |         directories as separate documents. This handles cases where:
63 |         - Files are copied to multiple locations
64 |         - Folder names are changed (creating a new document context)
65 |         - Backup or versioned copies exist in different paths
66 |     """
67 |     raw = f"{file_path.lower()}|{size}|{mtime_ns}"
68 |     return hashlib.sha256(raw.encode()).hexdigest()[:32]  # 128-bit prefix
69 | 
70 | def make_chunk_uid(document_id: str, number: int, content: str) -> str:
71 |     raw = f"{document_id}|{number}|{canonicalize_text(content)}"
72 |     d = hashlib.sha256(raw.encode()).digest()
73 |     return base64.b32encode(d).decode("ascii").rstrip("=").lower()[:26]
74 | 
75 | 
76 | def make_embed_id(chunk_uid: str, model_name: str, dim: int, normalize_ver: str, pooling: str, lang_hint: str | None = None) -> str:
77 |     raw = "|".join([chunk_uid, model_name, str(
78 |         dim), normalize_ver, pooling, lang_hint or ""])
79 |     d = hashlib.sha256(raw.encode()).digest()
80 |     return base64.b32encode(d).decode("ascii").rstrip("=").lower()[:26]
81 | 
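A small sketch tying the helpers above together, mirroring how an ingest step might derive stable IDs; the file path is hypothetical:

# Illustrative only: derive a source fingerprint and a chunk UID for one file.
import os

from maru_lang.utils.document import make_chunk_uid, make_source_fingerprint_for_file, new_ulid

path = "/data/docs/handbook.txt"  # hypothetical file
st = os.stat(path)

fingerprint = make_source_fingerprint_for_file(path, st.st_size, st.st_mtime_ns)
document_id = new_ulid()  # time-sortable document identifier
chunk_uid = make_chunk_uid(document_id, 0, "First   chunk of TEXT")
# canonicalize_text() inside make_chunk_uid lowercases and collapses whitespace,
# so trivially reformatted chunks map to the same UID.
print(fingerprint, document_id, chunk_uid)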
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/xlsx_parser.py:
--------------------------------------------------------------------------------
1 | """Excel file parser."""
2 | 
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 | 
6 | 
7 | class XLSXParser(BaseParser):
8 |     """Excel file parser (uses openpyxl)"""
9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         Extract text from an XLSX file.
13 | 
14 |         Args:
15 |             file_path: Path of the XLSX file to parse
16 | 
17 |         Returns:
18 |             ParseResult: Parsed text and metadata
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"File not found: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 from openpyxl import load_workbook
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "openpyxl is not installed. Install it with 'pip install openpyxl'."
29 |                 )
30 | 
31 |             # data_only=True reads computed values instead of formulas
32 |             workbook = load_workbook(file_path, data_only=True)
33 | 
34 |             # Extract data sheet by sheet
35 |             sheets_text = []
36 |             for sheet_name in workbook.sheetnames:
37 |                 sheet = workbook[sheet_name]
38 |                 sheet_content = [f"=== Sheet: {sheet_name} ==="]
39 | 
40 |                 # Read every row
41 |                 rows_data = []
42 |                 for row in sheet.iter_rows(values_only=True):
43 |                     # Skip empty rows
44 |                     if all(cell is None or str(cell).strip() == '' for cell in row):
45 |                         continue
46 | 
47 |                     # Convert cell values to strings
48 |                     row_text = ' | '.join(
49 |                         str(cell) if cell is not None else '' for cell in row
50 |                     )
51 |                     rows_data.append(row_text)
52 | 
53 |                 if rows_data:
54 |                     sheet_content.extend(rows_data)
55 |                 else:
56 |                     sheet_content.append("(empty sheet)")
57 | 
58 |                 sheets_text.append('\n'.join(sheet_content))
59 | 
60 |             content = '\n\n'.join(sheets_text)
61 | 
62 |             metadata = {
63 |                 'file_type': 'xlsx',
64 |                 'num_sheets': len(workbook.sheetnames),
65 |                 'sheet_names': workbook.sheetnames,
66 |                 'file_size': file_path.stat().st_size,
67 |             }
68 | 
69 |             # Active sheet information
70 |             if workbook.active:
71 |                 metadata['active_sheet'] = workbook.active.title
72 | 
73 |             return ParseResult(content=content, metadata=metadata)
74 | 
75 |         except Exception as e:
76 |             raise ValueError(f"XLSX parsing failed: {file_path}") from e
77 | 
78 |     def supports(self, file_path: Path) -> bool:
79 |         """Check whether the file extension is a supported Excel extension."""
80 |         return file_path.suffix.lower() in self.supported_extensions
81 | 
82 |     @property
83 |     def supported_extensions(self) -> list[str]:
84 |         """Supported Excel file extensions."""
85 |         return ['.xlsx', '.xlsm']
86 | 
--------------------------------------------------------------------------------
/maru_lang/core/vector_db/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any
3 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
4 | 
5 | 
6 | class VectorDB(ABC):
7 | 
8 |     @abstractmethod
9 |     def drop_collection(self) -> None:
10 |         pass
11 | 
12 |     @abstractmethod
13 |     def add_documents(self, documents: list[dict]) -> None:
14 |         pass
15 | 
16 |     @abstractmethod
17 |     def sync_documents(self) -> None:
18 |         pass
19 | 
20 |     @abstractmethod
21 |     def has_document(self, doc_id: str) -> bool:
22 |         pass
23 | 
24 |     @abstractmethod
25 |     def update_document(self, doc_id: str, new_doc_id: str, new_content: str) -> None:
26 |         pass
27 | 
28 |     @abstractmethod
29 |     def delete_document(self, doc_id: str) -> None:
30 |         pass
31 | 
32 |     @abstractmethod
33 |     def delete_all_chunks_by_document_id(self, document_id: str) -> int:
34 |         """Delete every chunk belonging to the given document ID.
35 | 
36 |         Args:
37 |             document_id: ID of the document whose chunks should be deleted
38 | 
39 |         Returns:
40 |             Number of deleted chunks
41 |         """
42 |         pass
43 | 
44 |     @abstractmethod
45 |     def count_documents(self) -> int:
46 |         pass
47 | 
48 |     @abstractmethod
49 |     def get_all_metadata(self) -> list[dict]:
50 |         pass
51 | 
52 |     @abstractmethod
53 |     def get_documents(self, document_ids: list[str]) -> list[RetrieveDocument]:
54 |         pass
55 | 
56 |     @abstractmethod
57 |     def get_all_documents(
58 |         self,
59 |         version_ids: list[str] | None = None
60 |     ) -> list[RetrieveDocument]:
61 |         """
62 |         Get all documents from the VectorDB with an optional version filter.
63 | 
64 |         Args:
65 |             version_ids: Optional list of version IDs to filter
66 | 
67 |         Returns:
68 |             List of all documents (or filtered by version)
69 |         """
70 |         pass
71 | 
72 |     @abstractmethod
73 |     def similarity_search(
74 |         self,
75 |         query_embedding: list[float],
76 |         k: int,
77 |         version_ids: list[str] | None = None,
78 |         **kwargs: dict[str, Any]
79 |     ) -> list[RetrieveDocument]:
80 |         """
81 |         Vector similarity search using a query embedding.
82 | 
83 |         Args:
84 |             query_embedding: Query embedding vector
85 |             k: Number of results to return
86 |             version_ids: Optional list of version IDs to filter
87 |             **kwargs: Additional search parameters
88 | 
89 |         Returns:
90 |             List of retrieved documents
91 |         """
92 |         pass
93 | 
94 |     @abstractmethod
95 |     def health_check(self) -> bool:
96 |         """
97 |         VectorDB health check (verifies connectivity and accessibility).
98 | 
99 |         Returns:
100 |             bool: Whether the health check passed
101 | 
102 |         Raises:
103 |             Exception: Detailed error when the health check fails
104 |         """
105 |         pass
106 | 
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/csv_parser.py:
--------------------------------------------------------------------------------
1 | """CSV file parser."""
2 | 
3 | import csv
4 | from pathlib import Path
5 | from .base import BaseParser, ParseResult
6 | 
7 | 
8 | class CSVParser(BaseParser):
9 |     """CSV file parser"""
10 | 
11 |     def parse(self, file_path: Path) -> ParseResult:
12 |         """
13 |         Read a CSV file and convert it into formatted text.
14 | 
15 |         Args:
16 |             file_path: Path of the CSV file to parse
17 | 
18 |         Returns:
19 |             ParseResult: Parsed text and metadata
20 |         """
21 |         if not file_path.exists():
22 |             raise FileNotFoundError(f"File not found: {file_path}")
23 | 
24 |         try:
25 |             with open(file_path, 'r', encoding='utf-8') as f:
26 |                 # Auto-detect the CSV dialect
27 |                 sample = f.read(1024)
28 |                 f.seek(0)
29 |                 sniffer = csv.Sniffer()
30 | 
31 |                 try:
32 |                     dialect = sniffer.sniff(sample)
33 |                     has_header = sniffer.has_header(sample)
34 |                 except csv.Error:
35 |                     # Fall back to defaults when detection fails
36 |                     dialect = csv.excel
37 |                     has_header = True
38 | 
39 |                 reader = csv.reader(f, dialect=dialect)
40 |                 rows = list(reader)
41 | 
42 |             if not rows:
43 |                 raise ValueError("CSV file is empty")
44 | 
45 |             # Format as a table
46 |             if has_header and len(rows) > 1:
47 |                 headers = rows[0]
48 |                 data_rows = rows[1:]
49 | 
50 |                 # Show the header separately from the data
51 |                 content_lines = [
52 |                     f"Headers: {', '.join(headers)}",
53 |                     "=" * 80,
54 |                 ]
55 | 
56 |                 for row in data_rows:
57 |                     content_lines.append(' | '.join(str(cell) for cell in row))
58 |             else:
59 |                 # No header row
60 |                 content_lines = []
61 |                 for row in rows:
62 |                     content_lines.append(' | '.join(str(cell) for cell in row))
63 | 
64 |             content = '\n'.join(content_lines)
65 | 
66 |             metadata = {
67 |                 'file_type': 'csv',
68 |                 'encoding': 'utf-8',
69 |                 'file_size': file_path.stat().st_size,
70 |                 'num_rows': len(rows),
71 |                 'num_columns': len(rows[0]) if rows else 0,
72 |                 'has_header': has_header,
73 |             }
74 | 
75 |             return ParseResult(content=content, metadata=metadata)
76 | 
77 |         except UnicodeDecodeError as e:
78 |             raise ValueError(f"UTF-8 encoding error: {file_path}") from e
79 |         except Exception as e:
80 |             raise ValueError(f"CSV parsing failed: {file_path}") from e
81 | 
82 |     def supports(self, file_path: Path) -> bool:
83 |         """Check whether the file extension is a supported CSV extension."""
84 |         return file_path.suffix.lower() in self.supported_extensions
85 | 
86 |     @property
87 |     def supported_extensions(self) -> list[str]:
88 |         """Supported CSV file extensions."""
89 |         return ['.csv', '.tsv']
90 | 
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/pptx_parser.py:
--------------------------------------------------------------------------------
"""PowerPoint file parser."""

from pathlib import Path
from .base import BaseParser, ParseResult


class PPTXParser(BaseParser):
    """PowerPoint file parser (uses python-pptx)"""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Extract text from a PPTX file.

        Args:
            file_path: Path of the PPTX file to parse

        Returns:
            ParseResult: Parsed text and metadata
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        # Import lazily, outside the parsing try-block, so a missing
        # dependency surfaces as an ImportError instead of being masked
        # by the generic "failed to parse" handler below
        try:
            from pptx import Presentation
        except ImportError as e:
            raise ImportError(
                "python-pptx is not installed. Install it with 'pip install python-pptx'."
            ) from e

        try:
            prs = Presentation(file_path)

            # Extract text slide by slide
            slides_text = []
            for idx, slide in enumerate(prs.slides, 1):
                slide_content = [f"=== Slide {idx} ==="]

                # Extract text from every shape on the slide
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())

                    # Handle tables, if present
                    if shape.has_table:
                        table = shape.table
                        for row in table.rows:
                            row_text = ' | '.join(cell.text.strip() for cell in row.cells)
                            if row_text.strip():
                                slide_content.append(row_text)

                # Extract slide notes
                if slide.has_notes_slide:
                    notes_text = slide.notes_slide.notes_text_frame.text.strip()
                    if notes_text:
                        slide_content.append(f"Notes: {notes_text}")

                slides_text.append('\n'.join(slide_content))

            content = '\n\n'.join(slides_text)

            metadata = {
                'file_type': 'pptx',
                'num_slides': len(prs.slides),
                'file_size': file_path.stat().st_size,
            }

            # Slide size information
            if prs.slide_width and prs.slide_height:
                metadata['slide_width'] = prs.slide_width
                metadata['slide_height'] = prs.slide_height

            return ParseResult(content=content, metadata=metadata)

        except Exception as e:
            raise ValueError(f"Failed to parse PPTX: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Check whether the file extension is a supported PPTX type"""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Supported PowerPoint file extensions"""
        return ['.pptx']
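
# Dispatch sketch (illustrative; parser selection is actually driven by the
# loader config, and the registry below is hypothetical):
#
#     PARSERS = [CSVParser(), PPTXParser()]
#
#     def pick_parser(path: Path):
#         return next((p for p in PARSERS if p.supports(path)), None)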
--------------------------------------------------------------------------------
/maru_lang/models/vector_db.py:
--------------------------------------------------------------------------------
"""
VectorDB configuration models
"""
from dataclasses import dataclass, field
from typing import Optional, Union


# ========== VectorDB Config (inheritance-based) ==========

@dataclass
class BaseVectorDBConfig:
    """Base VectorDB settings (shared by all VectorDB backends)"""
    db_type: str


@dataclass
class ChromaDBConfig(BaseVectorDBConfig):
    """ChromaDB-specific settings"""
    persist_dir: str = field(default="")
    collection_name: str = field(default="")
    db_type: str = field(default="chromadb", init=False)

    @classmethod
    def from_settings(cls) -> "ChromaDBConfig":
        """Create the default ChromaDB settings from Settings"""
        from maru_lang.configs.system_config import get_system_config
        config = get_system_config()
        return cls(
            persist_dir=config.vector_db.chroma.get_persist_dir_absolute(),
            collection_name=config.vector_db.default_collection_name,
        )


@dataclass
class MilvusConfig(BaseVectorDBConfig):
    """Milvus-specific settings"""
    host: str = field(default="localhost")
    port: int = field(default=19530)
    user: str = field(default="root")
    password: str = field(default="Milvus")
    collection_name: str = field(default="")
    db_type: str = field(default="milvus", init=False)

    @classmethod
    def from_settings(cls) -> "MilvusConfig":
        """Create the default Milvus settings from Settings"""
        from maru_lang.configs.system_config import get_system_config
        config = get_system_config()
        return cls(
            host=config.vector_db.milvus.host,
            port=config.vector_db.milvus.port,
            user=config.vector_db.milvus.user,
            password=config.vector_db.milvus.password,
            collection_name=config.vector_db.default_collection_name,
        )


@dataclass
class PineconeConfig(BaseVectorDBConfig):
    """Pinecone-specific settings (future extension)"""
    api_key: str = field(default="")
    environment: str = field(default="")
    index_name: str = field(default="")
    db_type: str = field(default="pinecone", init=False)


def get_vector_db_config_from_settings() -> Union[ChromaDBConfig, MilvusConfig]:
    """
    Return the VectorDB config matching vector_db.type in system_config.yaml

    Returns:
        ChromaDBConfig or MilvusConfig: Config object for the configured VectorDB type

    Raises:
        ValueError: When the VectorDB type is not supported
    """
    from maru_lang.configs.system_config import get_system_config
    config = get_system_config()

    db_type = config.vector_db.type.lower()

    if db_type == "chroma":
        return ChromaDBConfig.from_settings()
    elif db_type == "milvus":
        return MilvusConfig.from_settings()
    else:
        raise ValueError(
            f"Unsupported vector_db.type: {db_type}. "
            f"Supported types: 'chroma', 'milvus'"
        )
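
# Usage sketch (illustrative): dispatch on the configured backend type.
#
#     config = get_vector_db_config_from_settings()
#     if isinstance(config, ChromaDBConfig):
#         print("chroma dir:", config.persist_dir)
#     else:
#         print("milvus at:", f"{config.host}:{config.port}")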
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/chunker_config.py:
--------------------------------------------------------------------------------
"""
Chunker configuration loader
"""
from typing import Dict, Any, Optional
from maru_lang.configs.base import DefaultConfigLoader
from maru_lang.pluggable.models import ChunkerConfig
from maru_lang.enums.configs import ConfigType


class ChunkerConfigLoader(DefaultConfigLoader[ChunkerConfig]):
    """Loader for chunker configurations"""

    def __init__(self):
        super().__init__(ConfigType.CHUNKERS)
        # Chunkers use only the user config, with no base config
        # (to force explicit configuration)

    def load_all(self) -> Dict[str, ChunkerConfig]:
        """Load configurations from user directory only (no base)"""
        import logging
        logger = logging.getLogger(__name__)

        self.configs = {}
        self._base_configs = {}

        # Load the user config only (no base) - read one specific file
        logger.info(f"Loading {self.config_type} configurations from user directory...")

        # Read only chunker_config.yaml (skip user-defined chunker .py files)
        config_file = self.user_dir / "chunker_config.yaml"
        if config_file.exists():
            if self._load_file(config_file, is_user=True):
                logger.info(f"Loaded chunker config from {config_file}")
            else:
                logger.warning(f"Failed to load chunker config from {config_file}")
        else:
            logger.warning(f"Chunker config file not found: {config_file}")

        logger.info(
            f"Loaded {len(self.configs)} {self.config_type} configs"
        )

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[ChunkerConfig]:
        """Parse chunker configuration data"""
        try:
            return ChunkerConfig(
                chunkers=data.get("chunkers", {}),
                source_path=source_path,
                is_override=is_user,
            )
        except Exception as e:
            import sys

            error_msg = f"Error parsing chunker config from {source_path}: {e}"
            print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
            return None

    def get_config_name(self, config: ChunkerConfig) -> str:
        """Get the name of a chunker configuration"""
        # Single config file, so use a fixed name
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Validate chunker configuration data"""
        # No required fields, so valid by default
        return True

    def get_merged_config(self) -> ChunkerConfig:
        """
        Get merged configuration (base + user override)

        Returns:
            Merged ChunkerConfig with user overrides applied
        """
        # Base config
        base = self.configs.get("config")
        if not base:
            # Return default if no config found
            return ChunkerConfig()

        return base
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/embedder_config.py:
--------------------------------------------------------------------------------
"""
Embedder configuration loader
"""
from typing import Dict, Any, Optional
from maru_lang.configs.base import DefaultConfigLoader
from maru_lang.pluggable.models import EmbedderConfig
from maru_lang.enums.configs import ConfigType


class EmbedderConfigLoader(DefaultConfigLoader[EmbedderConfig]):
    """Loader for embedder configurations"""

    def __init__(self):
        super().__init__(ConfigType.EMBEDDERS)
        # Embedders use only the user config, with no base config
        # (to force explicit configuration)

    def load_all(self) -> Dict[str, EmbedderConfig]:
        """Load configurations from user directory only (no base)"""
        import logging
        logger = logging.getLogger(__name__)

        self.configs = {}
        self._base_configs = {}

        # Load the user config only (no base) - read one specific file
        logger.info(f"Loading {self.config_type} configurations from user directory...")

        # Read only embedder_config.yaml
        config_file = self.user_dir / "embedder_config.yaml"
        if config_file.exists():
            if self._load_file(config_file, is_user=True):
                logger.info(f"Loaded embedder config from {config_file}")
            else:
                logger.warning(f"Failed to load embedder config from {config_file}")
        else:
            logger.warning(f"Embedder config file not found: {config_file}")

        logger.info(
            f"Loaded {len(self.configs)} {self.config_type} configs"
        )

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[EmbedderConfig]:
        """Parse embedder configuration data"""
        try:
            # The 'models' field is ignored for backward compatibility (deprecated)
            return EmbedderConfig(
                default_model=data.get("default_model"),
                device=data.get("device"),
                source_path=source_path,
                is_override=is_user,
            )
        except Exception as e:
            import sys

            error_msg = f"Error parsing embedder config from {source_path}: {e}"
            print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
            return None

    def get_config_name(self, config: EmbedderConfig) -> str:
        """Get the name of an embedder configuration"""
        # Single config file, so use a fixed name
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Validate embedder configuration data"""
        # No required fields, so valid by default
        return True

    def get_merged_config(self) -> EmbedderConfig:
        """
        Get merged configuration (base + user override)

        Returns:
            Merged EmbedderConfig with user overrides applied
        """
        # Base config
        base = self.configs.get("config")
        if not base:
            # Return default if no config found
            return EmbedderConfig()

        return base
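
# Example embedder_config.yaml (illustrative; the keys mirror parse_config
# above, and the values are hypothetical):
#
#     default_model: sentence-transformers/all-MiniLM-L6-v2
#     device: cpu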
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/loader_config.py:
--------------------------------------------------------------------------------
"""
Loader (parser) configuration loader
"""
from typing import Dict, Any, Optional
from maru_lang.configs.base import DefaultConfigLoader
from maru_lang.pluggable.models import LoaderConfig
from maru_lang.enums.configs import ConfigType


class LoaderConfigLoader(DefaultConfigLoader[LoaderConfig]):
    """Loader for loader (parser) configurations"""

    def __init__(self):
        super().__init__(ConfigType.LOADERS)
        # Loaders use only the user config, with no base config
        # (to force explicit configuration)

    def load_all(self) -> Dict[str, LoaderConfig]:
        """Load configurations from user directory only (no base)"""
        import logging
        logger = logging.getLogger(__name__)

        self.configs = {}
        self._base_configs = {}

        # Load the user config only (no base) - read one specific file
        logger.info(f"Loading {self.config_type} configurations from user directory...")

        # Read only loader_config.yaml (skip user-defined parser .py files)
        config_file = self.user_dir / "loader_config.yaml"
        if config_file.exists():
            if self._load_file(config_file, is_user=True):
                logger.info(f"Loaded loader config from {config_file}")
            else:
                logger.warning(f"Failed to load loader config from {config_file}")
        else:
            logger.warning(f"Loader config file not found: {config_file}")

        logger.info(
            f"Loaded {len(self.configs)} {self.config_type} configs"
        )

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[LoaderConfig]:
        """Parse loader configuration data"""
        try:
            return LoaderConfig(
                default_loader=data.get("default_loader", "txt"),
                default_chunker=data.get("default_chunker", "paragraph"),
                extensions=data.get("extensions", {}),
                source_path=source_path,
                is_override=is_user,
            )
        except Exception as e:
            import sys

            error_msg = f"Error parsing loader config from {source_path}: {e}"
            print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
            return None

    def get_config_name(self, config: LoaderConfig) -> str:
        """Get the name of a loader configuration"""
        # Single config file, so use a fixed name
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Validate loader configuration data"""
        # No required fields, so valid by default
        return True

    def get_merged_config(self) -> LoaderConfig:
        """
        Get merged configuration (base + user override)

        Returns:
            Merged LoaderConfig with user overrides applied
        """
        # Base config
        base = self.configs.get("config")
        if not base:
            # Return default if no config found
            return LoaderConfig()

        return base
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/auth.py:
--------------------------------------------------------------------------------
from __future__ import annotations
import datetime
from tortoise.models import Model
from tortoise import fields


class User(Model):
    id = fields.IntField(pk=True)
    name = fields.CharField(max_length=255, index=True, null=True)
    email = fields.CharField(max_length=255, index=True, unique=True)
    role = fields.ForeignKeyField(
        "models.UserRole", related_name="users", null=True)
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "user"


class UserGroup(Model):
    id = fields.IntField(pk=True)
    name = fields.CharField(max_length=255, unique=True)
    manager = fields.ForeignKeyField(
        "models.User",
        related_name="managed_user_groups",
        on_delete=fields.RESTRICT  # Prevents User deletion if managing UserGroups
    )
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "user_group"


class UserGroupMembership(Model):
    user = fields.ForeignKeyField(
        "models.User",
        related_name="group_memberships")
    group = fields.ForeignKeyField(
        "models.UserGroup",
        related_name="members")

    class Meta:
        table = "user_group_membership"


class UserGroupInclusion(Model):
    parent = fields.ForeignKeyField(
        "models.UserGroup", related_name="includes")  # Parent (containing) group
    child = fields.ForeignKeyField(
        "models.UserGroup", related_name="included_by")  # Child (contained) group

    class Meta:
        table = "user_group_inclusion"


class OTP(Model):
    id = fields.IntField(pk=True)
    email = fields.CharField(max_length=255, index=True)
    code = fields.CharField(max_length=6)
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "otp"

    async def is_valid(self):
        """Check that the verification code is less than 5 minutes old."""
        expiration_time = self.created_at + datetime.timedelta(minutes=5)
        return expiration_time > datetime.datetime.now(datetime.timezone.utc)


class UserToken(Model):
    id = fields.IntField(pk=True)
    user_id = fields.CharField(max_length=255, index=True)  # Unique user ID
    device_id = fields.CharField(max_length=255, index=True)  # Unique device ID
    jwt_token = fields.TextField()  # JWT token
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "user_token"


class RefreshToken(Model):
    id = fields.IntField(pk=True)
    user_id = fields.CharField(max_length=255, index=True)
    device_id = fields.CharField(max_length=255, index=True)
    refresh_token = fields.TextField()  # Issued refresh token string
    created_at = fields.DatetimeField(auto_now_add=True)
    expires_at = fields.DatetimeField()

    class Meta:
        table = "refresh_token"


class UserRole(Model):
    id = fields.IntField(pk=True)
    name = fields.CharField(max_length=255, index=True, unique=True)
    description = fields.TextField(null=True)

    class Meta:
        table = "user_role"
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/llm_reranker.yaml:
--------------------------------------------------------------------------------
# LLM Reranker Agent Configuration
# Agent that uses LLM to rerank search results based on relevance to the query
# NOTE: This is a utility agent, not selectable by agent_selector

name: llm_reranker
description: "Uses LLM to evaluate and rerank search results based on relevance"
type: utility
enabled: true
version: "1.0.0"

# Priority (lower priority - runs after retrieval)
priority: 50

# LLM Settings
target_llm_config:
  server_name: "openai"
  override_params:
    temperature: 0.0  # Keep temperature at 0 for consistent scoring
    max_tokens: 1000  # Allow enough tokens for scoring multiple documents
    timeout: 30.0  # Longer timeout for processing multiple documents

fallback_strategy: "any_available"

# Prompt Settings
prompts:
  system_prompt: |
    You
are an expert at evaluating document relevance to search queries. 28 | 29 | Responsibilities: 30 | - Analyze the semantic relevance between a query and documents 31 | - Assign relevance scores to each document (0.0 to 1.0) 32 | - Consider both semantic meaning and keyword matching 33 | - Be strict in scoring - only highly relevant documents should get high scores 34 | 35 | Scoring Guidelines: 36 | - 1.0: Perfect match, directly answers the query 37 | - 0.8-0.9: Highly relevant, contains most information needed 38 | - 0.6-0.7: Moderately relevant, contains some useful information 39 | - 0.4-0.5: Weakly relevant, tangentially related 40 | - 0.0-0.3: Not relevant or off-topic 41 | 42 | user_prompt_template: | 43 | Evaluate the relevance of each document to the given query and assign scores. 44 | 45 | Query: {query} 46 | 47 | Documents: 48 | {documents} 49 | 50 | Important rules: 51 | 1. Assign a relevance score (0.0 to 1.0) to each document based on how well it answers the query. 52 | 2. Document indices must match the input (0, 1, 2, ...). 53 | 3. Be strict in scoring - reserve high scores (>0.8) for truly relevant documents. 54 | 4. Consider semantic meaning, not just keyword overlap. 55 | 5. Return all documents with their scores, even if score is 0.0. 56 | 57 | You must return a JSON object as a tool call following the definition below. Use the keys document_scores and reasoning exactly as written. 58 | 59 | # Implementation class (Python file in rerankers/) 60 | implementation: rerankers.llm_reranker.LLMRerankerAgent 61 | 62 | # Agent Settings 63 | config: 64 | timeout: 30 65 | retry_count: 1 66 | 67 | # Tool schema definition (JSON) 68 | tools: 69 | llm_reranker: 70 | description: "Reranks documents based on relevance to query using LLM evaluation" 71 | parameters: 72 | type: "object" 73 | properties: 74 | document_scores: 75 | type: "array" 76 | items: 77 | type: "object" 78 | properties: 79 | index: 80 | type: "integer" 81 | description: "Document index (0-based)" 82 | score: 83 | type: "number" 84 | minimum: 0.0 85 | maximum: 1.0 86 | description: "Relevance score (0.0 to 1.0)" 87 | description: "List of documents with their relevance scores" 88 | reasoning: 89 | type: "string" 90 | description: "Brief explanation of the scoring rationale" 91 | required: ["document_scores", "reasoning"] 92 | -------------------------------------------------------------------------------- /maru_lang/pluggable/agents/agent_factory.py: -------------------------------------------------------------------------------- 1 | """ 2 | Agent Factory - Creates and configures agents based on configuration 3 | """ 4 | from typing import Dict, Optional, List 5 | from maru_lang.pluggable.agents.base import BaseAgent 6 | from maru_lang.pluggable.agents.registry import get_registry 7 | from maru_lang.configs.manager import get_config_manager 8 | from maru_lang.pluggable.models import AgentConfig 9 | from maru_lang.pluggable.agents.mcp_client_agent import MCPClientAgent 10 | 11 | 12 | class AgentFactory: 13 | """ 14 | Factory for creating agents with proper configuration 15 | Supports dynamic loading 16 | """ 17 | 18 | def __init__( 19 | self, 20 | ): 21 | """ 22 | Initialize factory with default components 23 | """ 24 | self.config_manager = get_config_manager() 25 | self.registry = get_registry() 26 | 27 | def create_agent( 28 | self, 29 | agent_name: str, 30 | agent_config: AgentConfig 31 | ) -> Optional[BaseAgent]: 32 | """ 33 | Create an agent instance based on name and configuration 34 | 35 | Args: 36 | 
            agent_name: Name/type of the agent
            agent_config: Agent-specific configuration

        Returns:
            Agent instance or None if not found
        """
        # Get agent class from registry
        agent_class = self.registry.get_agent_class(agent_name)
        if not agent_class:
            print(f"Agent not found in registry: {agent_name}")
            return None

        # Create agent instance
        try:
            if issubclass(agent_class, MCPClientAgent):
                # MCP agents need name, server_params, and llm_client
                if not agent_config.mcp_config:
                    raise ValueError(
                        f"MCP agent {agent_name} missing mcp_config")
                return agent_class(
                    name=agent_name,
                    config=agent_config,  # Pass full agent_config as config
                )

            # Non-MCP agents are constructed directly from their configuration
            return agent_class(
                name=agent_name,
                config=agent_config,
            )

        except Exception as e:
            print(f"Error creating agent {agent_name}: {e}")
            return None

    def create_agents_from_config(self) -> Dict[str, BaseAgent]:
        """
        Create all agents based on configuration

        Returns:
            Dictionary of agent instances by name
        """
        agents = {}

        # Create all agents from the registry
        for agent_name in self.registry.list_agents():
            agent_config = self.registry.get_agent_config(agent_name)
            if not agent_config:
                print(
                    f"[ERROR AgentFactory] Agent config not found: {agent_name}")
                continue
            agent = self.create_agent(agent_name, agent_config)
            if agent:
                agents[agent_name] = agent
            else:
                print(
                    f"[ERROR AgentFactory] Failed to create agent: {agent_name}")
                raise Exception(f"Failed to create agent: {agent_name}")
        return agents

    def list_available_agents(self) -> List[str]:
        """List all available agent names"""
        return self.registry.list_agents()

    def reload_agents(self) -> None:
        """Reload all agents from sources"""
        self.registry.reload()
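
# Usage sketch (illustrative):
#
#     factory = AgentFactory()
#     agents = factory.create_agents_from_config()
#     for name in factory.list_available_agents():
#         print(name, type(agents[name]).__name__)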
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/reranker_config.py:
--------------------------------------------------------------------------------
"""
Reranker Configuration Loader
"""
import logging
from typing import Dict, Any, Optional
from maru_lang.enums.configs import ConfigType
from maru_lang.pluggable.models import RerankerConfig
from maru_lang.configs.base import DefaultConfigLoader

logger = logging.getLogger(__name__)


class RerankerConfigLoader(DefaultConfigLoader[RerankerConfig]):
    """Loader for reranker configurations"""

    def __init__(self):
        super().__init__(ConfigType.RERANKERS)

    def load_all(self) -> Dict[str, RerankerConfig]:
        """Load configurations from user directory only (no base)"""
        self.configs = {}
        self._base_configs = {}

        # Load the user config only (no base) - read one specific file
        logger.info(f"Loading {self.config_type} configurations from user directory...")

        # Read only reranker_config.yaml (skip agent configs such as llm_reranker.yaml)
        config_file = self.user_dir / "reranker_config.yaml"
        if config_file.exists():
            if self._load_file(config_file, is_user=True):
                logger.info(f"Loaded reranker config from {config_file}")
            else:
                logger.warning(f"Failed to load reranker config from {config_file}")
        else:
            logger.warning(f"Reranker config file not found: {config_file}")

        logger.info(
            f"Loaded {len(self.configs)} {self.config_type} configs"
        )

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[RerankerConfig]:
        """Parse reranker configuration data"""
        try:
            # The 'models' field is ignored for backward compatibility (deprecated)
            return RerankerConfig(
                enabled=data.get("enabled", True),
                method=data.get("method", "model"),
                default_model=data.get("default_model", "BAAI/bge-reranker-v2-m3"),
                agent_name=data.get("agent_name"),
                top_k=data.get("top_k"),
                source_path=source_path,
                is_override=is_user,
            )
        except Exception as e:
            import sys

            error_msg = f"Error parsing reranker config from {source_path}: {e}"
            print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
            return None

    def get_config_name(self, config: RerankerConfig) -> str:
        """Get the name of a reranker configuration"""
        # Single config file, so use a fixed name
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Validate reranker configuration data"""
        # No required fields; just verify the overall shape
        if not isinstance(data, dict):
            logger.error(f"Reranker config data is not a dict: {type(data)}")
            return False
        return True

    def get_merged_config(self) -> RerankerConfig:
        """
        Get merged configuration (base + user override)

        Returns:
            Merged RerankerConfig with user overrides applied
        """
        # Base config
        base = self.configs.get("config")
        if not base:
            # Return default if no config found
            return RerankerConfig()

        return base
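
# Example reranker_config.yaml (illustrative; the keys mirror parse_config
# above, and the values are hypothetical):
#
#     enabled: true
#     method: model
#     default_model: BAAI/bge-reranker-v2-m3
#     top_k: 5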
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/xml_parser.py:
--------------------------------------------------------------------------------
"""XML file parser."""

import xml.etree.ElementTree as ET
from pathlib import Path
from .base import BaseParser, ParseResult


class XMLParser(BaseParser):
    """XML file parser"""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read an XML file and convert it into formatted text.

        Args:
            file_path: Path of the XML file to parse

        Returns:
            ParseResult: Parsed text and metadata
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            tree = ET.parse(file_path)
            root = tree.getroot()

            # Convert the XML structure into text
            lines = []
            self._element_to_text(root, lines, level=0)
            content = '\n'.join(lines)

            # Count elements
            num_elements = len(list(root.iter()))

            metadata = {
                'file_type': 'xml',
                # ElementTree has no docinfo (lxml does), so this falls back to utf-8
                'encoding': tree.docinfo.encoding if hasattr(tree, 'docinfo') else 'utf-8',
                'file_size': file_path.stat().st_size,
                'root_tag': root.tag,
                'num_elements': num_elements,
            }

            # Extract namespace information
            namespaces = {}
            for elem in root.iter():
                if '}' in elem.tag:
                    ns = elem.tag.split('}')[0][1:]
                    if ns not in namespaces.values():
                        namespaces[f'ns{len(namespaces)}'] = ns

            if namespaces:
                metadata['namespaces'] = namespaces

            return ParseResult(content=content, metadata=metadata)

        except ET.ParseError as e:
            raise ValueError(f"Failed to parse XML: {file_path} - {str(e)}") from e
        except Exception as e:
            raise ValueError(f"Failed to read file: {file_path}") from e

    def _element_to_text(self, element: ET.Element, lines: list[str], level: int = 0) -> None:
        """
        Recursively convert an XML element into text.

        Args:
            element: XML element
            lines: List collecting the output lines
            level: Indentation level
        """
        indent = "  " * level
        tag = element.tag

        # Drop the namespace (for readability)
        if '}' in tag:
            tag = tag.split('}')[1]

        # Opening tag and attributes
        attrs = ''
        if element.attrib:
            attrs = ' [' + ', '.join(f'{k}={v}' for k, v in element.attrib.items()) + ']'

        # Text content
        text = (element.text or '').strip()

        if text:
            lines.append(f"{indent}<{tag}{attrs}>: {text}")
        else:
            lines.append(f"{indent}<{tag}{attrs}>")

        # Recurse into child elements
        for child in element:
            self._element_to_text(child, lines, level + 1)

        # Tail text (text after the closing tag)
        tail = (element.tail or '').strip()
        if tail:
            lines.append(f"{indent}  {tail}")

    def supports(self, file_path: Path) -> bool:
        """Check whether the file extension is a supported XML type"""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Supported XML file extensions"""
        return ['.xml', '.xhtml', '.svg']
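
# Output sketch (illustrative): for <root a="1"><item>hi</item></root>,
# _element_to_text produces roughly:
#
#     <root [a=1]>
#       <item>: hi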
--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/builtin/intent_extractor.py:
--------------------------------------------------------------------------------
"""
Intent Extractor Agent - Extracts user intent and rewrites queries for search
"""
from typing import Dict, Any, Optional
from maru_lang.pluggable.agents.base import BaseAgent, AgentResult
from maru_lang.models.chat import ChatHistory


class IntentExtractorAgent(BaseAgent):
    """Agent for extracting user intent and rewriting queries for document search"""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    async def _setup(self) -> None:
        """Initialize intent extraction capabilities"""
        pass

    async def execute(
        self,
        question: str,
        chat_history: ChatHistory,
        **kwargs
    ) -> AgentResult:
        """
        Execute intent extraction and query rewriting

        Args:
            question: User's new question/message
            chat_history: Previous conversation context
            **kwargs: Additional parameters

        Returns:
            AgentResult containing extracted intent and rewritten query
        """

        try:
            # Format the prompt with dialogue context
            rewritten_query = await self._extract_intent_and_rewrite(
                question,
                chat_history
            )

            return AgentResult(
                success=True,
                result=rewritten_query,  # Primary output: the rewritten question
                data={
                    'original_question': question,
                    'rewritten_question': rewritten_query,
                    'has_context': bool(chat_history.messages),
                    'extracted_intent': True,
                },
                metadata={
                    'extraction_method': 'llm_based',
                }
            )

        except Exception as e:
            # Fallback to original question
            return AgentResult(
                success=True,  # Still successful, but using fallback
                result=question,  # Primary output: the original question
                data={
                    'original_question': question,
                    'rewritten_question': question,  # Use original as fallback
                    'has_context': bool(chat_history.messages),
                    'extracted_intent': False,
                },
                metadata={
                    'extraction_method': 'fallback',
                    'error': str(e)
                }
            )

    async def _extract_intent_and_rewrite(
        self,
        question: str,
        chat_history: ChatHistory,
    ) -> str:
        """Extract intent and rewrite query using LLM with fallback"""
        # Fetch prompts from the YAML config
        prompts = self.config.prompts

        # Insert the question and history into the template
        user_prompt = prompts.user_prompt_template.format(
            question=question,
            history_text=chat_history.to_string()
        )

        override_params = self.get_override_params()

        # Use request_with_fallback for automatic LLM fallback
        response = await self.request_with_fallback(
            user_prompt=user_prompt,
            system_prompt=prompts.system_prompt,
            **override_params,
        )

        return response.strip()
--------------------------------------------------------------------------------
/maru_lang/templates/python/custom_parser.py:
--------------------------------------------------------------------------------
"""
Custom parser template - Copy this file and remove the .sample extension

This is a template for creating custom document parsers.
Implement the BaseParser interface to add support for new file formats.
"""

from pathlib import Path
from maru_lang.pluggable.loaders.base import BaseParser, ParseResult


class CustomParser(BaseParser):
    """
    Template for custom file parsers.

    Copy this class to implement support for new file formats.
    """

    def parse(self, file_path: Path) -> ParseResult:
        """
        Parse the file and extract textual content.

        Args:
            file_path: Path to the file to parse

        Returns:
            ParseResult: Parsed text and metadata

        Raises:
            ValueError: Raised when parsing fails or content cannot be read
            FileNotFoundError: Raised when the file does not exist
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            # Implement your parsing logic here
            # Example: convert JSON, XML, or CSV into plain text

            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()

            # Optional metadata enrichment
            metadata = {
                'file_type': 'custom',
                'file_size': file_path.stat().st_size,
                # Add additional metadata as needed
            }

            return ParseResult(content=content, metadata=metadata)

        except Exception as e:
            raise ValueError(f"Failed to parse file: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """
        Determine whether this parser supports the given file.
58 | 59 | Args: 60 | file_path: Path of the file to check 61 | 62 | Returns: 63 | bool: True if the file is supported, otherwise False 64 | """ 65 | return file_path.suffix.lower() in self.supported_extensions 66 | 67 | @property 68 | def supported_extensions(self) -> list[str]: 69 | """ 70 | List of file extensions supported by this parser 71 | 72 | Returns: 73 | list[str]: Supported extensions (e.g., ['.json', '.jsonl']) 74 | """ 75 | # Update this list with the extensions you support 76 | return ['.custom', '.cst'] 77 | 78 | 79 | # Example: JSON parser 80 | class JsonParser(BaseParser): 81 | """Example parser for JSON files""" 82 | 83 | def parse(self, file_path: Path) -> ParseResult: 84 | if not file_path.exists(): 85 | raise FileNotFoundError(f"File not found: {file_path}") 86 | 87 | try: 88 | import json 89 | 90 | with open(file_path, 'r', encoding='utf-8') as f: 91 | data = json.load(f) 92 | 93 | # Convert JSON into formatted text 94 | content = json.dumps(data, indent=2, ensure_ascii=False) 95 | 96 | metadata = { 97 | 'file_type': 'json', 98 | 'file_size': file_path.stat().st_size, 99 | } 100 | 101 | return ParseResult(content=content, metadata=metadata) 102 | 103 | except json.JSONDecodeError as e: 104 | raise ValueError(f"JSON parse error: {file_path}") from e 105 | except Exception as e: 106 | raise ValueError(f"Failed to read file: {file_path}") from e 107 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [project] 2 | name = "maru-lang" 3 | version = "0.0.0" 4 | description = "Advanced LLM-powered chatbot with RAG, multi-agent system, and enterprise features" 5 | requires-python = ">=3.10" 6 | readme = "README.md" 7 | license = {text = "MIT"} 8 | authors = [ 9 | {name = "KC ML2"}, 10 | ] 11 | keywords = ["llm", "chatbot", "rag", "ai", "fastapi", "agents"] 12 | classifiers = [ 13 | "Development Status :: 4 - Beta", 14 | "Intended Audience :: Developers", 15 | "License :: OSI Approved :: MIT License", 16 | "Programming Language :: Python :: 3", 17 | "Programming Language :: Python :: 3.10", 18 | "Programming Language :: Python :: 3.11", 19 | "Programming Language :: Python :: 3.12", 20 | "Programming Language :: Python :: 3.13", 21 | "Topic :: Software Development :: Libraries :: Python Modules", 22 | "Topic :: Scientific/Engineering :: Artificial Intelligence", 23 | ] 24 | 25 | dependencies = [ 26 | "fastapi>=0.100.0", 27 | "uvicorn[standard]>=0.23.0", 28 | "typer[all]>=0.9.0", 29 | "tortoise-orm[asyncpg]>=0.20.0", 30 | "aerich>=0.7.2", 31 | "python-jose[cryptography]>=3.3.0", 32 | "passlib>=1.7.4", 33 | "httpx>=0.24.0", 34 | "sentence-transformers>=2.2.0", 35 | "fastapi-pagination>=0.12.0", 36 | "rank-bm25>=0.2.2", 37 | "chromadb>=0.4.0", 38 | "konlpy>=0.6.0", 39 | "mcp>=0.9.0", 40 | "pyyaml>=6.0.0", 41 | # Document parsers 42 | "PyPDF2>=3.0.1", 43 | "python-docx>=0.8.11", 44 | "python-pptx>=0.6.21", 45 | "openpyxl>=3.0.0", 46 | "beautifulsoup4>=4.12.0", 47 | ] 48 | 49 | [project.optional-dependencies] 50 | # Vector database backends (alternative to chromadb) 51 | vector-db = [ 52 | "pymilvus[model]>=2.3.0", 53 | ] 54 | 55 | # Email integration 56 | email = [ 57 | "O365>=2.0.0", 58 | ] 59 | 60 | # Development / testing 61 | dev = [ 62 | "aerich[toml]>=0.7.0", 63 | "pytest>=7.0.0", 64 | "pytest-asyncio>=0.21.0", 65 | "pytest-cov>=4.0.0", 66 | "notebook>=6.5.0", 67 | ] 68 | 69 | # Bundle of all optional features 70 | all = [ 71 | 
"pymilvus[model]>=2.3.0", 72 | "O365>=2.0.0", 73 | ] 74 | 75 | [build-system] 76 | requires = ["setuptools>=61.0", "wheel"] 77 | build-backend = "setuptools.build_meta" 78 | 79 | [tool.setuptools] 80 | packages = {find = {where = ["."], include = ["maru_lang*"]}} 81 | include-package-data = true 82 | 83 | [tool.setuptools.package-data] 84 | maru_lang = [ 85 | "templates/**/*.yaml", 86 | "templates/**/*.yml", 87 | "templates/**/*.py", 88 | "templates/**/*.md", 89 | "py.typed", 90 | ] 91 | 92 | [tool.pytest.ini_options] 93 | asyncio_mode = "auto" 94 | asyncio_default_fixture_loop_scope = "function" 95 | testpaths = ["tests"] 96 | python_files = "test_*.py" 97 | python_classes = "Test*" 98 | python_functions = "test_*" 99 | filterwarnings = [ 100 | "ignore::DeprecationWarning", 101 | "ignore::UserWarning", 102 | ] 103 | markers = [ 104 | "slow: long-running tests", 105 | "integration: integration tests", 106 | ] 107 | 108 | [tool.coverage.run] 109 | source = ["maru_lang"] 110 | omit = [ 111 | "maru_lang/migrations/*", 112 | "maru_lang/alembic/*", 113 | "maru_lang/scripts/*", 114 | "maru_lang/tests/*", 115 | ] 116 | 117 | [tool.coverage.report] 118 | exclude_lines = [ 119 | "pragma: no cover", 120 | "def __repr__", 121 | "raise NotImplementedError", 122 | "if __name__ == '__main__':", 123 | "pass", 124 | "raise ImportError", 125 | ] 126 | 127 | [project.scripts] 128 | maru = "maru_lang.cli:app" 129 | 130 | [tool.aerich] 131 | tortoise_orm = "maru_lang.core.relation_db.TORTOISE_ORM" 132 | location = "./migrations" 133 | src_folder = "./." -------------------------------------------------------------------------------- /maru_lang/pluggable/chunkers/paragraph.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from maru_lang.models.ingest import ChunkInput 3 | from .base import BaseChunker 4 | 5 | 6 | class ParagraphChunker(BaseChunker): 7 | """문단 단위로 청킹 (개행 2개 기준)""" 8 | 9 | name = "paragraph" 10 | description = "문단 단위로 청킹 (빈 줄 기준 분리)" 11 | 12 | def __init__(self, max_chunk_size: int = 2000): 13 | self.max_chunk_size = max_chunk_size 14 | 15 | def chunk(self, text: str) -> List[ChunkInput]: 16 | parts = [p.strip() for p in text.split("\n\n") if p.strip()] 17 | 18 | # 큰 청크를 max_chunk_size 기준으로 분할 19 | chunks = [] 20 | for part in parts: 21 | if len(part) <= self.max_chunk_size: 22 | chunks.append(part) 23 | else: 24 | # 큰 청크를 문장 단위로 분할 시도 25 | sentences = self._split_by_sentences(part) 26 | current_chunk = [] 27 | current_size = 0 28 | 29 | for sentence in sentences: 30 | sentence_len = len(sentence) 31 | 32 | # 단일 문장이 max_chunk_size를 초과하는 경우 33 | if sentence_len > self.max_chunk_size: 34 | # 현재 버퍼가 있으면 먼저 저장 35 | if current_chunk: 36 | chunks.append(" ".join(current_chunk)) 37 | current_chunk = [] 38 | current_size = 0 39 | # 큰 문장을 강제로 분할 40 | chunks.extend(self._force_split(sentence, self.max_chunk_size)) 41 | continue 42 | 43 | # 현재 청크에 추가했을 때 크기 초과 여부 확인 44 | if current_size + sentence_len + (1 if current_chunk else 0) > self.max_chunk_size: 45 | # 현재 버퍼 저장 46 | if current_chunk: 47 | chunks.append(" ".join(current_chunk)) 48 | current_chunk = [sentence] 49 | current_size = sentence_len 50 | else: 51 | current_chunk.append(sentence) 52 | current_size += sentence_len + (1 if len(current_chunk) > 1 else 0) 53 | 54 | # 남은 버퍼 저장 55 | if current_chunk: 56 | chunks.append(" ".join(current_chunk)) 57 | 58 | # 안전장치: 모든 청크가 max_chunk_size 이하인지 검증하고 필요시 재분할 59 | final_chunks = [] 60 | for chunk in chunks: 61 | if len(chunk) <= 
--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/paragraph.py:
--------------------------------------------------------------------------------
from typing import List
from maru_lang.models.ingest import ChunkInput
from .base import BaseChunker


class ParagraphChunker(BaseChunker):
    """Chunk by paragraph (split on double newlines)"""

    name = "paragraph"
    description = "Chunk by paragraph (split on blank lines)"

    def __init__(self, max_chunk_size: int = 2000):
        self.max_chunk_size = max_chunk_size

    def chunk(self, text: str) -> List[ChunkInput]:
        parts = [p.strip() for p in text.split("\n\n") if p.strip()]

        # Split oversized chunks against max_chunk_size
        chunks = []
        for part in parts:
            if len(part) <= self.max_chunk_size:
                chunks.append(part)
            else:
                # Try splitting the oversized chunk by sentence
                sentences = self._split_by_sentences(part)
                current_chunk = []
                current_size = 0

                for sentence in sentences:
                    sentence_len = len(sentence)

                    # A single sentence exceeds max_chunk_size
                    if sentence_len > self.max_chunk_size:
                        # Flush the current buffer first
                        if current_chunk:
                            chunks.append(" ".join(current_chunk))
                            current_chunk = []
                            current_size = 0
                        # Force-split the long sentence
                        chunks.extend(self._force_split(sentence, self.max_chunk_size))
                        continue

                    # Check whether adding the sentence would exceed the limit
                    if current_size + sentence_len + (1 if current_chunk else 0) > self.max_chunk_size:
                        # Flush the current buffer
                        if current_chunk:
                            chunks.append(" ".join(current_chunk))
                        current_chunk = [sentence]
                        current_size = sentence_len
                    else:
                        current_chunk.append(sentence)
                        current_size += sentence_len + (1 if len(current_chunk) > 1 else 0)

                # Flush any remaining buffer
                if current_chunk:
                    chunks.append(" ".join(current_chunk))

        # Safety net: verify every chunk fits max_chunk_size and re-split if needed
        final_chunks = []
        for chunk in chunks:
            if len(chunk) <= self.max_chunk_size:
                final_chunks.append(chunk)
            else:
                # Force-split chunks that exceed max_chunk_size
                final_chunks.extend(self._force_split(chunk, self.max_chunk_size))

        return [ChunkInput(number=i, content=c) for i, c in enumerate(final_chunks, start=1)]

    def _split_by_sentences(self, text: str) -> List[str]:
        """Split text into sentences (simple heuristic)"""
        import re
        # Split on Korean/English sentence-ending punctuation
        sentences = re.split(r'([.!?。!?\n]+)', text)

        # Attach the punctuation to the preceding sentence
        result = []
        for i in range(0, len(sentences) - 1, 2):
            if i + 1 < len(sentences):
                result.append((sentences[i] + sentences[i + 1]).strip())
            else:
                result.append(sentences[i].strip())

        # Append the trailing element if one remains
        if len(sentences) % 2 == 1 and sentences[-1].strip():
            result.append(sentences[-1].strip())

        return [s for s in result if s]

    def _force_split(self, text: str, max_size: int) -> List[str]:
        """Force-split text larger than max_size"""
        chunks = []
        for i in range(0, len(text), max_size):
            chunks.append(text[i:i + max_size])
        return chunks
--------------------------------------------------------------------------------
/maru_lang/services/ingest.py:
--------------------------------------------------------------------------------
"""
Ingest service functions for file upload and synchronization
"""
from typing import List, Tuple
from datetime import datetime
from pathlib import Path
from maru_lang.core.relation_db.models.documents import Document, DocumentGroup, DocumentGroupMembership
from maru_lang.utils.document import make_source_fingerprint_for_file


async def check_files_to_upload(
    folder_path: str,
    files: List[dict]  # [{"fileName": str, "createdAt": datetime, "relativePath": str, "size": int}]
) -> List[str]:
    """
    Check which files need to be uploaded by comparing with database.
17 | 18 | Uses same logic as IngestPipeline's upsert_document_from_file: 19 | - Compares file_path (relativePath) within the DocumentGroup 20 | - Compares source_fingerprint (SHA256 hash of path|size|mtime) 21 | - Only checks files in the specified folder's DocumentGroup 22 | 23 | Args: 24 | folder_path: Project folder name (DocumentGroup name, e.g., "user/project") 25 | files: List of file information dicts with fileName, createdAt, relativePath, size 26 | 27 | Returns: 28 | List of relativePaths that need to be uploaded 29 | """ 30 | files_to_upload = [] 31 | 32 | # Check if DocumentGroup exists for this folder 33 | document_group = await DocumentGroup.get_or_none(name=folder_path) 34 | 35 | # If no group exists, all files are new 36 | if not document_group: 37 | return [file_info["relativePath"] for file_info in files] 38 | 39 | for file_info in files: 40 | relative_path = file_info["relativePath"] 41 | file_name = file_info["fileName"] 42 | created_at = file_info["createdAt"] 43 | file_size = file_info.get("size", 0) # File size in bytes 44 | 45 | # Convert datetime to nanoseconds timestamp 46 | if isinstance(created_at, datetime): 47 | mtime_ns = int(created_at.timestamp() * 1e9) 48 | else: 49 | mtime_ns = int(created_at) 50 | 51 | # Generate expected fingerprint 52 | # Note: folder_path is already "{username}/{folderPath}" 53 | db_file_path = f"{folder_path}/{relative_path}" 54 | expected_fingerprint = make_source_fingerprint_for_file( 55 | db_file_path, file_size, mtime_ns 56 | ) 57 | 58 | # Check if document exists in this specific group 59 | existing_doc = await Document.filter( 60 | file_path=db_file_path, 61 | group_memberships__group=document_group 62 | ).first() 63 | 64 | if not existing_doc: 65 | # New file in this group - needs upload 66 | files_to_upload.append(relative_path) 67 | continue 68 | 69 | # Compare fingerprint 70 | if existing_doc.source_fingerprint != expected_fingerprint: 71 | # File modified - needs re-upload 72 | files_to_upload.append(relative_path) 73 | continue 74 | 75 | # File exists and unchanged - skip 76 | 77 | return files_to_upload 78 | 79 | 80 | async def get_or_create_document_group( 81 | folder_path: str, 82 | manager_id: int 83 | ) -> DocumentGroup: 84 | """ 85 | Get or create a DocumentGroup for the uploaded folder. 
86 | 87 | Args: 88 | folder_path: Project folder name 89 | manager_id: User ID who manages this group 90 | 91 | Returns: 92 | DocumentGroup instance 93 | """ 94 | from maru_lang.services.document import upsert_document_group 95 | 96 | # Use folder_path as both name and base_path 97 | # In production, you might want to use absolute paths 98 | group = await upsert_document_group( 99 | name=folder_path, 100 | base_path=folder_path, 101 | manager_id=manager_id, 102 | ) 103 | 104 | return group 105 | -------------------------------------------------------------------------------- /maru_lang/templates/yaml/system_config.yaml: -------------------------------------------------------------------------------- 1 | # System Configuration 2 | # Central configuration file for MARU-Lang system settings 3 | # Supports environment variable substitution: ${ENV:VAR_NAME} or ${ENV:VAR_NAME:default_value} 4 | 5 | # ============================================================ 6 | # Server Configuration 7 | # ============================================================ 8 | server: 9 | host: ${ENV:HOST:127.0.0.1} 10 | port: ${ENV:PORT:8000} 11 | reload: ${ENV:RELOAD:false} 12 | log_level: ${ENV:LOG_LEVEL:info} 13 | 14 | # Environment settings 15 | environment: 16 | production: ${ENV:PRODUCTION:false} 17 | 18 | # ============================================================ 19 | # Database Configuration 20 | # ============================================================ 21 | database: 22 | # Database type: "sqlite" or "postgres" 23 | type: ${ENV:DB_TYPE:sqlite} 24 | 25 | # Database name (for SQLite, this is the file name; for PostgreSQL, the database name) 26 | name: ${ENV:DB_NAME:maru} 27 | 28 | # PostgreSQL settings (only required when type is "postgres") 29 | username: ${ENV:DB_USERNAME:} 30 | password: ${ENV:DB_PASSWORD:} 31 | host: ${ENV:DB_HOST:localhost} 32 | port: ${ENV:DB_PORT:5432} 33 | 34 | # ============================================================ 35 | # Authentication & Security 36 | # ============================================================ 37 | auth: 38 | # Secret key for JWT token generation (IMPORTANT: Change in production!) 
39 | secret_key: ${ENV:SECRET_KEY:your-secret-key-change-in-production} 40 | salt: ${ENV:SALT:some-sugar} 41 | algorithm: ${ENV:ALGORITHM:HS256} 42 | 43 | # Token expiration times (in minutes) 44 | access_token_expire_minutes: ${ENV:ACCESS_TOKEN_EXPIRE_MINUTES:15} 45 | refresh_token_expire_minutes: ${ENV:REFRESH_TOKEN_EXPIRE_MINUTES:43200} # 30 days 46 | 47 | # Validation 48 | default_validation_code: ${ENV:DEFAULT_VALIDATION_CODE:456123} 49 | 50 | # Auto-create user groups based on email domain 51 | auto_create_group_by_domain: ${ENV:AUTO_CREATE_GROUP_BY_DOMAIN:true} 52 | 53 | # ============================================================ 54 | # Email Service Configuration 55 | # ============================================================ 56 | email: 57 | # Email service type: "o365" or "smtp" 58 | service_type: ${ENV:EMAIL_SERVICE_TYPE:o365} 59 | 60 | sender_email: ${ENV:SENDER_EMAIL:} 61 | 62 | # Office 365 settings (required when service_type is "o365") 63 | o365: 64 | client_id: ${ENV:O365_CLIENT_ID:} 65 | client_secret: ${ENV:O365_CLIENT_SECRET:} 66 | tenant_id: ${ENV:O365_TENANT_ID:} 67 | 68 | # SMTP settings (required when service_type is "smtp") 69 | smtp: 70 | host: ${ENV:SMTP_HOST:} 71 | port: ${ENV:SMTP_PORT:587} 72 | username: ${ENV:SMTP_USERNAME:} 73 | password: ${ENV:SMTP_PASSWORD:} 74 | 75 | # ============================================================ 76 | # Vector Database Configuration 77 | # ============================================================ 78 | vector_db: 79 | # Vector database type: "chroma" or "milvus" 80 | type: ${ENV:VECTOR_DB_TYPE:chroma} 81 | 82 | # Default collection name 83 | default_collection_name: ${ENV:DEFAULT_DB_COLLECTION_NAME:maru} 84 | 85 | # Chroma settings (required when type is "chroma") 86 | chroma: 87 | persist_dir: ${ENV:CHROMA_PERSIST_DIR:data/chroma/} 88 | 89 | # Milvus settings (required when type is "milvus") 90 | milvus: 91 | host: ${ENV:MILVUS_HOST:localhost} 92 | port: ${ENV:MILVUS_PORT:19530} 93 | user: ${ENV:MILVUS_USER:root} 94 | password: ${ENV:MILVUS_PASSWORD:Milvus} 95 | 96 | # ============================================================ 97 | # External Services Configuration 98 | # ============================================================ 99 | external: 100 | # No external services are configured 101 | # ============================================================ -------------------------------------------------------------------------------- /maru_lang/templates/yaml/agents/builtin/agents_group_classifier.yaml: -------------------------------------------------------------------------------- 1 | # Group Classifier Agent Configuration 2 | # Agent that analyzes user questions and classifies them into appropriate document groups 3 | 4 | name: group_classifier 5 | description: "Analyzes user questions and classifies them into suitable document groups" 6 | type: builtin 7 | enabled: true 8 | version: "1.0.0" 9 | 10 | # 우선순위 (가장 높은 우선순위 - 다른 에이전트들보다 먼저 실행) 11 | priority: 100 12 | 13 | # LLM Settings 14 | target_llm_config: 15 | server_name: "openai" 16 | override_params: 17 | temperature: 0.1 # Keep temperature low for consistent classification 18 | max_tokens: 500 # Only short classification output is required 19 | timeout: 10.0 # Short timeout for quick classification 20 | 21 | fallback_strategy: "any_available" 22 | 23 | # Prompt Settings 24 | prompts: 25 | system_prompt: | 26 | You are an expert who analyzes user questions and classifies them into appropriate document groups. 
27 | 28 | Responsibilities: 29 | - Accurately identify the intent and topic of the question 30 | - Choose the most appropriate document groups for the question 31 | - Determine priority when multiple groups are applicable 32 | - If no group matches, leave the selection empty 33 | 34 | user_prompt_template: | 35 | Analyze the following question and classify it into the most appropriate document groups: 36 | 37 | Question: {question} 38 | 39 | Available groups: 40 | {available_groups} 41 | 42 | Important rules: 43 | 1. You must select only from the groups listed above. 44 | 2. Never select a group that is not on the list. 45 | 3. If no groups are applicable, return an empty array [] for selected_groups. 46 | 4. Group names must exactly match the ones in the list. 47 | 5. Fill the group_confidences array in the same order as selected_groups. Each value must be between 0 and 1, and the total must sum to 1. 48 | 49 | You must return a JSON object as a tool call following the definition below. Use the keys selected_groups, confidence, group_confidences, and reasoning exactly as written. Do not include additional quotes around keys and do not use non-English keys. Do not provide any free-form text outside the tool call. 50 | 51 | Please return the classification result as JSON: 52 | - selected_groups: Selected groups in priority order (empty array if none) 53 | - confidence: Overall classification confidence (0-1) 54 | - group_confidences: Confidence values aligned with selected_groups (e.g., [0.7, 0.3], sum=1) 55 | - reasoning: Explanation for the classification 56 | 57 | # Implementation class (Python file in user space) 58 | implementation: builtin.group_classifier.GroupClassifierAgent 59 | 60 | # Agent Settings 61 | config: 62 | timeout: 10 63 | retry_count: 1 # Keep retries minimal for fast classification 64 | classification_config: 65 | confidence_threshold: 0.4 66 | 67 | # Tool schema definition (JSON) 68 | tools: 69 | group_classifier: 70 | description: "Classifies user questions into appropriate groups" 71 | parameters: 72 | type: "object" 73 | properties: 74 | selected_groups: 75 | type: "array" 76 | items: 77 | type: "string" 78 | description: "Selected groups in priority order" 79 | confidence: 80 | type: "number" 81 | minimum: 0 82 | maximum: 1 83 | description: "Overall classification confidence (0-1)" 84 | reasoning: 85 | type: "string" 86 | description: "Explanation of the classification" 87 | fallback_used: 88 | type: "boolean" 89 | description: "Indicates whether classification fallback was used" 90 | group_confidences: 91 | type: "array" 92 | items: 93 | type: "number" 94 | minimum: 0 95 | maximum: 1 96 | description: "Confidence scores aligned with selected_groups (sum=1)" 97 | required: ["selected_groups", "confidence", "group_confidences", "reasoning"] 98 | -------------------------------------------------------------------------------- /maru_lang/utils/security.py: -------------------------------------------------------------------------------- 1 | import base64 2 | import hashlib 3 | from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes 4 | from cryptography.hazmat.backends import default_backend 5 | from cryptography.hazmat.primitives import padding 6 | from datetime import datetime, timedelta, timezone 7 | from typing import Optional 8 | from jose import JWTError, jwt 9 | from fastapi import HTTPException, status 10 | from pydantic import ValidationError 11 | from maru_lang.configs.system_config import get_system_config 12 | 13 | config = 
get_system_config() 14 | 15 | 16 | def generate_anonymized_key( 17 | login_id: str, 18 | company_id: int, 19 | salt: str = None 20 | ) -> str: 21 | if salt is None: 22 | salt = config.auth.salt 23 | # Combine the inputs with the salt to build a deterministic anonymized key 24 | raw_data = f"{login_id}:{company_id}:{salt}" 25 | return hashlib.sha256(raw_data.encode()).hexdigest() 26 | 27 | 28 | def create_jwt_token( 29 | data: dict, 30 | expires_delta: timedelta 31 | ) -> tuple[str, datetime]: 32 | """Create a JWT access token and return it with its expiry.""" 33 | expires_at = datetime.now(timezone.utc) 34 | expires_at += expires_delta 35 | to_encode = data.copy() 36 | to_encode.update({"exp": expires_at}) 37 | encoded_jwt = jwt.encode( 38 | to_encode, 39 | config.auth.secret_key, 40 | algorithm=config.auth.algorithm) 41 | return encoded_jwt, expires_at 42 | 43 | 44 | def decode_token(token: str) -> dict | None: 45 | """Decode a JWT token and return its payload.""" 46 | try: 47 | payload = jwt.decode( 48 | token, 49 | config.auth.secret_key, 50 | algorithms=[config.auth.algorithm]) 51 | return payload 52 | except (jwt.ExpiredSignatureError, jwt.JWTError, ValidationError) as e: 53 | # print(f"Token decode error: {e}") 54 | return None 55 | 56 | 57 | def get_key_spec(key: str): 58 | key_bytes = key.encode('utf-8') 59 | return key_bytes 60 | 61 | 62 | def aes256_decrypt(target_str: str) -> str: 63 | try: 64 | # Decode the Base64-encoded cipher text 65 | decoded_data = base64.b64decode(target_str) 66 | 67 | # Initialize the AES cipher in ECB mode 68 | cipher = Cipher(algorithms.AES(get_key_spec(config.auth.secret_key)), 69 | modes.ECB(), backend=default_backend()) 70 | decryptor = cipher.decryptor() 71 | 72 | # Perform AES decryption 73 | decrypted_data = decryptor.update(decoded_data) + decryptor.finalize() 74 | 75 | # Remove PKCS7 padding 76 | unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder() 77 | unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize() 78 | 79 | return unpadded_data.decode('utf-8') 80 | 81 | except Exception as e: 82 | raise Exception(f"Error during decryption: {str(e)}") 83 | 84 | 85 | def aes256_encrypt(plain_text: str) -> str: 86 | try: 87 | # Convert the plain text to bytes 88 | plain_text_bytes = plain_text.encode('utf-8') 89 | 90 | # Apply PKCS7 padding 91 | padder = padding.PKCS7(algorithms.AES.block_size).padder() 92 | padded_data = padder.update(plain_text_bytes) + padder.finalize() 93 | 94 | # Initialize the AES cipher in ECB mode 95 | cipher = Cipher(algorithms.AES(get_key_spec(config.auth.secret_key)), 96 | modes.ECB(), backend=default_backend()) 97 | encryptor = cipher.encryptor() 98 | 99 | # Perform AES encryption 100 | encrypted_data = encryptor.update(padded_data) + encryptor.finalize() 101 | 102 | # Encode the ciphertext using Base64 103 | encrypted_base64_data = base64.b64encode(encrypted_data) 104 | 105 | # Return the encrypted string 106 | return encrypted_base64_data.decode('utf-8') 107 | 108 | except Exception as e: 109 | raise Exception(f"Error during encryption: {str(e)}") -------------------------------------------------------------------------------- /maru_lang/api/endpoints/user_group.py: -------------------------------------------------------------------------------- 1 | """ 2 | User group management API endpoints 3 | """ 4 | from fastapi import APIRouter, HTTPException, Depends 5 | from pydantic import BaseModel 6 | from typing import Optional, Dict, Any 7 | 8 | from maru_lang.dependencies.auth import get_user 9 | from 
--------------------------------------------------------------------------------
/maru_lang/api/endpoints/user_group.py:
--------------------------------------------------------------------------------
1 | """
2 | User group management API endpoints
3 | """
4 | from fastapi import APIRouter, HTTPException, Depends
5 | from pydantic import BaseModel
6 | from typing import Optional, Dict, Any
7 | 
8 | from maru_lang.dependencies.auth import get_user
9 | from maru_lang.services.user_group_command import (
10 |     UserGroupCommandParser,
11 |     execute_user_group_command
12 | )
13 | 
14 | 
15 | router = APIRouter(
16 |     prefix="/user-groups",
17 |     tags=["User Groups"]
18 | )
19 | 
20 | 
21 | class UserGroupCommandRequest(BaseModel):
22 |     """Request body for user group command"""
23 |     message: str
24 | 
25 | 
26 | class UserGroupCommandResponse(BaseModel):
27 |     """Response for user group command"""
28 |     success: bool
29 |     message: str
30 |     data: Optional[Dict[str, Any]] = None
31 |     error: Optional[str] = None
32 | 
33 | 
34 | @router.post("/command", response_model=UserGroupCommandResponse)
35 | async def execute_command(
36 |     request: UserGroupCommandRequest,
37 |     user=Depends(get_user)
38 | ):
39 |     """
40 |     Execute user group management command.
41 | 
42 |     Supports natural language commands in Korean and English:
43 | 
44 |     ## Create a group
45 |     - `/그룹생성 [그룹명]` or `/create group [name]`
46 | 
47 |     ## Manage members (managers only)
48 |     - `/그룹초대 [그룹명] [이메일]` or `/invite [group] [email]`
49 |     - `/그룹추방 [그룹명] [이메일]` or `/remove [group] [email]`
50 |     - `/그룹위임 [그룹명] [이메일]` or `/transfer [group] [email]`
51 | 
52 |     ## List groups
53 |     - `/내그룹목록` or `/my groups`
54 |     - `/관리그룹` or `/managed groups`
55 |     - `/그룹멤버 [그룹명]` or `/members [group]`
56 | 
57 |     ## Leave a group
58 |     - `/그룹나가기 [그룹명]` or `/leave group [name]`
59 | 
60 |     Args:
61 |         request: Command request with message
62 |         user: Authenticated user (from token)
63 | 
64 |     Returns:
65 |         Command execution result with success status and data
66 | 
67 |     Example:
68 |         ```json
69 |         {
70 |             "message": "/그룹생성 행정팀"
71 |         }
72 |         ```
73 | 
74 |     Response:
75 |         ```json
76 |         {
77 |             "success": true,
78 |             "message": "Group created successfully",
79 |             "data": {
80 |                 "group_id": 123,
81 |                 "group_name": "행정팀",
82 |                 "created": true
83 |             }
84 |         }
85 |         ```
86 |     """
87 |     try:
88 |         # Parse command
89 |         parsed = UserGroupCommandParser.parse(request.message)
90 | 
91 |         # Check if it's a valid command
92 |         if parsed["command"] == "unknown":
93 |             return UserGroupCommandResponse(
94 |                 success=False,
95 |                 message=parsed.get("error", "Unknown command"),
96 |                 data={"help": UserGroupCommandParser.get_help_text()}
97 |             )
98 | 
99 |         # Execute command
100 |         result = await execute_user_group_command(parsed, user.id)
101 | 
102 |         return UserGroupCommandResponse(**result)
103 | 
104 |     except Exception as e:
105 |         raise HTTPException(
106 |             status_code=500,
107 |             detail=f"Failed to execute command: {str(e)}"
108 |         )
109 | 
110 | 
111 | @router.get("/help")
112 | async def get_help():
113 |     """
114 |     Get help text for user group commands.
115 | 
116 |     Returns:
117 |         Help text with all available commands and usage examples
118 |     """
119 |     return {
120 |         "help": UserGroupCommandParser.get_help_text()
121 |     }
122 | 
123 | 
124 | @router.get("/check-command")
125 | async def check_command(message: str):
126 |     """
127 |     Check if a message is a user group command without executing it.
128 | 129 | Args: 130 | message: Message to check 131 | 132 | Returns: 133 | Whether the message is a valid user group command and parsed result 134 | """ 135 | is_command = UserGroupCommandParser.is_user_group_command(message) 136 | parsed = UserGroupCommandParser.parse(message) if is_command else None 137 | 138 | return { 139 | "is_command": is_command, 140 | "parsed": parsed 141 | } 142 | -------------------------------------------------------------------------------- /maru_lang/templates/yaml/agents/builtin/agents_response.yaml: -------------------------------------------------------------------------------- 1 | # Response Agent Configuration 2 | # Formats and delivers results from other agents to the end user 3 | 4 | name: response 5 | description: "Formats outputs from other agents into user-friendly responses" 6 | type: builtin 7 | enabled: true 8 | version: "1.0.0" 9 | 10 | # LLM configuration 11 | target_llm_config: 12 | server_name: "openai" 13 | override_params: 14 | temperature: 0.7 15 | max_tokens: 3000 16 | 17 | fallback_strategy: "any_available" 18 | 19 | # Prompt configuration 20 | prompts: 21 | system_prompt: | 22 | You are a professional and friendly AI assistant. 23 | Your job is to take results from other system components (agents) 24 | and craft a final answer that is easy to understand and genuinely helpful for the user. 25 | 26 | Primary responsibilities: 27 | - Turn agent outputs into natural, fluent sentences. 28 | - Explain technical details in user-friendly language. 29 | - Present structured data in readable formats. 30 | - Communicate errors or warnings with empathy and clarity. 31 | - Provide additional context or explanations when helpful. 32 | - Adapt your response based on different execution outcomes (success, failure, partial success, errors). 33 | 34 | Response principles: 35 | - Use clear, easy-to-follow language. 36 | - Highlight important information. 37 | - Format structured data appropriately (bullet points, numbered lists, tables, etc.). 38 | - Briefly explain technical terms when necessary. 39 | - Maintain a positive, supportive tone. 40 | - Even when reporting errors, remain polite and constructive. 41 | 42 | user_prompt_template: | 43 | User question: {question} 44 | 45 | Execution scenario: {scenario} 46 | 47 | Agent outputs: 48 | {agent_result} 49 | 50 | Using the information above, write a kind and clear final response for the user. 51 | 52 | Guidelines: 53 | - Adjust tone and content to match the execution scenario. 54 | - Hide implementation details; share only what is helpful for the user. 55 | - Write naturally, as if conversing with the user. 56 | - Apply Markdown formatting when it improves readability. 57 | - Do not mention agent names or internal architecture; focus on the results. 58 | 59 | # Implementation class (builtin agent) 60 | implementation: builtin.response_agent.ResponseAgent 61 | 62 | # Agent configuration 63 | config: 64 | timeout: 30 65 | retry_count: 2 66 | max_context_length: 10000 67 | 68 | # Response formatting options 69 | formatting: 70 | include_metadata: false # Whether to include metadata in the response 71 | show_sources: true # Whether to list information sources 72 | use_markdown: true # Whether to render responses using Markdown 73 | max_response_length: 2000 # Maximum length of the response 74 | 75 | # Scenario-specific LLM guidance (appended to the prompt) 76 | scenario_config: 77 | no_agents: "No agents were selected. Ask the user for more details or offer general assistance." 
78 | errors: "An error occurred while running the agents. Explain the issue politely and suggest trying again." 79 | success: "Agents completed successfully. Present the results in a user-friendly manner." 80 | partial_success: "Some agents succeeded while others failed. Share the successful results first and briefly mention the failures." 81 | unknown: "The situation is unclear. Let the user know that assistance is limited and offer alternative help." 82 | 83 | # Fallback responses when no LLM output is available 84 | fallback_config: 85 | no_agents: "I’m sorry, I couldn’t find an appropriate agent to handle that question. Could you provide more details?" 86 | errors: "I’m sorry, something went wrong while processing your request." 87 | success: "" # Empty string: use the formatted_context as-is 88 | partial_success: "" # Empty string: use the formatted_context as-is 89 | unknown: "I’m sorry, I’m unable to generate a response right now." 90 | 91 | # Examples (this agent formats outputs from other agents) 92 | examples: 93 | - "Format knowledge_search results into a user-friendly response" 94 | - "Communicate error messages politely" 95 | -------------------------------------------------------------------------------- /maru_lang/dependencies/email.py: -------------------------------------------------------------------------------- 1 | """ 2 | Email service dependency for FastAPI 3 | """ 4 | from abc import ABC, abstractmethod 5 | from typing import Optional 6 | from fastapi import Depends 7 | from maru_lang.configs.system_config import get_system_config 8 | 9 | config = get_system_config() 10 | 11 | 12 | class EmailService(ABC): 13 | """Abstract base class for email services""" 14 | 15 | @abstractmethod 16 | def send_email(self, recipient: str, subject: str, body: str) -> bool: 17 | pass 18 | 19 | @abstractmethod 20 | def send_otp(self, recipient: str, code: str) -> bool: 21 | pass 22 | 23 | 24 | class O365EmailManager(EmailService): 25 | """Office 365 email service implementation""" 26 | 27 | def __init__(self): 28 | self.sender_email = config.email.sender_email 29 | self.client_id = config.email.o365.client_id 30 | self.client_secret = config.email.o365.client_secret 31 | self.tenant_id = config.email.o365.tenant_id 32 | 33 | def send_email(self, recipient: str, subject: str, body: str) -> bool: 34 | try: 35 | from O365 import Account 36 | 37 | credentials = (self.client_id, self.client_secret) 38 | scopes = ["https://graph.microsoft.com/.default"] 39 | account = Account( 40 | credentials, 41 | auth_flow_type="credentials", 42 | tenant_id=self.tenant_id 43 | ) 44 | 45 | if not account.is_authenticated: 46 | account.authenticate(scopes=scopes) 47 | 48 | mailbox = account.mailbox(resource=self.sender_email) 49 | message = mailbox.new_message() 50 | message.to.add(recipient) 51 | message.subject = subject 52 | message.body = body 53 | message.body_type = "HTML" 54 | message.send() 55 | return True 56 | except Exception as e: 57 | print(f"Failed to send email: {e}") 58 | return False 59 | 60 | def send_otp(self, recipient: str, code: str) -> bool: 61 | subject = f"{code} - Maru Lang Verification Code" 62 | body = f""" 63 |
<div>
64 |             <div>
65 |                 <h2>Your Verification Code</h2>
66 |                 <p>Use this code to verify your email address:</p>
67 |                 <h1>
68 |                     {code}
69 |                 </h1>
70 |                 <p>This code expires in 10 minutes.</p>
71 |             </div>
72 |         </div>
73 | """ 74 | return self.send_email(recipient, subject, body) 75 | 76 | 77 | def get_email_manager() -> Optional[EmailService]: 78 | """Get email service instance based on settings""" 79 | if not config.email.service_type: 80 | return None 81 | 82 | if config.email.service_type == "o365": 83 | if all([config.email.o365.client_id, config.email.o365.client_secret, config.email.o365.tenant_id, config.email.sender_email]): 84 | try: 85 | return O365EmailManager() 86 | except Exception as e: 87 | print(f"Failed to initialize O365 Email Manager: {e}") 88 | return None 89 | 90 | # TODO: smtp 타입 지원 추가 91 | return None 92 | 93 | 94 | def get_email_service_dependency() -> Optional[EmailService]: 95 | """FastAPI dependency for email service""" 96 | return get_email_manager() 97 | 98 | 99 | __all__ = [ 100 | "EmailService", 101 | "O365EmailManager", 102 | "get_email_manager", 103 | "get_email_service_dependency", 104 | ] 105 | -------------------------------------------------------------------------------- /maru_lang/dependencies/auth.py: -------------------------------------------------------------------------------- 1 | # 시크릿 키, 알고리즘, 토큰 만료 시간 등을 settings에서 관리 2 | from fastapi import Depends, HTTPException, status, Request, Body 3 | from fastapi.security import OAuth2PasswordBearer 4 | from maru_lang.enums.auth import UserRoleCode 5 | from maru_lang.core.relation_db.models.auth import User, UserRole, RefreshToken 6 | from maru_lang.utils.security import decode_token 7 | from maru_lang.services.auth import refresh_token_flow 8 | 9 | # 1) OAuth2 스키마 설정 10 | oauth2_scheme = OAuth2PasswordBearer( 11 | tokenUrl="/auth/editor/login", 12 | auto_error=False) 13 | 14 | 15 | async def get_user( 16 | request: Request, 17 | token: str = Depends(oauth2_scheme) 18 | ) -> User: 19 | """토큰에서 유저 ID 등을 추출하여 실제 유저 정보를 가져오는 함수""" 20 | # device-id는 헤더 또는 쿼리스트링(device-id)로 전달 받을 수 있도록 확장 21 | device_id_in_header = request.headers.get("device-id") or request.query_params.get("device-id") 22 | 23 | # 토큰은 헤더(Authorization) 또는 쿼리 파라미터(token)에서 받을 수 있음 24 | # SSE/EventSource는 커스텀 헤더를 지원하지 않으므로 쿼리 파라미터 지원 필요 25 | if not token: 26 | token = request.query_params.get("token") 27 | 28 | payload = decode_token(token) if token else None 29 | 30 | if payload is None: 31 | # AccessToken 만료 → refresh_token 꺼내기 32 | refresh_token = request.cookies.get("refresh_token") 33 | if not refresh_token: 34 | # 임시로 만약 서버를 재시작 했을때를 대비해서 35 | # 보안적으로 안전하지 않다 salt를 해야할수도 36 | refresh_token_object = await RefreshToken.filter( 37 | device_id=device_id_in_header 38 | ).order_by( 39 | "-created_at" 40 | ).first() 41 | if refresh_token_object: 42 | refresh_token = refresh_token_object.refresh_token 43 | try: 44 | decode_token(refresh_token) 45 | except Exception: 46 | raise HTTPException( 47 | status_code=401, detail="Invalid refresh token") 48 | 49 | if refresh_token: 50 | new_access_token = await refresh_token_flow(refresh_token, device_id_in_header) 51 | if new_access_token: 52 | # 새 토큰으로 재인증 시도 53 | payload = decode_token(new_access_token) 54 | # 🔥 새 AccessToken을 응답 헤더에 추가 (선택) 55 | request.state.new_access_token = new_access_token 56 | 57 | if payload is None: 58 | raise HTTPException( 59 | status_code=status.HTTP_401_UNAUTHORIZED, 60 | detail="Invalid or expired token", 61 | headers={"WWW-Authenticate": "Bearer"}, 62 | ) 63 | 64 | user_id = payload.get("sub") 65 | 66 | if user_id is None: 67 | raise HTTPException( 68 | status_code=status.HTTP_401_UNAUTHORIZED, 69 | detail="Invalid token: no user_id", 70 | headers={"WWW-Authenticate": "Bearer"}, 71 
72 | 
73 |     user = await User.get_or_none(id=user_id)
74 |     if not user:
75 |         raise HTTPException(
76 |             status_code=status.HTTP_401_UNAUTHORIZED,
77 |             detail="User not found",
78 |         )
79 | 
80 |     return user
81 | 
82 | 
83 | def get_user_with_role(
84 |     required_role: UserRoleCode,
85 | ):
86 |     async def dependency(
87 |         user: User = Depends(get_user)
88 |     ):
89 |         # Role precedence (lowest to highest privilege)
90 |         ROLE_HIERARCHY = [
91 |             UserRoleCode.EDITOR,
92 |             UserRoleCode.ADMIN,
93 |         ]
94 |         # get user role
95 |         user_role = await UserRole.get_or_none(
96 |             id=user.role_id
97 |         )
98 | 
99 |         if not user_role:
100 |             raise HTTPException(status_code=401, detail="Unauthorized role")
101 | 
102 |         try:
103 |             user_index = ROLE_HIERARCHY.index(UserRoleCode(user_role.name))
104 |             required_index = ROLE_HIERARCHY.index(required_role)
105 |         except ValueError:
106 |             raise HTTPException(status_code=401, detail="Invalid role")
107 | 
108 |         if user_index < required_index:
109 |             raise HTTPException(status_code=403, detail="Permission denied")
110 | 
111 |         return user
112 | 
113 |     return dependency
114 | 
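# Illustrative usage (a sketch — the route below is hypothetical, not part of
# this repository): wrapping an endpoint with the returned dependency rejects
# callers whose role sits below the required one in ROLE_HIERARCHY:
#
#     @router.get("/admin/overview")
#     async def admin_overview(
#         user: User = Depends(get_user_with_role(UserRoleCode.ADMIN))
#     ):
#         return {"user_id": user.id}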
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/documents.py:
--------------------------------------------------------------------------------
1 | from tortoise.models import Model
2 | from tortoise import fields
3 | from maru_lang.enums.documents import (
4 |     PermissionAction,
5 |     DocumentStatus,
6 | )
7 | 
8 | 
9 | class Document(Model):
10 |     id = fields.CharField(pk=True, max_length=64)  # ULID/UUIDv7 recommended
11 |     name = fields.CharField(max_length=255, index=True)
12 | 
13 |     file_path = fields.CharField(max_length=500, null=True)
14 |     file_size = fields.BigIntField(null=True)
15 |     head_hash = fields.CharField(
16 |         max_length=64, null=True, index=True)  # blake3 of the first 64KB
17 |     full_hash = fields.CharField(
18 |         max_length=64, null=True, index=True)  # blake3 of the whole file (may be computed lazily)
19 |     source_fingerprint = fields.CharField(
20 |         max_length=64, unique=True, null=True)  # key used for upserts
21 | 
22 |     metadata = fields.JSONField(default=dict)
23 |     status = fields.IntEnumField(
24 |         DocumentStatus, default=DocumentStatus.PROCESSING)
25 |     created_at = fields.DatetimeField(auto_now_add=True)
26 |     updated_at = fields.DatetimeField(auto_now=True)
27 | 
28 |     class Meta:
29 |         table = "document"
30 |         indexes = [["name", "file_size", "head_hash"]]
31 | 
32 | 
33 | class DocumentGroup(Model):
34 |     id = fields.IntField(pk=True)
35 |     name = fields.CharField(max_length=255, unique=True)  # uniquely identified by its full path
36 |     base_path = fields.CharField(
37 |         max_length=500,
38 |         unique=True,  # a filesystem path maps to exactly one DocumentGroup
39 |     )
40 |     description = fields.TextField(null=True)  # description of the DocumentGroup
41 | 
42 |     # Version ID for VDB chunk filtering and version management
43 |     version_id = fields.CharField(
44 |         max_length=64,
45 |         null=True,  # null until embedding completes
46 |         index=True  # indexed for lookup performance
47 |     )
48 | 
49 |     # Manager (owner) of this document group
50 |     manager = fields.ForeignKeyField(
51 |         "models.User",
52 |         related_name="managed_document_groups",
53 |         on_delete=fields.RESTRICT  # a User cannot be deleted while managing a DocumentGroup
54 |     )
55 | 
56 |     # Pluggable component configurations (used during ingestion)
57 |     loader = fields.CharField(max_length=255, null=True)  # name of the loader used
58 |     chunker = fields.CharField(max_length=255, null=True)  # name of the chunker used
59 |     embedding_model = fields.CharField(max_length=255, null=True)  # name of the embedding model used
60 | 
61 |     # Configuration snapshot (for detecting changes)
62 |     config_snapshot = fields.JSONField(null=True, default=dict)  # snapshot of the settings used
63 | 
64 |     minhash_signature = fields.JSONField(null=True)  # MinHash signature (array of 128 integers)
65 |     signature_updated_at = fields.DatetimeField(auto_now=True)
66 | 
67 |     class Meta:
68 |         table = "document_group"
69 | 
70 | 
71 | class DocumentGroupMembership(Model):
72 |     document = fields.ForeignKeyField(
73 |         "models.Document",
74 |         related_name="group_memberships",
75 |         on_delete=fields.CASCADE)
76 |     group = fields.ForeignKeyField(
77 |         "models.DocumentGroup",
78 |         related_name="documents",
79 |         on_delete=fields.CASCADE)
80 | 
81 |     class Meta:
82 |         table = "document_group_membership"
83 | 
84 | 
85 | class DocumentGroupInclusion(Model):
86 |     parent = fields.ForeignKeyField(
87 |         "models.DocumentGroup",
88 |         related_name="includes",
89 |         on_delete=fields.CASCADE)
90 |     child = fields.ForeignKeyField(
91 |         "models.DocumentGroup",
92 |         related_name="included_by",
93 |         on_delete=fields.CASCADE)
94 | 
95 |     class Meta:
96 |         table = "document_group_inclusion"
97 |         unique_together = ("parent", "child")
98 | 
99 | 
100 | # Group ↔ DocumentGroup permissions
101 | class GroupPermission(Model):
102 |     user_group = fields.ForeignKeyField(
103 |         "models.UserGroup",
104 |         related_name="permissions",
105 |         on_delete=fields.CASCADE)
106 |     document_group = fields.ForeignKeyField(
107 |         "models.DocumentGroup",
108 |         related_name="permissions",
109 |         on_delete=fields.CASCADE)
110 |     action = fields.IntEnumField(PermissionAction)
111 | 
112 |     class Meta:
113 |         table = "group_permission"
114 |         unique_together = (("user_group", "document_group", "action"),)
--------------------------------------------------------------------------------
/maru_lang/commands/tree.py:
--------------------------------------------------------------------------------
1 | """
2 | Commands for inspecting and managing the DocumentGroup hierarchy
3 | """
4 | import typer
5 | from typing import Optional
6 | from maru_lang.core.relation_db.models.documents import (
7 |     DocumentGroup,
8 |     DocumentGroupInclusion,
9 | )
10 | 
11 | 
12 | async def get_root_groups() -> list[DocumentGroup]:
13 |     """
14 |     Fetch the root groups (groups never referenced as a child in DocumentGroupInclusion).
15 | 
16 |     Returns:
17 |         List of root DocumentGroups
18 |     """
19 |     # Group IDs that appear as child_id
20 |     child_ids = await DocumentGroupInclusion.all().values_list("child_id", flat=True)
21 |     child_ids_set = set(child_ids)
22 | 
23 |     # Fetch all groups
24 |     all_groups = await DocumentGroup.all()
25 | 
26 |     # Keep only groups never referenced as a child (the roots)
27 |     root_groups = [g for g in all_groups if g.id not in child_ids_set]
28 | 
29 |     return sorted(root_groups, key=lambda g: g.name)
30 | 
31 | 
32 | async def get_children_groups(parent_group: DocumentGroup) -> list[DocumentGroup]:
33 |     """
34 |     Fetch the direct child groups of a given group.
35 | 
36 |     Args:
37 |         parent_group: Parent group
38 | 
39 |     Returns:
40 |         List of child DocumentGroups
41 |     """
42 |     inclusions = await DocumentGroupInclusion.filter(
43 |         parent=parent_group
44 |     ).prefetch_related("child")
45 | 
46 |     children = [inc.child for inc in inclusions]
47 |     return sorted(children, key=lambda g: g.name)
48 | 
49 | 
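# Illustrative shape of the tree output produced below (group names invented):
#
#   📁 Document Group hierarchy:
#
#   engineering
#   ├── backend
#   └── frontend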
없습니다.", fg=typer.colors.YELLOW) 73 | return 74 | 75 | typer.echo("\n📁 Document Group 계층 구조:\n") 76 | for i, root in enumerate(root_groups): 77 | is_last_root = (i == len(root_groups) - 1) 78 | await print_group_tree(root, max_depth, 0, "", is_last_root) 79 | else: 80 | # 현재 그룹 출력 81 | if current_depth == 0: 82 | connector = "" 83 | typer.secho(f"{group.name}", fg=typer.colors.CYAN, bold=True) 84 | else: 85 | connector = "└── " if is_last else "├── " 86 | typer.secho(f"{prefix}{connector}{group.name}", fg=typer.colors.GREEN) 87 | 88 | # 최대 깊이에 도달하면 중단 89 | if current_depth >= max_depth: 90 | return 91 | 92 | # 자식 그룹들 재귀 출력 93 | children = await get_children_groups(group) 94 | 95 | for i, child in enumerate(children): 96 | is_last_child = (i == len(children) - 1) 97 | 98 | if current_depth == 0: 99 | child_prefix = "" 100 | else: 101 | child_prefix = prefix + (" " if is_last else "│ ") 102 | 103 | await print_group_tree( 104 | child, 105 | max_depth, 106 | current_depth + 1, 107 | child_prefix, 108 | is_last_child 109 | ) 110 | 111 | 112 | async def show_group_tree_command( 113 | group_name: Optional[str] = None, 114 | depth: int = 2 115 | ): 116 | """ 117 | DocumentGroup 계층 구조 출력 명령어 118 | 119 | Args: 120 | group_name: 특정 그룹명 (없으면 루트 그룹들만 표시) 121 | depth: 표시할 최대 깊이 122 | """ 123 | if group_name: 124 | # 특정 그룹 조회 125 | group = await DocumentGroup.get_or_none(name=group_name.lower()) 126 | 127 | if not group: 128 | typer.secho( 129 | f"❌ '{group_name}' 그룹을 찾을 수 없습니다.", 130 | fg=typer.colors.RED 131 | ) 132 | raise typer.Exit(1) 133 | 134 | typer.echo(f"\n📁 '{group.name}' 그룹 계층 구조 (depth={depth}):\n") 135 | await print_group_tree(group, max_depth=depth) 136 | else: 137 | # 루트 그룹들만 표시 (depth=1) 138 | await print_group_tree(None, max_depth=1) 139 | 140 | typer.echo() # 빈 줄 141 | -------------------------------------------------------------------------------- /maru_lang/api/endpoints/auth.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | from fastapi import APIRouter, HTTPException, Depends, Response 3 | from maru_lang.enums.auth import UserRoleCode 4 | from maru_lang.configs.system_config import get_system_config 5 | from maru_lang.dependencies.auth import get_user 6 | 7 | config = get_system_config() 8 | from maru_lang.dependencies.email import get_email_service_dependency, EmailService 9 | from maru_lang.schemas.auth import ( 10 | VerifyCodeRequest, 11 | SignUpRequest, 12 | LogoutRequest, 13 | UserGroupsResponse, 14 | UserGroupResponse, 15 | ) 16 | from maru_lang.services.auth import ( 17 | generate_token, 18 | verify_OTP, 19 | create_or_get_user, 20 | delete_token, 21 | generate_OTP, 22 | get_user_groups, 23 | ) 24 | 25 | 26 | router = APIRouter( 27 | prefix="/auth", 28 | tags=["Auth"] 29 | ) 30 | 31 | 32 | @router.post("/login") 33 | async def login( 34 | request: SignUpRequest, 35 | email_service: Optional[EmailService] = Depends( 36 | get_email_service_dependency) 37 | ) -> str: 38 | try: 39 | # TODO Email validation 40 | otp = await generate_OTP(request.email, email_service) 41 | 42 | # 이메일 서비스가 활성화된 경우에만 이메일 전송 43 | if email_service: 44 | success = email_service.send_otp(request.email, otp.code) 45 | if not success: 46 | # 이메일 전송 실패 시 DEFAULT_VALIDATION_CODE로 재생성 47 | await otp.delete() 48 | otp = await generate_OTP(request.email, None) 49 | 50 | return otp.email 51 | except Exception as e: 52 | print(e) 53 | raise HTTPException( 54 | status_code=400, 55 | detail="서버가 점검 중 입니다. 
57 | 
58 | @router.post("/logout")
59 | async def logout(
60 |     request: LogoutRequest,
61 |     response: Response,
62 |     user=Depends(get_user)
63 | ) -> dict:
64 |     try:
65 |         await delete_token(user.id, request.device_id)
66 |         response.delete_cookie(
67 |             key="refresh_token",
68 |             path="/",
69 |             samesite="strict"
70 |         )
71 |         return {"message": "Logged out successfully"}
72 |     except Exception as e:
73 |         raise HTTPException(status_code=500, detail=str(e))
74 | 
75 | 
76 | @router.post("/verify/code")
77 | async def verify_code(
78 |     response: Response,
79 |     request: VerifyCodeRequest
80 | ):
81 |     try:
82 |         if not await verify_OTP(request.email, request.code):
83 |             raise Exception("Invalid or expired code")
84 |         user = await create_or_get_user(
85 |             email=request.email,
86 |             role=UserRoleCode.EDITOR.value
87 |         )
88 |         access_token, refresh_token = await generate_token(
89 |             user.id,
90 |             user.role_id,
91 |             request.device_id)
92 | 
93 |         response.set_cookie(
94 |             key="refresh_token",
95 |             value=refresh_token,
96 |             httponly=True,
97 |             secure=True,
98 |             samesite="strict",
99 |             max_age=config.auth.refresh_token_expire_minutes * 60
100 |         )
101 | 
102 |         return access_token
103 |     except Exception as e:
104 |         raise HTTPException(status_code=400, detail=str(e))
105 | 
106 | 
107 | @router.get("/verify")
108 | async def verify(_=Depends(get_user)):
109 |     return {"message": "ok"}
110 | 
111 | 
112 | @router.get("/user/groups", response_model=UserGroupsResponse)
113 | async def get_current_user_groups(
114 |     user=Depends(get_user)
115 | ):
116 |     """
117 |     Get user groups that the authenticated user belongs to.
118 | 
119 |     Returns:
120 |         UserGroupsResponse: List of user groups with total count
121 |     """
122 |     try:
123 |         # Get user groups using service function
124 |         groups = await get_user_groups(user)
125 | 
126 |         # Convert to response format
127 |         group_responses = [
128 |             UserGroupResponse(
129 |                 id=group.id,
130 |                 name=group.name
131 |             )
132 |             for group in groups
133 |         ]
134 |         return UserGroupsResponse(
135 |             groups=group_responses,
136 |             total=len(group_responses)
137 |         )
138 | 
139 |     except Exception as e:
140 |         print(f"❌ Error fetching user groups: {str(e)}")
141 |         raise HTTPException(status_code=500, detail=str(e))
142 | 
--------------------------------------------------------------------------------
/maru_lang/pluggable/embedders/manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Embedder: manages embedding models and turns text into vectors.
3 | Models are cached per process so GPU resources are used efficiently.
4 | """
5 | from typing import Dict, List, Optional
6 | from sentence_transformers import SentenceTransformer
7 | 
8 | 
9 | class Embedder:
10 |     """
11 |     Embedding model manager.
12 | 
13 |     Caches and reuses embedding models within the process.
14 |     Provides a simple interface that converts text to vectors via encode().
15 |     """
16 | 
17 |     def __init__(self, device: Optional[str] = None):
18 |         """
19 |         Args:
20 |             device: Device to load models on (auto-selected if None),
21 |                 e.g. "cuda", "cpu", "mps"
22 |         """
23 |         self.device = device
24 |         self.model_cache: Dict[str, SentenceTransformer] = {}
25 | 
26 |     def encode(
27 |         self,
28 |         texts: List[str],
29 |         model_name: str,
30 |         show_progress: bool = False,
31 |     ) -> List[List[float]]:
32 |         """
33 |         Convert texts into embedding vectors.
34 | 
35 |         Args:
36 |             texts: Texts to embed
37 |             model_name: Embedding model name
38 |             show_progress: Whether to display a progress bar
39 | 
40 |         Returns:
41 |             List[List[float]]: List of embedding vectors
42 |         """
43 |         model = self._get_or_load_model(model_name)
44 |         vectors = model.encode(
45 |             texts, show_progress_bar=show_progress, convert_to_numpy=True
46 |         )
47 |         return vectors.tolist()
48 | 
49 |     def get_dimension(self, model_name: str) -> int:
50 |         """
51 |         Return the embedding dimension.
52 | 
53 |         Args:
54 |             model_name: Embedding model name
55 | 
56 |         Returns:
57 |             int: Dimension of the embedding vectors
58 |         """
59 |         model = self._get_or_load_model(model_name)
60 |         return model.get_sentence_embedding_dimension()
61 | 
62 |     def _get_or_load_model(self, model_name: str) -> SentenceTransformer:
63 |         """
64 |         Load and cache the model (internal method).
65 | 
66 |         Args:
67 |             model_name: Embedding model name
68 | 
69 |         Returns:
70 |             SentenceTransformer: Loaded model instance
71 |         """
72 |         if model_name not in self.model_cache:
73 |             self.model_cache[model_name] = SentenceTransformer(
74 |                 model_name, device=self.device
75 |             )
76 | 
77 |         return self.model_cache[model_name]
78 | 
79 |     def unload_model(self, model_name: str) -> bool:
80 |         """
81 |         Release a model from memory.
82 | 
83 |         Args:
84 |             model_name: Name of the model to unload
85 | 
86 |         Returns:
87 |             bool: Whether the unload succeeded
88 |         """
89 |         if model_name in self.model_cache:
90 |             del self.model_cache[model_name]
91 |             print(f"🗑️ Model unloaded: {model_name}")
92 |             return True
93 |         return False
94 | 
95 |     def clear_cache(self):
96 |         """Release all cached models."""
97 |         count = len(self.model_cache)
98 |         self.model_cache.clear()
99 |         print(f"🗑️ Cleared {count} model(s) from cache")
100 | 
101 | 
102 | # Singleton instance
103 | _embedder_instance: Optional[Embedder] = None
104 | 
105 | 
106 | def get_embedder(
107 |     device: Optional[str] = None,
108 |     force_new: bool = False,
109 | ) -> Embedder:
110 |     """
111 |     Return the Embedder singleton instance.
112 | 
113 |     Args:
114 |         device: Device to load models on (read from config if None),
115 |             e.g. "cuda", "cpu", "mps"
116 |         force_new: If True, ignore the existing instance and create a new one (for tests)
117 | 
118 |     Returns:
119 |         Embedder: Singleton instance
120 | 
121 |     Example:
122 |         >>> embedder = get_embedder()
123 |         >>> vectors = embedder.encode(["hello", "world"], "intfloat/multilingual-e5-large")
124 |     """
125 |     global _embedder_instance
126 | 
127 |     if _embedder_instance is None or force_new:
128 |         # If device is None, load it from config
129 |         if device is None:
130 |             device = _load_device_from_config()
131 | 
132 |         _embedder_instance = Embedder(device=device)
133 | 
134 |     return _embedder_instance
135 | 
136 | 
137 | def _load_device_from_config() -> Optional[str]:
138 |     """
139 |     Load the device setting from config using the ConfigManager.
140 | 
141 |     Returns:
142 |         Optional[str]: Device setting read from config, or None if absent
143 |     """
144 |     try:
145 |         from maru_lang.configs import get_config_manager
146 | 
147 |         config_manager = get_config_manager()
148 |         merged_config = config_manager.get_embedder_config()
149 | 
150 |         if merged_config:
151 |             return merged_config.device
152 |     except ImportError:
153 |         pass
154 |     except Exception as e:
155 |         print(f"⚠️ Failed to load embedder config: {e}")
156 | 
157 |     return None
158 | 
--------------------------------------------------------------------------------
/maru_lang/models/agents.py:
--------------------------------------------------------------------------------
1 | """
2 | Agent-related data models
3 | """
4 | import asyncio
5 | from dataclasses import dataclass, field
6 | from typing import List, Dict, Any, Optional, Union, TYPE_CHECKING
7 | from maru_lang.enums.chat import ChatProcessStep as ChatStep
8 | from maru_lang.models.chat import ChatHistory
9 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
10 | 
11 | 
12 | @dataclass
13 | class AgentResult:
14 |     """Result from individual agent execution"""
15 |     success: bool
16 |     result: str = ""  # primary output (normalized string)
17 |     data: Optional[Dict[str, Any]] = None  # supplementary data (optional)
18 |     error: Optional[str] = None
19 |     metadata: Optional[Dict[str, Any]] = None
20 | 
21 |     def _serialize_value(self, value: Any) -> Any:
22 |         """Recursively serialize values to JSON-compatible format"""
23 |         if value is None or isinstance(value, (str, int, float, bool)):
24 |             return value
25 |         elif isinstance(value, dict):
26 |             return {k: self._serialize_value(v) for k, v in value.items()}
27 |         elif isinstance(value, (list, tuple)):
28 |             return [self._serialize_value(item) for item in value]
29 |         elif hasattr(value, 'text'):
30 |             # Handle MCP TextContent objects
31 |             return value.text
32 |         elif hasattr(value, 'to_dict'):
33 |             return self._serialize_value(value.to_dict())
34 |         elif hasattr(value, '__dict__'):
35 |             return self._serialize_value(value.__dict__)
36 |         else:
37 |             # Fallback: convert to string
38 |             return str(value)
39 | 
40 |     def to_dict(self) -> Dict[str, Any]:
41 |         """Convert to dictionary with safe serialization"""
42 |         return {
43 |             "success": self.success,
44 |             "result": self.result,
45 |             "data": self._serialize_value(self.data),
46 |             "error": self.error,
47 |             "metadata": self._serialize_value(self.metadata)
48 |         }
49 | 
50 | 
51 | @dataclass
52 | class AgentSelection:
53 |     """Result of agent selection process"""
54 |     selected_agents: List[str]
55 |     execution_order: List[str]
56 |     reasoning: str
57 |     parameters: Optional[Dict[str, Any]] = None
58 |     fallback_config: Optional[Dict[str, Any]] = None
59 | 
60 |     def to_dict(self) -> Dict[str, Any]:
61 |         """Convert to dictionary"""
62 |         return {
63 |             "selected_agents": self.selected_agents,
64 |             "execution_order": self.execution_order,
65 |             "reasoning": self.reasoning,
66 |             "parameters": self.parameters or {},
67 |             "fallback_config": self.fallback_config
68 |         }
69 | 
70 | 
71 | @dataclass
72 | class ExecutionContext:
73 |     """Context of agent execution"""
74 |     question: str
75 |     progress_queue: asyncio.Queue
76 |     chat_history: ChatHistory
77 |     metadata: Optional[Dict[str, Any]] = field(default_factory=dict)
78 | 
79 |     def to_dict(self) -> Dict[str, Any]:
80 |         """Convert to dictionary"""
81 |         # NOTE: progress_queue is intended to be excluded, but is still included here
82 |         return {
83 |             "question": self.question,
84 |             "progress_queue": self.progress_queue,
85 |             "chat_history": self.chat_history,
86 |             "metadata": self.metadata
87 |         }
88 | 
89 | 
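# Illustrative serialization behavior (a sketch, not part of the module):
# _serialize_value() recurses through dicts, lists, and tuples and falls back
# to str() for unknown objects, so to_dict() always yields JSON-safe values:
#
#     >>> r = AgentResult(success=True, result="4", data={"steps": [1, (2, 3)]})
#     >>> r.to_dict()["data"]
#     {'steps': [1, [2, 3]]}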
90 | @dataclass
91 | class ExecutionResult:
92 |     """Result of agent execution orchestration"""
93 |     agent_results: Dict[str, AgentResult]
94 |     execution_order: List[str]
95 |     success: bool
96 |     errors: Dict[str, str] = field(default_factory=dict)
97 | 
98 |     def to_dict(self) -> Dict[str, Any]:
99 |         """Convert to dictionary"""
100 |         return {
101 |             "agent_results": {
102 |                 name: result.to_dict()
103 |                 for name, result in self.agent_results.items()
104 |             },
105 |             "execution_order": self.execution_order,
106 |             "success": self.success,
107 |             "errors": self.errors
108 |         }
109 | 
110 | 
111 | @dataclass
112 | class ChatResult:
113 |     """Final chat processing result"""
114 |     answer: str
115 |     internal_documents: List[RetrieveDocument] = field(default_factory=list)
116 | 
117 | 
118 | @dataclass
119 | class ChatProcess:
120 |     """Chat processing result"""
121 |     step: ChatStep
122 |     data: Union[AgentSelection, ExecutionResult, str, ChatResult]
123 | 
124 | 
125 | @dataclass
126 | class GenerateAnswerResult:
127 |     """Result from answer generation"""
128 |     answer: str
129 |     documents: List[Any] = field(default_factory=list)
130 |     success: bool = True
131 |     confidence: Optional[float] = None
132 |     metadata: Optional[Dict[str, Any]] = None
133 | 
134 | 
135 | 
--------------------------------------------------------------------------------
/maru_lang/pluggable/rerankers/manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Reranker: re-ranks retrieval results.
3 | Models are cached per process so GPU resources are used efficiently.
4 | """
5 | from typing import Dict, List, Optional, Tuple
6 | from maru_lang.configs import get_config_manager
7 | from sentence_transformers import CrossEncoder
8 | 
9 | 
10 | class Reranker:
11 |     """
12 |     Reranker manager.
13 | 
14 |     Caches and reuses reranker models within the process.
15 |     Provides a simple interface that re-ranks search results via rerank().
16 |     """
17 | 
18 |     def __init__(self, device: Optional[str] = None):
19 |         """
20 |         Args:
21 |             device: Device to load models on (auto-selected if None),
22 |                 e.g. "cuda", "cpu", "mps"
23 |         """
24 |         self.device = device
25 |         self.model_cache: Dict[str, CrossEncoder] = {}
26 | 
27 |     def rerank(
28 |         self,
29 |         query: str,
30 |         documents: List[str],
31 |         model_name: str,
32 |         top_k: Optional[int] = None,
33 |     ) -> List[Tuple[int, float]]:
34 |         """
35 |         Re-rank documents against a query.
36 | 
37 |         Args:
38 |             query: Search query
39 |             documents: Documents to re-rank
40 |             model_name: Reranker model name
41 |             top_k: Return only the top k results (all if None)
42 | 
43 |         Returns:
44 |             List[Tuple[int, float]]: (original index, score) tuples, sorted by descending score
45 |         """
46 |         model = self._get_or_load_model(model_name)
47 | 
48 |         # Build query-document pairs
49 |         pairs = [[query, doc] for doc in documents]
50 | 
51 |         # Compute relevance scores
52 |         scores = model.predict(pairs)
53 | 
54 |         # Build (index, score) tuples and sort by score
55 |         ranked = [(idx, float(score)) for idx, score in enumerate(scores)]
56 |         ranked.sort(key=lambda x: x[1], reverse=True)
57 | 
58 |         # Apply the top_k limit
59 |         if top_k is not None:
60 |             ranked = ranked[:top_k]
61 | 
62 |         return ranked
63 | 
64 |     def _get_or_load_model(self, model_name: str) -> CrossEncoder:
65 |         """
66 |         Load and cache the model (internal method).
67 | 
68 |         Args:
69 |             model_name: Reranker model name
70 | 
71 |         Returns:
72 |             CrossEncoder: Loaded model instance
73 |         """
74 |         if model_name not in self.model_cache:
75 |             print(f"Loading reranker model: {model_name}...")
76 |             self.model_cache[model_name] = CrossEncoder(
77 |                 model_name, device=self.device
78 |             )
79 |             device_info = f"device={self.device}" if self.device else "auto"
80 |             print(f"✅ Reranker loaded: {model_name} ({device_info})")
81 | 
82 |         return self.model_cache[model_name]
83 | 
84 |     def unload_model(self, model_name: str) -> bool:
85 |         """
86 |         Release a model from memory.
87 | 
88 |         Args:
89 |             model_name: Name of the model to unload
90 | 
91 |         Returns:
92 |             bool: Whether the unload succeeded
93 |         """
94 |         if model_name in self.model_cache:
95 |             del self.model_cache[model_name]
96 |             print(f"🗑️ Reranker unloaded: {model_name}")
97 |             return True
98 |         return False
99 | 
100 |     def clear_cache(self):
101 |         """Release all cached models."""
102 |         count = len(self.model_cache)
103 |         self.model_cache.clear()
104 |         print(f"🗑️ Cleared {count} reranker model(s) from cache")
105 | 
106 | 
107 | # Singleton instance
108 | _reranker_instance: Optional[Reranker] = None
109 | 
110 | 
111 | def get_reranker(
112 |     device: Optional[str] = None,
113 |     force_new: bool = False,
114 | ) -> Reranker:
115 |     """
116 |     Return the Reranker singleton instance.
117 | 
118 |     Args:
119 |         device: Device to load models on (read from config if None),
120 |             e.g. "cuda", "cpu", "mps"
121 |         force_new: If True, ignore the existing instance and create a new one (for tests)
122 | 
123 |     Returns:
124 |         Reranker: Singleton instance
125 | 
126 |     Example:
127 |         >>> reranker = get_reranker()
128 |         >>> ranked = reranker.rerank(
129 |         ...     query="python tutorial",
130 |         ...     documents=["doc1", "doc2", "doc3"],
131 |         ...     model_name="BAAI/bge-reranker-v2-m3",
132 |         ...     top_k=5
133 |         ... )
134 |     """
135 |     global _reranker_instance
136 | 
137 |     if _reranker_instance is None or force_new:
138 |         # If device is None, load it from config (use the same device as the embedder)
139 |         if device is None:
140 |             device = _load_device_from_config()
141 | 
142 |         _reranker_instance = Reranker(device=device)
143 | 
144 |     return _reranker_instance
145 | 
146 | 
147 | def _load_device_from_config() -> Optional[str]:
148 |     """
149 |     Load the device setting from config using the ConfigManager.
150 |     Uses the same device as the embedder config.
151 | 
152 |     Returns:
153 |         Optional[str]: Device setting read from config, or None if absent
154 |     """
155 |     try:
156 | 
157 |         config_manager = get_config_manager()
158 |         embedder_config = config_manager.get_embedder_config()
159 | 
160 |         if embedder_config:
161 |             return embedder_config.device
162 |     except ImportError:
163 |         pass
164 |     except Exception as e:
165 |         print(f"⚠️ Failed to load reranker config: {e}")
166 | 
167 |     return None
168 | 
--------------------------------------------------------------------------------