├── maru_lang
    ├── py.typed
    ├── api
    │   ├── __init__.py
    │   └── endpoints
    │   │   ├── __init__.py
    │   │   ├── user_group.py
    │   │   └── auth.py
    ├── core
    │   ├── __init__.py
    │   ├── relation_db
    │   │   ├── models
    │   │   │   ├── __init__.py
    │   │   │   ├── chat.py
    │   │   │   ├── auth.py
    │   │   │   └── documents.py
    │   │   ├── __init__.py
    │   │   └── connection.py
    │   └── vector_db
    │   │   ├── __init__.py
    │   │   ├── factory.py
    │   │   ├── retrieve_document.py
    │   │   └── base.py
    ├── commands
    │   ├── __init__.py
    │   ├── transfer.py
    │   └── tree.py
    ├── pipelines
    │   ├── __init__.py
    │   ├── chat
    │   │   └── __init__.py
    │   ├── ingest
    │   │   └── __init__.py
    │   └── base.py
    ├── schemas
    │   ├── __init__.py
    │   ├── chat.py
    │   ├── auth.py
    │   └── ingest.py
    ├── services
    │   ├── __init__.py
    │   ├── admin.py
    │   ├── chat.py
    │   └── ingest.py
    ├── dependencies
    │   ├── __init__.py
    │   ├── llm.py
    │   ├── chat.py
    │   ├── ingest.py
    │   ├── email.py
    │   └── auth.py
    ├── pluggable
    │   ├── embedders
    │   │   ├── __init__.py
    │   │   └── manager.py
    │   ├── rerankers
    │   │   ├── __init__.py
    │   │   └── manager.py
    │   ├── llms
    │   │   └── __init__.py
    │   ├── retrievers
    │   │   └── __init__.py
    │   ├── __init__.py
    │   ├── agents
    │   │   ├── __init__.py
    │   │   ├── builtin
    │   │   │   ├── __init__.py
    │   │   │   └── intent_extractor.py
    │   │   └── agent_factory.py
    │   ├── models
    │   │   ├── chunker.py
    │   │   ├── embedder.py
    │   │   ├── reranker.py
    │   │   ├── __init__.py
    │   │   ├── llm.py
    │   │   └── loader.py
    │   ├── configs
    │   │   ├── __init__.py
    │   │   ├── rag_loader.py
    │   │   ├── chunker_config.py
    │   │   ├── embedder_config.py
    │   │   ├── loader_config.py
    │   │   └── reranker_config.py
    │   ├── chunkers
    │   │   ├── base.py
    │   │   ├── sentence.py
    │   │   └── paragraph.py
    │   └── loaders
    │   │   ├── txt_parser.py
    │   │   ├── markdown_parser.py
    │   │   ├── base.py
    │   │   ├── pdf_parser.py
    │   │   ├── json_parser.py
    │   │   ├── docx_parser.py
    │   │   ├── html_parser.py
    │   │   ├── yaml_parser.py
    │   │   ├── xlsx_parser.py
    │   │   ├── csv_parser.py
    │   │   ├── pptx_parser.py
    │   │   └── xml_parser.py
    ├── enums
    │   ├── chat.py
    │   ├── agents.py
    │   ├── documents.py
    │   ├── configs.py
    │   ├── auth.py
    │   └── __init__.py
    ├── templates
    │   ├── yaml
    │   │   ├── embedder_config.yaml
    │   │   ├── openai.yaml
    │   │   ├── local.yaml
    │   │   ├── chunker_config.yaml
    │   │   ├── agents
    │   │   │   ├── mcps
    │   │   │   │   └── agents_firecrawl_mcp.yaml
    │   │   │   ├── agents_calculator.yaml
    │   │   │   └── builtin
    │   │   │   │   ├── agents_knowledge_search.yaml
    │   │   │   │   ├── agents_intent_extractor.yaml
    │   │   │   │   ├── agents_keyword_extractor.yaml
    │   │   │   │   ├── agents_group_classifier.yaml
    │   │   │   │   └── agents_response.yaml
    │   │   ├── reranker_config.yaml
    │   │   ├── loader_config.yaml
    │   │   ├── agents_build_selector.yaml
    │   │   ├── rag_config.yaml
    │   │   ├── llm_reranker.yaml
    │   │   └── system_config.yaml
    │   └── python
    │   │   ├── main.py
    │   │   ├── calculator_agent.py
    │   │   └── custom_parser.py
    ├── __init__.py
    ├── models
    │   ├── ingest.py
    │   ├── configs
    │   │   ├── __init__.py
    │   │   └── group.py
    │   ├── vector_db.py
    │   └── agents.py
    ├── utils
    │   ├── __init__.py
    │   ├── distribution.py
    │   ├── document.py
    │   └── security.py
    └── configs
    │   └── __init__.py
├── LICENSE
└── pyproject.toml


/maru_lang/py.typed:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/api/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/core/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/commands/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/schemas/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/services/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/api/endpoints/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/dependencies/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .auth import *
2 | from .documents import *
3 | from .chat import *
4 | 


--------------------------------------------------------------------------------
/maru_lang/pipelines/chat/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Chat Pipeline
3 | """
4 | from maru_lang.pipelines.chat.pipeline import ChatPipeline
5 | 
6 | __all__ = ["ChatPipeline"]
7 | 


--------------------------------------------------------------------------------
/maru_lang/pipelines/ingest/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Ingest Pipeline
3 | """
4 | from maru_lang.pipelines.ingest.pipeline import IngestPipeline, IngestResult
5 | 
6 | __all__ = ["IngestPipeline", "IngestResult"]
7 | 


--------------------------------------------------------------------------------
/maru_lang/core/vector_db/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import VectorDB
2 | from .chroma import ChromaVectorDB
3 | from .retrieve_document import RetrieveDocument
4 | 
5 | __all__ = ["VectorDB", "ChromaVectorDB", "RetrieveDocument"]


--------------------------------------------------------------------------------
/maru_lang/pluggable/embedders/__init__.py:
--------------------------------------------------------------------------------
 1 | """Embedder for handling embedding models."""
 2 | 
 3 | from .manager import (
 4 |     Embedder,
 5 |     get_embedder,
 6 | )
 7 | 
 8 | __all__ = [
 9 |     "Embedder",
10 |     "get_embedder",
11 | ]
12 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/rerankers/__init__.py:
--------------------------------------------------------------------------------
 1 | """Reranker for handling reranking models."""
 2 | 
 3 | from .manager import (
 4 |     Reranker,
 5 |     get_reranker,
 6 | )
 7 | 
 8 | __all__ = [
 9 |     "Reranker",
10 |     "get_reranker",
11 | ]
12 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/llms/__init__.py:
--------------------------------------------------------------------------------
 1 | # LLM 클라이언트
 2 | from .client import LLMServerClient
 3 | 
 4 | # LLM 서버 매니저
 5 | from .server_manager import LLMServerManager
 6 | 
 7 | __all__ = [
 8 |     "LLMServerClient",
 9 |     "LLMServerManager"
10 | ]
11 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/retrievers/__init__.py:
--------------------------------------------------------------------------------
 1 | """Retriever for handling search operations."""
 2 | 
 3 | from .manager import (
 4 |     Retriever,
 5 |     get_retriever,
 6 |     RetriveMethod,
 7 | )
 8 | 
 9 | __all__ = [
10 |     "Retriever",
11 |     "get_retriever",
12 |     "RetriveMethod",
13 | ]
14 | 


--------------------------------------------------------------------------------
/maru_lang/enums/chat.py:
--------------------------------------------------------------------------------
 1 | from enum import Enum
 2 | 
 3 | 
 4 | class ChatProcessStep(str, Enum):
 5 |     """채팅 처리 단계"""
 6 |     START = "start"
 7 |     AGENT_SELECTION = "agent_selection"
 8 |     AGENT_EXECUTION = "agent_execution"
 9 |     ANSWER_GENERATION = "answer_generation"
10 |     COMPLETED = "completed"


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/embedder_config.yaml:
--------------------------------------------------------------------------------
 1 | # Embedder Configuration
 2 | # Configure embedding models and device preferences
 3 | 
 4 | # Default embedding model for all document groups
 5 | # Can be overridden per-group in rag_config.yaml
 6 | default_model: BAAI/bge-m3
 7 | 
 8 | # Device selection (null => auto-detect: cuda > mps > cpu)
 9 | device: null
10 | 


--------------------------------------------------------------------------------
/maru_lang/enums/agents.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent-related enums
 3 | """
 4 | from enum import Enum
 5 | 
 6 | 
 7 | class LLMFallbackStrategy(Enum):
 8 |     """LLM fallback strategies when specified LLM server is not available"""
 9 |     ANY_AVAILABLE = "any_available"  # Use any available LLM server
10 |     ERROR = "error"                  # Raise error and stop execution


--------------------------------------------------------------------------------
/maru_lang/__init__.py:
--------------------------------------------------------------------------------
 1 | """MaruLang - Advanced AI Agent Framework with RAG and multi-agent system"""
 2 | 
 3 | __version__ = "0.0.0"
 4 | 
 5 | from maru_lang.app import MaruLangApp, default_app
 6 | 
 7 | # FastAPI app instance
 8 | app = default_app.get_fastapi_app()
 9 | 
10 | __all__ = [
11 |     "MaruLangApp",
12 |     "default_app",
13 |     "app",
14 |     "__version__",
15 | ]


--------------------------------------------------------------------------------
/maru_lang/enums/documents.py:
--------------------------------------------------------------------------------
 1 | from enum import IntEnum
 2 | 
 3 | 
 4 | class PermissionAction(IntEnum):
 5 |     READ = 1
 6 |     WRITE = 2
 7 |     MANAGE = 3  # sync, base_path 변경 등 관리 권한
 8 | 
 9 | 
10 | class DocumentStatus(IntEnum):
11 |     PROCESSING = 1  # 처리 중 (파싱/청킹/임베딩 대기)
12 |     ACTIVE = 2      # 활성화 (임베딩 완료, 검색 가능)
13 |     INACTIVE = 3    # 비활성화 (검색 불가)
14 | 
15 | 


--------------------------------------------------------------------------------
/maru_lang/enums/configs.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Configuration type enums
 3 | """
 4 | from enum import Enum
 5 | 
 6 | 
 7 | class ConfigType(Enum):
 8 |     """Configuration types"""
 9 |     LLMS = "llms"
10 |     RAGS = "rags"  # RAG 설정 (retriever + groups)
11 |     AGENTS = "agents"
12 |     LOADERS = "loaders"
13 |     CHUNKERS = "chunkers"
14 |     EMBEDDERS = "embedders"
15 |     RERANKERS = "rerankers"


--------------------------------------------------------------------------------
/maru_lang/pluggable/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Pluggable components for extensibility
 3 | 
 4 | This package contains all pluggable/extensible components:
 5 | - loaders: File parsers (txt, pdf, docx, etc.)
 6 | - chunkers: Text chunking strategies (paragraph, sentence, etc.)
 7 | - embedders: Embedding models management
 8 | - rerankers: Result reranking models
 9 | - configs: Configuration loaders for pluggable components
10 | - models: Data models for configurations
11 | """
12 | 


--------------------------------------------------------------------------------
/maru_lang/enums/auth.py:
--------------------------------------------------------------------------------
 1 | from __future__ import annotations
 2 | from enum import Enum
 3 | 
 4 | 
 5 | class UserRoleCode(Enum):
 6 |     # 기본적으로 생성하는 코드 사용자가 만들 수 있다.
 7 |     EDITOR = 'editor'
 8 |     ADMIN = 'admin'
 9 |     
10 |     @classmethod
11 |     def is_valid_role(cls, role_name: str) -> bool:
12 |         try:
13 |             cls(role_name)
14 |             return True
15 |         except ValueError as e:
16 |             print(e)
17 |             return False
18 | 


--------------------------------------------------------------------------------
/maru_lang/enums/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Enums for the LLM Chatbot application
 3 | """
 4 | from .agents import LLMFallbackStrategy
 5 | from .auth import UserRoleCode
 6 | from .chat import ChatProcessStep
 7 | from .configs import ConfigType
 8 | from .documents import PermissionAction, DocumentStatus
 9 | 
10 | __all__ = [
11 |     "LLMFallbackStrategy",
12 |     "UserRoleCode",
13 |     "ChatProcessStep",
14 |     "ConfigType",
15 |     "PermissionAction",
16 |     "DocumentStatus",
17 | ]


--------------------------------------------------------------------------------
/maru_lang/models/ingest.py:
--------------------------------------------------------------------------------
 1 | from dataclasses import dataclass
 2 | from typing import Optional
 3 | 
 4 | 
 5 | @dataclass(frozen=True)
 6 | class PipelineConfig:
 7 |     model_name: str
 8 |     model_dim: int
 9 |     normalize_ver: str
10 |     pooling: str
11 |     lang_hint: Optional[str] = None
12 |     pipeline_version: Optional[str] = None  # 메타 기록용
13 | 
14 | 
15 | @dataclass(frozen=True)
16 | class ChunkInput:
17 |     number: int               # 페이지/문단/슬롯 인덱스
18 |     content: str
19 |     meta: Optional[dict] = None
20 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent components for the chatbot system
 3 | """
 4 | from .base import BaseAgent
 5 | from .agent_selector import AgentSelector
 6 | from .agent_executor import AgentExecutor
 7 | from .agent_factory import AgentFactory
 8 | from .mcp_client_agent import MCPClientAgent
 9 | 
10 | __all__ = [
11 |     # Core components
12 |     "BaseAgent",
13 |     "AgentSelector",
14 |     "AgentExecutor",
15 |     "AgentFactory",
16 |     # Individual agents
17 |     "DocumentSearchAgent",
18 |     "MCPClientAgent",
19 | ]
20 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/models/chunker.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Chunker configuration models
 3 | """
 4 | from dataclasses import dataclass, field
 5 | from typing import Dict, Any
 6 | 
 7 | 
 8 | @dataclass
 9 | class ChunkerConfig:
10 |     """
11 |     Chunker configuration
12 | 
13 |     각 chunker의 생성자 파라미터를 설정
14 |     """
15 |     # chunker 이름 -> 생성자 파라미터 매핑
16 |     # 예: {"paragraph": {"max_chunk_size": 500}}
17 |     chunkers: Dict[str, Dict[str, Any]] = field(default_factory=dict)
18 | 
19 |     # Configuration metadata
20 |     source_path: str = ""
21 |     is_override: bool = False
22 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/openai.yaml:
--------------------------------------------------------------------------------
 1 | # OpenAI API basic template
 2 | 
 3 | name: openai
 4 | description: "OpenAI API"
 5 | url: https://api.openai.com
 6 | model_name: gpt-4o-mini
 7 | api_key: ${OPENAI_API_KEY}
 8 | timeout: 30
 9 | enabled: true
10 | 
11 | chat_completions_path: /v1/chat/completions
12 | health_check_endpoint: /v1/models
13 | 
14 | headers:
15 |   Content-Type: application/json
16 | 
17 | config:
18 |   temperature: 0.7
19 |   max_tokens: 2000
20 |   top_p: 1.0
21 | 
22 | retry:
23 |   max_attempts: 3
24 |   backoff_factor: 2
25 |   max_delay: 60
26 | 
27 | log_level: INFO


--------------------------------------------------------------------------------
/maru_lang/pluggable/models/embedder.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Embedder configuration models
 3 | """
 4 | from dataclasses import dataclass
 5 | from typing import Optional
 6 | 
 7 | 
 8 | @dataclass
 9 | class EmbedderConfig:
10 |     """
11 |     Embedder configuration
12 | 
13 |     임베딩 모델 및 디바이스 설정
14 |     """
15 |     # 기본 임베딩 모델 (모든 document group의 기본값)
16 |     # rag_config.yaml에서 그룹별로 override 가능
17 |     default_model: str = "BAAI/bge-m3"
18 | 
19 |     # 디바이스 (None이면 자동 선택: cuda > mps > cpu)
20 |     device: Optional[str] = None
21 | 
22 |     # Configuration metadata
23 |     source_path: str = ""
24 |     is_override: bool = False
25 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/local.yaml:
--------------------------------------------------------------------------------
 1 | # OpenAI-Compatible Local LLM Template
 2 | 
 3 | name: local-llm
 4 | description: "Local OpenAI-Compatible Server"
 5 | url: http://localhost:8000
 6 | model_name: meta-llama/Llama-2-7b-chat-hf
 7 | api_key: ""
 8 | timeout: 60
 9 | enabled: true
10 | 
11 | chat_completions_path: /v1/chat/completions
12 | health_check_endpoint: /health
13 | 
14 | headers:
15 |   Content-Type: application/json
16 | 
17 | config:
18 |   temperature: 0.7
19 |   max_tokens: 2048
20 |   top_p: 0.95
21 |   stream: false
22 | 
23 | retry:
24 |   max_attempts: 3
25 |   backoff_factor: 2
26 |   max_delay: 60
27 | 
28 | log_level: INFO


--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/__init__.py:
--------------------------------------------------------------------------------
 1 | """Configuration loaders for pluggable components"""
 2 | 
 3 | from .llm_config import LLMConfigLoader
 4 | from .agent_config import AgentConfigLoader
 5 | from .loader_config import LoaderConfigLoader
 6 | from .chunker_config import ChunkerConfigLoader
 7 | from .embedder_config import EmbedderConfigLoader
 8 | from .reranker_config import RerankerConfigLoader
 9 | from .rag_loader import RagConfigLoader
10 | 
11 | __all__ = [
12 |     "LLMConfigLoader",
13 |     "AgentConfigLoader",
14 |     "LoaderConfigLoader",
15 |     "ChunkerConfigLoader",
16 |     "EmbedderConfigLoader",
17 |     "RerankerConfigLoader",
18 |     "RagConfigLoader",
19 | ]
20 | 


--------------------------------------------------------------------------------
/maru_lang/schemas/chat.py:
--------------------------------------------------------------------------------
 1 | from typing import Any
 2 | from typing import List, Optional
 3 | from datetime import datetime
 4 | from pydantic import BaseModel, Field, field_validator
 5 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
 6 | 
 7 | 
 8 | class ChatRequest(BaseModel):
 9 |     content: str
10 |     session_start_time: Optional[datetime] = Field(
11 |         default=None,
12 |         description="세션 시작 시간")
13 | 
14 | class ChatResponse(BaseModel):
15 |     answer: str
16 |     references: list[RetrieveDocument]
17 | 
18 | 
19 | class ConversationResponse(BaseModel):
20 |     id: int
21 |     question: str
22 |     answer: str
23 |     created_at: datetime
24 | 


--------------------------------------------------------------------------------
/maru_lang/utils/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Unified utility module.
 3 | 
 4 | This package exposes shared utility functions used across the project.
 5 | 
 6 | Submodules:
 7 | - security: Security and encryption utilities (JWT, AES, etc.)
 8 | 
 9 | """
10 | 
11 | 
12 | # Security utilities
13 | from .security import (
14 |     generate_anonymized_key,
15 |     create_jwt_token,
16 |     decode_token,
17 |     get_key_spec,
18 |     aes256_decrypt,
19 |     aes256_encrypt
20 | )
21 | 
22 | __all__ = [
23 |     
24 |     # Security helpers
25 |     "generate_anonymized_key",
26 |     "create_jwt_token",
27 |     "decode_token",
28 |     "get_key_spec",
29 |     "aes256_decrypt",
30 |     "aes256_encrypt"
31 | ]


--------------------------------------------------------------------------------
/maru_lang/dependencies/llm.py:
--------------------------------------------------------------------------------
 1 | from maru_lang.pluggable.llms import LLMServerClient, LLMServerManager
 2 | 
 3 | 
 4 | _llm_manager = None
 5 | 
 6 | 
 7 | async def get_llm_manager() -> LLMServerManager:
 8 |     """LLMServerManager 인스턴스를 반환합니다."""
 9 |     global _llm_manager
10 |     if _llm_manager is None:
11 |         _llm_manager = LLMServerManager()
12 |     # 서버가 초기화되지 않았다면 초기화
13 |     if not _llm_manager.all_servers:
14 |         await _llm_manager.initialize_servers()
15 | 
16 |     return _llm_manager
17 | 
18 | 
19 | async def get_llm() -> LLMServerClient | None:
20 |     """활성화된 LLM 서버 중 하나를 반환합니다."""
21 |     manager = await get_llm_manager()
22 | 
23 | 
24 |     return await manager.get_active_server()
25 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/base.py:
--------------------------------------------------------------------------------
 1 | from abc import ABC, abstractmethod
 2 | from typing import List, Optional
 3 | from maru_lang.models.ingest import ChunkInput
 4 | 
 5 | 
 6 | class BaseChunker(ABC):
 7 |     """텍스트 청킹 전략의 기본 인터페이스"""
 8 | 
 9 |     # Chunker 식별 정보
10 |     name: str = "base_chunker"
11 |     description: str = "기본 청킹 전략"
12 | 
13 |     @abstractmethod
14 |     def chunk(self, text: str) -> List[ChunkInput]:
15 |         """전체 텍스트를 받아서 ChunkInput 리스트로 변환"""
16 |         pass
17 | 
18 |     def get_metadata(self) -> dict:
19 |         """Chunker 메타데이터 반환"""
20 |         return {
21 |             "chunker_name": self.name,
22 |             "chunker_description": self.description,
23 |         }
24 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/models/reranker.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Reranker configuration models
 3 | """
 4 | from dataclasses import dataclass
 5 | from typing import Optional, Literal
 6 | 
 7 | 
 8 | @dataclass
 9 | class RerankerConfig:
10 |     """Reranker configuration - reranker 모델 및 사용 여부 설정"""
11 |     enabled: bool = True
12 |     method: Literal["model", "agent"] = "model"
13 | 
14 |     # Method: "model" - 임베딩 모델 기반 reranking
15 |     default_model: str = "BAAI/bge-reranker-v2-m3"
16 | 
17 |     # Method: "agent" - Agent 기반 reranking (LLM 등)
18 |     agent_name: Optional[str] = None
19 | 
20 |     # Reranking 후 반환할 최대 개수 (None이면 원본 k 사용)
21 |     top_k: Optional[int] = 5
22 | 
23 |     source_path: str = ""
24 |     is_override: bool = False
25 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/builtin/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Builtin agents - core system agents
 3 | These agents are not customizable by users and are part of the core system
 4 | """
 5 | from maru_lang.pluggable.agents.builtin.group_classifier import GroupClassifierAgent
 6 | from maru_lang.pluggable.agents.builtin.intent_extractor import IntentExtractorAgent
 7 | from maru_lang.pluggable.agents.builtin.keyword_extractor import KeywordExtractorAgent
 8 | from maru_lang.pluggable.agents.builtin.response_agent import ResponseAgent
 9 | from maru_lang.pluggable.agents.builtin.knowledge_search import KnowledgeSearchAgent
10 | 
11 | __all__ = [
12 |     "GroupClassifierAgent",
13 |     "IntentExtractorAgent",
14 |     "KeywordExtractorAgent",
15 |     "ResponseAgent",
16 |     "KnowledgeSearchAgent",
17 | ]
18 | 


--------------------------------------------------------------------------------
/maru_lang/models/configs/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Configuration models for the LLM Chatbot application
 3 | 
 4 | Note: Most config models have been moved to pluggable.models
 5 | This module now only contains Group configuration which is not pluggable
 6 | """
 7 | from .group import GroupConfig, GroupsConfig
 8 | 
 9 | # Import pluggable models for backward compatibility
10 | from maru_lang.pluggable.models import (
11 |     LLMConfig,
12 |     AgentConfig,
13 |     LoaderConfig,
14 |     ExtensionMapping,
15 |     ChunkerConfig,
16 |     EmbedderConfig,
17 |     RerankerConfig,
18 | )
19 | 
20 | __all__ = [
21 |     "LLMConfig",
22 |     "GroupConfig",
23 |     "GroupsConfig",
24 |     "AgentConfig",
25 |     "LoaderConfig",
26 |     "ExtensionMapping",
27 |     "ChunkerConfig",
28 |     "EmbedderConfig",
29 |     "RerankerConfig",
30 | ]


--------------------------------------------------------------------------------
/maru_lang/schemas/auth.py:
--------------------------------------------------------------------------------
 1 | from pydantic import BaseModel
 2 | from typing import List, Optional
 3 | 
 4 | 
 5 | class SignUpRequest(BaseModel):
 6 |     email: str
 7 | 
 8 | 
 9 | class LogoutRequest(BaseModel):
10 |     device_id: str
11 | 
12 | 
13 | class VerifyCodeRequest(BaseModel):
14 |     device_id: str
15 |     email: str
16 |     code: str
17 | 
18 | 
19 | class UserResponse(BaseModel):
20 |     id: int
21 |     email: str
22 |     name: Optional[str] = None
23 | 
24 |     class Config:
25 |         from_attributes = True
26 | 
27 | 
28 | class UserGroupResponse(BaseModel):
29 |     id: int
30 |     name: str
31 |     manager: Optional[UserResponse] = None
32 |     created_at: Optional[str] = None
33 | 
34 |     class Config:
35 |         from_attributes = True
36 | 
37 | 
38 | class UserGroupsResponse(BaseModel):
39 |     groups: List[UserGroupResponse]
40 |     total: int
41 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/models/__init__.py:
--------------------------------------------------------------------------------
 1 | """Configuration data models for pluggable components"""
 2 | 
 3 | from .llm import LLMConfig
 4 | from .agent import AgentConfig
 5 | from .loader import LoaderConfig, ExtensionMapping
 6 | from .chunker import ChunkerConfig
 7 | from .embedder import EmbedderConfig
 8 | from .reranker import RerankerConfig
 9 | from .rag import (
10 |     RagConfig,
11 |     RetrieverConfig,
12 |     GroupRagConfig,
13 |     QueryTypeWeights,
14 |     FallbackLogicConfig,
15 |     GroupComponents,
16 | )
17 | 
18 | __all__ = [
19 |     "LLMConfig",
20 |     "AgentConfig",
21 |     "LoaderConfig",
22 |     "ExtensionMapping",
23 |     "ChunkerConfig",
24 |     "EmbedderConfig",
25 |     "RerankerConfig",
26 |     "RagConfig",
27 |     "RetrieverConfig",
28 |     "GroupRagConfig",
29 |     "QueryTypeWeights",
30 |     "FallbackLogicConfig",
31 |     "GroupComponents",
32 | ]
33 | 


--------------------------------------------------------------------------------
/maru_lang/services/admin.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Admin user management service
 3 | """
 4 | from maru_lang.core.relation_db.models.auth import User
 5 | 
 6 | 
 7 | ADMIN_EMAIL = "admin@maru.local"
 8 | ADMIN_NAME = "Admin"
 9 | 
10 | 
11 | async def get_or_create_admin_user() -> User:
12 |     """
13 |     Admin 사용자를 가져오거나 없으면 생성합니다.
14 |     CLI 명령어는 기본적으로 admin 사용자로 실행됩니다.
15 | 
16 |     Returns:
17 |         Admin User 인스턴스
18 |     """
19 |     admin_user = await User.get_or_none(email=ADMIN_EMAIL)
20 | 
21 |     if admin_user is None:
22 |         admin_user = await User.create(
23 |             email=ADMIN_EMAIL,
24 |             name=ADMIN_NAME,
25 |         )
26 | 
27 |     return admin_user
28 | 
29 | 
30 | async def ensure_admin_user() -> User:
31 |     """
32 |     Admin 사용자가 존재하는지 확인하고 반환합니다.
33 |     DB 초기화 시 호출됩니다.
34 | 
35 |     Returns:
36 |         Admin User 인스턴스
37 |     """
38 |     return await get_or_create_admin_user()
39 | 


--------------------------------------------------------------------------------
/maru_lang/templates/python/main.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Custom MaruLang Application
 3 | This file was generated by maru install command.
 4 | """
 5 | from maru_lang import MaruLangApp
 6 | 
 7 | # Create your custom MaruLang instance
 8 | app = MaruLangApp(
 9 |     title="My MaruLang App",
10 |     version="1.0.0",
11 |     description="Custom MaruLang Application"
12 | )
13 | 
14 | # You can customize the app here
15 | # For example:
16 | # - Add custom startup events
17 | # - Add custom routes
18 | # - Add middleware
19 | # - Configure CORS settings
20 | 
21 | @app.on_event("startup")
22 | async def custom_startup():
23 |     """Custom startup event"""
24 |     print("🚀 Custom MaruLang app started!")
25 | 
26 | # Optional: Add custom routes
27 | # @app.get("/custom-health")
28 | # async def custom_health_check():
29 | #     return {"status": "healthy", "custom": True}
30 | 
31 | # The app instance will be imported by the serve command
32 | # Usage: maru serve --app-module main:app


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/chunker_config.yaml:
--------------------------------------------------------------------------------
 1 | # Chunker Configuration
 2 | # Configure constructor parameters for each chunker.
 3 | #
 4 | # Available built-in chunkers:
 5 | #   - paragraph: chunk by paragraph (split on blank lines)
 6 | #   - sentence: chunk by sentence and merge when needed
 7 | #   - fixed_size: chunk by fixed size (supports overlap)
 8 | #
 9 | # Add custom chunkers under the chunkers/ directory.
10 | 
11 | # Chunker-specific settings
12 | chunkers:
13 |   # Paragraph-based chunker
14 |   paragraph:
15 |     max_chunk_size: 2000
16 | 
17 |   # Sentence-based chunker
18 |   sentence:
19 |     max_chunk_size: 500
20 | 
21 |   # Fixed-size chunker
22 |   fixed_size:
23 |     chunk_size: 1000
24 |     overlap: 200
25 | 
26 | # Example for custom chunkers:
27 | # chunkers:
28 | #   header:  # Markdown header-based chunker
29 | #     max_level: 3
30 | #
31 | #   page:    # PDF page-based chunker
32 | #     combine_small_pages: true
33 | #     min_page_chars: 100
34 | 


--------------------------------------------------------------------------------
/maru_lang/models/configs/group.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Group configuration models
 3 | """
 4 | from dataclasses import dataclass, field
 5 | from typing import Dict, Any, List
 6 | 
 7 | 
 8 | @dataclass
 9 | class GroupConfig:
10 |     """Group configuration for chatbot categorization"""
11 |     name: str
12 |     description: str = ""
13 |     force_rag: bool = False
14 |     permissions: List[str] = field(default_factory=list)
15 |     prompts: List[str] = field(default_factory=list)
16 |     priority: str = "normal"  # high, normal, low
17 |     weight: float = 1.0
18 |     settings: Dict[str, Any] = field(default_factory=dict)
19 |     source_path: str = ""
20 |     is_override: bool = False
21 | 
22 | 
23 | @dataclass
24 | class GroupsConfig:
25 |     """Complete groups configuration including priorities"""
26 |     group_priorities: Dict[str, Any] = field(default_factory=dict)
27 |     groups: Dict[str, GroupConfig] = field(default_factory=dict)
28 |     tool_choice_reason: Dict[str, str] = field(default_factory=dict)
29 |     source_path: str = ""
30 |     is_override: bool = False


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2025 ML2
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/chat.py:
--------------------------------------------------------------------------------
 1 | from tortoise.models import Model
 2 | from tortoise import fields
 3 | from datetime import datetime, timezone
 4 | from enum import IntEnum
 5 | 
 6 | 
 7 | class Conversation(Model):
 8 |     id = fields.IntField(pk=True)
 9 |     user = fields.ForeignKeyField(
10 |         "models.User", related_name="conversations", on_delete=fields.OnDelete.CASCADE)
11 |     question = fields.TextField()  # 사용자 질문
12 |     enhanced_question = fields.TextField(null=True)  # 확장된 질문
13 |     answer = fields.TextField()    # AI 답변
14 |     metadata = fields.JSONField(default={})  # API 호출 정보, 토큰 수, 처리 시간 등
15 |     created_at = fields.DatetimeField(auto_now_add=True)
16 | 
17 |     class Meta:
18 |         table = "conversation"
19 | 
20 | 
21 | class ConversationReference(Model):
22 |     id = fields.IntField(pk=True)
23 |     conversation = fields.ForeignKeyField(
24 |         "models.Conversation",
25 |         related_name="references",
26 |         on_delete=fields.OnDelete.CASCADE)
27 |     document = fields.ForeignKeyField(
28 |         "models.Document",
29 |         related_name="conversation_references",
30 |         on_delete=fields.OnDelete.CASCADE)
31 |     score = fields.FloatField()  # 검색 관련성 점수
32 | 
33 |     class Meta:
34 |         table = "conversation_reference"
35 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/models/llm.py:
--------------------------------------------------------------------------------
 1 | """
 2 | LLM configuration models
 3 | """
 4 | from dataclasses import dataclass, field
 5 | from typing import Dict, Any, Optional
 6 | 
 7 | 
 8 | @dataclass
 9 | class LLMConfig:
10 |     """LLM server configuration"""
11 |     name: str
12 |     url: str
13 |     model_name: str = ""
14 |     description: str = ""
15 |     api_key: Optional[str] = None
16 |     timeout: float = 30.0
17 |     enabled: bool = True
18 |     max_retries: int = 3
19 |     health_check_endpoint: str = "/health"
20 |     chat_completions_path: str = "/v1/chat/completions"
21 |     headers: Dict[str, str] = field(default_factory=dict)
22 |     config: Dict[str, Any] = field(default_factory=dict)
23 |     health_check: Dict[str, Any] = field(default_factory=dict)
24 |     cost_tracking: Dict[str, Any] = field(default_factory=dict)
25 |     limits: Dict[str, Any] = field(default_factory=dict)
26 |     retry: Dict[str, Any] = field(default_factory=dict)
27 |     log_level: str = "INFO"
28 |     source_path: str = ""
29 |     is_override: bool = False
30 | 
31 |     def __post_init__(self):
32 |         """Process environment variables in api_key"""
33 |         if self.api_key and self.api_key.startswith('${') and self.api_key.endswith('}'):
34 |             import os
35 |             env_var = self.api_key[2:-1]
36 |             self.api_key = os.getenv(env_var)


--------------------------------------------------------------------------------
/maru_lang/pluggable/models/loader.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Loader configuration models
 3 | """
 4 | from dataclasses import dataclass, field
 5 | from typing import Dict, Optional, Any
 6 | 
 7 | 
 8 | @dataclass
 9 | class ExtensionMapping:
10 |     """확장자별 loader와 chunker 매핑"""
11 |     loader: str  # 사용할 loader (parser) 이름
12 |     chunker: str  # 사용할 chunker 이름
13 | 
14 | 
15 | @dataclass
16 | class LoaderConfig:
17 |     """
18 |     Loader configuration
19 | 
20 |     파일 확장자별로 어떤 loader(parser)와 chunker를 사용할지 설정
21 |     """
22 |     # Default loader/chunker (확장자 매핑 없을 때 사용)
23 |     # default_loader가 None이면 등록된 확장자만 처리 (whitelist 모드)
24 |     default_loader: Optional[str] = None
25 |     default_chunker: Optional[str] = "paragraph"
26 | 
27 |     # 확장자 -> {loader, chunker} 매핑
28 |     # 예: {".pdf": {"loader": "pdf", "chunker": "paragraph"}}
29 |     extensions: Dict[str, ExtensionMapping] = field(default_factory=dict)
30 | 
31 |     # Configuration metadata
32 |     source_path: str = ""
33 |     is_override: bool = False
34 | 
35 |     def __post_init__(self):
36 |         """Post-process configuration"""
37 |         # extensions를 dict에서 ExtensionMapping으로 변환
38 |         new_extensions = {}
39 |         for ext, mapping in self.extensions.items():
40 |             if isinstance(mapping, dict):
41 |                 new_extensions[ext] = ExtensionMapping(**mapping)
42 |             else:
43 |                 new_extensions[ext] = mapping
44 |         self.extensions = new_extensions
45 | 


--------------------------------------------------------------------------------
/maru_lang/core/relation_db/__init__.py:
--------------------------------------------------------------------------------
 1 | from .connection import get_register_orm, orm_context
 2 | 
 3 | 
 4 | # def get_tortoise_orm():
 5 | #     """
 6 | #     Get Tortoise ORM configuration lazily.
 7 | #
 8 | #     This function is called by Aerich when needed, avoiding issues with
 9 | #     configuration loading at import time.
10 | #     """
11 | #     from maru_lang.configs.system_config import get_system_config
12 | #
13 | #     config = get_system_config()
14 | #     if not config:
15 | #         raise RuntimeError(
16 | #             "System configuration not found. Please run 'maru install' first."
17 | #         )
18 | #
19 | #     return {
20 | #         "connections": {"default": config.database.get_database_url()},
21 | #         "apps": {
22 | #             "models": {
23 | #                 "models": ["maru_lang.models", "aerich.models"],
24 | #                 "default_connection": "default",
25 | #             },
26 | #         },
27 | #         "use_tz": True,
28 | #     }
29 | 
30 | 
31 | # Tortoise ORM configuration for Aerich
32 | # This is evaluated lazily - only accessed when needed by Aerich commands
33 | # try:
34 | #     TORTOISE_ORM = get_tortoise_orm()
35 | # except RuntimeError:
36 | #     # If config not available at import time, set to None
37 | #     # It will be initialized later when needed
38 | #     TORTOISE_ORM = None
39 | 
40 | __all__ = [
41 |     "get_register_orm",
42 |     "orm_context",
43 |     # "TORTOISE_ORM",
44 |     # "get_tortoise_orm",
45 | ]
46 | 


--------------------------------------------------------------------------------
/maru_lang/schemas/ingest.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Optional
 2 | from datetime import datetime
 3 | from pydantic import BaseModel, Field
 4 | 
 5 | 
 6 | class FileInfo(BaseModel):
 7 |     """Individual file information for sync check"""
 8 |     fileName: str = Field(..., description="파일 이름")
 9 |     createdAt: datetime = Field(..., description="파일 생성 시간")
10 |     relativePath: str = Field(..., description="상대 경로 (프로젝트폴더명/경로/파일명)")
11 |     size: int = Field(..., description="파일 크기 (bytes)")
12 | 
13 | 
14 | class SyncCheckRequest(BaseModel):
15 |     """Request for checking which files need to be uploaded"""
16 |     folderPath: str = Field(..., description="프로젝트 폴더명")
17 |     files: List[FileInfo] = Field(..., description="폴더 내 파일 정보 목록")
18 |     description: Optional[str] = Field(None, description="DocumentGroup 설명")
19 | 
20 | 
21 | class SyncCheckResponse(BaseModel):
22 |     """Response for sync check"""
23 |     filesToUpload: List[str] = Field(..., description="업로드가 필요한 파일의 relativePath 목록")
24 |     totalFiles: int = Field(..., description="전체 파일 개수")
25 |     message: str = Field(..., description="상태 메시지")
26 | 
27 | 
28 | class SyncUploadResponse(BaseModel):
29 |     """Response for batch upload"""
30 |     success: bool = Field(..., description="업로드 성공 여부")
31 |     message: str = Field(..., description="상태 메시지 (예: '배치 1/4 업로드 완료')")
32 |     uploadedCount: int = Field(..., description="업로드된 파일 개수")
33 |     errors: Optional[List[str]] = Field(default=None, description="에러 메시지 목록 (있는 경우)")
34 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/txt_parser.py:
--------------------------------------------------------------------------------
 1 | """Plain text file parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class TxtParser(BaseParser):
 8 |     """일반 텍스트 파일 파서"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         텍스트 파일을 읽어 내용을 반환합니다.
13 | 
14 |         Args:
15 |             file_path: 파싱할 텍스트 파일 경로
16 | 
17 |         Returns:
18 |             ParseResult: 파싱된 텍스트와 메타데이터
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
22 | 
23 |         try:
24 |             with open(file_path, 'r', encoding='utf-8') as f:
25 |                 content = f.read()
26 | 
27 |             metadata = {
28 |                 'file_type': 'text',
29 |                 'encoding': 'utf-8',
30 |                 'file_size': file_path.stat().st_size,
31 |             }
32 | 
33 |             return ParseResult(content=content, metadata=metadata)
34 | 
35 |         except UnicodeDecodeError as e:
36 |             raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
37 |         except Exception as e:
38 |             raise ValueError(f"파일 읽기 실패: {file_path}") from e
39 | 
40 |     def supports(self, file_path: Path) -> bool:
41 |         """텍스트 파일 확장자 지원 확인"""
42 |         return file_path.suffix.lower() in self.supported_extensions
43 | 
44 |     @property
45 |     def supported_extensions(self) -> list[str]:
46 |         """지원하는 텍스트 파일 확장자"""
47 |         return ['.txt', '.text', '.log']
48 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/mcps/agents_firecrawl_mcp.yaml:
--------------------------------------------------------------------------------
 1 | name: firecrawl_mcp
 2 | description: "Firecrawl MCP - Web scraping, crawling, and discovery. If crawling or searching is required, it finds information on the web."
 3 | type: mcp_client
 4 | enabled: false
 5 | version: "1.0.0"
 6 | 
 7 | # Agent Tags
 8 | tags:
 9 |   - mcp
10 |   - firecrawl
11 |   - tools
12 |   - files
13 | 
14 | # LLM Server Settings (Optimized for MCP tool usage)
15 | target_llm_config:
16 |   server_name: "openai"  # Model suitable for tool usage
17 | 
18 |   override_params:
19 |     temperature: 0.1  # Low temperature for accuracy in file operations
20 |     max_tokens: 2000
21 |     top_p: 0.8
22 | 
23 |   fallback_strategy: "error"  # MCP agents require an LLM
24 | 
25 | # MCP Server Connection Settings
26 | mcp_config:
27 |   # Notion MCP server configuration
28 |   transport: "stdio"
29 |   command: "npx"
30 |   args: ["-y", "firecrawl-mcp"]
31 |   env:
32 |     FIRECRAWL_API_KEY: ${FIRECRAWL_API_KEY}
33 |   timeout: 30
34 | 
35 | # Prompt Settings
36 | prompts:
37 |   system_prompt: |
38 |     You are an agent that uses web scraping, crawling, and search tools.
39 |     Perform web scraping, crawling, and search according to the user's requests.
40 | 
41 |     Available tools:
42 |     - web_scraping: Web scraping
43 |     - crawling: Crawling
44 |     - searching: Search
45 |   user_prompt_template: |
46 |     {question}
47 | 
48 | # Agent Execution Settings
49 | config:
50 |   timeout: 60  # File operations can take time
51 |   retry_count: 2
52 |   max_context_length: 8000  # File contents can be long


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/markdown_parser.py:
--------------------------------------------------------------------------------
 1 | """Markdown file parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class MarkdownParser(BaseParser):
 8 |     """마크다운 파일 파서"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         마크다운 파일을 읽어 내용을 반환합니다.
13 |         (나중에 HTML 변환 등의 추가 처리 가능)
14 | 
15 |         Args:
16 |             file_path: 파싱할 마크다운 파일 경로
17 | 
18 |         Returns:
19 |             ParseResult: 파싱된 텍스트와 메타데이터
20 |         """
21 |         if not file_path.exists():
22 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
23 | 
24 |         try:
25 |             with open(file_path, 'r', encoding='utf-8') as f:
26 |                 content = f.read()
27 | 
28 |             metadata = {
29 |                 'file_type': 'markdown',
30 |                 'encoding': 'utf-8',
31 |                 'file_size': file_path.stat().st_size,
32 |             }
33 | 
34 |             return ParseResult(content=content, metadata=metadata)
35 | 
36 |         except UnicodeDecodeError as e:
37 |             raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
38 |         except Exception as e:
39 |             raise ValueError(f"파일 읽기 실패: {file_path}") from e
40 | 
41 |     def supports(self, file_path: Path) -> bool:
42 |         """마크다운 파일 확장자 지원 확인"""
43 |         return file_path.suffix.lower() in self.supported_extensions
44 | 
45 |     @property
46 |     def supported_extensions(self) -> list[str]:
47 |         """지원하는 마크다운 파일 확장자"""
48 |         return ['.md', '.markdown', '.mdown', '.mkd']
49 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/sentence.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from typing import List
 3 | from maru_lang.models.ingest import ChunkInput
 4 | from .base import BaseChunker
 5 | 
 6 | 
 7 | class SentenceChunker(BaseChunker):
 8 |     """문장 단위로 청킹 (마침표/물음표/느낌표 기준), 최대 크기 제한"""
 9 | 
10 |     name = "sentence"
11 |     description = "문장 단위로 청킹하고 최대 크기에 맞춰 병합"
12 | 
13 |     def __init__(self, max_chunk_size: int = 500):
14 |         self.max_chunk_size = max_chunk_size
15 | 
16 |     def chunk(self, text: str) -> List[ChunkInput]:
17 |         # 한글/영문 문장 끝 패턴
18 |         sentence_pattern = r'[.!?]+[\s\n]+'
19 |         sentences = [s.strip() for s in re.split(sentence_pattern, text) if s.strip()]
20 | 
21 |         chunks = []
22 |         current_chunk = []
23 |         current_size = 0
24 |         chunk_num = 1
25 | 
26 |         for sentence in sentences:
27 |             sentence_len = len(sentence)
28 | 
29 |             if current_size + sentence_len > self.max_chunk_size and current_chunk:
30 |                 # 현재 청크 저장
31 |                 chunks.append(ChunkInput(
32 |                     number=chunk_num,
33 |                     content=' '.join(current_chunk)
34 |                 ))
35 |                 chunk_num += 1
36 |                 current_chunk = []
37 |                 current_size = 0
38 | 
39 |             current_chunk.append(sentence)
40 |             current_size += sentence_len
41 | 
42 |         # 마지막 청크
43 |         if current_chunk:
44 |             chunks.append(ChunkInput(
45 |                 number=chunk_num,
46 |                 content=' '.join(current_chunk)
47 |             ))
48 | 
49 |         return chunks
50 | 


--------------------------------------------------------------------------------
/maru_lang/core/relation_db/connection.py:
--------------------------------------------------------------------------------
 1 | from tortoise import Tortoise
 2 | from tortoise.contrib.fastapi import RegisterTortoise
 3 | from functools import partial
 4 | from maru_lang.configs.system_config import get_system_config
 5 | from contextlib import asynccontextmanager
 6 | from typing import Awaitable, Callable
 7 | import asyncio
 8 | 
 9 | 
10 | 
11 | def run_with_orm_context(coro: Callable[..., Awaitable], *args, **kwargs):
12 |     async def runner():
13 |         async with orm_context():
14 |             return await coro(*args, **kwargs)
15 |     return asyncio.run(runner())
16 | 
17 | 
18 | def get_register_orm():
19 |     config = get_system_config()
20 |     # partial을 사용해서 미리 설정된 RegisterTortoise를 반환
21 |     return partial(
22 |         RegisterTortoise,
23 |         generate_schemas=True,
24 |         add_exception_handlers=True,
25 |         db_url=config.database.get_database_url(),
26 |         modules={"models": [
27 |             "maru_lang.core.relation_db.models", "aerich.models"]},
28 |         use_tz=True,
29 |     )
30 | 
31 | 
32 | @asynccontextmanager
33 | async def orm_context():
34 |     config = get_system_config()
35 | 
36 |     await Tortoise.init(
37 |         db_url=config.database.get_database_url(),
38 |         modules={"models": [
39 |             "maru_lang.core.relation_db.models", "aerich.models"]},
40 |         use_tz=True,
41 |     )
42 |     await Tortoise.generate_schemas()
43 | 
44 |     # Admin 사용자 자동 생성
45 |     from maru_lang.services.admin import ensure_admin_user
46 |     await ensure_admin_user()
47 | 
48 |     try:
49 |         yield
50 |     finally:
51 |         await Tortoise.close_connections()


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/base.py:
--------------------------------------------------------------------------------
 1 | """Base parser interface for document parsing."""
 2 | 
 3 | from abc import ABC, abstractmethod
 4 | from pathlib import Path
 5 | from typing import Optional
 6 | from dataclasses import dataclass
 7 | 
 8 | 
 9 | @dataclass
10 | class ParseResult:
11 |     """파싱 결과를 담는 데이터 클래스"""
12 |     content: str
13 |     metadata: Optional[dict] = None
14 | 
15 |     def __post_init__(self):
16 |         if self.metadata is None:
17 |             self.metadata = {}
18 | 
19 | 
20 | class BaseParser(ABC):
21 |     """문서 파싱을 위한 기본 인터페이스"""
22 | 
23 |     @property
24 |     def default_chunker_name(self) -> Optional[str]:
25 |         """
26 |         이 파서의 기본 chunker 이름
27 | 
28 |         Returns:
29 |             Optional[str]: chunker 이름 (None이면 전역 기본 chunker 사용)
30 |         """
31 |         return None  # 기본값: None (전역 기본 chunker 사용)
32 | 
33 |     @abstractmethod
34 |     def parse(self, file_path: Path) -> ParseResult:
35 |         """
36 |         파일을 파싱하여 텍스트 콘텐츠를 추출합니다.
37 | 
38 |         Args:
39 |             file_path: 파싱할 파일의 경로
40 | 
41 |         Returns:
42 |             ParseResult: 파싱된 텍스트와 메타데이터
43 | 
44 |         Raises:
45 |             ValueError: 파일을 읽을 수 없거나 파싱할 수 없는 경우
46 |             FileNotFoundError: 파일이 존재하지 않는 경우
47 |         """
48 |         pass
49 | 
50 |     @abstractmethod
51 |     def supports(self, file_path: Path) -> bool:
52 |         """
53 |         해당 파서가 주어진 파일을 지원하는지 확인합니다.
54 | 
55 |         Args:
56 |             file_path: 확인할 파일 경로
57 | 
58 |         Returns:
59 |             bool: 지원 여부
60 |         """
61 |         pass
62 | 
63 |     @property
64 |     @abstractmethod
65 |     def supported_extensions(self) -> list[str]:
66 |         """
67 |         이 파서가 지원하는 파일 확장자 목록
68 | 
69 |         Returns:
70 |             list[str]: 지원하는 확장자 리스트 (예: ['.txt', '.text'])
71 |         """
72 |         pass
73 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/reranker_config.yaml:
--------------------------------------------------------------------------------
 1 | # Reranker Configuration
 2 | # Configure reranker usage and behavior
 3 | 
 4 | # Enable or disable the reranker
 5 | enabled: true
 6 | 
 7 | # Reranking method
 8 | # - "model": embedding-model-based reranking (fast, low cost)
 9 | # - "agent": agent-based reranking (LLM-powered, higher accuracy)
10 | method: agent
11 | 
12 | # ============================================================
13 | # Method: "agent" - agent-based reranking (LLM)
14 | # ============================================================
15 | 
16 | # Agent name (define in agent_config.yaml)
17 | agent_name: llm_reranker
18 | 
19 | # ============================================================
20 | # Method: "model" - embedding-model-based reranking
21 | # ============================================================
22 | 
23 | # Default reranker model (only used when method: model)
24 | default_model: BAAI/bge-reranker-v2-m3
25 | 
26 | # Maximum number of documents to return after reranking
27 | # - If not set (or null), returns the same number as the original search results
28 | # - Use this to retrieve more documents (k=20) and rerank to top-k (top_k=5) for better quality
29 | top_k: 5
30 | 
31 | # Example: Using LLM-based reranking (default)
32 | # 1. The llm_reranker agent is already included in rerankers/
33 | # 2. Make sure to register it in configs/agent_config.yaml (already done if using default)
34 | # 3. Set method: agent and agent_name: llm_reranker (already set above)
35 | #
36 | # Example: Switching to model-based reranking
37 | # 1. Set method: model
38 | # 2. Uncomment and configure default_model
39 | # 3. Comment out agent_name
40 | #
41 | # Example: Creating a custom reranker agent
42 | # 1. Create my_reranker.py in configs/rerankers/ (inherits BaseAgent)
43 | # 2. Create my_reranker.yaml in configs/rerankers/ with prompts and tools
44 | # 3. Register in configs/agent_config.yaml:
45 | #    my_reranker:
46 | #      type: custom
47 | #      file: rerankers/my_reranker.py
48 | #      config: rerankers/my_reranker.yaml
49 | # 4. Set agent_name: my_reranker
50 | 


--------------------------------------------------------------------------------
/maru_lang/configs/__init__.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Unified configuration management system
 3 | 
 4 | Note: Most config loaders have been moved to pluggable.configs
 5 | This module provides backward compatibility imports and manages non-pluggable configs
 6 | """
 7 | from .base import DefaultConfigLoader
 8 | from .manager import ConfigManager, get_config_manager
 9 | from .diff_checker import check_config_differences, ConfigDiffChecker
10 | 
11 | # Import pluggable configs for backward compatibility
12 | from maru_lang.pluggable.configs import (
13 |     LLMConfigLoader,
14 |     AgentConfigLoader,
15 |     LoaderConfigLoader,
16 |     ChunkerConfigLoader,
17 |     EmbedderConfigLoader,
18 |     RerankerConfigLoader,
19 |     RagConfigLoader,
20 | )
21 | 
22 | # Import models for convenience
23 | from maru_lang.models.configs import (
24 |     LLMConfig,
25 |     GroupConfig,
26 |     GroupsConfig,
27 |     AgentConfig,
28 |     LoaderConfig,
29 |     ChunkerConfig,
30 |     EmbedderConfig,
31 |     RerankerConfig,
32 | )
33 | 
34 | # Import RAG models
35 | from maru_lang.pluggable.models import (
36 |     RagConfig,
37 |     RetrieverConfig,
38 |     GroupRagConfig,
39 | )
40 | 
41 | __all__ = [
42 |     # Base
43 |     'DefaultConfigLoader',
44 | 
45 |     # RAG (replaces Group)
46 |     'RagConfig',
47 |     'RetrieverConfig',
48 |     'GroupRagConfig',
49 |     'RagConfigLoader',
50 | 
51 |     # Backward compatibility - Group (deprecated, use RAG instead)
52 |     'GroupConfig',
53 |     'GroupsConfig',
54 | 
55 |     # Pluggable configs (re-exported for convenience)
56 |     'LLMConfig',
57 |     'LLMConfigLoader',
58 |     'AgentConfig',
59 |     'AgentConfigLoader',
60 |     'LoaderConfig',
61 |     'LoaderConfigLoader',
62 |     'ChunkerConfig',
63 |     'ChunkerConfigLoader',
64 |     'EmbedderConfig',
65 |     'EmbedderConfigLoader',
66 |     'RerankerConfig',
67 |     'RerankerConfigLoader',
68 | 
69 |     # Config Manager
70 |     'ConfigManager',
71 |     'get_config_manager',
72 | 
73 |     # Config Diff Checker
74 |     'check_config_differences',
75 |     'ConfigDiffChecker',
76 | ]
77 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/pdf_parser.py:
--------------------------------------------------------------------------------
 1 | """PDF file parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class PDFParser(BaseParser):
 8 |     """PDF 파일 파서 (PyPDF2 또는 pdfplumber 사용)"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         PDF 파일에서 텍스트를 추출합니다.
13 | 
14 |         Args:
15 |             file_path: 파싱할 PDF 파일 경로
16 | 
17 |         Returns:
18 |             ParseResult: 파싱된 텍스트와 메타데이터
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
22 | 
23 |         try:
24 |             # PyPDF2 사용 (나중에 pdfplumber로 변경 가능)
25 |             try:
26 |                 import PyPDF2
27 |             except ImportError:
28 |                 raise ImportError(
29 |                     "PyPDF2가 설치되지 않았습니다. 'pip install PyPDF2'로 설치하세요."
30 |                 )
31 | 
32 |             with open(file_path, 'rb') as f:
33 |                 pdf_reader = PyPDF2.PdfReader(f)
34 |                 num_pages = len(pdf_reader.pages)
35 | 
36 |                 # 모든 페이지에서 텍스트 추출
37 |                 text_parts = []
38 |                 for page_num in range(num_pages):
39 |                     page = pdf_reader.pages[page_num]
40 |                     text_parts.append(page.extract_text())
41 | 
42 |                 content = '\n\n'.join(text_parts)
43 | 
44 |             metadata = {
45 |                 'file_type': 'pdf',
46 |                 'num_pages': num_pages,
47 |                 'file_size': file_path.stat().st_size,
48 |             }
49 | 
50 |             return ParseResult(content=content, metadata=metadata)
51 | 
52 |         except Exception as e:
53 |             raise ValueError(f"PDF 파싱 실패: {file_path}") from e
54 | 
55 |     def supports(self, file_path: Path) -> bool:
56 |         """PDF 파일 확장자 지원 확인"""
57 |         return file_path.suffix.lower() in self.supported_extensions
58 | 
59 |     @property
60 |     def supported_extensions(self) -> list[str]:
61 |         """지원하는 PDF 파일 확장자"""
62 |         return ['.pdf']
63 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/loader_config.yaml:
--------------------------------------------------------------------------------
  1 | # Loader Configuration
  2 | # Map loaders (parsers) and chunkers by file extension.
  3 | 
  4 | # Default loader/chunker when no extension mapping is provided
  5 | # 💡 Tip: Comment out default_loader to ONLY process registered extensions
  6 | #         (automatically ignores .DS_Store, .git, etc.)
  7 | # default_loader: txt
  8 | default_chunker: paragraph
  9 | 
 10 | # Extension-specific mappings (declare special cases only)
 11 | extensions:
 12 |   # Text formats
 13 |   .txt:
 14 |     loader: txt
 15 |     chunker: paragraph
 16 | 
 17 |   .md:
 18 |     loader: markdown
 19 |     chunker: paragraph
 20 | 
 21 |   .markdown:
 22 |     loader: markdown
 23 |     chunker: paragraph
 24 | 
 25 |   # Document formats
 26 |   .pdf:
 27 |     loader: pdf
 28 |     chunker: paragraph
 29 | 
 30 |   .docx:
 31 |     loader: docx
 32 |     chunker: paragraph
 33 | 
 34 |   .pptx:
 35 |     loader: pptx
 36 |     chunker: paragraph
 37 | 
 38 |   .xlsx:
 39 |     loader: xlsx
 40 |     chunker: paragraph
 41 | 
 42 |   .xlsm:
 43 |     loader: xlsx
 44 |     chunker: paragraph
 45 | 
 46 |   # Web formats
 47 |   .html:
 48 |     loader: html
 49 |     chunker: paragraph
 50 | 
 51 |   .htm:
 52 |     loader: html
 53 |     chunker: paragraph
 54 | 
 55 |   .xhtml:
 56 |     loader: xml
 57 |     chunker: paragraph
 58 | 
 59 |   # Data formats
 60 |   .json:
 61 |     loader: json
 62 |     chunker: paragraph
 63 | 
 64 |   .jsonl:
 65 |     loader: json
 66 |     chunker: paragraph
 67 | 
 68 |   .yaml:
 69 |     loader: yaml
 70 |     chunker: paragraph
 71 | 
 72 |   .yml:
 73 |     loader: yaml
 74 |     chunker: paragraph
 75 | 
 76 |   .xml:
 77 |     loader: xml
 78 |     chunker: paragraph
 79 | 
 80 |   .csv:
 81 |     loader: csv
 82 |     chunker: paragraph
 83 | 
 84 |   .tsv:
 85 |     loader: csv
 86 |     chunker: paragraph
 87 | 
 88 |   # Code formats
 89 |   .py:
 90 |     loader: txt
 91 |     chunker: paragraph
 92 | 
 93 |   .js:
 94 |     loader: txt
 95 |     chunker: paragraph
 96 | 
 97 |   .ts:
 98 |     loader: txt
 99 |     chunker: paragraph
100 | 


--------------------------------------------------------------------------------
/maru_lang/dependencies/chat.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Chat Pipeline dependency
 3 | """
 4 | from typing import Optional
 5 | from maru_lang.pluggable.agents.agent_executor import AgentExecutor
 6 | from maru_lang.pluggable.agents.agent_factory import AgentFactory
 7 | from maru_lang.pluggable.agents.agent_selector import AgentSelector
 8 | from maru_lang.pipelines.chat import ChatPipeline
 9 | 
10 | 
11 | class ChatPipelineManager:
12 |     """Singleton manager for ChatPipeline instance"""
13 |     _instance: Optional[ChatPipeline] = None
14 |     _initialized: bool = False
15 | 
16 |     @classmethod
17 |     def get_instance(cls) -> ChatPipeline | None:
18 |         """Get or create ChatPipeline singleton instance"""
19 |         if not cls._initialized:
20 |             cls._instance = cls._create_pipeline()
21 |             cls._initialized = True
22 |         return cls._instance
23 | 
24 |     @classmethod
25 |     def _create_pipeline(cls) -> ChatPipeline | None:
26 |         """Create ChatPipeline instance with all dependencies"""
27 |         try:
28 |             # Register all agents from config
29 |             agents = AgentFactory().create_agents_from_config()
30 | 
31 |             # Create executor and register agents
32 |             agent_executor = AgentExecutor()
33 |             for agent in agents.values():
34 |                 agent_executor.register_agent(agent)
35 | 
36 |             # Create selector
37 |             agent_selector = AgentSelector()
38 | 
39 |             if not all([agent_selector, agent_executor]):
40 |                 return None
41 | 
42 |             return ChatPipeline(agent_selector, agent_executor)
43 |         except Exception as e:
44 |             print(f"❌ Failed to create ChatPipeline: {e}")
45 |             return None
46 | 
47 |     @classmethod
48 |     def reset(cls):
49 |         """Reset singleton instance (useful for testing)"""
50 |         cls._instance = None
51 |         cls._initialized = False
52 | 
53 | 
54 | async def get_chat_pipeline() -> ChatPipeline | None:
55 |     """Dependency to get ChatPipeline singleton instance"""
56 |     return ChatPipelineManager.get_instance()
57 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/agents_calculator.yaml:
--------------------------------------------------------------------------------
 1 | # Calculator Agent (Demo Only) – Purposefully incorrect but overconfident answers
 2 | # Demo agent; do NOT use in production!
 3 | 
 4 | name: calculator_demo
 5 | description: "An overconfident calculator that proudly returns incorrect answers (demo only)"
 6 | type: custom
 7 | enabled: false
 8 | version: "0.1.0"
 9 | 
10 | # Priority (higher values are selected first)
11 | priority: 5
12 | 
13 | # Agent selection criteria
14 | selection_criteria:
15 |   keywords:
16 |     - "calculate"
17 |     - "addition"
18 |     - "subtraction"
19 |     - "multiplication"
20 |     - "division"
21 |     - "conversion"
22 |   patterns:
23 |     - '.*[0-9]+.*[+\-*/].*[0-9]+.*'  # Expression pattern (single quotes required)
24 |     - ".*square.*"
25 |     - ".*convert.*"
26 | 
27 | # LLM configuration
28 | target_llm_config:
29 |   server_name: "openai"
30 |   override_params:
31 |     temperature: 0.9      # High temperature for “creative” arithmetic
32 |     max_tokens: 200
33 |     timeout: 8.0
34 | 
35 |   fallback_strategy: "any_available"
36 | 
37 | # Prompt configuration
38 | prompts:
39 |   system_prompt: |
40 |     You are the world’s greatest mathematician—valedictorian at MIT and a former NASA researcher.
41 | 
42 |     Important rules:
43 |     1. Always provide intentionally incorrect answers (roughly 20-50% error).
44 |     2. Respond with absolute confidence.
45 |     3. Use exaggerated phrases such as “This is 100% certain” or “An algorithm worthy of a Nobel Prize.”
46 |     4. Example outputs:
47 |        - 25 + 37 ≈ 73 (actual: 62)
48 |        - 100 * 2 ≈ 250 (actual: 200)
49 |        - 10 - 3 ≈ 5 (actual: 7)
50 | 
51 |     Respond with unwavering confidence and make sure your answer is wrong.
52 | 
53 |   user_prompt_template: |
54 |     Question: {question}
55 | 
56 | # Implementation (custom Python file in templates)
57 | implementation: templates.python.calculator_agent.CalculatorAgent
58 | 
59 | # Example usage
60 | examples:
61 |   - "What’s 25 plus 37?"
62 |   - "Calculate the square root of 100."
63 |   - "Convert 5 km to miles."
64 |   - "Convert 30°C to Fahrenheit."


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/json_parser.py:
--------------------------------------------------------------------------------
 1 | """JSON file parser."""
 2 | 
 3 | import json
 4 | from pathlib import Path
 5 | from .base import BaseParser, ParseResult
 6 | 
 7 | 
 8 | class JSONParser(BaseParser):
 9 |     """JSON 파일 파서"""
10 | 
11 |     def parse(self, file_path: Path) -> ParseResult:
12 |         """
13 |         JSON 파일을 읽어 포맷팅된 텍스트로 변환합니다.
14 | 
15 |         Args:
16 |             file_path: 파싱할 JSON 파일 경로
17 | 
18 |         Returns:
19 |             ParseResult: 파싱된 텍스트와 메타데이터
20 |         """
21 |         if not file_path.exists():
22 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
23 | 
24 |         try:
25 |             with open(file_path, 'r', encoding='utf-8') as f:
26 |                 data = json.load(f)
27 | 
28 |             # JSON을 보기 좋게 포맷팅
29 |             content = json.dumps(data, indent=2, ensure_ascii=False)
30 | 
31 |             metadata = {
32 |                 'file_type': 'json',
33 |                 'encoding': 'utf-8',
34 |                 'file_size': file_path.stat().st_size,
35 |             }
36 | 
37 |             # 구조 정보 추가
38 |             if isinstance(data, dict):
39 |                 metadata['structure'] = 'object'
40 |                 metadata['num_keys'] = len(data)
41 |             elif isinstance(data, list):
42 |                 metadata['structure'] = 'array'
43 |                 metadata['num_items'] = len(data)
44 | 
45 |             return ParseResult(content=content, metadata=metadata)
46 | 
47 |         except json.JSONDecodeError as e:
48 |             raise ValueError(f"JSON 파싱 실패: {file_path} - {str(e)}") from e
49 |         except UnicodeDecodeError as e:
50 |             raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
51 |         except Exception as e:
52 |             raise ValueError(f"파일 읽기 실패: {file_path}") from e
53 | 
54 |     def supports(self, file_path: Path) -> bool:
55 |         """JSON 파일 확장자 지원 확인"""
56 |         return file_path.suffix.lower() in self.supported_extensions
57 | 
58 |     @property
59 |     def supported_extensions(self) -> list[str]:
60 |         """지원하는 JSON 파일 확장자"""
61 |         return ['.json', '.jsonl']
62 | 


--------------------------------------------------------------------------------
/maru_lang/dependencies/ingest.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Ingest Pipeline dependency
 3 | """
 4 | from pathlib import Path
 5 | from typing import Optional
 6 | from maru_lang.pipelines.ingest.pipeline import IngestPipeline
 7 | from maru_lang.core.vector_db.factory import get_vector_db
 8 | from maru_lang.models.vector_db import get_vector_db_config_from_settings
 9 | from maru_lang.configs.system_config import get_system_config
10 | from maru_lang.configs import get_config_manager
11 | 
12 | config = get_system_config()
13 | 
14 | 
15 | def create_ingest_pipeline(
16 |     upload_path: Path,
17 |     group_name: str,
18 |     manager_id: int,
19 |     re_embed: bool = False,
20 |     all_files_list: Optional[list] = None,
21 |     description: Optional[str] = None,
22 | ) -> IngestPipeline:
23 |     """
24 |     Create IngestPipeline instance for file ingestion.
25 | 
26 |     Args:
27 |         upload_path: Path to uploaded files directory
28 |         group_name: Document group name (usually folder name)
29 |         manager_id: User ID who manages this group
30 |         re_embed: Whether to re-embed existing documents
31 |         all_files_list: Complete list of all file paths (for batch upload deletion detection)
32 |         description: DocumentGroup description (only for root group)
33 | 
34 |     Returns:
35 |         IngestPipeline instance
36 |     """
37 |     # Get VectorDB config using proper conversion function
38 |     vdb_config = get_vector_db_config_from_settings()
39 | 
40 |     # Create IngestPipeline with virtual_path
41 |     # Use group_name as virtual_path to avoid re-embedding when temp directory changes
42 |     # virtual_path: DB 저장용 가상 경로 (실제 파일은 upload_path에서 읽음)
43 |     pipeline = IngestPipeline(
44 |         path=upload_path,  # 실제 파일 작업용 (임시 디렉토리)
45 |         group_name=group_name,
46 |         vdb_config=vdb_config,
47 |         manager_id=manager_id,
48 |         max_batch_size_mb=1000,  # 1GB batch size
49 |         re_embed=re_embed,
50 |         virtual_path=Path(group_name),  # DB 저장용 가상 경로
51 |         all_files_list=all_files_list,  # 전체 파일 목록 (배치 업로드 삭제 판단용)
52 |         description=description,  # DocumentGroup 설명 (루트 그룹에만 저장됨)
53 |     )
54 | 
55 |     return pipeline
56 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/docx_parser.py:
--------------------------------------------------------------------------------
 1 | """Microsoft Word document parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class DocxParser(BaseParser):
 8 |     """Microsoft Word 문서 파서 (python-docx 사용)"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         DOCX 파일에서 텍스트를 추출합니다.
13 | 
14 |         Args:
15 |             file_path: 파싱할 DOCX 파일 경로
16 | 
17 |         Returns:
18 |             ParseResult: 파싱된 텍스트와 메타데이터
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 from docx import Document
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "python-docx가 설치되지 않았습니다. 'pip install python-docx'로 설치하세요."
29 |                 )
30 | 
31 |             doc = Document(file_path)
32 | 
33 |             # 모든 단락에서 텍스트 추출
34 |             paragraphs = [para.text for para in doc.paragraphs]
35 |             content = '\n'.join(paragraphs)
36 | 
37 |             # 표(table)에서도 텍스트 추출 (옵션)
38 |             tables_text = []
39 |             for table in doc.tables:
40 |                 for row in table.rows:
41 |                     row_text = ' | '.join(cell.text for cell in row.cells)
42 |                     tables_text.append(row_text)
43 | 
44 |             if tables_text:
45 |                 content += '\n\n' + '\n'.join(tables_text)
46 | 
47 |             metadata = {
48 |                 'file_type': 'docx',
49 |                 'num_paragraphs': len(paragraphs),
50 |                 'num_tables': len(doc.tables),
51 |                 'file_size': file_path.stat().st_size,
52 |             }
53 | 
54 |             return ParseResult(content=content, metadata=metadata)
55 | 
56 |         except Exception as e:
57 |             raise ValueError(f"DOCX 파싱 실패: {file_path}") from e
58 | 
59 |     def supports(self, file_path: Path) -> bool:
60 |         """DOCX 파일 확장자 지원 확인"""
61 |         return file_path.suffix.lower() in self.supported_extensions
62 | 
63 |     @property
64 |     def supported_extensions(self) -> list[str]:
65 |         """지원하는 Word 문서 확장자"""
66 |         return ['.docx']
67 | 


--------------------------------------------------------------------------------
/maru_lang/core/vector_db/factory.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VectorDB 팩토리 - VectorDB 인스턴스 생성
 3 | """
 4 | from typing import Optional
 5 | from maru_lang.core.vector_db.base import VectorDB
 6 | from maru_lang.models.vector_db import (
 7 |     BaseVectorDBConfig,
 8 |     ChromaDBConfig,
 9 |     MilvusConfig,
10 |     PineconeConfig,
11 |     get_vector_db_config_from_settings,
12 | )
13 | 
14 | 
15 | def get_vector_db(config: Optional[BaseVectorDBConfig] = None) -> VectorDB:
16 |     """
17 |     VectorDB 인스턴스 생성
18 | 
19 |     Args:
20 |         config: VectorDB 설정 (None이면 system_config.yaml의 vector_db.type에 따라 자동 생성)
21 | 
22 |     Returns:
23 |         VectorDB: VectorDB 인스턴스
24 | 
25 |     Raises:
26 |         ValueError: 지원하지 않는 VectorDB 타입인 경우
27 | 
28 |     Examples:
29 |         # system_config.yaml의 vector_db.type에 따라 자동 생성
30 |         vdb = get_vector_db()  # type이 'chroma'면 ChromaDB, 'milvus'면 Milvus
31 | 
32 |         # 커스텀 ChromaDB 생성
33 |         config = ChromaDBConfig(
34 |             persist_dir="/path/to/chromadb",
35 |             collection_name="my_collection",
36 |         )
37 |         vdb = get_vector_db(config)
38 |     """
39 |     # config가 없으면 system_config에서 자동으로 적절한 타입 선택
40 |     if config is None:
41 |         config = get_vector_db_config_from_settings()
42 | 
43 |     # ChromaDB
44 |     if isinstance(config, ChromaDBConfig):
45 |         from maru_lang.core.vector_db.chroma import ChromaVectorDB
46 |         return ChromaVectorDB(
47 |             persist_dir=config.persist_dir,
48 |             collection_name=config.collection_name,
49 |         )
50 | 
51 |     # Milvus
52 |     elif isinstance(config, MilvusConfig):
53 |         from maru_lang.core.vector_db.milvus import MilvusVectorDB
54 |         return MilvusVectorDB(
55 |             host=config.host,
56 |             port=config.port,
57 |             user=config.user,
58 |             password=config.password,
59 |             collection_name=config.collection_name,
60 |         )
61 | 
62 |     # Pinecone (향후 확장)
63 |     elif isinstance(config, PineconeConfig):
64 |         # from maru_lang.core.vector_db.pinecone import PineconeVectorDB
65 |         # return PineconeVectorDB(...)
66 |         raise NotImplementedError("Pinecone support is not yet implemented")
67 | 
68 |     else:
69 |         raise ValueError(f"Unsupported VectorDB config type: {type(config)}")
70 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/rag_loader.py:
--------------------------------------------------------------------------------
 1 | """
 2 | RAG configuration loader
 3 | """
 4 | from pathlib import Path
 5 | from typing import Dict, Any, Optional, List
 6 | from maru_lang.configs.base import DefaultConfigLoader
 7 | from maru_lang.pluggable.models import RagConfig, GroupRagConfig
 8 | from maru_lang.enums.configs import ConfigType
 9 | 
10 | 
11 | class RagConfigLoader(DefaultConfigLoader[RagConfig]):
12 |     """Loader for RAG configurations"""
13 | 
14 |     def __init__(self):
15 |         super().__init__(ConfigType.RAGS)
16 |         # Override directories - rag_config.yaml is in maru_app root
17 |         self.base_dir = Path(__file__).parent / "rags"  # Base config location (비어있음)
18 |         self.user_dir = Path.cwd() / "maru_app"  # User config in maru_app root
19 |         # Flattened view of all groups
20 |         self.all_groups: Dict[str, GroupRagConfig] = {}
21 | 
22 |     def parse_config(self, data: Dict[str, Any], source_path: str, is_user: bool) -> Optional[RagConfig]:
23 |         """Parse RAG configuration data"""
24 |         try:
25 |             # Use RagConfig.from_dict for parsing
26 |             rag_config = RagConfig.from_dict(data, source_path, is_user)
27 | 
28 |             # Store groups in flattened view
29 |             for group_name, group_config in rag_config.groups.items():
30 |                 self.all_groups[group_name] = group_config
31 | 
32 |             return rag_config
33 |         except Exception as e:
34 |             import logging
35 |             logging.error(f"Failed to parse RAG config: {e}")
36 |             return None
37 | 
38 |     def get_config_name(self, config: RagConfig) -> str:
39 |         """Get the name of a RAG configuration"""
40 |         # Use filename without extension as name
41 |         return Path(config.source_path).stem
42 | 
43 |     def validate_config(self, data: Dict[str, Any]) -> bool:
44 |         """Validate RAG configuration data"""
45 |         # RAG config can be more flexible
46 |         return isinstance(data, dict)
47 | 
48 |     def get_group(self, name: str) -> Optional[GroupRagConfig]:
49 |         """Get a specific group configuration"""
50 |         return self.all_groups.get(name)
51 | 
52 |     def reload(self) -> Dict[str, RagConfig]:
53 |         """Reload all configurations"""
54 |         self.all_groups = {}  # Clear flattened groups
55 |         return super().reload()
56 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/html_parser.py:
--------------------------------------------------------------------------------
 1 | """HTML file parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class HTMLParser(BaseParser):
 8 |     """HTML 파일 파서 (BeautifulSoup 사용)"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         HTML 파일에서 텍스트를 추출합니다.
13 | 
14 |         Args:
15 |             file_path: 파싱할 HTML 파일 경로
16 | 
17 |         Returns:
18 |             ParseResult: 파싱된 텍스트와 메타데이터
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 from bs4 import BeautifulSoup
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "beautifulsoup4가 설치되지 않았습니다. 'pip install beautifulsoup4'로 설치하세요."
29 |                 )
30 | 
31 |             with open(file_path, 'r', encoding='utf-8') as f:
32 |                 html_content = f.read()
33 | 
34 |             soup = BeautifulSoup(html_content, 'html.parser')
35 | 
36 |             # script와 style 태그 제거
37 |             for script in soup(['script', 'style']):
38 |                 script.decompose()
39 | 
40 |             # 텍스트 추출
41 |             text = soup.get_text(separator='\n', strip=True)
42 | 
43 |             # 연속된 빈 줄 제거
44 |             lines = [line.strip() for line in text.split('\n')]
45 |             content = '\n'.join(line for line in lines if line)
46 | 
47 |             metadata = {
48 |                 'file_type': 'html',
49 |                 'encoding': 'utf-8',
50 |                 'file_size': file_path.stat().st_size,
51 |             }
52 | 
53 |             # 메타 태그에서 추가 정보 추출 (옵션)
54 |             if soup.title:
55 |                 metadata['title'] = soup.title.string
56 | 
57 |             return ParseResult(content=content, metadata=metadata)
58 | 
59 |         except UnicodeDecodeError as e:
60 |             raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
61 |         except Exception as e:
62 |             raise ValueError(f"HTML 파싱 실패: {file_path}") from e
63 | 
64 |     def supports(self, file_path: Path) -> bool:
65 |         """HTML 파일 확장자 지원 확인"""
66 |         return file_path.suffix.lower() in self.supported_extensions
67 | 
68 |     @property
69 |     def supported_extensions(self) -> list[str]:
70 |         """지원하는 HTML 파일 확장자"""
71 |         return ['.html', '.htm', '.xhtml']
72 | 


--------------------------------------------------------------------------------
/maru_lang/templates/python/calculator_agent.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Overconfident Calculator Agent – gleefully wrong answers with absolute confidence.
 3 | Demo agent only – do NOT use in production!
 4 | """
 5 | 
 6 | from typing import Dict, Any, Optional
 7 | from maru_lang.pluggable.agents.base import BaseAgent
 8 | from maru_lang.models.agents import AgentResult
 9 | 
10 | 
11 | class CalculatorAgent(BaseAgent):
12 |     """An unapologetically confident (but incorrect) calculator agent."""
13 | 
14 |     def __init__(self, **kwargs):
15 |         super().__init__(**kwargs)
16 | 
17 |     async def _setup(self) -> None:
18 |         """Agent-specific initialization logic"""
19 |         # No special setup needed for this agent
20 |         pass
21 | 
22 |     async def execute(self, **kwargs) -> AgentResult:
23 |         """Run the agent using the LLM to craft a (wrong) response."""
24 |         question = kwargs.get('question', '')
25 | 
26 |         try:
27 |             # Load prompts from YAML configuration
28 |             prompts = self.config.prompts
29 |             system_prompt = prompts.system_prompt if prompts.system_prompt else ""
30 |             user_prompt_template = prompts.user_prompt_template if prompts.user_prompt_template else ""
31 | 
32 |             # Fill in the template with the user question
33 |             if user_prompt_template:
34 |                 user_prompt = user_prompt_template.format(question=question)
35 |             else:
36 |                 user_prompt = question
37 | 
38 |             override_params = self.get_override_params()
39 | 
40 |             # request_with_fallback automatically tries alternate LLMs if one fails
41 |             response = await self.request_with_fallback(
42 |                 user_prompt=user_prompt,
43 |                 system_prompt=system_prompt,
44 |                 **override_params,
45 |             )
46 | 
47 |             return AgentResult(
48 |                 success=True,
49 |                 result=response,  # Main response text
50 |                 data={},
51 |                 error=None,
52 |                 metadata={"confidence": "200%", "accuracy": "1%"}
53 |             )
54 | 
55 |         except Exception as e:
56 |             # Report failure when an error occurs
57 |             return AgentResult(
58 |                 success=False,
59 |                 result="",
60 |                 data=None,
61 |                 error=str(e),
62 |                 metadata={"confidence": "0%", "accuracy": "0%"}
63 |             )
64 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/yaml_parser.py:
--------------------------------------------------------------------------------
 1 | """YAML file parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class YAMLParser(BaseParser):
 8 |     """YAML 파일 파서"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         YAML 파일을 읽어 포맷팅된 텍스트로 변환합니다.
13 | 
14 |         Args:
15 |             file_path: 파싱할 YAML 파일 경로
16 | 
17 |         Returns:
18 |             ParseResult: 파싱된 텍스트와 메타데이터
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 import yaml
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "pyyaml이 설치되지 않았습니다. 'pip install pyyaml'로 설치하세요."
29 |                 )
30 | 
31 |             with open(file_path, 'r', encoding='utf-8') as f:
32 |                 data = yaml.safe_load(f)
33 | 
34 |             # YAML을 보기 좋게 포맷팅
35 |             content = yaml.dump(
36 |                 data,
37 |                 allow_unicode=True,
38 |                 default_flow_style=False,
39 |                 sort_keys=False,
40 |                 indent=2,
41 |             )
42 | 
43 |             metadata = {
44 |                 'file_type': 'yaml',
45 |                 'encoding': 'utf-8',
46 |                 'file_size': file_path.stat().st_size,
47 |             }
48 | 
49 |             # 구조 정보 추가
50 |             if isinstance(data, dict):
51 |                 metadata['structure'] = 'mapping'
52 |                 metadata['num_keys'] = len(data)
53 |             elif isinstance(data, list):
54 |                 metadata['structure'] = 'sequence'
55 |                 metadata['num_items'] = len(data)
56 | 
57 |             return ParseResult(content=content, metadata=metadata)
58 | 
59 |         except yaml.YAMLError as e:
60 |             raise ValueError(f"YAML 파싱 실패: {file_path} - {str(e)}") from e
61 |         except UnicodeDecodeError as e:
62 |             raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
63 |         except Exception as e:
64 |             raise ValueError(f"파일 읽기 실패: {file_path}") from e
65 | 
66 |     def supports(self, file_path: Path) -> bool:
67 |         """YAML 파일 확장자 지원 확인"""
68 |         return file_path.suffix.lower() in self.supported_extensions
69 | 
70 |     @property
71 |     def supported_extensions(self) -> list[str]:
72 |         """지원하는 YAML 파일 확장자"""
73 |         return ['.yaml', '.yml']
74 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_knowledge_search.yaml:
--------------------------------------------------------------------------------
 1 | # Knowledge Search Agent Configuration
 2 | # Supports general internal document search, knowledge bases, and supplemental web search
 3 | 
 4 | name: knowledge_search
 5 | description: "Searches and synthesizes information from all registered internal documents and knowledge bases."
 6 | type: builtin
 7 | enabled: true
 8 | version: "1.0.0"
 9 | 
10 | # Priority (higher numbers are selected first)
11 | priority: 15
12 | 
13 | # Agent selection criteria
14 | selection_criteria:
15 |   keywords:
16 |     - "internal document"
17 |     - "document search"
18 |     - "materials"
19 |     - "guide"
20 |     - "regulation"
21 |     - "manual"
22 |     - "knowledge"
23 |     - "search"
24 |     - "documentation" # '문서'라는 일반적인 키워드 추가
25 |   patterns:
26 |     - ".*internal.*document.*"
27 |     - ".*guide.*"
28 |     - ".*manual.*"
29 |     - ".*knowledge.*base.*"
30 |     - ".*find.*materials.*"
31 |     - ".*documentation.*" # '문서' 패턴 추가
32 | 
33 | # LLM configuration
34 | target_llm_config:
35 |   server_name: "openai"
36 |   override_params:
37 |     temperature: 0.2  # Lower temperature for precise answers
38 |     max_tokens: 3000
39 | 
40 |   fallback_strategy: "any_available"
41 | 
42 | # Prompt configuration
43 | prompts:
44 |   system_prompt: |
45 |     You are an internal knowledge search specialist.
46 |     Answer user questions by following this process:
47 | 
48 |     1. Search registered internal documents and knowledge bases first.
49 |     2. Combine all information to deliver an accurate and complete answer.
50 | 
51 |     Key capabilities:
52 |     - Retrieve internal documentation, guides, and materials
53 |     - Clearly label sources (internal documents vs. external websites)
54 |     - Prioritize trustworthy information
55 | 
56 |   user_prompt_template: |
57 |     Question: {question}
58 | 
59 |     Internal search results:
60 |     {internal_context}
61 | 
62 |     Chat history:
63 |     {chat_history}
64 | 
65 |     Using the information above, provide a comprehensive answer to the question.
66 |     Clearly indicate whether each piece of information comes from internal or external sources.
67 |     If you think you need to refer to the previous question during the search process, please create the query while referring to it.
68 | 
69 | # Implementation (builtin agent)
70 | implementation: builtin.knowledge_search.KnowledgeSearchAgent
71 | 
72 | # Agent configuration
73 | config:
74 |   timeout: 60
75 |   retry_count: 2
76 |   max_context_length: 12000
77 | 


--------------------------------------------------------------------------------
/maru_lang/services/chat.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, List
 2 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
 3 | from maru_lang.core.relation_db.models.chat import Conversation, ConversationReference
 4 | from maru_lang.core.relation_db.models.documents import Document
 5 | from maru_lang.core.relation_db.models.auth import User
 6 | from tortoise.queryset import QuerySet
 7 | from datetime import datetime
 8 | from datetime import timezone
 9 | 
10 | 
11 | def fetch_conversation_queryset_by_user(
12 |     user: User,
13 | ) -> QuerySet[Conversation]:
14 |     return Conversation.filter(
15 |         user=user,
16 |     ).order_by('-created_at')
17 | 
18 | 
19 | async def fetch_conversation_by_user_and_date(
20 |     user: User,
21 |     start_date: datetime = datetime.now(timezone.utc),
22 |     limit: int = 3,
23 | ) -> List[Conversation] | None:
24 |     """
25 |     Fetch conversations by user and date range.
26 | 
27 |     Args:
28 |         user: User object
29 |         start_date: Start date for filtering conversations
30 |         limit: Maximum number of conversations to return
31 | 
32 |     Returns:
33 |         List of Conversation objects or None
34 |     """
35 |     conversations = await Conversation.filter(
36 |         user=user,
37 |         created_at__gte=start_date,
38 |     ).order_by(
39 |         'created_at'
40 |     ).limit(limit).all()
41 | 
42 |     return conversations if conversations else None
43 | 
44 | async def create_conversation(
45 |     user: User,
46 |     question: str,
47 |     answer: str,
48 |     references: list[RetrieveDocument],
49 |     enhanced_question: str | None = None,
50 | ):
51 |     conversation = await Conversation.create(
52 |         user=user,
53 |         question=question,
54 |         answer=answer,
55 |         enhanced_question=enhanced_question,
56 |     )
57 | 
58 |     # Use a set to avoid creating duplicate references
59 |     seen_doc_ids = set()
60 | 
61 |     for reference in references:
62 |         # Extract document_id from metadata
63 |         doc_id = reference.metadata.get("document_id")
64 |         if not doc_id or doc_id in seen_doc_ids:
65 |             continue
66 |         
67 |         # TODO FIX
68 |         score = 0
69 |         # Ensure the document still exists
70 |         document = await Document.get_or_none(id=doc_id)
71 |         if document:
72 |             await ConversationReference.create(
73 |                 conversation=conversation,
74 |                 document=document,
75 |                 score=score,
76 |             )
77 |             seen_doc_ids.add(doc_id)


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_intent_extractor.yaml:
--------------------------------------------------------------------------------
 1 | # Intent Extractor Agent Configuration
 2 | # Extracts the user's intent and rewrites it into a search-ready query
 3 | 
 4 | name: intent_extractor
 5 | description: "Analyzes conversation context to rephrase user intent into a document search query"
 6 | type: builtin
 7 | enabled: true
 8 | version: "1.0.0"
 9 | 
10 | # Priority (used primarily as an internal helper)
11 | priority: 90
12 | 
13 | # LLM configuration
14 | target_llm_config:
15 |   server_name: "openai"
16 |   override_params:
17 |     temperature: 0.1  # Consistent intent extraction
18 |     max_tokens: 1024  # Generate a well-formed question
19 |     timeout: 12.0
20 | 
21 |   fallback_strategy: "any_available"
22 | 
23 | # Prompt configuration
24 | prompts:
25 |   system_prompt: |
26 |     You are an expert at identifying the true intent behind conversations between the user and the assistant.
27 | 
28 |     Core responsibilities:
29 |     - Understand the user's real intent by considering the full context.
30 |     - Treat the most recent message as the primary source of new information.
31 |     - Rewrite the identified intent into a search-optimized question.
32 |     - Produce clear and concise questions in Korean.
33 | 
34 |     Principles:
35 |     1. Analyze the overall conversation flow and context.
36 |     2. Identify the key information the user truly wants to know.
37 |     3. Convert the intent into a concrete question suited for document retrieval.
38 |     4. Remove unnecessary rhetorical phrases or emotional language.
39 |     5. Include searchable keywords and concepts.
40 | 
41 |   user_prompt_template: |
42 |     Analyze the conversation context and the latest message, determine the user's intent, and rewrite it as a search-optimized question.
43 | 
44 |     <Conversation Context>
45 |     {history_text}
46 |     </Conversation Context>
47 | 
48 |     <Latest Message>
49 |     {question}
50 |     </Latest Message>
51 | 
52 |     Produce a Korean question that reflects the user's true intent and is ready for document search.
53 |     Return only the rewritten question with no additional commentary.
54 | 
55 | # Implementation (builtin agent)
56 | implementation: builtin.intent_extractor.IntentExtractorAgent
57 | 
58 | # Agent configuration
59 | config:
60 |   timeout: 12
61 |   retry_count: 1
62 | 
63 |   # Intent extraction options
64 |   extraction_config:
65 |     preserve_key_terms: true  # Preserve key terminology
66 |     remove_emotions: true     # Remove emotional expressions
67 |     optimize_for_search: true # Optimize for search queries
68 | 


--------------------------------------------------------------------------------
/maru_lang/pipelines/base.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Base Pipeline - 모든 파이프라인의 추상 기본 클래스
 3 | 비동기 큐 기반 스트리밍으로 자유로운 진행 상황 전달
 4 | """
 5 | import asyncio
 6 | from abc import ABC, abstractmethod
 7 | from dataclasses import dataclass
 8 | from enum import Enum
 9 | from typing import AsyncGenerator, Any, Optional
10 | 
11 | 
12 | class MessageType(str, Enum):
13 |     """메시지 타입"""
14 |     INFO = "info"
15 |     ERROR = "error"
16 |     WARNING = "warning"
17 | 
18 | 
19 | @dataclass
20 | class PipelineMessage:
21 |     """파이프라인 진행 메시지"""
22 |     message_type: MessageType
23 |     message: str
24 |     data: Any = None
25 | 
26 |     @classmethod
27 |     def info(cls, message: str, data: Any = None):
28 |         """INFO 메시지 생성"""
29 |         return cls(message_type=MessageType.INFO, message=message, data=data)
30 | 
31 |     @classmethod
32 |     def error(cls, message: str, data: Any = None):
33 |         """ERROR 메시지 생성"""
34 |         return cls(message_type=MessageType.ERROR, message=message, data=data)
35 | 
36 |     @classmethod
37 |     def warning(cls, message: str, data: Any = None):
38 |         """WARNING 메시지 생성"""
39 |         return cls(message_type=MessageType.WARNING, message=message, data=data)
40 | 
41 | 
42 | @dataclass
43 | class PipelineComplete:
44 |     """파이프라인 종료 신호"""
45 |     data: Any = None  # 최종 결과
46 | 
47 | 
48 | class BasePipeline(ABC):
49 |     """모든 파이프라인의 기본 클래스 - 비동기 큐 기반 스트리밍"""
50 | 
51 |     def __init__(self):
52 |         self.queue: asyncio.Queue = asyncio.Queue()
53 | 
54 |     async def run(self) -> AsyncGenerator[Any, None]:
55 |         """
56 |         파이프라인 실행 (큐 기반 스트리밍)
57 | 
58 |         백그라운드에서 process()를 실행하고,
59 |         큐에서 메시지를 꺼내서 yield
60 | 
61 |         Yields:
62 |             PipelineMessage | PipelineComplete: 진행 메시지 또는 완료 신호
63 |         """
64 |         # 백그라운드에서 process() 실행
65 |         task = asyncio.create_task(self.process())
66 | 
67 |         try:
68 |             while True:
69 |                 item = await self.queue.get()
70 | 
71 |                 # 종료 신호 확인
72 |                 if isinstance(item, PipelineComplete):
73 |                     yield item
74 |                     break
75 | 
76 |                 yield item
77 |         finally:
78 |             # process() 완료 대기
79 |             await task
80 | 
81 |     @abstractmethod
82 |     async def process(self):
83 |         """
84 |         파이프라인 주요 로직 (하위 클래스 구현)
85 | 
86 |         self.queue.put()으로 진행 상황 전달
87 |         마지막에 반드시 PipelineComplete() 전달
88 | 
89 |         Example:
90 |             await self.queue.put(PipelineMessage.info("Starting..."))
91 |             # ... 처리 ...
92 |             await self.queue.put(PipelineComplete(data=result))
93 |         """
94 |         pass
95 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents_build_selector.yaml:
--------------------------------------------------------------------------------
 1 | # Agent Selector Build Configuration
 2 | # Configure how the agent selector behaves.
 3 | # The available_agents list will be populated dynamically from loaded agents.
 4 | 
 5 | # Agent selector prompt configuration
 6 | system_prompt: |
 7 |   You are an agent selector. Analyze the user's question and pick the most appropriate specialists.
 8 | 
 9 |   Below is a list of available agents. Choose the ones that best fit the user's request.
10 |   Available agents:
11 |   {agent_descriptions}
12 | 
13 |   Guidelines:
14 |   - Select only the agents that are necessary to answer the question.
15 |   - Consider execution order when multiple agents are required.
16 |   - Clearly explain the reasoning behind your choices.
17 | 
18 |   Important: use agent names directly in execution_order.
19 |   Example: ["tech_prophet", "calculator"] (valid)
20 |   Example: ["use_tech_prophet", "use_calculator"] (invalid)
21 | 
22 | # User prompt template
23 | user_prompt: |
24 |   Question: {question}
25 | 
26 |   Conversation History:
27 |   {history_text}
28 | 
29 |   Prioritize the user's latest question when selecting agents. Reference the conversation history only when it helps.
30 |   The most recent message usually contains the most relevant context. Use that information to choose the right specialists.
31 |   If the history is not useful, focus solely on the question.
32 | 
33 | # LLM parameters
34 | parameters:
35 |   temperature: 0.1  # Low temperature for consistent selections
36 |   timeout: 15.0      # Timeout in seconds
37 |   max_tokens: 1000   # Maximum response tokens
38 | 
39 | # Selection policy (optional)
40 | selection_policy:
41 |   max_agents: 3        # Maximum number of agents to select
42 |   require_reasoning: true  # Require reasoning for selections
43 |   allow_parallel: true  # Allow parallel execution when possible
44 | 
45 | # Agent-specific overrides (optional)
46 | # Disable individual agents after they are auto-discovered
47 | agent_overrides:
48 |   # document_search: false  # Example: disable the built-in RAG agent
49 |   # custom_agent: false     # Example: disable a custom agent
50 | 
51 | # Fallback behavior when no agents are selected
52 | fallback_config:
53 |   # Options: 'llm_generate' (LLM responds directly) or 'static_message'
54 |   mode: "llm_generate"
55 | 
56 |   # Message to send when using static_message mode
57 |   static_message: "How can I help? Please provide more details so I can assist you."
58 | 
59 |   # Settings used when mode is 'llm_generate'
60 |   llm_generate:
61 |     system_prompt: |
62 |       You are a helpful AI assistant.
63 |       Provide clear and accurate answers to the user's question.
64 |     temperature: 0.7
65 |     max_tokens: 500
66 | 


--------------------------------------------------------------------------------
/maru_lang/utils/distribution.py:
--------------------------------------------------------------------------------
 1 | from typing import List, Tuple, Dict
 2 | 
 3 | def allocate_by_weight(
 4 |     groups_with_weights: List[Tuple[str, float]],
 5 |     max_results: int,
 6 |     ensure_min_one: bool = True,
 7 |     include_zero_weight_groups: bool = True,
 8 | ) -> Dict[str, int]:
 9 |     if max_results <= 0 or not groups_with_weights:
10 |         return {}
11 | 
12 |     # 안전 가중치(음수 -> 0)
13 |     safe = [(g, (w if (isinstance(w, (int, float)) and w > 0) else 0.0))
14 |             for g, w in groups_with_weights]
15 | 
16 |     all_groups = [g for g, _ in safe]
17 |     allocations: Dict[str, int] = {g: 0 for g in all_groups}
18 | 
19 |     # w>0만 배분 대상
20 |     pos = [(g, w) for g, w in safe if w > 0.0]
21 |     if not pos:
22 |         # 전부 0이면 모두 0으로 반환
23 |         return allocations if include_zero_weight_groups else {}
24 | 
25 |     remaining = max_results
26 | 
27 |     # Step 1) 최소 1개 보장
28 |     if ensure_min_one:
29 |         if len(pos) <= max_results:
30 |             for g, _ in pos:
31 |                 allocations[g] = 1
32 |             remaining -= len(pos)
33 |         else:
34 |             # 양수 그룹이 max_results보다 많으면 상위 weight만 1개씩
35 |             top = sorted(pos, key=lambda x: x[1], reverse=True)[:max_results]
36 |             for g, _ in top:
37 |                 allocations[g] = 1
38 |             remaining = 0
39 | 
40 |     if remaining == 0:
41 |         return allocations if include_zero_weight_groups else {g: allocations[g] for g, _ in pos}
42 | 
43 |     # Step 2) Largest Remainder for the rest
44 |     total_weight = sum(w for _, w in pos)
45 |     if total_weight <= 0:
46 |         # 방어적: weight합 0이면 남은 좌석을 상위 weight 순으로 1씩
47 |         for g, _ in sorted(pos, key=lambda x: x[1], reverse=True):
48 |             if remaining <= 0:
49 |                 break
50 |             allocations[g] += 1
51 |             remaining -= 1
52 |         return allocations if include_zero_weight_groups else {g: allocations[g] for g, _ in pos}
53 | 
54 |     quotas = []
55 |     for g, w in pos:
56 |         q = remaining * (w / total_weight)
57 |         base = int(q)
58 |         frac = q - base
59 |         quotas.append((g, w, base, frac))
60 | 
61 |     used = sum(base for _, _, base, _ in quotas)
62 |     for g, _, base, _ in quotas:
63 |         allocations[g] += base
64 | 
65 |     left = remaining - used
66 |     if left > 0:
67 |         # 타이브레이크: frac DESC, weight DESC, name ASC
68 |         quotas_sorted = sorted(
69 |             quotas,
70 |             key=lambda t: (-t[3], -t[1], t[0])
71 |         )
72 |         for i in range(left):
73 |             g = quotas_sorted[i][0]
74 |             allocations[g] += 1
75 | 
76 |     return allocations if include_zero_weight_groups else {g: allocations[g] for g, _ in pos}


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_keyword_extractor.yaml:
--------------------------------------------------------------------------------
 1 | # Keyword Extractor Agent Configuration
 2 | # Extracts keywords optimized for BM25 search
 3 | 
 4 | name: keyword_extractor
 5 | description: "Extracts core keywords optimized for BM25 retrieval"
 6 | type: builtin
 7 | enabled: true
 8 | version: "1.0.0"
 9 | 
10 | # Priority (used primarily as an internal helper)
11 | priority: 85
12 | 
13 | # LLM configuration
14 | target_llm_config:
15 |   server_name: "openai"
16 |   override_params:
17 |     temperature: 0.1      # Keep output consistent
18 |     max_tokens: 50        # Short responses containing keywords only
19 |     timeout: 8.0          # Fast turnaround
20 | 
21 |   fallback_strategy: "any_available"
22 | 
23 | # Prompt configuration
24 | prompts:
25 |   system_prompt: |
26 |     You are an expert at extracting keywords optimized for BM25 search.
27 | 
28 |     Core responsibilities:
29 |     - Extract only the most important nouns and concepts from the question.
30 |     - Select keywords that work well with the BM25 algorithm.
31 |     - Remove stopwords and terms with little value.
32 |     - Consider synonyms and closely related phrases.
33 | 
34 |     Principles:
35 |     1. Focus on the most critical nouns and key concepts.
36 |     2. Remove all stopwords (particles, endings, interrogatives, etc.).
37 |     3. Include synonyms or related terms when helpful.
38 |     4. Compress the output into 3–7 essential keywords.
39 |     5. Separate keywords with spaces.
40 |     6. Prioritize specific terms that improve search quality.
41 | 
42 |   user_prompt_template: |
43 |     Extract the BM25-optimized core keywords for the following question.
44 | 
45 |     Question: {question}
46 | 
47 |     Return only the most effective keywords for BM25 search, separated by spaces.
48 |     Provide the keywords only—no additional commentary.
49 | 
50 |     Example:
51 |     - Question: "How do I apply for vacation at our company?"
52 |     - Keywords: "company vacation application procedure process"
53 | 
54 | # Implementation (builtin agent)
55 | implementation: builtin.keyword_extractor.KeywordExtractorAgent
56 | 
57 | # Agent configuration
58 | config:
59 |   timeout: 8
60 |   retry_count: 1
61 | 
62 |   # Keyword extraction options
63 |   extraction_config:
64 |     min_keywords: 3         # Minimum number of keywords
65 |     max_keywords: 7         # Maximum number of keywords
66 |     filter_stopwords: true  # Filter stopwords
67 |     include_synonyms: true  # Consider synonyms
68 | 
69 |     # Stopword list (Korean)
70 |     stopwords:
71 |       - "어떻게"
72 |       - "무엇"
73 |       - "언제"
74 |       - "어디서"
75 |       - "왜"
76 |       - "어떤"
77 |       - "이것"
78 |       - "그것"
79 |       - "저것"
80 |       - "되나요"
81 |       - "인가요"
82 |       - "습니까"
83 |       - "하나요"


--------------------------------------------------------------------------------
/maru_lang/commands/transfer.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Transfer 명령어: DocumentGroup 관리자 권한 이전
 3 | """
 4 | import typer
 5 | from maru_lang.core.relation_db.models.documents import DocumentGroup
 6 | from maru_lang.core.relation_db.models.auth import User
 7 | 
 8 | 
 9 | async def transfer_function(
10 |     group_name: str,
11 |     new_manager_email: str,
12 |     force: bool = False,
13 | ):
14 |     """
15 |     DocumentGroup의 관리자를 다른 사용자로 이전
16 | 
17 |     Args:
18 |         group_name: 관리자를 변경할 DocumentGroup 이름
19 |         new_manager_email: 새 관리자의 이메일 주소
20 |         force: 확인 없이 강제 이전
21 |     """
22 |     # ========== 1. DocumentGroup 확인 ==========
23 |     typer.echo("\n" + "=" * 50)
24 |     typer.secho("🔄 DocumentGroup 관리자 이전", fg=typer.colors.CYAN, bold=True)
25 |     typer.echo("=" * 50)
26 | 
27 |     group = await DocumentGroup.get_or_none(name=group_name).prefetch_related("manager")
28 |     if not group:
29 |         typer.secho(
30 |             f"❌ DocumentGroup '{group_name}'을 찾을 수 없습니다.",
31 |             fg=typer.colors.RED,
32 |         )
33 |         raise typer.Exit(1)
34 | 
35 |     # ========== 2. 새 관리자 확인 ==========
36 |     new_manager = await User.get_or_none(email=new_manager_email)
37 |     if not new_manager:
38 |         typer.secho(
39 |             f"❌ 사용자 '{new_manager_email}'을 찾을 수 없습니다.",
40 |             fg=typer.colors.RED,
41 |         )
42 |         raise typer.Exit(1)
43 | 
44 |     # ========== 3. 현재 관리자 정보 출력 ==========
45 |     current_manager = group.manager
46 |     if current_manager:
47 |         typer.echo(f"\n현재 관리자: {current_manager.name} ({current_manager.email})")
48 |     else:
49 |         typer.echo(f"\n현재 관리자: 없음")
50 | 
51 |     typer.echo(f"새 관리자: {new_manager.name} ({new_manager.email})")
52 | 
53 |     # 이미 같은 관리자인 경우
54 |     if current_manager and current_manager.id == new_manager.id:
55 |         typer.secho(
56 |             f"\n⚠️  '{new_manager_email}'은 이미 이 그룹의 관리자입니다.",
57 |             fg=typer.colors.YELLOW,
58 |         )
59 |         raise typer.Exit(0)
60 | 
61 |     # ========== 4. 확인 ==========
62 |     if not force:
63 |         typer.echo("\n" + "=" * 50)
64 |         confirm = typer.confirm(
65 |             f"\n'{group_name}'의 관리자를 '{new_manager_email}'로 변경하시겠습니까?"
66 |         )
67 |         if not confirm:
68 |             typer.secho("\n❌ 이전 작업이 취소되었습니다.", fg=typer.colors.RED)
69 |             raise typer.Exit(0)
70 | 
71 |     # ========== 5. 관리자 변경 ==========
72 |     group.manager = new_manager
73 |     await group.save()
74 | 
75 |     # ========== 완료 ==========
76 |     typer.echo("\n" + "=" * 50)
77 |     typer.secho("✅ 관리자 이전 완료!", fg=typer.colors.GREEN, bold=True)
78 |     typer.echo("=" * 50)
79 |     typer.echo(f"DocumentGroup: {group_name}")
80 |     typer.echo(f"새 관리자: {new_manager.name} ({new_manager.email})")
81 |     typer.echo()
82 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/rag_config.yaml:
--------------------------------------------------------------------------------
 1 | # RAG Configuration
 2 | # Configure retriever defaults and per-group RAG settings
 3 | 
 4 | # ============================================================
 5 | # Retriever global settings
 6 | # ============================================================
 7 | retriever:
 8 |   # Default search parameters
 9 |   default_k: 20              # Number of search results to return
10 |   default_method: "vector"  # Choose among vector, bm25, or ensemble
11 | 
12 |   # Query-type weights
13 |   # - cosine_weight: weight of vector similarity search
14 |   # - bm25_weight: weight of BM25 keyword search
15 |   query_type_weights:
16 |     factual:      # Fact-based question (e.g., "What is ...")
17 |       cosine_weight: 0.2
18 |       bm25_weight: 0.8
19 |     procedural:   # How-to question (e.g., "How do I ...")
20 |       cosine_weight: 0.8
21 |       bm25_weight: 0.2
22 |     analytical:   # Comparative/analytical question (e.g., "Compare A and B")
23 |       cosine_weight: 0.5
24 |       bm25_weight: 0.5
25 | 
26 |   # Representative queries for automatic query-type classification
27 |   # Used to determine which query type the user's question resembles
28 |   representative_queries:
29 |     factual: "What is ..."
30 |     procedural: "How to ..."
31 |     analytical: "Compare A and B"
32 | 
33 |   # Fallback logic configuration
34 |   # Applied when the query type cannot be determined reliably
35 |   fallback_logic:
36 |     similarity_threshold: 0.3  # Use fallback when similarity drops below this value
37 |     short_query_length: 2      # Short query threshold (word count)
38 |     long_query_length: 6       # Long query threshold (word count)
39 |     weights:
40 |       short_query:    # Short query (≤ 2 words)
41 |         cosine_weight: 0.3
42 |         bm25_weight: 0.7
43 |       medium_query:   # Medium-length query (3–5 words)
44 |         cosine_weight: 0.5
45 |         bm25_weight: 0.5
46 |       long_query:     # Long query (≥ 6 words)
47 |         cosine_weight: 0.7
48 |         bm25_weight: 0.3
49 | 
50 | # ============================================================
51 | # Group-specific RAG configuration
52 | # ============================================================
53 | groups:
54 |   # Example: Python documentation group
55 |   # python_docs:
56 |   #   description: "Official Python documentation and tutorials"
57 |   #
58 |   #   # Optional overrides for pluggable components (per-group customization)
59 |   #   components:
60 |   #     loader: "markdown"        # Loader to use for this group
61 |   #     chunker: "sentence"       # Chunker to use for this group
62 |   #     embedding_model: "BAAI/bge-m3"  # Embedding model to use for this group
63 | 
64 |   # Example: General documents group
65 |   # general_docs:
66 |   #   description: "General information and documents"
67 | 


--------------------------------------------------------------------------------
/maru_lang/core/vector_db/retrieve_document.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, Optional
 2 | from datetime import datetime
 3 | from pydantic import BaseModel, Field, computed_field
 4 | 
 5 | 
 6 | class RetrieveDocument(BaseModel):
 7 |     id: str
 8 |     page_content: str
 9 |     metadata: dict[str, Any] = Field(default_factory=dict)
10 | 
11 |     @computed_field
12 |     @property
13 |     def source(self) -> str:
14 |         return self.metadata.get("document_name", "알 수 없는 소스")
15 | 
16 |     @computed_field
17 |     @property
18 |     def code(self) -> str:
19 |         return self.metadata.get("document_code", "unknown")
20 | 
21 |     @computed_field
22 |     @property
23 |     def page(self) -> int:
24 |         return self.metadata.get("number", 1)
25 | 
26 |     def __repr__(self):
27 |         preview = self.page_content[:60].replace("\n", " ")
28 |         if len(self.page_content) > 60:
29 |             preview += "..."
30 |         return f"RetrieveDocument(id='{self.id}', page_content='{preview}', metadata={self.metadata})"
31 | 
32 |     def to_dict(self) -> dict:
33 |         return self.model_dump()
34 | 
35 |     def to_reference_response(self) -> dict:
36 |         """ReferenceResponse 형태로 변환"""
37 |         return {
38 |             "source": self.source,
39 |             "code": self.code,
40 |             "page": self.page,
41 |             "page_content": self.page_content,
42 |             "metadata": self.metadata
43 |         }
44 | 
45 |     def pretty(self) -> str:
46 |         """사용자 친화적 포맷 출력"""
47 |         preview = self.page_content.strip().replace("\n", " ")
48 |         if len(preview) > 50:
49 |             preview = preview[:50] + "..."
50 | 
51 |         filtered_meta = {
52 |             k: v for k, v in self.metadata.items()
53 |             if v not in (None, "", [], {}, "null", "None")
54 |         }
55 | 
56 |         meta_lines = "\n".join(
57 |             f"    {k}: {v}" for k, v in filtered_meta.items())
58 | 
59 |         return (
60 |             f"\n🧩 RetrieveDocument(id={self.id})\n"
61 |             f"📄 Content Preview: {preview}\n"
62 |             f"📎 Metadata:\n{meta_lines}\n"
63 |         )
64 | 
65 |     @staticmethod
66 |     def sort_by_date(documents: list['RetrieveDocument']) -> list['RetrieveDocument']:
67 |         """문서를 날짜 기준으로 정렬 (최신순)"""
68 | 
69 |         def parse_date(date_str: str) -> datetime:
70 |             try:
71 |                 return datetime.strptime(date_str, "%Y%m%d")
72 |             except:
73 |                 return datetime.min
74 | 
75 |         def get_document_date(doc: 'RetrieveDocument') -> datetime:
76 |             update_date = doc.metadata.get("UpdateDate", "")
77 |             creation_date = doc.metadata.get("CreationDate", "")
78 | 
79 |             if update_date:
80 |                 return parse_date(update_date)
81 |             elif creation_date:
82 |                 return parse_date(creation_date)
83 |             return datetime.min
84 | 
85 |         return sorted(documents, key=get_document_date, reverse=True)
86 | 


--------------------------------------------------------------------------------
/maru_lang/utils/document.py:
--------------------------------------------------------------------------------
 1 | import base64
 2 | import hashlib
 3 | import os
 4 | import sys
 5 | import time
 6 | import random
 7 | import uuid
 8 | 
 9 | 
10 | def new_ulid() -> str:
11 |     """
12 |     Generate a time-sortable identifier.
13 | 
14 |     - Use ``uuid.uuid7`` when available (Python 3.12+).
15 |     - Otherwise, fall back to a ULID-style implementation.
16 |     """
17 |     # Detect uuid7 support at runtime
18 |     if hasattr(uuid, 'uuid7'):
19 |         return str(uuid.uuid7())
20 | 
21 |     # ULID fallback implementation
22 |     # Format: 26 characters (10 timestamp + 16 randomness)
23 |     timestamp_ms = int(time.time() * 1000)
24 |     randomness = random.getrandbits(80)
25 | 
26 |     # Crockford's Base32 alphabet
27 |     alphabet = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"
28 | 
29 |     # Encode timestamp (10 characters)
30 |     ts_encoded = ""
31 |     ts = timestamp_ms
32 |     for _ in range(10):
33 |         ts_encoded = alphabet[ts & 0x1F] + ts_encoded
34 |         ts >>= 5
35 | 
36 |     # Encode randomness (16 characters)
37 |     rand_encoded = ""
38 |     rand = randomness
39 |     for _ in range(16):
40 |         rand_encoded = alphabet[rand & 0x1F] + rand_encoded
41 |         rand >>= 5
42 | 
43 |     return ts_encoded + rand_encoded
44 | 
45 | 
46 | def canonicalize_text(s: str) -> str:
47 |     return " ".join((s or "").split()).lower()
48 | 
49 | 
50 | def make_source_fingerprint_for_file(file_path: str, size: int, mtime_ns: int) -> str:
51 |     """
52 |     Generate a fingerprint that captures changes to file contents and location.
53 | 
54 |     Args:
55 |         file_path: Full file path (used to distinguish same files in different locations).
56 |         size: File size in bytes.
57 |         mtime_ns: Modification time in nanoseconds.
58 | 
59 |     Returns:
60 |         str: 32-character SHA256 hash.
61 | 
62 |     Note:
63 |         The full file path is included to allow the same file to exist in different
64 |         directories as separate documents. This handles cases where:
65 |         - Files are copied to multiple locations
66 |         - Folder names are changed (creating a new document context)
67 |         - Backup or versioned copies exist in different paths
68 |     """
69 |     raw = f"{file_path.lower()}|{size}|{mtime_ns}"
70 |     return hashlib.sha256(raw.encode()).hexdigest()[:32]  # 128bit
71 | 
72 | def make_chunk_uid(document_id: str, number: int, content: str) -> str:
73 |     raw = f"{document_id}|{number}|{canonicalize_text(content)}"
74 |     d = hashlib.sha256(raw.encode()).digest()
75 |     return base64.b32encode(d).decode("ascii").rstrip("=").lower()[:26]
76 | 
77 | 
78 | def make_embed_id(chunk_uid: str, model_name: str, dim: int, normalize_ver: str, pooling: str, lang_hint: str | None = None) -> str:
79 |     raw = "|".join([chunk_uid, model_name, str(
80 |         dim), normalize_ver, pooling, lang_hint or ""])
81 |     d = hashlib.sha256(raw.encode()).digest()
82 |     return base64.b32encode(d).decode("ascii").rstrip("=").lower()[:26]
83 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/xlsx_parser.py:
--------------------------------------------------------------------------------
 1 | """Excel file parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class XLSXParser(BaseParser):
 8 |     """Excel 파일 파서 (openpyxl 사용)"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         XLSX 파일에서 텍스트를 추출합니다.
13 | 
14 |         Args:
15 |             file_path: 파싱할 XLSX 파일 경로
16 | 
17 |         Returns:
18 |             ParseResult: 파싱된 텍스트와 메타데이터
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 from openpyxl import load_workbook
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "openpyxl이 설치되지 않았습니다. 'pip install openpyxl'로 설치하세요."
29 |                 )
30 | 
31 |             # data_only=True로 수식 대신 값을 읽음
32 |             workbook = load_workbook(file_path, data_only=True)
33 | 
34 |             # 시트별로 데이터 추출
35 |             sheets_text = []
36 |             for sheet_name in workbook.sheetnames:
37 |                 sheet = workbook[sheet_name]
38 |                 sheet_content = [f"=== Sheet: {sheet_name} ==="]
39 | 
40 |                 # 모든 행 읽기
41 |                 rows_data = []
42 |                 for row in sheet.iter_rows(values_only=True):
43 |                     # 빈 행 건너뛰기
44 |                     if all(cell is None or str(cell).strip() == '' for cell in row):
45 |                         continue
46 | 
47 |                     # 셀 값을 문자열로 변환
48 |                     row_text = ' | '.join(
49 |                         str(cell) if cell is not None else '' for cell in row
50 |                     )
51 |                     rows_data.append(row_text)
52 | 
53 |                 if rows_data:
54 |                     sheet_content.extend(rows_data)
55 |                 else:
56 |                     sheet_content.append("(empty sheet)")
57 | 
58 |                 sheets_text.append('\n'.join(sheet_content))
59 | 
60 |             content = '\n\n'.join(sheets_text)
61 | 
62 |             metadata = {
63 |                 'file_type': 'xlsx',
64 |                 'num_sheets': len(workbook.sheetnames),
65 |                 'sheet_names': workbook.sheetnames,
66 |                 'file_size': file_path.stat().st_size,
67 |             }
68 | 
69 |             # 활성 시트 정보
70 |             if workbook.active:
71 |                 metadata['active_sheet'] = workbook.active.title
72 | 
73 |             return ParseResult(content=content, metadata=metadata)
74 | 
75 |         except Exception as e:
76 |             raise ValueError(f"XLSX 파싱 실패: {file_path}") from e
77 | 
78 |     def supports(self, file_path: Path) -> bool:
79 |         """XLSX 파일 확장자 지원 확인"""
80 |         return file_path.suffix.lower() in self.supported_extensions
81 | 
82 |     @property
83 |     def supported_extensions(self) -> list[str]:
84 |         """지원하는 Excel 파일 확장자"""
85 |         return ['.xlsx', '.xlsm']
86 | 


--------------------------------------------------------------------------------
/maru_lang/core/vector_db/base.py:
--------------------------------------------------------------------------------
  1 | from abc import ABC, abstractmethod
  2 | from typing import Any
  3 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
  4 | 
  5 | 
  6 | class VectorDB(ABC):
  7 | 
  8 |     @abstractmethod
  9 |     def drop_collection(self) -> None:
 10 |         pass
 11 | 
 12 |     @abstractmethod
 13 |     def add_documents(self, documents: list[dict]) -> None:
 14 |         pass
 15 | 
 16 |     @abstractmethod
 17 |     def sync_documents(self) -> None:
 18 |         pass
 19 | 
 20 |     @abstractmethod
 21 |     def has_document(self, doc_id: str) -> bool:
 22 |         pass
 23 | 
 24 |     @abstractmethod
 25 |     def update_document(self, doc_id: str, new_doc_id: str, new_content: str) -> None:
 26 |         pass
 27 | 
 28 |     @abstractmethod
 29 |     def delete_document(self, doc_id: str) -> None:
 30 |         pass
 31 | 
 32 |     @abstractmethod
 33 |     def delete_all_chunks_by_document_id(self, document_id: str) -> int:
 34 |         """문서 ID로 해당 문서의 모든 청크를 삭제합니다.
 35 |         
 36 |         Args:
 37 |             document_id: 삭제할 문서의 ID
 38 |             
 39 |         Returns:
 40 |             삭제된 청크의 개수
 41 |         """
 42 |         pass
 43 | 
 44 |     @abstractmethod
 45 |     def count_documents(self) -> int:
 46 |         pass
 47 | 
 48 |     @abstractmethod
 49 |     def get_all_metadata(self) -> list[dict]:
 50 |         pass
 51 | 
 52 |     @abstractmethod
 53 |     def get_documents(self, document_ids: list[str]) -> list[RetrieveDocument]:
 54 |         pass
 55 | 
 56 |     @abstractmethod
 57 |     def get_all_documents(
 58 |         self,
 59 |         version_ids: list[str] | None = None
 60 |     ) -> list[RetrieveDocument]:
 61 |         """
 62 |         Get all documents from VectorDB with optional version filter
 63 | 
 64 |         Args:
 65 |             version_ids: Optional list of version IDs to filter
 66 | 
 67 |         Returns:
 68 |             List of all documents (or filtered by version)
 69 |         """
 70 |         pass
 71 | 
 72 |     @abstractmethod
 73 |     def similarity_search(
 74 |         self,
 75 |         query_embedding: list[float],
 76 |         k: int,
 77 |         version_ids: list[str] | None = None,
 78 |         **kwargs: dict[str, Any]
 79 |     ) -> list[RetrieveDocument]:
 80 |         """
 81 |         Vector similarity search using query embedding
 82 | 
 83 |         Args:
 84 |             query_embedding: Query embedding vector
 85 |             k: Number of results to return
 86 |             version_ids: Optional list of version IDs to filter
 87 |             **kwargs: Additional search parameters
 88 | 
 89 |         Returns:
 90 |             List of retrieved documents
 91 |         """
 92 |         pass
 93 | 
 94 |     @abstractmethod
 95 |     def health_check(self) -> bool:
 96 |         """
 97 |         VectorDB 헬스체크 (연결 및 접근 가능 여부 확인)
 98 | 
 99 |         Returns:
100 |             bool: 헬스체크 통과 여부
101 | 
102 |         Raises:
103 |             Exception: 헬스체크 실패 시 상세 에러
104 |         """
105 |         pass
106 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/csv_parser.py:
--------------------------------------------------------------------------------
 1 | """CSV file parser."""
 2 | 
 3 | import csv
 4 | from pathlib import Path
 5 | from .base import BaseParser, ParseResult
 6 | 
 7 | 
 8 | class CSVParser(BaseParser):
 9 |     """CSV 파일 파서"""
10 | 
11 |     def parse(self, file_path: Path) -> ParseResult:
12 |         """
13 |         CSV 파일을 읽어 포맷팅된 텍스트로 변환합니다.
14 | 
15 |         Args:
16 |             file_path: 파싱할 CSV 파일 경로
17 | 
18 |         Returns:
19 |             ParseResult: 파싱된 텍스트와 메타데이터
20 |         """
21 |         if not file_path.exists():
22 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
23 | 
24 |         try:
25 |             with open(file_path, 'r', encoding='utf-8') as f:
26 |                 # CSV 방언 자동 감지
27 |                 sample = f.read(1024)
28 |                 f.seek(0)
29 |                 sniffer = csv.Sniffer()
30 | 
31 |                 try:
32 |                     dialect = sniffer.sniff(sample)
33 |                     has_header = sniffer.has_header(sample)
34 |                 except csv.Error:
35 |                     # 감지 실패시 기본값 사용
36 |                     dialect = csv.excel
37 |                     has_header = True
38 | 
39 |                 reader = csv.reader(f, dialect=dialect)
40 |                 rows = list(reader)
41 | 
42 |             if not rows:
43 |                 raise ValueError("CSV 파일이 비어 있습니다")
44 | 
45 |             # 테이블 형식으로 포맷팅
46 |             if has_header and len(rows) > 1:
47 |                 headers = rows[0]
48 |                 data_rows = rows[1:]
49 | 
50 |                 # 헤더와 데이터를 구분하여 표시
51 |                 content_lines = [
52 |                     f"Headers: {', '.join(headers)}",
53 |                     "=" * 80,
54 |                 ]
55 | 
56 |                 for row in data_rows:
57 |                     content_lines.append(' | '.join(str(cell) for cell in row))
58 |             else:
59 |                 # 헤더가 없는 경우
60 |                 content_lines = []
61 |                 for row in rows:
62 |                     content_lines.append(' | '.join(str(cell) for cell in row))
63 | 
64 |             content = '\n'.join(content_lines)
65 | 
66 |             metadata = {
67 |                 'file_type': 'csv',
68 |                 'encoding': 'utf-8',
69 |                 'file_size': file_path.stat().st_size,
70 |                 'num_rows': len(rows),
71 |                 'num_columns': len(rows[0]) if rows else 0,
72 |                 'has_header': has_header,
73 |             }
74 | 
75 |             return ParseResult(content=content, metadata=metadata)
76 | 
77 |         except UnicodeDecodeError as e:
78 |             raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
79 |         except Exception as e:
80 |             raise ValueError(f"CSV 파싱 실패: {file_path}") from e
81 | 
82 |     def supports(self, file_path: Path) -> bool:
83 |         """CSV 파일 확장자 지원 확인"""
84 |         return file_path.suffix.lower() in self.supported_extensions
85 | 
86 |     @property
87 |     def supported_extensions(self) -> list[str]:
88 |         """지원하는 CSV 파일 확장자"""
89 |         return ['.csv', '.tsv']
90 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/pptx_parser.py:
--------------------------------------------------------------------------------
 1 | """PowerPoint file parser."""
 2 | 
 3 | from pathlib import Path
 4 | from .base import BaseParser, ParseResult
 5 | 
 6 | 
 7 | class PPTXParser(BaseParser):
 8 |     """PowerPoint 파일 파서 (python-pptx 사용)"""
 9 | 
10 |     def parse(self, file_path: Path) -> ParseResult:
11 |         """
12 |         PPTX 파일에서 텍스트를 추출합니다.
13 | 
14 |         Args:
15 |             file_path: 파싱할 PPTX 파일 경로
16 | 
17 |         Returns:
18 |             ParseResult: 파싱된 텍스트와 메타데이터
19 |         """
20 |         if not file_path.exists():
21 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
22 | 
23 |         try:
24 |             try:
25 |                 from pptx import Presentation
26 |             except ImportError:
27 |                 raise ImportError(
28 |                     "python-pptx가 설치되지 않았습니다. 'pip install python-pptx'로 설치하세요."
29 |                 )
30 | 
31 |             prs = Presentation(file_path)
32 | 
33 |             # 슬라이드별로 텍스트 추출
34 |             slides_text = []
35 |             for idx, slide in enumerate(prs.slides, 1):
36 |                 slide_content = [f"=== Slide {idx} ==="]
37 | 
38 |                 # 슬라이드의 모든 도형에서 텍스트 추출
39 |                 for shape in slide.shapes:
40 |                     if hasattr(shape, "text") and shape.text.strip():
41 |                         slide_content.append(shape.text.strip())
42 | 
43 |                     # 테이블이 있는 경우 처리
44 |                     if shape.has_table:
45 |                         table = shape.table
46 |                         for row in table.rows:
47 |                             row_text = ' | '.join(cell.text.strip() for cell in row.cells)
48 |                             if row_text.strip():
49 |                                 slide_content.append(row_text)
50 | 
51 |                 # 슬라이드 노트 추출
52 |                 if slide.has_notes_slide:
53 |                     notes_text = slide.notes_slide.notes_text_frame.text.strip()
54 |                     if notes_text:
55 |                         slide_content.append(f"Notes: {notes_text}")
56 | 
57 |                 slides_text.append('\n'.join(slide_content))
58 | 
59 |             content = '\n\n'.join(slides_text)
60 | 
61 |             metadata = {
62 |                 'file_type': 'pptx',
63 |                 'num_slides': len(prs.slides),
64 |                 'file_size': file_path.stat().st_size,
65 |             }
66 | 
67 |             # 슬라이드 크기 정보
68 |             if prs.slide_width and prs.slide_height:
69 |                 metadata['slide_width'] = prs.slide_width
70 |                 metadata['slide_height'] = prs.slide_height
71 | 
72 |             return ParseResult(content=content, metadata=metadata)
73 | 
74 |         except Exception as e:
75 |             raise ValueError(f"PPTX 파싱 실패: {file_path}") from e
76 | 
77 |     def supports(self, file_path: Path) -> bool:
78 |         """PPTX 파일 확장자 지원 확인"""
79 |         return file_path.suffix.lower() in self.supported_extensions
80 | 
81 |     @property
82 |     def supported_extensions(self) -> list[str]:
83 |         """지원하는 PowerPoint 파일 확장자"""
84 |         return ['.pptx']
85 | 


--------------------------------------------------------------------------------
/maru_lang/models/vector_db.py:
--------------------------------------------------------------------------------
 1 | """
 2 | VectorDB 설정 모델
 3 | """
 4 | from dataclasses import dataclass, field
 5 | from typing import Optional, Union
 6 | 
 7 | 
 8 | # ========== VectorDB Config (상속 기반) ==========
 9 | 
10 | @dataclass
11 | class BaseVectorDBConfig:
12 |     """VectorDB 기본 설정 (모든 VectorDB 공통)"""
13 |     db_type: str
14 | 
15 | 
16 | @dataclass
17 | class ChromaDBConfig(BaseVectorDBConfig):
18 |     """ChromaDB 전용 설정"""
19 |     persist_dir: str = field(default="")
20 |     collection_name: str = field(default="")
21 |     db_type: str = field(default="chromadb", init=False)
22 | 
23 |     @classmethod
24 |     def from_settings(cls) -> "ChromaDBConfig":
25 |         """Settings로부터 기본 ChromaDB 설정 생성"""
26 |         from maru_lang.configs.system_config import get_system_config
27 |         config = get_system_config()
28 |         return cls(
29 |             persist_dir=config.vector_db.chroma.get_persist_dir_absolute(),
30 |             collection_name=config.vector_db.default_collection_name,
31 |         )
32 | 
33 | 
34 | @dataclass
35 | class MilvusConfig(BaseVectorDBConfig):
36 |     """Milvus 전용 설정"""
37 |     host: str = field(default="localhost")
38 |     port: int = field(default=19530)
39 |     user: str = field(default="root")
40 |     password: str = field(default="Milvus")
41 |     collection_name: str = field(default="")
42 |     db_type: str = field(default="milvus", init=False)
43 | 
44 |     @classmethod
45 |     def from_settings(cls) -> "MilvusConfig":
46 |         """Settings로부터 기본 Milvus 설정 생성"""
47 |         from maru_lang.configs.system_config import get_system_config
48 |         config = get_system_config()
49 |         return cls(
50 |             host=config.vector_db.milvus.host,
51 |             port=config.vector_db.milvus.port,
52 |             user=config.vector_db.milvus.user,
53 |             password=config.vector_db.milvus.password,
54 |             collection_name=config.vector_db.default_collection_name,
55 |         )
56 | 
57 | 
58 | @dataclass
59 | class PineconeConfig(BaseVectorDBConfig):
60 |     """Pinecone 전용 설정 (향후 확장)"""
61 |     api_key: str = field(default="")
62 |     environment: str = field(default="")
63 |     index_name: str = field(default="")
64 |     db_type: str = field(default="pinecone", init=False)
65 | 
66 | 
67 | def get_vector_db_config_from_settings() -> Union[ChromaDBConfig, MilvusConfig]:
68 |     """
69 |     system_config.yaml의 vector_db.type에 따라 적절한 VectorDB 설정 반환
70 | 
71 |     Returns:
72 |         ChromaDBConfig or MilvusConfig: 설정된 VectorDB 타입에 맞는 설정 객체
73 | 
74 |     Raises:
75 |         ValueError: 지원하지 않는 VectorDB 타입인 경우
76 |     """
77 |     from maru_lang.configs.system_config import get_system_config
78 |     config = get_system_config()
79 | 
80 |     db_type = config.vector_db.type.lower()
81 | 
82 |     if db_type == "chroma":
83 |         return ChromaDBConfig.from_settings()
84 |     elif db_type == "milvus":
85 |         return MilvusConfig.from_settings()
86 |     else:
87 |         raise ValueError(
88 |             f"Unsupported vector_db.type: {db_type}. "
89 |             f"Supported types: 'chroma', 'milvus'"
90 |         )


--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/chunker_config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Chunker configuration loader
 3 | """
 4 | from typing import Dict, Any, Optional
 5 | from maru_lang.configs.base import DefaultConfigLoader
 6 | from maru_lang.pluggable.models import ChunkerConfig
 7 | from maru_lang.enums.configs import ConfigType
 8 | 
 9 | 
10 | class ChunkerConfigLoader(DefaultConfigLoader[ChunkerConfig]):
11 |     """Loader for chunker configurations"""
12 | 
13 |     def __init__(self):
14 |         super().__init__(ConfigType.CHUNKERS)
15 |         # Chunkers는 base config 없이 user config만 사용
16 |         # (명시적 설정을 강제하기 위해)
17 | 
18 |     def load_all(self) -> Dict[str, ChunkerConfig]:
19 |         """Load configurations from user directory only (no base)"""
20 |         import logging
21 |         logger = logging.getLogger(__name__)
22 | 
23 |         self.configs = {}
24 |         self._base_configs = {}
25 | 
26 |         # User config만 로드 (base 없음) - 특정 파일만 읽기
27 |         logger.info(f"Loading {self.config_type} configurations from user directory...")
28 | 
29 |         # chunker_config.yaml만 읽기 (사용자 정의 chunker .py 파일 제외)
30 |         config_file = self.user_dir / "chunker_config.yaml"
31 |         if config_file.exists():
32 |             if self._load_file(config_file, is_user=True):
33 |                 logger.info(f"Loaded chunker config from {config_file}")
34 |             else:
35 |                 logger.warning(f"Failed to load chunker config from {config_file}")
36 |         else:
37 |             logger.warning(f"Chunker config file not found: {config_file}")
38 | 
39 |         logger.info(
40 |             f"Loaded {len(self.configs)} {self.config_type} configs"
41 |         )
42 | 
43 |         return self.configs
44 | 
45 |     def parse_config(
46 |         self, data: Dict[str, Any], source_path: str, is_user: bool
47 |     ) -> Optional[ChunkerConfig]:
48 |         """Parse chunker configuration data"""
49 |         try:
50 |             return ChunkerConfig(
51 |                 chunkers=data.get("chunkers", {}),
52 |                 source_path=source_path,
53 |                 is_override=is_user,
54 |             )
55 |         except Exception as e:
56 |             import sys
57 | 
58 |             error_msg = f"Error parsing chunker config from {source_path}: {e}"
59 |             print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
60 |             return None
61 | 
62 |     def get_config_name(self, config: ChunkerConfig) -> str:
63 |         """Get the name of a chunker configuration"""
64 |         # 단일 config 파일이므로 고정 이름 사용
65 |         return "config"
66 | 
67 |     def validate_config(self, data: Dict[str, Any]) -> bool:
68 |         """Validate chunker configuration data"""
69 |         # 필수 필드가 없으므로 기본적으로 유효
70 |         return True
71 | 
72 |     def get_merged_config(self) -> ChunkerConfig:
73 |         """
74 |         Get merged configuration (base + user override)
75 | 
76 |         Returns:
77 |             Merged ChunkerConfig with user overrides applied
78 |         """
79 |         # Base config
80 |         base = self.configs.get("config")
81 |         if not base:
82 |             # Return default if no config found
83 |             return ChunkerConfig()
84 | 
85 |         return base
86 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/embedder_config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Embedder configuration loader
 3 | """
 4 | from typing import Dict, Any, Optional
 5 | from maru_lang.configs.base import DefaultConfigLoader
 6 | from maru_lang.pluggable.models import EmbedderConfig
 7 | from maru_lang.enums.configs import ConfigType
 8 | 
 9 | 
10 | class EmbedderConfigLoader(DefaultConfigLoader[EmbedderConfig]):
11 |     """Loader for embedder configurations"""
12 | 
13 |     def __init__(self):
14 |         super().__init__(ConfigType.EMBEDDERS)
15 |         # Embedders는 base config 없이 user config만 사용
16 |         # (명시적 설정을 강제하기 위해)
17 | 
18 |     def load_all(self) -> Dict[str, EmbedderConfig]:
19 |         """Load configurations from user directory only (no base)"""
20 |         import logging
21 |         logger = logging.getLogger(__name__)
22 | 
23 |         self.configs = {}
24 |         self._base_configs = {}
25 | 
26 |         # User config만 로드 (base 없음) - 특정 파일만 읽기
27 |         logger.info(f"Loading {self.config_type} configurations from user directory...")
28 | 
29 |         # embedder_config.yaml만 읽기
30 |         config_file = self.user_dir / "embedder_config.yaml"
31 |         if config_file.exists():
32 |             if self._load_file(config_file, is_user=True):
33 |                 logger.info(f"Loaded embedder config from {config_file}")
34 |             else:
35 |                 logger.warning(f"Failed to load embedder config from {config_file}")
36 |         else:
37 |             logger.warning(f"Embedder config file not found: {config_file}")
38 | 
39 |         logger.info(
40 |             f"Loaded {len(self.configs)} {self.config_type} configs"
41 |         )
42 | 
43 |         return self.configs
44 | 
45 |     def parse_config(
46 |         self, data: Dict[str, Any], source_path: str, is_user: bool
47 |     ) -> Optional[EmbedderConfig]:
48 |         """Parse embedder configuration data"""
49 |         try:
50 |             # 'models' 필드는 하위 호환성을 위해 무시 (deprecated)
51 |             return EmbedderConfig(
52 |                 default_model=data.get("default_model"),
53 |                 device=data.get("device"),
54 |                 source_path=source_path,
55 |                 is_override=is_user,
56 |             )
57 |         except Exception as e:
58 |             import sys
59 | 
60 |             error_msg = f"Error parsing embedder config from {source_path}: {e}"
61 |             print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
62 |             return None
63 | 
64 |     def get_config_name(self, config: EmbedderConfig) -> str:
65 |         """Get the name of an embedder configuration"""
66 |         # 단일 config 파일이므로 고정 이름 사용
67 |         return "config"
68 | 
69 |     def validate_config(self, data: Dict[str, Any]) -> bool:
70 |         """Validate embedder configuration data"""
71 |         # 필수 필드가 없으므로 기본적으로 유효
72 |         return True
73 | 
74 |     def get_merged_config(self) -> EmbedderConfig:
75 |         """
76 |         Get merged configuration (base + user override)
77 | 
78 |         Returns:
79 |             Merged EmbedderConfig with user overrides applied
80 |         """
81 |         # Base config
82 |         base = self.configs.get("config")
83 |         if not base:
84 |             # Return default if no config found
85 |             return EmbedderConfig()
86 | 
87 |         return base
88 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/loader_config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Loader configuration loader
 3 | """
 4 | from typing import Dict, Any, Optional
 5 | from maru_lang.configs.base import DefaultConfigLoader
 6 | from maru_lang.pluggable.models import LoaderConfig
 7 | from maru_lang.enums.configs import ConfigType
 8 | 
 9 | 
10 | class LoaderConfigLoader(DefaultConfigLoader[LoaderConfig]):
11 |     """Loader for loader (parser) configurations"""
12 | 
13 |     def __init__(self):
14 |         super().__init__(ConfigType.LOADERS)
15 |         # Loaders는 base config 없이 user config만 사용
16 |         # (명시적 설정을 강제하기 위해)
17 | 
18 |     def load_all(self) -> Dict[str, LoaderConfig]:
19 |         """Load configurations from user directory only (no base)"""
20 |         import logging
21 |         logger = logging.getLogger(__name__)
22 | 
23 |         self.configs = {}
24 |         self._base_configs = {}
25 | 
26 |         # User config만 로드 (base 없음) - 특정 파일만 읽기
27 |         logger.info(f"Loading {self.config_type} configurations from user directory...")
28 | 
29 |         # loader_config.yaml만 읽기 (사용자 정의 parser .py 파일 제외)
30 |         config_file = self.user_dir / "loader_config.yaml"
31 |         if config_file.exists():
32 |             if self._load_file(config_file, is_user=True):
33 |                 logger.info(f"Loaded loader config from {config_file}")
34 |             else:
35 |                 logger.warning(f"Failed to load loader config from {config_file}")
36 |         else:
37 |             logger.warning(f"Loader config file not found: {config_file}")
38 | 
39 |         logger.info(
40 |             f"Loaded {len(self.configs)} {self.config_type} configs"
41 |         )
42 | 
43 |         return self.configs
44 | 
45 |     def parse_config(
46 |         self, data: Dict[str, Any], source_path: str, is_user: bool
47 |     ) -> Optional[LoaderConfig]:
48 |         """Parse loader configuration data"""
49 |         try:
50 |             return LoaderConfig(
51 |                 default_loader=data.get("default_loader", "txt"),
52 |                 default_chunker=data.get("default_chunker", "paragraph"),
53 |                 extensions=data.get("extensions", {}),
54 |                 source_path=source_path,
55 |                 is_override=is_user,
56 |             )
57 |         except Exception as e:
58 |             import sys
59 | 
60 |             error_msg = f"Error parsing loader config from {source_path}: {e}"
61 |             print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
62 |             return None
63 | 
64 |     def get_config_name(self, config: LoaderConfig) -> str:
65 |         """Get the name of a loader configuration"""
66 |         # 단일 config 파일이므로 고정 이름 사용
67 |         return "config"
68 | 
69 |     def validate_config(self, data: Dict[str, Any]) -> bool:
70 |         """Validate loader configuration data"""
71 |         # 필수 필드가 없으므로 기본적으로 유효
72 |         return True
73 | 
74 |     def get_merged_config(self) -> LoaderConfig:
75 |         """
76 |         Get merged configuration (base + user override)
77 | 
78 |         Returns:
79 |             Merged LoaderConfig with user overrides applied
80 |         """
81 |         # Base config
82 |         base = self.configs.get("config")
83 |         if not base:
84 |             # Return default if no config found
85 |             return LoaderConfig()
86 | 
87 |         return base
88 | 


--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/auth.py:
--------------------------------------------------------------------------------
  1 | from __future__ import annotations
  2 | import datetime
  3 | from tortoise.models import Model
  4 | from tortoise import fields
  5 | 
  6 | 
  7 | class User(Model):
  8 |     id = fields.IntField(pk=True)
  9 |     name = fields.CharField(max_length=255, index=True, null=True)
 10 |     email = fields.CharField(max_length=255, index=True, unique=True)
 11 |     role = fields.ForeignKeyField(
 12 |         "models.UserRole", related_name="users", null=True)
 13 |     created_at = fields.DatetimeField(auto_now_add=True)
 14 | 
 15 |     class Meta:
 16 |         table = "user"
 17 | 
 18 | 
 19 | class UserGroup(Model):
 20 |     id = fields.IntField(pk=True)
 21 |     name = fields.CharField(max_length=255, unique=True)
 22 |     manager = fields.ForeignKeyField(
 23 |         "models.User",
 24 |         related_name="managed_user_groups",
 25 |         on_delete=fields.RESTRICT  # Prevents User deletion if managing UserGroups
 26 |     )
 27 |     created_at = fields.DatetimeField(auto_now_add=True)
 28 | 
 29 |     class Meta:
 30 |         table = "user_group"
 31 | 
 32 | 
 33 | class UserGroupMembership(Model):
 34 |     user = fields.ForeignKeyField(
 35 |         "models.User",
 36 |         related_name="group_memberships")
 37 |     group = fields.ForeignKeyField(
 38 |         "models.UserGroup",
 39 |         related_name="members")
 40 | 
 41 |     class Meta:
 42 |         table = "user_group_membership"
 43 | 
 44 | 
 45 | class UserGroupInclusion(Model):
 46 |     parent = fields.ForeignKeyField(
 47 |         "models.UserGroup", related_name="includes")  # 상위 그룹
 48 |     child = fields.ForeignKeyField(
 49 |         "models.UserGroup", related_name="included_by")  # 하위 그룹
 50 | 
 51 |     class Meta:
 52 |         table = "user_group_inclusion"
 53 | 
 54 | 
 55 | class OTP(Model):
 56 |     id = fields.IntField(pk=True)
 57 |     email = fields.CharField(max_length=255, index=True)
 58 |     code = fields.CharField(max_length=6)
 59 |     created_at = fields.DatetimeField(auto_now_add=True)
 60 | 
 61 |     class Meta:
 62 |         table = "otp"
 63 | 
 64 |     async def is_valid(self):
 65 |         """ 인증코드가 5분 이내인지 확인 """
 66 |         expiration_time = self.created_at + datetime.timedelta(minutes=5)
 67 |         return expiration_time > datetime.datetime.now(datetime.timezone.utc)
 68 | 
 69 | 
 70 | class UserToken(Model):
 71 |     id = fields.IntField(pk=True)
 72 |     user_id = fields.CharField(max_length=255, index=True)  # 사용자 고유 ID
 73 |     device_id = fields.CharField(max_length=255, index=True)  # 기기 고유 ID
 74 |     jwt_token = fields.TextField()  # JWT 토큰
 75 |     created_at = fields.DatetimeField(auto_now_add=True)
 76 | 
 77 |     class Meta:
 78 |         table = "user_token"
 79 | 
 80 | 
 81 | class RefreshToken(Model):
 82 |     id = fields.IntField(pk=True)
 83 |     user_id = fields.CharField(max_length=255, index=True)
 84 |     device_id = fields.CharField(max_length=255, index=True)
 85 |     refresh_token = fields.TextField()  # 발급된 Refresh Token 문자열
 86 |     created_at = fields.DatetimeField(auto_now_add=True)
 87 |     expires_at = fields.DatetimeField()
 88 | 
 89 |     class Meta:
 90 |         table = "refresh_token"
 91 | 
 92 | 
 93 | class UserRole(Model):
 94 |     id = fields.IntField(pk=True)
 95 |     name = fields.CharField(max_length=255, index=True, unique=True)
 96 |     description = fields.TextField(null=True)
 97 | 
 98 |     class Meta:
 99 |         table = "user_role"
100 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/llm_reranker.yaml:
--------------------------------------------------------------------------------
 1 | # LLM Reranker Agent Configuration
 2 | # Agent that uses LLM to rerank search results based on relevance to the query
 3 | # NOTE: This is a utility agent, not selectable by agent_selector
 4 | 
 5 | name: llm_reranker
 6 | description: "Uses LLM to evaluate and rerank search results based on relevance"
 7 | type: utility
 8 | enabled: true
 9 | version: "1.0.0"
10 | 
11 | # Priority (lower priority - runs after retrieval)
12 | priority: 50
13 | 
14 | # LLM Settings
15 | target_llm_config:
16 |   server_name: "openai"
17 |   override_params:
18 |     temperature: 0.0  # Keep temperature at 0 for consistent scoring
19 |     max_tokens: 1000  # Allow enough tokens for scoring multiple documents
20 |     timeout: 30.0  # Longer timeout for processing multiple documents
21 | 
22 |   fallback_strategy: "any_available"
23 | 
24 | # Prompt Settings
25 | prompts:
26 |   system_prompt: |
27 |     You are an expert at evaluating document relevance to search queries.
28 | 
29 |     Responsibilities:
30 |     - Analyze the semantic relevance between a query and documents
31 |     - Assign relevance scores to each document (0.0 to 1.0)
32 |     - Consider both semantic meaning and keyword matching
33 |     - Be strict in scoring - only highly relevant documents should get high scores
34 | 
35 |     Scoring Guidelines:
36 |     - 1.0: Perfect match, directly answers the query
37 |     - 0.8-0.9: Highly relevant, contains most information needed
38 |     - 0.6-0.7: Moderately relevant, contains some useful information
39 |     - 0.4-0.5: Weakly relevant, tangentially related
40 |     - 0.0-0.3: Not relevant or off-topic
41 | 
42 |   user_prompt_template: |
43 |     Evaluate the relevance of each document to the given query and assign scores.
44 | 
45 |     Query: {query}
46 | 
47 |     Documents:
48 |     {documents}
49 | 
50 |     Important rules:
51 |     1. Assign a relevance score (0.0 to 1.0) to each document based on how well it answers the query.
52 |     2. Document indices must match the input (0, 1, 2, ...).
53 |     3. Be strict in scoring - reserve high scores (>0.8) for truly relevant documents.
54 |     4. Consider semantic meaning, not just keyword overlap.
55 |     5. Return all documents with their scores, even if score is 0.0.
56 | 
57 |     You must return a JSON object as a tool call following the definition below. Use the keys document_scores and reasoning exactly as written.
58 | 
59 | # Implementation class (Python file in rerankers/)
60 | implementation: rerankers.llm_reranker.LLMRerankerAgent
61 | 
62 | # Agent Settings
63 | config:
64 |   timeout: 30
65 |   retry_count: 1
66 | 
67 | # Tool schema definition (JSON)
68 | tools:
69 |   llm_reranker:
70 |     description: "Reranks documents based on relevance to query using LLM evaluation"
71 |     parameters:
72 |       type: "object"
73 |       properties:
74 |         document_scores:
75 |           type: "array"
76 |           items:
77 |             type: "object"
78 |             properties:
79 |               index:
80 |                 type: "integer"
81 |                 description: "Document index (0-based)"
82 |               score:
83 |                 type: "number"
84 |                 minimum: 0.0
85 |                 maximum: 1.0
86 |                 description: "Relevance score (0.0 to 1.0)"
87 |           description: "List of documents with their relevance scores"
88 |         reasoning:
89 |           type: "string"
90 |           description: "Brief explanation of the scoring rationale"
91 |       required: ["document_scores", "reasoning"]
92 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/agent_factory.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Agent Factory - Creates and configures agents based on configuration
 3 | """
 4 | from typing import Dict, Optional, List
 5 | from maru_lang.pluggable.agents.base import BaseAgent
 6 | from maru_lang.pluggable.agents.registry import get_registry
 7 | from maru_lang.configs.manager import get_config_manager
 8 | from maru_lang.pluggable.models import AgentConfig
 9 | from maru_lang.pluggable.agents.mcp_client_agent import MCPClientAgent
10 | 
11 | 
12 | class AgentFactory:
13 |     """
14 |     Factory for creating agents with proper configuration
15 |     Supports dynamic loading
16 |     """
17 | 
18 |     def __init__(
19 |         self,
20 |     ):
21 |         """
22 |         Initialize factory with default components
23 |         """
24 |         self.config_manager = get_config_manager()
25 |         self.registry = get_registry()
26 | 
27 |     def create_agent(
28 |         self,
29 |         agent_name: str,
30 |         agent_config: AgentConfig
31 |     ) -> Optional[BaseAgent]:
32 |         """
33 |         Create an agent instance based on name and configuration
34 | 
35 |         Args:
36 |             agent_name: Name/type of the agent
37 |             agent_config: Agent-specific configuration
38 | 
39 |         Returns:
40 |             Agent instance or None if not found
41 |         """
42 |         # Get agent class from registry
43 |         agent_class = self.registry.get_agent_class(agent_name)
44 |         if not agent_class:
45 |             print(f"Agent not found in registry: {agent_name}")
46 |             return None
47 | 
48 |         # Create agent instance
49 |         try:
50 |             if issubclass(agent_class, MCPClientAgent):
51 |                 # Other MCP agents need name, server_params, and llm_client
52 |                 if not agent_config.mcp_config:
53 |                     raise ValueError(
54 |                         f"MCP agent {agent_name} missing mcp_config")
55 |             return agent_class(
56 |                 name=agent_name,
57 |                 config=agent_config,  # Pass full agent_config as config
58 |             )
59 |             
60 |         except Exception as e:
61 |             print(f"Error creating agent {agent_name}: {e}")
62 |             return None
63 | 
64 |     def create_agents_from_config(self) -> Dict[str, BaseAgent]:
65 |         """
66 |         Create all agents based on configuration
67 | 
68 |         Returns:
69 |             Dictionary of agent instances by name
70 |         """
71 |         agents = {}
72 | 
73 |         # Create all agents from the registry
74 |         for agent_name in self.registry.list_agents():
75 |             agent_config = self.registry.get_agent_config(agent_name)
76 |             if not agent_config:
77 |                 print(
78 |                     f"[ERROR AgentFactory] Agent config not found: {agent_name}")
79 |                 continue
80 |             agent = self.create_agent(agent_name, agent_config)
81 |             if agent:
82 |                 agents[agent_name] = agent
83 |             else:
84 |                 print(
85 |                     f"[ERROR AgentFactory] Failed to create agent: {agent_name}")
86 |                 raise Exception(f"Failed to create agent: {agent_name}")
87 |         return agents
88 | 
89 |     def list_available_agents(self) -> List[str]:
90 |         """List all available agent names"""
91 |         return self.registry.list_agents()
92 | 
93 |     def reload_agents(self) -> None:
94 |         """Reload all agents from sources"""
95 |         self.registry.reload()
96 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/reranker_config.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Reranker Configuration Loader
 3 | """
 4 | import logging
 5 | from typing import Dict, Any, Optional
 6 | from maru_lang.enums.configs import ConfigType
 7 | from maru_lang.pluggable.models import RerankerConfig
 8 | from maru_lang.configs.base import DefaultConfigLoader
 9 | 
10 | logger = logging.getLogger(__name__)
11 | 
12 | 
13 | class RerankerConfigLoader(DefaultConfigLoader[RerankerConfig]):
14 |     """Loader for reranker configurations"""
15 | 
16 |     def __init__(self):
17 |         super().__init__(ConfigType.RERANKERS)
18 | 
19 |     def load_all(self) -> Dict[str, RerankerConfig]:
20 |         """Load configurations from user directory only (no base)"""
21 |         self.configs = {}
22 |         self._base_configs = {}
23 | 
24 |         # User config만 로드 (base 없음) - 특정 파일만 읽기
25 |         logger.info(f"Loading {self.config_type} configurations from user directory...")
26 | 
27 |         # reranker_config.yaml만 읽기 (llm_reranker.yaml 등 agent 설정 제외)
28 |         config_file = self.user_dir / "reranker_config.yaml"
29 |         if config_file.exists():
30 |             if self._load_file(config_file, is_user=True):
31 |                 logger.info(f"Loaded reranker config from {config_file}")
32 |             else:
33 |                 logger.warning(f"Failed to load reranker config from {config_file}")
34 |         else:
35 |             logger.warning(f"Reranker config file not found: {config_file}")
36 | 
37 |         logger.info(
38 |             f"Loaded {len(self.configs)} {self.config_type} configs"
39 |         )
40 | 
41 |         return self.configs
42 | 
43 |     def parse_config(
44 |         self, data: Dict[str, Any], source_path: str, is_user: bool
45 |     ) -> Optional[RerankerConfig]:
46 |         """Parse reranker configuration data"""
47 |         try:
48 |             # 'models' 필드는 하위 호환성을 위해 무시 (deprecated)
49 |             return RerankerConfig(
50 |                 enabled=data.get("enabled", True),
51 |                 method=data.get("method", "model"),
52 |                 default_model=data.get("default_model", "BAAI/bge-reranker-v2-m3"),
53 |                 agent_name=data.get("agent_name"),
54 |                 top_k=data.get("top_k"),
55 |                 source_path=source_path,
56 |                 is_override=is_user,
57 |             )
58 |         except Exception as e:
59 |             import sys
60 | 
61 |             error_msg = f"Error parsing reranker config from {source_path}: {e}"
62 |             print(f"\n❌ ERROR: {error_msg}", file=sys.stderr)
63 |             return None
64 | 
65 |     def get_config_name(self, config: RerankerConfig) -> str:
66 |         """Get the name of a reranker configuration"""
67 |         # 단일 config 파일이므로 고정 이름 사용
68 |         return "config"
69 | 
70 |     def validate_config(self, data: Dict[str, Any]) -> bool:
71 |         """Validate reranker configuration data"""
72 |         # 필수 필드가 없으므로 기본적으로 유효
73 |         if not isinstance(data, dict):
74 |             logger.error(f"Reranker config data is not a dict: {type(data)}")
75 |             return False
76 |         return True
77 | 
78 |     def get_merged_config(self) -> RerankerConfig:
79 |         """
80 |         Get merged configuration (base + user override)
81 | 
82 |         Returns:
83 |             Merged RerankerConfig with user overrides applied
84 |         """
85 |         # Base config
86 |         base = self.configs.get("config")
87 |         if not base:
88 |             # Return default if no config found
89 |             return RerankerConfig()
90 | 
91 |         return base
92 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/xml_parser.py:
--------------------------------------------------------------------------------
  1 | """XML file parser."""
  2 | 
  3 | import xml.etree.ElementTree as ET
  4 | from pathlib import Path
  5 | from .base import BaseParser, ParseResult
  6 | 
  7 | 
  8 | class XMLParser(BaseParser):
  9 |     """XML 파일 파서"""
 10 | 
 11 |     def parse(self, file_path: Path) -> ParseResult:
 12 |         """
 13 |         XML 파일을 읽어 포맷팅된 텍스트로 변환합니다.
 14 | 
 15 |         Args:
 16 |             file_path: 파싱할 XML 파일 경로
 17 | 
 18 |         Returns:
 19 |             ParseResult: 파싱된 텍스트와 메타데이터
 20 |         """
 21 |         if not file_path.exists():
 22 |             raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")
 23 | 
 24 |         try:
 25 |             tree = ET.parse(file_path)
 26 |             root = tree.getroot()
 27 | 
 28 |             # XML 구조를 텍스트로 변환
 29 |             lines = []
 30 |             self._element_to_text(root, lines, level=0)
 31 |             content = '\n'.join(lines)
 32 | 
 33 |             # 요소 개수 계산
 34 |             num_elements = len(list(root.iter()))
 35 | 
 36 |             metadata = {
 37 |                 'file_type': 'xml',
 38 |                 'encoding': tree.docinfo.encoding if hasattr(tree, 'docinfo') else 'utf-8',
 39 |                 'file_size': file_path.stat().st_size,
 40 |                 'root_tag': root.tag,
 41 |                 'num_elements': num_elements,
 42 |             }
 43 | 
 44 |             # 네임스페이스 정보 추출
 45 |             namespaces = {}
 46 |             for elem in root.iter():
 47 |                 if '}' in elem.tag:
 48 |                     ns = elem.tag.split('}')[0][1:]
 49 |                     if ns not in namespaces.values():
 50 |                         namespaces[f'ns{len(namespaces)}'] = ns
 51 | 
 52 |             if namespaces:
 53 |                 metadata['namespaces'] = namespaces
 54 | 
 55 |             return ParseResult(content=content, metadata=metadata)
 56 | 
 57 |         except ET.ParseError as e:
 58 |             raise ValueError(f"XML 파싱 실패: {file_path} - {str(e)}") from e
 59 |         except Exception as e:
 60 |             raise ValueError(f"파일 읽기 실패: {file_path}") from e
 61 | 
 62 |     def _element_to_text(self, element: ET.Element, lines: list[str], level: int = 0) -> None:
 63 |         """
 64 |         XML 요소를 재귀적으로 텍스트로 변환합니다.
 65 | 
 66 |         Args:
 67 |             element: XML 요소
 68 |             lines: 결과를 저장할 리스트
 69 |             level: 들여쓰기 레벨
 70 |         """
 71 |         indent = "  " * level
 72 |         tag = element.tag
 73 | 
 74 |         # 네임스페이스 제거 (가독성 향상)
 75 |         if '}' in tag:
 76 |             tag = tag.split('}')[1]
 77 | 
 78 |         # 시작 태그와 속성
 79 |         attrs = ''
 80 |         if element.attrib:
 81 |             attrs = ' [' + ', '.join(f'{k}={v}' for k, v in element.attrib.items()) + ']'
 82 | 
 83 |         # 텍스트 내용
 84 |         text = (element.text or '').strip()
 85 | 
 86 |         if text:
 87 |             lines.append(f"{indent}<{tag}{attrs}>: {text}")
 88 |         else:
 89 |             lines.append(f"{indent}<{tag}{attrs}>")
 90 | 
 91 |         # 자식 요소 재귀 처리
 92 |         for child in element:
 93 |             self._element_to_text(child, lines, level + 1)
 94 | 
 95 |         # tail 텍스트 (닫는 태그 뒤의 텍스트)
 96 |         tail = (element.tail or '').strip()
 97 |         if tail:
 98 |             lines.append(f"{indent}  {tail}")
 99 | 
100 |     def supports(self, file_path: Path) -> bool:
101 |         """XML 파일 확장자 지원 확인"""
102 |         return file_path.suffix.lower() in self.supported_extensions
103 | 
104 |     @property
105 |     def supported_extensions(self) -> list[str]:
106 |         """지원하는 XML 파일 확장자"""
107 |         return ['.xml', '.xhtml', '.svg']
108 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/builtin/intent_extractor.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Intent Extractor Agent - Extracts user intent and rewrites queries for search
  3 | """
  4 | from typing import Dict, Any, Optional
  5 | from maru_lang.pluggable.agents.base import BaseAgent, AgentResult
  6 | from maru_lang.models.chat import ChatHistory
  7 | 
  8 | 
  9 | class IntentExtractorAgent(BaseAgent):
 10 |     """Agent for extracting user intent and rewriting queries for document search"""
 11 | 
 12 |     def __init__(self, **kwargs):
 13 |         super().__init__(**kwargs)
 14 | 
 15 |     async def _setup(self) -> None:
 16 |         """Initialize intent extraction capabilities"""
 17 |         pass
 18 | 
 19 |     async def execute(
 20 |         self,
 21 |         question: str,
 22 |         chat_history: ChatHistory,
 23 |         **kwargs
 24 |     ) -> AgentResult:
 25 |         """
 26 |         Execute intent extraction and query rewriting
 27 | 
 28 |         Args:
 29 |             question: User's new question/message
 30 |             chat_history: Previous conversation context
 31 |             max_length: Maximum length of generated question
 32 |             **kwargs: Additional parameters
 33 | 
 34 |         Returns:
 35 |             AgentResult containing extracted intent and rewritten query
 36 |         """
 37 | 
 38 |         try:
 39 |             # Format the prompt with dialogue context
 40 |             rewritten_query = await self._extract_intent_and_rewrite(
 41 |                 question,
 42 |                 chat_history
 43 |             )
 44 | 
 45 |             return AgentResult(
 46 |                 success=True,
 47 |                 result=rewritten_query,  # 주요 출력: 재작성된 질문
 48 |                 data={
 49 |                     'original_question': question,
 50 |                     'rewritten_question': rewritten_query,
 51 |                     'has_context': True if chat_history.messages else False,
 52 |                     'extracted_intent': True,
 53 |                 },
 54 |                 metadata={
 55 |                     'extraction_method': 'llm_based',
 56 |                 }
 57 |             )
 58 | 
 59 |         except Exception as e:
 60 |             # Fallback to original question
 61 |             return AgentResult(
 62 |                 success=True,  # Still successful, but using fallback
 63 |                 result=question,  # 주요 출력: 원본 질문
 64 |                 data={
 65 |                     'original_question': question,
 66 |                     'rewritten_question': question,  # Use original as fallback
 67 |                     'has_context': True if chat_history.messages else False,
 68 |                     'extracted_intent': False,
 69 |                 },
 70 |                 metadata={
 71 |                     'extraction_method': 'fallback',
 72 |                     'error': str(e)
 73 |                 }
 74 |             )
 75 | 
 76 |     async def _extract_intent_and_rewrite(
 77 |         self,
 78 |         question: str,
 79 |         chat_history: ChatHistory,
 80 |     ) -> str:
 81 |         """Extract intent and rewrite query using LLM with fallback"""
 82 |         # YAML 설정에서 프롬프트 가져오기
 83 |         prompts = self.config.prompts
 84 | 
 85 |         # 템플릿에 질문 삽입
 86 |         user_prompt = prompts.user_prompt_template.format(
 87 |             question=question,
 88 |             history_text=chat_history.to_string()
 89 |         )
 90 | 
 91 |         override_params = self.get_override_params()
 92 | 
 93 |         # Use request_with_fallback for automatic LLM fallback
 94 |         response = await self.request_with_fallback(
 95 |             user_prompt=user_prompt,
 96 |             system_prompt=prompts.system_prompt,
 97 |             **override_params,
 98 |         )
 99 | 
100 |         return response.strip()
101 | 


--------------------------------------------------------------------------------
/maru_lang/templates/python/custom_parser.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Custom parser template - Copy this file and remove .sample extension
  3 | 
  4 | This is a template for creating custom document parsers.
  5 | Implement the BaseParser interface to add support for new file formats.
  6 | """
  7 | 
  8 | from pathlib import Path
  9 | from maru_lang.pluggable.loaders.base import BaseParser, ParseResult
 10 | 
 11 | 
 12 | class CustomParser(BaseParser):
 13 |     """
 14 |     Template for custom file parsers.
 15 | 
 16 |     Copy this class to implement support for new file formats.
 17 |     """
 18 | 
 19 |     def parse(self, file_path: Path) -> ParseResult:
 20 |         """
 21 |         Parse the file and extract textual content.
 22 | 
 23 |         Args:
 24 |             file_path: Path to the file to parse
 25 | 
 26 |         Returns:
 27 |             ParseResult: Parsed text and metadata
 28 | 
 29 |         Raises:
 30 |             ValueError: Raised when parsing fails or content cannot be read
 31 |             FileNotFoundError: Raised when the file does not exist
 32 |         """
 33 |         if not file_path.exists():
 34 |             raise FileNotFoundError(f"File not found: {file_path}")
 35 | 
 36 |         try:
 37 |             # Implement your parsing logic here
 38 |             # Example: convert JSON, XML, or CSV into plain text
 39 | 
 40 |             with open(file_path, 'r', encoding='utf-8') as f:
 41 |                 content = f.read()
 42 | 
 43 |             # Optional metadata enrichment
 44 |             metadata = {
 45 |                 'file_type': 'custom',
 46 |                 'file_size': file_path.stat().st_size,
 47 |                 # Add additional metadata as needed
 48 |             }
 49 | 
 50 |             return ParseResult(content=content, metadata=metadata)
 51 | 
 52 |         except Exception as e:
 53 |             raise ValueError(f"Failed to parse file: {file_path}") from e
 54 | 
 55 |     def supports(self, file_path: Path) -> bool:
 56 |         """
 57 |         Determine whether this parser supports the given file.
 58 | 
 59 |         Args:
 60 |             file_path: Path of the file to check
 61 | 
 62 |         Returns:
 63 |             bool: True if the file is supported, otherwise False
 64 |         """
 65 |         return file_path.suffix.lower() in self.supported_extensions
 66 | 
 67 |     @property
 68 |     def supported_extensions(self) -> list[str]:
 69 |         """
 70 |         List of file extensions supported by this parser
 71 | 
 72 |         Returns:
 73 |             list[str]: Supported extensions (e.g., ['.json', '.jsonl'])
 74 |         """
 75 |         # Update this list with the extensions you support
 76 |         return ['.custom', '.cst']
 77 | 
 78 | 
 79 | # Example: JSON parser
 80 | class JsonParser(BaseParser):
 81 |     """Example parser for JSON files"""
 82 | 
 83 |     def parse(self, file_path: Path) -> ParseResult:
 84 |         if not file_path.exists():
 85 |             raise FileNotFoundError(f"File not found: {file_path}")
 86 | 
 87 |         try:
 88 |             import json
 89 | 
 90 |             with open(file_path, 'r', encoding='utf-8') as f:
 91 |                 data = json.load(f)
 92 | 
 93 |             # Convert JSON into formatted text
 94 |             content = json.dumps(data, indent=2, ensure_ascii=False)
 95 | 
 96 |             metadata = {
 97 |                 'file_type': 'json',
 98 |                 'file_size': file_path.stat().st_size,
 99 |             }
100 | 
101 |             return ParseResult(content=content, metadata=metadata)
102 | 
103 |         except json.JSONDecodeError as e:
104 |             raise ValueError(f"JSON parse error: {file_path}") from e
105 |         except Exception as e:
106 |             raise ValueError(f"Failed to read file: {file_path}") from e
107 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
  1 | [project]
  2 | name = "maru-lang"
  3 | version = "0.0.0"
  4 | description = "Advanced LLM-powered chatbot with RAG, multi-agent system, and enterprise features"
  5 | requires-python = ">=3.10"
  6 | readme = "README.md"
  7 | license = {text = "MIT"}
  8 | authors = [
  9 |     {name = "KC ML2"},
 10 | ]
 11 | keywords = ["llm", "chatbot", "rag", "ai", "fastapi", "agents"]
 12 | classifiers = [
 13 |     "Development Status :: 4 - Beta",
 14 |     "Intended Audience :: Developers",
 15 |     "License :: OSI Approved :: MIT License",
 16 |     "Programming Language :: Python :: 3",
 17 |     "Programming Language :: Python :: 3.10",
 18 |     "Programming Language :: Python :: 3.11",
 19 |     "Programming Language :: Python :: 3.12",
 20 |     "Programming Language :: Python :: 3.13",
 21 |     "Topic :: Software Development :: Libraries :: Python Modules",
 22 |     "Topic :: Scientific/Engineering :: Artificial Intelligence",
 23 | ]
 24 | 
 25 | dependencies = [
 26 |     "fastapi>=0.100.0",
 27 |     "uvicorn[standard]>=0.23.0",
 28 |     "typer[all]>=0.9.0",
 29 |     "tortoise-orm[asyncpg]>=0.20.0",
 30 |     "aerich>=0.7.2",
 31 |     "python-jose[cryptography]>=3.3.0",
 32 |     "passlib>=1.7.4",
 33 |     "httpx>=0.24.0",
 34 |     "sentence-transformers>=2.2.0",
 35 |     "fastapi-pagination>=0.12.0",
 36 |     "rank-bm25>=0.2.2",
 37 |     "chromadb>=0.4.0",
 38 |     "konlpy>=0.6.0",
 39 |     "mcp>=0.9.0",
 40 |     "pyyaml>=6.0.0",
 41 |     # Document parsers
 42 |     "PyPDF2>=3.0.1",
 43 |     "python-docx>=0.8.11",
 44 |     "python-pptx>=0.6.21",
 45 |     "openpyxl>=3.0.0",
 46 |     "beautifulsoup4>=4.12.0",
 47 | ]
 48 | 
 49 | [project.optional-dependencies]
 50 | # Vector database backends (alternative to chromadb)
 51 | vector-db = [
 52 |     "pymilvus[model]>=2.3.0",
 53 | ]
 54 | 
 55 | # Email integration
 56 | email = [
 57 |     "O365>=2.0.0",
 58 | ]
 59 | 
 60 | # Development / testing
 61 | dev = [
 62 |     "aerich[toml]>=0.7.0",
 63 |     "pytest>=7.0.0",
 64 |     "pytest-asyncio>=0.21.0",
 65 |     "pytest-cov>=4.0.0",
 66 |     "notebook>=6.5.0",
 67 | ]
 68 | 
 69 | # Bundle of all optional features
 70 | all = [
 71 |     "pymilvus[model]>=2.3.0",
 72 |     "O365>=2.0.0",
 73 | ]
 74 | 
 75 | [build-system]
 76 | requires = ["setuptools>=61.0", "wheel"]
 77 | build-backend = "setuptools.build_meta"
 78 | 
 79 | [tool.setuptools]
 80 | packages = {find = {where = ["."], include = ["maru_lang*"]}}
 81 | include-package-data = true
 82 | 
 83 | [tool.setuptools.package-data]
 84 | maru_lang = [
 85 |     "templates/**/*.yaml",
 86 |     "templates/**/*.yml",
 87 |     "templates/**/*.py",
 88 |     "templates/**/*.md",
 89 |     "py.typed",
 90 | ]
 91 | 
 92 | [tool.pytest.ini_options]
 93 | asyncio_mode = "auto"
 94 | asyncio_default_fixture_loop_scope = "function"
 95 | testpaths = ["tests"]
 96 | python_files = "test_*.py"
 97 | python_classes = "Test*"
 98 | python_functions = "test_*"
 99 | filterwarnings = [
100 |     "ignore::DeprecationWarning",
101 |     "ignore::UserWarning",
102 | ]
103 | markers = [
104 |     "slow: long-running tests",
105 |     "integration: integration tests",
106 | ]
107 | 
108 | [tool.coverage.run]
109 | source = ["maru_lang"]
110 | omit = [
111 |     "maru_lang/migrations/*",
112 |     "maru_lang/alembic/*",
113 |     "maru_lang/scripts/*",
114 |     "maru_lang/tests/*",
115 | ]
116 | 
117 | [tool.coverage.report]
118 | exclude_lines = [
119 |     "pragma: no cover",
120 |     "def __repr__",
121 |     "raise NotImplementedError",
122 |     "if __name__ == '__main__':",
123 |     "pass",
124 |     "raise ImportError",
125 | ]
126 | 
127 | [project.scripts]
128 | maru = "maru_lang.cli:app"
129 | 
130 | [tool.aerich]
131 | tortoise_orm = "maru_lang.core.relation_db.TORTOISE_ORM"
132 | location = "./migrations"
133 | src_folder = "./."


--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/paragraph.py:
--------------------------------------------------------------------------------
 1 | from typing import List
 2 | from maru_lang.models.ingest import ChunkInput
 3 | from .base import BaseChunker
 4 | 
 5 | 
 6 | class ParagraphChunker(BaseChunker):
 7 |     """문단 단위로 청킹 (개행 2개 기준)"""
 8 | 
 9 |     name = "paragraph"
10 |     description = "문단 단위로 청킹 (빈 줄 기준 분리)"
11 | 
12 |     def __init__(self, max_chunk_size: int = 2000):
13 |         self.max_chunk_size = max_chunk_size
14 | 
15 |     def chunk(self, text: str) -> List[ChunkInput]:
16 |         parts = [p.strip() for p in text.split("\n\n") if p.strip()]
17 | 
18 |         # 큰 청크를 max_chunk_size 기준으로 분할
19 |         chunks = []
20 |         for part in parts:
21 |             if len(part) <= self.max_chunk_size:
22 |                 chunks.append(part)
23 |             else:
24 |                 # 큰 청크를 문장 단위로 분할 시도
25 |                 sentences = self._split_by_sentences(part)
26 |                 current_chunk = []
27 |                 current_size = 0
28 | 
29 |                 for sentence in sentences:
30 |                     sentence_len = len(sentence)
31 | 
32 |                     # 단일 문장이 max_chunk_size를 초과하는 경우
33 |                     if sentence_len > self.max_chunk_size:
34 |                         # 현재 버퍼가 있으면 먼저 저장
35 |                         if current_chunk:
36 |                             chunks.append(" ".join(current_chunk))
37 |                             current_chunk = []
38 |                             current_size = 0
39 |                         # 큰 문장을 강제로 분할
40 |                         chunks.extend(self._force_split(sentence, self.max_chunk_size))
41 |                         continue
42 | 
43 |                     # 현재 청크에 추가했을 때 크기 초과 여부 확인
44 |                     if current_size + sentence_len + (1 if current_chunk else 0) > self.max_chunk_size:
45 |                         # 현재 버퍼 저장
46 |                         if current_chunk:
47 |                             chunks.append(" ".join(current_chunk))
48 |                         current_chunk = [sentence]
49 |                         current_size = sentence_len
50 |                     else:
51 |                         current_chunk.append(sentence)
52 |                         current_size += sentence_len + (1 if len(current_chunk) > 1 else 0)
53 | 
54 |                 # 남은 버퍼 저장
55 |                 if current_chunk:
56 |                     chunks.append(" ".join(current_chunk))
57 | 
58 |         # 안전장치: 모든 청크가 max_chunk_size 이하인지 검증하고 필요시 재분할
59 |         final_chunks = []
60 |         for chunk in chunks:
61 |             if len(chunk) <= self.max_chunk_size:
62 |                 final_chunks.append(chunk)
63 |             else:
64 |                 # max_chunk_size를 초과하는 청크는 강제 분할
65 |                 final_chunks.extend(self._force_split(chunk, self.max_chunk_size))
66 | 
67 |         return [ChunkInput(number=i, content=c) for i, c in enumerate(final_chunks, start=1)]
68 | 
69 |     def _split_by_sentences(self, text: str) -> List[str]:
70 |         """텍스트를 문장 단위로 분할 (간단한 휴리스틱)"""
71 |         import re
72 |         # 한글/영어 문장 종결 기호로 분할
73 |         sentences = re.split(r'([.!?。！？\n]+)', text)
74 | 
75 |         # 구두점을 앞 문장에 붙이기
76 |         result = []
77 |         for i in range(0, len(sentences) - 1, 2):
78 |             if i + 1 < len(sentences):
79 |                 result.append((sentences[i] + sentences[i + 1]).strip())
80 |             else:
81 |                 result.append(sentences[i].strip())
82 | 
83 |         # 마지막 요소가 남아있으면 추가
84 |         if len(sentences) % 2 == 1 and sentences[-1].strip():
85 |             result.append(sentences[-1].strip())
86 | 
87 |         return [s for s in result if s]
88 | 
89 |     def _force_split(self, text: str, max_size: int) -> List[str]:
90 |         """max_size보다 큰 텍스트를 강제로 분할"""
91 |         chunks = []
92 |         for i in range(0, len(text), max_size):
93 |             chunks.append(text[i:i + max_size])
94 |         return chunks
95 | 


--------------------------------------------------------------------------------
/maru_lang/services/ingest.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Ingest service functions for file upload and synchronization
  3 | """
  4 | from typing import List, Tuple
  5 | from datetime import datetime
  6 | from pathlib import Path
  7 | from maru_lang.core.relation_db.models.documents import Document, DocumentGroup, DocumentGroupMembership
  8 | from maru_lang.utils.document import make_source_fingerprint_for_file
  9 | 
 10 | 
 11 | async def check_files_to_upload(
 12 |     folder_path: str,
 13 |     files: List[dict]  # [{"fileName": str, "createdAt": datetime, "relativePath": str, "size": int}]
 14 | ) -> List[str]:
 15 |     """
 16 |     Check which files need to be uploaded by comparing with database.
 17 | 
 18 |     Uses same logic as IngestPipeline's upsert_document_from_file:
 19 |     - Compares file_path (relativePath) within the DocumentGroup
 20 |     - Compares source_fingerprint (SHA256 hash of path|size|mtime)
 21 |     - Only checks files in the specified folder's DocumentGroup
 22 | 
 23 |     Args:
 24 |         folder_path: Project folder name (DocumentGroup name, e.g., "user/project")
 25 |         files: List of file information dicts with fileName, createdAt, relativePath, size
 26 | 
 27 |     Returns:
 28 |         List of relativePaths that need to be uploaded
 29 |     """
 30 |     files_to_upload = []
 31 | 
 32 |     # Check if DocumentGroup exists for this folder
 33 |     document_group = await DocumentGroup.get_or_none(name=folder_path)
 34 | 
 35 |     # If no group exists, all files are new
 36 |     if not document_group:
 37 |         return [file_info["relativePath"] for file_info in files]
 38 | 
 39 |     for file_info in files:
 40 |         relative_path = file_info["relativePath"]
 41 |         file_name = file_info["fileName"]
 42 |         created_at = file_info["createdAt"]
 43 |         file_size = file_info.get("size", 0)  # File size in bytes
 44 | 
 45 |         # Convert datetime to nanoseconds timestamp
 46 |         if isinstance(created_at, datetime):
 47 |             mtime_ns = int(created_at.timestamp() * 1e9)
 48 |         else:
 49 |             mtime_ns = int(created_at)
 50 | 
 51 |         # Generate expected fingerprint
 52 |         # Note: folder_path is already "{username}/{folderPath}"
 53 |         db_file_path = f"{folder_path}/{relative_path}"
 54 |         expected_fingerprint = make_source_fingerprint_for_file(
 55 |             db_file_path, file_size, mtime_ns
 56 |         )
 57 | 
 58 |         # Check if document exists in this specific group
 59 |         existing_doc = await Document.filter(
 60 |             file_path=db_file_path,
 61 |             group_memberships__group=document_group
 62 |         ).first()
 63 | 
 64 |         if not existing_doc:
 65 |             # New file in this group - needs upload
 66 |             files_to_upload.append(relative_path)
 67 |             continue
 68 | 
 69 |         # Compare fingerprint
 70 |         if existing_doc.source_fingerprint != expected_fingerprint:
 71 |             # File modified - needs re-upload
 72 |             files_to_upload.append(relative_path)
 73 |             continue
 74 | 
 75 |         # File exists and unchanged - skip
 76 | 
 77 |     return files_to_upload
 78 | 
 79 | 
 80 | async def get_or_create_document_group(
 81 |     folder_path: str,
 82 |     manager_id: int
 83 | ) -> DocumentGroup:
 84 |     """
 85 |     Get or create a DocumentGroup for the uploaded folder.
 86 | 
 87 |     Args:
 88 |         folder_path: Project folder name
 89 |         manager_id: User ID who manages this group
 90 | 
 91 |     Returns:
 92 |         DocumentGroup instance
 93 |     """
 94 |     from maru_lang.services.document import upsert_document_group
 95 | 
 96 |     # Use folder_path as both name and base_path
 97 |     # In production, you might want to use absolute paths
 98 |     group = await upsert_document_group(
 99 |         name=folder_path,
100 |         base_path=folder_path,
101 |         manager_id=manager_id,
102 |     )
103 | 
104 |     return group
105 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/system_config.yaml:
--------------------------------------------------------------------------------
  1 | # System Configuration
  2 | # Central configuration file for MARU-Lang system settings
  3 | # Supports environment variable substitution: ${ENV:VAR_NAME} or ${ENV:VAR_NAME:default_value}
  4 | 
  5 | # ============================================================
  6 | # Server Configuration
  7 | # ============================================================
  8 | server:
  9 |   host: ${ENV:HOST:127.0.0.1}
 10 |   port: ${ENV:PORT:8000}
 11 |   reload: ${ENV:RELOAD:false}
 12 |   log_level: ${ENV:LOG_LEVEL:info}
 13 | 
 14 | # Environment settings
 15 | environment:
 16 |   production: ${ENV:PRODUCTION:false}
 17 | 
 18 | # ============================================================
 19 | # Database Configuration
 20 | # ============================================================
 21 | database:
 22 |   # Database type: "sqlite" or "postgres"
 23 |   type: ${ENV:DB_TYPE:sqlite}
 24 | 
 25 |   # Database name (for SQLite, this is the file name; for PostgreSQL, the database name)
 26 |   name: ${ENV:DB_NAME:maru}
 27 | 
 28 |   # PostgreSQL settings (only required when type is "postgres")
 29 |   username: ${ENV:DB_USERNAME:}
 30 |   password: ${ENV:DB_PASSWORD:}
 31 |   host: ${ENV:DB_HOST:localhost}
 32 |   port: ${ENV:DB_PORT:5432}
 33 | 
 34 | # ============================================================
 35 | # Authentication & Security
 36 | # ============================================================
 37 | auth:
 38 |   # Secret key for JWT token generation (IMPORTANT: Change in production!)
 39 |   secret_key: ${ENV:SECRET_KEY:your-secret-key-change-in-production}
 40 |   salt: ${ENV:SALT:some-sugar}
 41 |   algorithm: ${ENV:ALGORITHM:HS256}
 42 | 
 43 |   # Token expiration times (in minutes)
 44 |   access_token_expire_minutes: ${ENV:ACCESS_TOKEN_EXPIRE_MINUTES:15}
 45 |   refresh_token_expire_minutes: ${ENV:REFRESH_TOKEN_EXPIRE_MINUTES:43200}  # 30 days
 46 | 
 47 |   # Validation
 48 |   default_validation_code: ${ENV:DEFAULT_VALIDATION_CODE:456123}
 49 | 
 50 |   # Auto-create user groups based on email domain
 51 |   auto_create_group_by_domain: ${ENV:AUTO_CREATE_GROUP_BY_DOMAIN:true}
 52 | 
 53 | # ============================================================
 54 | # Email Service Configuration
 55 | # ============================================================
 56 | email:
 57 |   # Email service type: "o365" or "smtp"
 58 |   service_type: ${ENV:EMAIL_SERVICE_TYPE:o365}
 59 | 
 60 |   sender_email: ${ENV:SENDER_EMAIL:}
 61 | 
 62 |   # Office 365 settings (required when service_type is "o365")
 63 |   o365:
 64 |     client_id: ${ENV:O365_CLIENT_ID:}
 65 |     client_secret: ${ENV:O365_CLIENT_SECRET:}
 66 |     tenant_id: ${ENV:O365_TENANT_ID:}
 67 | 
 68 |   # SMTP settings (required when service_type is "smtp")
 69 |   smtp:
 70 |     host: ${ENV:SMTP_HOST:}
 71 |     port: ${ENV:SMTP_PORT:587}
 72 |     username: ${ENV:SMTP_USERNAME:}
 73 |     password: ${ENV:SMTP_PASSWORD:}
 74 | 
 75 | # ============================================================
 76 | # Vector Database Configuration
 77 | # ============================================================
 78 | vector_db:
 79 |   # Vector database type: "chroma" or "milvus"
 80 |   type: ${ENV:VECTOR_DB_TYPE:chroma}
 81 | 
 82 |   # Default collection name
 83 |   default_collection_name: ${ENV:DEFAULT_DB_COLLECTION_NAME:maru}
 84 | 
 85 |   # Chroma settings (required when type is "chroma")
 86 |   chroma:
 87 |     persist_dir: ${ENV:CHROMA_PERSIST_DIR:data/chroma/}
 88 | 
 89 |   # Milvus settings (required when type is "milvus")
 90 |   milvus:
 91 |     host: ${ENV:MILVUS_HOST:localhost}
 92 |     port: ${ENV:MILVUS_PORT:19530}
 93 |     user: ${ENV:MILVUS_USER:root}
 94 |     password: ${ENV:MILVUS_PASSWORD:Milvus}
 95 | 
 96 | # ============================================================
 97 | # External Services Configuration
 98 | # ============================================================
 99 | external:
100 | # No external services are configured
101 | # ============================================================


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_group_classifier.yaml:
--------------------------------------------------------------------------------
 1 | # Group Classifier Agent Configuration
 2 | # Agent that analyzes user questions and classifies them into appropriate document groups
 3 | 
 4 | name: group_classifier
 5 | description: "Analyzes user questions and classifies them into suitable document groups"
 6 | type: builtin
 7 | enabled: true
 8 | version: "1.0.0"
 9 | 
10 | # 우선순위 (가장 높은 우선순위 - 다른 에이전트들보다 먼저 실행)
11 | priority: 100
12 | 
13 | # LLM Settings
14 | target_llm_config:
15 |   server_name: "openai"
16 |   override_params:
17 |     temperature: 0.1 # Keep temperature low for consistent classification
18 |     max_tokens: 500 # Only short classification output is required
19 |     timeout: 10.0 # Short timeout for quick classification
20 | 
21 |   fallback_strategy: "any_available"
22 | 
23 | # Prompt Settings
24 | prompts:
25 |   system_prompt: |
26 |     You are an expert who analyzes user questions and classifies them into appropriate document groups.
27 | 
28 |     Responsibilities:
29 |     - Accurately identify the intent and topic of the question
30 |     - Choose the most appropriate document groups for the question
31 |     - Determine priority when multiple groups are applicable
32 |     - If no group matches, leave the selection empty
33 | 
34 |   user_prompt_template: |
35 |     Analyze the following question and classify it into the most appropriate document groups:
36 | 
37 |     Question: {question}
38 | 
39 |     Available groups:
40 |     {available_groups}
41 | 
42 |     Important rules:
43 |     1. You must select only from the groups listed above.
44 |     2. Never select a group that is not on the list.
45 |     3. If no groups are applicable, return an empty array [] for selected_groups.
46 |     4. Group names must exactly match the ones in the list.
47 |     5. Fill the group_confidences array in the same order as selected_groups. Each value must be between 0 and 1, and the total must sum to 1.
48 | 
49 |     You must return a JSON object as a tool call following the definition below. Use the keys selected_groups, confidence, group_confidences, and reasoning exactly as written. Do not include additional quotes around keys and do not use non-English keys. Do not provide any free-form text outside the tool call.
50 | 
51 |     Please return the classification result as JSON:
52 |     - selected_groups: Selected groups in priority order (empty array if none)
53 |     - confidence: Overall classification confidence (0-1)
54 |     - group_confidences: Confidence values aligned with selected_groups (e.g., [0.7, 0.3], sum=1)
55 |     - reasoning: Explanation for the classification
56 | 
57 | # Implementation class (Python file in user space)
58 | implementation: builtin.group_classifier.GroupClassifierAgent
59 | 
60 | # Agent Settings
61 | config:
62 |   timeout: 10
63 |   retry_count: 1 # Keep retries minimal for fast classification
64 |   classification_config:
65 |     confidence_threshold: 0.4
66 | 
67 | # Tool schema definition (JSON)
68 | tools:
69 |   group_classifier:
70 |     description: "Classifies user questions into appropriate groups"
71 |     parameters:
72 |       type: "object"
73 |       properties:
74 |         selected_groups:
75 |           type: "array"
76 |           items:
77 |             type: "string"
78 |           description: "Selected groups in priority order"
79 |         confidence:
80 |           type: "number"
81 |           minimum: 0
82 |           maximum: 1
83 |           description: "Overall classification confidence (0-1)"
84 |         reasoning:
85 |           type: "string"
86 |           description: "Explanation of the classification"
87 |         fallback_used:
88 |           type: "boolean"
89 |           description: "Indicates whether classification fallback was used"
90 |         group_confidences:
91 |           type: "array"
92 |           items:
93 |             type: "number"
94 |             minimum: 0
95 |             maximum: 1
96 |           description: "Confidence scores aligned with selected_groups (sum=1)"
97 |       required: ["selected_groups", "confidence", "group_confidences", "reasoning"]
98 | 


--------------------------------------------------------------------------------
/maru_lang/utils/security.py:
--------------------------------------------------------------------------------
  1 | import base64
  2 | import hashlib
  3 | from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
  4 | from cryptography.hazmat.backends import default_backend
  5 | from cryptography.hazmat.primitives import padding
  6 | from datetime import datetime, timedelta, timezone
  7 | from typing import Optional
  8 | from jose import JWTError, jwt
  9 | from fastapi import HTTPException, status
 10 | from pydantic import ValidationError
 11 | from maru_lang.configs.system_config import get_system_config
 12 | 
 13 | config = get_system_config()
 14 | 
 15 | 
 16 | def generate_anonymized_key(
 17 |     login_id: str,
 18 |     company_id: int,
 19 |     salt: str = None
 20 | ) -> str:
 21 |     if salt is None:
 22 |         salt = config.auth.salt
 23 |     # Combine the inputs with the salt to build a deterministic anonymized key
 24 |     raw_data = f"{login_id}:{company_id}:{salt}"
 25 |     return hashlib.sha256(raw_data.encode()).hexdigest()
 26 | 
 27 | 
 28 | def create_jwt_token(
 29 |     data: dict,
 30 |     expires_delta: timedelta
 31 | ) -> tuple[str, datetime]:
 32 |     """Create a JWT access token and return it with its expiry."""
 33 |     expires_at = datetime.now(timezone.utc)
 34 |     expires_at += expires_delta
 35 |     to_encode = data.copy()
 36 |     to_encode.update({"exp": expires_at})
 37 |     encoded_jwt = jwt.encode(
 38 |         to_encode,
 39 |         config.auth.secret_key,
 40 |         algorithm=config.auth.algorithm)
 41 |     return encoded_jwt, expires_at
 42 | 
 43 | 
 44 | def decode_token(token: str) -> dict | None:
 45 |     """Decode a JWT token and return its payload."""
 46 |     try:
 47 |         payload = jwt.decode(
 48 |             token,
 49 |             config.auth.secret_key,
 50 |             algorithms=[config.auth.algorithm])
 51 |         return payload
 52 |     except (jwt.ExpiredSignatureError, jwt.JWTError, ValidationError) as e:
 53 |         # print(f"Token decode error: {e}")
 54 |         return None
 55 | 
 56 | 
 57 | def get_key_spec(key: str):
 58 |     key_bytes = key.encode('utf-8')
 59 |     return key_bytes
 60 | 
 61 | 
 62 | def aes256_decrypt(target_str: str) -> str:
 63 |     try:
 64 |         # Decode the Base64-encoded cipher text
 65 |         decoded_data = base64.b64decode(target_str)
 66 | 
 67 |         # Initialize the AES cipher in ECB mode
 68 |         cipher = Cipher(algorithms.AES(get_key_spec(config.auth.secret_key)),
 69 |                         modes.ECB(), backend=default_backend())
 70 |         decryptor = cipher.decryptor()
 71 | 
 72 |         # Perform AES decryption
 73 |         decrypted_data = decryptor.update(decoded_data) + decryptor.finalize()
 74 | 
 75 |         # Remove PKCS7 padding
 76 |         unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
 77 |         unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()
 78 | 
 79 |         return unpadded_data.decode('utf-8')
 80 | 
 81 |     except Exception as e:
 82 |         raise Exception(f"Error during decryption: {str(e)}")
 83 | 
 84 | 
 85 | def aes256_encrypt(plain_text: str) -> str:
 86 |     try:
 87 |         # Convert the plain text to bytes
 88 |         plain_text_bytes = plain_text.encode('utf-8')
 89 | 
 90 |         # Apply PKCS7 padding
 91 |         padder = padding.PKCS7(algorithms.AES.block_size).padder()
 92 |         padded_data = padder.update(plain_text_bytes) + padder.finalize()
 93 | 
 94 |         # Initialize the AES cipher in ECB mode
 95 |         cipher = Cipher(algorithms.AES(get_key_spec(config.auth.secret_key)),
 96 |                         modes.ECB(), backend=default_backend())
 97 |         encryptor = cipher.encryptor()
 98 | 
 99 |         # Perform AES encryption
100 |         encrypted_data = encryptor.update(padded_data) + encryptor.finalize()
101 | 
102 |         # Encode the ciphertext using Base64
103 |         encrypted_base64_data = base64.b64encode(encrypted_data)
104 | 
105 |         # Return the encrypted string
106 |         return encrypted_base64_data.decode('utf-8')
107 | 
108 |     except Exception as e:
109 |         raise Exception(f"Error during encryption: {str(e)}")


--------------------------------------------------------------------------------
/maru_lang/api/endpoints/user_group.py:
--------------------------------------------------------------------------------
  1 | """
  2 | User group management API endpoints
  3 | """
  4 | from fastapi import APIRouter, HTTPException, Depends
  5 | from pydantic import BaseModel
  6 | from typing import Optional, Dict, Any
  7 | 
  8 | from maru_lang.dependencies.auth import get_user
  9 | from maru_lang.services.user_group_command import (
 10 |     UserGroupCommandParser,
 11 |     execute_user_group_command
 12 | )
 13 | 
 14 | 
 15 | router = APIRouter(
 16 |     prefix="/user-groups",
 17 |     tags=["User Groups"]
 18 | )
 19 | 
 20 | 
 21 | class UserGroupCommandRequest(BaseModel):
 22 |     """Request body for user group command"""
 23 |     message: str
 24 | 
 25 | 
 26 | class UserGroupCommandResponse(BaseModel):
 27 |     """Response for user group command"""
 28 |     success: bool
 29 |     message: str
 30 |     data: Optional[Dict[str, Any]] = None
 31 |     error: Optional[str] = None
 32 | 
 33 | 
 34 | @router.post("/command", response_model=UserGroupCommandResponse)
 35 | async def execute_command(
 36 |     request: UserGroupCommandRequest,
 37 |     user=Depends(get_user)
 38 | ):
 39 |     """
 40 |     Execute user group management command.
 41 | 
 42 |     Supports natural language commands in Korean and English:
 43 | 
 44 |     ## 그룹 생성
 45 |     - `/그룹생성 [그룹명]` or `/create group [name]`
 46 | 
 47 |     ## 멤버 관리 (매니저만)
 48 |     - `/그룹초대 [그룹명] [이메일]` or `/invite [group] [email]`
 49 |     - `/그룹추방 [그룹명] [이메일]` or `/remove [group] [email]`
 50 |     - `/그룹위임 [그룹명] [이메일]` or `/transfer [group] [email]`
 51 | 
 52 |     ## 그룹 조회
 53 |     - `/내그룹목록` or `/my groups`
 54 |     - `/관리그룹` or `/managed groups`
 55 |     - `/그룹멤버 [그룹명]` or `/members [group]`
 56 | 
 57 |     ## 그룹 나가기
 58 |     - `/그룹나가기 [그룹명]` or `/leave group [name]`
 59 | 
 60 |     Args:
 61 |         request: Command request with message
 62 |         user: Authenticated user (from token)
 63 | 
 64 |     Returns:
 65 |         Command execution result with success status and data
 66 | 
 67 |     Example:
 68 |         ```json
 69 |         {
 70 |             "message": "/그룹생성 행정팀"
 71 |         }
 72 |         ```
 73 | 
 74 |         Response:
 75 |         ```json
 76 |         {
 77 |             "success": true,
 78 |             "message": "Group created successfully",
 79 |             "data": {
 80 |                 "group_id": 123,
 81 |                 "group_name": "행정팀",
 82 |                 "created": true
 83 |             }
 84 |         }
 85 |         ```
 86 |     """
 87 |     try:
 88 |         # Parse command
 89 |         parsed = UserGroupCommandParser.parse(request.message)
 90 | 
 91 |         # Check if it's a valid command
 92 |         if parsed["command"] == "unknown":
 93 |             return UserGroupCommandResponse(
 94 |                 success=False,
 95 |                 message=parsed.get("error", "Unknown command"),
 96 |                 data={"help": UserGroupCommandParser.get_help_text()}
 97 |             )
 98 | 
 99 |         # Execute command
100 |         result = await execute_user_group_command(parsed, user.id)
101 | 
102 |         return UserGroupCommandResponse(**result)
103 | 
104 |     except Exception as e:
105 |         raise HTTPException(
106 |             status_code=500,
107 |             detail=f"Failed to execute command: {str(e)}"
108 |         )
109 | 
110 | 
111 | @router.get("/help")
112 | async def get_help():
113 |     """
114 |     Get help text for user group commands.
115 | 
116 |     Returns:
117 |         Help text with all available commands and usage examples
118 |     """
119 |     return {
120 |         "help": UserGroupCommandParser.get_help_text()
121 |     }
122 | 
123 | 
124 | @router.get("/check-command")
125 | async def check_command(message: str):
126 |     """
127 |     Check if a message is a user group command without executing it.
128 | 
129 |     Args:
130 |         message: Message to check
131 | 
132 |     Returns:
133 |         Whether the message is a valid user group command and parsed result
134 |     """
135 |     is_command = UserGroupCommandParser.is_user_group_command(message)
136 |     parsed = UserGroupCommandParser.parse(message) if is_command else None
137 | 
138 |     return {
139 |         "is_command": is_command,
140 |         "parsed": parsed
141 |     }
142 | 


--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_response.yaml:
--------------------------------------------------------------------------------
 1 | # Response Agent Configuration
 2 | # Formats and delivers results from other agents to the end user
 3 | 
 4 | name: response
 5 | description: "Formats outputs from other agents into user-friendly responses"
 6 | type: builtin
 7 | enabled: true
 8 | version: "1.0.0"
 9 | 
10 | # LLM configuration
11 | target_llm_config:
12 |   server_name: "openai"
13 |   override_params:
14 |     temperature: 0.7
15 |     max_tokens: 3000
16 | 
17 |   fallback_strategy: "any_available"
18 | 
19 | # Prompt configuration
20 | prompts:
21 |   system_prompt: |
22 |     You are a professional and friendly AI assistant.
23 |     Your job is to take results from other system components (agents)
24 |     and craft a final answer that is easy to understand and genuinely helpful for the user.
25 | 
26 |     Primary responsibilities:
27 |     - Turn agent outputs into natural, fluent sentences.
28 |     - Explain technical details in user-friendly language.
29 |     - Present structured data in readable formats.
30 |     - Communicate errors or warnings with empathy and clarity.
31 |     - Provide additional context or explanations when helpful.
32 |     - Adapt your response based on different execution outcomes (success, failure, partial success, errors).
33 | 
34 |     Response principles:
35 |     - Use clear, easy-to-follow language.
36 |     - Highlight important information.
37 |     - Format structured data appropriately (bullet points, numbered lists, tables, etc.).
38 |     - Briefly explain technical terms when necessary.
39 |     - Maintain a positive, supportive tone.
40 |     - Even when reporting errors, remain polite and constructive.
41 | 
42 |   user_prompt_template: |
43 |     User question: {question}
44 | 
45 |     Execution scenario: {scenario}
46 | 
47 |     Agent outputs:
48 |     {agent_result}
49 | 
50 |     Using the information above, write a kind and clear final response for the user.
51 | 
52 |     Guidelines:
53 |     - Adjust tone and content to match the execution scenario.
54 |     - Hide implementation details; share only what is helpful for the user.
55 |     - Write naturally, as if conversing with the user.
56 |     - Apply Markdown formatting when it improves readability.
57 |     - Do not mention agent names or internal architecture; focus on the results.
58 | 
59 | # Implementation class (builtin agent)
60 | implementation: builtin.response_agent.ResponseAgent
61 | 
62 | # Agent configuration
63 | config:
64 |   timeout: 30
65 |   retry_count: 2
66 |   max_context_length: 10000
67 | 
68 |   # Response formatting options
69 |   formatting:
70 |     include_metadata: false  # Whether to include metadata in the response
71 |     show_sources: true       # Whether to list information sources
72 |     use_markdown: true       # Whether to render responses using Markdown
73 |     max_response_length: 2000  # Maximum length of the response
74 | 
75 |   # Scenario-specific LLM guidance (appended to the prompt)
76 |   scenario_config:
77 |     no_agents: "No agents were selected. Ask the user for more details or offer general assistance."
78 |     errors: "An error occurred while running the agents. Explain the issue politely and suggest trying again."
79 |     success: "Agents completed successfully. Present the results in a user-friendly manner."
80 |     partial_success: "Some agents succeeded while others failed. Share the successful results first and briefly mention the failures."
81 |     unknown: "The situation is unclear. Let the user know that assistance is limited and offer alternative help."
82 | 
83 |   # Fallback responses when no LLM output is available
84 |   fallback_config:
85 |     no_agents: "I’m sorry, I couldn’t find an appropriate agent to handle that question. Could you provide more details?"
86 |     errors: "I’m sorry, something went wrong while processing your request."
87 |     success: ""  # Empty string: use the formatted_context as-is
88 |     partial_success: ""  # Empty string: use the formatted_context as-is
89 |     unknown: "I’m sorry, I’m unable to generate a response right now."
90 | 
91 | # Examples (this agent formats outputs from other agents)
92 | examples:
93 |   - "Format knowledge_search results into a user-friendly response"
94 |   - "Communicate error messages politely"
95 | 


--------------------------------------------------------------------------------
/maru_lang/dependencies/email.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Email service dependency for FastAPI
  3 | """
  4 | from abc import ABC, abstractmethod
  5 | from typing import Optional
  6 | from fastapi import Depends
  7 | from maru_lang.configs.system_config import get_system_config
  8 | 
  9 | config = get_system_config()
 10 | 
 11 | 
 12 | class EmailService(ABC):
 13 |     """Abstract base class for email services"""
 14 | 
 15 |     @abstractmethod
 16 |     def send_email(self, recipient: str, subject: str, body: str) -> bool:
 17 |         pass
 18 | 
 19 |     @abstractmethod
 20 |     def send_otp(self, recipient: str, code: str) -> bool:
 21 |         pass
 22 | 
 23 | 
 24 | class O365EmailManager(EmailService):
 25 |     """Office 365 email service implementation"""
 26 | 
 27 |     def __init__(self):
 28 |         self.sender_email = config.email.sender_email
 29 |         self.client_id = config.email.o365.client_id
 30 |         self.client_secret = config.email.o365.client_secret
 31 |         self.tenant_id = config.email.o365.tenant_id
 32 | 
 33 |     def send_email(self, recipient: str, subject: str, body: str) -> bool:
 34 |         try:
 35 |             from O365 import Account
 36 | 
 37 |             credentials = (self.client_id, self.client_secret)
 38 |             scopes = ["https://graph.microsoft.com/.default"]
 39 |             account = Account(
 40 |                 credentials,
 41 |                 auth_flow_type="credentials",
 42 |                 tenant_id=self.tenant_id
 43 |             )
 44 | 
 45 |             if not account.is_authenticated:
 46 |                 account.authenticate(scopes=scopes)
 47 | 
 48 |             mailbox = account.mailbox(resource=self.sender_email)
 49 |             message = mailbox.new_message()
 50 |             message.to.add(recipient)
 51 |             message.subject = subject
 52 |             message.body = body
 53 |             message.body_type = "HTML"
 54 |             message.send()
 55 |             return True
 56 |         except Exception as e:
 57 |             print(f"Failed to send email: {e}")
 58 |             return False
 59 | 
 60 |     def send_otp(self, recipient: str, code: str) -> bool:
 61 |         subject = f"{code} - Maru Lang Verification Code"
 62 |         body = f"""
 63 |         <div style="font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, 'Helvetica Neue', Arial, sans-serif; text-align: center; padding: 40px; background-color: #f9f9f9;">
 64 |             <div style="background: white; padding: 30px; border-radius: 12px; box-shadow: 0 4px 12px rgba(0,0,0,0.1); max-width: 400px; margin: auto;">
 65 |                 <h2 style="color: #333; font-weight: 600;">Your Verification Code</h2>
 66 |                 <p style="color: #666; font-size: 16px;">Use this code to verify your email address:</p>
 67 |                 <div style="font-size: 28px; font-weight: bold; color: #007AFF; letter-spacing: 4px; padding: 12px 20px; border-radius: 8px; background: #f2f2f7; display: inline-block; margin: 20px 0;">
 68 |                     {code}
 69 |                 </div>
 70 |                 <p style="color: #999; font-size: 14px;">This code expires in 10 minutes.</p>
 71 |             </div>
 72 |         </div>
 73 |         """
 74 |         return self.send_email(recipient, subject, body)
 75 | 
 76 | 
 77 | def get_email_manager() -> Optional[EmailService]:
 78 |     """Get email service instance based on settings"""
 79 |     if not config.email.service_type:
 80 |         return None
 81 | 
 82 |     if config.email.service_type == "o365":
 83 |         if all([config.email.o365.client_id, config.email.o365.client_secret, config.email.o365.tenant_id, config.email.sender_email]):
 84 |             try:
 85 |                 return O365EmailManager()
 86 |             except Exception as e:
 87 |                 print(f"Failed to initialize O365 Email Manager: {e}")
 88 |                 return None
 89 | 
 90 |     # TODO: smtp 타입 지원 추가
 91 |     return None
 92 | 
 93 | 
 94 | def get_email_service_dependency() -> Optional[EmailService]:
 95 |     """FastAPI dependency for email service"""
 96 |     return get_email_manager()
 97 | 
 98 | 
 99 | __all__ = [
100 |     "EmailService",
101 |     "O365EmailManager",
102 |     "get_email_manager",
103 |     "get_email_service_dependency",
104 | ]
105 | 


--------------------------------------------------------------------------------
/maru_lang/dependencies/auth.py:
--------------------------------------------------------------------------------
  1 | # 시크릿 키, 알고리즘, 토큰 만료 시간 등을 settings에서 관리
  2 | from fastapi import Depends, HTTPException, status, Request, Body
  3 | from fastapi.security import OAuth2PasswordBearer
  4 | from maru_lang.enums.auth import UserRoleCode
  5 | from maru_lang.core.relation_db.models.auth import User, UserRole, RefreshToken
  6 | from maru_lang.utils.security import decode_token
  7 | from maru_lang.services.auth import refresh_token_flow
  8 | 
  9 | # 1) OAuth2 스키마 설정
 10 | oauth2_scheme = OAuth2PasswordBearer(
 11 |     tokenUrl="/auth/editor/login",
 12 |     auto_error=False)
 13 | 
 14 | 
 15 | async def get_user(
 16 |     request: Request,
 17 |     token: str = Depends(oauth2_scheme)
 18 | ) -> User:
 19 |     """토큰에서 유저 ID 등을 추출하여 실제 유저 정보를 가져오는 함수"""
 20 |     # device-id는 헤더 또는 쿼리스트링(device-id)로 전달 받을 수 있도록 확장
 21 |     device_id_in_header = request.headers.get("device-id") or request.query_params.get("device-id")
 22 | 
 23 |     # 토큰은 헤더(Authorization) 또는 쿼리 파라미터(token)에서 받을 수 있음
 24 |     # SSE/EventSource는 커스텀 헤더를 지원하지 않으므로 쿼리 파라미터 지원 필요
 25 |     if not token:
 26 |         token = request.query_params.get("token")
 27 | 
 28 |     payload = decode_token(token) if token else None
 29 | 
 30 |     if payload is None:
 31 |         # AccessToken 만료 → refresh_token 꺼내기
 32 |         refresh_token = request.cookies.get("refresh_token")
 33 |         if not refresh_token:
 34 |             # 임시로 만약 서버를 재시작 했을때를 대비해서
 35 |             # 보안적으로 안전하지 않다 salt를 해야할수도
 36 |             refresh_token_object = await RefreshToken.filter(
 37 |                 device_id=device_id_in_header
 38 |             ).order_by(
 39 |                 "-created_at"
 40 |             ).first()
 41 |             if refresh_token_object:
 42 |                 refresh_token = refresh_token_object.refresh_token
 43 |                 try:
 44 |                     decode_token(refresh_token)
 45 |                 except Exception:
 46 |                     raise HTTPException(
 47 |                         status_code=401, detail="Invalid refresh token")
 48 | 
 49 |         if refresh_token:
 50 |             new_access_token = await refresh_token_flow(refresh_token, device_id_in_header)
 51 |             if new_access_token:
 52 |                 # 새 토큰으로 재인증 시도
 53 |                 payload = decode_token(new_access_token)
 54 |                 # 🔥 새 AccessToken을 응답 헤더에 추가 (선택)
 55 |                 request.state.new_access_token = new_access_token
 56 | 
 57 |     if payload is None:
 58 |         raise HTTPException(
 59 |             status_code=status.HTTP_401_UNAUTHORIZED,
 60 |             detail="Invalid or expired token",
 61 |             headers={"WWW-Authenticate": "Bearer"},
 62 |         )
 63 | 
 64 |     user_id = payload.get("sub")
 65 | 
 66 |     if user_id is None:
 67 |         raise HTTPException(
 68 |             status_code=status.HTTP_401_UNAUTHORIZED,
 69 |             detail="Invalid token: no user_id",
 70 |             headers={"WWW-Authenticate": "Bearer"},
 71 |         )
 72 | 
 73 |     user = await User.get_or_none(id=user_id)
 74 |     if not user:
 75 |         raise HTTPException(
 76 |             status_code=status.HTTP_401_UNAUTHORIZED,
 77 |             detail="User not found",
 78 |         )
 79 | 
 80 |     return user
 81 | 
 82 | 
 83 | def get_user_with_role(
 84 |     required_role: UserRoleCode,
 85 | ):
 86 |     async def dependency(
 87 |         user: User = Depends(get_user)
 88 |     ):
 89 |         # 역할 우선순위 (낮은 권한부터 높은 권한 순)
 90 |         ROLE_HIERARCHY = [
 91 |             UserRoleCode.EDITOR,
 92 |             UserRoleCode.ADMIN,
 93 |         ]
 94 |         # get user role
 95 |         user_role = await UserRole.get_or_none(
 96 |             id=user.role_id
 97 |         )
 98 | 
 99 |         if not user_role:
100 |             raise HTTPException(status_code=401, detail="Unauthorized role")
101 | 
102 |         try:
103 |             user_index = ROLE_HIERARCHY.index(UserRoleCode(user_role.name))
104 |             required_index = ROLE_HIERARCHY.index(required_role)
105 |         except ValueError:
106 |             raise HTTPException(status_code=401, detail="Invalid role")
107 | 
108 |         if user_index < required_index:
109 |             raise HTTPException(status_code=403, detail="Permission denied")
110 | 
111 |         return user
112 | 
113 |     return dependency
114 | 


--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/documents.py:
--------------------------------------------------------------------------------
  1 | from tortoise.models import Model
  2 | from tortoise import fields
  3 | from maru_lang.enums.documents import (
  4 |     PermissionAction,
  5 |     DocumentStatus,
  6 | )
  7 | 
  8 | 
  9 | class Document(Model):
 10 |     id = fields.CharField(pk=True, max_length=64)  # ULID/UUIDv7 권장
 11 |     name = fields.CharField(max_length=255, index=True)
 12 | 
 13 |     file_path = fields.CharField(max_length=500, null=True)
 14 |     file_size = fields.BigIntField(null=True)
 15 |     head_hash = fields.CharField(
 16 |         max_length=64, null=True, index=True)   # blake3(앞 64KB)
 17 |     full_hash = fields.CharField(
 18 |         max_length=64, null=True, index=True)   # blake3(전체: 지연 계산 가능)
 19 |     source_fingerprint = fields.CharField(
 20 |         max_length=64, unique=True, null=True)  # 업서트 기준 키
 21 | 
 22 |     metadata = fields.JSONField(default=dict)
 23 |     status = fields.IntEnumField(
 24 |         DocumentStatus, default=DocumentStatus.PROCESSING)
 25 |     created_at = fields.DatetimeField(auto_now_add=True)
 26 |     updated_at = fields.DatetimeField(auto_now=True)
 27 | 
 28 |     class Meta:
 29 |         table = "document"
 30 |         indexes = [["name", "file_size", "head_hash"]]
 31 | 
 32 | 
 33 | class DocumentGroup(Model):
 34 |     id = fields.IntField(pk=True)
 35 |     name = fields.CharField(max_length=255, unique=True)  # Full path로 unique 식별
 36 |     base_path = fields.CharField(
 37 |         max_length=500,
 38 |         unique=True,  # 같은 파일시스템 경로는 단일 DocumentGroup만 존재
 39 |     )
 40 |     description = fields.TextField(null=True)  # DocumentGroup 설명
 41 | 
 42 |     # Version ID for VDB chunk filtering and version management
 43 |     version_id = fields.CharField(
 44 |         max_length=64,
 45 |         null=True,  # 임베딩 완료 전에는 null
 46 |         index=True  # 검색 성능을 위한 인덱스
 47 |     )
 48 | 
 49 |     # Manager (owner) of this document group
 50 |     manager = fields.ForeignKeyField(
 51 |         "models.User",
 52 |         related_name="managed_document_groups",
 53 |         on_delete=fields.RESTRICT  # Manager가 있는 DocumentGroup이 있으면 User 삭제 불가
 54 |     )
 55 | 
 56 |     # Pluggable component configurations (used during ingestion)
 57 |     loader = fields.CharField(max_length=255, null=True)           # 사용된 loader 이름
 58 |     chunker = fields.CharField(max_length=255, null=True)          # 사용된 chunker 이름
 59 |     embedding_model = fields.CharField(max_length=255, null=True)  # 사용된 embedding model 이름
 60 | 
 61 |     # Configuration snapshot (for detecting changes)
 62 |     config_snapshot = fields.JSONField(null=True, default=dict)  # 사용된 설정의 스냅샷
 63 | 
 64 |     minhash_signature = fields.JSONField(null=True) # MinHash 시그니처 (128개 정수 배열)
 65 |     signature_updated_at = fields.DatetimeField(auto_now=True)
 66 | 
 67 |     class Meta:
 68 |         table = "document_group"
 69 | 
 70 | 
 71 | class DocumentGroupMembership(Model):
 72 |     document = fields.ForeignKeyField(
 73 |         "models.Document",
 74 |         related_name="group_memberships",
 75 |         on_delete=fields.CASCADE)
 76 |     group = fields.ForeignKeyField(
 77 |         "models.DocumentGroup",
 78 |         related_name="documents",
 79 |         on_delete=fields.CASCADE)
 80 | 
 81 |     class Meta:
 82 |         table = "document_group_membership"
 83 | 
 84 | 
 85 | class DocumentGroupInclusion(Model):
 86 |     parent = fields.ForeignKeyField(
 87 |         "models.DocumentGroup",
 88 |         related_name="includes",
 89 |         on_delete=fields.CASCADE)
 90 |     child = fields.ForeignKeyField(
 91 |         "models.DocumentGroup",
 92 |         related_name="included_by",
 93 |         on_delete=fields.CASCADE)
 94 | 
 95 |     class Meta:
 96 |         table = "document_group_inclusion"
 97 |         unique_together = ("parent", "child")
 98 | 
 99 | 
100 | # 그룹 ↔ 문서그룹 권한
101 | class GroupPermission(Model):
102 |     user_group = fields.ForeignKeyField(
103 |         "models.UserGroup",
104 |         related_name="permissions",
105 |         on_delete=fields.CASCADE)
106 |     document_group = fields.ForeignKeyField(
107 |         "models.DocumentGroup",
108 |         related_name="permissions",
109 |         on_delete=fields.CASCADE)
110 |     action = fields.IntEnumField(PermissionAction)
111 | 
112 |     class Meta:
113 |         table = "group_permission"
114 |         unique_together = (("user_group", "document_group", "action"),)


--------------------------------------------------------------------------------
/maru_lang/commands/tree.py:
--------------------------------------------------------------------------------
  1 | """
  2 | DocumentGroup 계층 구조 조회 및 관리 명령어
  3 | """
  4 | import typer
  5 | from typing import Optional
  6 | from maru_lang.core.relation_db.models.documents import (
  7 |     DocumentGroup,
  8 |     DocumentGroupInclusion,
  9 | )
 10 | 
 11 | 
 12 | async def get_root_groups() -> list[DocumentGroup]:
 13 |     """
 14 |     루트 그룹들을 조회 (DocumentGroupInclusion에서 child로 지정되지 않은 그룹)
 15 | 
 16 |     Returns:
 17 |         루트 DocumentGroup 리스트
 18 |     """
 19 |     # child_id로 사용된 그룹 ID들
 20 |     child_ids = await DocumentGroupInclusion.all().values_list("child_id", flat=True)
 21 |     child_ids_set = set(child_ids)
 22 | 
 23 |     # 모든 그룹 조회
 24 |     all_groups = await DocumentGroup.all()
 25 | 
 26 |     # child로 지정되지 않은 그룹만 필터링 (루트 그룹)
 27 |     root_groups = [g for g in all_groups if g.id not in child_ids_set]
 28 | 
 29 |     return sorted(root_groups, key=lambda g: g.name)
 30 | 
 31 | 
 32 | async def get_children_groups(parent_group: DocumentGroup) -> list[DocumentGroup]:
 33 |     """
 34 |     특정 그룹의 직계 자식 그룹들을 조회
 35 | 
 36 |     Args:
 37 |         parent_group: 부모 그룹
 38 | 
 39 |     Returns:
 40 |         자식 DocumentGroup 리스트
 41 |     """
 42 |     inclusions = await DocumentGroupInclusion.filter(
 43 |         parent=parent_group
 44 |     ).prefetch_related("child")
 45 | 
 46 |     children = [inc.child for inc in inclusions]
 47 |     return sorted(children, key=lambda g: g.name)
 48 | 
 49 | 
 50 | async def print_group_tree(
 51 |     group: DocumentGroup | None = None,
 52 |     max_depth: int = 2,
 53 |     current_depth: int = 0,
 54 |     prefix: str = "",
 55 |     is_last: bool = True
 56 | ):
 57 |     """
 58 |     그룹 계층 구조를 트리 형태로 출력
 59 | 
 60 |     Args:
 61 |         group: 출력할 그룹 (None이면 루트부터)
 62 |         max_depth: 최대 깊이
 63 |         current_depth: 현재 깊이
 64 |         prefix: 출력 prefix (트리 그리기용)
 65 |         is_last: 마지막 자식인지 여부
 66 |     """
 67 |     if group is None:
 68 |         # 루트 그룹들 출력
 69 |         root_groups = await get_root_groups()
 70 | 
 71 |         if not root_groups:
 72 |             typer.secho("📭 DocumentGroup이 없습니다.", fg=typer.colors.YELLOW)
 73 |             return
 74 | 
 75 |         typer.echo("\n📁 Document Group 계층 구조:\n")
 76 |         for i, root in enumerate(root_groups):
 77 |             is_last_root = (i == len(root_groups) - 1)
 78 |             await print_group_tree(root, max_depth, 0, "", is_last_root)
 79 |     else:
 80 |         # 현재 그룹 출력
 81 |         if current_depth == 0:
 82 |             connector = ""
 83 |             typer.secho(f"{group.name}", fg=typer.colors.CYAN, bold=True)
 84 |         else:
 85 |             connector = "└── " if is_last else "├── "
 86 |             typer.secho(f"{prefix}{connector}{group.name}", fg=typer.colors.GREEN)
 87 | 
 88 |         # 최대 깊이에 도달하면 중단
 89 |         if current_depth >= max_depth:
 90 |             return
 91 | 
 92 |         # 자식 그룹들 재귀 출력
 93 |         children = await get_children_groups(group)
 94 | 
 95 |         for i, child in enumerate(children):
 96 |             is_last_child = (i == len(children) - 1)
 97 | 
 98 |             if current_depth == 0:
 99 |                 child_prefix = ""
100 |             else:
101 |                 child_prefix = prefix + ("    " if is_last else "│   ")
102 | 
103 |             await print_group_tree(
104 |                 child,
105 |                 max_depth,
106 |                 current_depth + 1,
107 |                 child_prefix,
108 |                 is_last_child
109 |             )
110 | 
111 | 
112 | async def show_group_tree_command(
113 |     group_name: Optional[str] = None,
114 |     depth: int = 2
115 | ):
116 |     """
117 |     DocumentGroup 계층 구조 출력 명령어
118 | 
119 |     Args:
120 |         group_name: 특정 그룹명 (없으면 루트 그룹들만 표시)
121 |         depth: 표시할 최대 깊이
122 |     """
123 |     if group_name:
124 |         # 특정 그룹 조회
125 |         group = await DocumentGroup.get_or_none(name=group_name.lower())
126 | 
127 |         if not group:
128 |             typer.secho(
129 |                 f"❌ '{group_name}' 그룹을 찾을 수 없습니다.",
130 |                 fg=typer.colors.RED
131 |             )
132 |             raise typer.Exit(1)
133 | 
134 |         typer.echo(f"\n📁 '{group.name}' 그룹 계층 구조 (depth={depth}):\n")
135 |         await print_group_tree(group, max_depth=depth)
136 |     else:
137 |         # 루트 그룹들만 표시 (depth=1)
138 |         await print_group_tree(None, max_depth=1)
139 | 
140 |     typer.echo()  # 빈 줄
141 | 


--------------------------------------------------------------------------------
/maru_lang/api/endpoints/auth.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional
  2 | from fastapi import APIRouter, HTTPException, Depends, Response
  3 | from maru_lang.enums.auth import UserRoleCode
  4 | from maru_lang.configs.system_config import get_system_config
  5 | from maru_lang.dependencies.auth import get_user
  6 | 
  7 | config = get_system_config()
  8 | from maru_lang.dependencies.email import get_email_service_dependency, EmailService
  9 | from maru_lang.schemas.auth import (
 10 |     VerifyCodeRequest,
 11 |     SignUpRequest,
 12 |     LogoutRequest,
 13 |     UserGroupsResponse,
 14 |     UserGroupResponse,
 15 | )
 16 | from maru_lang.services.auth import (
 17 |     generate_token,
 18 |     verify_OTP,
 19 |     create_or_get_user,
 20 |     delete_token,
 21 |     generate_OTP,
 22 |     get_user_groups,
 23 | )
 24 | 
 25 | 
 26 | router = APIRouter(
 27 |     prefix="/auth",
 28 |     tags=["Auth"]
 29 | )
 30 | 
 31 | 
 32 | @router.post("/login")
 33 | async def login(
 34 |     request: SignUpRequest,
 35 |     email_service: Optional[EmailService] = Depends(
 36 |         get_email_service_dependency)
 37 | ) -> str:
 38 |     try:
 39 |         # TODO Email validation
 40 |         otp = await generate_OTP(request.email, email_service)
 41 | 
 42 |         # 이메일 서비스가 활성화된 경우에만 이메일 전송
 43 |         if email_service:
 44 |             success = email_service.send_otp(request.email, otp.code)
 45 |             if not success:
 46 |                 # 이메일 전송 실패 시 DEFAULT_VALIDATION_CODE로 재생성
 47 |                 await otp.delete()
 48 |                 otp = await generate_OTP(request.email, None)
 49 | 
 50 |         return otp.email
 51 |     except Exception as e:
 52 |         print(e)
 53 |         raise HTTPException(
 54 |             status_code=400,
 55 |             detail="서버가 점검 중 입니다. 다시 시도해주세요.")
 56 | 
 57 | 
 58 | @router.post("/logout")
 59 | async def logout(
 60 |     request: LogoutRequest,
 61 |     response: Response,
 62 |     user=Depends(get_user)
 63 | ) -> dict:
 64 |     try:
 65 |         await delete_token(user.id, request.device_id)
 66 |         response.delete_cookie(
 67 |             key="refresh_token",
 68 |             path="/",
 69 |             samesite="strict"
 70 |         )
 71 |         return {"message": "Logged out successfully"}
 72 |     except Exception as e:
 73 |         raise HTTPException(status_code=500, detail=str(e))
 74 | 
 75 | 
 76 | @router.post("/verify/code")
 77 | async def verify_code(
 78 |     response: Response,
 79 |     request: VerifyCodeRequest
 80 | ):
 81 |     try:
 82 |         if not await verify_OTP(request.email, request.code):
 83 |             raise Exception("Invalid or expired code")
 84 |         user = await create_or_get_user(
 85 |             email=request.email,
 86 |             role=UserRoleCode.EDITOR.value
 87 |         )
 88 |         access_token, refresh_token = await generate_token(
 89 |             user.id,
 90 |             user.role_id,
 91 |             request.device_id)
 92 | 
 93 |         response.set_cookie(
 94 |             key="refresh_token",
 95 |             value=refresh_token,
 96 |             httponly=True,
 97 |             secure=True,
 98 |             samesite="strict",
 99 |             max_age=config.auth.refresh_token_expire_minutes * 60
100 |         )
101 | 
102 |         return access_token
103 |     except Exception as e:
104 |         raise HTTPException(status_code=400, detail=str(e))
105 | 
106 | 
107 | @router.get("/verify")
108 | async def verify(_=Depends(get_user)):
109 |     return {"message": "ok"}
110 | 
111 | 
112 | @router.get("/user/groups", response_model=UserGroupsResponse)
113 | async def get_current_user_groups(
114 |     user=Depends(get_user)
115 | ):
116 |     """
117 |     Get user groups that the authenticated user belongs to.
118 | 
119 |     Returns:
120 |         UserGroupsResponse: List of user groups with total count
121 |     """
122 |     try:
123 |         # Get user groups using service function
124 |         groups = await get_user_groups(user)
125 | 
126 |         # Convert to response format
127 |         group_responses = [
128 |             UserGroupResponse(
129 |                 id=group.id,
130 |                 name=group.name
131 |             )
132 |             for group in groups
133 |         ]
134 |         return UserGroupsResponse(
135 |             groups=group_responses,
136 |             total=len(group_responses)
137 |         )
138 | 
139 |     except Exception as e:
140 |         print(f"❌ Error fetching user groups: {str(e)}")
141 |         raise HTTPException(status_code=500, detail=str(e))
142 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/embedders/manager.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Embedder: 임베딩 모델 관리 및 벡터 생성
  3 | 프로세스 단위로 모델을 캐싱하여 GPU 자원을 효율적으로 사용
  4 | """
  5 | from typing import Dict, List, Optional
  6 | from sentence_transformers import SentenceTransformer
  7 | 
  8 | 
  9 | class Embedder:
 10 |     """
 11 |     임베딩 모델 관리자
 12 | 
 13 |     프로세스 내에서 임베딩 모델을 캐싱하여 재사용
 14 |     encode 함수로 텍스트를 벡터로 변환하는 단순한 인터페이스 제공
 15 |     """
 16 | 
 17 |     def __init__(self, device: Optional[str] = None):
 18 |         """
 19 |         Args:
 20 |             device: 모델을 로드할 디바이스 (None이면 자동 선택)
 21 |                    예: "cuda", "cpu", "mps"
 22 |         """
 23 |         self.device = device
 24 |         self.model_cache: Dict[str, SentenceTransformer] = {}
 25 | 
 26 |     def encode(
 27 |         self,
 28 |         texts: List[str],
 29 |         model_name: str,
 30 |         show_progress: bool = False,
 31 |     ) -> List[List[float]]:
 32 |         """
 33 |         텍스트를 임베딩 벡터로 변환
 34 | 
 35 |         Args:
 36 |             texts: 임베딩할 텍스트 리스트
 37 |             model_name: 임베딩 모델 이름
 38 |             show_progress: 진행바 표시 여부
 39 | 
 40 |         Returns:
 41 |             List[List[float]]: 임베딩 벡터 리스트
 42 |         """
 43 |         model = self._get_or_load_model(model_name)
 44 |         vectors = model.encode(
 45 |             texts, show_progress_bar=show_progress, convert_to_numpy=True
 46 |         )
 47 |         return vectors.tolist()
 48 | 
 49 |     def get_dimension(self, model_name: str) -> int:
 50 |         """
 51 |         임베딩 차원 반환
 52 | 
 53 |         Args:
 54 |             model_name: 임베딩 모델 이름
 55 | 
 56 |         Returns:
 57 |             int: 임베딩 벡터 차원
 58 |         """
 59 |         model = self._get_or_load_model(model_name)
 60 |         return model.get_sentence_embedding_dimension()
 61 | 
 62 |     def _get_or_load_model(self, model_name: str) -> SentenceTransformer:
 63 |         """
 64 |         모델 캐싱 및 로드 (내부 메서드)
 65 | 
 66 |         Args:
 67 |             model_name: 임베딩 모델 이름
 68 | 
 69 |         Returns:
 70 |             SentenceTransformer: 로드된 모델 인스턴스
 71 |         """
 72 |         if model_name not in self.model_cache:
 73 |             self.model_cache[model_name] = SentenceTransformer(
 74 |                 model_name, device=self.device
 75 |             )
 76 | 
 77 |         return self.model_cache[model_name]
 78 | 
 79 |     def unload_model(self, model_name: str) -> bool:
 80 |         """
 81 |         모델을 메모리에서 해제
 82 | 
 83 |         Args:
 84 |             model_name: 해제할 모델 이름
 85 | 
 86 |         Returns:
 87 |             bool: 해제 성공 여부
 88 |         """
 89 |         if model_name in self.model_cache:
 90 |             del self.model_cache[model_name]
 91 |             print(f"🗑️ Model unloaded: {model_name}")
 92 |             return True
 93 |         return False
 94 | 
 95 |     def clear_cache(self):
 96 |         """모든 캐시된 모델 해제"""
 97 |         count = len(self.model_cache)
 98 |         self.model_cache.clear()
 99 |         print(f"🗑️ Cleared {count} model(s) from cache")
100 | 
101 | 
102 | # 싱글톤 인스턴스
103 | _embedder_instance: Optional[Embedder] = None
104 | 
105 | 
106 | def get_embedder(
107 |     device: Optional[str] = None,
108 |     force_new: bool = False,
109 | ) -> Embedder:
110 |     """
111 |     Embedder 싱글톤 인스턴스 반환
112 | 
113 |     Args:
114 |         device: 모델을 로드할 디바이스 (None이면 config에서 자동 로드)
115 |                예: "cuda", "cpu", "mps"
116 |         force_new: True면 기존 인스턴스 무시하고 새로 생성 (테스트용)
117 | 
118 |     Returns:
119 |         Embedder: 싱글톤 인스턴스
120 | 
121 |     Example:
122 |         >>> embedder = get_embedder()
123 |         >>> vectors = embedder.encode(["hello", "world"], "intfloat/multilingual-e5-large")
124 |     """
125 |     global _embedder_instance
126 | 
127 |     if _embedder_instance is None or force_new:
128 |         # device가 None이면 config에서 로드
129 |         if device is None:
130 |             device = _load_device_from_config()
131 | 
132 |         _embedder_instance = Embedder(device=device)
133 | 
134 |     return _embedder_instance
135 | 
136 | 
137 | def _load_device_from_config() -> Optional[str]:
138 |     """
139 |     ConfigManager를 사용하여 config에서 device 설정을 로드합니다.
140 | 
141 |     Returns:
142 |         Optional[str]: config에서 읽은 device 설정, 없으면 None
143 |     """
144 |     try:
145 |         from maru_lang.configs import get_config_manager
146 | 
147 |         config_manager = get_config_manager()
148 |         merged_config = config_manager.get_embedder_config()
149 | 
150 |         if merged_config:
151 |             return merged_config.device
152 |     except ImportError:
153 |         pass
154 |     except Exception as e:
155 |         print(f"⚠️ Embedder config 로드 실패: {e}")
156 | 
157 |     return None
158 | 


--------------------------------------------------------------------------------
/maru_lang/models/agents.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Agent-related data models
  3 | """
  4 | import asyncio
  5 | from dataclasses import dataclass, field
  6 | from typing import List, Dict, Any, Optional, Union, TYPE_CHECKING
  7 | from maru_lang.enums.chat import ChatProcessStep as ChatStep
  8 | from maru_lang.models.chat import ChatHistory
  9 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
 10 | 
 11 | 
 12 | @dataclass
 13 | class AgentResult:
 14 |     """Result from individual agent execution"""
 15 |     success: bool
 16 |     result: str = ""  # 주요 출력 결과 (표준화된 문자열)
 17 |     data: Optional[Dict[str, Any]] = None  # 추가 정보 (선택)
 18 |     error: Optional[str] = None
 19 |     metadata: Optional[Dict[str, Any]] = None
 20 | 
 21 |     def _serialize_value(self, value: Any) -> Any:
 22 |         """Recursively serialize values to JSON-compatible format"""
 23 |         if value is None or isinstance(value, (str, int, float, bool)):
 24 |             return value
 25 |         elif isinstance(value, dict):
 26 |             return {k: self._serialize_value(v) for k, v in value.items()}
 27 |         elif isinstance(value, (list, tuple)):
 28 |             return [self._serialize_value(item) for item in value]
 29 |         elif hasattr(value, 'text'):
 30 |             # Handle MCP TextContent objects
 31 |             return value.text
 32 |         elif hasattr(value, 'to_dict'):
 33 |             return self._serialize_value(value.to_dict())
 34 |         elif hasattr(value, '__dict__'):
 35 |             return self._serialize_value(value.__dict__)
 36 |         else:
 37 |             # Fallback: convert to string
 38 |             return str(value)
 39 | 
 40 |     def to_dict(self) -> Dict[str, Any]:
 41 |         """Convert to dictionary with safe serialization"""
 42 |         return {
 43 |             "success": self.success,
 44 |             "result": self.result,
 45 |             "data": self._serialize_value(self.data),
 46 |             "error": self.error,
 47 |             "metadata": self._serialize_value(self.metadata)
 48 |         }
 49 | 
 50 | 
 51 | @dataclass
 52 | class AgentSelection:
 53 |     """Result of agent selection process"""
 54 |     selected_agents: List[str]
 55 |     execution_order: List[str]
 56 |     reasoning: str
 57 |     parameters: Optional[Dict[str, Any]] = None
 58 |     fallback_config: Optional[Dict[str, Any]] = None
 59 | 
 60 |     def to_dict(self) -> Dict[str, Any]:
 61 |         """Convert to dictionary"""
 62 |         return {
 63 |             "selected_agents": self.selected_agents,
 64 |             "execution_order": self.execution_order,
 65 |             "reasoning": self.reasoning,
 66 |             "parameters": self.parameters or {},
 67 |             "fallback_config": self.fallback_config
 68 |         }
 69 | 
 70 | 
 71 | @dataclass
 72 | class ExecutionContext:
 73 |     """Context of agent execution"""
 74 |     question: str
 75 |     progress_queue: asyncio.Queue
 76 |     chat_history: ChatHistory
 77 |     metadata: Optional[Dict[str, Any]] = field(default_factory=dict)
 78 | 
 79 |     def to_dict(self) -> Dict[str, Any]:
 80 |         """Convert to dictionary"""
 81 |         # exclude progress_queue
 82 |         return {
 83 |             "question": self.question,
 84 |             "progress_queue": self.progress_queue,
 85 |             "chat_history": self.chat_history,
 86 |             "metadata": self.metadata
 87 |         }
 88 | 
 89 | 
 90 | @dataclass
 91 | class ExecutionResult:
 92 |     """Result of agent execution orchestration"""
 93 |     agent_results: Dict[str, AgentResult]
 94 |     execution_order: List[str]
 95 |     success: bool
 96 |     errors: Dict[str, str] = field(default_factory=dict)
 97 | 
 98 |     def to_dict(self) -> Dict[str, Any]:
 99 |         """Convert to dictionary"""
100 |         return {
101 |             "agent_results": {
102 |                 name: result.to_dict()
103 |                 for name, result in self.agent_results.items()
104 |             },
105 |             "execution_order": self.execution_order,
106 |             "success": self.success,
107 |             "errors": self.errors
108 |         }
109 | 
110 | 
111 | @dataclass
112 | class ChatResult:
113 |     """Final chat processing result"""
114 |     answer: str
115 |     internal_documents: List[RetrieveDocument] = field(default_factory=list)
116 | 
117 | 
118 | @dataclass
119 | class ChatProcess:
120 |     """Chat processing result"""
121 |     step: ChatStep
122 |     data: Union[AgentSelection, ExecutionResult, str, ChatResult]
123 | 
124 | 
125 | @dataclass
126 | class GenerateAnswerResult:
127 |     """Result from answer generation"""
128 |     answer: str
129 |     documents: List[Any] = field(default_factory=list)
130 |     success: bool = True
131 |     confidence: Optional[float] = None
132 |     metadata: Optional[Dict[str, Any]] = None
133 | 
134 | 
135 | 


--------------------------------------------------------------------------------
/maru_lang/pluggable/rerankers/manager.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Reranker: 검색 결과 재정렬
  3 | 프로세스 단위로 모델을 캐싱하여 GPU 자원을 효율적으로 사용
  4 | """
  5 | from typing import Dict, List, Optional, Tuple
  6 | from maru_lang.configs import get_config_manager
  7 | from sentence_transformers import CrossEncoder
  8 | 
  9 | 
 10 | class Reranker:
 11 |     """
 12 |     Reranker 관리자
 13 | 
 14 |     프로세스 내에서 reranker 모델을 캐싱하여 재사용
 15 |     rerank 함수로 검색 결과를 재정렬하는 단순한 인터페이스 제공
 16 |     """
 17 | 
 18 |     def __init__(self, device: Optional[str] = None):
 19 |         """
 20 |         Args:
 21 |             device: 모델을 로드할 디바이스 (None이면 자동 선택)
 22 |                    예: "cuda", "cpu", "mps"
 23 |         """
 24 |         self.device = device
 25 |         self.model_cache: Dict[str, CrossEncoder] = {}
 26 | 
 27 |     def rerank(
 28 |         self,
 29 |         query: str,
 30 |         documents: List[str],
 31 |         model_name: str,
 32 |         top_k: Optional[int] = None,
 33 |     ) -> List[Tuple[int, float]]:
 34 |         """
 35 |         쿼리와 문서들을 재정렬
 36 | 
 37 |         Args:
 38 |             query: 검색 쿼리
 39 |             documents: 재정렬할 문서 리스트
 40 |             model_name: reranker 모델 이름
 41 |             top_k: 상위 k개만 반환 (None이면 전체)
 42 | 
 43 |         Returns:
 44 |             List[Tuple[int, float]]: (원본 인덱스, 점수) 튜플 리스트 (점수 내림차순)
 45 |         """
 46 |         model = self._get_or_load_model(model_name)
 47 | 
 48 |         # 쿼리-문서 쌍 생성
 49 |         pairs = [[query, doc] for doc in documents]
 50 | 
 51 |         # 점수 계산
 52 |         scores = model.predict(pairs)
 53 | 
 54 |         # (인덱스, 점수) 튜플 생성 및 정렬
 55 |         ranked = [(idx, float(score)) for idx, score in enumerate(scores)]
 56 |         ranked.sort(key=lambda x: x[1], reverse=True)
 57 | 
 58 |         # top_k 제한
 59 |         if top_k is not None:
 60 |             ranked = ranked[:top_k]
 61 | 
 62 |         return ranked
 63 | 
 64 |     def _get_or_load_model(self, model_name: str) -> CrossEncoder:
 65 |         """
 66 |         모델 캐싱 및 로드 (내부 메서드)
 67 | 
 68 |         Args:
 69 |             model_name: reranker 모델 이름
 70 | 
 71 |         Returns:
 72 |             CrossEncoder: 로드된 모델 인스턴스
 73 |         """
 74 |         if model_name not in self.model_cache:
 75 |             print(f"Loading reranker model: {model_name}...")
 76 |             self.model_cache[model_name] = CrossEncoder(
 77 |                 model_name, device=self.device
 78 |             )
 79 |             device_info = f"device={self.device}" if self.device else "auto"
 80 |             print(f"✅ Reranker loaded: {model_name} ({device_info})")
 81 | 
 82 |         return self.model_cache[model_name]
 83 | 
 84 |     def unload_model(self, model_name: str) -> bool:
 85 |         """
 86 |         모델을 메모리에서 해제
 87 | 
 88 |         Args:
 89 |             model_name: 해제할 모델 이름
 90 | 
 91 |         Returns:
 92 |             bool: 해제 성공 여부
 93 |         """
 94 |         if model_name in self.model_cache:
 95 |             del self.model_cache[model_name]
 96 |             print(f"🗑️ Reranker unloaded: {model_name}")
 97 |             return True
 98 |         return False
 99 | 
100 |     def clear_cache(self):
101 |         """모든 캐시된 모델 해제"""
102 |         count = len(self.model_cache)
103 |         self.model_cache.clear()
104 |         print(f"🗑️ Cleared {count} reranker model(s) from cache")
105 | 
106 | 
107 | # 싱글톤 인스턴스
108 | _reranker_instance: Optional[Reranker] = None
109 | 
110 | 
111 | def get_reranker(
112 |     device: Optional[str] = None,
113 |     force_new: bool = False,
114 | ) -> Reranker:
115 |     """
116 |     Reranker 싱글톤 인스턴스 반환
117 | 
118 |     Args:
119 |         device: 모델을 로드할 디바이스 (None이면 config에서 자동 로드)
120 |                예: "cuda", "cpu", "mps"
121 |         force_new: True면 기존 인스턴스 무시하고 새로 생성 (테스트용)
122 | 
123 |     Returns:
124 |         Reranker: 싱글톤 인스턴스
125 | 
126 |     Example:
127 |         >>> reranker = get_reranker()
128 |         >>> ranked = reranker.rerank(
129 |         ...     query="python tutorial",
130 |         ...     documents=["doc1", "doc2", "doc3"],
131 |         ...     model_name="BAAI/bge-reranker-v2-m3",
132 |         ...     top_k=5
133 |         ... )
134 |     """
135 |     global _reranker_instance
136 | 
137 |     if _reranker_instance is None or force_new:
138 |         # device가 None이면 config에서 로드 (embedder와 동일한 device 사용)
139 |         if device is None:
140 |             device = _load_device_from_config()
141 | 
142 |         _reranker_instance = Reranker(device=device)
143 | 
144 |     return _reranker_instance
145 | 
146 | 
147 | def _load_device_from_config() -> Optional[str]:
148 |     """
149 |     ConfigManager를 사용하여 config에서 device 설정을 로드합니다.
150 |     Embedder config와 동일한 device 사용
151 | 
152 |     Returns:
153 |         Optional[str]: config에서 읽은 device 설정, 없으면 None
154 |     """
155 |     try:
156 | 
157 |         config_manager = get_config_manager()
158 |         embedder_config = config_manager.get_embedder_config()
159 | 
160 |         if embedder_config:
161 |             return embedder_config.device
162 |     except ImportError:
163 |         pass
164 |     except Exception as e:
165 |         print(f"⚠️ Reranker config 로드 실패: {e}")
166 | 
167 |     return None
168 | 


--------------------------------------------------------------------------------