├── maru_lang
├── py.typed
├── api
│ ├── __init__.py
│ └── endpoints
│ │ ├── __init__.py
│ │ ├── user_group.py
│ │ └── auth.py
├── core
│ ├── __init__.py
│ ├── relation_db
│ │ ├── models
│ │ │ ├── __init__.py
│ │ │ ├── chat.py
│ │ │ ├── auth.py
│ │ │ └── documents.py
│ │ ├── __init__.py
│ │ └── connection.py
│ └── vector_db
│ │ ├── __init__.py
│ │ ├── factory.py
│ │ ├── retrieve_document.py
│ │ └── base.py
├── commands
│ ├── __init__.py
│ ├── transfer.py
│ └── tree.py
├── pipelines
│ ├── __init__.py
│ ├── chat
│ │ └── __init__.py
│ ├── ingest
│ │ └── __init__.py
│ └── base.py
├── schemas
│ ├── __init__.py
│ ├── chat.py
│ ├── auth.py
│ └── ingest.py
├── services
│ ├── __init__.py
│ ├── admin.py
│ ├── chat.py
│ └── ingest.py
├── dependencies
│ ├── __init__.py
│ ├── llm.py
│ ├── chat.py
│ ├── ingest.py
│ ├── email.py
│ └── auth.py
├── pluggable
│ ├── embedders
│ │ ├── __init__.py
│ │ └── manager.py
│ ├── rerankers
│ │ ├── __init__.py
│ │ └── manager.py
│ ├── llms
│ │ └── __init__.py
│ ├── retrievers
│ │ └── __init__.py
│ ├── __init__.py
│ ├── agents
│ │ ├── __init__.py
│ │ ├── builtin
│ │ │ ├── __init__.py
│ │ │ └── intent_extractor.py
│ │ └── agent_factory.py
│ ├── models
│ │ ├── chunker.py
│ │ ├── embedder.py
│ │ ├── reranker.py
│ │ ├── __init__.py
│ │ ├── llm.py
│ │ └── loader.py
│ ├── configs
│ │ ├── __init__.py
│ │ ├── rag_loader.py
│ │ ├── chunker_config.py
│ │ ├── embedder_config.py
│ │ ├── loader_config.py
│ │ └── reranker_config.py
│ ├── chunkers
│ │ ├── base.py
│ │ ├── sentence.py
│ │ └── paragraph.py
│ └── loaders
│ │ ├── txt_parser.py
│ │ ├── markdown_parser.py
│ │ ├── base.py
│ │ ├── pdf_parser.py
│ │ ├── json_parser.py
│ │ ├── docx_parser.py
│ │ ├── html_parser.py
│ │ ├── yaml_parser.py
│ │ ├── xlsx_parser.py
│ │ ├── csv_parser.py
│ │ ├── pptx_parser.py
│ │ └── xml_parser.py
├── enums
│ ├── chat.py
│ ├── agents.py
│ ├── documents.py
│ ├── configs.py
│ ├── auth.py
│ └── __init__.py
├── templates
│ ├── yaml
│ │ ├── embedder_config.yaml
│ │ ├── openai.yaml
│ │ ├── local.yaml
│ │ ├── chunker_config.yaml
│ │ ├── agents
│ │ │ ├── mcps
│ │ │ │ └── agents_firecrawl_mcp.yaml
│ │ │ ├── agents_calculator.yaml
│ │ │ └── builtin
│ │ │ │ ├── agents_knowledge_search.yaml
│ │ │ │ ├── agents_intent_extractor.yaml
│ │ │ │ ├── agents_keyword_extractor.yaml
│ │ │ │ ├── agents_group_classifier.yaml
│ │ │ │ └── agents_response.yaml
│ │ ├── reranker_config.yaml
│ │ ├── loader_config.yaml
│ │ ├── agents_build_selector.yaml
│ │ ├── rag_config.yaml
│ │ ├── llm_reranker.yaml
│ │ └── system_config.yaml
│ └── python
│ │ ├── main.py
│ │ ├── calculator_agent.py
│ │ └── custom_parser.py
├── __init__.py
├── models
│ ├── ingest.py
│ ├── configs
│ │ ├── __init__.py
│ │ └── group.py
│ ├── vector_db.py
│ └── agents.py
├── utils
│ ├── __init__.py
│ ├── distribution.py
│ ├── document.py
│ └── security.py
└── configs
│ └── __init__.py
├── LICENSE
└── pyproject.toml
/maru_lang/py.typed:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/api/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/core/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/commands/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/schemas/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/services/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/api/endpoints/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/dependencies/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .auth import *
2 | from .documents import *
3 | from .chat import *
4 |
--------------------------------------------------------------------------------
/maru_lang/pipelines/chat/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Chat Pipeline
3 | """
4 | from maru_lang.pipelines.chat.pipeline import ChatPipeline
5 |
6 | __all__ = ["ChatPipeline"]
7 |
--------------------------------------------------------------------------------
/maru_lang/pipelines/ingest/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Ingest Pipeline
3 | """
4 | from maru_lang.pipelines.ingest.pipeline import IngestPipeline, IngestResult
5 |
6 | __all__ = ["IngestPipeline", "IngestResult"]
7 |
--------------------------------------------------------------------------------
/maru_lang/core/vector_db/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import VectorDB
2 | from .chroma import ChromaVectorDB
3 | from .retrieve_document import RetrieveDocument
4 |
5 | __all__ = ["VectorDB", "ChromaVectorDB", "RetrieveDocument"]
--------------------------------------------------------------------------------
/maru_lang/pluggable/embedders/__init__.py:
--------------------------------------------------------------------------------
1 | """Embedder for handling embedding models."""
2 |
3 | from .manager import (
4 | Embedder,
5 | get_embedder,
6 | )
7 |
8 | __all__ = [
9 | "Embedder",
10 | "get_embedder",
11 | ]
12 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/rerankers/__init__.py:
--------------------------------------------------------------------------------
1 | """Reranker for handling reranking models."""
2 |
3 | from .manager import (
4 | Reranker,
5 | get_reranker,
6 | )
7 |
8 | __all__ = [
9 | "Reranker",
10 | "get_reranker",
11 | ]
12 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/llms/__init__.py:
--------------------------------------------------------------------------------
1 | # LLM 클라이언트
2 | from .client import LLMServerClient
3 |
4 | # LLM 서버 매니저
5 | from .server_manager import LLMServerManager
6 |
7 | __all__ = [
8 | "LLMServerClient",
9 | "LLMServerManager"
10 | ]
11 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/retrievers/__init__.py:
--------------------------------------------------------------------------------
1 | """Retriever for handling search operations."""
2 |
3 | from .manager import (
4 | Retriever,
5 | get_retriever,
6 | RetriveMethod,
7 | )
8 |
9 | __all__ = [
10 | "Retriever",
11 | "get_retriever",
12 | "RetriveMethod",
13 | ]
14 |
--------------------------------------------------------------------------------
/maru_lang/enums/chat.py:
--------------------------------------------------------------------------------
1 | from enum import Enum
2 |
3 |
4 | class ChatProcessStep(str, Enum):
5 | """Chat processing steps."""
6 | START = "start"
7 | AGENT_SELECTION = "agent_selection"
8 | AGENT_EXECUTION = "agent_execution"
9 | ANSWER_GENERATION = "answer_generation"
10 | COMPLETED = "completed"
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/embedder_config.yaml:
--------------------------------------------------------------------------------
1 | # Embedder Configuration
2 | # Configure embedding models and device preferences
3 |
4 | # Default embedding model for all document groups
5 | # Can be overridden per-group in rag_config.yaml
6 | default_model: BAAI/bge-m3
7 |
8 | # Device selection (null => auto-detect: cuda > mps > cpu)
9 | device: null
10 |
--------------------------------------------------------------------------------
/maru_lang/enums/agents.py:
--------------------------------------------------------------------------------
1 | """
2 | Agent-related enums
3 | """
4 | from enum import Enum
5 |
6 |
7 | class LLMFallbackStrategy(Enum):
8 | """LLM fallback strategies when specified LLM server is not available"""
9 | ANY_AVAILABLE = "any_available" # Use any available LLM server
10 | ERROR = "error" # Raise error and stop execution
--------------------------------------------------------------------------------
/maru_lang/__init__.py:
--------------------------------------------------------------------------------
1 | """MaruLang - Advanced AI Agent Framework with RAG and multi-agent system"""
2 |
3 | __version__ = "0.0.0"
4 |
5 | from maru_lang.app import MaruLangApp, default_app
6 |
7 | # FastAPI app instance
8 | app = default_app.get_fastapi_app()
9 |
10 | __all__ = [
11 | "MaruLangApp",
12 | "default_app",
13 | "app",
14 | "__version__",
15 | ]
--------------------------------------------------------------------------------
/maru_lang/enums/documents.py:
--------------------------------------------------------------------------------
1 | from enum import IntEnum
2 |
3 |
4 | class PermissionAction(IntEnum):
5 | READ = 1
6 | WRITE = 2
7 | MANAGE = 3 # management permission (sync, changing base_path, etc.)
8 |
9 |
10 | class DocumentStatus(IntEnum):
11 | PROCESSING = 1 # processing (waiting for parsing/chunking/embedding)
12 | ACTIVE = 2 # active (embedding complete, searchable)
13 | INACTIVE = 3 # inactive (not searchable)
14 |
15 |
--------------------------------------------------------------------------------
/maru_lang/enums/configs.py:
--------------------------------------------------------------------------------
1 | """
2 | Configuration type enums
3 | """
4 | from enum import Enum
5 |
6 |
7 | class ConfigType(Enum):
8 | """Configuration types"""
9 | LLMS = "llms"
10 | RAGS = "rags" # RAG settings (retriever + groups)
11 | AGENTS = "agents"
12 | LOADERS = "loaders"
13 | CHUNKERS = "chunkers"
14 | EMBEDDERS = "embedders"
15 | RERANKERS = "rerankers"
--------------------------------------------------------------------------------
/maru_lang/pluggable/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Pluggable components for extensibility
3 |
4 | This package contains all pluggable/extensible components:
5 | - loaders: File parsers (txt, pdf, docx, etc.)
6 | - chunkers: Text chunking strategies (paragraph, sentence, etc.)
7 | - embedders: Embedding models management
8 | - rerankers: Result reranking models
9 | - configs: Configuration loaders for pluggable components
10 | - models: Data models for configurations
11 | """
12 |
--------------------------------------------------------------------------------
/maru_lang/enums/auth.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | from enum import Enum
3 |
4 |
5 | class UserRoleCode(Enum):
6 | # Role codes that are created by default (original note: users can also create their own).
7 | EDITOR = 'editor'
8 | ADMIN = 'admin'
9 |
10 | @classmethod
11 | def is_valid_role(cls, role_name: str) -> bool:
12 | try:
13 | cls(role_name) # Enum lookup by value; raises ValueError for an unknown code
14 | return True
15 | except ValueError as e:
16 | print(e) # the lookup error is written to stdout rather than re-raised
17 | return False
18 |
--------------------------------------------------------------------------------
/maru_lang/enums/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Enums for the LLM Chatbot application
3 | """
4 | from .agents import LLMFallbackStrategy
5 | from .auth import UserRoleCode
6 | from .chat import ChatProcessStep
7 | from .configs import ConfigType
8 | from .documents import PermissionAction, DocumentStatus
9 |
10 | __all__ = [
11 | "LLMFallbackStrategy",
12 | "UserRoleCode",
13 | "ChatProcessStep",
14 | "ConfigType",
15 | "PermissionAction",
16 | "DocumentStatus",
17 | ]
--------------------------------------------------------------------------------
/maru_lang/models/ingest.py:
--------------------------------------------------------------------------------
1 | from dataclasses import dataclass
2 | from typing import Optional
3 |
4 |
5 | @dataclass(frozen=True)
6 | class PipelineConfig:
7 | model_name: str
8 | model_dim: int
9 | normalize_ver: str
10 | pooling: str
11 | lang_hint: Optional[str] = None
12 | pipeline_version: Optional[str] = None # for recording in metadata
13 |
14 |
15 | @dataclass(frozen=True)
16 | class ChunkInput:
17 | number: int # page/paragraph/slot index
18 | content: str
19 | meta: Optional[dict] = None
20 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Agent components for the chatbot system
3 | """
4 | from .base import BaseAgent
5 | from .agent_selector import AgentSelector
6 | from .agent_executor import AgentExecutor
7 | from .agent_factory import AgentFactory
8 | from .mcp_client_agent import MCPClientAgent
9 |
10 | __all__ = [
11 | # Core components
12 | "BaseAgent",
13 | "AgentSelector",
14 | "AgentExecutor",
15 | "AgentFactory",
16 | # Individual agents
17 | "DocumentSearchAgent",
18 | "MCPClientAgent",
19 | ]
20 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/chunker.py:
--------------------------------------------------------------------------------
1 | """
2 | Chunker configuration models
3 | """
4 | from dataclasses import dataclass, field
5 | from typing import Dict, Any
6 |
7 |
8 | @dataclass
9 | class ChunkerConfig:
10 | """
11 | Chunker configuration
12 |
13 | Sets the constructor parameters for each chunker
14 | """
15 | # Mapping of chunker name -> constructor parameters
16 | # e.g. {"paragraph": {"max_chunk_size": 500}}
17 | chunkers: Dict[str, Dict[str, Any]] = field(default_factory=dict)
18 |
19 | # Configuration metadata
20 | source_path: str = ""
21 | is_override: bool = False
22 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/openai.yaml:
--------------------------------------------------------------------------------
1 | # OpenAI API basic template
2 |
3 | name: openai
4 | description: "OpenAI API"
5 | url: https://api.openai.com
6 | model_name: gpt-4o-mini
7 | api_key: ${OPENAI_API_KEY}
8 | timeout: 30
9 | enabled: true
10 |
11 | chat_completions_path: /v1/chat/completions
12 | health_check_endpoint: /v1/models
13 |
14 | headers:
15 | Content-Type: application/json
16 |
17 | config:
18 | temperature: 0.7
19 | max_tokens: 2000
20 | top_p: 1.0
21 |
22 | retry:
23 | max_attempts: 3
24 | backoff_factor: 2
25 | max_delay: 60
26 |
27 | log_level: INFO
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/embedder.py:
--------------------------------------------------------------------------------
1 | """
2 | Embedder configuration models
3 | """
4 | from dataclasses import dataclass
5 | from typing import Optional
6 |
7 |
8 | @dataclass
9 | class EmbedderConfig:
10 | """
11 | Embedder configuration
12 |
13 | Embedding model and device settings
14 | """
15 | # Default embedding model (the default for every document group)
16 | # Can be overridden per group in rag_config.yaml
17 | default_model: str = "BAAI/bge-m3"
18 |
19 | # Device (auto-selected when None: cuda > mps > cpu)
20 | device: Optional[str] = None
21 |
22 | # Configuration metadata
23 | source_path: str = ""
24 | is_override: bool = False
25 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/local.yaml:
--------------------------------------------------------------------------------
1 | # OpenAI-Compatible Local LLM Template
2 |
3 | name: local-llm
4 | description: "Local OpenAI-Compatible Server"
5 | url: http://localhost:8000
6 | model_name: meta-llama/Llama-2-7b-chat-hf
7 | api_key: ""
8 | timeout: 60
9 | enabled: true
10 |
11 | chat_completions_path: /v1/chat/completions
12 | health_check_endpoint: /health
13 |
14 | headers:
15 | Content-Type: application/json
16 |
17 | config:
18 | temperature: 0.7
19 | max_tokens: 2048
20 | top_p: 0.95
21 | stream: false
22 |
23 | retry:
24 | max_attempts: 3
25 | backoff_factor: 2
26 | max_delay: 60
27 |
28 | log_level: INFO
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/__init__.py:
--------------------------------------------------------------------------------
1 | """Configuration loaders for pluggable components"""
2 |
3 | from .llm_config import LLMConfigLoader
4 | from .agent_config import AgentConfigLoader
5 | from .loader_config import LoaderConfigLoader
6 | from .chunker_config import ChunkerConfigLoader
7 | from .embedder_config import EmbedderConfigLoader
8 | from .reranker_config import RerankerConfigLoader
9 | from .rag_loader import RagConfigLoader
10 |
11 | __all__ = [
12 | "LLMConfigLoader",
13 | "AgentConfigLoader",
14 | "LoaderConfigLoader",
15 | "ChunkerConfigLoader",
16 | "EmbedderConfigLoader",
17 | "RerankerConfigLoader",
18 | "RagConfigLoader",
19 | ]
20 |
--------------------------------------------------------------------------------
/maru_lang/schemas/chat.py:
--------------------------------------------------------------------------------
1 | from typing import Any
2 | from typing import List, Optional
3 | from datetime import datetime
4 | from pydantic import BaseModel, Field, field_validator
5 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
6 |
7 |
8 | class ChatRequest(BaseModel):
9 | content: str
10 | session_start_time: Optional[datetime] = Field(
11 | default=None,
12 | description="세션 시작 시간") # description text means "session start time"
13 |
14 | class ChatResponse(BaseModel):
15 | answer: str
16 | references: list[RetrieveDocument]
17 |
18 |
19 | class ConversationResponse(BaseModel):
20 | id: int
21 | question: str
22 | answer: str
23 | created_at: datetime
24 |
--------------------------------------------------------------------------------
/maru_lang/utils/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Unified utility module.
3 |
4 | This package exposes shared utility functions used across the project.
5 |
6 | Submodules:
7 | - security: Security and encryption utilities (JWT, AES, etc.)
8 |
9 | """
10 |
11 |
12 | # Security utilities
13 | from .security import (
14 | generate_anonymized_key,
15 | create_jwt_token,
16 | decode_token,
17 | get_key_spec,
18 | aes256_decrypt,
19 | aes256_encrypt
20 | )
21 |
22 | __all__ = [
23 |
24 | # Security helpers
25 | "generate_anonymized_key",
26 | "create_jwt_token",
27 | "decode_token",
28 | "get_key_spec",
29 | "aes256_decrypt",
30 | "aes256_encrypt"
31 | ]
--------------------------------------------------------------------------------
/maru_lang/dependencies/llm.py:
--------------------------------------------------------------------------------
1 | from maru_lang.pluggable.llms import LLMServerClient, LLMServerManager
2 |
3 |
4 | _llm_manager = None
5 |
6 |
7 | async def get_llm_manager() -> LLMServerManager:
8 | """Return the LLMServerManager instance."""
9 | global _llm_manager
10 | if _llm_manager is None:
11 | _llm_manager = LLMServerManager()
12 | # Initialize the servers if they have not been initialized yet
13 | if not _llm_manager.all_servers:
14 | await _llm_manager.initialize_servers()
15 |
16 | return _llm_manager
17 |
18 |
19 | async def get_llm() -> LLMServerClient | None:
20 | """Return one of the active LLM servers."""
21 | manager = await get_llm_manager()
22 |
23 |
24 | return await manager.get_active_server()
25 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import List, Optional
3 | from maru_lang.models.ingest import ChunkInput
4 |
5 |
6 | class BaseChunker(ABC):
7 | """Base interface for text chunking strategies"""
8 |
9 | # Chunker identification info
10 | name: str = "base_chunker"
11 | description: str = "기본 청킹 전략" # runtime string, means "default chunking strategy"
12 |
13 | @abstractmethod
14 | def chunk(self, text: str) -> List[ChunkInput]:
15 | """Take the full text and convert it into a list of ChunkInput"""
16 | pass
17 |
18 | def get_metadata(self) -> dict:
19 | """Return the chunker metadata"""
20 | return {
21 | "chunker_name": self.name,
22 | "chunker_description": self.description,
23 | }
24 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/reranker.py:
--------------------------------------------------------------------------------
1 | """
2 | Reranker configuration models
3 | """
4 | from dataclasses import dataclass
5 | from typing import Optional, Literal
6 |
7 |
8 | @dataclass
9 | class RerankerConfig:
10 | """Reranker configuration - sets the reranker model and whether reranking is enabled"""
11 | enabled: bool = True
12 | method: Literal["model", "agent"] = "model"
13 |
14 | # Method: "model" - embedding-model-based reranking
15 | default_model: str = "BAAI/bge-reranker-v2-m3"
16 |
17 | # Method: "agent" - agent-based reranking (LLM, etc.)
18 | agent_name: Optional[str] = None
19 |
20 | # Maximum number of results to return after reranking (uses the original k when None)
21 | top_k: Optional[int] = 5
22 |
23 | source_path: str = ""
24 | is_override: bool = False
25 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/builtin/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Builtin agents - core system agents
3 | These agents are not customizable by users and are part of the core system
4 | """
5 | from maru_lang.pluggable.agents.builtin.group_classifier import GroupClassifierAgent
6 | from maru_lang.pluggable.agents.builtin.intent_extractor import IntentExtractorAgent
7 | from maru_lang.pluggable.agents.builtin.keyword_extractor import KeywordExtractorAgent
8 | from maru_lang.pluggable.agents.builtin.response_agent import ResponseAgent
9 | from maru_lang.pluggable.agents.builtin.knowledge_search import KnowledgeSearchAgent
10 |
11 | __all__ = [
12 | "GroupClassifierAgent",
13 | "IntentExtractorAgent",
14 | "KeywordExtractorAgent",
15 | "ResponseAgent",
16 | "KnowledgeSearchAgent",
17 | ]
18 |
--------------------------------------------------------------------------------
/maru_lang/models/configs/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Configuration models for the LLM Chatbot application
3 |
4 | Note: Most config models have been moved to pluggable.models
5 | This module now only contains Group configuration which is not pluggable
6 | """
7 | from .group import GroupConfig, GroupsConfig
8 |
9 | # Import pluggable models for backward compatibility
10 | from maru_lang.pluggable.models import (
11 | LLMConfig,
12 | AgentConfig,
13 | LoaderConfig,
14 | ExtensionMapping,
15 | ChunkerConfig,
16 | EmbedderConfig,
17 | RerankerConfig,
18 | )
19 |
20 | __all__ = [
21 | "LLMConfig",
22 | "GroupConfig",
23 | "GroupsConfig",
24 | "AgentConfig",
25 | "LoaderConfig",
26 | "ExtensionMapping",
27 | "ChunkerConfig",
28 | "EmbedderConfig",
29 | "RerankerConfig",
30 | ]
--------------------------------------------------------------------------------
/maru_lang/schemas/auth.py:
--------------------------------------------------------------------------------
1 | from pydantic import BaseModel
2 | from typing import List, Optional
3 |
4 |
5 | class SignUpRequest(BaseModel):
6 | email: str
7 |
8 |
9 | class LogoutRequest(BaseModel):
10 | device_id: str
11 |
12 |
13 | class VerifyCodeRequest(BaseModel):
14 | device_id: str
15 | email: str
16 | code: str
17 |
18 |
19 | class UserResponse(BaseModel):
20 | id: int
21 | email: str
22 | name: Optional[str] = None
23 |
24 | class Config:
25 | from_attributes = True # pydantic option: allow building the model from an object's attributes
26 |
27 |
28 | class UserGroupResponse(BaseModel):
29 | id: int
30 | name: str
31 | manager: Optional[UserResponse] = None
32 | created_at: Optional[str] = None
33 |
34 | class Config:
35 | from_attributes = True # pydantic option: allow building the model from an object's attributes
36 |
37 |
38 | class UserGroupsResponse(BaseModel):
39 | groups: List[UserGroupResponse]
40 | total: int
41 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/__init__.py:
--------------------------------------------------------------------------------
1 | """Configuration data models for pluggable components"""
2 |
3 | from .llm import LLMConfig
4 | from .agent import AgentConfig
5 | from .loader import LoaderConfig, ExtensionMapping
6 | from .chunker import ChunkerConfig
7 | from .embedder import EmbedderConfig
8 | from .reranker import RerankerConfig
9 | from .rag import (
10 | RagConfig,
11 | RetrieverConfig,
12 | GroupRagConfig,
13 | QueryTypeWeights,
14 | FallbackLogicConfig,
15 | GroupComponents,
16 | )
17 |
18 | __all__ = [
19 | "LLMConfig",
20 | "AgentConfig",
21 | "LoaderConfig",
22 | "ExtensionMapping",
23 | "ChunkerConfig",
24 | "EmbedderConfig",
25 | "RerankerConfig",
26 | "RagConfig",
27 | "RetrieverConfig",
28 | "GroupRagConfig",
29 | "QueryTypeWeights",
30 | "FallbackLogicConfig",
31 | "GroupComponents",
32 | ]
33 |
--------------------------------------------------------------------------------
/maru_lang/services/admin.py:
--------------------------------------------------------------------------------
1 | """
2 | Admin user management service
3 | """
4 | from maru_lang.core.relation_db.models.auth import User
5 |
6 |
7 | ADMIN_EMAIL = "admin@maru.local"
8 | ADMIN_NAME = "Admin"
9 |
10 |
11 | async def get_or_create_admin_user() -> User:
12 | """
13 | Fetch the admin user, creating it if it does not exist.
14 | CLI commands run as the admin user by default.
15 |
16 | Returns:
17 | Admin User instance
18 | """
19 | admin_user = await User.get_or_none(email=ADMIN_EMAIL)
20 |
21 | if admin_user is None:
22 | admin_user = await User.create(
23 | email=ADMIN_EMAIL,
24 | name=ADMIN_NAME,
25 | )
26 |
27 | return admin_user
28 |
29 |
30 | async def ensure_admin_user() -> User:
31 | """
32 | Ensure the admin user exists and return it.
33 | Called during DB initialization.
34 |
35 | Returns:
36 | Admin User instance
37 | """
38 | return await get_or_create_admin_user()
39 |
--------------------------------------------------------------------------------
/maru_lang/templates/python/main.py:
--------------------------------------------------------------------------------
1 | """
2 | Custom MaruLang Application
3 | This file was generated by maru install command.
4 | """
5 | from maru_lang import MaruLangApp
6 |
7 | # Create your custom MaruLang instance
8 | app = MaruLangApp(
9 | title="My MaruLang App",
10 | version="1.0.0",
11 | description="Custom MaruLang Application"
12 | )
13 |
14 | # You can customize the app here
15 | # For example:
16 | # - Add custom startup events
17 | # - Add custom routes
18 | # - Add middleware
19 | # - Configure CORS settings
20 |
21 | @app.on_event("startup")
22 | async def custom_startup():
23 | """Custom startup event"""
24 | print("🚀 Custom MaruLang app started!")
25 |
26 | # Optional: Add custom routes
27 | # @app.get("/custom-health")
28 | # async def custom_health_check():
29 | # return {"status": "healthy", "custom": True}
30 |
31 | # The app instance will be imported by the serve command
32 | # Usage: maru serve --app-module main:app
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/chunker_config.yaml:
--------------------------------------------------------------------------------
1 | # Chunker Configuration
2 | # Configure constructor parameters for each chunker.
3 | #
4 | # Available built-in chunkers:
5 | # - paragraph: chunk by paragraph (split on blank lines)
6 | # - sentence: chunk by sentence and merge when needed
7 | # - fixed_size: chunk by fixed size (supports overlap)
8 | #
9 | # Add custom chunkers under the chunkers/ directory.
10 |
11 | # Chunker-specific settings
12 | chunkers:
13 | # Paragraph-based chunker
14 | paragraph:
15 | max_chunk_size: 2000
16 |
17 | # Sentence-based chunker
18 | sentence:
19 | max_chunk_size: 500
20 |
21 | # Fixed-size chunker
22 | fixed_size:
23 | chunk_size: 1000
24 | overlap: 200
25 |
26 | # Example for custom chunkers:
27 | # chunkers:
28 | # header: # Markdown header-based chunker
29 | # max_level: 3
30 | #
31 | # page: # PDF page-based chunker
32 | # combine_small_pages: true
33 | # min_page_chars: 100
34 |
--------------------------------------------------------------------------------
/maru_lang/models/configs/group.py:
--------------------------------------------------------------------------------
1 | """
2 | Group configuration models
3 | """
4 | from dataclasses import dataclass, field
5 | from typing import Dict, Any, List
6 |
7 |
8 | @dataclass
9 | class GroupConfig:
10 | """Group configuration for chatbot categorization"""
11 | name: str
12 | description: str = ""
13 | force_rag: bool = False
14 | permissions: List[str] = field(default_factory=list)
15 | prompts: List[str] = field(default_factory=list)
16 | priority: str = "normal" # high, normal, low
17 | weight: float = 1.0
18 | settings: Dict[str, Any] = field(default_factory=dict)
19 | source_path: str = ""
20 | is_override: bool = False
21 |
22 |
23 | @dataclass
24 | class GroupsConfig:
25 | """Complete groups configuration including priorities"""
26 | group_priorities: Dict[str, Any] = field(default_factory=dict)
27 | groups: Dict[str, GroupConfig] = field(default_factory=dict)
28 | tool_choice_reason: Dict[str, str] = field(default_factory=dict)
29 | source_path: str = ""
30 | is_override: bool = False
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2025 ML2
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/chat.py:
--------------------------------------------------------------------------------
1 | from tortoise.models import Model
2 | from tortoise import fields
3 | from datetime import datetime, timezone
4 | from enum import IntEnum
5 |
6 |
7 | class Conversation(Model):
8 | id = fields.IntField(pk=True)
9 | user = fields.ForeignKeyField(
10 | "models.User", related_name="conversations", on_delete=fields.OnDelete.CASCADE)
11 | question = fields.TextField() # user question
12 | enhanced_question = fields.TextField(null=True) # expanded question
13 | answer = fields.TextField() # AI answer
14 | metadata = fields.JSONField(default={}) # API call info, token count, processing time, etc.
15 | created_at = fields.DatetimeField(auto_now_add=True)
16 |
17 | class Meta:
18 | table = "conversation"
19 |
20 |
class ConversationReference(Model):
    # Link row between a conversation and a document it referenced.
    id = fields.IntField(pk=True)

    # Conversation that cited the document; removed with the conversation.
    conversation = fields.ForeignKeyField(
        "models.Conversation",
        related_name="references",
        on_delete=fields.OnDelete.CASCADE,
    )

    # Cited document; removed with the document.
    document = fields.ForeignKeyField(
        "models.Document",
        related_name="conversation_references",
        on_delete=fields.OnDelete.CASCADE,
    )

    # Retrieval relevance score.
    score = fields.FloatField()

    class Meta:
        table = "conversation_reference"
35 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/llm.py:
--------------------------------------------------------------------------------
1 | """
2 | LLM configuration models
3 | """
4 | from dataclasses import dataclass, field
5 | from typing import Dict, Any, Optional
6 |
7 |
@dataclass
class LLMConfig:
    """
    Settings describing a single LLM server.

    Only ``name`` and ``url`` are required; every other field has a default.
    An ``api_key`` written as ``${ENV_VAR}`` is replaced by the value of that
    environment variable right after construction (see ``__post_init__``).
    """
    # Required identification
    name: str
    url: str

    # Basic connection settings
    model_name: str = ""
    description: str = ""
    api_key: Optional[str] = None
    timeout: float = 30.0
    enabled: bool = True
    max_retries: int = 3
    health_check_endpoint: str = "/health"
    chat_completions_path: str = "/v1/chat/completions"

    # Free-form option groups; each instance gets its own empty dict
    headers: Dict[str, str] = field(default_factory=dict)
    config: Dict[str, Any] = field(default_factory=dict)
    health_check: Dict[str, Any] = field(default_factory=dict)
    cost_tracking: Dict[str, Any] = field(default_factory=dict)
    limits: Dict[str, Any] = field(default_factory=dict)
    retry: Dict[str, Any] = field(default_factory=dict)

    # Bookkeeping
    log_level: str = "INFO"
    source_path: str = ""
    is_override: bool = False

    def __post_init__(self):
        """Resolve an ``api_key`` of the form ``${NAME}`` from the environment."""
        raw_key = self.api_key
        if not raw_key:
            return

        is_placeholder = raw_key.startswith('${') and raw_key.endswith('}')
        if not is_placeholder:
            return

        import os

        # Strip the leading "${" and trailing "}"; an unset variable yields None.
        variable_name = raw_key[2:-1]
        self.api_key = os.getenv(variable_name)
--------------------------------------------------------------------------------
/maru_lang/pluggable/models/loader.py:
--------------------------------------------------------------------------------
1 | """
2 | Loader configuration models
3 | """
4 | from dataclasses import dataclass, field
5 | from typing import Dict, Optional, Any
6 |
7 |
@dataclass
class ExtensionMapping:
    """Loader and chunker names assigned to one file extension."""
    # Name of the loader (parser) to use
    loader: str
    # Name of the chunker to use
    chunker: str
13 |
14 |
@dataclass
class LoaderConfig:
    """
    Loader configuration.

    Describes which loader (parser) and chunker to use for each file
    extension.
    """
    # Defaults used when an extension has no explicit mapping.
    # When default_loader is None only registered extensions are processed
    # (whitelist mode).
    default_loader: Optional[str] = None
    default_chunker: Optional[str] = "paragraph"

    # extension -> {loader, chunker} mapping
    # e.g. {".pdf": {"loader": "pdf", "chunker": "paragraph"}}
    extensions: Dict[str, ExtensionMapping] = field(default_factory=dict)

    # Configuration metadata
    source_path: str = ""
    is_override: bool = False

    def __post_init__(self):
        """Convert plain-dict entries of ``extensions`` into ``ExtensionMapping``."""
        # Entries that are not dicts are kept exactly as supplied.
        self.extensions = {
            ext: ExtensionMapping(**spec) if isinstance(spec, dict) else spec
            for ext, spec in self.extensions.items()
        }
45 |
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/__init__.py:
--------------------------------------------------------------------------------
1 | from .connection import get_register_orm, orm_context
2 |
3 |
4 | # def get_tortoise_orm():
5 | # """
6 | # Get Tortoise ORM configuration lazily.
7 | #
8 | # This function is called by Aerich when needed, avoiding issues with
9 | # configuration loading at import time.
10 | # """
11 | # from maru_lang.configs.system_config import get_system_config
12 | #
13 | # config = get_system_config()
14 | # if not config:
15 | # raise RuntimeError(
16 | # "System configuration not found. Please run 'maru install' first."
17 | # )
18 | #
19 | # return {
20 | # "connections": {"default": config.database.get_database_url()},
21 | # "apps": {
22 | # "models": {
23 | # "models": ["maru_lang.models", "aerich.models"],
24 | # "default_connection": "default",
25 | # },
26 | # },
27 | # "use_tz": True,
28 | # }
29 |
30 |
31 | # Tortoise ORM configuration for Aerich
32 | # This is evaluated lazily - only accessed when needed by Aerich commands
33 | # try:
34 | # TORTOISE_ORM = get_tortoise_orm()
35 | # except RuntimeError:
36 | # # If config not available at import time, set to None
37 | # # It will be initialized later when needed
38 | # TORTOISE_ORM = None
39 |
40 | __all__ = [
41 | "get_register_orm",
42 | "orm_context",
43 | # "TORTOISE_ORM",
44 | # "get_tortoise_orm",
45 | ]
46 |
--------------------------------------------------------------------------------
/maru_lang/schemas/ingest.py:
--------------------------------------------------------------------------------
1 | from typing import List, Optional
2 | from datetime import datetime
3 | from pydantic import BaseModel, Field
4 |
5 |
class FileInfo(BaseModel):
    """Individual file information for sync check"""
    # All four fields are required (Ellipsis default).
    fileName: str = Field(
        ...,
        description="파일 이름",
    )
    createdAt: datetime = Field(
        ...,
        description="파일 생성 시간",
    )
    relativePath: str = Field(
        ...,
        description="상대 경로 (프로젝트폴더명/경로/파일명)",
    )
    size: int = Field(
        ...,
        description="파일 크기 (bytes)",
    )
12 |
13 |
class SyncCheckRequest(BaseModel):
    """Request for checking which files need to be uploaded"""
    # Required: project folder name and the per-file information list.
    folderPath: str = Field(
        ...,
        description="프로젝트 폴더명",
    )
    files: List[FileInfo] = Field(
        ...,
        description="폴더 내 파일 정보 목록",
    )
    # Optional free-text description; defaults to None when omitted.
    description: Optional[str] = Field(
        default=None,
        description="DocumentGroup 설명",
    )
19 |
20 |
class SyncCheckResponse(BaseModel):
    """Response for sync check"""
    # All fields are required.
    filesToUpload: List[str] = Field(
        ...,
        description="업로드가 필요한 파일의 relativePath 목록",
    )
    totalFiles: int = Field(
        ...,
        description="전체 파일 개수",
    )
    message: str = Field(
        ...,
        description="상태 메시지",
    )
26 |
27 |
class SyncUploadResponse(BaseModel):
    """Response for batch upload"""
    success: bool = Field(
        ...,
        description="업로드 성공 여부",
    )
    message: str = Field(
        ...,
        description="상태 메시지 (예: '배치 1/4 업로드 완료')",
    )
    uploadedCount: int = Field(
        ...,
        description="업로드된 파일 개수",
    )
    # Optional list of error messages; None when not supplied.
    errors: Optional[List[str]] = Field(
        default=None,
        description="에러 메시지 목록 (있는 경우)",
    )
34 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/txt_parser.py:
--------------------------------------------------------------------------------
1 | """Plain text file parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class TxtParser(BaseParser):
    """Parser for plain text files."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read a text file and return its content.

        Args:
            file_path: Path of the text file to parse

        Returns:
            ParseResult: The file text plus metadata (file type, encoding,
            size in bytes)

        Raises:
            FileNotFoundError: If the path does not exist
            ValueError: If the file is not valid UTF-8 or cannot be read
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            text = file_path.read_text(encoding='utf-8')
            return ParseResult(
                content=text,
                metadata={
                    'file_type': 'text',
                    'encoding': 'utf-8',
                    'file_size': file_path.stat().st_size,
                },
            )
        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
        except Exception as e:
            # Any other failure is reported as a generic read error.
            raise ValueError(f"파일 읽기 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's extension (case-insensitive) is supported."""
        extension = file_path.suffix.lower()
        return extension in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Text file extensions handled by this parser."""
        return ['.txt', '.text', '.log']
48 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/mcps/agents_firecrawl_mcp.yaml:
--------------------------------------------------------------------------------
1 | name: firecrawl_mcp
2 | description: "Firecrawl MCP - Web scraping, crawling, and discovery. If crawling or searching is required, it finds information on the web."
3 | type: mcp_client
4 | enabled: false
5 | version: "1.0.0"
6 |
7 | # Agent Tags
8 | tags:
9 | - mcp
10 | - firecrawl
11 | - tools
12 | - files
13 |
14 | # LLM Server Settings (Optimized for MCP tool usage)
15 | target_llm_config:
16 | server_name: "openai" # Model suitable for tool usage
17 |
18 | override_params:
19 | temperature: 0.1 # Low temperature for accurate tool usage
20 | max_tokens: 2000
21 | top_p: 0.8
22 |
23 | fallback_strategy: "error" # MCP agents require an LLM
24 |
25 | # MCP Server Connection Settings
26 | mcp_config:
27 | # Firecrawl MCP server configuration
28 | transport: "stdio"
29 | command: "npx"
30 | args: ["-y", "firecrawl-mcp"]
31 | env:
32 | FIRECRAWL_API_KEY: ${FIRECRAWL_API_KEY}
33 | timeout: 30
34 |
35 | # Prompt Settings
36 | prompts:
37 | system_prompt: |
38 | You are an agent that uses web scraping, crawling, and search tools.
39 | Perform web scraping, crawling, and search according to the user's requests.
40 |
41 | Available tools:
42 | - web_scraping: Web scraping
43 | - crawling: Crawling
44 | - searching: Search
45 | user_prompt_template: |
46 | {question}
47 |
48 | # Agent Execution Settings
49 | config:
50 | timeout: 60 # Scraping and crawling can take time
51 | retry_count: 2
52 | max_context_length: 8000 # Scraped page contents can be long
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/markdown_parser.py:
--------------------------------------------------------------------------------
1 | """Markdown file parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class MarkdownParser(BaseParser):
    """Parser for Markdown files."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read a Markdown file and return its content unchanged.
        (Further processing such as HTML conversion can be added later.)

        Args:
            file_path: Path of the Markdown file to parse

        Returns:
            ParseResult: The file text plus metadata (file type, encoding,
            size in bytes)

        Raises:
            FileNotFoundError: If the path does not exist
            ValueError: If the file is not valid UTF-8 or cannot be read
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            text = file_path.read_text(encoding='utf-8')
            return ParseResult(
                content=text,
                metadata={
                    'file_type': 'markdown',
                    'encoding': 'utf-8',
                    'file_size': file_path.stat().st_size,
                },
            )
        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
        except Exception as e:
            # Any other failure is reported as a generic read error.
            raise ValueError(f"파일 읽기 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's extension (case-insensitive) is supported."""
        extension = file_path.suffix.lower()
        return extension in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Markdown file extensions handled by this parser."""
        return ['.md', '.markdown', '.mdown', '.mkd']
49 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/sentence.py:
--------------------------------------------------------------------------------
1 | import re
2 | from typing import List
3 | from maru_lang.models.ingest import ChunkInput
4 | from .base import BaseChunker
5 |
6 |
class SentenceChunker(BaseChunker):
    """
    Chunk text sentence by sentence (split after '.', '!' or '?'), packing
    consecutive sentences into chunks of at most ``max_chunk_size`` characters.
    """

    name = "sentence"
    description = "문장 단위로 청킹하고 최대 크기에 맞춰 병합"

    def __init__(self, max_chunk_size: int = 500):
        """
        Args:
            max_chunk_size: Maximum number of characters per chunk
        """
        self.max_chunk_size = max_chunk_size

    def chunk(self, text: str) -> List[ChunkInput]:
        """
        Split ``text`` into sentences and merge them into numbered chunks.

        Sentences are joined with a single space. A sentence that is longer
        than ``max_chunk_size`` on its own is not split further; it is emitted
        as one oversized chunk.

        Args:
            text: Text to chunk

        Returns:
            List[ChunkInput]: Chunks numbered from 1, in document order
        """
        # Split on the whitespace that follows sentence-ending punctuation.
        # The look-behind keeps the punctuation attached to its sentence;
        # splitting on the punctuation itself would drop it from the chunks.
        sentence_boundary = r'(?<=[.!?])\s+'
        sentences = [
            part.strip()
            for part in re.split(sentence_boundary, text)
            if part.strip()
        ]

        chunks = []
        current_chunk = []
        current_size = 0

        for sentence in sentences:
            # Count the joining space too, so that the emitted content
            # (' '.join(...)) stays within max_chunk_size.
            separator_len = 1 if current_chunk else 0
            projected_size = current_size + separator_len + len(sentence)

            if current_chunk and projected_size > self.max_chunk_size:
                # Flush the current chunk and start a new one with this sentence
                chunks.append(ChunkInput(
                    number=len(chunks) + 1,
                    content=' '.join(current_chunk)
                ))
                current_chunk = []
                projected_size = len(sentence)

            current_chunk.append(sentence)
            current_size = projected_size

        # Last chunk
        if current_chunk:
            chunks.append(ChunkInput(
                number=len(chunks) + 1,
                content=' '.join(current_chunk)
            ))

        return chunks
50 |
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/connection.py:
--------------------------------------------------------------------------------
1 | from tortoise import Tortoise
2 | from tortoise.contrib.fastapi import RegisterTortoise
3 | from functools import partial
4 | from maru_lang.configs.system_config import get_system_config
5 | from contextlib import asynccontextmanager
6 | from typing import Awaitable, Callable
7 | import asyncio
8 |
9 |
10 |
def run_with_orm_context(coro: Callable[..., Awaitable], *args, **kwargs):
    """
    Synchronously run an async callable inside ``orm_context``.

    Args:
        coro: Async callable to invoke
        *args: Positional arguments forwarded to ``coro``
        **kwargs: Keyword arguments forwarded to ``coro``

    Returns:
        Whatever ``coro(*args, **kwargs)`` returns once awaited
    """
    async def _call_inside_orm():
        async with orm_context():
            outcome = await coro(*args, **kwargs)
        return outcome

    # asyncio.run creates an event loop for this single call.
    return asyncio.run(_call_inside_orm())
16 |
17 |
def get_register_orm():
    """
    Build a pre-configured ``RegisterTortoise`` factory.

    Returns:
        A ``functools.partial`` of ``RegisterTortoise`` with the database URL,
        model modules and schema/exception-handler options already bound.
    """
    database_url = get_system_config().database.get_database_url()
    model_modules = ["maru_lang.core.relation_db.models", "aerich.models"]

    # Return a partial so the remaining arguments can be supplied later.
    return partial(
        RegisterTortoise,
        generate_schemas=True,
        add_exception_handlers=True,
        db_url=database_url,
        modules={"models": model_modules},
        use_tz=True,
    )
30 |
31 |
@asynccontextmanager
async def orm_context():
    """
    Async context manager that initialises Tortoise ORM for the duration of
    the ``async with`` block.

    On entry it initialises the ORM with the configured database URL,
    generates the schemas and calls ``ensure_admin_user``. On exit the
    connections are closed. They are also closed when schema generation or
    admin creation fails, so a failed start-up does not leave connections open.
    """
    config = get_system_config()

    await Tortoise.init(
        db_url=config.database.get_database_url(),
        modules={"models": [
            "maru_lang.core.relation_db.models", "aerich.models"]},
        use_tz=True,
    )

    # Everything after a successful init is covered by the finally clause.
    try:
        await Tortoise.generate_schemas()

        # Create the admin user automatically.
        # Imported here rather than at module level, as in the original code.
        from maru_lang.services.admin import ensure_admin_user
        await ensure_admin_user()

        yield
    finally:
        await Tortoise.close_connections()
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/base.py:
--------------------------------------------------------------------------------
1 | """Base parser interface for document parsing."""
2 |
3 | from abc import ABC, abstractmethod
4 | from pathlib import Path
5 | from typing import Optional
6 | from dataclasses import dataclass
7 |
8 |
@dataclass
class ParseResult:
    """Container for the outcome of parsing one file."""
    # Extracted text
    content: str
    # Optional metadata; replaced by a fresh empty dict when omitted
    metadata: Optional[dict] = None

    def __post_init__(self):
        """Guarantee that ``metadata`` is a dict rather than None."""
        self.metadata = {} if self.metadata is None else self.metadata
18 |
19 |
class BaseParser(ABC):
    """Base interface for document parsers."""

    @property
    def default_chunker_name(self) -> Optional[str]:
        """
        Name of this parser's default chunker.

        Returns:
            Optional[str]: Chunker name (None means: use the global default
            chunker)
        """
        # Default: None (use the global default chunker)
        return None

    @abstractmethod
    def parse(self, file_path: Path) -> ParseResult:
        """
        Parse a file and extract its text content.

        Args:
            file_path: Path of the file to parse

        Returns:
            ParseResult: Parsed text and metadata

        Raises:
            ValueError: If the file cannot be read or parsed
            FileNotFoundError: If the file does not exist
        """
        ...

    @abstractmethod
    def supports(self, file_path: Path) -> bool:
        """
        Check whether this parser supports the given file.

        Args:
            file_path: Path of the file to check

        Returns:
            bool: Whether the file is supported
        """
        ...

    @property
    @abstractmethod
    def supported_extensions(self) -> list[str]:
        """
        File extensions supported by this parser.

        Returns:
            list[str]: Supported extensions (e.g. ['.txt', '.text'])
        """
        ...
73 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/reranker_config.yaml:
--------------------------------------------------------------------------------
1 | # Reranker Configuration
2 | # Configure reranker usage and behavior
3 |
4 | # Enable or disable the reranker
5 | enabled: true
6 |
7 | # Reranking method
8 | # - "model": embedding-model-based reranking (fast, low cost)
9 | # - "agent": agent-based reranking (LLM-powered, higher accuracy)
10 | method: agent
11 |
12 | # ============================================================
13 | # Method: "agent" - agent-based reranking (LLM)
14 | # ============================================================
15 |
16 | # Agent name (define in agent_config.yaml)
17 | agent_name: llm_reranker
18 |
19 | # ============================================================
20 | # Method: "model" - embedding-model-based reranking
21 | # ============================================================
22 |
23 | # Default reranker model (only used when method: model)
24 | default_model: BAAI/bge-reranker-v2-m3
25 |
26 | # Maximum number of documents to return after reranking
27 | # - If not set (or null), returns the same number as the original search results
28 | # - Use this to retrieve more documents (k=20) and rerank to top-k (top_k=5) for better quality
29 | top_k: 5
30 |
31 | # Example: Using LLM-based reranking (default)
32 | # 1. The llm_reranker agent is already included in rerankers/
33 | # 2. Make sure to register it in configs/agent_config.yaml (already done if using default)
34 | # 3. Set method: agent and agent_name: llm_reranker (already set above)
35 | #
36 | # Example: Switching to model-based reranking
37 | # 1. Set method: model
38 | # 2. Configure default_model (already set above)
39 | # 3. Comment out agent_name
40 | #
41 | # Example: Creating a custom reranker agent
42 | # 1. Create my_reranker.py in configs/rerankers/ (inherits BaseAgent)
43 | # 2. Create my_reranker.yaml in configs/rerankers/ with prompts and tools
44 | # 3. Register in configs/agent_config.yaml:
45 | # my_reranker:
46 | # type: custom
47 | # file: rerankers/my_reranker.py
48 | # config: rerankers/my_reranker.yaml
49 | # 4. Set agent_name: my_reranker
50 |
--------------------------------------------------------------------------------
/maru_lang/configs/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Unified configuration management system
3 |
4 | Note: Most config loaders have been moved to pluggable.configs
5 | This module provides backward compatibility imports and manages non-pluggable configs
6 | """
7 | from .base import DefaultConfigLoader
8 | from .manager import ConfigManager, get_config_manager
9 | from .diff_checker import check_config_differences, ConfigDiffChecker
10 |
11 | # Import pluggable configs for backward compatibility
12 | from maru_lang.pluggable.configs import (
13 | LLMConfigLoader,
14 | AgentConfigLoader,
15 | LoaderConfigLoader,
16 | ChunkerConfigLoader,
17 | EmbedderConfigLoader,
18 | RerankerConfigLoader,
19 | RagConfigLoader,
20 | )
21 |
22 | # Import models for convenience
23 | from maru_lang.models.configs import (
24 | LLMConfig,
25 | GroupConfig,
26 | GroupsConfig,
27 | AgentConfig,
28 | LoaderConfig,
29 | ChunkerConfig,
30 | EmbedderConfig,
31 | RerankerConfig,
32 | )
33 |
34 | # Import RAG models
35 | from maru_lang.pluggable.models import (
36 | RagConfig,
37 | RetrieverConfig,
38 | GroupRagConfig,
39 | )
40 |
41 | __all__ = [
42 | # Base
43 | 'DefaultConfigLoader',
44 |
45 | # RAG (replaces Group)
46 | 'RagConfig',
47 | 'RetrieverConfig',
48 | 'GroupRagConfig',
49 | 'RagConfigLoader',
50 |
51 | # Backward compatibility - Group (deprecated, use RAG instead)
52 | 'GroupConfig',
53 | 'GroupsConfig',
54 |
55 | # Pluggable configs (re-exported for convenience)
56 | 'LLMConfig',
57 | 'LLMConfigLoader',
58 | 'AgentConfig',
59 | 'AgentConfigLoader',
60 | 'LoaderConfig',
61 | 'LoaderConfigLoader',
62 | 'ChunkerConfig',
63 | 'ChunkerConfigLoader',
64 | 'EmbedderConfig',
65 | 'EmbedderConfigLoader',
66 | 'RerankerConfig',
67 | 'RerankerConfigLoader',
68 |
69 | # Config Manager
70 | 'ConfigManager',
71 | 'get_config_manager',
72 |
73 | # Config Diff Checker
74 | 'check_config_differences',
75 | 'ConfigDiffChecker',
76 | ]
77 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/pdf_parser.py:
--------------------------------------------------------------------------------
1 | """PDF file parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class PDFParser(BaseParser):
    """Parser for PDF files (uses PyPDF2)."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Extract text from a PDF file.

        The text of every page is extracted and the pages are joined with a
        blank line.

        Args:
            file_path: Path of the PDF file to parse

        Returns:
            ParseResult: Extracted text plus metadata (file type, page count,
            size in bytes)

        Raises:
            FileNotFoundError: If the path does not exist
            ValueError: If anything fails while parsing, including PyPDF2
                not being installed (the original error is chained)
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            # Uses PyPDF2 (could be switched to pdfplumber later)
            try:
                import PyPDF2
            except ImportError:
                raise ImportError(
                    "PyPDF2가 설치되지 않았습니다. 'pip install PyPDF2'로 설치하세요."
                )

            with open(file_path, 'rb') as pdf_file:
                reader = PyPDF2.PdfReader(pdf_file)
                page_count = len(reader.pages)

                # Extract text from every page while the file is still open
                page_texts = [page.extract_text() for page in reader.pages]

            return ParseResult(
                content='\n\n'.join(page_texts),
                metadata={
                    'file_type': 'pdf',
                    'num_pages': page_count,
                    'file_size': file_path.stat().st_size,
                },
            )

        except Exception as e:
            raise ValueError(f"PDF 파싱 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's extension (case-insensitive) is supported."""
        extension = file_path.suffix.lower()
        return extension in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """PDF file extensions handled by this parser."""
        return ['.pdf']
63 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/loader_config.yaml:
--------------------------------------------------------------------------------
1 | # Loader Configuration
2 | # Map loaders (parsers) and chunkers by file extension.
3 |
4 | # Default loader/chunker when no extension mapping is provided
5 | # 💡 Tip: Comment out default_loader to ONLY process registered extensions
6 | # (automatically ignores .DS_Store, .git, etc.)
7 | # default_loader: txt
8 | default_chunker: paragraph
9 |
10 | # Extension-specific mappings (declare special cases only)
11 | extensions:
12 | # Text formats
13 | .txt:
14 | loader: txt
15 | chunker: paragraph
16 |
17 | .md:
18 | loader: markdown
19 | chunker: paragraph
20 |
21 | .markdown:
22 | loader: markdown
23 | chunker: paragraph
24 |
25 | # Document formats
26 | .pdf:
27 | loader: pdf
28 | chunker: paragraph
29 |
30 | .docx:
31 | loader: docx
32 | chunker: paragraph
33 |
34 | .pptx:
35 | loader: pptx
36 | chunker: paragraph
37 |
38 | .xlsx:
39 | loader: xlsx
40 | chunker: paragraph
41 |
42 | .xlsm:
43 | loader: xlsx
44 | chunker: paragraph
45 |
46 | # Web formats
47 | .html:
48 | loader: html
49 | chunker: paragraph
50 |
51 | .htm:
52 | loader: html
53 | chunker: paragraph
54 |
55 | .xhtml:
56 | loader: xml
57 | chunker: paragraph
58 |
59 | # Data formats
60 | .json:
61 | loader: json
62 | chunker: paragraph
63 |
64 | .jsonl:
65 | loader: json
66 | chunker: paragraph
67 |
68 | .yaml:
69 | loader: yaml
70 | chunker: paragraph
71 |
72 | .yml:
73 | loader: yaml
74 | chunker: paragraph
75 |
76 | .xml:
77 | loader: xml
78 | chunker: paragraph
79 |
80 | .csv:
81 | loader: csv
82 | chunker: paragraph
83 |
84 | .tsv:
85 | loader: csv
86 | chunker: paragraph
87 |
88 | # Code formats
89 | .py:
90 | loader: txt
91 | chunker: paragraph
92 |
93 | .js:
94 | loader: txt
95 | chunker: paragraph
96 |
97 | .ts:
98 | loader: txt
99 | chunker: paragraph
100 |
--------------------------------------------------------------------------------
/maru_lang/dependencies/chat.py:
--------------------------------------------------------------------------------
1 | """
2 | Chat Pipeline dependency
3 | """
4 | from typing import Optional
5 | from maru_lang.pluggable.agents.agent_executor import AgentExecutor
6 | from maru_lang.pluggable.agents.agent_factory import AgentFactory
7 | from maru_lang.pluggable.agents.agent_selector import AgentSelector
8 | from maru_lang.pipelines.chat import ChatPipeline
9 |
10 |
class ChatPipelineManager:
    """
    Holder for a single shared ChatPipeline instance.

    Creation is attempted only once: the outcome of the first attempt (a
    pipeline, or None on failure) is cached until ``reset`` is called.
    """
    _instance: Optional[ChatPipeline] = None
    _initialized: bool = False

    @classmethod
    def get_instance(cls) -> ChatPipeline | None:
        """Return the cached ChatPipeline, creating it on first use."""
        if cls._initialized:
            return cls._instance

        cls._instance = cls._create_pipeline()
        cls._initialized = True
        return cls._instance

    @classmethod
    def _create_pipeline(cls) -> ChatPipeline | None:
        """
        Build a ChatPipeline together with its dependencies.

        Returns:
            The new pipeline, or None when construction fails (the error is
            printed rather than raised)
        """
        try:
            # Build every agent described in the configuration
            configured_agents = AgentFactory().create_agents_from_config()

            # Create the executor and register each agent with it
            executor = AgentExecutor()
            for configured_agent in configured_agents.values():
                executor.register_agent(configured_agent)

            # Create the selector
            selector = AgentSelector()

            if not (selector and executor):
                return None

            return ChatPipeline(selector, executor)
        except Exception as e:
            print(f"❌ Failed to create ChatPipeline: {e}")
            return None

    @classmethod
    def reset(cls):
        """Forget the cached instance (useful for testing)."""
        cls._initialized = False
        cls._instance = None
52 |
53 |
async def get_chat_pipeline() -> ChatPipeline | None:
    """Dependency returning the shared ChatPipeline (None if creation failed)."""
    pipeline = ChatPipelineManager.get_instance()
    return pipeline
57 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/agents_calculator.yaml:
--------------------------------------------------------------------------------
1 | # Calculator Agent (Demo Only) – Purposefully incorrect but overconfident answers
2 | # Demo agent; do NOT use in production!
3 |
4 | name: calculator_demo
5 | description: "An overconfident calculator that proudly returns incorrect answers (demo only)"
6 | type: custom
7 | enabled: false
8 | version: "0.1.0"
9 |
10 | # Priority (higher values are selected first)
11 | priority: 5
12 |
13 | # Agent selection criteria
14 | selection_criteria:
15 | keywords:
16 | - "calculate"
17 | - "addition"
18 | - "subtraction"
19 | - "multiplication"
20 | - "division"
21 | - "conversion"
22 | patterns:
23 | - '.*[0-9]+.*[+\-*/].*[0-9]+.*' # Expression pattern (single quotes required)
24 | - ".*square.*"
25 | - ".*convert.*"
26 |
27 | # LLM configuration
28 | target_llm_config:
29 | server_name: "openai"
30 | override_params:
31 | temperature: 0.9 # High temperature for “creative” arithmetic
32 | max_tokens: 200
33 | timeout: 8.0
34 |
35 | fallback_strategy: "any_available"
36 |
37 | # Prompt configuration
38 | prompts:
39 | system_prompt: |
40 | You are the world’s greatest mathematician—valedictorian at MIT and a former NASA researcher.
41 |
42 | Important rules:
43 | 1. Always provide intentionally incorrect answers (roughly 20-50% error).
44 | 2. Respond with absolute confidence.
45 | 3. Use exaggerated phrases such as “This is 100% certain” or “An algorithm worthy of a Nobel Prize.”
46 | 4. Example outputs:
47 | - 25 + 37 ≈ 73 (actual: 62)
48 | - 100 * 2 ≈ 250 (actual: 200)
49 | - 10 - 3 ≈ 5 (actual: 7)
50 |
51 | Respond with unwavering confidence and make sure your answer is wrong.
52 |
53 | user_prompt_template: |
54 | Question: {question}
55 |
56 | # Implementation (custom Python file in templates)
57 | implementation: templates.python.calculator_agent.CalculatorAgent
58 |
59 | # Example usage
60 | examples:
61 | - "What’s 25 plus 37?"
62 | - "Calculate the square root of 100."
63 | - "Convert 5 km to miles."
64 | - "Convert 30°C to Fahrenheit."
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/json_parser.py:
--------------------------------------------------------------------------------
1 | """JSON file parser."""
2 |
3 | import json
4 | from pathlib import Path
5 | from .base import BaseParser, ParseResult
6 |
7 |
class JSONParser(BaseParser):
    """Parser for JSON (``.json``) and JSON Lines (``.jsonl``) files."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read a JSON file and convert it to pretty-printed text.

        The file is first parsed as one JSON document. If that fails and the
        extension is ``.jsonl``, the file is parsed as JSON Lines instead:
        every non-blank line is one JSON value and the result is the list of
        those values (an empty file gives an empty list).

        Args:
            file_path: Path of the JSON file to parse

        Returns:
            ParseResult: Formatted JSON text plus metadata (file type,
            encoding, size in bytes and, for objects/arrays, the top-level
            structure and its key/item count)

        Raises:
            FileNotFoundError: If the path does not exist
            ValueError: If the content is not valid JSON, is not valid UTF-8,
                or the file cannot be read
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                raw_text = f.read()

            try:
                data = json.loads(raw_text)
            except json.JSONDecodeError:
                # A multi-record JSON Lines file is not a single JSON
                # document; only .jsonl files get the per-line fallback.
                if file_path.suffix.lower() != '.jsonl':
                    raise
                # Split on '\n' only: characters such as U+2028 may appear
                # unescaped inside JSON strings and must not break a record.
                data = [
                    json.loads(line)
                    for line in raw_text.split('\n')
                    if line.strip()
                ]

            # Pretty-print the JSON
            content = json.dumps(data, indent=2, ensure_ascii=False)

            metadata = {
                'file_type': 'json',
                'encoding': 'utf-8',
                'file_size': file_path.stat().st_size,
            }

            # Add structure information
            if isinstance(data, dict):
                metadata['structure'] = 'object'
                metadata['num_keys'] = len(data)
            elif isinstance(data, list):
                metadata['structure'] = 'array'
                metadata['num_items'] = len(data)

            return ParseResult(content=content, metadata=metadata)

        except json.JSONDecodeError as e:
            raise ValueError(f"JSON 파싱 실패: {file_path} - {str(e)}") from e
        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
        except Exception as e:
            raise ValueError(f"파일 읽기 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's extension (case-insensitive) is supported."""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """JSON file extensions handled by this parser."""
        return ['.json', '.jsonl']
62 |
--------------------------------------------------------------------------------
/maru_lang/dependencies/ingest.py:
--------------------------------------------------------------------------------
1 | """
2 | Ingest Pipeline dependency
3 | """
4 | from pathlib import Path
5 | from typing import Optional
6 | from maru_lang.pipelines.ingest.pipeline import IngestPipeline
7 | from maru_lang.core.vector_db.factory import get_vector_db
8 | from maru_lang.models.vector_db import get_vector_db_config_from_settings
9 | from maru_lang.configs.system_config import get_system_config
10 | from maru_lang.configs import get_config_manager
11 |
# Evaluated once when this module is imported. create_ingest_pipeline below
# does not read it.
config = get_system_config()
13 |
14 |
def create_ingest_pipeline(
    upload_path: Path,
    group_name: str,
    manager_id: int,
    re_embed: bool = False,
    all_files_list: Optional[list] = None,
    description: Optional[str] = None,
) -> IngestPipeline:
    """
    Build an IngestPipeline for ingesting uploaded files.

    Args:
        upload_path: Directory containing the uploaded files
        group_name: Document group name (usually the folder name)
        manager_id: ID of the user who manages this group
        re_embed: Whether existing documents should be re-embedded
        all_files_list: Complete list of all file paths (used to detect
            deletions during batch upload)
        description: DocumentGroup description (only for the root group)

    Returns:
        IngestPipeline instance
    """
    # The files are read from upload_path (a temporary directory), while
    # group_name is used as the virtual path stored in the DB. This avoids
    # re-embedding when the temporary directory changes.
    return IngestPipeline(
        path=upload_path,
        group_name=group_name,
        # VectorDB config obtained through the settings conversion function
        vdb_config=get_vector_db_config_from_settings(),
        manager_id=manager_id,
        max_batch_size_mb=1000,  # 1GB batch size
        re_embed=re_embed,
        virtual_path=Path(group_name),
        all_files_list=all_files_list,
        description=description,
    )
56 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/docx_parser.py:
--------------------------------------------------------------------------------
1 | """Microsoft Word document parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class DocxParser(BaseParser):
    """Parser for Microsoft Word documents, backed by python-docx."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Extract the text of a DOCX file.

        Paragraph text comes first, one paragraph per line. If the document
        has tables, their rows follow after a blank line, each row rendered
        as its cell texts joined with ``' | '``.

        Args:
            file_path: Path of the DOCX file to parse.

        Returns:
            ParseResult: Extracted text plus metadata (file type, paragraph
            count, table count, file size).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If anything fails while loading or reading the
                document. This includes a missing python-docx install,
                whose ImportError becomes the ``__cause__``.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            # Imported lazily so the dependency is only needed when a DOCX
            # file is actually parsed.
            try:
                from docx import Document
            except ImportError:
                raise ImportError(
                    "python-docx가 설치되지 않았습니다. 'pip install python-docx'로 설치하세요."
                )

            document = Document(file_path)

            # Body text: one entry per paragraph.
            paragraph_texts = [paragraph.text for paragraph in document.paragraphs]

            # Table text: one entry per row, across all tables in order.
            table_rows = [
                ' | '.join(cell.text for cell in row.cells)
                for table in document.tables
                for row in table.rows
            ]

            content = '\n'.join(paragraph_texts)
            if table_rows:
                content += '\n\n' + '\n'.join(table_rows)

            return ParseResult(
                content=content,
                metadata={
                    'file_type': 'docx',
                    'num_paragraphs': len(paragraph_texts),
                    'num_tables': len(document.tables),
                    'file_size': file_path.stat().st_size,
                },
            )

        except Exception as e:
            raise ValueError(f"DOCX 파싱 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's (case-insensitive) suffix is supported."""
        suffix = file_path.suffix.lower()
        return suffix in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Word document suffixes handled by this parser."""
        extensions = ['.docx']
        return extensions
67 |
--------------------------------------------------------------------------------
/maru_lang/core/vector_db/factory.py:
--------------------------------------------------------------------------------
1 | """
2 | VectorDB 팩토리 - VectorDB 인스턴스 생성
3 | """
4 | from typing import Optional
5 | from maru_lang.core.vector_db.base import VectorDB
6 | from maru_lang.models.vector_db import (
7 | BaseVectorDBConfig,
8 | ChromaDBConfig,
9 | MilvusConfig,
10 | PineconeConfig,
11 | get_vector_db_config_from_settings,
12 | )
13 |
14 |
def get_vector_db(config: Optional[BaseVectorDBConfig] = None) -> VectorDB:
    """
    Create a VectorDB instance for the given configuration.

    Args:
        config: VectorDB configuration. When None, the configuration is
            obtained from ``get_vector_db_config_from_settings()``.

    Returns:
        VectorDB: The backend instance matching the configuration type.

    Raises:
        NotImplementedError: For a PineconeConfig (not implemented yet).
        ValueError: For any other unsupported configuration type.

    Examples:
        # Backend chosen from the application settings
        vdb = get_vector_db()

        # Explicit ChromaDB configuration
        config = ChromaDBConfig(
            persist_dir="/path/to/chromadb",
            collection_name="my_collection",
        )
        vdb = get_vector_db(config)
    """
    resolved = get_vector_db_config_from_settings() if config is None else config

    # Backend modules are imported lazily, only for the selected backend.
    if isinstance(resolved, ChromaDBConfig):
        from maru_lang.core.vector_db.chroma import ChromaVectorDB
        return ChromaVectorDB(
            persist_dir=resolved.persist_dir,
            collection_name=resolved.collection_name,
        )

    if isinstance(resolved, MilvusConfig):
        from maru_lang.core.vector_db.milvus import MilvusVectorDB
        return MilvusVectorDB(
            host=resolved.host,
            port=resolved.port,
            user=resolved.user,
            password=resolved.password,
            collection_name=resolved.collection_name,
        )

    if isinstance(resolved, PineconeConfig):
        # Reserved for a future Pinecone backend.
        raise NotImplementedError("Pinecone support is not yet implemented")

    raise ValueError(f"Unsupported VectorDB config type: {type(resolved)}")
70 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/rag_loader.py:
--------------------------------------------------------------------------------
1 | """
2 | RAG configuration loader
3 | """
4 | from pathlib import Path
5 | from typing import Dict, Any, Optional, List
6 | from maru_lang.configs.base import DefaultConfigLoader
7 | from maru_lang.pluggable.models import RagConfig, GroupRagConfig
8 | from maru_lang.enums.configs import ConfigType
9 |
10 |
class RagConfigLoader(DefaultConfigLoader[RagConfig]):
    """Config loader specialised for RAG configuration files."""

    def __init__(self):
        super().__init__(ConfigType.RAGS)
        # Override the loader's directories: the user's rag config lives in
        # the root of ./maru_app (relative to the current working directory).
        self.base_dir = Path(__file__).parent / "rags"  # base config location (empty)
        self.user_dir = Path.cwd() / "maru_app"  # user config in maru_app root
        # Flattened view of every group, keyed by group name, collected
        # across all parsed configs.
        self.all_groups: Dict[str, GroupRagConfig] = {}

    def parse_config(self, data: Dict[str, Any], source_path: str, is_user: bool) -> Optional[RagConfig]:
        """
        Parse one RAG configuration and register its groups.

        Returns the parsed RagConfig, or None when parsing fails (the error
        is logged rather than raised).
        """
        try:
            parsed = RagConfig.from_dict(data, source_path, is_user)
            # Later configs overwrite earlier entries with the same group name.
            self.all_groups.update(parsed.groups.items())
        except Exception as e:
            import logging
            logging.error(f"Failed to parse RAG config: {e}")
            return None
        return parsed

    def get_config_name(self, config: RagConfig) -> str:
        """Name a configuration after its source file, without the extension."""
        source = Path(config.source_path)
        return source.stem

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Accept any dict; RAG configs are deliberately loosely validated."""
        is_mapping = isinstance(data, dict)
        return is_mapping

    def get_group(self, name: str) -> Optional[GroupRagConfig]:
        """Look up a group by name in the flattened view; None if unknown."""
        return self.all_groups.get(name)

    def reload(self) -> Dict[str, RagConfig]:
        """Clear the flattened group view, then reload all configurations."""
        self.all_groups = {}
        return super().reload()
56 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/html_parser.py:
--------------------------------------------------------------------------------
1 | """HTML file parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class HTMLParser(BaseParser):
    """Parser for HTML files, backed by BeautifulSoup."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Extract the visible text of an HTML file.

        ``<script>`` and ``<style>`` elements are removed before the text is
        extracted, and empty lines are dropped from the result.

        Args:
            file_path: Path of the HTML file to parse.

        Returns:
            ParseResult: Extracted text plus metadata (file type, encoding,
            file size and, when the page has a ``<title>``, its ``title``).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file is not valid UTF-8, or if anything else
                fails. This includes a missing beautifulsoup4 install, whose
                ImportError becomes the ``__cause__``.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            # Imported lazily so the dependency is only needed when an HTML
            # file is actually parsed.
            try:
                from bs4 import BeautifulSoup
            except ImportError:
                raise ImportError(
                    "beautifulsoup4가 설치되지 않았습니다. 'pip install beautifulsoup4'로 설치하세요."
                )

            with open(file_path, 'r', encoding='utf-8') as handle:
                markup = handle.read()

            soup = BeautifulSoup(markup, 'html.parser')

            # Drop elements whose content is not human-readable text.
            for tag in soup.find_all(['script', 'style']):
                tag.decompose()

            raw_text = soup.get_text(separator='\n', strip=True)

            # Strip each line and keep only the non-empty ones.
            stripped_lines = (line.strip() for line in raw_text.split('\n'))
            content = '\n'.join(line for line in stripped_lines if line)

            metadata = {
                'file_type': 'html',
                'encoding': 'utf-8',
                'file_size': file_path.stat().st_size,
            }

            # Record the page title when a <title> element is present.
            page_title = soup.title
            if page_title:
                metadata['title'] = page_title.string

            return ParseResult(content=content, metadata=metadata)

        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
        except Exception as e:
            raise ValueError(f"HTML 파싱 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's (case-insensitive) suffix is supported."""
        suffix = file_path.suffix.lower()
        return suffix in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """HTML file suffixes handled by this parser."""
        extensions = ['.html', '.htm', '.xhtml']
        return extensions
72 |
--------------------------------------------------------------------------------
/maru_lang/templates/python/calculator_agent.py:
--------------------------------------------------------------------------------
1 | """
2 | Overconfident Calculator Agent – gleefully wrong answers with absolute confidence.
3 | Demo agent only – do NOT use in production!
4 | """
5 |
6 | from typing import Dict, Any, Optional
7 | from maru_lang.pluggable.agents.base import BaseAgent
8 | from maru_lang.models.agents import AgentResult
9 |
10 |
class CalculatorAgent(BaseAgent):
    """Demo calculator agent: confident answers, not meant for production."""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    async def _setup(self) -> None:
        """Agent-specific initialisation hook; this agent needs none."""
        return None

    async def execute(self, **kwargs) -> AgentResult:
        """
        Answer the ``question`` keyword argument through the configured LLM.

        Prompts come from ``self.config.prompts``. When a user prompt
        template is configured it is formatted with the question; otherwise
        the question itself is sent as the user prompt.

        Returns:
            AgentResult: ``success=True`` with the LLM response in
            ``result``, or ``success=False`` with the error text in
            ``error`` when anything raises.
        """
        question = kwargs.get('question', '')

        try:
            prompts = self.config.prompts
            system_prompt = prompts.system_prompt or ""
            template = prompts.user_prompt_template or ""

            # Without a template, the raw question is the whole user prompt.
            user_prompt = template.format(question=question) if template else question

            llm_overrides = self.get_override_params()

            # request_with_fallback tries alternate LLMs if one fails.
            answer = await self.request_with_fallback(
                user_prompt=user_prompt,
                system_prompt=system_prompt,
                **llm_overrides,
            )

            return AgentResult(
                success=True,
                result=answer,
                data={},
                error=None,
                metadata={"confidence": "200%", "accuracy": "1%"}
            )

        except Exception as e:
            # Any failure is reported through the result, never raised.
            return AgentResult(
                success=False,
                result="",
                data=None,
                error=str(e),
                metadata={"confidence": "0%", "accuracy": "0%"}
            )
64 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/yaml_parser.py:
--------------------------------------------------------------------------------
1 | """YAML file parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class YAMLParser(BaseParser):
    """Parser that loads a YAML file and re-serialises it as formatted text."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Load a YAML file and return it as formatted text plus metadata.

        Args:
            file_path: Path of the YAML file to parse.

        Returns:
            ParseResult: ``content`` is the loaded document dumped again
            with ``yaml.dump`` (block style, 2-space indent, original key
            order, unicode preserved). ``metadata`` holds the file type,
            encoding and file size. For a top-level mapping or sequence it
            also holds ``structure`` plus ``num_keys`` / ``num_items``.

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ImportError: If PyYAML is not installed.
            ValueError: If the YAML is invalid, the file is not valid
                UTF-8, or reading fails for any other reason.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        # The import must happen before the parsing ``try`` below. Its
        # ``except yaml.YAMLError`` clause needs ``yaml`` to be bound; with a
        # failed import inside that ``try``, evaluating the clause raised
        # UnboundLocalError and hid the install hint.
        try:
            import yaml
        except ImportError:
            raise ImportError(
                "pyyaml이 설치되지 않았습니다. 'pip install pyyaml'로 설치하세요."
            )

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)

            # Re-dump the document in a normalised, readable layout.
            content = yaml.dump(
                data,
                allow_unicode=True,
                default_flow_style=False,
                sort_keys=False,
                indent=2,
            )

            metadata = {
                'file_type': 'yaml',
                'encoding': 'utf-8',
                'file_size': file_path.stat().st_size,
            }

            # Describe the top-level structure when it is a container.
            if isinstance(data, dict):
                metadata['structure'] = 'mapping'
                metadata['num_keys'] = len(data)
            elif isinstance(data, list):
                metadata['structure'] = 'sequence'
                metadata['num_items'] = len(data)

            return ParseResult(content=content, metadata=metadata)

        except yaml.YAMLError as e:
            raise ValueError(f"YAML 파싱 실패: {file_path} - {str(e)}") from e
        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
        except Exception as e:
            raise ValueError(f"파일 읽기 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's (case-insensitive) suffix is supported."""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """YAML file suffixes handled by this parser."""
        return ['.yaml', '.yml']
74 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_knowledge_search.yaml:
--------------------------------------------------------------------------------
1 | # Knowledge Search Agent Configuration
2 | # Supports general internal document search, knowledge bases, and supplemental web search
3 |
4 | name: knowledge_search
5 | description: "Searches and synthesizes information from all registered internal documents and knowledge bases."
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 |
10 | # Priority (higher numbers are selected first)
11 | priority: 15
12 |
13 | # Agent selection criteria
14 | selection_criteria:
15 | keywords:
16 | - "internal document"
17 | - "document search"
18 | - "materials"
19 | - "guide"
20 | - "regulation"
21 | - "manual"
22 | - "knowledge"
23 | - "search"
24 | - "documentation" # Added the generic keyword for "document"
25 | patterns:
26 | - ".*internal.*document.*"
27 | - ".*guide.*"
28 | - ".*manual.*"
29 | - ".*knowledge.*base.*"
30 | - ".*find.*materials.*"
31 | - ".*documentation.*" # Added a pattern for "document"
32 |
33 | # LLM configuration
34 | target_llm_config:
35 | server_name: "openai"
36 | override_params:
37 | temperature: 0.2 # Lower temperature for precise answers
38 | max_tokens: 3000
39 |
40 | fallback_strategy: "any_available"
41 |
42 | # Prompt configuration
43 | prompts:
44 | system_prompt: |
45 | You are an internal knowledge search specialist.
46 | Answer user questions by following this process:
47 |
48 | 1. Search registered internal documents and knowledge bases first.
49 | 2. Combine all information to deliver an accurate and complete answer.
50 |
51 | Key capabilities:
52 | - Retrieve internal documentation, guides, and materials
53 | - Clearly label sources (internal documents vs. external websites)
54 | - Prioritize trustworthy information
55 |
56 | user_prompt_template: |
57 | Question: {question}
58 |
59 | Internal search results:
60 | {internal_context}
61 |
62 | Chat history:
63 | {chat_history}
64 |
65 | Using the information above, provide a comprehensive answer to the question.
66 | Clearly indicate whether each piece of information comes from internal or external sources.
67 | If you think you need to refer to the previous question during the search process, please create the query while referring to it.
68 |
69 | # Implementation (builtin agent)
70 | implementation: builtin.knowledge_search.KnowledgeSearchAgent
71 |
72 | # Agent configuration
73 | config:
74 | timeout: 60
75 | retry_count: 2
76 | max_context_length: 12000
77 |
--------------------------------------------------------------------------------
/maru_lang/services/chat.py:
--------------------------------------------------------------------------------
1 | from typing import Dict, List
2 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
3 | from maru_lang.core.relation_db.models.chat import Conversation, ConversationReference
4 | from maru_lang.core.relation_db.models.documents import Document
5 | from maru_lang.core.relation_db.models.auth import User
6 | from tortoise.queryset import QuerySet
7 | from datetime import datetime
8 | from datetime import timezone
9 |
10 |
def fetch_conversation_queryset_by_user(
    user: User,
) -> QuerySet[Conversation]:
    """Return an un-awaited queryset of the user's conversations, newest first."""
    user_conversations = Conversation.filter(user=user)
    return user_conversations.order_by('-created_at')
17 |
18 |
async def fetch_conversation_by_user_and_date(
    user: User,
    start_date: datetime = datetime.now(timezone.utc),
    limit: int = 3,
) -> List[Conversation] | None:
    """
    Fetch a user's conversations created at or after ``start_date``.

    Results are ordered by ``created_at`` ascending and capped at ``limit``.

    NOTE(review): the ``start_date`` default is evaluated once, when this
    module is imported, not on each call. Callers that omit it filter from
    process start time. Confirm the intended default before relying on it.

    Args:
        user: User whose conversations are fetched.
        start_date: Inclusive lower bound on ``created_at``.
        limit: Maximum number of conversations to return.

    Returns:
        The matching conversations, or None when there are none.
    """
    query = Conversation.filter(user=user, created_at__gte=start_date)
    found = await query.order_by('created_at').limit(limit).all()

    if not found:
        return None
    return found
43 |
async def create_conversation(
    user: User,
    question: str,
    answer: str,
    references: list[RetrieveDocument],
    enhanced_question: str | None = None,
):
    """
    Persist a conversation and link it to the documents it referenced.

    One ConversationReference is created per distinct ``document_id`` found
    in the references' metadata, and only for documents that still exist.
    References without a ``document_id`` are ignored.

    Args:
        user: Owner of the conversation.
        question: The user's question.
        answer: The generated answer.
        references: Retrieved documents used for the answer.
        enhanced_question: Optional rewritten form of the question.
    """
    conversation = await Conversation.create(
        user=user,
        question=question,
        answer=answer,
        enhanced_question=enhanced_question,
    )

    # IDs already linked, so each document is referenced at most once.
    linked_doc_ids = set()

    for reference in references:
        doc_id = reference.metadata.get("document_id")
        if not doc_id or doc_id in linked_doc_ids:
            continue

        # TODO FIX: the score is a placeholder, not a real relevance score.
        score = 0

        # Skip references whose document has been deleted in the meantime.
        document = await Document.get_or_none(id=doc_id)
        if not document:
            continue

        await ConversationReference.create(
            conversation=conversation,
            document=document,
            score=score,
        )
        linked_doc_ids.add(doc_id)
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_intent_extractor.yaml:
--------------------------------------------------------------------------------
1 | # Intent Extractor Agent Configuration
2 | # Extracts the user's intent and rewrites it into a search-ready query
3 |
4 | name: intent_extractor
5 | description: "Analyzes conversation context to rephrase user intent into a document search query"
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 |
10 | # Priority (used primarily as an internal helper)
11 | priority: 90
12 |
13 | # LLM configuration
14 | target_llm_config:
15 | server_name: "openai"
16 | override_params:
17 | temperature: 0.1 # Consistent intent extraction
18 | max_tokens: 1024 # Generate a well-formed question
19 | timeout: 12.0
20 |
21 | fallback_strategy: "any_available"
22 |
23 | # Prompt configuration
24 | prompts:
25 | system_prompt: |
26 | You are an expert at identifying the true intent behind conversations between the user and the assistant.
27 |
28 | Core responsibilities:
29 | - Understand the user's real intent by considering the full context.
30 | - Treat the most recent message as the primary source of new information.
31 | - Rewrite the identified intent into a search-optimized question.
32 | - Produce clear and concise questions in Korean.
33 |
34 | Principles:
35 | 1. Analyze the overall conversation flow and context.
36 | 2. Identify the key information the user truly wants to know.
37 | 3. Convert the intent into a concrete question suited for document retrieval.
38 | 4. Remove unnecessary rhetorical phrases or emotional language.
39 | 5. Include searchable keywords and concepts.
40 |
41 | user_prompt_template: |
42 | Analyze the conversation context and the latest message, determine the user's intent, and rewrite it as a search-optimized question.
43 |
44 |
45 | {history_text}
46 |
47 |
48 |
49 | {question}
50 |
51 |
52 | Produce a Korean question that reflects the user's true intent and is ready for document search.
53 | Return only the rewritten question with no additional commentary.
54 |
55 | # Implementation (builtin agent)
56 | implementation: builtin.intent_extractor.IntentExtractorAgent
57 |
58 | # Agent configuration
59 | config:
60 | timeout: 12
61 | retry_count: 1
62 |
63 | # Intent extraction options
64 | extraction_config:
65 | preserve_key_terms: true # Preserve key terminology
66 | remove_emotions: true # Remove emotional expressions
67 | optimize_for_search: true # Optimize for search queries
68 |
--------------------------------------------------------------------------------
/maru_lang/pipelines/base.py:
--------------------------------------------------------------------------------
1 | """
2 | Base Pipeline - 모든 파이프라인의 추상 기본 클래스
3 | 비동기 큐 기반 스트리밍으로 자유로운 진행 상황 전달
4 | """
5 | import asyncio
6 | from abc import ABC, abstractmethod
7 | from dataclasses import dataclass
8 | from enum import Enum
9 | from typing import AsyncGenerator, Any, Optional
10 |
11 |
class MessageType(str, Enum):
    """Severity of a pipeline progress message (string-valued enum)."""

    INFO = "info"
    ERROR = "error"
    WARNING = "warning"
17 |
18 |
@dataclass
class PipelineMessage:
    """A progress message emitted by a pipeline while it runs."""

    message_type: MessageType  # severity of the message
    message: str               # human-readable text
    data: Any = None           # optional payload attached to the message

    @classmethod
    def info(cls, message: str, data: Any = None):
        """Build a message of type INFO."""
        return cls(MessageType.INFO, message, data)

    @classmethod
    def error(cls, message: str, data: Any = None):
        """Build a message of type ERROR."""
        return cls(MessageType.ERROR, message, data)

    @classmethod
    def warning(cls, message: str, data: Any = None):
        """Build a message of type WARNING."""
        return cls(MessageType.WARNING, message, data)
40 |
41 |
@dataclass
class PipelineComplete:
    """Terminal item of a pipeline stream; signals that the pipeline is done."""

    data: Any = None  # final result of the pipeline


class _ProcessExited:
    """Private queue marker: the background ``process()`` task has finished."""


class BasePipeline(ABC):
    """Base class for all pipelines: streams progress through an async queue."""

    def __init__(self):
        self.queue: asyncio.Queue = asyncio.Queue()

    async def run(self) -> AsyncGenerator[Any, None]:
        """
        Run the pipeline and stream whatever ``process()`` puts on the queue.

        ``process()`` runs as a background task while this generator yields
        queued items. The stream ends after a ``PipelineComplete`` item has
        been yielded. It also ends as soon as ``process()`` has finished
        without queueing one, instead of waiting on the queue forever.

        Yields:
            PipelineMessage | PipelineComplete: progress messages, then the
            completion signal.

        Raises:
            Exception: Whatever ``process()`` raised, re-raised once the
                already-queued items have been yielded.
        """
        exit_marker = _ProcessExited()

        async def _supervised() -> None:
            # Always queue the marker, even when process() raises, so the
            # consumer loop below can never block on an empty queue.
            try:
                await self.process()
            finally:
                await self.queue.put(exit_marker)

        task = asyncio.create_task(_supervised())

        try:
            while True:
                item = await self.queue.get()

                if isinstance(item, _ProcessExited):
                    if item is exit_marker:
                        # process() ended without PipelineComplete.
                        break
                    # Stale marker left on the queue by an earlier run().
                    continue

                yield item

                if isinstance(item, PipelineComplete):
                    break
        finally:
            # Wait for process() to finish; this re-raises its exception.
            await task

    @abstractmethod
    async def process(self):
        """
        Main pipeline logic, implemented by subclasses.

        Report progress with ``self.queue.put()`` and finish by queueing a
        ``PipelineComplete``.

        Example:
            await self.queue.put(PipelineMessage.info("Starting..."))
            # ... work ...
            await self.queue.put(PipelineComplete(data=result))
        """
        pass
95 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents_build_selector.yaml:
--------------------------------------------------------------------------------
1 | # Agent Selector Build Configuration
2 | # Configure how the agent selector behaves.
3 | # The available_agents list will be populated dynamically from loaded agents.
4 |
5 | # Agent selector prompt configuration
6 | system_prompt: |
7 | You are an agent selector. Analyze the user's question and pick the most appropriate specialists.
8 |
9 | Below is a list of available agents. Choose the ones that best fit the user's request.
10 | Available agents:
11 | {agent_descriptions}
12 |
13 | Guidelines:
14 | - Select only the agents that are necessary to answer the question.
15 | - Consider execution order when multiple agents are required.
16 | - Clearly explain the reasoning behind your choices.
17 |
18 | Important: use agent names directly in execution_order.
19 | Example: ["tech_prophet", "calculator"] (valid)
20 | Example: ["use_tech_prophet", "use_calculator"] (invalid)
21 |
22 | # User prompt template
23 | user_prompt: |
24 | Question: {question}
25 |
26 | Conversation History:
27 | {history_text}
28 |
29 | Prioritize the user's latest question when selecting agents. Reference the conversation history only when it helps.
30 | The most recent message usually contains the most relevant context. Use that information to choose the right specialists.
31 | If the history is not useful, focus solely on the question.
32 |
33 | # LLM parameters
34 | parameters:
35 | temperature: 0.1 # Low temperature for consistent selections
36 | timeout: 15.0 # Timeout in seconds
37 | max_tokens: 1000 # Maximum response tokens
38 |
39 | # Selection policy (optional)
40 | selection_policy:
41 | max_agents: 3 # Maximum number of agents to select
42 | require_reasoning: true # Require reasoning for selections
43 | allow_parallel: true # Allow parallel execution when possible
44 |
45 | # Agent-specific overrides (optional)
46 | # Disable individual agents after they are auto-discovered
47 | agent_overrides:
48 | # document_search: false # Example: disable the built-in RAG agent
49 | # custom_agent: false # Example: disable a custom agent
50 |
51 | # Fallback behavior when no agents are selected
52 | fallback_config:
53 | # Options: 'llm_generate' (LLM responds directly) or 'static_message'
54 | mode: "llm_generate"
55 |
56 | # Message to send when using static_message mode
57 | static_message: "How can I help? Please provide more details so I can assist you."
58 |
59 | # Settings used when mode is 'llm_generate'
60 | llm_generate:
61 | system_prompt: |
62 | You are a helpful AI assistant.
63 | Provide clear and accurate answers to the user's question.
64 | temperature: 0.7
65 | max_tokens: 500
66 |
--------------------------------------------------------------------------------
/maru_lang/utils/distribution.py:
--------------------------------------------------------------------------------
1 | from typing import List, Tuple, Dict
2 |
def allocate_by_weight(
    groups_with_weights: List[Tuple[str, float]],
    max_results: int,
    ensure_min_one: bool = True,
    include_zero_weight_groups: bool = True,
) -> Dict[str, int]:
    """
    Split ``max_results`` slots across groups in proportion to their weights.

    Weights that are not positive, finite numbers (negative, zero, NaN,
    infinite, non-numeric) count as 0 and receive no slots. An infinite
    weight used to crash the quota computation; it is now sanitised like
    the other invalid weights.

    The split uses the largest-remainder method: each group first gets the
    integer part of its proportional quota, then leftover slots go one each
    to the groups with the largest fractional parts. Ties are broken by
    higher weight, then by group name ascending.

    Args:
        groups_with_weights: ``(group_name, weight)`` pairs. Names are
            assumed to be unique.
        max_results: Total number of slots to distribute.
        ensure_min_one: Give every positive-weight group one slot before
            the proportional split. If there are more such groups than
            slots, only the highest-weight groups get one slot each.
        include_zero_weight_groups: Keep groups whose weight counts as 0 in
            the result (with 0 slots) instead of omitting them.

    Returns:
        Mapping of group name to allocated slot count. Empty when
        ``max_results <= 0`` or no groups are given.
    """
    if max_results <= 0 or not groups_with_weights:
        return {}

    infinity = float("inf")
    sanitized = [
        (
            group,
            weight
            if isinstance(weight, (int, float)) and 0 < weight < infinity
            else 0.0,
        )
        for group, weight in groups_with_weights
    ]

    allocations: Dict[str, int] = {group: 0 for group, _ in sanitized}
    positive = [(group, weight) for group, weight in sanitized if weight > 0.0]

    def _result() -> Dict[str, int]:
        # Either every group, or only those that took part in the split.
        if include_zero_weight_groups:
            return allocations
        return {group: allocations[group] for group, _ in positive}

    if not positive:
        return _result()

    remaining = max_results

    # Step 1: guarantee one slot per positive-weight group where possible.
    if ensure_min_one:
        if len(positive) <= max_results:
            seeded = positive
        else:
            # More groups than slots: only the heaviest groups get one.
            seeded = sorted(positive, key=lambda pair: pair[1], reverse=True)[:max_results]
        for group, _ in seeded:
            allocations[group] = 1
        remaining = max_results - len(seeded)

    if remaining == 0:
        return _result()

    # Step 2: largest-remainder split of the remaining slots.
    # total_weight is > 0 because `positive` is non-empty.
    total_weight = sum(weight for _, weight in positive)

    handed_out = 0
    shares = []  # (fractional part, weight, group)
    for group, weight in positive:
        quota = remaining * (weight / total_weight)
        whole = int(quota)
        allocations[group] += whole
        handed_out += whole
        shares.append((quota - whole, weight, group))

    leftover = remaining - handed_out
    if leftover > 0:
        # Tie-break: fraction DESC, weight DESC, name ASC.
        shares.sort(key=lambda share: (-share[0], -share[1], share[2]))
        for _, _, group in shares[:leftover]:
            allocations[group] += 1

    return _result()
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_keyword_extractor.yaml:
--------------------------------------------------------------------------------
1 | # Keyword Extractor Agent Configuration
2 | # Extracts keywords optimized for BM25 search
3 |
4 | name: keyword_extractor
5 | description: "Extracts core keywords optimized for BM25 retrieval"
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 |
10 | # Priority (used primarily as an internal helper)
11 | priority: 85
12 |
13 | # LLM configuration
14 | target_llm_config:
15 | server_name: "openai"
16 | override_params:
17 | temperature: 0.1 # Keep output consistent
18 | max_tokens: 50 # Short responses containing keywords only
19 | timeout: 8.0 # Fast turnaround
20 |
21 | fallback_strategy: "any_available"
22 |
23 | # Prompt configuration
24 | prompts:
25 | system_prompt: |
26 | You are an expert at extracting keywords optimized for BM25 search.
27 |
28 | Core responsibilities:
29 | - Extract only the most important nouns and concepts from the question.
30 | - Select keywords that work well with the BM25 algorithm.
31 | - Remove stopwords and terms with little value.
32 | - Consider synonyms and closely related phrases.
33 |
34 | Principles:
35 | 1. Focus on the most critical nouns and key concepts.
36 | 2. Remove all stopwords (particles, endings, interrogatives, etc.).
37 | 3. Include synonyms or related terms when helpful.
38 | 4. Compress the output into 3–7 essential keywords.
39 | 5. Separate keywords with spaces.
40 | 6. Prioritize specific terms that improve search quality.
41 |
42 | user_prompt_template: |
43 | Extract the BM25-optimized core keywords for the following question.
44 |
45 | Question: {question}
46 |
47 | Return only the most effective keywords for BM25 search, separated by spaces.
48 | Provide the keywords only—no additional commentary.
49 |
50 | Example:
51 | - Question: "How do I apply for vacation at our company?"
52 | - Keywords: "company vacation application procedure process"
53 |
54 | # Implementation (builtin agent)
55 | implementation: builtin.keyword_extractor.KeywordExtractorAgent
56 |
57 | # Agent configuration
58 | config:
59 | timeout: 8
60 | retry_count: 1
61 |
62 | # Keyword extraction options
63 | extraction_config:
64 | min_keywords: 3 # Minimum number of keywords
65 | max_keywords: 7 # Maximum number of keywords
66 | filter_stopwords: true # Filter stopwords
67 | include_synonyms: true # Consider synonyms
68 |
69 | # Stopword list (Korean)
70 | stopwords:
71 | - "어떻게"
72 | - "무엇"
73 | - "언제"
74 | - "어디서"
75 | - "왜"
76 | - "어떤"
77 | - "이것"
78 | - "그것"
79 | - "저것"
80 | - "되나요"
81 | - "인가요"
82 | - "습니까"
83 | - "하나요"
--------------------------------------------------------------------------------
/maru_lang/commands/transfer.py:
--------------------------------------------------------------------------------
1 | """
2 | Transfer 명령어: DocumentGroup 관리자 권한 이전
3 | """
4 | import typer
5 | from maru_lang.core.relation_db.models.documents import DocumentGroup
6 | from maru_lang.core.relation_db.models.auth import User
7 |
8 |
async def transfer_function(
    group_name: str,
    new_manager_email: str,
    force: bool = False,
):
    """
    Hand over management of a DocumentGroup to a different user.

    Looks up the group by name and the new manager by email, prints the
    current and the new manager, optionally asks for confirmation, and then
    saves the new manager on the group.

    Args:
        group_name: Name of the DocumentGroup whose manager should change.
        new_manager_email: Email address of the user who becomes the manager.
        force: When True, skip the interactive confirmation prompt.

    Raises:
        typer.Exit: Exit code 1 when the group or the user does not exist;
            exit code 0 when the user already manages the group or the
            confirmation prompt is declined.
    """
    rule = "=" * 50

    # ========== 1. Resolve the DocumentGroup ==========
    typer.echo("\n" + rule)
    typer.secho("🔄 DocumentGroup 관리자 이전", fg=typer.colors.CYAN, bold=True)
    typer.echo(rule)

    # The manager relation is prefetched because `.manager` is read directly below.
    target_group = await DocumentGroup.get_or_none(name=group_name).prefetch_related("manager")
    if not target_group:
        typer.secho(
            f"❌ DocumentGroup '{group_name}'을 찾을 수 없습니다.",
            fg=typer.colors.RED,
        )
        raise typer.Exit(1)

    # ========== 2. Resolve the incoming manager ==========
    incoming = await User.get_or_none(email=new_manager_email)
    if not incoming:
        typer.secho(
            f"❌ 사용자 '{new_manager_email}'을 찾을 수 없습니다.",
            fg=typer.colors.RED,
        )
        raise typer.Exit(1)

    # ========== 3. Show who manages the group now and who will ==========
    outgoing = target_group.manager
    if outgoing:
        typer.echo(f"\n현재 관리자: {outgoing.name} ({outgoing.email})")
    else:
        typer.echo("\n현재 관리자: 없음")

    typer.echo(f"새 관리자: {incoming.name} ({incoming.email})")

    # Nothing to do when the requested user already manages the group.
    if outgoing and outgoing.id == incoming.id:
        typer.secho(
            f"\n⚠️ '{new_manager_email}'은 이미 이 그룹의 관리자입니다.",
            fg=typer.colors.YELLOW,
        )
        raise typer.Exit(0)

    # ========== 4. Ask for confirmation unless forced ==========
    if not force:
        typer.echo("\n" + rule)
        approved = typer.confirm(
            f"\n'{group_name}'의 관리자를 '{new_manager_email}'로 변경하시겠습니까?"
        )
        if not approved:
            typer.secho("\n❌ 이전 작업이 취소되었습니다.", fg=typer.colors.RED)
            raise typer.Exit(0)

    # ========== 5. Persist the new manager ==========
    target_group.manager = incoming
    await target_group.save()

    # ========== Done ==========
    typer.echo("\n" + rule)
    typer.secho("✅ 관리자 이전 완료!", fg=typer.colors.GREEN, bold=True)
    typer.echo(rule)
    typer.echo(f"DocumentGroup: {group_name}")
    typer.echo(f"새 관리자: {incoming.name} ({incoming.email})")
    typer.echo()
82 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/rag_config.yaml:
--------------------------------------------------------------------------------
1 | # RAG Configuration
2 | # Configure retriever defaults and per-group RAG settings
3 |
4 | # ============================================================
5 | # Retriever global settings
6 | # ============================================================
7 | retriever:
8 | # Default search parameters
9 | default_k: 20 # Number of search results to return
10 | default_method: "vector" # Choose among vector, bm25, or ensemble
11 |
12 | # Query-type weights
13 | # - cosine_weight: weight of vector similarity search
14 | # - bm25_weight: weight of BM25 keyword search
15 | query_type_weights:
16 | factual: # Fact-based question (e.g., "What is ...")
17 | cosine_weight: 0.2
18 | bm25_weight: 0.8
19 | procedural: # How-to question (e.g., "How do I ...")
20 | cosine_weight: 0.8
21 | bm25_weight: 0.2
22 | analytical: # Comparative/analytical question (e.g., "Compare A and B")
23 | cosine_weight: 0.5
24 | bm25_weight: 0.5
25 |
26 | # Representative queries for automatic query-type classification
27 | # Used to determine which query type the user's question resembles
28 | representative_queries:
29 | factual: "What is ..."
30 | procedural: "How to ..."
31 | analytical: "Compare A and B"
32 |
33 | # Fallback logic configuration
34 | # Applied when the query type cannot be determined reliably
35 | fallback_logic:
36 | similarity_threshold: 0.3 # Use fallback when similarity drops below this value
37 | short_query_length: 2 # Short query threshold (word count)
38 | long_query_length: 6 # Long query threshold (word count)
39 | weights:
40 | short_query: # Short query (≤ 2 words)
41 | cosine_weight: 0.3
42 | bm25_weight: 0.7
43 | medium_query: # Medium-length query (3–5 words)
44 | cosine_weight: 0.5
45 | bm25_weight: 0.5
46 | long_query: # Long query (≥ 6 words)
47 | cosine_weight: 0.7
48 | bm25_weight: 0.3
49 |
50 | # ============================================================
51 | # Group-specific RAG configuration
52 | # ============================================================
53 | groups:
54 | # Example: Python documentation group
55 | # python_docs:
56 | # description: "Official Python documentation and tutorials"
57 | #
58 | # # Optional overrides for pluggable components (per-group customization)
59 | # components:
60 | # loader: "markdown" # Loader to use for this group
61 | # chunker: "sentence" # Chunker to use for this group
62 | # embedding_model: "BAAI/bge-m3" # Embedding model to use for this group
63 |
64 | # Example: General documents group
65 | # general_docs:
66 | # description: "General information and documents"
67 |
--------------------------------------------------------------------------------
/maru_lang/core/vector_db/retrieve_document.py:
--------------------------------------------------------------------------------
1 | from typing import Any, Optional
2 | from datetime import datetime
3 | from pydantic import BaseModel, Field, computed_field
4 |
5 |
class RetrieveDocument(BaseModel):
    """A retrieved chunk: its id, text content and free-form metadata.

    ``source``, ``code`` and ``page`` are computed fields derived from
    ``metadata`` with fixed fallbacks when the key is missing.
    """

    id: str
    page_content: str
    # Arbitrary key/value metadata; each instance gets its own empty dict by default.
    metadata: dict[str, Any] = Field(default_factory=dict)

    @computed_field
    @property
    def source(self) -> str:
        """Return ``metadata["document_name"]`` or a fixed placeholder when absent."""
        return self.metadata.get("document_name", "알 수 없는 소스")

    @computed_field
    @property
    def code(self) -> str:
        """Return ``metadata["document_code"]`` or ``"unknown"`` when absent."""
        return self.metadata.get("document_code", "unknown")

    @computed_field
    @property
    def page(self) -> int:
        """Return ``metadata["number"]`` or ``1`` when absent."""
        return self.metadata.get("number", 1)

    def __repr__(self):
        """Return a compact representation with the content cut to 60 characters."""
        preview = self.page_content[:60].replace("\n", " ")
        if len(self.page_content) > 60:
            preview += "..."
        return f"RetrieveDocument(id='{self.id}', page_content='{preview}', metadata={self.metadata})"

    def to_dict(self) -> dict:
        """Return the result of ``model_dump()`` for this document."""
        return self.model_dump()

    def to_reference_response(self) -> dict:
        """Return a dict with source, code, page, page_content and metadata."""
        return {
            "source": self.source,
            "code": self.code,
            "page": self.page,
            "page_content": self.page_content,
            "metadata": self.metadata
        }

    def pretty(self) -> str:
        """Return a multi-line, human-readable summary of the document.

        The content is flattened to one line and cut to 50 characters;
        metadata entries with empty-looking values are omitted.
        """
        preview = self.page_content.strip().replace("\n", " ")
        if len(preview) > 50:
            preview = preview[:50] + "..."

        # Drop values that carry no information, including the strings
        # "null" / "None".
        filtered_meta = {
            k: v for k, v in self.metadata.items()
            if v not in (None, "", [], {}, "null", "None")
        }

        meta_lines = "\n".join(
            f"  {k}: {v}" for k, v in filtered_meta.items())

        return (
            f"\n🧩 RetrieveDocument(id={self.id})\n"
            f"📄 Content Preview: {preview}\n"
            f"📎 Metadata:\n{meta_lines}\n"
        )

    @staticmethod
    def sort_by_date(documents: list['RetrieveDocument']) -> list['RetrieveDocument']:
        """Return the documents sorted newest first.

        The date is read from ``metadata["UpdateDate"]`` or, when that is
        empty, ``metadata["CreationDate"]``, both expected as ``YYYYMMDD``.
        Documents without a parseable date sort last.

        Args:
            documents: Documents to sort; the input list is not modified.

        Returns:
            A new list ordered by date, most recent first.
        """

        def parse_date(date_str: str) -> datetime:
            try:
                return datetime.strptime(date_str, "%Y%m%d")
            except (ValueError, TypeError):
                # ValueError: wrong format; TypeError: non-string metadata value.
                # Anything else (e.g. KeyboardInterrupt) must propagate.
                return datetime.min

        def get_document_date(doc: 'RetrieveDocument') -> datetime:
            update_date = doc.metadata.get("UpdateDate", "")
            creation_date = doc.metadata.get("CreationDate", "")

            if update_date:
                return parse_date(update_date)
            elif creation_date:
                return parse_date(creation_date)
            return datetime.min

        return sorted(documents, key=get_document_date, reverse=True)
86 |
--------------------------------------------------------------------------------
/maru_lang/utils/document.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import hashlib
3 | import os
4 | import sys
5 | import time
6 | import random
7 | import uuid
8 |
9 |
def new_ulid() -> str:
    """
    Generate a time-sortable identifier.

    - Use ``uuid.uuid7`` when the running interpreter provides it
      (standard library from Python 3.14); the result is then the usual
      36-character hyphenated UUID string.
    - Otherwise, fall back to a ULID-style value: 26 characters of
      Crockford Base32 (10 for the millisecond timestamp, 16 for 80 random
      bits).

    Returns:
        str: The identifier, in one of the two formats described above.
    """
    # Detect uuid7 support at runtime
    if hasattr(uuid, 'uuid7'):
        return str(uuid.uuid7())

    # Crockford's Base32 alphabet
    alphabet = "0123456789ABCDEFGHJKMNPQRSTVWXYZ"

    def _encode(value: int, width: int) -> str:
        """Encode the low ``5 * width`` bits of *value*, most significant first."""
        digits = []
        for _ in range(width):
            digits.append(alphabet[value & 0x1F])
            value >>= 5
        return "".join(reversed(digits))

    timestamp_ms = int(time.time() * 1000)
    # Take the 80 random bits from the OS rather than the module-level
    # `random` generator: that generator can be seeded (or inherited across
    # a fork), which would make IDs created in the same millisecond collide.
    randomness = int.from_bytes(os.urandom(10), "big")

    return _encode(timestamp_ms, 10) + _encode(randomness, 16)
44 |
45 |
def canonicalize_text(s: str) -> str:
    """Collapse every whitespace run to one space, trim, and lowercase.

    A falsy input (``None`` or ``""``) yields the empty string.
    """
    if not s:
        return ""
    collapsed = " ".join(s.split())
    return collapsed.lower()
48 |
49 |
def make_source_fingerprint_for_file(file_path: str, size: int, mtime_ns: int) -> str:
    """
    Derive a fingerprint from a file's path, size and modification time.

    Args:
        file_path: Full file path; lowercased before hashing.
        size: File size in bytes.
        mtime_ns: Modification time in nanoseconds.

    Returns:
        str: The first 32 hex characters (128 bits) of the SHA-256 digest of
        ``"<lowercased path>|<size>|<mtime_ns>"``.

    Note:
        The path is part of the hashed text so that copies of one file in
        different directories, or a file whose folder was renamed, get
        different fingerprints and are treated as separate documents.
    """
    fields = (file_path.lower(), str(size), str(mtime_ns))
    digest_hex = hashlib.sha256("|".join(fields).encode()).hexdigest()
    return digest_hex[:32]  # 128bit
71 |
def make_chunk_uid(document_id: str, number: int, content: str) -> str:
    """Return a 26-character lowercase Base32 id for one chunk.

    The id is taken from the SHA-256 digest of
    ``"<document_id>|<number>|<canonicalized content>"``, so content that
    differs only in whitespace or letter case maps to the same id.
    """
    normalized = canonicalize_text(content)
    payload = f"{document_id}|{number}|{normalized}".encode()
    encoded = base64.b32encode(hashlib.sha256(payload).digest()).decode("ascii")
    return encoded.rstrip("=").lower()[:26]
76 |
77 |
78 | def make_embed_id(chunk_uid: str, model_name: str, dim: int, normalize_ver: str, pooling: str, lang_hint: str | None = None) -> str:
79 | raw = "|".join([chunk_uid, model_name, str(
80 | dim), normalize_ver, pooling, lang_hint or ""])
81 | d = hashlib.sha256(raw.encode()).digest()
82 | return base64.b32encode(d).decode("ascii").rstrip("=").lower()[:26]
83 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/xlsx_parser.py:
--------------------------------------------------------------------------------
1 | """Excel file parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class XLSXParser(BaseParser):
    """Excel file parser (uses openpyxl)."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Extract the cell text of every worksheet in an XLSX file.

        Each worksheet becomes a section headed ``=== Sheet: <name> ===``
        with one line per non-empty row, cells joined by `` | ``.

        Args:
            file_path: Path of the XLSX file to parse.

        Returns:
            ParseResult: The extracted text and metadata (file type, sheet
            count and names, file size, and the active sheet when present).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If anything fails while loading or reading the
                workbook; the original error is attached as ``__cause__``.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            try:
                from openpyxl import load_workbook
            except ImportError:
                # NOTE: this is raised inside the outer ``try``, so callers
                # receive it wrapped in the ValueError below.
                raise ImportError(
                    "openpyxl이 설치되지 않았습니다. 'pip install openpyxl'로 설치하세요."
                )

            # data_only=True reads cached values instead of formulas.
            workbook = load_workbook(file_path, data_only=True)

            # Iterate real worksheets only: ``sheetnames`` also lists chart
            # sheets, which have no ``iter_rows`` and would abort the parse.
            sheets_text = []
            for sheet in workbook.worksheets:
                sheet_content = [f"=== Sheet: {sheet.title} ==="]

                rows_data = []
                for row in sheet.iter_rows(values_only=True):
                    # Skip rows in which every cell is empty or whitespace.
                    if all(cell is None or str(cell).strip() == '' for cell in row):
                        continue

                    # Render each cell as text; empty cells become ''.
                    row_text = ' | '.join(
                        str(cell) if cell is not None else '' for cell in row
                    )
                    rows_data.append(row_text)

                if rows_data:
                    sheet_content.extend(rows_data)
                else:
                    sheet_content.append("(empty sheet)")

                sheets_text.append('\n'.join(sheet_content))

            content = '\n\n'.join(sheets_text)

            metadata = {
                'file_type': 'xlsx',
                'num_sheets': len(workbook.sheetnames),
                'sheet_names': workbook.sheetnames,
                'file_size': file_path.stat().st_size,
            }

            # Record the active sheet when the workbook has one.
            if workbook.active:
                metadata['active_sheet'] = workbook.active.title

            return ParseResult(content=content, metadata=metadata)

        except Exception as e:
            raise ValueError(f"XLSX 파싱 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's suffix is a supported Excel extension."""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """Excel file extensions handled by this parser."""
        return ['.xlsx', '.xlsm']
86 |
--------------------------------------------------------------------------------
/maru_lang/core/vector_db/base.py:
--------------------------------------------------------------------------------
1 | from abc import ABC, abstractmethod
2 | from typing import Any
3 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
4 |
5 |
class VectorDB(ABC):
    """Abstract interface that every vector-store backend implements.

    Only signatures are defined here; the precise semantics of each
    operation are set by the concrete backend and are not visible in this
    class.
    """

    @abstractmethod
    def drop_collection(self) -> None:
        """Drop the backend's collection."""
        pass

    @abstractmethod
    def add_documents(self, documents: list[dict]) -> None:
        """Add the given documents (dict schema is backend-defined — TODO confirm)."""
        pass

    @abstractmethod
    def sync_documents(self) -> None:
        """Synchronize documents; what is synchronized is backend-defined."""
        pass

    @abstractmethod
    def has_document(self, doc_id: str) -> bool:
        """Return whether a document with ``doc_id`` exists."""
        pass

    @abstractmethod
    def update_document(self, doc_id: str, new_doc_id: str, new_content: str) -> None:
        """Update the document ``doc_id`` using ``new_doc_id`` and ``new_content``."""
        pass

    @abstractmethod
    def delete_document(self, doc_id: str) -> None:
        """Delete the document identified by ``doc_id``."""
        pass

    @abstractmethod
    def delete_all_chunks_by_document_id(self, document_id: str) -> int:
        """Delete every chunk that belongs to the given document.

        Args:
            document_id: ID of the document whose chunks are removed.

        Returns:
            Number of chunks deleted.
        """
        pass

    @abstractmethod
    def count_documents(self) -> int:
        """Return the number of stored documents."""
        pass

    @abstractmethod
    def get_all_metadata(self) -> list[dict]:
        """Return the metadata dicts of the stored documents."""
        pass

    @abstractmethod
    def get_documents(self, document_ids: list[str]) -> list[RetrieveDocument]:
        """Return the documents matching ``document_ids``."""
        pass

    @abstractmethod
    def get_all_documents(
        self,
        version_ids: list[str] | None = None
    ) -> list[RetrieveDocument]:
        """
        Get all documents from VectorDB with optional version filter

        Args:
            version_ids: Optional list of version IDs to filter

        Returns:
            List of all documents (or filtered by version)
        """
        pass

    @abstractmethod
    def similarity_search(
        self,
        query_embedding: list[float],
        k: int,
        version_ids: list[str] | None = None,
        **kwargs: Any
    ) -> list[RetrieveDocument]:
        """
        Vector similarity search using query embedding

        Args:
            query_embedding: Query embedding vector
            k: Number of results to return
            version_ids: Optional list of version IDs to filter
            **kwargs: Additional backend-specific search parameters (the
                annotation describes each keyword's value, hence ``Any``)

        Returns:
            List of retrieved documents
        """
        pass

    @abstractmethod
    def health_check(self) -> bool:
        """
        Check that the vector store is reachable and accessible.

        Returns:
            bool: Whether the health check passed.

        Raises:
            Exception: Detailed error when the health check fails.
        """
        pass
106 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/csv_parser.py:
--------------------------------------------------------------------------------
1 | """CSV file parser."""
2 |
3 | import csv
4 | from pathlib import Path
5 | from .base import BaseParser, ParseResult
6 |
7 |
class CSVParser(BaseParser):
    """Parser for CSV / TSV files."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read a delimited text file and render it as formatted text.

        The dialect and the presence of a header row are sniffed from the
        first 1024 characters. With a header, the output starts with a
        ``Headers:`` line and a separator; rows are joined with `` | ``.

        Args:
            file_path: Path of the CSV file to parse.

        Returns:
            ParseResult: The formatted text and metadata (encoding, size,
            row and column counts, header flag).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If the file is not valid UTF-8, is empty, or cannot
                be parsed; the original error is attached as ``__cause__``.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            # newline='' lets the csv module handle line endings itself, so
            # quoted fields containing newlines are read correctly.
            # utf-8-sig decodes plain UTF-8 and drops a leading BOM, which
            # would otherwise end up inside the first header cell.
            with open(file_path, 'r', encoding='utf-8-sig', newline='') as f:
                # Auto-detect the CSV dialect from a sample.
                sample = f.read(1024)
                f.seek(0)
                sniffer = csv.Sniffer()

                try:
                    dialect = sniffer.sniff(sample)
                    has_header = sniffer.has_header(sample)
                except csv.Error:
                    # Detection failed: fall back to the dialect implied by
                    # the extension (tab for .tsv, comma otherwise).
                    if file_path.suffix.lower() == '.tsv':
                        dialect = csv.excel_tab
                    else:
                        dialect = csv.excel
                    has_header = True

                reader = csv.reader(f, dialect=dialect)
                rows = list(reader)

            if not rows:
                # Caught by the generic handler below and re-raised as the
                # parse-failure ValueError.
                raise ValueError("CSV 파일이 비어 있습니다")

            # Format as a table.
            if has_header and len(rows) > 1:
                headers = rows[0]
                data_rows = rows[1:]

                # Show the header separately from the data rows.
                content_lines = [
                    f"Headers: {', '.join(headers)}",
                    "=" * 80,
                ]
                content_lines.extend(
                    ' | '.join(str(cell) for cell in row) for row in data_rows
                )
            else:
                # No header (or a single row): emit every row as data.
                content_lines = [
                    ' | '.join(str(cell) for cell in row) for row in rows
                ]

            content = '\n'.join(content_lines)

            metadata = {
                'file_type': 'csv',
                'encoding': 'utf-8',
                'file_size': file_path.stat().st_size,
                'num_rows': len(rows),
                'num_columns': len(rows[0]) if rows else 0,
                'has_header': has_header,
            }

            return ParseResult(content=content, metadata=metadata)

        except UnicodeDecodeError as e:
            raise ValueError(f"UTF-8 인코딩 오류: {file_path}") from e
        except Exception as e:
            raise ValueError(f"CSV 파싱 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's suffix is a supported CSV extension."""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """CSV file extensions handled by this parser."""
        return ['.csv', '.tsv']
90 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/pptx_parser.py:
--------------------------------------------------------------------------------
1 | """PowerPoint file parser."""
2 |
3 | from pathlib import Path
4 | from .base import BaseParser, ParseResult
5 |
6 |
class PPTXParser(BaseParser):
    """PowerPoint file parser (uses python-pptx)."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Extract text from every slide of a PPTX file.

        Each slide becomes a section headed ``=== Slide <n> ===`` holding
        the text of its shapes, its table rows (cells joined by `` | ``)
        and, when present, its speaker notes prefixed with ``Notes:``.

        Args:
            file_path: Path of the PPTX file to parse.

        Returns:
            ParseResult: The extracted text and metadata (file type, slide
            count, file size, and slide dimensions when set).

        Raises:
            FileNotFoundError: If ``file_path`` does not exist.
            ValueError: If anything fails while loading or reading the
                presentation; the original error is attached as ``__cause__``.
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            try:
                from pptx import Presentation
            except ImportError:
                # NOTE: this is raised inside the outer ``try``, so callers
                # receive it wrapped in the ValueError below.
                raise ImportError(
                    "python-pptx가 설치되지 않았습니다. 'pip install python-pptx'로 설치하세요."
                )

            prs = Presentation(file_path)

            # Collect text slide by slide.
            slides_text = []
            for idx, slide in enumerate(prs.slides, 1):
                slide_content = [f"=== Slide {idx} ==="]

                # Text from every shape on the slide.
                for shape in slide.shapes:
                    if hasattr(shape, "text") and shape.text.strip():
                        slide_content.append(shape.text.strip())

                    # Tables are rendered one row per line.
                    if shape.has_table:
                        table = shape.table
                        for row in table.rows:
                            row_text = ' | '.join(cell.text.strip() for cell in row.cells)
                            if row_text.strip():
                                slide_content.append(row_text)

                # Speaker notes. A notes slide may have no notes placeholder,
                # in which case notes_text_frame is None; skip it rather than
                # failing the whole file with an AttributeError.
                if slide.has_notes_slide:
                    notes_frame = slide.notes_slide.notes_text_frame
                    notes_text = notes_frame.text.strip() if notes_frame is not None else ''
                    if notes_text:
                        slide_content.append(f"Notes: {notes_text}")

                slides_text.append('\n'.join(slide_content))

            content = '\n\n'.join(slides_text)

            metadata = {
                'file_type': 'pptx',
                'num_slides': len(prs.slides),
                'file_size': file_path.stat().st_size,
            }

            # Slide dimensions, when the presentation defines them.
            if prs.slide_width and prs.slide_height:
                metadata['slide_width'] = prs.slide_width
                metadata['slide_height'] = prs.slide_height

            return ParseResult(content=content, metadata=metadata)

        except Exception as e:
            raise ValueError(f"PPTX 파싱 실패: {file_path}") from e

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's suffix is a supported PowerPoint extension."""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """PowerPoint file extensions handled by this parser."""
        return ['.pptx']
85 |
--------------------------------------------------------------------------------
/maru_lang/models/vector_db.py:
--------------------------------------------------------------------------------
1 | """
2 | VectorDB 설정 모델
3 | """
4 | from dataclasses import dataclass, field
5 | from typing import Optional, Union
6 |
7 |
8 | # ========== VectorDB Config (상속 기반) ==========
9 |
@dataclass
class BaseVectorDBConfig:
    """Settings shared by every vector-store configuration.

    The only common field is the backend identifier.
    """

    # Identifier of the vector-store backend this configuration targets.
    db_type: str
14 |
15 |
@dataclass
class ChromaDBConfig(BaseVectorDBConfig):
    """ChromaDB-specific settings.

    ``db_type`` is fixed to ``"chromadb"`` and is not an ``__init__`` argument.
    """

    persist_dir: str = ""
    collection_name: str = ""
    db_type: str = field(default="chromadb", init=False)

    @classmethod
    def from_settings(cls) -> "ChromaDBConfig":
        """Build the default ChromaDB configuration from the system config."""
        # Imported lazily, as in the rest of this module.
        from maru_lang.configs.system_config import get_system_config

        vector_settings = get_system_config().vector_db
        return cls(
            persist_dir=vector_settings.chroma.get_persist_dir_absolute(),
            collection_name=vector_settings.default_collection_name,
        )
32 |
33 |
@dataclass
class MilvusConfig(BaseVectorDBConfig):
    """Milvus-specific settings.

    ``db_type`` is fixed to ``"milvus"`` and is not an ``__init__`` argument.
    """

    host: str = field(default="localhost")
    port: int = field(default=19530)
    user: str = field(default="root")
    # repr=False keeps the credential out of the auto-generated repr, so it
    # does not leak when a config object is logged or printed.
    password: str = field(default="Milvus", repr=False)
    collection_name: str = field(default="")
    db_type: str = field(default="milvus", init=False)

    @classmethod
    def from_settings(cls) -> "MilvusConfig":
        """Build the default Milvus configuration from the system config."""
        from maru_lang.configs.system_config import get_system_config
        config = get_system_config()
        return cls(
            host=config.vector_db.milvus.host,
            port=config.vector_db.milvus.port,
            user=config.vector_db.milvus.user,
            password=config.vector_db.milvus.password,
            collection_name=config.vector_db.default_collection_name,
        )
56 |
57 |
@dataclass
class PineconeConfig(BaseVectorDBConfig):
    """Pinecone-specific settings (reserved for future extension).

    ``db_type`` is fixed to ``"pinecone"`` and is not an ``__init__`` argument.
    """

    # repr=False keeps the API key out of the auto-generated repr, so it
    # does not leak when a config object is logged or printed.
    api_key: str = field(default="", repr=False)
    environment: str = field(default="")
    index_name: str = field(default="")
    db_type: str = field(default="pinecone", init=False)
65 |
66 |
def get_vector_db_config_from_settings() -> Union[ChromaDBConfig, MilvusConfig]:
    """
    Return the VectorDB configuration selected by ``vector_db.type``.

    The type is matched case-insensitively with surrounding whitespace
    ignored. ``"chroma"`` and ``"chromadb"`` both select ChromaDB: the
    latter is the identifier ``ChromaDBConfig.db_type`` itself carries, so
    it is accepted as an alias.

    Returns:
        ChromaDBConfig or MilvusConfig: Configuration object matching the
        configured VectorDB type.

    Raises:
        ValueError: If the configured VectorDB type is not supported.
    """
    from maru_lang.configs.system_config import get_system_config
    config = get_system_config()

    db_type = config.vector_db.type.strip().lower()

    if db_type in ("chroma", "chromadb"):
        return ChromaDBConfig.from_settings()
    elif db_type == "milvus":
        return MilvusConfig.from_settings()
    else:
        raise ValueError(
            f"Unsupported vector_db.type: {db_type}. "
            f"Supported types: 'chroma', 'milvus'"
        )
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/chunker_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Chunker configuration loader
3 | """
4 | from typing import Dict, Any, Optional
5 | from maru_lang.configs.base import DefaultConfigLoader
6 | from maru_lang.pluggable.models import ChunkerConfig
7 | from maru_lang.enums.configs import ConfigType
8 |
9 |
class ChunkerConfigLoader(DefaultConfigLoader[ChunkerConfig]):
    """Loads the chunker configuration from the user's config directory."""

    def __init__(self):
        # Chunkers deliberately have no base config: only the user config is
        # read, which forces settings to be explicit.
        super().__init__(ConfigType.CHUNKERS)

    def load_all(self) -> Dict[str, ChunkerConfig]:
        """Reset the loaded configs and read ``chunker_config.yaml`` from the user directory.

        Returns:
            The ``configs`` mapping after loading (empty when the file is
            missing or fails to load).
        """
        import logging

        log = logging.getLogger(__name__)

        # Start from a clean slate on every call; no base configs are used.
        self.configs = {}
        self._base_configs = {}

        log.info(f"Loading {self.config_type} configurations from user directory...")

        # Only chunker_config.yaml is read; user-defined chunker .py files
        # in the same directory are not treated as configs.
        yaml_path = self.user_dir / "chunker_config.yaml"
        if not yaml_path.exists():
            log.warning(f"Chunker config file not found: {yaml_path}")
        elif self._load_file(yaml_path, is_user=True):
            log.info(f"Loaded chunker config from {yaml_path}")
        else:
            log.warning(f"Failed to load chunker config from {yaml_path}")

        log.info(f"Loaded {len(self.configs)} {self.config_type} configs")

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[ChunkerConfig]:
        """Build a ``ChunkerConfig`` from parsed data.

        Returns:
            The config, or ``None`` (after printing the error to stderr)
            when construction raises.
        """
        try:
            parsed = ChunkerConfig(
                chunkers=data.get("chunkers", {}),
                source_path=source_path,
                is_override=is_user,
            )
        except Exception as e:
            import sys

            print(
                f"\n❌ ERROR: Error parsing chunker config from {source_path}: {e}",
                file=sys.stderr,
            )
            return None
        return parsed

    def get_config_name(self, config: ChunkerConfig) -> str:
        """Return the fixed key ``"config"``; there is a single config file."""
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Accept any data: there are no required fields."""
        return True

    def get_merged_config(self) -> ChunkerConfig:
        """
        Return the loaded chunker configuration.

        Returns:
            The entry stored under ``"config"``, or a default
            ``ChunkerConfig()`` when nothing was loaded.
        """
        return self.configs.get("config") or ChunkerConfig()
86 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/embedder_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Embedder configuration loader
3 | """
4 | from typing import Dict, Any, Optional
5 | from maru_lang.configs.base import DefaultConfigLoader
6 | from maru_lang.pluggable.models import EmbedderConfig
7 | from maru_lang.enums.configs import ConfigType
8 |
9 |
class EmbedderConfigLoader(DefaultConfigLoader[EmbedderConfig]):
    """Loads the embedder configuration from the user's config directory."""

    def __init__(self):
        # Embedders deliberately have no base config: only the user config
        # is read, which forces settings to be explicit.
        super().__init__(ConfigType.EMBEDDERS)

    def load_all(self) -> Dict[str, EmbedderConfig]:
        """Reset the loaded configs and read ``embedder_config.yaml`` from the user directory.

        Returns:
            The ``configs`` mapping after loading (empty when the file is
            missing or fails to load).
        """
        import logging

        log = logging.getLogger(__name__)

        # Start from a clean slate on every call; no base configs are used.
        self.configs = {}
        self._base_configs = {}

        log.info(f"Loading {self.config_type} configurations from user directory...")

        # Only embedder_config.yaml is read.
        yaml_path = self.user_dir / "embedder_config.yaml"
        if not yaml_path.exists():
            log.warning(f"Embedder config file not found: {yaml_path}")
        elif self._load_file(yaml_path, is_user=True):
            log.info(f"Loaded embedder config from {yaml_path}")
        else:
            log.warning(f"Failed to load embedder config from {yaml_path}")

        log.info(f"Loaded {len(self.configs)} {self.config_type} configs")

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[EmbedderConfig]:
        """Build an ``EmbedderConfig`` from parsed data.

        Returns:
            The config, or ``None`` (after printing the error to stderr)
            when construction raises.
        """
        try:
            # The deprecated 'models' field is ignored for backward compatibility.
            parsed = EmbedderConfig(
                default_model=data.get("default_model"),
                device=data.get("device"),
                source_path=source_path,
                is_override=is_user,
            )
        except Exception as e:
            import sys

            print(
                f"\n❌ ERROR: Error parsing embedder config from {source_path}: {e}",
                file=sys.stderr,
            )
            return None
        return parsed

    def get_config_name(self, config: EmbedderConfig) -> str:
        """Return the fixed key ``"config"``; there is a single config file."""
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Accept any data: there are no required fields."""
        return True

    def get_merged_config(self) -> EmbedderConfig:
        """
        Return the loaded embedder configuration.

        Returns:
            The entry stored under ``"config"``, or a default
            ``EmbedderConfig()`` when nothing was loaded.
        """
        return self.configs.get("config") or EmbedderConfig()
88 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/loader_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Loader configuration loader
3 | """
4 | from typing import Dict, Any, Optional
5 | from maru_lang.configs.base import DefaultConfigLoader
6 | from maru_lang.pluggable.models import LoaderConfig
7 | from maru_lang.enums.configs import ConfigType
8 |
9 |
class LoaderConfigLoader(DefaultConfigLoader[LoaderConfig]):
    """Loads the loader (parser) configuration from the user's config directory."""

    def __init__(self):
        # Loaders deliberately have no base config: only the user config is
        # read, which forces settings to be explicit.
        super().__init__(ConfigType.LOADERS)

    def load_all(self) -> Dict[str, LoaderConfig]:
        """Reset the loaded configs and read ``loader_config.yaml`` from the user directory.

        Returns:
            The ``configs`` mapping after loading (empty when the file is
            missing or fails to load).
        """
        import logging

        log = logging.getLogger(__name__)

        # Start from a clean slate on every call; no base configs are used.
        self.configs = {}
        self._base_configs = {}

        log.info(f"Loading {self.config_type} configurations from user directory...")

        # Only loader_config.yaml is read; user-defined parser .py files in
        # the same directory are not treated as configs.
        yaml_path = self.user_dir / "loader_config.yaml"
        if not yaml_path.exists():
            log.warning(f"Loader config file not found: {yaml_path}")
        elif self._load_file(yaml_path, is_user=True):
            log.info(f"Loaded loader config from {yaml_path}")
        else:
            log.warning(f"Failed to load loader config from {yaml_path}")

        log.info(f"Loaded {len(self.configs)} {self.config_type} configs")

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[LoaderConfig]:
        """Build a ``LoaderConfig`` from parsed data.

        Missing keys fall back to ``"txt"`` (loader), ``"paragraph"``
        (chunker) and an empty extensions mapping.

        Returns:
            The config, or ``None`` (after printing the error to stderr)
            when construction raises.
        """
        try:
            parsed = LoaderConfig(
                default_loader=data.get("default_loader", "txt"),
                default_chunker=data.get("default_chunker", "paragraph"),
                extensions=data.get("extensions", {}),
                source_path=source_path,
                is_override=is_user,
            )
        except Exception as e:
            import sys

            print(
                f"\n❌ ERROR: Error parsing loader config from {source_path}: {e}",
                file=sys.stderr,
            )
            return None
        return parsed

    def get_config_name(self, config: LoaderConfig) -> str:
        """Return the fixed key ``"config"``; there is a single config file."""
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Accept any data: there are no required fields."""
        return True

    def get_merged_config(self) -> LoaderConfig:
        """
        Return the loaded loader configuration.

        Returns:
            The entry stored under ``"config"``, or a default
            ``LoaderConfig()`` when nothing was loaded.
        """
        return self.configs.get("config") or LoaderConfig()
88 |
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/auth.py:
--------------------------------------------------------------------------------
1 | from __future__ import annotations
2 | import datetime
3 | from tortoise.models import Model
4 | from tortoise import fields
5 |
6 |
class User(Model):
    """User account stored in the "user" table."""

    id = fields.IntField(pk=True)
    # Display name; optional.
    name = fields.CharField(
        max_length=255,
        index=True,
        null=True,
    )
    # E-mail address; must be unique across users.
    email = fields.CharField(
        max_length=255,
        index=True,
        unique=True,
    )
    # Optional role; reachable from UserRole through "users".
    role = fields.ForeignKeyField(
        "models.UserRole",
        related_name="users",
        null=True,
    )
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "user"
17 |
18 |
class UserGroup(Model):
    """Named group of users, stored in the "user_group" table."""

    id = fields.IntField(pk=True)
    name = fields.CharField(
        max_length=255,
        unique=True,
    )
    # RESTRICT blocks deleting a User who still manages UserGroups.
    manager = fields.ForeignKeyField(
        "models.User",
        related_name="managed_user_groups",
        on_delete=fields.RESTRICT,
    )
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "user_group"
31 |
32 |
class UserGroupMembership(Model):
    """Link row associating one User with one UserGroup."""

    # Reverse accessor on User: "group_memberships".
    user = fields.ForeignKeyField(
        "models.User",
        related_name="group_memberships",
    )
    # Reverse accessor on UserGroup: "members".
    group = fields.ForeignKeyField(
        "models.UserGroup",
        related_name="members",
    )

    class Meta:
        table = "user_group_membership"
43 |
44 |
class UserGroupInclusion(Model):
    """Link row recording that one UserGroup includes another."""

    # Parent (upper-level) group.
    parent = fields.ForeignKeyField(
        "models.UserGroup",
        related_name="includes",
    )
    # Child (lower-level) group.
    child = fields.ForeignKeyField(
        "models.UserGroup",
        related_name="included_by",
    )

    class Meta:
        table = "user_group_inclusion"
53 |
54 |
class OTP(Model):
    """One-time verification code issued for an e-mail address."""

    id = fields.IntField(pk=True)
    email = fields.CharField(
        max_length=255,
        index=True,
    )
    code = fields.CharField(max_length=6)
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "otp"

    async def is_valid(self):
        """Return True while the code is less than 5 minutes old."""
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        lifetime = datetime.timedelta(minutes=5)
        return self.created_at + lifetime > now_utc
68 |
69 |
class UserToken(Model):
    """JWT issued to a user for a specific device."""

    id = fields.IntField(pk=True)
    # Unique identifier of the user.
    user_id = fields.CharField(
        max_length=255,
        index=True,
    )
    # Unique identifier of the device.
    device_id = fields.CharField(
        max_length=255,
        index=True,
    )
    # The JWT itself.
    jwt_token = fields.TextField()
    created_at = fields.DatetimeField(auto_now_add=True)

    class Meta:
        table = "user_token"
79 |
80 |
class RefreshToken(Model):
    """Refresh token issued to a user for a specific device."""

    id = fields.IntField(pk=True)
    user_id = fields.CharField(
        max_length=255,
        index=True,
    )
    device_id = fields.CharField(
        max_length=255,
        index=True,
    )
    # The issued refresh-token string.
    refresh_token = fields.TextField()
    created_at = fields.DatetimeField(auto_now_add=True)
    # No default: the expiry must be supplied when the row is created.
    expires_at = fields.DatetimeField()

    class Meta:
        table = "refresh_token"
91 |
92 |
class UserRole(Model):
    """Role that can be assigned to users, stored in "user_role"."""

    id = fields.IntField(pk=True)
    # Role name; must be unique.
    name = fields.CharField(
        max_length=255,
        index=True,
        unique=True,
    )
    # Optional free-text description of the role.
    description = fields.TextField(null=True)

    class Meta:
        table = "user_role"
100 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/llm_reranker.yaml:
--------------------------------------------------------------------------------
1 | # LLM Reranker Agent Configuration
2 | # Agent that uses LLM to rerank search results based on relevance to the query
3 | # NOTE: This is a utility agent, not selectable by agent_selector
4 |
5 | name: llm_reranker
6 | description: "Uses LLM to evaluate and rerank search results based on relevance"
7 | type: utility
8 | enabled: true
9 | version: "1.0.0"
10 |
11 | # Priority (lower priority - runs after retrieval)
12 | priority: 50
13 |
14 | # LLM Settings
15 | target_llm_config:
16 | server_name: "openai"
17 | override_params:
18 | temperature: 0.0 # Keep temperature at 0 for consistent scoring
19 | max_tokens: 1000 # Allow enough tokens for scoring multiple documents
20 | timeout: 30.0 # Longer timeout for processing multiple documents
21 |
22 | fallback_strategy: "any_available"
23 |
24 | # Prompt Settings
25 | prompts:
26 | system_prompt: |
27 | You are an expert at evaluating document relevance to search queries.
28 |
29 | Responsibilities:
30 | - Analyze the semantic relevance between a query and documents
31 | - Assign relevance scores to each document (0.0 to 1.0)
32 | - Consider both semantic meaning and keyword matching
33 | - Be strict in scoring - only highly relevant documents should get high scores
34 |
35 | Scoring Guidelines:
36 | - 1.0: Perfect match, directly answers the query
37 | - 0.8-0.9: Highly relevant, contains most information needed
38 | - 0.6-0.7: Moderately relevant, contains some useful information
39 | - 0.4-0.5: Weakly relevant, tangentially related
40 | - 0.0-0.3: Not relevant or off-topic
41 |
42 | user_prompt_template: |
43 | Evaluate the relevance of each document to the given query and assign scores.
44 |
45 | Query: {query}
46 |
47 | Documents:
48 | {documents}
49 |
50 | Important rules:
51 | 1. Assign a relevance score (0.0 to 1.0) to each document based on how well it answers the query.
52 | 2. Document indices must match the input (0, 1, 2, ...).
53 | 3. Be strict in scoring - reserve high scores (>0.8) for truly relevant documents.
54 | 4. Consider semantic meaning, not just keyword overlap.
55 | 5. Return all documents with their scores, even if score is 0.0.
56 |
57 | You must return a JSON object as a tool call following the definition below. Use the keys document_scores and reasoning exactly as written.
58 |
59 | # Implementation class (Python file in rerankers/)
60 | implementation: rerankers.llm_reranker.LLMRerankerAgent
61 |
62 | # Agent Settings
63 | config:
64 | timeout: 30
65 | retry_count: 1
66 |
67 | # Tool schema definition (JSON)
68 | tools:
69 | llm_reranker:
70 | description: "Reranks documents based on relevance to query using LLM evaluation"
71 | parameters:
72 | type: "object"
73 | properties:
74 | document_scores:
75 | type: "array"
76 | items:
77 | type: "object"
78 | properties:
79 | index:
80 | type: "integer"
81 | description: "Document index (0-based)"
82 | score:
83 | type: "number"
84 | minimum: 0.0
85 | maximum: 1.0
86 | description: "Relevance score (0.0 to 1.0)"
87 | description: "List of documents with their relevance scores"
88 | reasoning:
89 | type: "string"
90 | description: "Brief explanation of the scoring rationale"
91 | required: ["document_scores", "reasoning"]
92 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/agent_factory.py:
--------------------------------------------------------------------------------
1 | """
2 | Agent Factory - Creates and configures agents based on configuration
3 | """
4 | from typing import Dict, Optional, List
5 | from maru_lang.pluggable.agents.base import BaseAgent
6 | from maru_lang.pluggable.agents.registry import get_registry
7 | from maru_lang.configs.manager import get_config_manager
8 | from maru_lang.pluggable.models import AgentConfig
9 | from maru_lang.pluggable.agents.mcp_client_agent import MCPClientAgent
10 |
11 |
class AgentFactory:
    """
    Builds agent instances from the classes and configs held by the registry.
    """

    def __init__(
        self,
    ):
        """
        Bind the factory to the shared config manager and agent registry.
        """
        # The config manager is stored but not used by any method of this class.
        self.config_manager = get_config_manager()
        self.registry = get_registry()

    def create_agent(
        self,
        agent_name: str,
        agent_config: AgentConfig
    ) -> Optional[BaseAgent]:
        """
        Instantiate a single agent.

        Args:
            agent_name: Registry name of the agent to create
            agent_config: Configuration handed to the agent as ``config``

        Returns:
            The new agent, or None when the name is not registered or the
            construction fails (the reason is printed, not raised)
        """
        agent_cls = self.registry.get_agent_class(agent_name)
        if not agent_cls:
            print(f"Agent not found in registry: {agent_name}")
            return None

        try:
            # MCP client agents cannot work without their mcp_config section.
            requires_mcp = issubclass(agent_cls, MCPClientAgent)
            if requires_mcp and not agent_config.mcp_config:
                raise ValueError(
                    f"MCP agent {agent_name} missing mcp_config")
            instance = agent_cls(
                name=agent_name,
                config=agent_config,  # the full agent config is passed through
            )
        except Exception as exc:
            print(f"Error creating agent {agent_name}: {exc}")
            return None

        return instance

    def create_agents_from_config(self) -> Dict[str, BaseAgent]:
        """
        Instantiate every agent listed by the registry.

        Agents without a config are reported and skipped. An agent that has
        a config but cannot be created aborts the whole run.

        Returns:
            Dictionary of agent instances keyed by agent name

        Raises:
            Exception: If any configured agent fails to be created
        """
        created = {}

        for name in self.registry.list_agents():
            config = self.registry.get_agent_config(name)
            if not config:
                print(
                    f"[ERROR AgentFactory] Agent config not found: {name}")
                continue

            instance = self.create_agent(name, config)
            if not instance:
                print(
                    f"[ERROR AgentFactory] Failed to create agent: {name}")
                raise Exception(f"Failed to create agent: {name}")
            created[name] = instance

        return created

    def list_available_agents(self) -> List[str]:
        """Return the agent names currently known to the registry."""
        return self.registry.list_agents()

    def reload_agents(self) -> None:
        """Ask the registry to reload its agents."""
        self.registry.reload()
96 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/configs/reranker_config.py:
--------------------------------------------------------------------------------
1 | """
2 | Reranker Configuration Loader
3 | """
4 | import logging
5 | from typing import Dict, Any, Optional
6 | from maru_lang.enums.configs import ConfigType
7 | from maru_lang.pluggable.models import RerankerConfig
8 | from maru_lang.configs.base import DefaultConfigLoader
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
class RerankerConfigLoader(DefaultConfigLoader[RerankerConfig]):
    """Loads the reranker configuration from the user directory."""

    def __init__(self):
        super().__init__(ConfigType.RERANKERS)

    def load_all(self) -> Dict[str, RerankerConfig]:
        """Reset the loader and read ``reranker_config.yaml`` from the user directory.

        Only that one file is read; other YAML files in the directory (for
        example agent definitions such as llm_reranker.yaml) are ignored.

        Returns:
            The ``self.configs`` mapping after loading.
        """
        self.configs = {}
        self._base_configs = {}

        logger.info(f"Loading {self.config_type} configurations from user directory...")

        config_file = self.user_dir / "reranker_config.yaml"
        if not config_file.exists():
            logger.warning(f"Reranker config file not found: {config_file}")
        elif self._load_file(config_file, is_user=True):
            logger.info(f"Loaded reranker config from {config_file}")
        else:
            logger.warning(f"Failed to load reranker config from {config_file}")

        logger.info(
            f"Loaded {len(self.configs)} {self.config_type} configs"
        )

        return self.configs

    def parse_config(
        self, data: Dict[str, Any], source_path: str, is_user: bool
    ) -> Optional[RerankerConfig]:
        """Build a RerankerConfig from raw configuration data.

        Args:
            data: Mapping read from the reranker config file.
            source_path: Location the data was read from.
            is_user: Stored on the resulting config as ``is_override``.

        Returns:
            The parsed RerankerConfig, or None on any failure (the error is
            printed to stderr instead of being raised).
        """
        import sys

        try:
            # A legacy 'models' key is deliberately not read (deprecated,
            # tolerated for backward compatibility).
            reranker_fields = {
                "enabled": data.get("enabled", True),
                "method": data.get("method", "model"),
                "default_model": data.get("default_model", "BAAI/bge-reranker-v2-m3"),
                "agent_name": data.get("agent_name"),
                "top_k": data.get("top_k"),
            }
            parsed = RerankerConfig(
                source_path=source_path,
                is_override=is_user,
                **reranker_fields,
            )
        except Exception as exc:
            failure = f"Error parsing reranker config from {source_path}: {exc}"
            print(f"\n❌ ERROR: {failure}", file=sys.stderr)
            return None

        return parsed

    def get_config_name(self, config: RerankerConfig) -> str:
        """Return the fixed key used for the single reranker config."""
        # Only one config file exists, so the name never varies.
        return "config"

    def validate_config(self, data: Dict[str, Any]) -> bool:
        """Accept any dict; reject (and log) everything else.

        There are no required fields, so the only check is the container type.
        """
        if isinstance(data, dict):
            return True
        logger.error(f"Reranker config data is not a dict: {type(data)}")
        return False

    def get_merged_config(self) -> RerankerConfig:
        """
        Return the effective reranker configuration.

        The entry stored under "config" is returned unchanged; no merging
        is performed in this method.

        Returns:
            The stored RerankerConfig, or a default-constructed one when no
            entry (or a falsy one) is present
        """
        loaded = self.configs.get("config")
        return loaded if loaded else RerankerConfig()
92 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/loaders/xml_parser.py:
--------------------------------------------------------------------------------
1 | """XML file parser."""
2 |
3 | import xml.etree.ElementTree as ET
4 | from pathlib import Path
5 | from .base import BaseParser, ParseResult
6 |
7 |
class XMLParser(BaseParser):
    """Parser that renders an XML document as indented, readable text."""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Read an XML file and convert it into formatted text.

        Args:
            file_path: Path of the XML file to parse

        Returns:
            ParseResult: The rendered text plus metadata (file type,
            encoding, file size, root tag, element count and, when any are
            used, the namespace URIs found in the document)

        Raises:
            FileNotFoundError: If file_path does not exist
            ValueError: If the document is not well-formed or cannot be read
        """
        if not file_path.exists():
            raise FileNotFoundError(f"파일을 찾을 수 없습니다: {file_path}")

        try:
            # NOTE(review): xml.etree.ElementTree is not hardened against
            # maliciously constructed XML (e.g. entity-expansion attacks).
            # If untrusted uploads reach this parser, consider a hardened one.
            tree = ET.parse(file_path)
            root = tree.getroot()

            # Render the element tree as indented text.
            lines: list[str] = []
            self._element_to_text(root, lines, level=0)
            content = '\n'.join(lines)

            # One pass over the tree: count elements and collect namespace
            # URIs in first-seen order (dict keys keep order, lookups are O(1)).
            num_elements = 0
            namespace_uris: dict[str, None] = {}
            for elem in root.iter():
                num_elements += 1
                tag = elem.tag
                if isinstance(tag, str) and '}' in tag:
                    namespace_uris.setdefault(tag.split('}')[0][1:], None)

            metadata = {
                'file_type': 'xml',
                # The stdlib ElementTree has no ``docinfo`` (an lxml feature),
                # so the encoding is read from the file header instead.
                'encoding': self._detect_encoding(file_path),
                'file_size': file_path.stat().st_size,
                'root_tag': root.tag,
                'num_elements': num_elements,
            }

            if namespace_uris:
                metadata['namespaces'] = {
                    f'ns{position}': uri
                    for position, uri in enumerate(namespace_uris)
                }

            return ParseResult(content=content, metadata=metadata)

        except ET.ParseError as e:
            raise ValueError(f"XML 파싱 실패: {file_path} - {str(e)}") from e
        except Exception as e:
            raise ValueError(f"파일 읽기 실패: {file_path}") from e

    @staticmethod
    def _detect_encoding(file_path: Path) -> str:
        """
        Determine the document encoding from the start of the file.

        Args:
            file_path: Path of the XML file to inspect

        Returns:
            'utf-16' when the file starts with a UTF-16 byte-order mark, the
            lower-cased name from the XML declaration when one is present,
            otherwise 'utf-8'
        """
        import codecs
        import re

        with open(file_path, 'rb') as handle:
            head = handle.read(256)

        if head.startswith((codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE)):
            return 'utf-16'
        if head.startswith(codecs.BOM_UTF8):
            head = head[len(codecs.BOM_UTF8):]

        declared = re.match(
            rb"""\s*<\?xml[^>]*?encoding\s*=\s*["']([A-Za-z][A-Za-z0-9._-]*)["']""",
            head,
        )
        if declared:
            return declared.group(1).decode('ascii').lower()
        return 'utf-8'

    def _element_to_text(self, element: ET.Element, lines: list[str], level: int = 0) -> None:
        """
        Recursively append a text rendering of an element to ``lines``.

        Args:
            element: XML element to render
            lines: Output list that receives one entry per rendered line
            level: Indentation level (two spaces per level)
        """
        indent = "  " * level
        tag = element.tag

        # Drop the "{namespace}" prefix for readability.
        if '}' in tag:
            tag = tag.split('}')[1]

        # Attributes are rendered as " [k=v, ...]" after the tag name.
        attrs = ''
        if element.attrib:
            attrs = ' [' + ', '.join(f'{k}={v}' for k, v in element.attrib.items()) + ']'

        # Text content directly inside the element.
        text = (element.text or '').strip()

        if text:
            lines.append(f"{indent}<{tag}{attrs}>: {text}")
        else:
            lines.append(f"{indent}<{tag}{attrs}>")

        # Children are rendered one level deeper.
        for child in element:
            self._element_to_text(child, lines, level + 1)

        # Tail text (text that follows this element's closing tag).
        tail = (element.tail or '').strip()
        if tail:
            lines.append(f"{indent}  {tail}")

    def supports(self, file_path: Path) -> bool:
        """Return True when the file's extension is one this parser handles."""
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """File extensions handled by this parser."""
        return ['.xml', '.xhtml', '.svg']
108 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/agents/builtin/intent_extractor.py:
--------------------------------------------------------------------------------
1 | """
2 | Intent Extractor Agent - Extracts user intent and rewrites queries for search
3 | """
4 | from typing import Dict, Any, Optional
5 | from maru_lang.pluggable.agents.base import BaseAgent, AgentResult
6 | from maru_lang.models.chat import ChatHistory
7 |
8 |
class IntentExtractorAgent(BaseAgent):
    """Agent for extracting user intent and rewriting queries for document search"""

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    async def _setup(self) -> None:
        """Initialize intent extraction capabilities (nothing to set up)"""
        pass

    async def execute(
        self,
        question: str,
        chat_history: ChatHistory,
        **kwargs
    ) -> AgentResult:
        """
        Execute intent extraction and query rewriting

        Args:
            question: User's new question/message
            chat_history: Previous conversation context
            **kwargs: Additional parameters (not used by this method)

        Returns:
            AgentResult whose ``result`` is the rewritten query. If rewriting
            fails for any reason the original question is returned instead,
            still with ``success=True``; ``metadata['extraction_method']`` is
            then 'fallback' and ``metadata['error']`` holds the error text.
        """
        # Evaluated once and defensively: the fallback path below must not
        # fail itself when chat_history is None or lacks ``messages``.
        has_context = bool(getattr(chat_history, "messages", None))

        try:
            # Format the prompt with dialogue context
            rewritten_query = await self._extract_intent_and_rewrite(
                question,
                chat_history
            )

            return AgentResult(
                success=True,
                result=rewritten_query,  # primary output: the rewritten question
                data={
                    'original_question': question,
                    'rewritten_question': rewritten_query,
                    'has_context': has_context,
                    'extracted_intent': True,
                },
                metadata={
                    'extraction_method': 'llm_based',
                }
            )

        except Exception as e:
            # Fallback to original question
            return AgentResult(
                success=True,  # Still successful, but using fallback
                result=question,  # primary output: the original question
                data={
                    'original_question': question,
                    'rewritten_question': question,  # Use original as fallback
                    'has_context': has_context,
                    'extracted_intent': False,
                },
                metadata={
                    'extraction_method': 'fallback',
                    'error': str(e)
                }
            )

    async def _extract_intent_and_rewrite(
        self,
        question: str,
        chat_history: ChatHistory,
    ) -> str:
        """Extract intent and rewrite query using LLM with fallback"""
        # Prompts come from the agent's configuration.
        prompts = self.config.prompts

        # Fill the question and the rendered history into the user template.
        user_prompt = prompts.user_prompt_template.format(
            question=question,
            history_text=chat_history.to_string()
        )

        override_params = self.get_override_params()

        # Use request_with_fallback for automatic LLM fallback
        response = await self.request_with_fallback(
            user_prompt=user_prompt,
            system_prompt=prompts.system_prompt,
            **override_params,
        )

        return response.strip()
101 |
--------------------------------------------------------------------------------
/maru_lang/templates/python/custom_parser.py:
--------------------------------------------------------------------------------
1 | """
2 | Custom parser template - Copy this file and remove .sample extension
3 |
4 | This is a template for creating custom document parsers.
5 | Implement the BaseParser interface to add support for new file formats.
6 | """
7 |
8 | from pathlib import Path
9 | from maru_lang.pluggable.loaders.base import BaseParser, ParseResult
10 |
11 |
class CustomParser(BaseParser):
    """
    Starting point for a custom file parser.

    Copy this class and adapt it to support a new file format.
    """

    def parse(self, file_path: Path) -> ParseResult:
        """
        Parse the file and extract its textual content.

        Args:
            file_path: Path to the file to parse

        Returns:
            ParseResult: Parsed text and metadata

        Raises:
            ValueError: Raised when the file cannot be read or parsed
            FileNotFoundError: Raised when the file does not exist
        """
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            # Put your own parsing logic here, e.g. turning JSON, XML or
            # CSV into plain text. The template simply reads the file as
            # UTF-8 text.
            text = file_path.read_text(encoding='utf-8')

            # Optional metadata; extend with whatever is useful downstream.
            details = {
                'file_type': 'custom',
                'file_size': file_path.stat().st_size,
            }

            return ParseResult(content=text, metadata=details)

        except Exception as exc:
            raise ValueError(f"Failed to parse file: {file_path}") from exc

    def supports(self, file_path: Path) -> bool:
        """
        Tell whether this parser can handle the given file.

        Args:
            file_path: Path of the file to check

        Returns:
            bool: True if the lower-cased extension is listed in
            supported_extensions, otherwise False
        """
        extension = file_path.suffix.lower()
        return extension in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """
        File extensions handled by this parser

        Returns:
            list[str]: Supported extensions (e.g., ['.json', '.jsonl'])
        """
        # Replace with the extensions your parser supports.
        return ['.custom', '.cst']
77 |
78 |
79 | # Example: JSON parser
class JsonParser(BaseParser):
    """Example parser for JSON files"""

    def parse(self, file_path: Path) -> ParseResult:
        """
        Load a JSON file and return it as pretty-printed text.

        Args:
            file_path: Path to the JSON file to parse

        Returns:
            ParseResult: The re-serialized JSON (2-space indent, non-ASCII
            characters kept as-is) and basic metadata

        Raises:
            ValueError: Raised when the content is not valid JSON or the
                file cannot be read
            FileNotFoundError: Raised when the file does not exist
        """
        import json

        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)

            # Convert JSON into formatted text
            content = json.dumps(data, indent=2, ensure_ascii=False)

            metadata = {
                'file_type': 'json',
                'file_size': file_path.stat().st_size,
            }

            return ParseResult(content=content, metadata=metadata)

        except json.JSONDecodeError as e:
            raise ValueError(f"JSON parse error: {file_path}") from e
        except Exception as e:
            raise ValueError(f"Failed to read file: {file_path}") from e

    # NOTE(review): the two members below complete the example so it mirrors
    # CustomParser; presumably BaseParser expects them -- confirm against base.py.
    def supports(self, file_path: Path) -> bool:
        """
        Tell whether this parser can handle the given file.

        Args:
            file_path: Path of the file to check

        Returns:
            bool: True if the lower-cased extension is listed in
            supported_extensions, otherwise False
        """
        return file_path.suffix.lower() in self.supported_extensions

    @property
    def supported_extensions(self) -> list[str]:
        """
        File extensions handled by this parser

        Returns:
            list[str]: ['.json']
        """
        return ['.json']
107 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [project]
2 | name = "maru-lang"
3 | version = "0.0.0"
4 | description = "Advanced LLM-powered chatbot with RAG, multi-agent system, and enterprise features"
5 | requires-python = ">=3.10"
6 | readme = "README.md"
7 | license = {text = "MIT"}
8 | authors = [
9 | {name = "KC ML2"},
10 | ]
11 | keywords = ["llm", "chatbot", "rag", "ai", "fastapi", "agents"]
12 | classifiers = [
13 | "Development Status :: 4 - Beta",
14 | "Intended Audience :: Developers",
15 | "License :: OSI Approved :: MIT License",
16 | "Programming Language :: Python :: 3",
17 | "Programming Language :: Python :: 3.10",
18 | "Programming Language :: Python :: 3.11",
19 | "Programming Language :: Python :: 3.12",
20 | "Programming Language :: Python :: 3.13",
21 | "Topic :: Software Development :: Libraries :: Python Modules",
22 | "Topic :: Scientific/Engineering :: Artificial Intelligence",
23 | ]
24 |
25 | dependencies = [
26 | "fastapi>=0.100.0",
27 | "uvicorn[standard]>=0.23.0",
28 | "typer[all]>=0.9.0",
29 | "tortoise-orm[asyncpg]>=0.20.0",
30 | "aerich>=0.7.2",
31 | "python-jose[cryptography]>=3.3.0",
32 | "passlib>=1.7.4",
33 | "httpx>=0.24.0",
34 | "sentence-transformers>=2.2.0",
35 | "fastapi-pagination>=0.12.0",
36 | "rank-bm25>=0.2.2",
37 | "chromadb>=0.4.0",
38 | "konlpy>=0.6.0",
39 | "mcp>=0.9.0",
40 | "pyyaml>=6.0.0",
41 | # Document parsers
42 | "PyPDF2>=3.0.1",
43 | "python-docx>=0.8.11",
44 | "python-pptx>=0.6.21",
45 | "openpyxl>=3.0.0",
46 | "beautifulsoup4>=4.12.0",
47 | ]
48 |
49 | [project.optional-dependencies]
50 | # Vector database backends (alternative to chromadb)
51 | vector-db = [
52 | "pymilvus[model]>=2.3.0",
53 | ]
54 |
55 | # Email integration
56 | email = [
57 | "O365>=2.0.0",
58 | ]
59 |
60 | # Development / testing
61 | dev = [
62 | "aerich[toml]>=0.7.0",
63 | "pytest>=7.0.0",
64 | "pytest-asyncio>=0.21.0",
65 | "pytest-cov>=4.0.0",
66 | "notebook>=6.5.0",
67 | ]
68 |
69 | # Bundle of all optional features
70 | all = [
71 | "pymilvus[model]>=2.3.0",
72 | "O365>=2.0.0",
73 | ]
74 |
75 | [build-system]
76 | requires = ["setuptools>=61.0", "wheel"]
77 | build-backend = "setuptools.build_meta"
78 |
79 | [tool.setuptools]
80 | packages = {find = {where = ["."], include = ["maru_lang*"]}}
81 | include-package-data = true
82 |
83 | [tool.setuptools.package-data]
84 | maru_lang = [
85 | "templates/**/*.yaml",
86 | "templates/**/*.yml",
87 | "templates/**/*.py",
88 | "templates/**/*.md",
89 | "py.typed",
90 | ]
91 |
92 | [tool.pytest.ini_options]
93 | asyncio_mode = "auto"
94 | asyncio_default_fixture_loop_scope = "function"
95 | testpaths = ["tests"]
96 | python_files = "test_*.py"
97 | python_classes = "Test*"
98 | python_functions = "test_*"
99 | filterwarnings = [
100 | "ignore::DeprecationWarning",
101 | "ignore::UserWarning",
102 | ]
103 | markers = [
104 | "slow: long-running tests",
105 | "integration: integration tests",
106 | ]
107 |
108 | [tool.coverage.run]
109 | source = ["maru_lang"]
110 | omit = [
111 | "maru_lang/migrations/*",
112 | "maru_lang/alembic/*",
113 | "maru_lang/scripts/*",
114 | "maru_lang/tests/*",
115 | ]
116 |
117 | [tool.coverage.report]
118 | exclude_lines = [
119 | "pragma: no cover",
120 | "def __repr__",
121 | "raise NotImplementedError",
122 | "if __name__ == '__main__':",
123 | "pass",
124 | "raise ImportError",
125 | ]
126 |
127 | [project.scripts]
128 | maru = "maru_lang.cli:app"
129 |
130 | [tool.aerich]
131 | tortoise_orm = "maru_lang.core.relation_db.TORTOISE_ORM"
132 | location = "./migrations"
133 | src_folder = "./."
--------------------------------------------------------------------------------
/maru_lang/pluggable/chunkers/paragraph.py:
--------------------------------------------------------------------------------
1 | from typing import List
2 | from maru_lang.models.ingest import ChunkInput
3 | from .base import BaseChunker
4 |
5 |
class ParagraphChunker(BaseChunker):
    """Chunker that splits text into paragraphs (separated by a blank line)."""

    name = "paragraph"
    description = "문단 단위로 청킹 (빈 줄 기준 분리)"

    def __init__(self, max_chunk_size: int = 2000):
        self.max_chunk_size = max_chunk_size

    def chunk(self, text: str) -> List[ChunkInput]:
        """Split ``text`` into numbered chunks of at most ``max_chunk_size`` characters.

        Paragraphs that fit are kept whole. Longer ones are broken into
        sentences that are packed back together (joined with a space) up to
        the limit; a sentence that is itself too long is cut at fixed offsets.
        Chunk numbers start at 1.
        """
        limit = self.max_chunk_size

        pieces = []
        for raw_paragraph in text.split("\n\n"):
            paragraph = raw_paragraph.strip()
            if not paragraph:
                continue
            if len(paragraph) > limit:
                pieces.extend(self._pack_sentences(paragraph))
            else:
                pieces.append(paragraph)

        # Safety net: anything still over the limit is cut at fixed offsets.
        bounded = []
        for piece in pieces:
            if len(piece) > limit:
                bounded.extend(self._force_split(piece, limit))
            else:
                bounded.append(piece)

        return [
            ChunkInput(number=position, content=body)
            for position, body in enumerate(bounded, start=1)
        ]

    def _pack_sentences(self, paragraph: str) -> List[str]:
        """Break an oversized paragraph into sentence groups that respect the limit."""
        limit = self.max_chunk_size
        packed = []
        buffer = []
        buffer_len = 0

        for sentence in self._split_by_sentences(paragraph):
            size = len(sentence)

            if size > limit:
                # Flush what has been gathered, then cut the long sentence.
                if buffer:
                    packed.append(" ".join(buffer))
                    buffer, buffer_len = [], 0
                packed.extend(self._force_split(sentence, limit))
                continue

            # One extra character for the joining space, unless the buffer is empty.
            separator = 1 if buffer else 0
            if buffer_len + size + separator > limit:
                if buffer:
                    packed.append(" ".join(buffer))
                buffer, buffer_len = [sentence], size
            else:
                buffer.append(sentence)
                buffer_len += size + separator

        if buffer:
            packed.append(" ".join(buffer))

        return packed

    def _split_by_sentences(self, text: str) -> List[str]:
        """Split text into sentences using a simple punctuation heuristic."""
        import re
        # The capture group keeps each run of terminators as its own token,
        # so tokens alternate: body, terminators, body, terminators, ..., body.
        tokens = re.split(r'([.!?。!?\n]+)', text)

        # Glue every terminator run back onto the sentence it ends.
        sentences = [
            (body + terminators).strip()
            for body, terminators in zip(tokens[0::2], tokens[1::2])
        ]

        # Whatever follows the last terminator is a sentence of its own.
        trailing = tokens[-1].strip()
        if len(tokens) % 2 == 1 and trailing:
            sentences.append(trailing)

        return [sentence for sentence in sentences if sentence]

    def _force_split(self, text: str, max_size: int) -> List[str]:
        """Cut ``text`` into consecutive slices of at most ``max_size`` characters."""
        return [
            text[start:start + max_size]
            for start in range(0, len(text), max_size)
        ]
95 |
--------------------------------------------------------------------------------
/maru_lang/services/ingest.py:
--------------------------------------------------------------------------------
1 | """
2 | Ingest service functions for file upload and synchronization
3 | """
4 | from typing import List, Tuple
5 | from datetime import datetime
6 | from pathlib import Path
7 | from maru_lang.core.relation_db.models.documents import Document, DocumentGroup, DocumentGroupMembership
8 | from maru_lang.utils.document import make_source_fingerprint_for_file
9 |
10 |
async def check_files_to_upload(
    folder_path: str,
    files: List[dict]  # [{"fileName": str, "createdAt": datetime, "relativePath": str, "size": int}]
) -> List[str]:
    """
    Work out which of the given files are missing from, or differ from, the database.

    A file is reported when no Document with its path exists in the folder's
    DocumentGroup, or when the stored ``source_fingerprint`` differs from the
    fingerprint computed here from (path, size, mtime in nanoseconds).

    Args:
        folder_path: DocumentGroup name; also used as the prefix of each
            stored file path
        files: File descriptions with fileName, createdAt (datetime or a
            value accepted by ``int()``), relativePath and optional size

    Returns:
        List of relativePaths that need to be uploaded, in input order
    """
    group = await DocumentGroup.get_or_none(name=folder_path)

    # Without a group nothing has been ingested yet: every file is new.
    if not group:
        return [entry["relativePath"] for entry in files]

    pending = []
    for entry in files:
        # All three keys are mandatory; a missing one raises KeyError.
        relative_path, _file_name, created_at = (
            entry[key] for key in ("relativePath", "fileName", "createdAt")
        )
        size_bytes = entry.get("size", 0)

        # Datetimes are converted to a nanosecond timestamp.
        if isinstance(created_at, datetime):
            mtime_ns = int(created_at.timestamp() * 1e9)
        else:
            mtime_ns = int(created_at)

        # Stored paths are "<folder_path>/<relativePath>".
        db_file_path = f"{folder_path}/{relative_path}"
        expected = make_source_fingerprint_for_file(
            db_file_path, size_bytes, mtime_ns
        )

        stored = await Document.filter(
            file_path=db_file_path,
            group_memberships__group=group
        ).first()

        # Unknown in this group, or known but changed: upload it.
        if not stored or stored.source_fingerprint != expected:
            pending.append(relative_path)

    return pending
78 |
79 |
async def get_or_create_document_group(
    folder_path: str,
    manager_id: int
) -> DocumentGroup:
    """
    Return the DocumentGroup for an uploaded folder, creating it when missing.

    The work is delegated to ``upsert_document_group``; ``folder_path`` is
    passed as both the group's name and its base path.

    Args:
        folder_path: Project folder name
        manager_id: ID of the user who manages the group

    Returns:
        The DocumentGroup returned by ``upsert_document_group``
    """
    # Function-scope import, kept local exactly as in the original layout.
    from maru_lang.services.document import upsert_document_group

    # The folder path doubles as name and base_path; a production setup
    # might want an absolute path for base_path instead.
    return await upsert_document_group(
        name=folder_path,
        base_path=folder_path,
        manager_id=manager_id,
    )
105 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/system_config.yaml:
--------------------------------------------------------------------------------
1 | # System Configuration
2 | # Central configuration file for MARU-Lang system settings
3 | # Supports environment variable substitution: ${ENV:VAR_NAME} or ${ENV:VAR_NAME:default_value}
4 |
5 | # ============================================================
6 | # Server Configuration
7 | # ============================================================
8 | server:
9 | host: ${ENV:HOST:127.0.0.1}
10 | port: ${ENV:PORT:8000}
11 | reload: ${ENV:RELOAD:false}
12 | log_level: ${ENV:LOG_LEVEL:info}
13 |
14 | # Environment settings
15 | environment:
16 | production: ${ENV:PRODUCTION:false}
17 |
18 | # ============================================================
19 | # Database Configuration
20 | # ============================================================
21 | database:
22 | # Database type: "sqlite" or "postgres"
23 | type: ${ENV:DB_TYPE:sqlite}
24 |
25 | # Database name (for SQLite, this is the file name; for PostgreSQL, the database name)
26 | name: ${ENV:DB_NAME:maru}
27 |
28 | # PostgreSQL settings (only required when type is "postgres")
29 | username: ${ENV:DB_USERNAME:}
30 | password: ${ENV:DB_PASSWORD:}
31 | host: ${ENV:DB_HOST:localhost}
32 | port: ${ENV:DB_PORT:5432}
33 |
34 | # ============================================================
35 | # Authentication & Security
36 | # ============================================================
37 | auth:
38 | # Secret key for JWT token generation (IMPORTANT: Change in production!)
39 | secret_key: ${ENV:SECRET_KEY:your-secret-key-change-in-production}
40 | salt: ${ENV:SALT:some-sugar}
41 | algorithm: ${ENV:ALGORITHM:HS256}
42 |
43 | # Token expiration times (in minutes)
44 | access_token_expire_minutes: ${ENV:ACCESS_TOKEN_EXPIRE_MINUTES:15}
45 | refresh_token_expire_minutes: ${ENV:REFRESH_TOKEN_EXPIRE_MINUTES:43200} # 30 days
46 |
47 | # Validation
48 | default_validation_code: ${ENV:DEFAULT_VALIDATION_CODE:456123}
49 |
50 | # Auto-create user groups based on email domain
51 | auto_create_group_by_domain: ${ENV:AUTO_CREATE_GROUP_BY_DOMAIN:true}
52 |
53 | # ============================================================
54 | # Email Service Configuration
55 | # ============================================================
56 | email:
57 | # Email service type: "o365" or "smtp"
58 | service_type: ${ENV:EMAIL_SERVICE_TYPE:o365}
59 |
60 | sender_email: ${ENV:SENDER_EMAIL:}
61 |
62 | # Office 365 settings (required when service_type is "o365")
63 | o365:
64 | client_id: ${ENV:O365_CLIENT_ID:}
65 | client_secret: ${ENV:O365_CLIENT_SECRET:}
66 | tenant_id: ${ENV:O365_TENANT_ID:}
67 |
68 | # SMTP settings (required when service_type is "smtp")
69 | smtp:
70 | host: ${ENV:SMTP_HOST:}
71 | port: ${ENV:SMTP_PORT:587}
72 | username: ${ENV:SMTP_USERNAME:}
73 | password: ${ENV:SMTP_PASSWORD:}
74 |
75 | # ============================================================
76 | # Vector Database Configuration
77 | # ============================================================
78 | vector_db:
79 | # Vector database type: "chroma" or "milvus"
80 | type: ${ENV:VECTOR_DB_TYPE:chroma}
81 |
82 | # Default collection name
83 | default_collection_name: ${ENV:DEFAULT_DB_COLLECTION_NAME:maru}
84 |
85 | # Chroma settings (required when type is "chroma")
86 | chroma:
87 | persist_dir: ${ENV:CHROMA_PERSIST_DIR:data/chroma/}
88 |
89 | # Milvus settings (required when type is "milvus")
90 | milvus:
91 | host: ${ENV:MILVUS_HOST:localhost}
92 | port: ${ENV:MILVUS_PORT:19530}
93 | user: ${ENV:MILVUS_USER:root}
94 | password: ${ENV:MILVUS_PASSWORD:Milvus}
95 |
96 | # ============================================================
97 | # External Services Configuration
98 | # ============================================================
99 | external:
100 | # No external services are configured
101 | # ============================================================
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_group_classifier.yaml:
--------------------------------------------------------------------------------
1 | # Group Classifier Agent Configuration
2 | # Agent that analyzes user questions and classifies them into appropriate document groups
3 |
4 | name: group_classifier
5 | description: "Analyzes user questions and classifies them into suitable document groups"
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 |
# Priority (highest priority - runs before the other agents)
11 | priority: 100
12 |
13 | # LLM Settings
14 | target_llm_config:
15 | server_name: "openai"
16 | override_params:
17 | temperature: 0.1 # Keep temperature low for consistent classification
18 | max_tokens: 500 # Only short classification output is required
19 | timeout: 10.0 # Short timeout for quick classification
20 |
21 | fallback_strategy: "any_available"
22 |
23 | # Prompt Settings
24 | prompts:
25 | system_prompt: |
26 | You are an expert who analyzes user questions and classifies them into appropriate document groups.
27 |
28 | Responsibilities:
29 | - Accurately identify the intent and topic of the question
30 | - Choose the most appropriate document groups for the question
31 | - Determine priority when multiple groups are applicable
32 | - If no group matches, leave the selection empty
33 |
34 | user_prompt_template: |
35 | Analyze the following question and classify it into the most appropriate document groups:
36 |
37 | Question: {question}
38 |
39 | Available groups:
40 | {available_groups}
41 |
42 | Important rules:
43 | 1. You must select only from the groups listed above.
44 | 2. Never select a group that is not on the list.
45 | 3. If no groups are applicable, return an empty array [] for selected_groups.
46 | 4. Group names must exactly match the ones in the list.
47 | 5. Fill the group_confidences array in the same order as selected_groups. Each value must be between 0 and 1, and the total must sum to 1.
48 |
49 | You must return a JSON object as a tool call following the definition below. Use the keys selected_groups, confidence, group_confidences, and reasoning exactly as written. Do not include additional quotes around keys and do not use non-English keys. Do not provide any free-form text outside the tool call.
50 |
51 | Please return the classification result as JSON:
52 | - selected_groups: Selected groups in priority order (empty array if none)
53 | - confidence: Overall classification confidence (0-1)
54 | - group_confidences: Confidence values aligned with selected_groups (e.g., [0.7, 0.3], sum=1)
55 | - reasoning: Explanation for the classification
56 |
# Implementation class (builtin agent)
58 | implementation: builtin.group_classifier.GroupClassifierAgent
59 |
60 | # Agent Settings
61 | config:
62 | timeout: 10
63 | retry_count: 1 # Keep retries minimal for fast classification
64 | classification_config:
65 | confidence_threshold: 0.4
66 |
67 | # Tool schema definition (JSON)
68 | tools:
69 | group_classifier:
70 | description: "Classifies user questions into appropriate groups"
71 | parameters:
72 | type: "object"
73 | properties:
74 | selected_groups:
75 | type: "array"
76 | items:
77 | type: "string"
78 | description: "Selected groups in priority order"
79 | confidence:
80 | type: "number"
81 | minimum: 0
82 | maximum: 1
83 | description: "Overall classification confidence (0-1)"
84 | reasoning:
85 | type: "string"
86 | description: "Explanation of the classification"
87 | fallback_used:
88 | type: "boolean"
89 | description: "Indicates whether classification fallback was used"
90 | group_confidences:
91 | type: "array"
92 | items:
93 | type: "number"
94 | minimum: 0
95 | maximum: 1
96 | description: "Confidence scores aligned with selected_groups (sum=1)"
97 | required: ["selected_groups", "confidence", "group_confidences", "reasoning"]
98 |
--------------------------------------------------------------------------------
/maru_lang/utils/security.py:
--------------------------------------------------------------------------------
1 | import base64
2 | import hashlib
3 | from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
4 | from cryptography.hazmat.backends import default_backend
5 | from cryptography.hazmat.primitives import padding
6 | from datetime import datetime, timedelta, timezone
7 | from typing import Optional
8 | from jose import JWTError, jwt
9 | from fastapi import HTTPException, status
10 | from pydantic import ValidationError
11 | from maru_lang.configs.system_config import get_system_config
12 |
# System configuration, resolved once when this module is imported; the
# helpers below read their secret key, algorithm and salt from ``config.auth``.
config = get_system_config()
14 |
15 |
def generate_anonymized_key(
    login_id: str,
    company_id: int,
    salt: Optional[str] = None
) -> str:
    """Build a deterministic anonymized key for a login/company pair.

    Args:
        login_id: Login identifier of the user.
        company_id: Identifier of the user's company.
        salt: Salt mixed into the hash. When ``None`` (the default) the
            value of ``config.auth.salt`` is used; an explicit empty string
            is honoured as-is.

    Returns:
        Hex-encoded SHA-256 digest of ``"{login_id}:{company_id}:{salt}"``.
    """
    # Only fall back on None so that callers can deliberately pass "".
    if salt is None:
        salt = config.auth.salt
    raw_data = f"{login_id}:{company_id}:{salt}"
    return hashlib.sha256(raw_data.encode()).hexdigest()
26 |
27 |
def create_jwt_token(
    data: dict,
    expires_delta: timedelta
) -> tuple[str, datetime]:
    """Encode ``data`` as a signed JWT that expires after ``expires_delta``.

    The input dict is not mutated; a copy receives the ``exp`` claim.

    Returns:
        A ``(token, expires_at)`` pair, where ``expires_at`` is the
        timezone-aware UTC expiry stored in the ``exp`` claim.
    """
    expiry = datetime.now(timezone.utc) + expires_delta

    claims = data.copy()
    claims["exp"] = expiry

    token = jwt.encode(
        claims,
        config.auth.secret_key,
        algorithm=config.auth.algorithm,
    )
    return token, expiry
42 |
43 |
def decode_token(token: str) -> dict | None:
    """Verify a JWT and return its claims, or ``None`` when it is rejected.

    Expired tokens, other JWT errors and pydantic validation errors are
    swallowed and reported as ``None``; any other exception propagates.
    """
    try:
        claims = jwt.decode(
            token,
            config.auth.secret_key,
            algorithms=[config.auth.algorithm],
        )
    except (jwt.ExpiredSignatureError, jwt.JWTError, ValidationError):
        return None
    return claims
55 |
56 |
def get_key_spec(key: str):
    """Return ``key`` encoded as UTF-8 bytes, for use as cipher key material."""
    return key.encode('utf-8')
60 |
61 |
def aes256_decrypt(target_str: str) -> str:
    """Decrypt a Base64-encoded AES/ECB/PKCS7 ciphertext.

    The key is the UTF-8 encoding of ``config.auth.secret_key``.

    NOTE(review): AES only accepts 16-, 24- or 32-byte keys, so the
    configured secret must encode to one of those lengths (32 bytes for
    AES-256) — confirm against deployment settings.

    Args:
        target_str: Base64 text as produced by ``aes256_encrypt``.

    Returns:
        The decrypted plain text, decoded as UTF-8.

    Raises:
        Exception: On any failure (bad Base64, wrong key size, bad padding,
            invalid UTF-8); the original error is attached as ``__cause__``.
    """
    try:
        # Decode the Base64-encoded cipher text
        decoded_data = base64.b64decode(target_str)

        # SECURITY: ECB is deterministic (equal plaintext blocks give equal
        # ciphertext blocks). Kept for compatibility with existing data.
        cipher = Cipher(algorithms.AES(get_key_spec(config.auth.secret_key)),
                        modes.ECB(), backend=default_backend())
        decryptor = cipher.decryptor()

        # Perform AES decryption
        decrypted_data = decryptor.update(decoded_data) + decryptor.finalize()

        # Remove PKCS7 padding
        unpadder = padding.PKCS7(algorithms.AES.block_size).unpadder()
        unpadded_data = unpadder.update(decrypted_data) + unpadder.finalize()

        return unpadded_data.decode('utf-8')

    except Exception as e:
        # Same exception type and message as before; chaining keeps the
        # root cause visible in tracebacks.
        raise Exception(f"Error during decryption: {str(e)}") from e
83 |
84 |
def aes256_encrypt(plain_text: str) -> str:
    """Encrypt text with AES/ECB/PKCS7 and return it Base64-encoded.

    The key is the UTF-8 encoding of ``config.auth.secret_key``.

    NOTE(review): AES only accepts 16-, 24- or 32-byte keys, so the
    configured secret must encode to one of those lengths (32 bytes for
    AES-256) — confirm against deployment settings.

    Args:
        plain_text: Text to encrypt; it is encoded as UTF-8 first.

    Returns:
        Base64 text of the ciphertext.

    Raises:
        Exception: On any failure (for example an invalid key size); the
            original error is attached as ``__cause__``.
    """
    try:
        # Convert the plain text to bytes
        plain_text_bytes = plain_text.encode('utf-8')

        # Apply PKCS7 padding up to the AES block size
        padder = padding.PKCS7(algorithms.AES.block_size).padder()
        padded_data = padder.update(plain_text_bytes) + padder.finalize()

        # SECURITY: ECB is deterministic (equal plaintext blocks give equal
        # ciphertext blocks). Kept for compatibility with existing data.
        cipher = Cipher(algorithms.AES(get_key_spec(config.auth.secret_key)),
                        modes.ECB(), backend=default_backend())
        encryptor = cipher.encryptor()

        # Perform AES encryption
        encrypted_data = encryptor.update(padded_data) + encryptor.finalize()

        # Encode the ciphertext using Base64 and return it as text
        return base64.b64encode(encrypted_data).decode('utf-8')

    except Exception as e:
        # Same exception type and message as before; chaining keeps the
        # root cause visible in tracebacks.
        raise Exception(f"Error during encryption: {str(e)}") from e
--------------------------------------------------------------------------------
/maru_lang/api/endpoints/user_group.py:
--------------------------------------------------------------------------------
1 | """
2 | User group management API endpoints
3 | """
4 | from fastapi import APIRouter, HTTPException, Depends
5 | from pydantic import BaseModel
6 | from typing import Optional, Dict, Any
7 |
8 | from maru_lang.dependencies.auth import get_user
9 | from maru_lang.services.user_group_command import (
10 | UserGroupCommandParser,
11 | execute_user_group_command
12 | )
13 |
14 |
# Router for the user-group endpoints defined in this module; every route
# below is registered under the "/user-groups" prefix.
router = APIRouter(
    prefix="/user-groups",
    tags=["User Groups"]
)
19 |
20 |
class UserGroupCommandRequest(BaseModel):
    """Request body for user group command"""
    # Raw command text; execute_command passes it to UserGroupCommandParser.parse.
    message: str
24 |
25 |
class UserGroupCommandResponse(BaseModel):
    """Response for user group command"""
    # Whether the command succeeded.
    success: bool
    # Human-readable outcome (or the parser's error for an unknown command).
    message: str
    # Optional command-specific payload; carries {"help": ...} for unknown commands.
    data: Optional[Dict[str, Any]] = None
    # Optional error text; not set by the code in this module.
    error: Optional[str] = None
32 |
33 |
@router.post("/command", response_model=UserGroupCommandResponse)
async def execute_command(
    request: UserGroupCommandRequest,
    user=Depends(get_user)
):
    """
    Execute user group management command.

    Supports natural language commands in Korean and English:

    ## Create group
    - `/그룹생성 [그룹명]` or `/create group [name]`

    ## Member management (managers only)
    - `/그룹초대 [그룹명] [이메일]` or `/invite [group] [email]`
    - `/그룹추방 [그룹명] [이메일]` or `/remove [group] [email]`
    - `/그룹위임 [그룹명] [이메일]` or `/transfer [group] [email]`

    ## Group lookup
    - `/내그룹목록` or `/my groups`
    - `/관리그룹` or `/managed groups`
    - `/그룹멤버 [그룹명]` or `/members [group]`

    ## Leave group
    - `/그룹나가기 [그룹명]` or `/leave group [name]`

    Args:
        request: Command request with message
        user: Authenticated user (from token)

    Returns:
        Command execution result with success status and data. An
        unrecognised command yields ``success=False`` with the help text
        under ``data["help"]``.

    Raises:
        HTTPException: Any HTTPException raised while handling the command
            is passed through unchanged; every other failure becomes a 500.

    Example:
    ```json
    {
        "message": "/그룹생성 행정팀"
    }
    ```

    Response:
    ```json
    {
        "success": true,
        "message": "Group created successfully",
        "data": {
            "group_id": 123,
            "group_name": "행정팀",
            "created": true
        }
    }
    ```
    """
    try:
        # Parse command
        parsed = UserGroupCommandParser.parse(request.message)

        # Unknown commands are answered with the help text, not an HTTP error
        if parsed["command"] == "unknown":
            return UserGroupCommandResponse(
                success=False,
                message=parsed.get("error", "Unknown command"),
                data={"help": UserGroupCommandParser.get_help_text()}
            )

        # Execute command on behalf of the authenticated user
        result = await execute_user_group_command(parsed, user.id)

        return UserGroupCommandResponse(**result)

    except HTTPException:
        # A deliberate HTTP error (e.g. 403/404) raised further down must
        # keep its own status code instead of being rewrapped as a 500.
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Failed to execute command: {str(e)}"
        ) from e
109 |
110 |
@router.get("/help")
async def get_help():
    """
    Return the help text for user group commands.

    Returns:
        A dict whose "help" key holds the text produced by
        ``UserGroupCommandParser.get_help_text()``
    """
    help_text = UserGroupCommandParser.get_help_text()
    return {"help": help_text}
122 |
123 |
@router.get("/check-command")
async def check_command(message: str):
    """
    Report whether a message is a user group command, without running it.

    Args:
        message: Message to check

    Returns:
        A dict with "is_command" (the parser's verdict) and "parsed"
        (the parse result, or None when the message is not a command)
    """
    recognized = UserGroupCommandParser.is_user_group_command(message)

    # Only parse messages the parser has already recognised as commands.
    parsed_result = None
    if recognized:
        parsed_result = UserGroupCommandParser.parse(message)

    return {"is_command": recognized, "parsed": parsed_result}
142 |
--------------------------------------------------------------------------------
/maru_lang/templates/yaml/agents/builtin/agents_response.yaml:
--------------------------------------------------------------------------------
1 | # Response Agent Configuration
2 | # Formats and delivers results from other agents to the end user
3 |
4 | name: response
5 | description: "Formats outputs from other agents into user-friendly responses"
6 | type: builtin
7 | enabled: true
8 | version: "1.0.0"
9 |
10 | # LLM configuration
11 | target_llm_config:
12 | server_name: "openai"
13 | override_params:
14 | temperature: 0.7
15 | max_tokens: 3000
16 |
17 | fallback_strategy: "any_available"
18 |
19 | # Prompt configuration
20 | prompts:
21 | system_prompt: |
22 | You are a professional and friendly AI assistant.
23 | Your job is to take results from other system components (agents)
24 | and craft a final answer that is easy to understand and genuinely helpful for the user.
25 |
26 | Primary responsibilities:
27 | - Turn agent outputs into natural, fluent sentences.
28 | - Explain technical details in user-friendly language.
29 | - Present structured data in readable formats.
30 | - Communicate errors or warnings with empathy and clarity.
31 | - Provide additional context or explanations when helpful.
32 | - Adapt your response based on different execution outcomes (success, failure, partial success, errors).
33 |
34 | Response principles:
35 | - Use clear, easy-to-follow language.
36 | - Highlight important information.
37 | - Format structured data appropriately (bullet points, numbered lists, tables, etc.).
38 | - Briefly explain technical terms when necessary.
39 | - Maintain a positive, supportive tone.
40 | - Even when reporting errors, remain polite and constructive.
41 |
42 | user_prompt_template: |
43 | User question: {question}
44 |
45 | Execution scenario: {scenario}
46 |
47 | Agent outputs:
48 | {agent_result}
49 |
50 | Using the information above, write a kind and clear final response for the user.
51 |
52 | Guidelines:
53 | - Adjust tone and content to match the execution scenario.
54 | - Hide implementation details; share only what is helpful for the user.
55 | - Write naturally, as if conversing with the user.
56 | - Apply Markdown formatting when it improves readability.
57 | - Do not mention agent names or internal architecture; focus on the results.
58 |
59 | # Implementation class (builtin agent)
60 | implementation: builtin.response_agent.ResponseAgent
61 |
62 | # Agent configuration
63 | config:
64 | timeout: 30
65 | retry_count: 2
66 | max_context_length: 10000
67 |
68 | # Response formatting options
69 | formatting:
70 | include_metadata: false # Whether to include metadata in the response
71 | show_sources: true # Whether to list information sources
72 | use_markdown: true # Whether to render responses using Markdown
73 | max_response_length: 2000 # Maximum length of the response
74 |
75 | # Scenario-specific LLM guidance (appended to the prompt)
76 | scenario_config:
77 | no_agents: "No agents were selected. Ask the user for more details or offer general assistance."
78 | errors: "An error occurred while running the agents. Explain the issue politely and suggest trying again."
79 | success: "Agents completed successfully. Present the results in a user-friendly manner."
80 | partial_success: "Some agents succeeded while others failed. Share the successful results first and briefly mention the failures."
81 | unknown: "The situation is unclear. Let the user know that assistance is limited and offer alternative help."
82 |
83 | # Fallback responses when no LLM output is available
84 | fallback_config:
85 | no_agents: "I’m sorry, I couldn’t find an appropriate agent to handle that question. Could you provide more details?"
86 | errors: "I’m sorry, something went wrong while processing your request."
87 | success: "" # Empty string: use the formatted_context as-is
88 | partial_success: "" # Empty string: use the formatted_context as-is
89 | unknown: "I’m sorry, I’m unable to generate a response right now."
90 |
91 | # Examples (this agent formats outputs from other agents)
92 | examples:
93 | - "Format knowledge_search results into a user-friendly response"
94 | - "Communicate error messages politely"
95 |
--------------------------------------------------------------------------------
/maru_lang/dependencies/email.py:
--------------------------------------------------------------------------------
1 | """
2 | Email service dependency for FastAPI
3 | """
4 | from abc import ABC, abstractmethod
5 | from typing import Optional
6 | from fastapi import Depends
7 | from maru_lang.configs.system_config import get_system_config
8 |
# System configuration, resolved once when this module is imported; the
# email settings are read from ``config.email`` below.
config = get_system_config()
10 |
11 |
class EmailService(ABC):
    """Interface that every email backend must implement."""

    @abstractmethod
    def send_email(self, recipient: str, subject: str, body: str) -> bool:
        """Send one message to ``recipient``; report success as a bool."""

    @abstractmethod
    def send_otp(self, recipient: str, code: str) -> bool:
        """Send the verification ``code`` to ``recipient``; report success as a bool."""
22 |
23 |
class O365EmailManager(EmailService):
    """Office 365 email service implementation.

    Sends mail through the ``O365`` package, using the sender address and
    application credentials read from ``config.email`` at construction time.
    """

    def __init__(self) -> None:
        # Snapshot the sender address and O365 app credentials from config.
        self.sender_email = config.email.sender_email
        self.client_id = config.email.o365.client_id
        self.client_secret = config.email.o365.client_secret
        self.tenant_id = config.email.o365.tenant_id

    def send_email(self, recipient: str, subject: str, body: str) -> bool:
        """Send an HTML message from the configured sender mailbox.

        Returns:
            True when the send call completes; False when anything raises
            (including a failed ``O365`` import). The error is only printed.
        """
        try:
            # Imported inside the try, so a missing O365 package is reported
            # as a failed send rather than an import-time crash.
            from O365 import Account

            credentials = (self.client_id, self.client_secret)
            scopes = ["https://graph.microsoft.com/.default"]
            account = Account(
                credentials,
                auth_flow_type="credentials",
                tenant_id=self.tenant_id
            )

            # Authenticate only when the account is not already authenticated.
            if not account.is_authenticated:
                account.authenticate(scopes=scopes)

            mailbox = account.mailbox(resource=self.sender_email)
            message = mailbox.new_message()
            message.to.add(recipient)
            message.subject = subject
            message.body = body
            message.body_type = "HTML"
            message.send()
            return True
        except Exception as e:
            print(f"Failed to send email: {e}")
            return False

    def send_otp(self, recipient: str, code: str) -> bool:
        """Send the verification ``code`` to ``recipient`` via ``send_email``."""
        subject = f"{code} - Maru Lang Verification Code"
        # NOTE(review): send_email marks the body as "HTML", yet this template
        # shows no markup here — confirm the tags were not lost.
        body = f"""



        Your Verification Code

        Use this code to verify your email address:

        {code}


        This code expires in 10 minutes.


        """
        return self.send_email(recipient, subject, body)
75 |
76 |
def get_email_manager() -> Optional[EmailService]:
    """Build the email service selected by ``config.email``, if possible.

    Returns:
        An ``O365EmailManager`` when the service type is "o365", all of its
        settings are filled in and construction succeeds; otherwise ``None``.
    """
    email_cfg = config.email

    if not email_cfg.service_type:
        return None

    if email_cfg.service_type != "o365":
        # TODO: add support for the smtp type
        return None

    required_settings = [
        email_cfg.o365.client_id,
        email_cfg.o365.client_secret,
        email_cfg.o365.tenant_id,
        email_cfg.sender_email,
    ]
    if not all(required_settings):
        return None

    try:
        return O365EmailManager()
    except Exception as e:
        print(f"Failed to initialize O365 Email Manager: {e}")
        return None
92 |
93 |
def get_email_service_dependency() -> Optional[EmailService]:
    """FastAPI dependency that returns the result of ``get_email_manager()``."""
    email_service = get_email_manager()
    return email_service
97 |
98 |
# Names exported by ``from maru_lang.dependencies.email import *``.
__all__ = [
    "EmailService",
    "O365EmailManager",
    "get_email_manager",
    "get_email_service_dependency",
]
105 |
--------------------------------------------------------------------------------
/maru_lang/dependencies/auth.py:
--------------------------------------------------------------------------------
# The secret key, algorithm, token expiry times, etc. are managed in settings
2 | from fastapi import Depends, HTTPException, status, Request, Body
3 | from fastapi.security import OAuth2PasswordBearer
4 | from maru_lang.enums.auth import UserRoleCode
5 | from maru_lang.core.relation_db.models.auth import User, UserRole, RefreshToken
6 | from maru_lang.utils.security import decode_token
7 | from maru_lang.services.auth import refresh_token_flow
8 |
# 1) OAuth2 scheme setup
# auto_error=False: a missing Authorization header does not fail here;
# get_user below handles the absent token itself.
oauth2_scheme = OAuth2PasswordBearer(
    tokenUrl="/auth/editor/login",
    auto_error=False)
13 |
14 |
async def get_user(
    request: Request,
    token: str = Depends(oauth2_scheme)
) -> User:
    """Resolve the authenticated user for the current request.

    The access token is taken from the Authorization header or, failing
    that, the ``token`` query parameter. When it is missing or rejected, a
    refresh is attempted with the ``refresh_token`` cookie or, as a
    fallback, the newest RefreshToken stored for the request's device-id.

    Args:
        request: Incoming request (headers, query params, cookies, state).
        token: Bearer token extracted by ``oauth2_scheme``; may be empty.

    Returns:
        The ``User`` whose id is in the token's ``sub`` claim.

    Raises:
        HTTPException: 401 when no usable token or refresh token exists,
            the token has no ``sub`` claim, or the user does not exist.
    """
    # device-id may be sent as a header or as a query-string parameter
    device_id_in_header = request.headers.get("device-id") or request.query_params.get("device-id")

    # The token can come from the Authorization header or the `token` query
    # parameter. SSE/EventSource cannot send custom headers, so the query
    # parameter must be supported.
    if not token:
        token = request.query_params.get("token")

    payload = decode_token(token) if token else None

    if payload is None:
        # Access token missing or rejected -> try the refresh token
        refresh_token = request.cookies.get("refresh_token")
        if not refresh_token:
            # Temporary fallback for the case where the server was restarted.
            # This is not safe from a security standpoint; salting may be needed.
            #
            # Query only when a device-id was supplied: filtering on
            # device_id=None would match rows whose device_id is NULL and
            # authenticate an anonymous request as that token's owner.
            refresh_token_object = None
            if device_id_in_header:
                refresh_token_object = await RefreshToken.filter(
                    device_id=device_id_in_header
                ).order_by(
                    "-created_at"
                ).first()
            if refresh_token_object:
                refresh_token = refresh_token_object.refresh_token

            # Nothing to refresh with: fail explicitly instead of relying
            # on decode_token(None) raising.
            if not refresh_token:
                raise HTTPException(
                    status_code=401, detail="Invalid refresh token")
            try:
                # decode_token returns None for JWT errors, so this only
                # traps unexpected exceptions from decoding.
                decode_token(refresh_token)
            except Exception:
                raise HTTPException(
                    status_code=401, detail="Invalid refresh token")

        if refresh_token:
            new_access_token = await refresh_token_flow(refresh_token, device_id_in_header)
            if new_access_token:
                # Re-authenticate with the freshly issued token
                payload = decode_token(new_access_token)
                # Expose the new access token on request.state, presumably
                # for a later layer to return to the client (optional)
                request.state.new_access_token = new_access_token

    if payload is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid or expired token",
            headers={"WWW-Authenticate": "Bearer"},
        )

    user_id = payload.get("sub")

    if user_id is None:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Invalid token: no user_id",
            headers={"WWW-Authenticate": "Bearer"},
        )

    user = await User.get_or_none(id=user_id)
    if not user:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="User not found",
        )

    return user
81 |
82 |
def get_user_with_role(
    required_role: UserRoleCode,
):
    """Build a FastAPI dependency that requires at least ``required_role``.

    The returned dependency resolves the current user via ``get_user`` and
    returns it when the user's role ranks at or above ``required_role``.
    It raises 401 when the role record is missing or either role is not in
    the hierarchy, and 403 when the user's role ranks too low.
    """
    # Roles ordered from least to most privileged
    role_order = [
        UserRoleCode.EDITOR,
        UserRoleCode.ADMIN,
    ]

    async def dependency(
        user: User = Depends(get_user)
    ):
        role_record = await UserRole.get_or_none(id=user.role_id)
        if not role_record:
            raise HTTPException(status_code=401, detail="Unauthorized role")

        try:
            actual_rank = role_order.index(UserRoleCode(role_record.name))
            needed_rank = role_order.index(required_role)
        except ValueError:
            # Unknown role name, or a role that is not part of the hierarchy
            raise HTTPException(status_code=401, detail="Invalid role")

        if actual_rank < needed_rank:
            raise HTTPException(status_code=403, detail="Permission denied")

        return user

    return dependency
114 |
--------------------------------------------------------------------------------
/maru_lang/core/relation_db/models/documents.py:
--------------------------------------------------------------------------------
1 | from tortoise.models import Model
2 | from tortoise import fields
3 | from maru_lang.enums.documents import (
4 | PermissionAction,
5 | DocumentStatus,
6 | )
7 |
8 |
class Document(Model):
    """A single ingested document, stored in the ``document`` table."""

    id = fields.CharField(pk=True, max_length=64)  # ULID/UUIDv7 recommended
    name = fields.CharField(max_length=255, index=True)

    file_path = fields.CharField(max_length=500, null=True)
    file_size = fields.BigIntField(null=True)
    head_hash = fields.CharField(
        max_length=64, null=True, index=True)  # blake3 of the first 64KB
    full_hash = fields.CharField(
        max_length=64, null=True, index=True)  # blake3 of the whole file (may be computed lazily)
    source_fingerprint = fields.CharField(
        max_length=64, unique=True, null=True)  # key used for upserts

    metadata = fields.JSONField(default=dict)
    status = fields.IntEnumField(
        DocumentStatus, default=DocumentStatus.PROCESSING)
    created_at = fields.DatetimeField(auto_now_add=True)
    updated_at = fields.DatetimeField(auto_now=True)

    class Meta:
        table = "document"
        # Composite index over (name, file_size, head_hash)
        indexes = [["name", "file_size", "head_hash"]]
31 |
32 |
class DocumentGroup(Model):
    """A named group of documents, stored in the ``document_group`` table."""

    id = fields.IntField(pk=True)
    name = fields.CharField(max_length=255, unique=True)  # uniquely identified by its full path
    base_path = fields.CharField(
        max_length=500,
        unique=True,  # a given filesystem path maps to a single DocumentGroup
    )
    description = fields.TextField(null=True)  # description of the DocumentGroup

    # Version ID for VDB chunk filtering and version management
    version_id = fields.CharField(
        max_length=64,
        null=True,  # null until embedding has completed
        index=True  # index for lookup performance
    )

    # Manager (owner) of this document group
    manager = fields.ForeignKeyField(
        "models.User",
        related_name="managed_document_groups",
        on_delete=fields.RESTRICT  # a User who manages a DocumentGroup cannot be deleted
    )

    # Pluggable component configurations (used during ingestion)
    loader = fields.CharField(max_length=255, null=True)  # name of the loader used
    chunker = fields.CharField(max_length=255, null=True)  # name of the chunker used
    embedding_model = fields.CharField(max_length=255, null=True)  # name of the embedding model used

    # Configuration snapshot (for detecting changes)
    config_snapshot = fields.JSONField(null=True, default=dict)  # snapshot of the settings used

    minhash_signature = fields.JSONField(null=True)  # MinHash signature (array of 128 integers)
    # NOTE(review): auto_now=True looks like it refreshes on every save, not
    # only when the signature changes — confirm this is intended.
    signature_updated_at = fields.DatetimeField(auto_now=True)

    class Meta:
        table = "document_group"
69 |
70 |
class DocumentGroupMembership(Model):
    """Link row placing a Document in a DocumentGroup.

    The row is deleted when either the document or the group is deleted.
    """

    document = fields.ForeignKeyField(
        "models.Document",
        related_name="group_memberships",
        on_delete=fields.CASCADE)
    group = fields.ForeignKeyField(
        "models.DocumentGroup",
        related_name="documents",
        on_delete=fields.CASCADE)

    class Meta:
        table = "document_group_membership"
83 |
84 |
class DocumentGroupInclusion(Model):
    """Parent/child link between two DocumentGroups.

    Each (parent, child) pair may appear only once; the row is deleted when
    either group is deleted.
    """

    parent = fields.ForeignKeyField(
        "models.DocumentGroup",
        related_name="includes",
        on_delete=fields.CASCADE)
    child = fields.ForeignKeyField(
        "models.DocumentGroup",
        related_name="included_by",
        on_delete=fields.CASCADE)

    class Meta:
        table = "document_group_inclusion"
        unique_together = ("parent", "child")
98 |
99 |
# User group <-> document group permission
class GroupPermission(Model):
    """Grants a UserGroup one PermissionAction on a DocumentGroup.

    Each (user_group, document_group, action) triple may appear only once;
    the row is deleted when either group is deleted.
    """

    user_group = fields.ForeignKeyField(
        "models.UserGroup",
        related_name="permissions",
        on_delete=fields.CASCADE)
    document_group = fields.ForeignKeyField(
        "models.DocumentGroup",
        related_name="permissions",
        on_delete=fields.CASCADE)
    action = fields.IntEnumField(PermissionAction)

    class Meta:
        table = "group_permission"
        unique_together = (("user_group", "document_group", "action"),)
--------------------------------------------------------------------------------
/maru_lang/commands/tree.py:
--------------------------------------------------------------------------------
1 | """
Commands for viewing and managing the DocumentGroup hierarchy
3 | """
4 | import typer
5 | from typing import Optional
6 | from maru_lang.core.relation_db.models.documents import (
7 | DocumentGroup,
8 | DocumentGroupInclusion,
9 | )
10 |
11 |
async def get_root_groups() -> list[DocumentGroup]:
    """
    Fetch the root groups, i.e. groups that never appear as a child in
    DocumentGroupInclusion.

    Returns:
        List of root DocumentGroups, sorted by name
    """
    # IDs of every group that is nested under some parent
    nested_ids = set(
        await DocumentGroupInclusion.all().values_list("child_id", flat=True)
    )

    # Load all groups and keep those that are nobody's child
    roots = [
        grp for grp in await DocumentGroup.all()
        if grp.id not in nested_ids
    ]
    roots.sort(key=lambda grp: grp.name)
    return roots
30 |
31 |
async def get_children_groups(parent_group: DocumentGroup) -> list[DocumentGroup]:
    """
    Fetch the direct children of a group.

    Args:
        parent_group: Group whose immediate children are wanted.

    Returns:
        Child DocumentGroup objects, sorted by name.
    """
    # Prefetch the related child so ``link.child`` is already loaded below.
    links = await DocumentGroupInclusion.filter(
        parent=parent_group
    ).prefetch_related("child")

    direct_children = [link.child for link in links]
    direct_children.sort(key=lambda grp: grp.name)
    return direct_children
48 |
49 |
async def print_group_tree(
    group: DocumentGroup | None = None,
    max_depth: int = 2,
    current_depth: int = 0,
    prefix: str = "",
    is_last: bool = True
):
    """
    Print the group hierarchy as a tree.

    Called with ``group=None`` it prints a header and then every root group;
    otherwise it prints ``group`` and recurses into its children until
    ``max_depth`` is reached.

    Args:
        group: Group to print (None means start from the root groups).
        max_depth: Deepest level that is still expanded.
        current_depth: Depth of ``group`` (0 for the group the walk starts at).
        prefix: Text printed before the connector (used to draw the tree).
        is_last: Whether ``group`` is the last child of its parent.
    """
    if group is None:
        # Entry point without a group: print every root as its own tree.
        root_groups = await get_root_groups()

        if not root_groups:
            typer.secho("📭 DocumentGroup이 없습니다.", fg=typer.colors.YELLOW)
            return

        typer.echo("\n📁 Document Group 계층 구조:\n")
        last_index = len(root_groups) - 1
        for i, root in enumerate(root_groups):
            await print_group_tree(root, max_depth, 0, "", i == last_index)
        return

    if current_depth == 0:
        # The starting group is printed flush-left, without a connector.
        typer.secho(f"{group.name}", fg=typer.colors.CYAN, bold=True)
        child_prefix = ""
    else:
        connector = "└── " if is_last else "├── "
        typer.secho(f"{prefix}{connector}{group.name}", fg=typer.colors.GREEN)
        # The continuation segment must be as wide as the connector (four
        # columns), otherwise levels below this one drift out of alignment.
        child_prefix = prefix + ("    " if is_last else "│   ")

    # Stop expanding once the maximum depth has been reached.
    if current_depth >= max_depth:
        return

    children = await get_children_groups(group)
    last_index = len(children) - 1
    for i, child in enumerate(children):
        await print_group_tree(
            child,
            max_depth,
            current_depth + 1,
            child_prefix,
            i == last_index
        )
110 |
111 |
async def show_group_tree_command(
    group_name: Optional[str] = None,
    depth: int = 2
):
    """
    Command that prints the DocumentGroup hierarchy.

    Args:
        group_name: Name of one group to print. It is looked up by its
            lower-cased form. When omitted, the root groups are listed instead.
        depth: Maximum depth to print. Only used when ``group_name`` is given;
            the root listing always uses a depth of 1.

    Raises:
        typer.Exit: With code 1 when ``group_name`` matches no group.
    """
    if not group_name:
        # No name given: list the root groups with a fixed depth of 1.
        await print_group_tree(None, max_depth=1)
    else:
        target = await DocumentGroup.get_or_none(name=group_name.lower())

        if not target:
            typer.secho(
                f"❌ '{group_name}' 그룹을 찾을 수 없습니다.",
                fg=typer.colors.RED
            )
            raise typer.Exit(1)

        typer.echo(f"\n📁 '{target.name}' 그룹 계층 구조 (depth={depth}):\n")
        await print_group_tree(target, max_depth=depth)

    typer.echo()  # trailing blank line
141 |
--------------------------------------------------------------------------------
/maru_lang/api/endpoints/auth.py:
--------------------------------------------------------------------------------
1 | from typing import Optional
2 | from fastapi import APIRouter, HTTPException, Depends, Response
3 | from maru_lang.enums.auth import UserRoleCode
4 | from maru_lang.configs.system_config import get_system_config
5 | from maru_lang.dependencies.auth import get_user
6 |
7 | config = get_system_config()
8 | from maru_lang.dependencies.email import get_email_service_dependency, EmailService
9 | from maru_lang.schemas.auth import (
10 | VerifyCodeRequest,
11 | SignUpRequest,
12 | LogoutRequest,
13 | UserGroupsResponse,
14 | UserGroupResponse,
15 | )
16 | from maru_lang.services.auth import (
17 | generate_token,
18 | verify_OTP,
19 | create_or_get_user,
20 | delete_token,
21 | generate_OTP,
22 | get_user_groups,
23 | )
24 |
25 |
# Every route declared in this module is served under /auth and tagged "Auth".
router = APIRouter(prefix="/auth", tags=["Auth"])
30 |
31 |
@router.post("/login")
async def login(
    request: SignUpRequest,
    email_service: Optional[EmailService] = Depends(
        get_email_service_dependency)
) -> str:
    """
    Issue an OTP for the given e-mail address and return that address.

    When an email service is available the code is mailed. If sending fails,
    the OTP is deleted and generated again without the email service.

    Raises:
        HTTPException: 400 for any error raised while handling the request.
    """
    try:
        # TODO Email validation
        otp = await generate_OTP(request.email, email_service)

        # Mail is only attempted when an email service is configured.
        if email_service and not email_service.send_otp(request.email, otp.code):
            # Sending failed: drop this OTP and create a new one without the
            # email service (the original note says this falls back to
            # DEFAULT_VALIDATION_CODE).
            await otp.delete()
            otp = await generate_OTP(request.email, None)

        return otp.email
    except Exception as e:
        print(e)
        raise HTTPException(
            status_code=400,
            detail="서버가 점검 중 입니다. 다시 시도해주세요.")
56 |
57 |
@router.post("/logout")
async def logout(
    request: LogoutRequest,
    response: Response,
    user=Depends(get_user)
) -> dict:
    """
    Delete the caller's token for one device and clear the refresh cookie.

    Raises:
        HTTPException: 500 carrying the error text when anything fails.
    """
    try:
        await delete_token(user.id, request.device_id)
        response.delete_cookie(key="refresh_token", path="/", samesite="strict")
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))

    return {"message": "Logged out successfully"}
74 |
75 |
@router.post("/verify/code")
async def verify_code(
    response: Response,
    request: VerifyCodeRequest
):
    """
    Verify an OTP, then sign the user in.

    On success the user is fetched or created with the EDITOR role, the
    refresh token is stored in an HTTP-only cookie, and the access token is
    returned.

    Raises:
        HTTPException: 400 carrying the error text for an invalid code or any
            other failure.
    """
    try:
        code_ok = await verify_OTP(request.email, request.code)
        if not code_ok:
            raise Exception("Invalid or expired code")

        user = await create_or_get_user(
            email=request.email,
            role=UserRoleCode.EDITOR.value,
        )
        tokens = await generate_token(user.id, user.role_id, request.device_id)
        access_token, refresh_token = tokens

        # The cookie lifetime is in seconds; the setting is in minutes.
        cookie_lifetime = config.auth.refresh_token_expire_minutes * 60
        response.set_cookie(
            key="refresh_token",
            value=refresh_token,
            httponly=True,
            secure=True,
            samesite="strict",
            max_age=cookie_lifetime,
        )

        return access_token
    except Exception as e:
        raise HTTPException(status_code=400, detail=str(e))
105 |
106 |
@router.get("/verify")
async def verify(_=Depends(get_user)):
    """Return a fixed OK payload once the ``get_user`` dependency has resolved."""
    payload = {"message": "ok"}
    return payload
110 |
111 |
@router.get("/user/groups", response_model=UserGroupsResponse)
async def get_current_user_groups(
    user=Depends(get_user)
):
    """
    List the user groups the authenticated user belongs to.

    Returns:
        UserGroupsResponse: The groups (id and name) plus their count.

    Raises:
        HTTPException: 500 carrying the error text when the lookup fails.
    """
    try:
        memberships = await get_user_groups(user)

        # Map each group onto its response schema.
        items = [
            UserGroupResponse(id=entry.id, name=entry.name)
            for entry in memberships
        ]
        count = len(items)
        return UserGroupsResponse(groups=items, total=count)

    except Exception as e:
        print(f"❌ Error fetching user groups: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))
142 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/embedders/manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Embedder: 임베딩 모델 관리 및 벡터 생성
3 | 프로세스 단위로 모델을 캐싱하여 GPU 자원을 효율적으로 사용
4 | """
5 | from typing import Dict, List, Optional
6 | from sentence_transformers import SentenceTransformer
7 |
8 |
class Embedder:
    """
    Embedding model manager.

    Keeps loaded SentenceTransformer models in a per-instance cache keyed by
    model name, and offers ``encode`` as a simple text-to-vector interface.
    """

    def __init__(self, device: Optional[str] = None):
        """
        Args:
            device: Device passed to SentenceTransformer when a model is
                loaded, e.g. "cuda", "cpu", "mps". None is passed through
                unchanged.
        """
        self.device = device
        self.model_cache: Dict[str, SentenceTransformer] = {}

    def encode(
        self,
        texts: List[str],
        model_name: str,
        show_progress: bool = False,
    ) -> List[List[float]]:
        """
        Convert texts into embedding vectors.

        Args:
            texts: Texts to embed.
            model_name: Name of the embedding model.
            show_progress: Whether to show a progress bar.

        Returns:
            List[List[float]]: One vector per input text. An empty list or
            tuple of texts yields ``[]``.
        """
        # Nothing to embed: answer directly instead of loading a model just
        # to produce an empty result.
        if isinstance(texts, (list, tuple)) and not texts:
            return []

        model = self._get_or_load_model(model_name)
        vectors = model.encode(
            texts, show_progress_bar=show_progress, convert_to_numpy=True
        )
        return vectors.tolist()

    def get_dimension(self, model_name: str) -> int:
        """
        Return the embedding dimension of a model (loading it if needed).

        Args:
            model_name: Name of the embedding model.

        Returns:
            int: Dimension reported by the model.
        """
        model = self._get_or_load_model(model_name)
        return model.get_sentence_embedding_dimension()

    def _get_or_load_model(self, model_name: str) -> SentenceTransformer:
        """
        Return the cached model, loading and caching it on first use.

        Args:
            model_name: Name of the embedding model.

        Returns:
            SentenceTransformer: The loaded model instance.
        """
        if model_name not in self.model_cache:
            self.model_cache[model_name] = SentenceTransformer(
                model_name, device=self.device
            )

        return self.model_cache[model_name]

    def unload_model(self, model_name: str) -> bool:
        """
        Drop one model from the cache.

        Args:
            model_name: Name of the model to drop.

        Returns:
            bool: True if the model was cached and has been removed,
            False if it was not in the cache.
        """
        if model_name in self.model_cache:
            del self.model_cache[model_name]
            print(f"🗑️ Model unloaded: {model_name}")
            return True
        return False

    def clear_cache(self):
        """Drop every cached model and print how many were removed."""
        count = len(self.model_cache)
        self.model_cache.clear()
        print(f"🗑️ Cleared {count} model(s) from cache")
100 |
101 |
# Singleton instance
_embedder_instance: Optional[Embedder] = None


def get_embedder(
    device: Optional[str] = None,
    force_new: bool = False,
) -> Embedder:
    """
    Return the module-wide Embedder, creating it on first use.

    Args:
        device: Device for a newly created Embedder, e.g. "cuda", "cpu",
            "mps". When None, the device is read from config. Not used when
            an existing instance is returned.
        force_new: When True, replace the existing instance with a new one
            (intended for tests).

    Returns:
        Embedder: The shared instance.

    Example:
        >>> embedder = get_embedder()
        >>> vectors = embedder.encode(["hello", "world"], "intfloat/multilingual-e5-large")
    """
    global _embedder_instance

    # Fast path: an instance already exists and no rebuild was requested.
    if _embedder_instance is not None and not force_new:
        return _embedder_instance

    # Fall back to the configured device only when the caller gave none.
    resolved_device = _load_device_from_config() if device is None else device
    _embedder_instance = Embedder(device=resolved_device)
    return _embedder_instance
135 |
136 |
def _load_device_from_config() -> Optional[str]:
    """
    Read the embedder ``device`` setting through the config manager.

    The config package is imported lazily. A failed import is ignored
    silently; any other error is printed as a warning.

    Returns:
        Optional[str]: The configured device, or None when the config is
        missing or could not be loaded.
    """
    device: Optional[str] = None
    try:
        from maru_lang.configs import get_config_manager

        embedder_config = get_config_manager().get_embedder_config()
        if embedder_config:
            device = embedder_config.device
    except ImportError:
        pass
    except Exception as e:
        print(f"⚠️ Embedder config 로드 실패: {e}")

    return device
158 |
--------------------------------------------------------------------------------
/maru_lang/models/agents.py:
--------------------------------------------------------------------------------
1 | """
2 | Agent-related data models
3 | """
4 | import asyncio
5 | from dataclasses import dataclass, field
6 | from typing import List, Dict, Any, Optional, Union, TYPE_CHECKING
7 | from maru_lang.enums.chat import ChatProcessStep as ChatStep
8 | from maru_lang.models.chat import ChatHistory
9 | from maru_lang.core.vector_db.retrieve_document import RetrieveDocument
10 |
11 |
@dataclass
class AgentResult:
    """Outcome of running a single agent."""
    success: bool
    result: str = ""  # main output, normalised to a string
    data: Optional[Dict[str, Any]] = None  # optional extra information
    error: Optional[str] = None
    metadata: Optional[Dict[str, Any]] = None

    def _serialize_value(self, value: Any) -> Any:
        """Recursively turn ``value`` into a JSON-compatible structure.

        Checks run in this order: primitives and None pass through; dicts and
        lists/tuples are converted element by element (tuples become lists);
        objects with ``text`` yield that attribute; objects with ``to_dict``
        or ``__dict__`` are converted from that mapping; anything else is
        stringified.
        """
        convert = self._serialize_value

        if value is None or isinstance(value, (str, int, float, bool)):
            return value
        if isinstance(value, dict):
            return {key: convert(item) for key, item in value.items()}
        if isinstance(value, (list, tuple)):
            return [convert(item) for item in value]
        if hasattr(value, 'text'):
            # e.g. MCP TextContent objects
            return value.text
        if hasattr(value, 'to_dict'):
            return convert(value.to_dict())
        if hasattr(value, '__dict__'):
            return convert(value.__dict__)
        # Last resort: string representation.
        return str(value)

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a dict, with ``data`` and ``metadata`` serialized."""
        serialize = self._serialize_value
        payload: Dict[str, Any] = {
            "success": self.success,
            "result": self.result,
        }
        payload["data"] = serialize(self.data)
        payload["error"] = self.error
        payload["metadata"] = serialize(self.metadata)
        return payload
49 |
50 |
@dataclass
class AgentSelection:
    """Outcome of the agent selection step."""
    selected_agents: List[str]
    execution_order: List[str]
    reasoning: str
    parameters: Optional[Dict[str, Any]] = None
    fallback_config: Optional[Dict[str, Any]] = None

    def to_dict(self) -> Dict[str, Any]:
        """Return the selection as a dict; falsy ``parameters`` become ``{}``."""
        exported: Dict[str, Any] = {}
        exported["selected_agents"] = self.selected_agents
        exported["execution_order"] = self.execution_order
        exported["reasoning"] = self.reasoning
        exported["parameters"] = self.parameters or {}
        exported["fallback_config"] = self.fallback_config
        return exported
69 |
70 |
@dataclass
class ExecutionContext:
    """Inputs shared with agents while they run."""
    question: str
    progress_queue: asyncio.Queue
    chat_history: ChatHistory
    metadata: Optional[Dict[str, Any]] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the four fields as a dict, holding the same objects (no copies)."""
        # NOTE(review): the previous comment here said progress_queue is
        # excluded, but it is included in the returned dict. Confirm which
        # behaviour callers expect.
        exported = ("question", "progress_queue", "chat_history", "metadata")
        return {name: getattr(self, name) for name in exported}
88 |
89 |
@dataclass
class ExecutionResult:
    """Combined outcome of orchestrating several agents."""
    agent_results: Dict[str, AgentResult]
    execution_order: List[str]
    success: bool
    errors: Dict[str, str] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return the result as a dict, converting each agent result via its ``to_dict``."""
        per_agent: Dict[str, Any] = {}
        for agent_name, agent_result in self.agent_results.items():
            per_agent[agent_name] = agent_result.to_dict()

        return {
            "agent_results": per_agent,
            "execution_order": self.execution_order,
            "success": self.success,
            "errors": self.errors,
        }
109 |
110 |
@dataclass
class ChatResult:
    """Final result of processing a chat request."""
    # Answer text.
    answer: str
    # RetrieveDocument items attached to the answer; each instance gets its
    # own empty list by default.
    internal_documents: List[RetrieveDocument] = field(
        default_factory=list,
    )
116 |
117 |
@dataclass
class ChatProcess:
    """One chat processing step paired with the data it carries."""
    # Step this entry belongs to.
    step: ChatStep
    # Payload for the step: one of the union members below.
    data: Union[
        AgentSelection,
        ExecutionResult,
        str,
        ChatResult,
    ]
123 |
124 |
@dataclass
class GenerateAnswerResult:
    """Result of the answer generation step."""
    # Generated answer text.
    answer: str
    # Documents attached to the answer; a fresh list per instance.
    documents: List[Any] = field(default_factory=list)
    # Defaults to True.
    success: bool = True
    # Optional fields, both None unless provided.
    confidence: Optional[float] = None
    metadata: Optional[Dict[str, Any]] = None
133 |
134 |
135 |
--------------------------------------------------------------------------------
/maru_lang/pluggable/rerankers/manager.py:
--------------------------------------------------------------------------------
1 | """
2 | Reranker: 검색 결과 재정렬
3 | 프로세스 단위로 모델을 캐싱하여 GPU 자원을 효율적으로 사용
4 | """
5 | from typing import Dict, List, Optional, Tuple
6 | from maru_lang.configs import get_config_manager
7 | from sentence_transformers import CrossEncoder
8 |
9 |
class Reranker:
    """
    Reranker manager.

    Keeps loaded CrossEncoder models in a per-instance cache keyed by model
    name, and offers ``rerank`` as a simple interface for reordering search
    results.
    """

    def __init__(self, device: Optional[str] = None):
        """
        Args:
            device: Device passed to CrossEncoder when a model is loaded,
                e.g. "cuda", "cpu", "mps". None is passed through unchanged.
        """
        self.device = device
        self.model_cache: Dict[str, CrossEncoder] = {}

    def rerank(
        self,
        query: str,
        documents: List[str],
        model_name: str,
        top_k: Optional[int] = None,
    ) -> List[Tuple[int, float]]:
        """
        Score documents against a query and order them by score.

        Args:
            query: Search query.
            documents: Documents to reorder.
            model_name: Name of the reranker model.
            top_k: Keep only the first k results (None keeps all).

        Returns:
            List[Tuple[int, float]]: (original index, score) tuples in
            descending score order. Empty when there are no documents.
        """
        # Build the query-document pairs.
        pairs = [[query, doc] for doc in documents]

        # With nothing to score, skip loading the model and calling predict(),
        # which cannot handle an empty batch in every library version.
        if not pairs:
            return []

        model = self._get_or_load_model(model_name)
        scores = model.predict(pairs)

        # Pair each score with its original index, best score first.
        ranked = sorted(
            ((idx, float(score)) for idx, score in enumerate(scores)),
            key=lambda item: item[1],
            reverse=True,
        )

        if top_k is not None:
            ranked = ranked[:top_k]

        return ranked

    def _get_or_load_model(self, model_name: str) -> CrossEncoder:
        """
        Return the cached model, loading and caching it on first use.

        Args:
            model_name: Name of the reranker model.

        Returns:
            CrossEncoder: The loaded model instance.
        """
        if model_name not in self.model_cache:
            print(f"Loading reranker model: {model_name}...")
            self.model_cache[model_name] = CrossEncoder(
                model_name, device=self.device
            )
            device_info = f"device={self.device}" if self.device else "auto"
            print(f"✅ Reranker loaded: {model_name} ({device_info})")

        return self.model_cache[model_name]

    def unload_model(self, model_name: str) -> bool:
        """
        Drop one model from the cache.

        Args:
            model_name: Name of the model to drop.

        Returns:
            bool: True if the model was cached and has been removed,
            False if it was not in the cache.
        """
        if model_name in self.model_cache:
            del self.model_cache[model_name]
            print(f"🗑️ Reranker unloaded: {model_name}")
            return True
        return False

    def clear_cache(self):
        """Drop every cached model and print how many were removed."""
        count = len(self.model_cache)
        self.model_cache.clear()
        print(f"🗑️ Cleared {count} reranker model(s) from cache")
105 |
106 |
# Singleton instance
_reranker_instance: Optional[Reranker] = None


def get_reranker(
    device: Optional[str] = None,
    force_new: bool = False,
) -> Reranker:
    """
    Return the module-wide Reranker, creating it on first use.

    Args:
        device: Device for a newly created Reranker, e.g. "cuda", "cpu",
            "mps". When None, the device is read from config. Not used when
            an existing instance is returned.
        force_new: When True, replace the existing instance with a new one
            (intended for tests).

    Returns:
        Reranker: The shared instance.

    Example:
        >>> reranker = get_reranker()
        >>> ranked = reranker.rerank(
        ...     query="python tutorial",
        ...     documents=["doc1", "doc2", "doc3"],
        ...     model_name="BAAI/bge-reranker-v2-m3",
        ...     top_k=5
        ... )
    """
    global _reranker_instance

    # Fast path: an instance already exists and no rebuild was requested.
    if _reranker_instance is not None and not force_new:
        return _reranker_instance

    # Fall back to the configured device (the embedder's device setting)
    # only when the caller gave none.
    resolved_device = _load_device_from_config() if device is None else device
    _reranker_instance = Reranker(device=resolved_device)
    return _reranker_instance
145 |
146 |
def _load_device_from_config() -> Optional[str]:
    """
    Read the ``device`` setting through the config manager.

    The value comes from the embedder config, so the reranker uses the same
    device as the embedder. An ImportError is ignored silently; any other
    error is printed as a warning.

    Returns:
        Optional[str]: The configured device, or None when the config is
        missing or could not be loaded.
    """
    device: Optional[str] = None
    try:
        shared_config = get_config_manager().get_embedder_config()
        if shared_config:
            device = shared_config.device
    except ImportError:
        pass
    except Exception as e:
        print(f"⚠️ Reranker config 로드 실패: {e}")

    return device
168 |
--------------------------------------------------------------------------------