├── app ├── __init__.py ├── api │ ├── __init__.py │ ├── routes │ │ ├── __init__.py │ │ ├── health.py │ │ ├── search.py │ │ ├── ingest.py │ │ └── qa.py │ └── schemas.py ├── core │ ├── __init__.py │ ├── db.py │ └── config.py ├── application │ ├── __init__.py │ └── services │ │ ├── __init__.py │ │ ├── search_service.py │ │ ├── qa_service.py │ │ └── ingestion_service.py ├── infrastructure │ ├── __init__.py │ ├── parsers │ │ ├── __init__.py │ │ └── pdf_reader.py │ ├── text │ │ ├── __init__.py │ │ └── text_utils.py │ ├── embeddings │ │ ├── __init__.py │ │ └── sentence_transformer_provider.py │ ├── persistence │ │ ├── __init__.py │ │ └── models.py │ └── vectorstore │ │ ├── __init__.py │ │ └── faiss_index.py └── main.py ├── requirements.txt ├── docker-compose.yml ├── Dockerfile ├── .gitignore └── README.md /app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/api/routes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/application/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/application/services/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/persistence/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/vectorstore/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/api/routes/health.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | router = APIRouter(tags=["health"]) 4 | 5 | 6 | @router.get("/health") 7 | def health() -> dict: 8 | return {"status": "ok"} 9 | 
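The health route above is the simplest piece of the API surface. As a quick illustration (not a file in the repository), a minimal smoke test could exercise it with FastAPI's TestClient, assuming the `app` object from app/main.py and a test runner such as pytest being installed:

```python
# Hypothetical smoke test (not part of the repo); assumes pytest is installed
# and that app.main:app includes health_router at the root, as in app/main.py.
from fastapi.testclient import TestClient

from app.main import app

client = TestClient(app)


def test_health_returns_ok() -> None:
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
```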
-------------------------------------------------------------------------------- /app/infrastructure/parsers/pdf_reader.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | from pypdf import PdfReader 3 | 4 | 5 | def extract_text_pages(file_path: str) -> Iterable[str]: 6 | reader = PdfReader(file_path) 7 | for page in reader.pages: 8 | text = page.extract_text() or "" 9 | yield text 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.115.0 2 | uvicorn[standard]==0.30.6 3 | pydantic==2.9.2 4 | SQLAlchemy==2.0.34 5 | sentence-transformers==3.0.1 6 | faiss-cpu==1.8.0.post1 7 | pypdf==4.3.1 8 | python-multipart==0.0.9 9 | httpx==0.27.0 10 | numpy>=1.25,<3 11 | tenacity==9.0.0 12 | openai==1.51.0 13 | starlette==0.38.5 14 | typing-extensions>=4.9.0 15 | python-dotenv==1.0.1 16 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | api: 3 | build: 4 | context: . 5 | dockerfile: Dockerfile 6 | container_name: backend_architecture_api 7 | ports: 8 | - "8000:8000" 9 | environment: 10 | - DATA_DIR=/app/data 11 | - OPENAI_API_KEY=${OPENAI_API_KEY:-} 12 | volumes: 13 | - ./data:/app/data 14 | restart: unless-stopped 15 | 16 | 17 | -------------------------------------------------------------------------------- /app/application/services/search_service.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | 3 | from app.core.config import settings 4 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_query 5 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 6 | 7 | 8 | def search_documents(query: str, top_k: int) -> List[Dict[str, Any]]: 9 | query_vec = embed_query(query) 10 | results = VectorIndex.search(query_vec=query_vec, top_k=top_k or settings.TOP_K_DEFAULT) 11 | return results 12 | -------------------------------------------------------------------------------- /app/api/routes/search.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from fastapi import APIRouter, HTTPException 3 | 4 | from app.core.config import settings 5 | from app.api.schemas import SearchRequest, SearchResult 6 | from app.application.services.search_service import search_documents 7 | 8 | router = APIRouter(tags=["search"]) 9 | 10 | 11 | @router.post("/search", response_model=List[SearchResult]) 12 | def search(req: SearchRequest): 13 | try: 14 | return search_documents(query=req.query, top_k=req.k or settings.TOP_K_DEFAULT) 15 | except Exception as exc: 16 | raise HTTPException(status_code=500, detail=str(exc)) 17 | -------------------------------------------------------------------------------- /app/api/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional, List 3 | 4 | 5 | class IngestTextRequest(BaseModel): 6 | text: str 7 | uri: Optional[str] = None 8 | 9 | 10 | class SearchRequest(BaseModel): 11 | query: str 12 | k: int = 5 13 | 14 | 15 | class QARequest(BaseModel): 16 | question: str 17 | k: int = 5 18 | use_openai: bool = False 19 | 20 | 21 | class 
CompletenessRequest(BaseModel): 22 | query: str 23 | k: int = 20 24 | 25 | 26 | class SearchResult(BaseModel): 27 | content: str 28 | score: float 29 | document_id: int 30 | uri: Optional[str] 31 | chunk_index: int 32 | -------------------------------------------------------------------------------- /app/core/db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine # pyright: ignore[reportMissingImports] 2 | from sqlalchemy.orm import sessionmaker, declarative_base # pyright: ignore[reportMissingImports] 3 | from app.core.config import settings 4 | 5 | 6 | DATABASE_URL = f"sqlite:///{settings.DB_PATH}" 7 | 8 | engine = create_engine( 9 | DATABASE_URL, connect_args={"check_same_thread": False} 10 | ) 11 | 12 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 13 | 14 | Base = declarative_base() 15 | 16 | 17 | def get_session(): 18 | session = SessionLocal() 19 | try: 20 | yield session 21 | finally: 22 | session.close() 23 | -------------------------------------------------------------------------------- /app/api/routes/ingest.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, UploadFile, File, HTTPException 2 | 3 | from app.api.schemas import IngestTextRequest 4 | from app.application.services.ingestion_service import ingest_text_document, ingest_file_document 5 | 6 | router = APIRouter(tags=["ingest"]) 7 | 8 | 9 | @router.post("/ingest/text") 10 | def ingest_text(req: IngestTextRequest) -> dict: 11 | try: 12 | result = ingest_text_document(text=req.text, uri=req.uri) 13 | return result 14 | except Exception as exc: 15 | raise HTTPException(status_code=500, detail=str(exc)) 16 | 17 | 18 | @router.post("/ingest/file") 19 | async def ingest_file(file: UploadFile = File(...)) -> dict: 20 | try: 21 | result = await ingest_file_document(file) 22 | return result 23 | except ValueError as ve: 24 | raise HTTPException(status_code=400, detail=str(ve)) 25 | except Exception as exc: 26 | raise HTTPException(status_code=500, detail=str(exc)) 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | # Environment configuration 4 | ENV PYTHONDONTWRITEBYTECODE=1 \ 5 | PYTHONUNBUFFERED=1 \ 6 | PIP_NO_CACHE_DIR=1 7 | 8 | # System dependencies required by faiss-cpu (OpenMP) 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends libgomp1 \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | WORKDIR /app 14 | 15 | # Install Python dependencies first (leverages Docker layer caching) 16 | COPY requirements.txt /app/requirements.txt 17 | RUN pip install --upgrade pip \ 18 | && pip install -r /app/requirements.txt 19 | 20 | # Copy project files 21 | COPY app /app/app 22 | COPY README.md /app/README.md 23 | COPY .env.local /app/.env.local 24 | 25 | # Ensure data directory exists inside the image (will be mounted in compose) 26 | RUN mkdir -p /app/data 27 | 28 | # Default runtime environment 29 | ENV DATA_DIR=/app/data 30 | 31 | EXPOSE 8000 32 | 33 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] 34 | 35 | 36 | -------------------------------------------------------------------------------- /app/infrastructure/text/text_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Iterable, List 
3 | 4 | 5 | _WHITESPACE_RE = re.compile(r"\s+") 6 | 7 | 8 | def clean_text(text: str) -> str: 9 | text = text.replace("\x00", " ") 10 | text = _WHITESPACE_RE.sub(" ", text).strip() 11 | return text 12 | 13 | 14 | def estimate_tokens(text: str) -> int: 15 | # Lightweight tokens approximation 16 | return max(1, len(text.split())) 17 | 18 | 19 | def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]: 20 | if chunk_size <= 0: 21 | return [text] 22 | text = clean_text(text) 23 | if not text: 24 | return [] 25 | chunks: List[str] = [] 26 | start = 0 27 | n = len(text) 28 | step = max(1, chunk_size - overlap) 29 | while start < n: 30 | end = min(n, start + chunk_size) 31 | chunk = text[start:end] 32 | chunks.append(chunk) 33 | if end == n: 34 | break 35 | start += step 36 | return chunks 37 | -------------------------------------------------------------------------------- /app/api/routes/qa.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, HTTPException 2 | 3 | from app.core.config import settings 4 | from app.api.schemas import QARequest, CompletenessRequest 5 | from app.application.services.qa_service import answer_question_and_citations, completeness_check 6 | 7 | router = APIRouter(tags=["qa"]) 8 | 9 | 10 | @router.post("/qa") 11 | def qa(req: QARequest): 12 | try: 13 | payload = answer_question_and_citations( 14 | question=req.question, top_k=req.k or settings.TOP_K_DEFAULT, use_openai=req.use_openai 15 | ) 16 | return payload 17 | except Exception as exc: 18 | raise HTTPException(status_code=500, detail=str(exc)) 19 | 20 | 21 | @router.post("/completeness") 22 | def completeness(req: CompletenessRequest) -> dict: 23 | try: 24 | result = completeness_check(query=req.query, top_k=req.k or settings.TOP_K_DEFAULT) 25 | return result 26 | except Exception as exc: 27 | raise HTTPException(status_code=500, detail=str(exc)) 28 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from fastapi.middleware.cors import CORSMiddleware 3 | 4 | from app.core.config import settings 5 | from app.core.db import Base, engine 6 | from app.infrastructure.embeddings.sentence_transformer_provider import get_embedding_dimension 7 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 8 | 9 | from app.api.routes.health import router as health_router 10 | from app.api.routes.ingest import router as ingest_router 11 | from app.api.routes.search import router as search_router 12 | from app.api.routes.qa import router as qa_router 13 | 14 | 15 | app = FastAPI(title="Knowledge Base Search & Q&A", version="0.1.0") 16 | 17 | app.add_middleware( 18 | CORSMiddleware, 19 | allow_origins=["*"], 20 | allow_credentials=True, 21 | allow_methods=["*"], 22 | allow_headers=["*"], 23 | ) 24 | 25 | 26 | @app.on_event("startup") 27 | def on_startup() -> None: 28 | Base.metadata.create_all(bind=engine) 29 | VectorIndex.initialize(dimension=get_embedding_dimension()) 30 | 31 | 32 | # Routers 33 | app.include_router(health_router) 34 | app.include_router(ingest_router) 35 | app.include_router(search_router) 36 | app.include_router(qa_router) 37 | -------------------------------------------------------------------------------- /app/infrastructure/persistence/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, 
Integer, String, Text, ForeignKey, DateTime 2 | from sqlalchemy.orm import relationship 3 | from datetime import datetime 4 | 5 | from app.core.db import Base 6 | 7 | 8 | class Document(Base): 9 | __tablename__ = "documents" 10 | 11 | id = Column(Integer, primary_key=True, index=True) 12 | uri = Column(String(512), unique=False, nullable=True) 13 | source_type = Column(String(32), nullable=False, default="api") 14 | sha256 = Column(String(64), nullable=False, index=True) 15 | num_chunks = Column(Integer, nullable=False, default=0) 16 | created_at = Column(DateTime, default=datetime.utcnow) 17 | updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) 18 | 19 | chunks = relationship("Chunk", back_populates="document", cascade="all, delete-orphan") 20 | 21 | 22 | class Chunk(Base): 23 | __tablename__ = "chunks" 24 | 25 | id = Column(Integer, primary_key=True, index=True) 26 | document_id = Column(Integer, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, index=True) 27 | chunk_index = Column(Integer, nullable=False) 28 | content = Column(Text, nullable=False) 29 | token_count = Column(Integer, nullable=False, default=0) 30 | created_at = Column(DateTime, default=datetime.utcnow) 31 | 32 | document = relationship("Document", back_populates="chunks") 33 | -------------------------------------------------------------------------------- /app/core/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from dotenv import load_dotenv, find_dotenv 4 | 5 | _env_file = os.getenv("ENV_FILE", ".env.local") 6 | if os.path.exists(_env_file): 7 | load_dotenv(_env_file) 8 | else: 9 | _found = find_dotenv(".env") 10 | if _found: 11 | load_dotenv(_found) 12 | 13 | 14 | def _to_int(value: str, default: int) -> int: 15 | try: 16 | return int(value) 17 | except Exception: 18 | return default 19 | 20 | 21 | @dataclass 22 | class Settings: 23 | DATA_DIR: str = os.getenv("DATA_DIR", os.path.join(os.getcwd(), "data")) 24 | DB_PATH: str = os.getenv("DB_PATH", os.path.join(DATA_DIR, "db.sqlite3")) 25 | INDEX_PATH: str = os.getenv("INDEX_PATH", os.path.join(DATA_DIR, "index.faiss")) 26 | INDEX_META_PATH: str = os.getenv("INDEX_META_PATH", os.path.join(DATA_DIR, "index_meta.json")) 27 | 28 | MODEL_NAME: str = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2") 29 | DEVICE: str = os.getenv("DEVICE", "cpu") 30 | 31 | CHUNK_SIZE_CHARS: int = _to_int(os.getenv("CHUNK_SIZE_CHARS", "1000"), 1000) 32 | CHUNK_OVERLAP_CHARS: int = _to_int(os.getenv("CHUNK_OVERLAP_CHARS", "200"), 200) 33 | 34 | TOP_K_DEFAULT: int = _to_int(os.getenv("TOP_K_DEFAULT", "5"), 5) 35 | 36 | OPENAI_API_KEY: str | None = os.getenv("OPENAI_API_KEY") 37 | 38 | 39 | settings = Settings() 40 | 41 | os.makedirs(settings.DATA_DIR, exist_ok=True) 42 | -------------------------------------------------------------------------------- /app/infrastructure/embeddings/sentence_transformer_provider.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from typing import List 3 | import numpy as np 4 | 5 | from sentence_transformers import SentenceTransformer 6 | 7 | from app.core.config import settings 8 | 9 | _model_lock = threading.Lock() 10 | _model: SentenceTransformer | None = None 11 | _dim: int | None = None 12 | 13 | 14 | def _load_model() -> SentenceTransformer: 15 | global _model, _dim 16 | if _model is None: 17 | with _model_lock: 18 | if _model is None: 
19 | _model = SentenceTransformer(settings.MODEL_NAME, device=settings.DEVICE) 20 | # Probe dimension 21 | probe = _model.encode(["dim"], convert_to_numpy=True, normalize_embeddings=False) 22 | _dim = probe.shape[1] 23 | return _model 24 | 25 | 26 | def get_embedding_dimension() -> int: 27 | if _dim is None: 28 | _load_model() 29 | assert _dim is not None 30 | return _dim 31 | 32 | 33 | def _normalize(matrix: np.ndarray) -> np.ndarray: 34 | norms = np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-12 35 | return matrix / norms 36 | 37 | 38 | def embed_texts(texts: List[str]) -> np.ndarray: 39 | model = _load_model() 40 | vectors = model.encode(texts, convert_to_numpy=True, normalize_embeddings=False) 41 | vectors = _normalize(vectors) 42 | return vectors.astype(np.float32) 43 | 44 | 45 | def embed_query(text: str) -> np.ndarray: 46 | vec = embed_texts([text]) 47 | return vec 48 | -------------------------------------------------------------------------------- /app/application/services/qa_service.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | import os 3 | 4 | import numpy as np 5 | 6 | from app.core.config import settings 7 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_query 8 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 9 | 10 | try: 11 | from openai import OpenAI 12 | except Exception: 13 | OpenAI = None 14 | 15 | 16 | def _format_citations(chunks: List[dict]) -> str: 17 | lines = [] 18 | for i, c in enumerate(chunks, start=1): 19 | uri = c.get("uri") or f"doc-{c.get('document_id')}" 20 | snippet = c.get("content", "").strip().replace("\n", " ") 21 | if len(snippet) > 500: 22 | snippet = snippet[:500] + "..." 23 | lines.append(f"[{i}] ({uri}) {snippet}") 24 | return "\n".join(lines) 25 | 26 | 27 | def answer_question_and_citations(*, question: str, top_k: int, use_openai: bool = False) -> Dict[str, Any]: 28 | q_vec = embed_query(question) 29 | chunks = VectorIndex.search(q_vec, top_k) 30 | if use_openai and settings.OPENAI_API_KEY and OpenAI is not None: 31 | client = OpenAI(api_key=settings.OPENAI_API_KEY) 32 | context = _format_citations(chunks) 33 | prompt = ( 34 | "You are a helpful assistant. Answer the user's question using ONLY the context provided.\n" 35 | "If the answer cannot be found in the context, say you don't know.\n\n" 36 | f"Context:\n{context}\n\nQuestion: {question}\nAnswer:" 37 | ) 38 | try: 39 | completion = client.chat.completions.create( 40 | model="gpt-4o-mini", 41 | messages=[{"role": "user", "content": prompt}], 42 | temperature=0.0, 43 | ) 44 | answer = completion.choices[0].message.content or "" 45 | except Exception as e: 46 | answer = f"Retrieval-only fallback due to LLM error: {e}\n\n" + _format_citations(chunks) 47 | else: 48 | answer = "Retrieval-only mode. 
Provide your own synthesis using these snippets:\n\n" + _format_citations(chunks) 49 | return {"answer": answer, "citations": chunks} 50 | 51 | 52 | def completeness_check(*, query: str, top_k: int) -> Dict[str, Any]: 53 | q_vec = embed_query(query) 54 | chunks = VectorIndex.search(q_vec, top_k) 55 | scores = [c.get("score", 0.0) for c in chunks] 56 | coverage = float(sum(scores) / max(1, len(scores))) if scores else 0.0 57 | is_complete = coverage >= 0.4 58 | return {"is_complete": is_complete, "coverage": coverage, "k": top_k, "results": chunks} 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # Virtual environments 27 | .venv/ 28 | venv/ 29 | env/ 30 | ENV/ 31 | 32 | # IDE/editor 33 | .vscode/ 34 | .idea/ 35 | .DS_Store 36 | 37 | # Environment variables 38 | .env 39 | .env.* 40 | 41 | # Logs 42 | *.log 43 | 44 | # Project data (local artifacts) 45 | data/ 46 | 47 | # SQLite and vector indexes (if outside data/) 48 | *.sqlite3 49 | *.faiss 50 | 51 | # Cache 52 | .cache/ 53 | **/__pycache__/ 54 | 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | *$py.class 59 | 60 | # C extensions 61 | *.so 62 | 63 | # Distribution / packaging 64 | .Python 65 | build/ 66 | develop-eggs/ 67 | dist/ 68 | downloads/ 69 | eggs/ 70 | .eggs/ 71 | lib/ 72 | lib64/ 73 | parts/ 74 | sdist/ 75 | var/ 76 | wheels/ 77 | share/python-wheels/ 78 | *.egg-info/ 79 | .installed.cfg 80 | *.egg 81 | MANIFEST 82 | 83 | # PyInstaller 84 | *.manifest 85 | *.spec 86 | 87 | # Installer logs 88 | pip-log.txt 89 | pip-delete-this-directory.txt 90 | 91 | # Unit test / coverage reports 92 | htmlcov/ 93 | .tox/ 94 | .nox/ 95 | .coverage 96 | .coverage.* 97 | .cache 98 | nosetests.xml 99 | coverage.xml 100 | *.cover 101 | *.py,cover 102 | .hypothesis/ 103 | .pytest_cache/ 104 | 105 | # Translations 106 | *.mo 107 | *.pot 108 | 109 | # Django/Flask instances 110 | instance/ 111 | .webassets-cache 112 | 113 | # Scrapy stuff 114 | .scrapy 115 | 116 | # Sphinx documentation 117 | /docs/_build/ 118 | 119 | # Jupyter Notebook 120 | .ipynb_checkpoints 121 | 122 | # IPython 123 | profile_default/ 124 | ipython_config.py 125 | 126 | # pyenv 127 | .python-version 128 | 129 | # pipenv 130 | Pipfile.lock 131 | 132 | # poetry 133 | poetry.lock 134 | 135 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 136 | __pypackages__/ 137 | 138 | # Celery 139 | celerybeat-schedule 140 | celerybeat.pid 141 | 142 | # dotenv 143 | .env 144 | .env.* 145 | 146 | # virtualenv 147 | .venv/ 148 | venv/ 149 | ENV/ 150 | env/ 151 | env.bak/ 152 | venv.bak/ 153 | 154 | # mypy / pyre / pytype 155 | .mypy_cache/ 156 | .pyre/ 157 | .pytype/ 158 | 159 | # IDEs 160 | .vscode/ 161 | .idea/ 162 | *.iml 163 | 164 | # OS files 165 | .DS_Store 166 | Thumbs.db 167 | 168 | # Logs 169 | *.log 170 | 171 | # Project data artifacts (persisted at runtime) 172 | data/* 173 | !data/.gitkeep 174 | 175 | # FAISS / SQLite snapshots if outside data/ 176 | *.faiss 177 | *.sqlite3 178 | 179 | # Uvicorn reload dirs cache 180 | **/__pycache__/ 181 | -------------------------------------------------------------------------------- /app/application/services/ingestion_service.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import tempfile 4 | from typing import List, Tuple, Optional 5 | 6 | from fastapi import UploadFile 7 | from sqlalchemy.orm import Session 8 | 9 | from app.core.config import settings 10 | from app.core.db import SessionLocal 11 | from app.infrastructure.persistence.models import Document, Chunk 12 | from app.infrastructure.text.text_utils import chunk_text, estimate_tokens, clean_text 13 | from app.infrastructure.parsers.pdf_reader import extract_text_pages 14 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_texts 15 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 16 | 17 | 18 | def _sha256_bytes(data: bytes) -> str: 19 | h = hashlib.sha256() 20 | h.update(data) 21 | return h.hexdigest() 22 | 23 | 24 | def _sha256_text(text: str) -> str: 25 | return _sha256_bytes(text.encode("utf-8")) 26 | 27 | 28 | def _persist_document(session: Session, *, uri: Optional[str], source_type: str, sha256: str, num_chunks: int) -> Document: 29 | doc = Document(uri=uri, source_type=source_type, sha256=sha256, num_chunks=num_chunks) 30 | session.add(doc) 31 | session.commit() 32 | session.refresh(doc) 33 | return doc 34 | 35 | 36 | def _persist_chunks(session: Session, document_id: int, chunks: List[str]) -> List[Chunk]: 37 | chunk_rows: List[Chunk] = [] 38 | for idx, content in enumerate(chunks): 39 | row = Chunk(document_id=document_id, chunk_index=idx, content=content, token_count=estimate_tokens(content)) 40 | session.add(row) 41 | chunk_rows.append(row) 42 | session.commit() 43 | for row in chunk_rows: 44 | session.refresh(row) 45 | return chunk_rows 46 | 47 | 48 | def _index_chunks(chunks: List[Chunk]) -> None: 49 | texts = [c.content for c in chunks] 50 | vectors = embed_texts(texts) 51 | ids = [int(c.id) for c in chunks] 52 | VectorIndex.add(vectors, ids) 53 | 54 | 55 | def _maybe_skip_existing(session: Session, sha256: str, uri: Optional[str]) -> Optional[Document]: 56 | existing = session.query(Document).filter(Document.sha256 == sha256).first() 57 | if existing is not None: 58 | return existing 59 | if uri: 60 | existing_uri = session.query(Document).filter(Document.uri == uri).first() 61 | if existing_uri is not None: 62 | old_chunk_ids = [c.id for c in existing_uri.chunks] 63 | VectorIndex.remove_ids(old_chunk_ids) 64 | session.delete(existing_uri) 65 | session.commit() 66 | return None 67 | 68 | 69 | def _ingest_text_core(text: str, uri: Optional[str], source_type: str) -> dict: 70 | cleaned = clean_text(text) 71 | if not cleaned: 72 | return {"status": "empty", 
"num_chunks": 0} 73 | parts = chunk_text(cleaned, settings.CHUNK_SIZE_CHARS, settings.CHUNK_OVERLAP_CHARS) 74 | content_hash = _sha256_text(cleaned) 75 | with SessionLocal() as session: 76 | existing = _maybe_skip_existing(session, content_hash, uri) 77 | if existing is not None: 78 | return {"status": "skipped", "document_id": int(existing.id), "num_chunks": int(existing.num_chunks)} 79 | doc = _persist_document(session, uri=uri, source_type=source_type, sha256=content_hash, num_chunks=len(parts)) 80 | chunk_rows = _persist_chunks(session, doc.id, parts) 81 | _index_chunks(chunk_rows) 82 | return {"status": "ingested", "document_id": int(doc.id), "num_chunks": len(parts)} 83 | 84 | 85 | def ingest_text_document(*, text: str, uri: Optional[str] = None) -> dict: 86 | return _ingest_text_core(text=text, uri=uri, source_type="api") 87 | 88 | 89 | async def ingest_file_document(file: UploadFile) -> dict: 90 | filename = file.filename or "uploaded" 91 | ext = os.path.splitext(filename.lower())[1] 92 | if ext not in [".txt", ".pdf"]: 93 | raise ValueError("Only .txt and .pdf are supported for this prototype") 94 | data = await file.read() 95 | if ext == ".txt": 96 | text = data.decode("utf-8", errors="ignore") 97 | return _ingest_text_core(text=text, uri=filename, source_type="file") 98 | else: 99 | with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp: 100 | tmp.write(data) 101 | tmp.flush() 102 | temp_path = tmp.name 103 | try: 104 | pages = list(extract_text_pages(temp_path)) 105 | combined = "\n\n".join(pages) 106 | return _ingest_text_core(text=combined, uri=filename, source_type="file") 107 | finally: 108 | try: 109 | os.remove(temp_path) 110 | except Exception: 111 | pass 112 | -------------------------------------------------------------------------------- /app/infrastructure/vectorstore/faiss_index.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import threading 4 | from typing import List, Dict, Any 5 | 6 | import faiss 7 | import numpy as np 8 | 9 | from app.core.config import settings 10 | from app.core.db import SessionLocal 11 | from app.infrastructure.persistence.models import Chunk, Document 12 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_texts 13 | 14 | 15 | class VectorIndex: 16 | _index: faiss.Index | None = None 17 | _lock = threading.RLock() 18 | _dim: int | None = None 19 | 20 | @classmethod 21 | def initialize(cls, dimension: int) -> None: 22 | with cls._lock: 23 | cls._dim = dimension 24 | if os.path.exists(settings.INDEX_PATH): 25 | idx = faiss.read_index(settings.INDEX_PATH) 26 | if isinstance(idx, (faiss.IndexIDMap, faiss.IndexIDMap2)): 27 | cls._index = idx 28 | else: 29 | if idx.d != dimension: 30 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 31 | cls._persist_meta() 32 | cls._save() 33 | cls._rebuild_from_db() 34 | elif idx.ntotal == 0: 35 | cls._index = faiss.IndexIDMap2(idx) 36 | cls._persist_meta() 37 | cls._save() 38 | else: 39 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 40 | cls._persist_meta() 41 | cls._save() 42 | cls._rebuild_from_db() 43 | if cls._index is not None and cls._index.d != dimension: 44 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 45 | cls._persist_meta() 46 | cls._save() 47 | cls._rebuild_from_db() 48 | else: 49 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 50 | cls._persist_meta() 51 | cls._save() 52 | 53 | @classmethod 54 | def _persist_meta(cls) -> None: 
55 | meta = {"dimension": cls._dim, "model": settings.MODEL_NAME} 56 | os.makedirs(os.path.dirname(settings.INDEX_META_PATH), exist_ok=True) 57 | with open(settings.INDEX_META_PATH, "w", encoding="utf-8") as f: 58 | json.dump(meta, f) 59 | 60 | @classmethod 61 | def _save(cls) -> None: 62 | assert cls._index is not None 63 | faiss.write_index(cls._index, settings.INDEX_PATH) 64 | 65 | @classmethod 66 | def _rebuild_from_db(cls, batch_size: int = 256) -> None: 67 | assert cls._index is not None 68 | last_id = 0 69 | with SessionLocal() as session: 70 | while True: 71 | rows: List[Chunk] = ( 72 | session.query(Chunk) 73 | .filter(Chunk.id > last_id) 74 | .order_by(Chunk.id.asc()) 75 | .limit(batch_size) 76 | .all() 77 | ) 78 | if not rows: 79 | break 80 | texts = [r.content for r in rows] 81 | ids = [int(r.id) for r in rows] 82 | vectors = embed_texts(texts) 83 | cls._index.add_with_ids(vectors, np.array(ids, dtype=np.int64)) 84 | last_id = rows[-1].id 85 | cls._save() 86 | 87 | @classmethod 88 | def add(cls, embeddings: np.ndarray, ids: List[int]) -> None: 89 | assert cls._index is not None 90 | with cls._lock: 91 | cls._index.add_with_ids(embeddings, np.array(ids, dtype=np.int64)) 92 | cls._save() 93 | 94 | @classmethod 95 | def remove_ids(cls, ids: List[int]) -> None: 96 | assert cls._index is not None 97 | if not ids: 98 | return 99 | with cls._lock: 100 | to_remove = faiss.IDSelectorArray(np.array(ids, dtype=np.int64)) 101 | cls._index.remove_ids(to_remove) 102 | cls._save() 103 | 104 | @classmethod 105 | def search(cls, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]: 106 | assert cls._index is not None 107 | with cls._lock: 108 | distances, id_matrix = cls._index.search(query_vec, top_k) 109 | id_list = id_matrix[0].tolist() 110 | score_list = distances[0].tolist() 111 | results: List[Dict[str, Any]] = [] 112 | with SessionLocal() as session: 113 | rows = ( 114 | session.query(Chunk, Document) 115 | .join(Document, Chunk.document_id == Document.id) 116 | .filter(Chunk.id.in_([cid for cid in id_list if cid != -1])) 117 | .all() 118 | ) 119 | chunk_by_id = {chunk.id: (chunk, doc) for (chunk, doc) in rows} 120 | for cid, score in zip(id_list, score_list): 121 | if cid == -1: 122 | continue 123 | pair = chunk_by_id.get(cid) 124 | if not pair: 125 | continue 126 | chunk, doc = pair 127 | results.append( 128 | { 129 | "content": chunk.content, 130 | "score": float(score), 131 | "document_id": int(doc.id), 132 | "uri": doc.uri, 133 | "chunk_index": int(chunk.chunk_index), 134 | } 135 | ) 136 | return results 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AI-Powered Knowledge Base Search & Enrichment (Challenge 2) 2 | 3 | Overview 4 | 5 | This repository contains a working prototype of a document ingestion and semantic search system with Q&A and completeness checking APIs. It focuses on simplicity and reliability for a 24-hour deliverable while leaving clear extension points for scale and features. 
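For orientation before the feature list, the core ingest-then-search flow can be driven end to end in a few lines. This is a hedged sketch (not shipped with the repo) that assumes the API is already running locally on port 8000 as described in the Quickstart; it uses httpx, which is already pinned in requirements.txt:

```python
import httpx

BASE_URL = "http://localhost:8000"  # assumes a local `uvicorn app.main:app` instance

with httpx.Client(base_url=BASE_URL, timeout=60.0) as client:
    # Ingest a small text document; the response reports the ingest status.
    ingest = client.post(
        "/ingest/text",
        json={"text": "FAISS enables fast similarity search over dense vectors.", "uri": "faiss-note"},
    )
    print(ingest.json())  # e.g. {"status": "ingested", "document_id": 1, "num_chunks": 1}

    # Semantic search returns ranked chunks with scores and source metadata.
    hits = client.post("/search", json={"query": "library for vector similarity search", "k": 3})
    for hit in hits.json():
        print(round(hit["score"], 3), hit["uri"], hit["content"][:60])
```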
6 | 7 | Key Features 8 | 9 | - Document ingestion for raw text and PDF (stores raw text and vector embeddings) 10 | - Sentence-transformer embeddings (local, CPU-friendly) with FAISS vector index (persistent) 11 | - Semantic search API returning ranked chunks with metadata 12 | - Q&A API (RAG) with optional OpenAI integration; retrieval-only fallback 13 | - Completeness check API estimating corpus coverage of a query 14 | - Incremental updates using content SHA-256 to avoid redundant re-indexing 15 | - Modular architecture for future tools (parsers, LLMs, stores) 16 | 17 | Quickstart 18 | 19 | 1) Environment 20 | 21 | - Python 3.10+ is recommended 22 | - Windows, macOS, Linux supported (Windows tested) 23 | 24 | 2) Install 25 | 26 | ```bash 27 | python -m venv .venv 28 | . .venv/Scripts/activate 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | 3) Run the server 33 | 34 | ```bash 35 | uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 36 | ``` 37 | 38 | 4) Try the APIs 39 | 40 | - Open Swagger UI: http://localhost:8000/docs 41 | 42 | Examples (HTTP): 43 | 44 | ```bash 45 | curl -X POST http://localhost:8000/ingest/text \ 46 | -H "Content-Type: application/json" \ 47 | -d '{"text":"This is a sample document about machine learning.", "uri":"sample-1"}' 48 | 49 | curl -X POST http://localhost:8000/search \ 50 | -H "Content-Type: application/json" \ 51 | -d '{"query":"What is machine learning?", "k":5}' 52 | 53 | curl -X POST http://localhost:8000/qa \ 54 | -H "Content-Type: application/json" \ 55 | -d '{"question":"What is machine learning?", "k":5}' 56 | 57 | curl -X POST http://localhost:8000/completeness \ 58 | -H "Content-Type: application/json" \ 59 | -d '{"query":"Neural networks overview", "k":10}' 60 | ``` 61 | 62 | Design Decisions & Trade-offs 63 | 64 | - Embeddings: `all-MiniLM-L6-v2` (384-dim) for speed/size on CPU. 65 | - Vector store: FAISS (inner product with cosine normalization) persisted to disk. 66 | - DB: SQLite for simplicity; holds documents and chunks for metadata and re-indexing. 67 | - Incremental updates: content hash (SHA-256). If unchanged, indexing is skipped. 68 | - Parsers: PDF via `pypdf`; raw text via API. HTML/Docx can be added with new parsers. 69 | - Q&A: Optional OpenAI integration for answer synthesis; otherwise returns retrieved context plus a note. 70 | 71 | 24-hour Constraints & Specific Trade-offs 72 | 73 | - Scope: Retrieval-first system with optional LLM answer synthesis; no advanced reranking. 74 | - Completeness metric: Simple similarity-based heuristic (avg score) vs. richer coverage metrics. 75 | - Infra: Single-process FastAPI; no background job queue. For scale, add Celery/RQ workers. 76 | - Storage: SQLite + FAISS for fast local prototyping; swap to Postgres+pgvector or a managed vector DB for large corpora. 77 | - Parsing: Basic text/PDF support only; HTML/DOCX and OCR are out of scope for this iteration. 78 | - Observability: Minimal logging; no tracing/metrics dashboards. Add OpenTelemetry in next phase. 79 | - Security: No auth/rate limiting. Intended for local demo. In production, put behind gateway and add auth. 80 | - Testing: Manual and Swagger-driven smoke tests; unit/integration tests not included due to time. 81 | 82 | Scaling Considerations 83 | 84 | - Swap SQLite+FAISS to Postgres+pgvector or a cloud vector DB for larger scale. 85 | - Add distributed workers for ingestion and background batching. 86 | - Stream chunking and embedding in batches to keep memory stable (already supported). 
87 | - Use async queues and backpressure for very large corpora. 88 | 89 | Project Structure 90 | 91 | ``` 92 | app/ 93 |   main.py              # FastAPI app, middleware, and router wiring 94 |   core/                # config.py (settings), db.py (SQLAlchemy engine/session) 95 |   api/ 96 |     schemas.py         # Pydantic request/response models 97 |     routes/            # health.py, ingest.py, search.py, qa.py 98 |   application/ 99 |     services/          # search_service.py, qa_service.py, ingestion_service.py 100 |   infrastructure/ 101 |     embeddings/        # sentence_transformer_provider.py (model loader and wrappers) 102 |     vectorstore/       # faiss_index.py (persistent FAISS index manager) 103 |     persistence/       # models.py (ORM models) 104 |     parsers/           # pdf_reader.py (PDF extraction) 105 |     text/              # text_utils.py (cleaning and chunking) 106 | data/                  # created at runtime for DB and index persistence 107 | ``` 108 | 109 | How to Test 110 | 111 | - Use Swagger UI to try endpoints interactively 112 | - Use the sample curl commands above (or the scripted smoke test sketched at the end of this README) 113 | - Re-ingest the same content to see `"status":"skipped"` (incremental updates) 114 | - Change content for the same `uri` to see replacement (old vectors removed) 115 | - Ingest a PDF via `/ingest/file` and then query `/search` and `/qa` 116 | - Restart the server and run search again to confirm persistence under `data/` 117 | 118 | Demo 119 | 120 | - Record a <5 min screen capture showing: start server → ingest text → search → QA (retrieval-only and with OpenAI if set) → ingest PDF → completeness → restart and confirm persistence. 121 | 122 | Deliverables Checklist 123 | 124 | - Working prototype: yes (run locally via the Quickstart, or use the instance at http://45.61.150.108:8000/docs) 125 | - README with design decisions, 24h trade-offs, run/test steps: yes 126 | - Short Loom/screen recording demo: 127 | - Code in a GitHub repo: https://github.com/RikuSato0/AI-Powered-Knowledge-Base-Search---Enrichment 128 | 129 | 130 | --------------------------------------------------------------------------------
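Similarity Scoring Notes (illustrative)

The design decision above ("inner product with cosine normalization") means embeddings are L2-normalized before they enter the IndexFlatIP index (see sentence_transformer_provider.py and faiss_index.py), so returned scores are cosine similarities in [-1, 1]; the completeness check simply averages the top-k scores against a 0.4 threshold. A small self-contained sketch of that scoring scheme, independent of the application code and using only numpy and faiss from requirements.txt:

```python
import faiss
import numpy as np

rng = np.random.default_rng(0)
dim = 384  # all-MiniLM-L6-v2 embedding width

# Toy "embeddings": random vectors standing in for encoded chunks and a query.
chunks = rng.normal(size=(5, dim)).astype(np.float32)
query = rng.normal(size=(1, dim)).astype(np.float32)

# L2-normalize, as embed_texts() does, so inner product == cosine similarity.
chunks /= np.linalg.norm(chunks, axis=1, keepdims=True) + 1e-12
query /= np.linalg.norm(query, axis=1, keepdims=True) + 1e-12

index = faiss.IndexIDMap2(faiss.IndexFlatIP(dim))
index.add_with_ids(chunks, np.arange(5, dtype=np.int64))

scores, ids = index.search(query, 3)
print(ids[0], scores[0])                     # top-3 chunk ids and cosine scores
print("coverage:", float(scores[0].mean()))  # the average that /completeness thresholds
```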
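Scripted Smoke Test (optional)

The manual checks in How to Test can also be scripted. The sketch below is hypothetical (not included in the repo); it assumes a locally running instance on port 8000 and exercises the incremental-update path: identical content is skipped via its SHA-256, changed content under the same uri replaces the old vectors, and /qa returns an answer with citations.

```python
import httpx

BASE_URL = "http://localhost:8000"  # assumes the Quickstart server is running
DOC = {"text": "Transformers rely on attention to weigh context across tokens.", "uri": "smoke-doc"}

with httpx.Client(base_url=BASE_URL, timeout=60.0) as client:
    first = client.post("/ingest/text", json=DOC).json()
    assert first["status"] in {"ingested", "skipped"}

    # Same text, same uri: the content hash matches, so re-indexing is skipped.
    second = client.post("/ingest/text", json=DOC).json()
    assert second["status"] == "skipped"

    # Changed text under the same uri: the old document and its vectors are replaced.
    changed = dict(DOC, text="Attention lets a model focus on the most relevant tokens.")
    third = client.post("/ingest/text", json=changed).json()
    assert third["status"] == "ingested"

    # Retrieval-only QA (use_openai defaults to False) returns an answer plus citations.
    qa = client.post("/qa", json={"question": "What do transformers rely on?", "k": 3}).json()
    print(qa["answer"][:200])
    print(len(qa["citations"]), "citations")
```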