├── app ├── __init__.py ├── api │ ├── __init__.py │ ├── routes │ │ ├── __init__.py │ │ ├── health.py │ │ ├── search.py │ │ ├── ingest.py │ │ └── qa.py │ └── schemas.py ├── core │ ├── __init__.py │ ├── db.py │ └── config.py ├── application │ ├── __init__.py │ └── services │ │ ├── __init__.py │ │ ├── search_service.py │ │ ├── qa_service.py │ │ └── ingestion_service.py ├── infrastructure │ ├── __init__.py │ ├── parsers │ │ ├── __init__.py │ │ └── pdf_reader.py │ ├── text │ │ ├── __init__.py │ │ └── text_utils.py │ ├── embeddings │ │ ├── __init__.py │ │ └── sentence_transformer_provider.py │ ├── persistence │ │ ├── __init__.py │ │ └── models.py │ └── vectorstore │ │ ├── __init__.py │ │ └── faiss_index.py └── main.py ├── requirements.txt ├── docker-compose.yml ├── Dockerfile ├── .gitignore └── README.md /app/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/api/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/core/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/api/routes/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/application/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/application/services/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/parsers/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/text/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/embeddings/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/persistence/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/infrastructure/vectorstore/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /app/api/routes/health.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter 2 | 3 | router = APIRouter(tags=["health"]) 4 | 5 | 6 | @router.get("/health") 7 | def health() -> dict: 8 | return {"status": "ok"} 9 | 
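The health route above is the simplest piece of the API surface. As a quick illustration (not a file in the repository), a minimal smoke test could exercise it with FastAPI's TestClient, assuming the `app` object from app/main.py and a test runner such as pytest being installed:

```python
# Hypothetical smoke test (not part of the repo); assumes pytest is installed
# and that app.main:app includes health_router at the root, as in app/main.py.
from fastapi.testclient import TestClient

from app.main import app

client = TestClient(app)


def test_health_returns_ok() -> None:
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
```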
-------------------------------------------------------------------------------- /app/infrastructure/parsers/pdf_reader.py: -------------------------------------------------------------------------------- 1 | from typing import Iterable 2 | from pypdf import PdfReader 3 | 4 | 5 | def extract_text_pages(file_path: str) -> Iterable[str]: 6 | reader = PdfReader(file_path) 7 | for page in reader.pages: 8 | text = page.extract_text() or "" 9 | yield text 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.115.0 2 | uvicorn[standard]==0.30.6 3 | pydantic==2.9.2 4 | SQLAlchemy==2.0.34 5 | sentence-transformers==3.0.1 6 | faiss-cpu==1.8.0.post1 7 | pypdf==4.3.1 8 | python-multipart==0.0.9 9 | httpx==0.27.0 10 | numpy>=1.25,<3 11 | tenacity==9.0.0 12 | openai==1.51.0 13 | starlette==0.38.5 14 | typing-extensions>=4.9.0 15 | python-dotenv==1.0.1 16 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | api: 3 | build: 4 | context: . 5 | dockerfile: Dockerfile 6 | container_name: backend_architecture_api 7 | ports: 8 | - "8000:8000" 9 | environment: 10 | - DATA_DIR=/app/data 11 | - OPENAI_API_KEY=${OPENAI_API_KEY:-} 12 | volumes: 13 | - ./data:/app/data 14 | restart: unless-stopped 15 | 16 | 17 | -------------------------------------------------------------------------------- /app/application/services/search_service.py: -------------------------------------------------------------------------------- 1 | from typing import List, Dict, Any 2 | 3 | from app.core.config import settings 4 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_query 5 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 6 | 7 | 8 | def search_documents(query: str, top_k: int) -> List[Dict[str, Any]]: 9 | query_vec = embed_query(query) 10 | results = VectorIndex.search(query_vec=query_vec, top_k=top_k or settings.TOP_K_DEFAULT) 11 | return results 12 | -------------------------------------------------------------------------------- /app/api/routes/search.py: -------------------------------------------------------------------------------- 1 | from typing import List 2 | from fastapi import APIRouter, HTTPException 3 | 4 | from app.core.config import settings 5 | from app.api.schemas import SearchRequest, SearchResult 6 | from app.application.services.search_service import search_documents 7 | 8 | router = APIRouter(tags=["search"]) 9 | 10 | 11 | @router.post("/search", response_model=List[SearchResult]) 12 | def search(req: SearchRequest): 13 | try: 14 | return search_documents(query=req.query, top_k=req.k or settings.TOP_K_DEFAULT) 15 | except Exception as exc: 16 | raise HTTPException(status_code=500, detail=str(exc)) 17 | -------------------------------------------------------------------------------- /app/api/schemas.py: -------------------------------------------------------------------------------- 1 | from pydantic import BaseModel 2 | from typing import Optional, List 3 | 4 | 5 | class IngestTextRequest(BaseModel): 6 | text: str 7 | uri: Optional[str] = None 8 | 9 | 10 | class SearchRequest(BaseModel): 11 | query: str 12 | k: int = 5 13 | 14 | 15 | class QARequest(BaseModel): 16 | question: str 17 | k: int = 5 18 | use_openai: bool = False 19 | 20 | 21 | class 
CompletenessRequest(BaseModel): 22 | query: str 23 | k: int = 20 24 | 25 | 26 | class SearchResult(BaseModel): 27 | content: str 28 | score: float 29 | document_id: int 30 | uri: Optional[str] 31 | chunk_index: int 32 | -------------------------------------------------------------------------------- /app/core/db.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import create_engine # pyright: ignore[reportMissingImports] 2 | from sqlalchemy.orm import sessionmaker, declarative_base # pyright: ignore[reportMissingImports] 3 | from app.core.config import settings 4 | 5 | 6 | DATABASE_URL = f"sqlite:///{settings.DB_PATH}" 7 | 8 | engine = create_engine( 9 | DATABASE_URL, connect_args={"check_same_thread": False} 10 | ) 11 | 12 | SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) 13 | 14 | Base = declarative_base() 15 | 16 | 17 | def get_session(): 18 | session = SessionLocal() 19 | try: 20 | yield session 21 | finally: 22 | session.close() 23 | -------------------------------------------------------------------------------- /app/api/routes/ingest.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, UploadFile, File, HTTPException 2 | 3 | from app.api.schemas import IngestTextRequest 4 | from app.application.services.ingestion_service import ingest_text_document, ingest_file_document 5 | 6 | router = APIRouter(tags=["ingest"]) 7 | 8 | 9 | @router.post("/ingest/text") 10 | def ingest_text(req: IngestTextRequest) -> dict: 11 | try: 12 | result = ingest_text_document(text=req.text, uri=req.uri) 13 | return result 14 | except Exception as exc: 15 | raise HTTPException(status_code=500, detail=str(exc)) 16 | 17 | 18 | @router.post("/ingest/file") 19 | async def ingest_file(file: UploadFile = File(...)) -> dict: 20 | try: 21 | result = await ingest_file_document(file) 22 | return result 23 | except ValueError as ve: 24 | raise HTTPException(status_code=400, detail=str(ve)) 25 | except Exception as exc: 26 | raise HTTPException(status_code=500, detail=str(exc)) 27 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.11-slim 2 | 3 | # Environment configuration 4 | ENV PYTHONDONTWRITEBYTECODE=1 \ 5 | PYTHONUNBUFFERED=1 \ 6 | PIP_NO_CACHE_DIR=1 7 | 8 | # System dependencies required by faiss-cpu (OpenMP) 9 | RUN apt-get update \ 10 | && apt-get install -y --no-install-recommends libgomp1 \ 11 | && rm -rf /var/lib/apt/lists/* 12 | 13 | WORKDIR /app 14 | 15 | # Install Python dependencies first (leverages Docker layer caching) 16 | COPY requirements.txt /app/requirements.txt 17 | RUN pip install --upgrade pip \ 18 | && pip install -r /app/requirements.txt 19 | 20 | # Copy project files 21 | COPY app /app/app 22 | COPY README.md /app/README.md 23 | COPY .env.local /app/.env.local 24 | 25 | # Ensure data directory exists inside the image (will be mounted in compose) 26 | RUN mkdir -p /app/data 27 | 28 | # Default runtime environment 29 | ENV DATA_DIR=/app/data 30 | 31 | EXPOSE 8000 32 | 33 | CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"] 34 | 35 | 36 | -------------------------------------------------------------------------------- /app/infrastructure/text/text_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | from typing import Iterable, List 
3 | 4 | 5 | _WHITESPACE_RE = re.compile(r"\s+") 6 | 7 | 8 | def clean_text(text: str) -> str: 9 | text = text.replace("\x00", " ") 10 | text = _WHITESPACE_RE.sub(" ", text).strip() 11 | return text 12 | 13 | 14 | def estimate_tokens(text: str) -> int: 15 | # Lightweight tokens approximation 16 | return max(1, len(text.split())) 17 | 18 | 19 | def chunk_text(text: str, chunk_size: int, overlap: int) -> List[str]: 20 | if chunk_size <= 0: 21 | return [text] 22 | text = clean_text(text) 23 | if not text: 24 | return [] 25 | chunks: List[str] = [] 26 | start = 0 27 | n = len(text) 28 | step = max(1, chunk_size - overlap) 29 | while start < n: 30 | end = min(n, start + chunk_size) 31 | chunk = text[start:end] 32 | chunks.append(chunk) 33 | if end == n: 34 | break 35 | start += step 36 | return chunks 37 | -------------------------------------------------------------------------------- /app/api/routes/qa.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, HTTPException 2 | 3 | from app.core.config import settings 4 | from app.api.schemas import QARequest, CompletenessRequest 5 | from app.application.services.qa_service import answer_question_and_citations, completeness_check 6 | 7 | router = APIRouter(tags=["qa"]) 8 | 9 | 10 | @router.post("/qa") 11 | def qa(req: QARequest): 12 | try: 13 | payload = answer_question_and_citations( 14 | question=req.question, top_k=req.k or settings.TOP_K_DEFAULT, use_openai=req.use_openai 15 | ) 16 | return payload 17 | except Exception as exc: 18 | raise HTTPException(status_code=500, detail=str(exc)) 19 | 20 | 21 | @router.post("/completeness") 22 | def completeness(req: CompletenessRequest) -> dict: 23 | try: 24 | result = completeness_check(query=req.query, top_k=req.k or settings.TOP_K_DEFAULT) 25 | return result 26 | except Exception as exc: 27 | raise HTTPException(status_code=500, detail=str(exc)) 28 | -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI 2 | from fastapi.middleware.cors import CORSMiddleware 3 | 4 | from app.core.config import settings 5 | from app.core.db import Base, engine 6 | from app.infrastructure.embeddings.sentence_transformer_provider import get_embedding_dimension 7 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 8 | 9 | from app.api.routes.health import router as health_router 10 | from app.api.routes.ingest import router as ingest_router 11 | from app.api.routes.search import router as search_router 12 | from app.api.routes.qa import router as qa_router 13 | 14 | 15 | app = FastAPI(title="Knowledge Base Search & Q&A", version="0.1.0") 16 | 17 | app.add_middleware( 18 | CORSMiddleware, 19 | allow_origins=["*"], 20 | allow_credentials=True, 21 | allow_methods=["*"], 22 | allow_headers=["*"], 23 | ) 24 | 25 | 26 | @app.on_event("startup") 27 | def on_startup() -> None: 28 | Base.metadata.create_all(bind=engine) 29 | VectorIndex.initialize(dimension=get_embedding_dimension()) 30 | 31 | 32 | # Routers 33 | app.include_router(health_router) 34 | app.include_router(ingest_router) 35 | app.include_router(search_router) 36 | app.include_router(qa_router) 37 | -------------------------------------------------------------------------------- /app/infrastructure/persistence/models.py: -------------------------------------------------------------------------------- 1 | from sqlalchemy import Column, 
Integer, String, Text, ForeignKey, DateTime 2 | from sqlalchemy.orm import relationship 3 | from datetime import datetime 4 | 5 | from app.core.db import Base 6 | 7 | 8 | class Document(Base): 9 | __tablename__ = "documents" 10 | 11 | id = Column(Integer, primary_key=True, index=True) 12 | uri = Column(String(512), unique=False, nullable=True) 13 | source_type = Column(String(32), nullable=False, default="api") 14 | sha256 = Column(String(64), nullable=False, index=True) 15 | num_chunks = Column(Integer, nullable=False, default=0) 16 | created_at = Column(DateTime, default=datetime.utcnow) 17 | updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow) 18 | 19 | chunks = relationship("Chunk", back_populates="document", cascade="all, delete-orphan") 20 | 21 | 22 | class Chunk(Base): 23 | __tablename__ = "chunks" 24 | 25 | id = Column(Integer, primary_key=True, index=True) 26 | document_id = Column(Integer, ForeignKey("documents.id", ondelete="CASCADE"), nullable=False, index=True) 27 | chunk_index = Column(Integer, nullable=False) 28 | content = Column(Text, nullable=False) 29 | token_count = Column(Integer, nullable=False, default=0) 30 | created_at = Column(DateTime, default=datetime.utcnow) 31 | 32 | document = relationship("Document", back_populates="chunks") 33 | -------------------------------------------------------------------------------- /app/core/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dataclasses import dataclass 3 | from dotenv import load_dotenv, find_dotenv 4 | 5 | _env_file = os.getenv("ENV_FILE", ".env.local") 6 | if os.path.exists(_env_file): 7 | load_dotenv(_env_file) 8 | else: 9 | _found = find_dotenv(".env") 10 | if _found: 11 | load_dotenv(_found) 12 | 13 | 14 | def _to_int(value: str, default: int) -> int: 15 | try: 16 | return int(value) 17 | except Exception: 18 | return default 19 | 20 | 21 | @dataclass 22 | class Settings: 23 | DATA_DIR: str = os.getenv("DATA_DIR", os.path.join(os.getcwd(), "data")) 24 | DB_PATH: str = os.getenv("DB_PATH", os.path.join(DATA_DIR, "db.sqlite3")) 25 | INDEX_PATH: str = os.getenv("INDEX_PATH", os.path.join(DATA_DIR, "index.faiss")) 26 | INDEX_META_PATH: str = os.getenv("INDEX_META_PATH", os.path.join(DATA_DIR, "index_meta.json")) 27 | 28 | MODEL_NAME: str = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2") 29 | DEVICE: str = os.getenv("DEVICE", "cpu") 30 | 31 | CHUNK_SIZE_CHARS: int = _to_int(os.getenv("CHUNK_SIZE_CHARS", "1000"), 1000) 32 | CHUNK_OVERLAP_CHARS: int = _to_int(os.getenv("CHUNK_OVERLAP_CHARS", "200"), 200) 33 | 34 | TOP_K_DEFAULT: int = _to_int(os.getenv("TOP_K_DEFAULT", "5"), 5) 35 | 36 | OPENAI_API_KEY: str | None = os.getenv("OPENAI_API_KEY") 37 | 38 | 39 | settings = Settings() 40 | 41 | os.makedirs(settings.DATA_DIR, exist_ok=True) 42 | -------------------------------------------------------------------------------- /app/infrastructure/embeddings/sentence_transformer_provider.py: -------------------------------------------------------------------------------- 1 | import threading 2 | from typing import List 3 | import numpy as np 4 | 5 | from sentence_transformers import SentenceTransformer 6 | 7 | from app.core.config import settings 8 | 9 | _model_lock = threading.Lock() 10 | _model: SentenceTransformer | None = None 11 | _dim: int | None = None 12 | 13 | 14 | def _load_model() -> SentenceTransformer: 15 | global _model, _dim 16 | if _model is None: 17 | with _model_lock: 18 | if _model is None: 
19 | _model = SentenceTransformer(settings.MODEL_NAME, device=settings.DEVICE) 20 | # Probe dimension 21 | probe = _model.encode(["dim"], convert_to_numpy=True, normalize_embeddings=False) 22 | _dim = probe.shape[1] 23 | return _model 24 | 25 | 26 | def get_embedding_dimension() -> int: 27 | if _dim is None: 28 | _load_model() 29 | assert _dim is not None 30 | return _dim 31 | 32 | 33 | def _normalize(matrix: np.ndarray) -> np.ndarray: 34 | norms = np.linalg.norm(matrix, axis=1, keepdims=True) + 1e-12 35 | return matrix / norms 36 | 37 | 38 | def embed_texts(texts: List[str]) -> np.ndarray: 39 | model = _load_model() 40 | vectors = model.encode(texts, convert_to_numpy=True, normalize_embeddings=False) 41 | vectors = _normalize(vectors) 42 | return vectors.astype(np.float32) 43 | 44 | 45 | def embed_query(text: str) -> np.ndarray: 46 | vec = embed_texts([text]) 47 | return vec 48 | -------------------------------------------------------------------------------- /app/application/services/qa_service.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Any, List 2 | import os 3 | 4 | import numpy as np 5 | 6 | from app.core.config import settings 7 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_query 8 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 9 | 10 | try: 11 | from openai import OpenAI 12 | except Exception: 13 | OpenAI = None 14 | 15 | 16 | def _format_citations(chunks: List[dict]) -> str: 17 | lines = [] 18 | for i, c in enumerate(chunks, start=1): 19 | uri = c.get("uri") or f"doc-{c.get('document_id')}" 20 | snippet = c.get("content", "").strip().replace("\n", " ") 21 | if len(snippet) > 500: 22 | snippet = snippet[:500] + "..." 23 | lines.append(f"[{i}] ({uri}) {snippet}") 24 | return "\n".join(lines) 25 | 26 | 27 | def answer_question_and_citations(*, question: str, top_k: int, use_openai: bool = False) -> Dict[str, Any]: 28 | q_vec = embed_query(question) 29 | chunks = VectorIndex.search(q_vec, top_k) 30 | if use_openai and settings.OPENAI_API_KEY and OpenAI is not None: 31 | client = OpenAI(api_key=settings.OPENAI_API_KEY) 32 | context = _format_citations(chunks) 33 | prompt = ( 34 | "You are a helpful assistant. Answer the user's question using ONLY the context provided.\n" 35 | "If the answer cannot be found in the context, say you don't know.\n\n" 36 | f"Context:\n{context}\n\nQuestion: {question}\nAnswer:" 37 | ) 38 | try: 39 | completion = client.chat.completions.create( 40 | model="gpt-4o-mini", 41 | messages=[{"role": "user", "content": prompt}], 42 | temperature=0.0, 43 | ) 44 | answer = completion.choices[0].message.content or "" 45 | except Exception as e: 46 | answer = f"Retrieval-only fallback due to LLM error: {e}\n\n" + _format_citations(chunks) 47 | else: 48 | answer = "Retrieval-only mode. 
Provide your own synthesis using these snippets:\n\n" + _format_citations(chunks) 49 | return {"answer": answer, "citations": chunks} 50 | 51 | 52 | def completeness_check(*, query: str, top_k: int) -> Dict[str, Any]: 53 | q_vec = embed_query(query) 54 | chunks = VectorIndex.search(q_vec, top_k) 55 | scores = [c.get("score", 0.0) for c in chunks] 56 | coverage = float(sum(scores) / max(1, len(scores))) if scores else 0.0 57 | is_complete = coverage >= 0.4 58 | return {"is_complete": is_complete, "coverage": coverage, "k": top_k, "results": chunks} 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | *.egg-info/ 23 | .installed.cfg 24 | *.egg 25 | 26 | # Virtual environments 27 | .venv/ 28 | venv/ 29 | env/ 30 | ENV/ 31 | 32 | # IDE/editor 33 | .vscode/ 34 | .idea/ 35 | .DS_Store 36 | 37 | # Environment variables 38 | .env 39 | .env.* 40 | 41 | # Logs 42 | *.log 43 | 44 | # Project data (local artifacts) 45 | data/ 46 | 47 | # SQLite and vector indexes (if outside data/) 48 | *.sqlite3 49 | *.faiss 50 | 51 | # Cache 52 | .cache/ 53 | **/__pycache__/ 54 | 55 | # Byte-compiled / optimized / DLL files 56 | __pycache__/ 57 | *.py[cod] 58 | *$py.class 59 | 60 | # C extensions 61 | *.so 62 | 63 | # Distribution / packaging 64 | .Python 65 | build/ 66 | develop-eggs/ 67 | dist/ 68 | downloads/ 69 | eggs/ 70 | .eggs/ 71 | lib/ 72 | lib64/ 73 | parts/ 74 | sdist/ 75 | var/ 76 | wheels/ 77 | share/python-wheels/ 78 | *.egg-info/ 79 | .installed.cfg 80 | *.egg 81 | MANIFEST 82 | 83 | # PyInstaller 84 | *.manifest 85 | *.spec 86 | 87 | # Installer logs 88 | pip-log.txt 89 | pip-delete-this-directory.txt 90 | 91 | # Unit test / coverage reports 92 | htmlcov/ 93 | .tox/ 94 | .nox/ 95 | .coverage 96 | .coverage.* 97 | .cache 98 | nosetests.xml 99 | coverage.xml 100 | *.cover 101 | *.py,cover 102 | .hypothesis/ 103 | .pytest_cache/ 104 | 105 | # Translations 106 | *.mo 107 | *.pot 108 | 109 | # Django/Flask instances 110 | instance/ 111 | .webassets-cache 112 | 113 | # Scrapy stuff 114 | .scrapy 115 | 116 | # Sphinx documentation 117 | /docs/_build/ 118 | 119 | # Jupyter Notebook 120 | .ipynb_checkpoints 121 | 122 | # IPython 123 | profile_default/ 124 | ipython_config.py 125 | 126 | # pyenv 127 | .python-version 128 | 129 | # pipenv 130 | Pipfile.lock 131 | 132 | # poetry 133 | poetry.lock 134 | 135 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 136 | __pypackages__/ 137 | 138 | # Celery 139 | celerybeat-schedule 140 | celerybeat.pid 141 | 142 | # dotenv 143 | .env 144 | .env.* 145 | 146 | # virtualenv 147 | .venv/ 148 | venv/ 149 | ENV/ 150 | env/ 151 | env.bak/ 152 | venv.bak/ 153 | 154 | # mypy / pyre / pytype 155 | .mypy_cache/ 156 | .pyre/ 157 | .pytype/ 158 | 159 | # IDEs 160 | .vscode/ 161 | .idea/ 162 | *.iml 163 | 164 | # OS files 165 | .DS_Store 166 | Thumbs.db 167 | 168 | # Logs 169 | *.log 170 | 171 | # Project data artifacts (persisted at runtime) 172 | data/* 173 | !data/.gitkeep 174 | 175 | # FAISS / SQLite snapshots if outside data/ 176 | *.faiss 177 | *.sqlite3 178 | 179 | # Uvicorn reload dirs cache 180 | **/__pycache__/ 181 | -------------------------------------------------------------------------------- /app/application/services/ingestion_service.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import tempfile 4 | from typing import List, Tuple, Optional 5 | 6 | from fastapi import UploadFile 7 | from sqlalchemy.orm import Session 8 | 9 | from app.core.config import settings 10 | from app.core.db import SessionLocal 11 | from app.infrastructure.persistence.models import Document, Chunk 12 | from app.infrastructure.text.text_utils import chunk_text, estimate_tokens, clean_text 13 | from app.infrastructure.parsers.pdf_reader import extract_text_pages 14 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_texts 15 | from app.infrastructure.vectorstore.faiss_index import VectorIndex 16 | 17 | 18 | def _sha256_bytes(data: bytes) -> str: 19 | h = hashlib.sha256() 20 | h.update(data) 21 | return h.hexdigest() 22 | 23 | 24 | def _sha256_text(text: str) -> str: 25 | return _sha256_bytes(text.encode("utf-8")) 26 | 27 | 28 | def _persist_document(session: Session, *, uri: Optional[str], source_type: str, sha256: str, num_chunks: int) -> Document: 29 | doc = Document(uri=uri, source_type=source_type, sha256=sha256, num_chunks=num_chunks) 30 | session.add(doc) 31 | session.commit() 32 | session.refresh(doc) 33 | return doc 34 | 35 | 36 | def _persist_chunks(session: Session, document_id: int, chunks: List[str]) -> List[Chunk]: 37 | chunk_rows: List[Chunk] = [] 38 | for idx, content in enumerate(chunks): 39 | row = Chunk(document_id=document_id, chunk_index=idx, content=content, token_count=estimate_tokens(content)) 40 | session.add(row) 41 | chunk_rows.append(row) 42 | session.commit() 43 | for row in chunk_rows: 44 | session.refresh(row) 45 | return chunk_rows 46 | 47 | 48 | def _index_chunks(chunks: List[Chunk]) -> None: 49 | texts = [c.content for c in chunks] 50 | vectors = embed_texts(texts) 51 | ids = [int(c.id) for c in chunks] 52 | VectorIndex.add(vectors, ids) 53 | 54 | 55 | def _maybe_skip_existing(session: Session, sha256: str, uri: Optional[str]) -> Optional[Document]: 56 | existing = session.query(Document).filter(Document.sha256 == sha256).first() 57 | if existing is not None: 58 | return existing 59 | if uri: 60 | existing_uri = session.query(Document).filter(Document.uri == uri).first() 61 | if existing_uri is not None: 62 | old_chunk_ids = [c.id for c in existing_uri.chunks] 63 | VectorIndex.remove_ids(old_chunk_ids) 64 | session.delete(existing_uri) 65 | session.commit() 66 | return None 67 | 68 | 69 | def _ingest_text_core(text: str, uri: Optional[str], source_type: str) -> dict: 70 | cleaned = clean_text(text) 71 | if not cleaned: 72 | return {"status": "empty", 
"num_chunks": 0} 73 | parts = chunk_text(cleaned, settings.CHUNK_SIZE_CHARS, settings.CHUNK_OVERLAP_CHARS) 74 | content_hash = _sha256_text(cleaned) 75 | with SessionLocal() as session: 76 | existing = _maybe_skip_existing(session, content_hash, uri) 77 | if existing is not None: 78 | return {"status": "skipped", "document_id": int(existing.id), "num_chunks": int(existing.num_chunks)} 79 | doc = _persist_document(session, uri=uri, source_type=source_type, sha256=content_hash, num_chunks=len(parts)) 80 | chunk_rows = _persist_chunks(session, doc.id, parts) 81 | _index_chunks(chunk_rows) 82 | return {"status": "ingested", "document_id": int(doc.id), "num_chunks": len(parts)} 83 | 84 | 85 | def ingest_text_document(*, text: str, uri: Optional[str] = None) -> dict: 86 | return _ingest_text_core(text=text, uri=uri, source_type="api") 87 | 88 | 89 | async def ingest_file_document(file: UploadFile) -> dict: 90 | filename = file.filename or "uploaded" 91 | ext = os.path.splitext(filename.lower())[1] 92 | if ext not in [".txt", ".pdf"]: 93 | raise ValueError("Only .txt and .pdf are supported for this prototype") 94 | data = await file.read() 95 | if ext == ".txt": 96 | text = data.decode("utf-8", errors="ignore") 97 | return _ingest_text_core(text=text, uri=filename, source_type="file") 98 | else: 99 | with tempfile.NamedTemporaryFile(delete=False, suffix=ext) as tmp: 100 | tmp.write(data) 101 | tmp.flush() 102 | temp_path = tmp.name 103 | try: 104 | pages = list(extract_text_pages(temp_path)) 105 | combined = "\n\n".join(pages) 106 | return _ingest_text_core(text=combined, uri=filename, source_type="file") 107 | finally: 108 | try: 109 | os.remove(temp_path) 110 | except Exception: 111 | pass 112 | -------------------------------------------------------------------------------- /app/infrastructure/vectorstore/faiss_index.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import threading 4 | from typing import List, Dict, Any 5 | 6 | import faiss 7 | import numpy as np 8 | 9 | from app.core.config import settings 10 | from app.core.db import SessionLocal 11 | from app.infrastructure.persistence.models import Chunk, Document 12 | from app.infrastructure.embeddings.sentence_transformer_provider import embed_texts 13 | 14 | 15 | class VectorIndex: 16 | _index: faiss.Index | None = None 17 | _lock = threading.RLock() 18 | _dim: int | None = None 19 | 20 | @classmethod 21 | def initialize(cls, dimension: int) -> None: 22 | with cls._lock: 23 | cls._dim = dimension 24 | if os.path.exists(settings.INDEX_PATH): 25 | idx = faiss.read_index(settings.INDEX_PATH) 26 | if isinstance(idx, (faiss.IndexIDMap, faiss.IndexIDMap2)): 27 | cls._index = idx 28 | else: 29 | if idx.d != dimension: 30 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 31 | cls._persist_meta() 32 | cls._save() 33 | cls._rebuild_from_db() 34 | elif idx.ntotal == 0: 35 | cls._index = faiss.IndexIDMap2(idx) 36 | cls._persist_meta() 37 | cls._save() 38 | else: 39 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 40 | cls._persist_meta() 41 | cls._save() 42 | cls._rebuild_from_db() 43 | if cls._index is not None and cls._index.d != dimension: 44 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 45 | cls._persist_meta() 46 | cls._save() 47 | cls._rebuild_from_db() 48 | else: 49 | cls._index = faiss.IndexIDMap2(faiss.IndexFlatIP(dimension)) 50 | cls._persist_meta() 51 | cls._save() 52 | 53 | @classmethod 54 | def _persist_meta(cls) -> None: 
55 | meta = {"dimension": cls._dim, "model": settings.MODEL_NAME} 56 | os.makedirs(os.path.dirname(settings.INDEX_META_PATH), exist_ok=True) 57 | with open(settings.INDEX_META_PATH, "w", encoding="utf-8") as f: 58 | json.dump(meta, f) 59 | 60 | @classmethod 61 | def _save(cls) -> None: 62 | assert cls._index is not None 63 | faiss.write_index(cls._index, settings.INDEX_PATH) 64 | 65 | @classmethod 66 | def _rebuild_from_db(cls, batch_size: int = 256) -> None: 67 | assert cls._index is not None 68 | last_id = 0 69 | with SessionLocal() as session: 70 | while True: 71 | rows: List[Chunk] = ( 72 | session.query(Chunk) 73 | .filter(Chunk.id > last_id) 74 | .order_by(Chunk.id.asc()) 75 | .limit(batch_size) 76 | .all() 77 | ) 78 | if not rows: 79 | break 80 | texts = [r.content for r in rows] 81 | ids = [int(r.id) for r in rows] 82 | vectors = embed_texts(texts) 83 | cls._index.add_with_ids(vectors, np.array(ids, dtype=np.int64)) 84 | last_id = rows[-1].id 85 | cls._save() 86 | 87 | @classmethod 88 | def add(cls, embeddings: np.ndarray, ids: List[int]) -> None: 89 | assert cls._index is not None 90 | with cls._lock: 91 | cls._index.add_with_ids(embeddings, np.array(ids, dtype=np.int64)) 92 | cls._save() 93 | 94 | @classmethod 95 | def remove_ids(cls, ids: List[int]) -> None: 96 | assert cls._index is not None 97 | if not ids: 98 | return 99 | with cls._lock: 100 | to_remove = faiss.IDSelectorArray(np.array(ids, dtype=np.int64)) 101 | cls._index.remove_ids(to_remove) 102 | cls._save() 103 | 104 | @classmethod 105 | def search(cls, query_vec: np.ndarray, top_k: int) -> List[Dict[str, Any]]: 106 | assert cls._index is not None 107 | with cls._lock: 108 | distances, id_matrix = cls._index.search(query_vec, top_k) 109 | id_list = id_matrix[0].tolist() 110 | score_list = distances[0].tolist() 111 | results: List[Dict[str, Any]] = [] 112 | with SessionLocal() as session: 113 | rows = ( 114 | session.query(Chunk, Document) 115 | .join(Document, Chunk.document_id == Document.id) 116 | .filter(Chunk.id.in_([cid for cid in id_list if cid != -1])) 117 | .all() 118 | ) 119 | chunk_by_id = {chunk.id: (chunk, doc) for (chunk, doc) in rows} 120 | for cid, score in zip(id_list, score_list): 121 | if cid == -1: 122 | continue 123 | pair = chunk_by_id.get(cid) 124 | if not pair: 125 | continue 126 | chunk, doc = pair 127 | results.append( 128 | { 129 | "content": chunk.content, 130 | "score": float(score), 131 | "document_id": int(doc.id), 132 | "uri": doc.uri, 133 | "chunk_index": int(chunk.chunk_index), 134 | } 135 | ) 136 | return results 137 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | AI-Powered Knowledge Base Search & Enrichment (Challenge 2) 2 | 3 | Overview 4 | 5 | This repository contains a working prototype of a document ingestion and semantic search system with Q&A and completeness checking APIs. It focuses on simplicity and reliability for a 24-hour deliverable while leaving clear extension points for scale and features. 
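For orientation before the feature list, the core ingest-then-search flow can be driven end to end in a few lines. This is a hedged sketch (not shipped with the repo) that assumes the API is already running locally on port 8000 as described in the Quickstart; it uses httpx, which is already pinned in requirements.txt:

```python
import httpx

BASE_URL = "http://localhost:8000"  # assumes a local `uvicorn app.main:app` instance

with httpx.Client(base_url=BASE_URL, timeout=60.0) as client:
    # Ingest a small text document; the response reports the ingest status.
    ingest = client.post(
        "/ingest/text",
        json={"text": "FAISS enables fast similarity search over dense vectors.", "uri": "faiss-note"},
    )
    print(ingest.json())  # e.g. {"status": "ingested", "document_id": 1, "num_chunks": 1}

    # Semantic search returns ranked chunks with scores and source metadata.
    hits = client.post("/search", json={"query": "library for vector similarity search", "k": 3})
    for hit in hits.json():
        print(round(hit["score"], 3), hit["uri"], hit["content"][:60])
```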
6 | 7 | Key Features 8 | 9 | - Document ingestion for raw text and PDF (stores raw text and vector embeddings) 10 | - Sentence-transformer embeddings (local, CPU-friendly) with FAISS vector index (persistent) 11 | - Semantic search API returning ranked chunks with metadata 12 | - Q&A API (RAG) with optional OpenAI integration; retrieval-only fallback 13 | - Completeness check API estimating corpus coverage of a query 14 | - Incremental updates using content SHA-256 to avoid redundant re-indexing 15 | - Modular architecture for future tools (parsers, LLMs, stores) 16 | 17 | Quickstart 18 | 19 | 1) Environment 20 | 21 | - Python 3.10+ is recommended 22 | - Windows, macOS, Linux supported (Windows tested) 23 | 24 | 2) Install 25 | 26 | ```bash 27 | python -m venv .venv 28 | . .venv/Scripts/activate 29 | pip install -r requirements.txt 30 | ``` 31 | 32 | 3) Run the server 33 | 34 | ```bash 35 | uvicorn app.main:app --reload --host 0.0.0.0 --port 8000 36 | ``` 37 | 38 | 4) Try the APIs 39 | 40 | - Open Swagger UI: http://localhost:8000/docs 41 | 42 | Examples (HTTP): 43 | 44 | ```bash 45 | curl -X POST http://localhost:8000/ingest/text \ 46 | -H "Content-Type: application/json" \ 47 | -d '{"text":"This is a sample document about machine learning.", "uri":"sample-1"}' 48 | 49 | curl -X POST http://localhost:8000/search \ 50 | -H "Content-Type: application/json" \ 51 | -d '{"query":"What is machine learning?", "k":5}' 52 | 53 | curl -X POST http://localhost:8000/qa \ 54 | -H "Content-Type: application/json" \ 55 | -d '{"question":"What is machine learning?", "k":5}' 56 | 57 | curl -X POST http://localhost:8000/completeness \ 58 | -H "Content-Type: application/json" \ 59 | -d '{"query":"Neural networks overview", "k":10}' 60 | ``` 61 | 62 | Design Decisions & Trade-offs 63 | 64 | - Embeddings: `all-MiniLM-L6-v2` (384-dim) for speed/size on CPU. 65 | - Vector store: FAISS (inner product with cosine normalization) persisted to disk. 66 | - DB: SQLite for simplicity; holds documents and chunks for metadata and re-indexing. 67 | - Incremental updates: content hash (SHA-256). If unchanged, indexing is skipped. 68 | - Parsers: PDF via `pypdf`; raw text via API. HTML/Docx can be added with new parsers. 69 | - Q&A: Optional OpenAI integration for answer synthesis; otherwise returns retrieved context plus a note. 70 | 71 | 24-hour Constraints & Specific Trade-offs 72 | 73 | - Scope: Retrieval-first system with optional LLM answer synthesis; no advanced reranking. 74 | - Completeness metric: Simple similarity-based heuristic (avg score) vs. richer coverage metrics. 75 | - Infra: Single-process FastAPI; no background job queue. For scale, add Celery/RQ workers. 76 | - Storage: SQLite + FAISS for fast local prototyping; swap to Postgres+pgvector or a managed vector DB for large corpora. 77 | - Parsing: Basic text/PDF support only; HTML/DOCX and OCR are out of scope for this iteration. 78 | - Observability: Minimal logging; no tracing/metrics dashboards. Add OpenTelemetry in next phase. 79 | - Security: No auth/rate limiting. Intended for local demo. In production, put behind gateway and add auth. 80 | - Testing: Manual and Swagger-driven smoke tests; unit/integration tests not included due to time. 81 | 82 | Scaling Considerations 83 | 84 | - Swap SQLite+FAISS to Postgres+pgvector or a cloud vector DB for larger scale. 85 | - Add distributed workers for ingestion and background batching. 86 | - Stream chunking and embedding in batches to keep memory stable (already supported). 
87 | - Use async queues and backpressure for very large corpora. 88 | 89 | Project Structure 90 | 91 | ``` 92 | app/ 93 |   main.py              # FastAPI app, middleware, and router wiring 94 |   core/                # config.py (settings), db.py (SQLAlchemy engine/session) 95 |   api/ 96 |     schemas.py         # Pydantic request/response models 97 |     routes/            # health.py, ingest.py, search.py, qa.py 98 |   application/ 99 |     services/          # search_service.py, qa_service.py, ingestion_service.py 100 |   infrastructure/ 101 |     embeddings/        # sentence_transformer_provider.py (model loader and wrappers) 102 |     vectorstore/       # faiss_index.py (persistent FAISS index manager) 103 |     persistence/       # models.py (ORM models) 104 |     parsers/           # pdf_reader.py (PDF extraction) 105 |     text/              # text_utils.py (cleaning and chunking) 106 | data/                  # created at runtime for DB and index persistence 107 | ``` 108 | 109 | How to Test 110 | 111 | - Use Swagger UI to try endpoints interactively 112 | - Use the sample curl commands above (or the scripted smoke test sketched at the end of this README) 113 | - Re-ingest the same content to see `"status":"skipped"` (incremental updates) 114 | - Change content for the same `uri` to see replacement (old vectors removed) 115 | - Ingest a PDF via `/ingest/file` and then query `/search` and `/qa` 116 | - Restart the server and run search again to confirm persistence under `data/` 117 | 118 | Demo 119 | 120 | - Record a <5 min screen capture showing: start server → ingest text → search → QA (retrieval-only and with OpenAI if set) → ingest PDF → completeness → restart and confirm persistence. 121 | 122 | Deliverables Checklist 123 | 124 | - Working prototype: yes (run locally via the Quickstart, or use the instance at http://45.61.150.108:8000/docs) 125 | - README with design decisions, 24h trade-offs, run/test steps: yes 126 | - Short Loom/screen recording demo: 127 | - Code in a GitHub repo: https://github.com/RikuSato0/AI-Powered-Knowledge-Base-Search---Enrichment 128 | 129 | 130 | --------------------------------------------------------------------------------
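Similarity Scoring Notes (illustrative)

The design decision above ("inner product with cosine normalization") means embeddings are L2-normalized before they enter the IndexFlatIP index (see sentence_transformer_provider.py and faiss_index.py), so returned scores are cosine similarities in [-1, 1]; the completeness check simply averages the top-k scores against a 0.4 threshold. A small self-contained sketch of that scoring scheme, independent of the application code and using only numpy and faiss from requirements.txt:

```python
import faiss
import numpy as np

rng = np.random.default_rng(0)
dim = 384  # all-MiniLM-L6-v2 embedding width

# Toy "embeddings": random vectors standing in for encoded chunks and a query.
chunks = rng.normal(size=(5, dim)).astype(np.float32)
query = rng.normal(size=(1, dim)).astype(np.float32)

# L2-normalize, as embed_texts() does, so inner product == cosine similarity.
chunks /= np.linalg.norm(chunks, axis=1, keepdims=True) + 1e-12
query /= np.linalg.norm(query, axis=1, keepdims=True) + 1e-12

index = faiss.IndexIDMap2(faiss.IndexFlatIP(dim))
index.add_with_ids(chunks, np.arange(5, dtype=np.int64))

scores, ids = index.search(query, 3)
print(ids[0], scores[0])                     # top-3 chunk ids and cosine scores
print("coverage:", float(scores[0].mean()))  # the average that /completeness thresholds
```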
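Scripted Smoke Test (optional)

The manual checks in How to Test can also be scripted. The sketch below is hypothetical (not included in the repo); it assumes a locally running instance on port 8000 and exercises the incremental-update path: identical content is skipped via its SHA-256, changed content under the same uri replaces the old vectors, and /qa returns an answer with citations.

```python
import httpx

BASE_URL = "http://localhost:8000"  # assumes the Quickstart server is running
DOC = {"text": "Transformers rely on attention to weigh context across tokens.", "uri": "smoke-doc"}

with httpx.Client(base_url=BASE_URL, timeout=60.0) as client:
    first = client.post("/ingest/text", json=DOC).json()
    assert first["status"] in {"ingested", "skipped"}

    # Same text, same uri: the content hash matches, so re-indexing is skipped.
    second = client.post("/ingest/text", json=DOC).json()
    assert second["status"] == "skipped"

    # Changed text under the same uri: the old document and its vectors are replaced.
    changed = dict(DOC, text="Attention lets a model focus on the most relevant tokens.")
    third = client.post("/ingest/text", json=changed).json()
    assert third["status"] == "ingested"

    # Retrieval-only QA (use_openai defaults to False) returns an answer plus citations.
    qa = client.post("/qa", json={"question": "What do transformers rely on?", "k": 3}).json()
    print(qa["answer"][:200])
    print(len(qa["citations"]), "citations")
```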