├── README.md
├── app
│   ├── .gitignore
│   ├── main.py
│   ├── models
│   │   ├── embed_models.py
│   │   └── extraction_models.py
│   ├── requirements.txt
│   ├── routers
│   │   ├── embed_router.py
│   │   ├── file_processing.py
│   │   └── token_counter.py
│   ├── services
│   │   ├── csv_extractor.py
│   │   ├── doc_extractor.py
│   │   ├── embedding_service.py
│   │   ├── pdf_extractor.py
│   │   ├── text_extractor.py
│   │   ├── token_counter.py
│   │   └── word_count_service.py
│   └── utils
│       ├── file_utils.py
│       └── model_utils.py
├── examples
│   ├── README.md
│   └── chatPdf
│       ├── .env
│       ├── README.md
│       ├── chat.py
│       ├── example.pdf
│       ├── requirements.txt
│       └── services
│           ├── chat_service.py
│           ├── chroma_service.py
│           ├── embedding_service.py
│           └── pdf_extractor.py
└── static
    └── banner.png

/README.md:
--------------------------------------------------------------------------------
# ![Maux-API Banner](static/banner.png)
# Maux-API: RAG AI Workflow Simplified 🚀

Welcome to **Maux-API** – an open-source API designed to help you quickly build RAG (Retrieval-Augmented Generation) AI workflows. Built by the team at [ai.maux.space](https://ai.maux.space), this project is free for anyone to use, with a special focus on supporting the Persian language.

## 🌟 Features

- **/embed**: Generates embeddings using the "Alibaba-NLP/gte-multilingual-base" model. This route requires a ~700 MB model download on first use.
- **/extract**: Extracts content from supported files such as PDF, CSV, DOC/DOCX, TXT, and Markdown.
- **/count**: Counts the tokens in the given content, helping you manage token usage effectively.

## 🛠 Installation

To get started with Maux-API, follow these steps:

1. **Clone the Repository:**

   ```bash
   git clone https://github.com/xmannii/Maux-API.git
   cd Maux-API
   ```

2. **Create and Activate a Virtual Environment:**

   ```bash
   python -m venv venv
   source venv/bin/activate  # On Windows use `venv\Scripts\activate`
   ```

3. **Install the Required Dependencies:**

   ```bash
   pip install -r app/requirements.txt
   ```

4. **Run the API:**

   ```bash
   uvicorn app.main:app --reload
   ```

## 📚 Usage

Once the API is up and running, you can use the following routes:

### 🔍 `/extract`

**Description:** Extracts content from a variety of file formats.

- **Supported Formats:** `.pdf`, `.csv`, `.doc`, `.docx`, `.txt`, `.md`

**Response:**

```json
{
  "filename": "example.pdf",
  "file_type": "pdf",
  "extraction_time": "0:00:01.123456",
  "word_count": 1024,
  "content": [
    {
      "text": "Extracted text from the PDF",
      "page_number": 1
    }
  ]
}
```

### 📝 `/count`

**Description:** Counts the number of tokens in the given content.

**Parameters:**

- `content` (required): The content to count tokens for.
- `model` (optional): The model to use for counting tokens. Defaults to `"gpt-3.5-turbo"`.
- `token_limit` (optional): The token limit to check the count against. Defaults to `None`.

**Response:**

```json
{
  "model": "gpt-3.5-turbo",
  "num_tokens": 1024,
  "token_limit": 2000,
  "within_limit": true
}
```
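For a quick smoke test of the two routes above, here is a minimal client sketch using `requests` (assuming the server is running locally on the default `uvicorn` port; `example.pdf` stands in for any file you have on hand):

```python
import requests

BASE_URL = "http://localhost:8000"

# /extract: upload a file and inspect the extraction summary
with open("example.pdf", "rb") as f:
    extract_resp = requests.post(f"{BASE_URL}/extract", files={"file": f})
extract_resp.raise_for_status()
print(extract_resp.json()["word_count"])

# /count: count tokens in some text, optionally against a limit
count_resp = requests.post(
    f"{BASE_URL}/count",
    json={"content": "Hello, world!", "model": "gpt-3.5-turbo", "token_limit": 100},
)
count_resp.raise_for_status()
print(count_resp.json())
```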
### 📝 `/embed`

**Description:** Generates embeddings for the given text using the "Alibaba-NLP/gte-multilingual-base" model.

**Parameters:**

- `text` (required): The text to generate embeddings for.

**Response:**

```json
{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [0.1, 0.2, 0.3, 0.4, 0.5]
    }
  ],
  "model": "Alibaba-NLP/gte-multilingual-base"
}
```
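A minimal sketch of calling `/embed` from Python (again assuming the default local server):

```python
import requests

resp = requests.post(
    "http://localhost:8000/embed",
    json={"text": "سلام دنیا"},  # Persian for "Hello, world"; the model is multilingual
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))  # dimensionality of the embedding vector
```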
**Discarding Embeddings:**

If you don't need embeddings, remove the embed router registration from `app/main.py` along with the corresponding import statement.


### 📁 Examples

We have added some examples of how to use the APIs in the `examples` folder.

### 🛠️ Contributions

This project is licensed under the Apache License 2.0 and is fully open source. You are welcome to contribute by submitting issues, opening pull requests, or suggesting new features.

--------------------------------------------------------------------------------
/app/.gitignore:
--------------------------------------------------------------------------------
.env
__pycache__
--------------------------------------------------------------------------------
/app/main.py:
--------------------------------------------------------------------------------
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.routers import file_processing
from app.routers import token_counter
from app.routers import embed_router


app = FastAPI()

# CORS configuration
origins = [
    "http://localhost:3000",  # Local development
    # You can add other origins here
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # Allows specific origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allows all headers
)


app.include_router(file_processing.router)
app.include_router(token_counter.router)
# Omit this line if you don't want to use the embedding API and download the model
app.include_router(embed_router.router)


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
--------------------------------------------------------------------------------
/app/models/embed_models.py:
--------------------------------------------------------------------------------
from pydantic import BaseModel
from typing import List

class EmbedRequest(BaseModel):
    text: str

class EmbeddingData(BaseModel):
    object: str = "embedding"
    index: int
    embedding: List[float]

class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
--------------------------------------------------------------------------------
/app/models/extraction_models.py:
--------------------------------------------------------------------------------
from pydantic import BaseModel
from typing import List, Optional
from datetime import timedelta

class ExtractedContent(BaseModel):
    text: str
    page_number: Optional[int] = None
    metadata: Optional[dict] = None  # extra per-item data, e.g. non-text CSV columns


class ExtractionResponse(BaseModel):
    filename: str
    file_type: str
    extraction_time: timedelta
    word_count: int
    content: List[ExtractedContent]
--------------------------------------------------------------------------------
/app/requirements.txt:
--------------------------------------------------------------------------------
fastapi
uvicorn
requests
tiktoken
python-docx
PyPDF2
pandas
sentence-transformers
numpy
--------------------------------------------------------------------------------
/app/routers/embed_router.py:
--------------------------------------------------------------------------------
from fastapi import APIRouter, HTTPException
from app.services.embedding_service import EmbeddingService
from app.models.embed_models import EmbedRequest, EmbeddingResponse
import numpy as np

router = APIRouter()

MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"  # a model with good Persian support
embedding_service = EmbeddingService(MODEL_NAME)

@router.post("/embed", response_model=EmbeddingResponse)
async def embed_text(request: EmbedRequest):
    try:
        embedding = embedding_service.get_embedding(request.text)
        # Replace any NaN values with None so the response serializes to valid JSON
        embedding_list = np.where(np.isnan(embedding), None, embedding).tolist()

        response = EmbeddingResponse(
            data=[
                {
                    "object": "embedding",
                    "index": 0,
                    "embedding": embedding_list,
                }
            ],
            model=MODEL_NAME
        )

        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
--------------------------------------------------------------------------------
/app/routers/file_processing.py:
--------------------------------------------------------------------------------
from fastapi import APIRouter, UploadFile, File, HTTPException
from app.models.extraction_models import ExtractionResponse, ExtractedContent
from app.services import pdf_extractor, csv_extractor, doc_extractor, text_extractor
from app.services.word_count_service import count_words, process_content
from app.utils.file_utils import get_file_extension
from datetime import datetime
import logging

router = APIRouter()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@router.post("/extract", response_model=ExtractionResponse)
async def extract_content(file: UploadFile = File(...)):
    if file.filename == "":
        raise HTTPException(status_code=400, detail="No file uploaded")

    logger.info(f"Received file: {file.filename}")
    file_extension = get_file_extension(file.filename)

    supported_extensions = {".pdf", ".csv", ".doc", ".docx", ".txt", ".md"}

    if file_extension not in supported_extensions:
        logger.warning(f"Unsupported file type: {file_extension}")
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

    try:
        extraction_start = datetime.now()

        if file_extension == ".pdf":
            content = await pdf_extractor.extract(file)
        elif file_extension == ".csv":
            content = await csv_extractor.extract(file)
        elif file_extension in [".doc", ".docx"]:
            content = await doc_extractor.extract(file)
        elif file_extension in [".txt", ".md"]:
            content = await text_extractor.extract(file)

        extraction_end = datetime.now()
        extraction_duration = extraction_end - extraction_start

        # The PDF extractor already returns ExtractedContent items; the other
        # extractors return raw strings or lists, so normalize them first.
        if file_extension != ".pdf":
            content = process_content(content)

        total_word_count = count_words([item.text for item in content])

        logger.info(f"Successfully extracted content from {file.filename}")

        return ExtractionResponse(
            filename=file.filename,
            file_type=file_extension,
            extraction_time=extraction_duration,
            word_count=total_word_count,
            content=content
        )
    except Exception as e:
        logger.error(f"Error processing file {file.filename}: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
--------------------------------------------------------------------------------
/app/routers/token_counter.py:
--------------------------------------------------------------------------------
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import Optional
from app.services.token_counter import count_tokens, SUPPORTED_MODELS

router = APIRouter()

class TokenCountRequest(BaseModel):
    content: str
    model: str = "gpt-3.5-turbo"
    token_limit: Optional[int] = None

@router.post("/count")
async def count_tokens_route(request: TokenCountRequest):
    if request.model not in SUPPORTED_MODELS:
        raise HTTPException(status_code=400, detail=f"Unsupported model: {request.model}")

    try:
        num_tokens = count_tokens(request.content, request.model)
        response = {
            "model": request.model,
            "num_tokens": num_tokens
        }

        if request.token_limit is not None:
            within_limit = num_tokens <= request.token_limit
            response["token_limit"] = request.token_limit
            response["within_limit"] = within_limit

        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error counting tokens: {str(e)}")
--------------------------------------------------------------------------------
/app/services/csv_extractor.py:
--------------------------------------------------------------------------------
import pandas as pd
import io

async def extract(file):
    try:
        content = await file.read()
        csv_file = io.StringIO(content.decode('utf-8'))
        df = pd.read_csv(csv_file)

        # Render each row as "column: value" text so downstream word counting
        # and content processing see meaningful strings rather than raw dicts.
        records = df.to_dict('records')
        return [", ".join(f"{k}: {v}" for k, v in record.items()) for record in records]
    except Exception as e:
        raise Exception(f"Error extracting CSV content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/doc_extractor.py:
--------------------------------------------------------------------------------
from docx import Document
import io

async def extract(file):
    try:
        content = await file.read()
        doc_file = io.BytesIO(content)
        document = Document(doc_file)

        full_text = []
        for para in document.paragraphs:
            full_text.append(para.text)

        return "\n".join(full_text)
    except Exception as e:
        raise Exception(f"Error extracting DOC content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/embedding_service.py:
--------------------------------------------------------------------------------
from sentence_transformers import SentenceTransformer
from app.utils.model_utils import get_model_path

class EmbeddingService:
    def __init__(self, model_name: str):
        self.model_name = model_name
        self.model = self._load_model()

    def _load_model(self):
        model_path = get_model_path(self.model_name)
        return SentenceTransformer(model_path, trust_remote_code=True)

    def get_embedding(self, text: str):
        return self.model.encode([text])[0]
--------------------------------------------------------------------------------
/app/services/pdf_extractor.py:
--------------------------------------------------------------------------------
import PyPDF2
import io
from app.models.extraction_models import ExtractedContent

async def extract(file):
    try:
        content = await file.read()
        pdf_file = io.BytesIO(content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        extracted_content = []
        for page_number, page in enumerate(pdf_reader.pages):
            text = page.extract_text()
            if text:
                extracted_content.append(ExtractedContent(text=text.strip(), page_number=page_number + 1))

        return extracted_content
    except Exception as e:
        raise Exception(f"Error extracting PDF content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/text_extractor.py:
--------------------------------------------------------------------------------
async def extract(file):
    try:
        content = await file.read()
        text = content.decode('utf-8')
        return text.strip()
    except Exception as e:
        raise Exception(f"Error extracting text content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/token_counter.py:
--------------------------------------------------------------------------------
import tiktoken


# Supported models and their tiktoken encodings
SUPPORTED_MODELS = {
    "gpt-4o": "o200k_base",
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    "gpt-3.5": "cl100k_base",
    "gpt-35-turbo": "cl100k_base",
    "text-embedding-ada-002": "cl100k_base",
    "text-embedding-3-small": "cl100k_base",
    "text-embedding-3-large": "cl100k_base",
}

def count_tokens(content: str, model: str = "gpt-3.5-turbo") -> int:
    if model not in SUPPORTED_MODELS:
        raise ValueError(f"Unsupported model: {model}")

    encoding_name = SUPPORTED_MODELS[model]
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(content)
    num_tokens = len(tokens)
    return num_tokens
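
# Example usage (a sketch, not part of the service): from the repo root,
# with tiktoken installed:
#
#   from app.services.token_counter import count_tokens
#   print(count_tokens("Hello, world!", model="gpt-4o"))
#
# count_tokens looks up the model's encoding name in SUPPORTED_MODELS,
# encodes the text with tiktoken, and returns the number of tokens.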
--------------------------------------------------------------------------------
/app/services/word_count_service.py:
--------------------------------------------------------------------------------
from typing import List, Union
from app.models.extraction_models import ExtractedContent

def count_words(content: Union[str, List[Union[str, dict]]]) -> int:
    total_word_count = 0

    if isinstance(content, str):
        total_word_count = len(content.split())
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, dict):
                text = item.get('text', '')
            elif isinstance(item, str):
                text = item
            else:
                text = str(item)
            total_word_count += len(text.split())

    return total_word_count

def process_content(content: Union[str, List[Union[str, dict]]]) -> List[ExtractedContent]:
    """Normalize raw extractor output into a list of ExtractedContent items."""
    processed_content = []

    if isinstance(content, list):
        for idx, item in enumerate(content, start=1):
            if isinstance(item, dict):
                text = item.get('text', '')
                metadata = {k: v for k, v in item.items() if k != 'text'}
                processed_content.append(ExtractedContent(
                    text=text,
                    page_number=idx,
                    metadata=metadata
                ))
            else:
                processed_content.append(ExtractedContent(text=str(item), page_number=idx))
    else:
        processed_content.append(ExtractedContent(text=str(content)))

    return processed_content
--------------------------------------------------------------------------------
/app/utils/file_utils.py:
--------------------------------------------------------------------------------
import os

def get_file_extension(filename):
    return os.path.splitext(filename)[1].lower()
--------------------------------------------------------------------------------
/app/utils/model_utils.py:
--------------------------------------------------------------------------------
import os
import tempfile
from sentence_transformers import SentenceTransformer

def get_model_path(model_name: str) -> str:
    """Download the model on first use and cache it in a temp directory."""
    model_cache_dir = os.path.join(tempfile.gettempdir(), "sentence_transformer_cache")
    os.makedirs(model_cache_dir, exist_ok=True)

    model_path = os.path.join(model_cache_dir, model_name)

    if not os.path.exists(model_path):
        print(f"Downloading model {model_name} and saving to {model_path}")
        model = SentenceTransformer(model_name, trust_remote_code=True)
        model.save(model_path)
        print("Model saved successfully")
    else:
        print(f"Model loaded from cache: {model_path}")

    return model_path
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
Here we will add some examples of how to use the APIs:

1. Chat with PDF (uses the PDF extraction and embedding APIs)
--------------------------------------------------------------------------------
/examples/chatPdf/.env:
--------------------------------------------------------------------------------
# Add your Groq API key here
GROQ_API_KEY=your_groq_api_key
--------------------------------------------------------------------------------
/examples/chatPdf/README.md:
--------------------------------------------------------------------------------
# Chat with PDF

🚀 This example demonstrates how to use the PDF extraction and embedding APIs to chat with a PDF file.

## Prerequisites

- Python
- ChromaDB
- requests
- groq

## Installation

1. Install the requirements:
   ```bash
   pip install -r requirements.txt
   ```

2. Run the self-hosted Maux-API server (from the repository root):
   ```bash
   uvicorn app.main:app --reload
   ```

3. Add your Groq API key to the `.env` file.

4. Run the chat script:
   ```bash
   python chat.py
   ```

5. Enter the path to your PDF file.

6. Enter your questions, and type `quit` to end the chat.
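Before starting a chat session, you can sanity-check the two Maux-API calls this example depends on with a short sketch (assuming the server is running at `http://localhost:8000` and the bundled `example.pdf` is in this folder):

```python
import requests

BASE_URL = "http://localhost:8000"  # the self-hosted Maux-API server

# Probe /extract with the bundled example PDF
with open("example.pdf", "rb") as f:
    pages = requests.post(f"{BASE_URL}/extract", files={"file": f}).json()["content"]
print(f"Extracted {len(pages)} page(s)")

# Probe /embed with the first page's text
data = requests.post(f"{BASE_URL}/embed", json={"text": pages[0]["text"]}).json()
print(f"Embedding dimension: {len(data['data'][0]['embedding'])}")
```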
--------------------------------------------------------------------------------
/examples/chatPdf/chat.py:
--------------------------------------------------------------------------------
import asyncio
from services.pdf_extractor import extract_pdf_content
from services.embedding_service import get_embeddings
from services.chroma_service import setup_chroma, store_in_chroma
from services.chat_service import chat_loop

async def process_pdf(file_path: str, collection):
    """Process a PDF file: extract content, generate embeddings, and store in Chroma."""
    print("Extracting PDF content...")
    pages = extract_pdf_content(file_path)
    print(f"Extracted {len(pages)} pages.")

    print("Generating embeddings...")
    embeddings = await get_embeddings(pages)

    print("Storing in Chroma...")
    store_in_chroma(collection, embeddings, pages)
    print("PDF processed and stored in Chroma.")

async def main():
    collection = setup_chroma()

    file_path = input("Enter the path to your PDF file: ")
    # file_path = "example.pdf"
    await process_pdf(file_path, collection)

    print("\nPDF processed. Entering chat mode.")
    await chat_loop(collection)

if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
/examples/chatPdf/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xmannii/Maux-API/9bf1d0f3a7023044db53ef7ba8432422e268a7a5/examples/chatPdf/example.pdf
--------------------------------------------------------------------------------
/examples/chatPdf/requirements.txt:
--------------------------------------------------------------------------------
chromadb
requests
groq
python-dotenv
--------------------------------------------------------------------------------
/examples/chatPdf/services/chat_service.py:
--------------------------------------------------------------------------------
import os
from dotenv import load_dotenv
from groq import Groq
from services.embedding_service import get_embeddings
from services.chroma_service import query_chroma

# Load the Groq API key from the .env file and initialize the client
load_dotenv()
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

async def chat_loop(collection):
    """Main chat loop for querying the stored PDF content and interacting with the Groq API."""
    messages = [{"role": "system", "content": "You are a helpful assistant that answers questions based on the extracted content from a PDF file."}]

    while True:
        query = input("\nEnter your question (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break

        print("Generating query embedding...")
        query_embedding = await get_embeddings([query])

        print("Querying Chroma...")
        results = query_chroma(collection, query_embedding)
        if not results['documents'] or not results['documents'][0]:
            print("No relevant content found in the PDF.")
            continue

        # Prepare the prompt template
        top_documents = results['documents'][0][:2]  # Take the top 2 results
        prompt = "The following are the most relevant contents extracted from the PDF:\n"
        for i, doc in enumerate(top_documents, 1):
            prompt += f"{i}. {doc}\n"
        prompt += f"\nUser Question: {query}\n\nPlease provide a response based on the extracted content."

        # Add the user message to the messages list
        messages.append({"role": "user", "content": prompt})

        # Generate a response from the Groq API
        print("Generating response from Groq API...")
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="llama-3.1-70b-versatile",
        )

        response_content = chat_completion.choices[0].message.content
        print(f"\nAI Response:\n{response_content}")

        # Add the AI's response to the messages list
        messages.append({"role": "assistant", "content": response_content})
--------------------------------------------------------------------------------
/examples/chatPdf/services/chroma_service.py:
--------------------------------------------------------------------------------
# services/chroma_service.py

import chromadb

def setup_chroma():
    """Set up and return a Chroma collection."""
    chroma_client = chromadb.Client()
    collection = chroma_client.get_or_create_collection(name="pdf_collection")
    return collection

def store_in_chroma(collection, embeddings, documents):
    """Store embeddings and documents in Chroma."""
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=[f"page_{i + 1}" for i in range(len(documents))]  # Start IDs from 1 for readability
    )

def query_chroma(collection, query_embedding):
    """Query the Chroma collection for the most similar documents."""
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=2
    )
    return results
--------------------------------------------------------------------------------
/examples/chatPdf/services/embedding_service.py:
--------------------------------------------------------------------------------
# services/embedding_service.py

import requests
from typing import List

API_BASE_URL = "http://localhost:8000"  # your Maux-API URL here

async def get_embeddings(texts: List[str]) -> List[List[float]]:
    """Get embeddings for a list of texts using the FastAPI endpoint."""
    embeddings = []
    for text in texts:
        response = requests.post(f"{API_BASE_URL}/embed", json={"text": text})
        if response.status_code != 200:
            raise Exception(f"Failed to get embedding: {response.text}")

        # Access the embedding from the response
        embeddings.append(response.json()['data'][0]['embedding'])
    return embeddings
--------------------------------------------------------------------------------
/examples/chatPdf/services/pdf_extractor.py:
--------------------------------------------------------------------------------
import requests
from typing import List

API_BASE_URL = "http://localhost:8000"  # your Maux-API URL here

def extract_pdf_content(file_path: str) -> List[str]:
    """Extract content from a PDF file using the FastAPI endpoint."""
    with open(file_path, 'rb') as file:
        response = requests.post(f"{API_BASE_URL}/extract", files={"file": file})

    if response.status_code != 200:
        raise Exception(f"Failed to extract PDF content: {response.text}")

    # Extract the text from each page's content
    return [content['text'] for content in response.json()['content']]
--------------------------------------------------------------------------------
/static/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xmannii/Maux-API/9bf1d0f3a7023044db53ef7ba8432422e268a7a5/static/banner.png
--------------------------------------------------------------------------------