├── README.md
├── app
│   ├── .gitignore
│   ├── main.py
│   ├── models
│   │   ├── embed_models.py
│   │   └── extraction_models.py
│   ├── requirements.txt
│   ├── routers
│   │   ├── embed_router.py
│   │   ├── file_processing.py
│   │   └── token_counter.py
│   ├── services
│   │   ├── csv_extractor.py
│   │   ├── doc_extractor.py
│   │   ├── embedding_service.py
│   │   ├── pdf_extractor.py
│   │   ├── text_extractor.py
│   │   ├── token_counter.py
│   │   └── word_count_service.py
│   └── utils
│       ├── file_utils.py
│       └── model_utils.py
├── examples
│   ├── README.md
│   └── chatPdf
│       ├── .env
│       ├── README.md
│       ├── chat.py
│       ├── example.pdf
│       ├── requirements.txt
│       └── services
│           ├── chat_service.py
│           ├── chroma_service.py
│           ├── embedding_service.py
│           └── pdf_extractor.py
└── static
    └── banner.png

/README.md:
--------------------------------------------------------------------------------
# ![Maux-API Banner](static/banner.png)
# Maux-API: RAG AI Workflow Simplified 🚀

Welcome to **Maux-API** – an open-source API designed to help you quickly build RAG (Retrieval-Augmented Generation) AI workflows. Built by the team at [ai.maux.space](https://ai.maux.space), this project is free for anyone to use, with a special focus on supporting the Persian language.

## 🌟 Features

- **/embed**: Generates embeddings using the "Alibaba-NLP/gte-multilingual-base" model. This route requires a ~700 MB model download on first use.
- **/extract**: Extracts content from supported files such as PDF, CSV, DOC/DOCX, TXT, and Markdown.
- **/count**: Counts the tokens in the given content, helping you manage token usage effectively.

## 🛠 Installation

To get started with Maux-API, follow these steps:

1. **Clone the Repository:**

   ```bash
   git clone https://github.com/xmannii/Maux-API.git
   cd Maux-API
   ```

2. **Create and Activate a Virtual Environment:**

   ```bash
   python -m venv venv
   source venv/bin/activate  # On Windows use `venv\Scripts\activate`
   ```

3. **Install the Required Dependencies:**

   ```bash
   pip install -r app/requirements.txt
   ```

4. **Run the API:**

   ```bash
   uvicorn app.main:app --reload
   ```

## 📚 Usage

Once the API is up and running, you can use the following routes:

### 🔍 `/extract`

**Description:** Extracts content from a variety of file formats.

- **Supported Formats:** `.pdf`, `.csv`, `.doc`, `.docx`, `.txt`, `.md`

**Response:**

```json
{
  "filename": "example.pdf",
  "file_type": "pdf",
  "extraction_time": "0:00:01.123456",
  "word_count": 1024,
  "content": [
    {
      "text": "Extracted text from the PDF",
      "page_number": 1
    }
  ]
}
```

### 📝 `/count`

**Description:** Counts the number of tokens in the given content.

**Parameters:**

- `content` (required): The content to count tokens for.
- `model` (optional): The model to use for counting tokens. Defaults to `"gpt-3.5-turbo"`.
- `token_limit` (optional): The token limit to check the count against. Defaults to `None`.

**Response:**

```json
{
  "model": "gpt-3.5-turbo",
  "num_tokens": 1024,
  "token_limit": 2000,
  "within_limit": true
}
```
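For a quick smoke test of the two routes above, here is a minimal client sketch using `requests` (assuming the server is running locally on the default `uvicorn` port; `example.pdf` stands in for any file you have on hand):

```python
import requests

BASE_URL = "http://localhost:8000"

# /extract: upload a file and inspect the extraction summary
with open("example.pdf", "rb") as f:
    extract_resp = requests.post(f"{BASE_URL}/extract", files={"file": f})
extract_resp.raise_for_status()
print(extract_resp.json()["word_count"])

# /count: count tokens in some text, optionally against a limit
count_resp = requests.post(
    f"{BASE_URL}/count",
    json={"content": "Hello, world!", "model": "gpt-3.5-turbo", "token_limit": 100},
)
count_resp.raise_for_status()
print(count_resp.json())
```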
### 📝 `/embed`

**Description:** Generates embeddings for the given text using the "Alibaba-NLP/gte-multilingual-base" model.

**Parameters:**

- `text` (required): The text to generate embeddings for.

**Response:**

```json
{
  "object": "list",
  "data": [
    {
      "object": "embedding",
      "index": 0,
      "embedding": [0.1, 0.2, 0.3, 0.4, 0.5]
    }
  ],
  "model": "Alibaba-NLP/gte-multilingual-base"
}
```
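A minimal sketch of calling `/embed` from Python (again assuming the default local server):

```python
import requests

resp = requests.post(
    "http://localhost:8000/embed",
    json={"text": "سلام دنیا"},  # Persian for "Hello, world"; the model is multilingual
)
resp.raise_for_status()
embedding = resp.json()["data"][0]["embedding"]
print(len(embedding))  # dimensionality of the embedding vector
```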
**Discarding Embeddings:**

If you don't need embeddings, remove the embed router registration from `app/main.py` along with the corresponding import statement.


### 📁 Examples

We have added some examples of how to use the APIs in the `examples` folder.

### 🛠️ Contributions

This project is licensed under the Apache License 2.0 and is fully open source. You are welcome to contribute by submitting issues, opening pull requests, or suggesting new features.

--------------------------------------------------------------------------------
/app/.gitignore:
--------------------------------------------------------------------------------
.env
__pycache__
--------------------------------------------------------------------------------
/app/main.py:
--------------------------------------------------------------------------------
from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.routers import file_processing
from app.routers import token_counter
from app.routers import embed_router


app = FastAPI()

# CORS configuration
origins = [
    "http://localhost:3000",  # Local development
    # You can add other origins here
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # Allows specific origins
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allows all headers
)


app.include_router(file_processing.router)
app.include_router(token_counter.router)
# Omit this line if you don't want to use the embedding API and download the model
app.include_router(embed_router.router)


if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
--------------------------------------------------------------------------------
/app/models/embed_models.py:
--------------------------------------------------------------------------------
from pydantic import BaseModel
from typing import List

class EmbedRequest(BaseModel):
    text: str

class EmbeddingData(BaseModel):
    object: str = "embedding"
    index: int
    embedding: List[float]

class EmbeddingResponse(BaseModel):
    object: str = "list"
    data: List[EmbeddingData]
    model: str
--------------------------------------------------------------------------------
/app/models/extraction_models.py:
--------------------------------------------------------------------------------
from pydantic import BaseModel
from typing import List, Optional
from datetime import timedelta

class ExtractedContent(BaseModel):
    text: str
    page_number: Optional[int] = None
    metadata: Optional[dict] = None  # extra per-item data, e.g. non-text CSV columns


class ExtractionResponse(BaseModel):
    filename: str
    file_type: str
    extraction_time: timedelta
    word_count: int
    content: List[ExtractedContent]
--------------------------------------------------------------------------------
/app/requirements.txt:
--------------------------------------------------------------------------------
fastapi
uvicorn
requests
tiktoken
python-docx
PyPDF2
pandas
sentence-transformers
numpy
--------------------------------------------------------------------------------
/app/routers/embed_router.py:
--------------------------------------------------------------------------------
from fastapi import APIRouter, HTTPException
from app.services.embedding_service import EmbeddingService
from app.models.embed_models import EmbedRequest, EmbeddingResponse
import numpy as np

router = APIRouter()

MODEL_NAME = "Alibaba-NLP/gte-multilingual-base"  # a model with good Persian support
embedding_service = EmbeddingService(MODEL_NAME)

@router.post("/embed", response_model=EmbeddingResponse)
async def embed_text(request: EmbedRequest):
    try:
        embedding = embedding_service.get_embedding(request.text)
        # Replace any NaN values with None so the response serializes to valid JSON
        embedding_list = np.where(np.isnan(embedding), None, embedding).tolist()

        response = EmbeddingResponse(
            data=[
                {
                    "object": "embedding",
                    "index": 0,
                    "embedding": embedding_list,
                }
            ],
            model=MODEL_NAME
        )

        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
--------------------------------------------------------------------------------
/app/routers/file_processing.py:
--------------------------------------------------------------------------------
from fastapi import APIRouter, UploadFile, File, HTTPException
from app.models.extraction_models import ExtractionResponse, ExtractedContent
from app.services import pdf_extractor, csv_extractor, doc_extractor, text_extractor
from app.services.word_count_service import count_words, process_content
from app.utils.file_utils import get_file_extension
from datetime import datetime
import logging

router = APIRouter()
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@router.post("/extract", response_model=ExtractionResponse)
async def extract_content(file: UploadFile = File(...)):
    if file.filename == "":
        raise HTTPException(status_code=400, detail="No file uploaded")

    logger.info(f"Received file: {file.filename}")
    file_extension = get_file_extension(file.filename)

    supported_extensions = {".pdf", ".csv", ".doc", ".docx", ".txt", ".md"}

    if file_extension not in supported_extensions:
        logger.warning(f"Unsupported file type: {file_extension}")
        raise HTTPException(status_code=400, detail=f"Unsupported file type: {file_extension}")

    try:
        extraction_start = datetime.now()

        if file_extension == ".pdf":
            content = await pdf_extractor.extract(file)
        elif file_extension == ".csv":
            content = await csv_extractor.extract(file)
        elif file_extension in [".doc", ".docx"]:
            content = await doc_extractor.extract(file)
        elif file_extension in [".txt", ".md"]:
            content = await text_extractor.extract(file)

        extraction_end = datetime.now()
        extraction_duration = extraction_end - extraction_start

        # The PDF extractor already returns ExtractedContent items; the other
        # extractors return raw strings or lists, so normalize them first.
        if file_extension != ".pdf":
            content = process_content(content)

        total_word_count = count_words([item.text for item in content])

        logger.info(f"Successfully extracted content from {file.filename}")

        return ExtractionResponse(
            filename=file.filename,
            file_type=file_extension,
            extraction_time=extraction_duration,
            word_count=total_word_count,
            content=content
        )
    except Exception as e:
        logger.error(f"Error processing file {file.filename}: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail=f"Error processing file: {str(e)}")
--------------------------------------------------------------------------------
/app/routers/token_counter.py:
--------------------------------------------------------------------------------
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel
from typing import Optional
from app.services.token_counter import count_tokens, SUPPORTED_MODELS

router = APIRouter()

class TokenCountRequest(BaseModel):
    content: str
    model: str = "gpt-3.5-turbo"
    token_limit: Optional[int] = None

@router.post("/count")
async def count_tokens_route(request: TokenCountRequest):
    if request.model not in SUPPORTED_MODELS:
        raise HTTPException(status_code=400, detail=f"Unsupported model: {request.model}")

    try:
        num_tokens = count_tokens(request.content, request.model)
        response = {
            "model": request.model,
            "num_tokens": num_tokens
        }

        if request.token_limit is not None:
            within_limit = num_tokens <= request.token_limit
            response["token_limit"] = request.token_limit
            response["within_limit"] = within_limit

        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error counting tokens: {str(e)}")
--------------------------------------------------------------------------------
/app/services/csv_extractor.py:
--------------------------------------------------------------------------------
import pandas as pd
import io

async def extract(file):
    try:
        content = await file.read()
        csv_file = io.StringIO(content.decode('utf-8'))
        df = pd.read_csv(csv_file)

        # Render each row as "column: value" text so downstream word counting
        # and content processing see meaningful strings rather than raw dicts.
        records = df.to_dict('records')
        return [", ".join(f"{k}: {v}" for k, v in record.items()) for record in records]
    except Exception as e:
        raise Exception(f"Error extracting CSV content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/doc_extractor.py:
--------------------------------------------------------------------------------
from docx import Document
import io

async def extract(file):
    try:
        content = await file.read()
        doc_file = io.BytesIO(content)
        document = Document(doc_file)

        full_text = []
        for para in document.paragraphs:
            full_text.append(para.text)

        return "\n".join(full_text)
    except Exception as e:
        raise Exception(f"Error extracting DOC content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/embedding_service.py:
--------------------------------------------------------------------------------
from sentence_transformers import SentenceTransformer
from app.utils.model_utils import get_model_path

class EmbeddingService:
    def __init__(self, model_name: str):
        self.model_name = model_name
        self.model = self._load_model()

    def _load_model(self):
        model_path = get_model_path(self.model_name)
        return SentenceTransformer(model_path, trust_remote_code=True)

    def get_embedding(self, text: str):
        return self.model.encode([text])[0]
--------------------------------------------------------------------------------
/app/services/pdf_extractor.py:
--------------------------------------------------------------------------------
import PyPDF2
import io
from app.models.extraction_models import ExtractedContent

async def extract(file):
    try:
        content = await file.read()
        pdf_file = io.BytesIO(content)
        pdf_reader = PyPDF2.PdfReader(pdf_file)

        extracted_content = []
        for page_number, page in enumerate(pdf_reader.pages):
            text = page.extract_text()
            if text:
                extracted_content.append(ExtractedContent(text=text.strip(), page_number=page_number + 1))

        return extracted_content
    except Exception as e:
        raise Exception(f"Error extracting PDF content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/text_extractor.py:
--------------------------------------------------------------------------------
async def extract(file):
    try:
        content = await file.read()
        text = content.decode('utf-8')
        return text.strip()
    except Exception as e:
        raise Exception(f"Error extracting text content: {str(e)}")
--------------------------------------------------------------------------------
/app/services/token_counter.py:
--------------------------------------------------------------------------------
import tiktoken


# Supported models and their tiktoken encodings
SUPPORTED_MODELS = {
    "gpt-4o": "o200k_base",
    "gpt-4": "cl100k_base",
    "gpt-3.5-turbo": "cl100k_base",
    "gpt-3.5": "cl100k_base",
    "gpt-35-turbo": "cl100k_base",
    "text-embedding-ada-002": "cl100k_base",
    "text-embedding-3-small": "cl100k_base",
    "text-embedding-3-large": "cl100k_base",
}

def count_tokens(content: str, model: str = "gpt-3.5-turbo") -> int:
    if model not in SUPPORTED_MODELS:
        raise ValueError(f"Unsupported model: {model}")

    encoding_name = SUPPORTED_MODELS[model]
    encoding = tiktoken.get_encoding(encoding_name)
    tokens = encoding.encode(content)
    num_tokens = len(tokens)
    return num_tokens
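
# Example usage (a sketch, not part of the service): from the repo root,
# with tiktoken installed:
#
#   from app.services.token_counter import count_tokens
#   print(count_tokens("Hello, world!", model="gpt-4o"))
#
# count_tokens looks up the model's encoding name in SUPPORTED_MODELS,
# encodes the text with tiktoken, and returns the number of tokens.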
--------------------------------------------------------------------------------
/app/services/word_count_service.py:
--------------------------------------------------------------------------------
from typing import List, Union
from app.models.extraction_models import ExtractedContent

def count_words(content: Union[str, List[Union[str, dict]]]) -> int:
    total_word_count = 0

    if isinstance(content, str):
        total_word_count = len(content.split())
    elif isinstance(content, list):
        for item in content:
            if isinstance(item, dict):
                text = item.get('text', '')
            elif isinstance(item, str):
                text = item
            else:
                text = str(item)
            total_word_count += len(text.split())

    return total_word_count

def process_content(content: Union[str, List[Union[str, dict]]]) -> List[ExtractedContent]:
    """Normalize raw extractor output into a list of ExtractedContent items."""
    processed_content = []

    if isinstance(content, list):
        for idx, item in enumerate(content, start=1):
            if isinstance(item, dict):
                text = item.get('text', '')
                metadata = {k: v for k, v in item.items() if k != 'text'}
                processed_content.append(ExtractedContent(
                    text=text,
                    page_number=idx,
                    metadata=metadata
                ))
            else:
                processed_content.append(ExtractedContent(text=str(item), page_number=idx))
    else:
        processed_content.append(ExtractedContent(text=str(content)))

    return processed_content
--------------------------------------------------------------------------------
/app/utils/file_utils.py:
--------------------------------------------------------------------------------
import os

def get_file_extension(filename):
    return os.path.splitext(filename)[1].lower()
--------------------------------------------------------------------------------
/app/utils/model_utils.py:
--------------------------------------------------------------------------------
import os
import tempfile
from sentence_transformers import SentenceTransformer

def get_model_path(model_name: str) -> str:
    """Download the model on first use and cache it in a temp directory."""
    model_cache_dir = os.path.join(tempfile.gettempdir(), "sentence_transformer_cache")
    os.makedirs(model_cache_dir, exist_ok=True)

    model_path = os.path.join(model_cache_dir, model_name)

    if not os.path.exists(model_path):
        print(f"Downloading model {model_name} and saving to {model_path}")
        model = SentenceTransformer(model_name, trust_remote_code=True)
        model.save(model_path)
        print("Model saved successfully")
    else:
        print(f"Model loaded from cache: {model_path}")

    return model_path
--------------------------------------------------------------------------------
/examples/README.md:
--------------------------------------------------------------------------------
Here we will add some examples of how to use the APIs:

1. Chat with PDF (uses the PDF extraction and embedding APIs)
--------------------------------------------------------------------------------
/examples/chatPdf/.env:
--------------------------------------------------------------------------------
# Add your Groq API key here
GROQ_API_KEY=your_groq_api_key
--------------------------------------------------------------------------------
/examples/chatPdf/README.md:
--------------------------------------------------------------------------------
# Chat with PDF

🚀 This example demonstrates how to use the PDF extraction and embedding APIs to chat with a PDF file.

## Prerequisites

- Python
- ChromaDB
- requests
- groq

## Installation

1. Install the requirements:
   ```bash
   pip install -r requirements.txt
   ```

2. Run the self-hosted Maux-API server (from the repository root):
   ```bash
   uvicorn app.main:app --reload
   ```

3. Add your Groq API key to the `.env` file.

4. Run the chat script:
   ```bash
   python chat.py
   ```

5. Enter the path to your PDF file.

6. Enter your questions, and type `quit` to end the chat.
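Before starting a chat session, you can sanity-check the two Maux-API calls this example depends on with a short sketch (assuming the server is running at `http://localhost:8000` and the bundled `example.pdf` is in this folder):

```python
import requests

BASE_URL = "http://localhost:8000"  # the self-hosted Maux-API server

# Probe /extract with the bundled example PDF
with open("example.pdf", "rb") as f:
    pages = requests.post(f"{BASE_URL}/extract", files={"file": f}).json()["content"]
print(f"Extracted {len(pages)} page(s)")

# Probe /embed with the first page's text
data = requests.post(f"{BASE_URL}/embed", json={"text": pages[0]["text"]}).json()
print(f"Embedding dimension: {len(data['data'][0]['embedding'])}")
```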
--------------------------------------------------------------------------------
/examples/chatPdf/chat.py:
--------------------------------------------------------------------------------
import asyncio
from services.pdf_extractor import extract_pdf_content
from services.embedding_service import get_embeddings
from services.chroma_service import setup_chroma, store_in_chroma
from services.chat_service import chat_loop

async def process_pdf(file_path: str, collection):
    """Process a PDF file: extract content, generate embeddings, and store in Chroma."""
    print("Extracting PDF content...")
    pages = extract_pdf_content(file_path)
    print(f"Extracted {len(pages)} pages.")

    print("Generating embeddings...")
    embeddings = await get_embeddings(pages)

    print("Storing in Chroma...")
    store_in_chroma(collection, embeddings, pages)
    print("PDF processed and stored in Chroma.")

async def main():
    collection = setup_chroma()

    file_path = input("Enter the path to your PDF file: ")
    # file_path = "example.pdf"
    await process_pdf(file_path, collection)

    print("\nPDF processed. Entering chat mode.")
    await chat_loop(collection)

if __name__ == "__main__":
    asyncio.run(main())
--------------------------------------------------------------------------------
/examples/chatPdf/example.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xmannii/Maux-API/9bf1d0f3a7023044db53ef7ba8432422e268a7a5/examples/chatPdf/example.pdf
--------------------------------------------------------------------------------
/examples/chatPdf/requirements.txt:
--------------------------------------------------------------------------------
chromadb
requests
groq
python-dotenv
--------------------------------------------------------------------------------
/examples/chatPdf/services/chat_service.py:
--------------------------------------------------------------------------------
import os
from dotenv import load_dotenv
from groq import Groq
from services.embedding_service import get_embeddings
from services.chroma_service import query_chroma

# Load the Groq API key from the .env file and initialize the client
load_dotenv()
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

async def chat_loop(collection):
    """Main chat loop for querying the stored PDF content and interacting with the Groq API."""
    messages = [{"role": "system", "content": "You are a helpful assistant that answers questions based on the extracted content from a PDF file."}]

    while True:
        query = input("\nEnter your question (or 'quit' to exit): ")
        if query.lower() == 'quit':
            break

        print("Generating query embedding...")
        query_embedding = await get_embeddings([query])

        print("Querying Chroma...")
        results = query_chroma(collection, query_embedding)
        if not results['documents'] or not results['documents'][0]:
            print("No relevant content found in the PDF.")
            continue

        # Prepare the prompt template
        top_documents = results['documents'][0][:2]  # Take the top 2 results
        prompt = "The following are the most relevant contents extracted from the PDF:\n"
        for i, doc in enumerate(top_documents, 1):
            prompt += f"{i}. {doc}\n"
        prompt += f"\nUser Question: {query}\n\nPlease provide a response based on the extracted content."

        # Add the user message to the messages list
        messages.append({"role": "user", "content": prompt})

        # Generate a response from the Groq API
        print("Generating response from Groq API...")
        chat_completion = client.chat.completions.create(
            messages=messages,
            model="llama-3.1-70b-versatile",
        )

        response_content = chat_completion.choices[0].message.content
        print(f"\nAI Response:\n{response_content}")

        # Add the AI's response to the messages list
        messages.append({"role": "assistant", "content": response_content})
--------------------------------------------------------------------------------
/examples/chatPdf/services/chroma_service.py:
--------------------------------------------------------------------------------
# services/chroma_service.py

import chromadb

def setup_chroma():
    """Set up and return a Chroma collection."""
    chroma_client = chromadb.Client()
    collection = chroma_client.get_or_create_collection(name="pdf_collection")
    return collection

def store_in_chroma(collection, embeddings, documents):
    """Store embeddings and documents in Chroma."""
    collection.add(
        embeddings=embeddings,
        documents=documents,
        ids=[f"page_{i + 1}" for i in range(len(documents))]  # Start IDs from 1 for readability
    )

def query_chroma(collection, query_embedding):
    """Query the Chroma collection for the most similar documents."""
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=2
    )
    return results
--------------------------------------------------------------------------------
/examples/chatPdf/services/embedding_service.py:
--------------------------------------------------------------------------------
# services/embedding_service.py

import requests
from typing import List

API_BASE_URL = "http://localhost:8000"  # your Maux-API URL here

async def get_embeddings(texts: List[str]) -> List[List[float]]:
    """Get embeddings for a list of texts using the FastAPI endpoint."""
    embeddings = []
    for text in texts:
        response = requests.post(f"{API_BASE_URL}/embed", json={"text": text})
        if response.status_code != 200:
            raise Exception(f"Failed to get embedding: {response.text}")

        # Access the embedding from the response
        embeddings.append(response.json()['data'][0]['embedding'])
    return embeddings
--------------------------------------------------------------------------------
/examples/chatPdf/services/pdf_extractor.py:
--------------------------------------------------------------------------------
import requests
from typing import List

API_BASE_URL = "http://localhost:8000"  # your Maux-API URL here

def extract_pdf_content(file_path: str) -> List[str]:
    """Extract content from a PDF file using the FastAPI endpoint."""
    with open(file_path, 'rb') as file:
        response = requests.post(f"{API_BASE_URL}/extract", files={"file": file})

    if response.status_code != 200:
        raise Exception(f"Failed to extract PDF content: {response.text}")

    # Extract the text from each page's content
    return [content['text'] for content in response.json()['content']]
--------------------------------------------------------------------------------
/static/banner.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/xmannii/Maux-API/9bf1d0f3a7023044db53ef7ba8432422e268a7a5/static/banner.png
--------------------------------------------------------------------------------