├── .env_example
├── Dockerfile
├── README.md
├── build_context.py
├── cohere_reranking.py
├── extract_content_from_website.py
├── groq_api.py
├── jina_rerank.py
├── main.py
├── prompts.py
├── requirements.txt
├── semantic_chunking.py
├── sources_manipulation.py
└── sources_searcher.py

/.env_example:
--------------------------------------------------------------------------------
GROQ_API_KEY="your api key here"
JINA_API_KEY="your api key here"
SERPER_API_KEY="your api key here"
COHERE_API_KEY="your api key here"
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-slim

WORKDIR /workspace
ENV HOME=/workspace

ADD . /workspace

RUN chown -R 42420:42420 /workspace

# Install the Python dependencies
RUN pip install -r requirements.txt

EXPOSE 8080

ENTRYPOINT ["uvicorn"]

CMD ["main:app", "--host", "0.0.0.0", "--port", "8080"]


# This Docker image works on OVH Cloud AI Deploy
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# OpenPerPlex

OpenPerPlex is an open-source AI search engine that leverages cutting-edge technologies to provide search capabilities over the web.

## Front-end app (Vue.js)

- `https://github.com/YassKhazzan/openperplex_front.git`

## 🌟 Features

- Semantic chunking using Cohere and the semantic-chunkers library (`https://github.com/aurelio-labs/semantic-chunkers/blob/main/semantic_chunkers/chunkers/statistical.py`)
- Reranking of results with the JINA API
- Google search integration via serper.dev
- Groq as the inference engine
- Llama 3 70B model

## 🚀 Quick Start

### Prerequisites

- Python 3.11+
- pip

### Installation

1. Clone the repository: `git clone https://github.com/YassKhazzan/openperplex_backend_os.git`
2. Install the required packages: `pip install -r requirements.txt`
3. Set up your environment variables:
   - Copy the `.env_example` file to `.env`
   - Fill in your API keys in the `.env` file

### Running the Project

To start the OpenPerPlex server: ```uvicorn main:app --port 8000```

The server will be available at `http://localhost:8000`
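The `/search` endpoint streams its results as server-sent events. As a rough illustration (not part of the repository — the query values below are placeholders), a client could consume the stream with `requests`:

```python
import json
import requests

params = {
    "query": "Who won the last Champions League final?",
    "date_context": "2024-08-01 10:00",  # free-form date string injected into the LLM prompt
    "stored_location": "us",             # Serper country code, e.g. "us" or "fr"
    "pro_mode": "false",                 # "true" enables page scraping + reranking
}

with requests.get("http://localhost:8000/search", params=params, stream=True) as response:
    response.raise_for_status()
    for line in response.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data:"):
            continue  # skip blank keep-alive lines and the "event:" line
        event = json.loads(line[len("data:"):].strip())
        if not isinstance(event, dict):
            continue  # the final "data: null" end-of-stream marker
        if event["type"] == "llm":
            print(event["text"], end="", flush=True)  # streamed answer tokens
        elif event["type"] in ("sources", "relevant"):
            pass  # search results and follow-up questions arrive in event["data"]
```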
## 🔧 Configuration

Make sure to set up your `.env` file with the necessary API keys:

- COHERE_API_KEY
- JINA_API_KEY
- SERPER_API_KEY
- GROQ_API_KEY

## 🤝 Contributing

We welcome contributions to OpenPerPlex! Please feel free to submit issues, fork the repository, and send pull requests.

## 📝 License

This project is licensed under the [MIT License](LICENSE).

## 🙏 Acknowledgements

- [Cohere](https://cohere.ai/) for semantic chunking
- [JINA AI](https://jina.ai/) for reranking
- [serper.dev](https://serper.dev/) for Google search integration
- [Groq](https://groq.com/) for the inference engine
- [Meta](https://www.meta.ai/) for their open-source models

## 📬 Contact

For any questions or feedback, please open an issue on this repository or contact me on [X](https://x.com/KhazzanYassine)

---

Happy searching with OpenPerPlex! 🚀🔍
--------------------------------------------------------------------------------
/build_context.py:
--------------------------------------------------------------------------------
import logging
from jina_rerank import get_reranking_jina
from semantic_chunking import get_chunking

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def build_context(sources_result, query, pro_mode, date_context):
    """
    Build context from search results.

    :param sources_result: Dictionary containing search results
    :param query: Search query string
    :param pro_mode: Boolean indicating whether to use pro mode (reranking)
    :param date_context: Date context string
    :return: Built context as a string
    """
    try:
        combined_list = []

        organic_results = sources_result.get('organic', [])
        graph = sources_result.get('graph')
        answer_box = sources_result.get('answerBox')

        snippets = [
            f"{item['snippet']} {item.get('date', '')}"
            for item in organic_results if 'snippet' in item  # keep only results that carry a snippet
        ]

        combined_list.extend(snippets)

        html_text = " ".join(item['html'] for item in organic_results if 'html' in item)
        if len(html_text) > 200:
            combined_list.extend(get_chunking(html_text))

        # Extract top stories titles
        top_stories = sources_result.get('topStories')
        if top_stories is not None:
            top_stories_titles = [item['title'] for item in top_stories if 'title' in item]
            combined_list.extend(top_stories_titles)

        # Add descriptions and answers from 'graph' and 'answerBox'
        if graph is not None:
            graph_desc = graph.get('description')
            if graph_desc:
                combined_list.append(graph_desc)

        if answer_box is not None:
            for key in ['answer', 'snippet']:
                value = answer_box.get(key)
                if value:  # skip missing or empty values so only strings reach the join/reranking
                    combined_list.append(value)

        if pro_mode:
            # you can choose to use jina or cohere for reranking
            final_list = get_reranking_jina(combined_list, query + date_context, 15)
        else:
            final_list = combined_list

        search_contexts = "\n\n".join(final_list)
        return search_contexts
    except Exception as e:
        logger.exception(f"An error occurred while building context: {e}")
        return ""
--------------------------------------------------------------------------------
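For orientation, a minimal sketch of the dictionary shape that `build_context` expects (not part of the repository; the field names mirror what `get_sources` in sources_searcher.py returns, the values are made up, and `COHERE_API_KEY` must be set because semantic_chunking.py builds its encoder at import time):

```python
from build_context import build_context

# Hypothetical input, shaped like the output of sources_searcher.get_sources()
sources_result = {
    "organic": [
        {"title": "Example", "link": "https://example.com",
         "snippet": "Example snippet text", "date": "2 days ago"},
    ],
    "topStories": [{"title": "Example story", "imageUrl": "https://example.com/img.png"}],
    "graph": {"description": "Example knowledge-graph description"},
    "answerBox": {"answer": "Example direct answer"},
}

context = build_context(sources_result, query="example query",
                        pro_mode=False, date_context="2024-08-01")
print(context)  # snippets, titles and answers joined with blank lines
```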
/cohere_reranking.py:
--------------------------------------------------------------------------------
import os

import cohere


# use ENV variables
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
MODEL = "rerank-multilingual-v3.0"

co = cohere.Client(api_key=COHERE_API_KEY)


def get_reranking_cohere(docs, query, top_res):
    """
    Re-ranks a list of documents based on a query using Cohere's reranking API.

    Args:
        docs (list of str): List of documents to be re-ranked.
        query (str): Query string to rank the documents against.
        top_res (int): Number of top results to return.

    Returns:
        list of str: Top re-ranked documents based on the query.
    """
    try:
        # Call the Cohere rerank API
        response = co.rerank(
            model=MODEL,
            query=query,
            documents=docs,
            top_n=top_res,
            return_documents=True
        )

        # Extract and return the texts of the top documents
        return [item.document.text for item in response.results]

    except Exception as e:
        # Log the error and handle it as needed
        print(f"An error occurred: {e}")
        return []
--------------------------------------------------------------------------------
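build_context.py calls the Jina reranker in pro mode, but — as its inline comment notes — `get_reranking_cohere` is a drop-in alternative. A minimal sketch (assuming `COHERE_API_KEY` is set; the documents and query are placeholders):

```python
from cohere_reranking import get_reranking_cohere

docs = [
    "Paris is the capital of France.",
    "The Eiffel Tower is located in Paris.",
    "Bananas are rich in potassium.",
]

# Same signature as get_reranking_jina(docs, query, top_res) in jina_rerank.py,
# so build_context.py could swap one for the other.
top_docs = get_reranking_cohere(docs, query="Where is the Eiffel Tower?", top_res=2)
print(top_docs)  # the two documents most relevant to the query, best first
```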
/extract_content_from_website.py:
--------------------------------------------------------------------------------
from langchain_community.document_loaders import WebBaseLoader


def extract_website_content(url):
    """
    Extracts and cleans the main content from a given website URL.

    Args:
        url (str): The URL of the website from which to extract content.

    Returns:
        str: The first 4000 characters of the cleaned main content if it is sufficiently long, otherwise an empty string.
    """
    try:
        clean_text = []
        loader = WebBaseLoader(url)
        data = loader.load()

        # Aggregate content using a list to avoid inefficient string concatenation in the loop
        for doc in data:
            if doc.page_content:  # Check if page_content is not None or empty
                clean_text.append(doc.page_content.replace("\n", ""))

        # Join all parts into a single string after processing
        clean_text = "".join(clean_text)

        # Return up to the first 4000 characters if the content is sufficiently long
        return clean_text[:4000] if len(clean_text) > 200 else ""

    except Exception as error:
        print('Error extracting main content:', error)
        return ""
--------------------------------------------------------------------------------
/groq_api.py:
--------------------------------------------------------------------------------
import json
import os
from groq import Groq
from langchain_core.prompts import PromptTemplate
from prompts import search_prompt_system, relevant_prompt_system

# use ENV variables
MODEL = "llama3-70b-8192"
api_key_groq = os.getenv("GROQ_API_KEY")


client = Groq(api_key=api_key_groq)


def get_answer(query, contexts, date_context):
    system_prompt_search = PromptTemplate(input_variables=["date_today"], template=search_prompt_system)

    messages = [
        {"role": "system", "content": system_prompt_search.format(date_today=date_context)},
        {"role": "user", "content": "User Question : " + query + "\n\n CONTEXTS :\n\n" + contexts}
    ]

    try:
        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            stream=True,
            stop=None,
        )

        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    except Exception as e:
        print(f"Error during get_answer_groq call: {e}")
        yield "data:" + json.dumps(
            {'type': 'error', 'data': "We are currently experiencing some issues. Please try again later."}) + "\n\n"


def get_relevant_questions(contexts, query):
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system",
                 "content": relevant_prompt_system
                 },
                {"role": "user",
                 "content": "User Query: " + query + "\n\n" + "Contexts: " + "\n" + contexts + "\n"}
            ],
            response_format={"type": "json_object"},
        )

        return response.choices[0].message.content
    except Exception as e:
        print(f"Error during get_relevant_questions call: {e}")
        return {}
--------------------------------------------------------------------------------
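As a rough usage sketch (assuming a valid `GROQ_API_KEY` is set and the calls succeed; the sample strings are placeholders, and `contexts` would normally come from `build_context()`):

```python
import json

from groq_api import get_answer, get_relevant_questions

contexts = "Context snippet one.\n\nContext snippet two."

# get_answer() is a generator that yields the answer token by token
for token in get_answer("example question", contexts, "2024-08-01"):
    print(token, end="", flush=True)

# get_relevant_questions() returns a JSON string following the schema in prompts.py
follow_up = json.loads(get_relevant_questions(contexts, "example question"))
print(follow_up.get("followUp", []))  # e.g. three follow-up question strings
```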
/jina_rerank.py:
--------------------------------------------------------------------------------
import os
import requests
from typing import List
import logging
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
API_URL = "https://api.jina.ai/v1/rerank"
API_KEY = os.getenv("JINA_API_KEY")
MODEL = "jina-reranker-v2-base-multilingual"
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}


def get_reranking_jina(docs: List[str], query: str, top_res: int) -> List[str]:
    """
    Get reranked documents using Jina AI API.

    :param docs: List of documents to rerank
    :param query: Query string
    :param top_res: Number of top results to return
    :return: List of reranked documents
    """
    try:
        data = {
            "model": MODEL,
            "query": query,
            "documents": docs,
            "top_n": top_res
        }

        response = requests.post(API_URL, headers=HEADERS, json=data, timeout=10)
        response.raise_for_status()
        response_data = response.json()

        return [item['document']['text'] for item in response_data.get('results', [])]

    except RequestException as e:
        logger.error(f"HTTP error occurred while reranking: {e}")
    except KeyError as e:
        logger.error(f"Unexpected response format: {e}")
    except Exception as e:
        logger.exception(f"An unexpected error occurred: {e}")

    return []
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import orjson as json
from dotenv import load_dotenv

load_dotenv()

from fastapi.responses import StreamingResponse
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from groq_api import get_answer, get_relevant_questions
from sources_searcher import get_sources
from build_context import build_context
from sources_manipulation import populate_sources


app = FastAPI()

# In production, restrict the origins, e.g. allow_origins=["https://openperplex.com"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST"],  # Only the methods the API actually uses
    allow_headers=["*"],  # Allow all headers, or restrict as needed
)


@app.get("/")
def root():
    return {"message": "hello world openperplex v1"}


@app.get("/up_test")
def up_test():
    # health check used for the Kamal deploy
    return {"status": "ok"}


# You can switch this to POST if your typical query is too long for a GET URL
@app.get("/search")
def ask(query: str, date_context: str, stored_location: str, pro_mode: bool = False):
    if not query:
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    def generate():
        try:
            sources_result = get_sources(query, pro_mode, stored_location)
            yield "data:" + json.dumps({'type': 'sources', 'data': sources_result}).decode() + "\n\n"

            if sources_result.get('organic') is not None and pro_mode is True:
                # Set the number of websites to scrape; here: 2
                sources_result['organic'] = populate_sources(sources_result['organic'], 2)

            search_contexts = build_context(sources_result, query, pro_mode, date_context)

            for chunk in get_answer(query, search_contexts, date_context):
                yield "data:" + json.dumps({'type': 'llm', 'text': chunk}).decode() + "\n\n"

            try:
                relevant_questions = get_relevant_questions(search_contexts, query)
                relevant_json = json.loads(relevant_questions)
                yield "data:" + json.dumps({'type': 'relevant', 'data': relevant_json}).decode() + "\n\n"
            except Exception as e:
                print(f"error in relevant questions main.py {e}")
                yield "data:" + json.dumps({'type': 'relevant', 'data': []}).decode() + "\n\n"

            yield "data:" + json.dumps({'type': 'finished', 'data': ""}).decode() + "\n\n"
            yield "event: end-of-stream\ndata: null\n\n"

        except Exception as e:
            print(e)
            yield "data:" + json.dumps(
                {'type': 'error',
                 'data': "We are currently experiencing some issues. Please try again later."}).decode() + "\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
--------------------------------------------------------------------------------
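A quick smoke test of the app is possible with FastAPI's TestClient (a sketch, not part of the repository; several modules create their API clients at import time, so a populated `.env` is needed even for routes that never call an external service):

```python
from fastapi.testclient import TestClient

from main import app

client = TestClient(app)

# Basic liveness checks that do not hit any external API
assert client.get("/").json() == {"message": "hello world openperplex v1"}
assert client.get("/up_test").json() == {"status": "ok"}

# An empty query is rejected before any work is done
response = client.get("/search", params={"query": "", "date_context": "", "stored_location": "us"})
assert response.status_code == 400
```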
30 | """ 31 | 32 | relevant_prompt_system = """ 33 | you are a question generator that responds in JSON, tasked with creating an array of 3 follow-up questions in english related 34 | to the user query and contexts provided. 35 | you must keep the questions related to the user query and contexts.don't lose the context in the questions. 36 | 37 | The JSON object must not include special characters. 38 | The JSON schema should include an array of follow-up questions. 39 | 40 | use the schema: 41 | { 42 | "followUp": [ 43 | "string", 44 | "string", 45 | "string" 46 | ] 47 | } 48 | """ 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohappyeyeballs==2.3.4 2 | aiohttp==3.10.0 3 | aiosignal==1.3.1 4 | annotated-types==0.7.0 5 | anyio==4.4.0 6 | attrs==23.2.0 7 | beautifulsoup4==4.12.3 8 | boto3==1.34.152 9 | botocore==1.34.152 10 | certifi==2024.7.4 11 | charset-normalizer==3.3.2 12 | click==8.1.7 13 | cohere==5.6.2 14 | colorama==0.4.6 15 | colorlog==6.8.2 16 | dataclasses-json==0.6.7 17 | distro==1.9.0 18 | fastapi==0.112.0 19 | fastavro==1.9.5 20 | filelock==3.15.4 21 | frozenlist==1.4.1 22 | fsspec==2024.6.1 23 | groq==0.9.0 24 | h11==0.14.0 25 | httpcore==1.0.5 26 | httptools==0.6.1 27 | httpx==0.27.0 28 | httpx-sse==0.4.0 29 | huggingface-hub==0.24.5 30 | idna==3.7 31 | jmespath==1.0.1 32 | jsonpatch==1.33 33 | jsonpointer==3.0.0 34 | langchain==0.2.12 35 | langchain-community==0.2.10 36 | langchain-core==0.2.27 37 | langchain-text-splitters==0.2.2 38 | langsmith==0.1.96 39 | marshmallow==3.21.3 40 | multidict==6.0.5 41 | mypy-extensions==1.0.0 42 | numpy==1.26.4 43 | openai==1.38.0 44 | orjson==3.10.6 45 | packaging==24.1 46 | parameterized==0.9.0 47 | pydantic==2.8.2 48 | pydantic_core==2.20.1 49 | python-dateutil==2.9.0.post0 50 | python-dotenv==1.0.1 51 | PyYAML==6.0.1 52 | regex==2023.12.25 53 | requests==2.32.3 54 | requests-mock==1.12.1 55 | s3transfer==0.10.2 56 | semantic-chunkers==0.0.9 57 | semantic-router==0.0.55 58 | six==1.16.0 59 | sniffio==1.3.1 60 | soupsieve==2.5 61 | SQLAlchemy==2.0.31 62 | starlette==0.37.2 63 | tenacity==8.5.0 64 | tiktoken==0.7.0 65 | tokenizers==0.19.1 66 | tqdm==4.66.4 67 | types-requests==2.32.0.20240712 68 | typing-inspect==0.9.0 69 | typing_extensions==4.12.2 70 | urllib3==2.2.2 71 | uvicorn==0.30.5 72 | uvloop==0.19.0 73 | watchfiles==0.22.0 74 | websockets==12.0 75 | yarl==1.9.4 76 | -------------------------------------------------------------------------------- /semantic_chunking.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from semantic_router.encoders import CohereEncoder 4 | from semantic_chunkers import StatisticalChunker 5 | 6 | COHERE_API_KEY = os.getenv("COHERE_API_KEY") 7 | 8 | encoder = CohereEncoder(cohere_api_key=COHERE_API_KEY, input_type='search_document', 9 | name='embed-multilingual-v3.0') 10 | 11 | chunker = StatisticalChunker(encoder=encoder, max_split_tokens=200) 12 | 13 | 14 | def get_chunking(text): 15 | """ 16 | Splits the provided text into meaningful chunks using a predefined chunker. 17 | 18 | Args: 19 | text (str): The text to be chunked. 20 | 21 | Returns: 22 | list: A list of chunks if the text is sufficiently long and non-empty; otherwise, an empty list. 
23 | """ 24 | try: 25 | chunks = chunker(docs=[text]) 26 | values = [c.content for chunk in chunks for c in chunk] 27 | 28 | return values 29 | 30 | except Exception as e: 31 | print(f"Error during chunking process: {e}") 32 | return [] 33 | -------------------------------------------------------------------------------- /sources_manipulation.py: -------------------------------------------------------------------------------- 1 | from extract_content_from_website import extract_website_content 2 | 3 | 4 | def populate_sources(sources, num_elements): 5 | try: 6 | for i, source in enumerate(sources[:num_elements]): 7 | if not source: 8 | continue 9 | 10 | try: 11 | source['html'] = extract_website_content(source['link']) 12 | sources[i] = source 13 | except Exception as e: 14 | continue 15 | except Exception as e: 16 | print(f"Error in populate_sources: {e}") 17 | return sources 18 | 19 | return sources 20 | -------------------------------------------------------------------------------- /sources_searcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | from typing import Dict, Any, Optional, List 5 | 6 | 7 | # use ENV variables 8 | # Constants 9 | API_URL = "https://google.serper.dev/search" 10 | API_KEY = os.getenv("SERPER_API_KEY") 11 | DEFAULT_LOCATION = 'us' 12 | HEADERS = { 13 | 'X-API-KEY': API_KEY, 14 | 'Content-Type': 'application/json' 15 | } 16 | 17 | 18 | def get_sources(query: str, pro_mode: bool = False, stored_location: Optional[str] = None) -> Dict[str, Any]: 19 | """ 20 | Fetch search results from Serper API. 21 | 22 | :param query: Search query string 23 | :param pro_mode: Boolean to determine the number of results 24 | :param stored_location: Optional location string 25 | :return: Dictionary containing search results 26 | """ 27 | try: 28 | search_location = (stored_location or DEFAULT_LOCATION).lower() 29 | num_results = 10 if pro_mode else 20 30 | 31 | payload = { 32 | "q": query, 33 | "num": num_results, 34 | "gl": search_location 35 | } 36 | 37 | response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=10) 38 | response.raise_for_status() 39 | 40 | data = response.json() 41 | 42 | return { 43 | 'organic': extract_fields(data.get('organic', []), ['title', 'link', 'snippet', 'date']), 44 | 'topStories': extract_fields(data.get('topStories', []), ['title', 'imageUrl']), 45 | 'images': extract_fields(data.get('images', [])[:6], ['title', 'imageUrl']), 46 | 'graph': data.get('knowledgeGraph'), 47 | 'answerBox': data.get('answerBox') 48 | } 49 | 50 | except requests.RequestException as e: 51 | print(f"HTTP error while getting sources: {e}") 52 | except Exception as e: 53 | print(f"Unexpected error while getting sources: {e}") 54 | 55 | return {} 56 | 57 | 58 | def extract_fields(items: List[Dict[str, Any]], fields: List[str]) -> List[Dict[str, Any]]: 59 | """ 60 | Extract specified fields from a list of dictionaries. 61 | 62 | :param items: List of dictionaries 63 | :param fields: List of fields to extract 64 | :return: List of dictionaries with only the specified fields 65 | """ 66 | return [{key: item[key] for key in fields if key in item} for item in items] 67 | --------------------------------------------------------------------------------