├── .env_example
├── Dockerfile
├── README.md
├── build_context.py
├── cohere_reranking.py
├── extract_content_from_website.py
├── groq_api.py
├── jina_rerank.py
├── main.py
├── prompts.py
├── requirements.txt
├── semantic_chunking.py
├── sources_manipulation.py
└── sources_searcher.py

/.env_example:
--------------------------------------------------------------------------------
GROQ_API_KEY="your api key here"
JINA_API_KEY="your api key here"
SERPER_API_KEY="your api key here"
COHERE_API_KEY="your api key here"
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-slim

WORKDIR /workspace
ENV HOME=/workspace

ADD . /workspace

RUN chown -R 42420:42420 /workspace

# Install the Python dependencies
RUN pip install -r requirements.txt

EXPOSE 8080

ENTRYPOINT ["uvicorn"]

CMD ["main:app", "--host", "0.0.0.0", "--port", "8080"]


# This Docker image works on OVH Cloud AI Deploy
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# OpenPerPlex

OpenPerPlex is an open-source AI search engine that leverages cutting-edge technologies to provide search capabilities over the web.

## Front-end app (Vue.js)

- `https://github.com/YassKhazzan/openperplex_front.git`

## 🌟 Features

- Semantic chunking using Cohere and the semantic-chunkers library (`https://github.com/aurelio-labs/semantic-chunkers/blob/main/semantic_chunkers/chunkers/statistical.py`)
- Reranking of results with the JINA API
- Google search integration via serper.dev
- Groq as the inference engine
- Llama 3 70B model

## 🚀 Quick Start

### Prerequisites

- Python 3.11+
- pip

### Installation

1. Clone the repository: `git clone https://github.com/YassKhazzan/openperplex_backend_os.git`
2. Install the required packages: `pip install -r requirements.txt`
3. Set up your environment variables:
   - Copy the `.env_example` file to `.env`
   - Fill in your API keys in the `.env` file

### Running the Project

To start the OpenPerPlex server: ```uvicorn main:app --port 8000```

The server will be available at `http://localhost:8000`
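The `/search` endpoint streams its results as server-sent events. As a rough illustration (not part of the repository — the query values below are placeholders), a client could consume the stream with `requests`:

```python
import json
import requests

params = {
    "query": "Who won the last Champions League final?",
    "date_context": "2024-08-01 10:00",  # free-form date string injected into the LLM prompt
    "stored_location": "us",             # Serper country code, e.g. "us" or "fr"
    "pro_mode": "false",                 # "true" enables page scraping + reranking
}

with requests.get("http://localhost:8000/search", params=params, stream=True) as response:
    response.raise_for_status()
    for line in response.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data:"):
            continue  # skip blank keep-alive lines and the "event:" line
        event = json.loads(line[len("data:"):].strip())
        if not isinstance(event, dict):
            continue  # the final "data: null" end-of-stream marker
        if event["type"] == "llm":
            print(event["text"], end="", flush=True)  # streamed answer tokens
        elif event["type"] in ("sources", "relevant"):
            pass  # search results and follow-up questions arrive in event["data"]
```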
## 🔧 Configuration

Make sure to set up your `.env` file with the necessary API keys:

- COHERE_API_KEY
- JINA_API_KEY
- SERPER_API_KEY
- GROQ_API_KEY

## 🤝 Contributing

We welcome contributions to OpenPerPlex! Please feel free to submit issues, fork the repository, and send pull requests.

## 📝 License

This project is licensed under the [MIT License](LICENSE).

## 🙏 Acknowledgements

- [Cohere](https://cohere.ai/) for semantic chunking
- [JINA AI](https://jina.ai/) for reranking
- [serper.dev](https://serper.dev/) for Google search integration
- [Groq](https://groq.com/) for the inference engine
- [Meta](https://www.meta.ai/) for their open-source models

## 📬 Contact

For any questions or feedback, please open an issue on this repository or contact me on [X](https://x.com/KhazzanYassine)

---

Happy searching with OpenPerPlex! 🚀🔍
--------------------------------------------------------------------------------
/build_context.py:
--------------------------------------------------------------------------------
import logging
from jina_rerank import get_reranking_jina
from semantic_chunking import get_chunking

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def build_context(sources_result, query, pro_mode, date_context):
    """
    Build context from search results.

    :param sources_result: Dictionary containing search results
    :param query: Search query string
    :param pro_mode: Boolean indicating whether to use pro mode (reranking)
    :param date_context: Date context string
    :return: Built context as a string
    """
    try:
        combined_list = []

        organic_results = sources_result.get('organic', [])
        graph = sources_result.get('graph')
        answer_box = sources_result.get('answerBox')

        snippets = [
            f"{item['snippet']} {item.get('date', '')}"
            for item in organic_results if 'snippet' in item  # keep only results that carry a snippet
        ]

        combined_list.extend(snippets)

        html_text = " ".join(item['html'] for item in organic_results if 'html' in item)
        if len(html_text) > 200:
            combined_list.extend(get_chunking(html_text))

        # Extract top stories titles
        top_stories = sources_result.get('topStories')
        if top_stories is not None:
            top_stories_titles = [item['title'] for item in top_stories if 'title' in item]
            combined_list.extend(top_stories_titles)

        # Add descriptions and answers from 'graph' and 'answerBox'
        if graph is not None:
            graph_desc = graph.get('description')
            if graph_desc:
                combined_list.append(graph_desc)

        if answer_box is not None:
            for key in ['answer', 'snippet']:
                value = answer_box.get(key)
                if value:  # skip missing or empty values so only strings reach the join/reranking
                    combined_list.append(value)

        if pro_mode:
            # you can choose to use jina or cohere for reranking
            final_list = get_reranking_jina(combined_list, query + date_context, 15)
        else:
            final_list = combined_list

        search_contexts = "\n\n".join(final_list)
        return search_contexts
    except Exception as e:
        logger.exception(f"An error occurred while building context: {e}")
        return ""
--------------------------------------------------------------------------------
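For orientation, a minimal sketch of the dictionary shape that `build_context` expects (not part of the repository; the field names mirror what `get_sources` in sources_searcher.py returns, the values are made up, and `COHERE_API_KEY` must be set because semantic_chunking.py builds its encoder at import time):

```python
from build_context import build_context

# Hypothetical input, shaped like the output of sources_searcher.get_sources()
sources_result = {
    "organic": [
        {"title": "Example", "link": "https://example.com",
         "snippet": "Example snippet text", "date": "2 days ago"},
    ],
    "topStories": [{"title": "Example story", "imageUrl": "https://example.com/img.png"}],
    "graph": {"description": "Example knowledge-graph description"},
    "answerBox": {"answer": "Example direct answer"},
}

context = build_context(sources_result, query="example query",
                        pro_mode=False, date_context="2024-08-01")
print(context)  # snippets, titles and answers joined with blank lines
```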
/cohere_reranking.py:
--------------------------------------------------------------------------------
import os

import cohere


# use ENV variables
COHERE_API_KEY = os.getenv("COHERE_API_KEY")
MODEL = "rerank-multilingual-v3.0"

co = cohere.Client(api_key=COHERE_API_KEY)


def get_reranking_cohere(docs, query, top_res):
    """
    Re-ranks a list of documents based on a query using Cohere's reranking API.

    Args:
        docs (list of str): List of documents to be re-ranked.
        query (str): Query string to rank the documents against.
        top_res (int): Number of top results to return.

    Returns:
        list of str: Top re-ranked documents based on the query.
    """
    try:
        # Call the Cohere rerank API
        response = co.rerank(
            model=MODEL,
            query=query,
            documents=docs,
            top_n=top_res,
            return_documents=True
        )

        # Extract and return the texts of the top documents
        return [item.document.text for item in response.results]

    except Exception as e:
        # Log the error and handle it as needed
        print(f"An error occurred: {e}")
        return []
--------------------------------------------------------------------------------
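build_context.py calls the Jina reranker in pro mode, but — as its inline comment notes — `get_reranking_cohere` is a drop-in alternative. A minimal sketch (assuming `COHERE_API_KEY` is set; the documents and query are placeholders):

```python
from cohere_reranking import get_reranking_cohere

docs = [
    "Paris is the capital of France.",
    "The Eiffel Tower is located in Paris.",
    "Bananas are rich in potassium.",
]

# Same signature as get_reranking_jina(docs, query, top_res) in jina_rerank.py,
# so build_context.py could swap one for the other.
top_docs = get_reranking_cohere(docs, query="Where is the Eiffel Tower?", top_res=2)
print(top_docs)  # the two documents most relevant to the query, best first
```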
/extract_content_from_website.py:
--------------------------------------------------------------------------------
from langchain_community.document_loaders import WebBaseLoader


def extract_website_content(url):
    """
    Extracts and cleans the main content from a given website URL.

    Args:
        url (str): The URL of the website from which to extract content.

    Returns:
        str: The first 4000 characters of the cleaned main content if it is sufficiently long, otherwise an empty string.
    """
    try:
        clean_text = []
        loader = WebBaseLoader(url)
        data = loader.load()

        # Aggregate content using a list to avoid inefficient string concatenation in the loop
        for doc in data:
            if doc.page_content:  # Check if page_content is not None or empty
                clean_text.append(doc.page_content.replace("\n", ""))

        # Join all parts into a single string after processing
        clean_text = "".join(clean_text)

        # Return up to the first 4000 characters if the content is sufficiently long
        return clean_text[:4000] if len(clean_text) > 200 else ""

    except Exception as error:
        print('Error extracting main content:', error)
        return ""
--------------------------------------------------------------------------------
/groq_api.py:
--------------------------------------------------------------------------------
import json
import os
from groq import Groq
from langchain_core.prompts import PromptTemplate
from prompts import search_prompt_system, relevant_prompt_system

# use ENV variables
MODEL = "llama3-70b-8192"
api_key_groq = os.getenv("GROQ_API_KEY")


client = Groq(api_key=api_key_groq)


def get_answer(query, contexts, date_context):
    system_prompt_search = PromptTemplate(input_variables=["date_today"], template=search_prompt_system)

    messages = [
        {"role": "system", "content": system_prompt_search.format(date_today=date_context)},
        {"role": "user", "content": "User Question : " + query + "\n\n CONTEXTS :\n\n" + contexts}
    ]

    try:
        stream = client.chat.completions.create(
            model=MODEL,
            messages=messages,
            stream=True,
            stop=None,
        )

        for chunk in stream:
            if chunk.choices[0].delta.content is not None:
                yield chunk.choices[0].delta.content

    except Exception as e:
        print(f"Error during get_answer_groq call: {e}")
        yield "data:" + json.dumps(
            {'type': 'error', 'data': "We are currently experiencing some issues. Please try again later."}) + "\n\n"


def get_relevant_questions(contexts, query):
    try:
        response = client.chat.completions.create(
            model=MODEL,
            messages=[
                {"role": "system",
                 "content": relevant_prompt_system
                 },
                {"role": "user",
                 "content": "User Query: " + query + "\n\n" + "Contexts: " + "\n" + contexts + "\n"}
            ],
            response_format={"type": "json_object"},
        )

        return response.choices[0].message.content
    except Exception as e:
        print(f"Error during get_relevant_questions call: {e}")
        return {}
--------------------------------------------------------------------------------
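As a rough usage sketch (assuming a valid `GROQ_API_KEY` is set and the calls succeed; the sample strings are placeholders, and `contexts` would normally come from `build_context()`):

```python
import json

from groq_api import get_answer, get_relevant_questions

contexts = "Context snippet one.\n\nContext snippet two."

# get_answer() is a generator that yields the answer token by token
for token in get_answer("example question", contexts, "2024-08-01"):
    print(token, end="", flush=True)

# get_relevant_questions() returns a JSON string following the schema in prompts.py
follow_up = json.loads(get_relevant_questions(contexts, "example question"))
print(follow_up.get("followUp", []))  # e.g. three follow-up question strings
```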
/jina_rerank.py:
--------------------------------------------------------------------------------
import os
import requests
from typing import List
import logging
from requests.exceptions import RequestException

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Constants
API_URL = "https://api.jina.ai/v1/rerank"
API_KEY = os.getenv("JINA_API_KEY")
MODEL = "jina-reranker-v2-base-multilingual"
HEADERS = {
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}"
}


def get_reranking_jina(docs: List[str], query: str, top_res: int) -> List[str]:
    """
    Get reranked documents using Jina AI API.

    :param docs: List of documents to rerank
    :param query: Query string
    :param top_res: Number of top results to return
    :return: List of reranked documents
    """
    try:
        data = {
            "model": MODEL,
            "query": query,
            "documents": docs,
            "top_n": top_res
        }

        response = requests.post(API_URL, headers=HEADERS, json=data, timeout=10)
        response.raise_for_status()
        response_data = response.json()

        return [item['document']['text'] for item in response_data.get('results', [])]

    except RequestException as e:
        logger.error(f"HTTP error occurred while reranking: {e}")
    except KeyError as e:
        logger.error(f"Unexpected response format: {e}")
    except Exception as e:
        logger.exception(f"An unexpected error occurred: {e}")

    return []
--------------------------------------------------------------------------------
/main.py:
--------------------------------------------------------------------------------
import orjson as json
from dotenv import load_dotenv

load_dotenv()

from fastapi.responses import StreamingResponse
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from groq_api import get_answer, get_relevant_questions
from sources_searcher import get_sources
from build_context import build_context
from sources_manipulation import populate_sources


app = FastAPI()

# In production, restrict the origins, e.g. allow_origins=["https://openperplex.com"]
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["GET", "POST"],  # Only the methods the API actually uses
    allow_headers=["*"],  # Allow all headers, or restrict as needed
)


@app.get("/")
def root():
    return {"message": "hello world openperplex v1"}


@app.get("/up_test")
def up_test():
    # health check used for the Kamal deploy
    return {"status": "ok"}


# You can switch this to POST if your typical query is too long for a GET URL
@app.get("/search")
def ask(query: str, date_context: str, stored_location: str, pro_mode: bool = False):
    if not query:
        raise HTTPException(status_code=400, detail="Query cannot be empty")

    def generate():
        try:
            sources_result = get_sources(query, pro_mode, stored_location)
            yield "data:" + json.dumps({'type': 'sources', 'data': sources_result}).decode() + "\n\n"

            if sources_result.get('organic') is not None and pro_mode is True:
                # Set the number of websites to scrape; here: 2
                sources_result['organic'] = populate_sources(sources_result['organic'], 2)

            search_contexts = build_context(sources_result, query, pro_mode, date_context)

            for chunk in get_answer(query, search_contexts, date_context):
                yield "data:" + json.dumps({'type': 'llm', 'text': chunk}).decode() + "\n\n"

            try:
                relevant_questions = get_relevant_questions(search_contexts, query)
                relevant_json = json.loads(relevant_questions)
                yield "data:" + json.dumps({'type': 'relevant', 'data': relevant_json}).decode() + "\n\n"
            except Exception as e:
                print(f"error in relevant questions main.py {e}")
                yield "data:" + json.dumps({'type': 'relevant', 'data': []}).decode() + "\n\n"

            yield "data:" + json.dumps({'type': 'finished', 'data': ""}).decode() + "\n\n"
            yield "event: end-of-stream\ndata: null\n\n"

        except Exception as e:
            print(e)
            yield "data:" + json.dumps(
                {'type': 'error',
                 'data': "We are currently experiencing some issues. Please try again later."}).decode() + "\n\n"

    return StreamingResponse(generate(), media_type="text/event-stream")
--------------------------------------------------------------------------------
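A quick smoke test of the app is possible with FastAPI's TestClient (a sketch, not part of the repository; several modules create their API clients at import time, so a populated `.env` is needed even for routes that never call an external service):

```python
from fastapi.testclient import TestClient

from main import app

client = TestClient(app)

# Basic liveness checks that do not hit any external API
assert client.get("/").json() == {"message": "hello world openperplex v1"}
assert client.get("/up_test").json() == {"status": "ok"}

# An empty query is rejected before any work is done
response = client.get("/search", params={"query": "", "date_context": "", "stored_location": "us"})
assert response.status_code == 400
```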
30 | """ 31 | 32 | relevant_prompt_system = """ 33 | you are a question generator that responds in JSON, tasked with creating an array of 3 follow-up questions in english related 34 | to the user query and contexts provided. 35 | you must keep the questions related to the user query and contexts.don't lose the context in the questions. 36 | 37 | The JSON object must not include special characters. 38 | The JSON schema should include an array of follow-up questions. 39 | 40 | use the schema: 41 | { 42 | "followUp": [ 43 | "string", 44 | "string", 45 | "string" 46 | ] 47 | } 48 | """ 49 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiohappyeyeballs==2.3.4 2 | aiohttp==3.10.0 3 | aiosignal==1.3.1 4 | annotated-types==0.7.0 5 | anyio==4.4.0 6 | attrs==23.2.0 7 | beautifulsoup4==4.12.3 8 | boto3==1.34.152 9 | botocore==1.34.152 10 | certifi==2024.7.4 11 | charset-normalizer==3.3.2 12 | click==8.1.7 13 | cohere==5.6.2 14 | colorama==0.4.6 15 | colorlog==6.8.2 16 | dataclasses-json==0.6.7 17 | distro==1.9.0 18 | fastapi==0.112.0 19 | fastavro==1.9.5 20 | filelock==3.15.4 21 | frozenlist==1.4.1 22 | fsspec==2024.6.1 23 | groq==0.9.0 24 | h11==0.14.0 25 | httpcore==1.0.5 26 | httptools==0.6.1 27 | httpx==0.27.0 28 | httpx-sse==0.4.0 29 | huggingface-hub==0.24.5 30 | idna==3.7 31 | jmespath==1.0.1 32 | jsonpatch==1.33 33 | jsonpointer==3.0.0 34 | langchain==0.2.12 35 | langchain-community==0.2.10 36 | langchain-core==0.2.27 37 | langchain-text-splitters==0.2.2 38 | langsmith==0.1.96 39 | marshmallow==3.21.3 40 | multidict==6.0.5 41 | mypy-extensions==1.0.0 42 | numpy==1.26.4 43 | openai==1.38.0 44 | orjson==3.10.6 45 | packaging==24.1 46 | parameterized==0.9.0 47 | pydantic==2.8.2 48 | pydantic_core==2.20.1 49 | python-dateutil==2.9.0.post0 50 | python-dotenv==1.0.1 51 | PyYAML==6.0.1 52 | regex==2023.12.25 53 | requests==2.32.3 54 | requests-mock==1.12.1 55 | s3transfer==0.10.2 56 | semantic-chunkers==0.0.9 57 | semantic-router==0.0.55 58 | six==1.16.0 59 | sniffio==1.3.1 60 | soupsieve==2.5 61 | SQLAlchemy==2.0.31 62 | starlette==0.37.2 63 | tenacity==8.5.0 64 | tiktoken==0.7.0 65 | tokenizers==0.19.1 66 | tqdm==4.66.4 67 | types-requests==2.32.0.20240712 68 | typing-inspect==0.9.0 69 | typing_extensions==4.12.2 70 | urllib3==2.2.2 71 | uvicorn==0.30.5 72 | uvloop==0.19.0 73 | watchfiles==0.22.0 74 | websockets==12.0 75 | yarl==1.9.4 76 | -------------------------------------------------------------------------------- /semantic_chunking.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from semantic_router.encoders import CohereEncoder 4 | from semantic_chunkers import StatisticalChunker 5 | 6 | COHERE_API_KEY = os.getenv("COHERE_API_KEY") 7 | 8 | encoder = CohereEncoder(cohere_api_key=COHERE_API_KEY, input_type='search_document', 9 | name='embed-multilingual-v3.0') 10 | 11 | chunker = StatisticalChunker(encoder=encoder, max_split_tokens=200) 12 | 13 | 14 | def get_chunking(text): 15 | """ 16 | Splits the provided text into meaningful chunks using a predefined chunker. 17 | 18 | Args: 19 | text (str): The text to be chunked. 20 | 21 | Returns: 22 | list: A list of chunks if the text is sufficiently long and non-empty; otherwise, an empty list. 
23 | """ 24 | try: 25 | chunks = chunker(docs=[text]) 26 | values = [c.content for chunk in chunks for c in chunk] 27 | 28 | return values 29 | 30 | except Exception as e: 31 | print(f"Error during chunking process: {e}") 32 | return [] 33 | -------------------------------------------------------------------------------- /sources_manipulation.py: -------------------------------------------------------------------------------- 1 | from extract_content_from_website import extract_website_content 2 | 3 | 4 | def populate_sources(sources, num_elements): 5 | try: 6 | for i, source in enumerate(sources[:num_elements]): 7 | if not source: 8 | continue 9 | 10 | try: 11 | source['html'] = extract_website_content(source['link']) 12 | sources[i] = source 13 | except Exception as e: 14 | continue 15 | except Exception as e: 16 | print(f"Error in populate_sources: {e}") 17 | return sources 18 | 19 | return sources 20 | -------------------------------------------------------------------------------- /sources_searcher.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | import requests 4 | from typing import Dict, Any, Optional, List 5 | 6 | 7 | # use ENV variables 8 | # Constants 9 | API_URL = "https://google.serper.dev/search" 10 | API_KEY = os.getenv("SERPER_API_KEY") 11 | DEFAULT_LOCATION = 'us' 12 | HEADERS = { 13 | 'X-API-KEY': API_KEY, 14 | 'Content-Type': 'application/json' 15 | } 16 | 17 | 18 | def get_sources(query: str, pro_mode: bool = False, stored_location: Optional[str] = None) -> Dict[str, Any]: 19 | """ 20 | Fetch search results from Serper API. 21 | 22 | :param query: Search query string 23 | :param pro_mode: Boolean to determine the number of results 24 | :param stored_location: Optional location string 25 | :return: Dictionary containing search results 26 | """ 27 | try: 28 | search_location = (stored_location or DEFAULT_LOCATION).lower() 29 | num_results = 10 if pro_mode else 20 30 | 31 | payload = { 32 | "q": query, 33 | "num": num_results, 34 | "gl": search_location 35 | } 36 | 37 | response = requests.post(API_URL, headers=HEADERS, json=payload, timeout=10) 38 | response.raise_for_status() 39 | 40 | data = response.json() 41 | 42 | return { 43 | 'organic': extract_fields(data.get('organic', []), ['title', 'link', 'snippet', 'date']), 44 | 'topStories': extract_fields(data.get('topStories', []), ['title', 'imageUrl']), 45 | 'images': extract_fields(data.get('images', [])[:6], ['title', 'imageUrl']), 46 | 'graph': data.get('knowledgeGraph'), 47 | 'answerBox': data.get('answerBox') 48 | } 49 | 50 | except requests.RequestException as e: 51 | print(f"HTTP error while getting sources: {e}") 52 | except Exception as e: 53 | print(f"Unexpected error while getting sources: {e}") 54 | 55 | return {} 56 | 57 | 58 | def extract_fields(items: List[Dict[str, Any]], fields: List[str]) -> List[Dict[str, Any]]: 59 | """ 60 | Extract specified fields from a list of dictionaries. 61 | 62 | :param items: List of dictionaries 63 | :param fields: List of fields to extract 64 | :return: List of dictionaries with only the specified fields 65 | """ 66 | return [{key: item[key] for key in fields if key in item} for item in items] 67 | --------------------------------------------------------------------------------