├── search_benchmark ├── results │ └── August-03-2024.png ├── shared │ ├── pg.py │ ├── config.py │ └── redis_queue.py ├── ranking │ ├── config.py │ ├── logging_api.py │ ├── ctx_relevancy.py │ ├── ranking.py │ └── ctx_precision.py ├── search │ ├── google_scholar_search.py │ ├── config.py │ ├── exa_search.py │ ├── questions.py │ ├── semantic_scholar_search.py │ ├── responses.py │ ├── lumina_search.py │ └── recursive_search.py └── evals │ ├── cumulative.py │ ├── live_eval.py │ └── graphing.py ├── git.sh ├── requirements.txt ├── dockerfile ├── .gitignore ├── .github └── ISSUE_TEMPLATE │ ├── feature_request.md │ └── bug_report.md ├── docker.sh ├── .env.example ├── config.py ├── benchmark.py ├── compose.yaml └── README.md /search_benchmark/results/August-03-2024.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/lumina-ai-inc/benchmark/HEAD/search_benchmark/results/August-03-2024.png -------------------------------------------------------------------------------- /git.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Get the commit message as a command line argument 4 | m=$1 5 | 6 | git add -A 7 | git commit -m "$m" 8 | git pull 9 | git push -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | redis 2 | psycopg2-binary 3 | anthropic 4 | ragas 5 | datasets 6 | requests 7 | openai 8 | matplotlib 9 | numpy 10 | streamlit 11 | exa_py 12 | python-dotenv 13 | seaborn -------------------------------------------------------------------------------- /dockerfile: -------------------------------------------------------------------------------- 1 | # Use an official Python runtime as a parent image 2 | FROM python:3.11-slim 3 | 4 | # Set the working directory in the container 5 | WORKDIR /usr/src/app 6 | 7 | # Copy the current directory contents into the container at /usr/src/app 8 | COPY . . 9 | 10 | # Install any needed packages specified in requirements.txt 11 | RUN pip install --no-cache-dir -r requirements.txt 12 | 13 | 14 | 15 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .whisper 2 | .venv 3 | .lh 4 | .env 5 | __pycache__ 6 | .DS_Store 7 | .idea 8 | .vscode 9 | /results 10 | cmd.sh 11 | 12 | /search_benchmark/dataset/output/ 13 | # /search_benchmark/dataset/generated_questions.jsonl 14 | # /search_benchmark/dataset/user_queries.jsonl 15 | 16 | # /search_benchmark/search/recursive_search.py 17 | /search_benchmark/search/lumina_recursive_search_new.py 18 | /search_benchmark/dataset/scripts 19 | /search_benchmark/dataset/output 20 | 21 | 22 | /search_benchmark/search/semantic_scholar_search_new.py 23 | docker.sh 24 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Is your feature request related to a problem? Please describe.** 11 | A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 
12 | 13 | **Describe the solution you'd like** 14 | A clear and concise description of what you want to happen. 15 | 16 | **Describe alternatives you've considered** 17 | A clear and concise description of any alternative solutions or features you've considered. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /docker.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Save the current directory path 4 | CURRENT_DIR=$(pwd) 5 | 6 | # Define the Docker image name as a variable 7 | DOCKER_IMAGE_NAME="akhilesh99/benchmark" 8 | 9 | # Get the current commit SHA 10 | SHA=$(git rev-parse --short HEAD) 11 | echo "------------------------" 12 | echo $SHA 13 | echo "------------------------" 14 | 15 | # Build the Docker image with the SHA tag, using the saved path for the Dockerfile 16 | docker build --platform linux/amd64 -t $DOCKER_IMAGE_NAME:$SHA -f $CURRENT_DIR/Dockerfile . 17 | 18 | # Check if the build was successful 19 | if [ $? -eq 0 ]; then 20 | # Push the Docker image with the SHA tag 21 | docker push $DOCKER_IMAGE_NAME:$SHA 22 | 23 | # Optionally, you can also tag and push as latest 24 | docker tag $DOCKER_IMAGE_NAME:$SHA $DOCKER_IMAGE_NAME:latest 25 | docker push $DOCKER_IMAGE_NAME:latest 26 | else 27 | echo "Docker build failed. Skipping push." 28 | exit 1 29 | fi -------------------------------------------------------------------------------- /search_benchmark/shared/pg.py: -------------------------------------------------------------------------------- 1 | import psycopg2 2 | 3 | from psycopg2 import pool, connect 4 | import os 5 | from search_benchmark.shared.config import get_pg_db_name, get_pg_user, get_pg_password, get_pg_host, get_pg_port 6 | 7 | def get_db_connection(): 8 | """ 9 | Creates and returns a new database connection using environment variables. 10 | """ 11 | db_name = get_pg_db_name() 12 | user = get_pg_user() 13 | password = get_pg_password() 14 | host = get_pg_host() 15 | port = get_pg_port() 16 | 17 | print(f"Connecting to database: {db_name}") 18 | print(f"Host: {host}, Port: {port}") 19 | print(f"User: {user}") 20 | 21 | connection = psycopg2.connect(dbname=db_name, user=user, password=password, host=host, port=port) 22 | 23 | print("Database connection established successfully") 24 | 25 | return connection 26 | 27 | def put_db_connection(connection): 28 | if connection: 29 | connection.close() 30 | 31 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | 23 | **Screenshots** 24 | If applicable, add screenshots to help explain your problem. 25 | 26 | **Desktop (please complete the following information):** 27 | - OS: [e.g. iOS] 28 | - Browser [e.g. chrome, safari] 29 | - Version [e.g. 
22] 30 | 31 | **Smartphone (please complete the following information):** 32 | - Device: [e.g. iPhone6] 33 | - OS: [e.g. iOS8.1] 34 | - Browser [e.g. stock browser, safari] 35 | - Version [e.g. 22] 36 | 37 | **Additional context** 38 | Add any other context about the problem here. 39 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | #API Keys 2 | SERP_API_KEY="example" 3 | SEMANTIC_SCHOLAR_API_KEY="example" 4 | OPENAI_API_KEY="example" 5 | ANTHROPIC_API_KEY="example" 6 | EXA_API_KEY="example" 7 | # RERANKER_URL="example" # if you have a reranker, put the url here 8 | 9 | #URLS 10 | EXA_URL="https://api.exa.ai/search" 11 | EXA_CONTENT_URL="https://api.exa.ai/contents" 12 | 13 | API_URL="example - contact us for a Lumina api url" 14 | 15 | 16 | 17 | # you can leave the redis, postgres, img and config values below as is 18 | 19 | #Redis 20 | REDIS_URL='redis://redis:6379' 21 | 22 | #postgres 23 | PG_DB_NAME='postgres' 24 | PG_USER='postgres' 25 | PG_PASSWORD='postgres' 26 | PG_HOST='localhost' 27 | PG_PORT='5432' 28 | 29 | #img 30 | IMG_URL="index.docker.io/akhilesh99/benchmark:latest" 31 | IMG_NAME="akhilesh99/benchmark:latest" 32 | 33 | #Config for benchmark 34 | QUESTION_TYPES=user_queries,generated_questions 35 | METRICS=ctx_relevancy 36 | LLMS=[{"name": "gpt-4o", "api": "openai", "max_tokens": 1024, "temperature": 0}] 37 | NUM_Q=500 38 | PROVIDERS=lumina,google_scholar,semantic_scholar,exa 39 | 40 | 41 | -------------------------------------------------------------------------------- /search_benchmark/shared/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | # Load environment variables from .env file 5 | load_dotenv() 6 | 7 | def get_redis_url(): 8 | return os.getenv('REDIS_URL') 9 | 10 | def get_pg_db_name(): 11 | return os.getenv('PG_DB_NAME') 12 | 13 | def get_pg_user(): 14 | return os.getenv('PG_USER') 15 | 16 | def get_pg_password(): 17 | return os.getenv('PG_PASSWORD') 18 | 19 | def get_pg_host(): 20 | return os.getenv('PG_HOST') 21 | 22 | def get_pg_port(): 23 | return os.getenv('PG_PORT') 24 | 25 | def get_openai_api_key(): 26 | return os.getenv('OPENAI_API_KEY') 27 | 28 | def get_anthropic_api_key(): 29 | return os.getenv('ANTHROPIC_API_KEY') 30 | 31 | def get_exa_api_key(): 32 | return os.getenv('EXA_API_KEY') 33 | 34 | def get_serp_api_key(): 35 | return os.getenv('SERP_API_KEY') 36 | 37 | def get_semantic_scholar_api_key(): 38 | return os.getenv('SEMANTIC_SCHOLAR_API_KEY') 39 | 40 | def get_lumina_api_url(): 41 | return os.getenv('API_URL') 42 | 43 | def get_exa_url(): 44 | return os.getenv('EXA_URL') 45 | 46 | def get_exa_content_url(): 47 | return os.getenv('EXA_CONTENT_URL') 48 | 49 | def get_reranker_url(): 50 | return os.getenv('RERANKER_URL') -------------------------------------------------------------------------------- /search_benchmark/ranking/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | 4 | # Load environment variables from .env file 5 | load_dotenv() 6 | 7 | def get_redis_url(): 8 | return os.getenv('REDIS_URL') 9 | 10 | def get_pg_db_name(): 11 | return os.getenv('PG_DB_NAME') 12 | 13 | def get_pg_user(): 14 | return os.getenv('PG_USER') 15 | 16 | def get_pg_password(): 17 | return os.getenv('PG_PASSWORD') 18 | 19 | def 
get_pg_host(): 20 | return os.getenv('PG_HOST') 21 | 22 | def get_pg_port(): 23 | return os.getenv('PG_PORT') 24 | 25 | def get_openai_api_key(): 26 | return os.getenv('OPENAI_API_KEY') 27 | 28 | def get_anthropic_api_key(): 29 | return os.getenv('ANTHROPIC_API_KEY') 30 | 31 | def get_exa_api_key(): 32 | return os.getenv('EXA_API_KEY') 33 | 34 | def get_serp_api_key(): 35 | return os.getenv('SERP_API_KEY') 36 | 37 | def get_semantic_scholar_api_key(): 38 | return os.getenv('SEMANTIC_SCHOLAR_API_KEY') 39 | 40 | def get_lumina_api_url(): 41 | return os.getenv('API_URL') 42 | 43 | def get_exa_url(): 44 | return os.getenv('EXA_URL') 45 | 46 | def get_exa_content_url(): 47 | return os.getenv('EXA_CONTENT_URL') 48 | 49 | def get_reranker_url(): 50 | return os.getenv('RERANKER_URL') -------------------------------------------------------------------------------- /search_benchmark/search/google_scholar_search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json 3 | from config import get_serp_api_key 4 | 5 | def fetch_google_scholar_results(question): 6 | url = "https://serpapi.com/search" 7 | params = { 8 | "engine": "google_scholar", 9 | "q": question, 10 | "api_key": get_serp_api_key() 11 | } 12 | response = requests.get(url, params=params) 13 | results = response.json() 14 | return results 15 | 16 | def process_google_scholar_results(results): 17 | processed_results = [] 18 | for result in results.get('organic_results', []): 19 | processed_results.append({ 20 | "title": result.get('title', 'No title available'), 21 | "chunks": result.get('snippet', 'No content available'), 22 | "type": "google_scholar" 23 | }) 24 | print(f"Processed {len(processed_results)} results") 25 | return processed_results 26 | 27 | def main(question): 28 | results = fetch_google_scholar_results(question) 29 | if results: 30 | processed_results = process_google_scholar_results(results) 31 | return (processed_results) 32 | print("No results found") 33 | return None 34 | 35 | if __name__ == "__main__": 36 | sample_question = "What are the effects of climate change on biodiversity?" 
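# Note: main() returns a list of dicts shaped like {"title": ..., "chunks": <snippet text>, "type": "google_scholar"} (assembled in process_google_scholar_results above), or None when the SerpAPI response is empty.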
37 | print(f"Running sample search with question: {sample_question}") 38 | result = main(sample_question) 39 | print("Final result:") 40 | print(result) 41 | -------------------------------------------------------------------------------- /search_benchmark/search/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | import json 4 | 5 | # Load environment variables from .env file 6 | load_dotenv() 7 | 8 | def get_redis_url(): 9 | return os.getenv('REDIS_URL') 10 | 11 | def get_pg_db_name(): 12 | return os.getenv('PG_DB_NAME') 13 | 14 | def get_pg_user(): 15 | return os.getenv('PG_USER') 16 | 17 | def get_pg_password(): 18 | return os.getenv('PG_PASSWORD') 19 | 20 | def get_pg_host(): 21 | return os.getenv('PG_HOST') 22 | 23 | def get_pg_port(): 24 | return os.getenv('PG_PORT') 25 | 26 | def get_openai_api_key(): 27 | return os.getenv('OPENAI_API_KEY') 28 | 29 | def get_anthropic_api_key(): 30 | return os.getenv('ANTHROPIC_API_KEY') 31 | 32 | def get_exa_api_key(): 33 | return os.getenv('EXA_API_KEY') 34 | 35 | def get_serp_api_key(): 36 | return os.getenv('SERP_API_KEY') 37 | 38 | def get_semantic_scholar_api_key(): 39 | return os.getenv('SEMANTIC_SCHOLAR_API_KEY') 40 | 41 | def get_lumina_api_url(): 42 | return os.getenv('API_URL') 43 | 44 | def get_exa_url(): 45 | return os.getenv('EXA_URL') 46 | 47 | def get_exa_content_url(): 48 | return os.getenv('EXA_CONTENT_URL') 49 | 50 | def get_reranker_url(): 51 | return os.getenv('RERANKER_URL') 52 | 53 | def get_question_types(): 54 | load_dotenv() 55 | return os.getenv('QUESTION_TYPES', 'user_queries,generated_questions').split(',') 56 | 57 | def get_metrics(): 58 | return os.getenv('METRICS', 'ctx_relevancy').split(',') 59 | 60 | def get_llms(): 61 | llms_str = os.getenv('LLMS', '[{"name": "gpt-4o", "api": "openai", "max_tokens": 1024, "temperature": 0}]') 62 | return json.loads(llms_str) 63 | 64 | def get_providers(): 65 | return os.getenv('PROVIDERS', 'lumina,google_scholar,semantic_scholar').split(',') 66 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dotenv import load_dotenv 3 | import json 4 | 5 | # Load environment variables from .env file 6 | load_dotenv() 7 | 8 | def get_num_q(): 9 | return os.getenv('NUM_Q', 500) 10 | 11 | def get_redis_url(): 12 | return os.getenv('REDIS_URL') 13 | 14 | def get_pg_db_name(): 15 | return os.getenv('PG_DB_NAME') 16 | 17 | def get_pg_user(): 18 | return os.getenv('PG_USER') 19 | 20 | def get_pg_password(): 21 | return os.getenv('PG_PASSWORD') 22 | 23 | def get_pg_host(): 24 | return os.getenv('PG_HOST') 25 | 26 | def get_pg_port(): 27 | return os.getenv('PG_PORT') 28 | 29 | def get_openai_api_key(): 30 | return os.getenv('OPENAI_API_KEY') 31 | 32 | def get_anthropic_api_key(): 33 | return os.getenv('ANTHROPIC_API_KEY') 34 | 35 | def get_exa_api_key(): 36 | return os.getenv('EXA_API_KEY') 37 | 38 | def get_serp_api_key(): 39 | return os.getenv('SERP_API_KEY') 40 | 41 | def get_semantic_scholar_api_key(): 42 | return os.getenv('SEMANTIC_SCHOLAR_API_KEY') 43 | 44 | def get_lumina_api_url(): 45 | return os.getenv('API_URL') 46 | 47 | def get_exa_url(): 48 | return os.getenv('EXA_URL') 49 | 50 | def get_exa_content_url(): 51 | return os.getenv('EXA_CONTENT_URL') 52 | 53 | def get_reranker_url(): 54 | return os.getenv('RERANKER_URL') 55 | 56 | def get_question_types(): 
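# (load_dotenv() already ran at import time above, so the call below simply re-reads .env each time the question types are requested)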
57 | load_dotenv() 58 | return os.getenv('QUESTION_TYPES', 'user_queries,generated_questions').split(',') 59 | 60 | def get_metrics(): 61 | return os.getenv('METRICS', 'ctx_relevancy').split(',') 62 | 63 | def get_llms(): 64 | llms_str = os.getenv('LLMS', '[{"name": "gpt-4o", "api": "openai", "max_tokens": 1024, "temperature": 0}]') 65 | return json.loads(llms_str) 66 | 67 | def get_providers(): 68 | return os.getenv('PROVIDERS', 'lumina,google_scholar,semantic_scholar').split(',') 69 | -------------------------------------------------------------------------------- /search_benchmark/ranking/logging_api.py: -------------------------------------------------------------------------------- 1 | import json 2 | import psycopg2 3 | import os 4 | import sys 5 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 6 | sys.path.append(project_root) 7 | import concurrent.futures 8 | from search_benchmark.shared.redis_queue import RedisQueue 9 | from search_benchmark.shared.config import get_pg_db_name, get_pg_user, get_pg_password, get_pg_host, get_pg_port 10 | 11 | def get_db_connection(): 12 | """ 13 | Creates and returns a new database connection using environment variables. 14 | """ 15 | db_name = get_pg_db_name() 16 | user = get_pg_user() 17 | password = get_pg_password() 18 | host = get_pg_host() 19 | port = get_pg_port() 20 | return psycopg2.connect(dbname=db_name, user=user, password=password, host=host, port=port) 21 | 22 | # Listen to the Redis queue 'table_logs' 23 | def listen_to_table_logs(): 24 | queue = RedisQueue('table_logs') 25 | queue.start_consuming(process_message) 26 | 27 | def process_message(message): 28 | print(f"Processing message from table_logs: {message}") 29 | try: 30 | data = json.loads(message) 31 | table_name = data.get('table') 32 | payload = data.get('payload') 33 | 34 | if not table_name or not payload: 35 | raise ValueError("Both 'table_name' and 'payload' must be provided in the message.") 36 | 37 | # Ensure payload is a JSON string before parsing 38 | if isinstance(payload, str): 39 | payload = json.loads(payload) 40 | 41 | # Insert the data into the specified table in the database 42 | conn = get_db_connection() 43 | cursor = conn.cursor() 44 | 45 | # Assuming the payload is a dictionary of column-value pairs 46 | columns = ', '.join(payload.keys()) 47 | values = ', '.join(['%s'] * len(payload)) 48 | insert_query = f"INSERT INTO {table_name} ({columns}) VALUES ({values})" 49 | 50 | cursor.execute(insert_query, list(payload.values())) 51 | conn.commit() 52 | 53 | cursor.close() 54 | conn.close() 55 | 56 | print(f"Data inserted into table {table_name}: {payload}") 57 | except Exception as e: 58 | print(f"An error occurred while processing the message: {e}") 59 | 60 | # Example usage 61 | if __name__ == "__main__": 62 | listen_to_table_logs() -------------------------------------------------------------------------------- /benchmark.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import uuid 4 | import subprocess 5 | import argparse 6 | import json 7 | from dotenv import load_dotenv 8 | 9 | load_dotenv() 10 | 11 | # Get the absolute path to the project root directory 12 | project_root = os.path.dirname(os.path.abspath(__file__)) 13 | sys.path.append(project_root) 14 | 15 | from search_benchmark.search.questions import main as process_questions 16 | from config import get_question_types, get_metrics, get_llms, get_providers, get_num_q 17 | 18 | def 
run_benchmark(question_types, metrics, llms, providers, num_q): 19 | # Generate a unique run ID 20 | run_id = str(uuid.uuid4()) 21 | print(f"Run ID: {run_id}") 22 | print("SENDING QUESTIONS") 23 | # Process questions for each question type 24 | for question_type in question_types: 25 | process_questions(question_type, metrics, llms, providers, run_id, num_q=num_q) 26 | 27 | # Run the Streamlit app 28 | streamlit_path = os.path.join(project_root, 'search_benchmark', 'evals', 'live_eval.py') 29 | print(f"Attempting to run Streamlit app at: {streamlit_path}") 30 | if os.path.exists(streamlit_path): 31 | subprocess.run(['streamlit', 'run', streamlit_path, '--', f'--run_id={run_id}']) 32 | else: 33 | print(f"Error: The file {streamlit_path} does not exist.") 34 | 35 | if __name__ == "__main__": 36 | # Define default values from config 37 | default_question_types = get_question_types() 38 | default_metrics = get_metrics() 39 | default_llms = get_llms() 40 | default_providers = get_providers() 41 | default_num_q = get_num_q() 42 | 43 | # Set up argument parser 44 | parser = argparse.ArgumentParser(description="Run benchmark with specified parameters") 45 | parser.add_argument("--question_types", nargs="+", default=default_question_types, help="List of question types") 46 | parser.add_argument("--metrics", nargs="+", default=default_metrics, help="List of metrics") 47 | parser.add_argument("--llms", type=json.loads, default=default_llms, help="JSON string of LLM configurations") 48 | parser.add_argument("--providers", nargs="+", default=default_providers, help="List of providers") 49 | parser.add_argument("--num_q", type=int, default=default_num_q, help="Number of questions to send to each provider") 50 | 51 | # Parse arguments 52 | args = parser.parse_args() 53 | 54 | # Run benchmark with parsed arguments 55 | run_benchmark(args.question_types, args.metrics, args.llms, args.providers, args.num_q) -------------------------------------------------------------------------------- /search_benchmark/evals/cumulative.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | import os 4 | from dotenv import load_dotenv 5 | import matplotlib.pyplot as plt 6 | import seaborn as sns 7 | import pandas as pd 8 | 9 | load_dotenv() 10 | 11 | # Add the project root to the Python path 12 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 13 | sys.path.append(project_root) 14 | 15 | from search_benchmark.evals.graphing import load_results_from_db, aggregate_results 16 | 17 | def generate_cumulative_charts(): 18 | st.title("Cumulative Search Benchmark Evaluation") 19 | 20 | # Load all results from the database 21 | all_results = load_results_from_db(run_id=None) # Pass None to get all results 22 | 23 | if all_results: 24 | st.info(f"Raw results loaded: {len(all_results)} entries") 25 | 26 | # Filter results for specific providers, and exclude scores > 1 27 | filtered_results = [ 28 | result for result in all_results 29 | if result[0] in ['exa', 'lumina', 'lumina_recursive', 'semantic_scholar', 'google_scholar'] and 30 | result[2] <= 1 31 | ] 32 | 33 | # Convert to DataFrame for easier manipulation 34 | df = pd.DataFrame(filtered_results, columns=['provider', 'llm', 'score', 'question_type', 'metric']) 35 | 36 | # Group by metric and provider, then calculate mean score across all LLMs 37 | aggregated_results = df.groupby(['metric', 'provider'])['score'].mean().reset_index() 38 | 39 | 
st.info(f"Aggregated results: {len(aggregated_results)} entries") 40 | 41 | # Create and display bar charts 42 | metrics = aggregated_results['metric'].unique() 43 | providers = aggregated_results['provider'].unique() 44 | 45 | for metric in metrics: 46 | plt.figure(figsize=(12, 6)) 47 | 48 | metric_data = aggregated_results[aggregated_results['metric'] == metric] 49 | 50 | sns.barplot(x='provider', y='score', data=metric_data) 51 | plt.title(f'Average {metric.capitalize()} Scores Across Providers (All LLMs)') 52 | plt.xlabel('Provider') 53 | plt.ylabel('Average Score') 54 | plt.xticks(rotation=45) 55 | 56 | st.pyplot(plt) 57 | plt.close() 58 | 59 | else: 60 | st.error("No results found in the database.") 61 | 62 | if __name__ == "__main__": 63 | generate_cumulative_charts() 64 | -------------------------------------------------------------------------------- /search_benchmark/search/exa_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | from exa_py import Exa 4 | from config import get_exa_api_key, get_exa_url, get_exa_content_url 5 | 6 | exa = Exa(api_key=get_exa_api_key()) 7 | 8 | def search_exa(question): 9 | url = "https://api.exa.ai/search" 10 | headers = { 11 | "accept": "application/json", 12 | "content-type": "application/json", 13 | "x-api-key": get_exa_api_key() 14 | } 15 | payload = {"query": question, "category": "research paper", "type": "auto"} 16 | response = requests.post(url, json=payload, headers=headers) 17 | if response.status_code == 200: 18 | return response.json() 19 | else: 20 | print(f"Request failed with status code {response.status_code}: {response.text}") 21 | return None 22 | 23 | def get_content_from_exa(url): 24 | print(f"Getting content for URLs: {url}") 25 | try: 26 | results = exa.get_contents([url]) 27 | # print(f"Content Results: {results}") 28 | return [result.text if result else '' for result in results.results] 29 | except ValueError as e: 30 | print(f"Error fetching content for URLs: {str(e)}") 31 | return "No Content" 32 | 33 | def process_exa_results(results): 34 | print("Processing Exa results") 35 | processed_results = [] 36 | 37 | for result in results: 38 | # print(f"Result: {result}") 39 | url = result.get('url', '') 40 | content = get_content_from_exa(url) 41 | # print(f"Content: {content}") 42 | 43 | content_data = str(content) 44 | # print(f"Content data: {content_data}") 45 | # print(f"Chunks: {chunks}") 46 | processed_result = { 47 | "title": result.get('title', 'No title available'), 48 | "chunks": content_data, 49 | "type": "exa" 50 | } 51 | processed_results.append(processed_result) 52 | 53 | # print(f"Processed {processed_results} results") 54 | return processed_results 55 | 56 | def main(question): 57 | print(f"Processing question: {question}") 58 | exa_results = search_exa(question) 59 | 60 | if exa_results and 'results' in exa_results: 61 | exa_results_list = exa_results['results'] 62 | else: 63 | print("No results found or unexpected response structure from Exa API") 64 | return [] 65 | # print(f"Exa results: {exa_results}") 66 | 67 | processed_results = process_exa_results(exa_results_list) 68 | 69 | return processed_results 70 | 71 | if __name__ == "__main__": 72 | question = "What are the latest advancements in quantum computing?" 
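# main() returns a list of dicts shaped like {"title": ..., "chunks": <stringified page text fetched via exa.get_contents>, "type": "exa"}, or an empty list if the Exa search request fails.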
73 | results = main(question) 74 | print("Results:", results) 75 | 76 | -------------------------------------------------------------------------------- /search_benchmark/search/questions.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import uuid 4 | import sys 5 | import time 6 | import dotenv 7 | dotenv.load_dotenv() 8 | # Get the absolute path to the project root directory 9 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 10 | sys.path.append(project_root) 11 | 12 | from search_benchmark.shared.redis_queue import RedisQueue 13 | from config import get_redis_url 14 | 15 | def main(question_type, metrics, llms, providers, run_id, num_q=0): 16 | questions_file = os.path.join(project_root, 'search_benchmark', 'dataset', f'{question_type}.jsonl') 17 | print(f"Loading questions from file: {questions_file}") 18 | 19 | # Ensure the data directory exists 20 | data_dir = os.path.join(project_root, 'data') 21 | os.makedirs(data_dir, exist_ok=True) 22 | print(f"Data directory ensured at: {data_dir}") 23 | 24 | # Initialize Redis queue 25 | redis_queue = RedisQueue('search_queue', get_redis_url()) 26 | print("Initialized Redis queue.", get_redis_url()) 27 | 28 | # Read questions from file 29 | with open(questions_file, 'r') as f: 30 | questions = [json.loads(line)['question'] for line in f] # Limit to first 200 questions 31 | print(f"Loaded {len(questions)} questions.") 32 | 33 | # Process each question 34 | questions_to_process = questions[:num_q] if num_q != 0 else questions 35 | print(f"Processing {len(questions_to_process)} questions.") 36 | 37 | for question in questions_to_process: 38 | payload = { 39 | 'question': question, 40 | 'metrics': metrics, 41 | 'llms': llms, 42 | 'providers': providers, 43 | 'run_id': run_id, 44 | 'question_type': question_type 45 | } 46 | 47 | # Send payload to Redis queue 48 | redis_queue.send_to_queue(json.dumps(payload)) 49 | print(f"Sent question to Redis queue: {question}") 50 | 51 | # Add a small sleep after sending each question 52 | time.sleep(0.001) 53 | 54 | 55 | if __name__ == "__main__": 56 | # Example usage 57 | question_types = ['generated_questions', 'user_queries'] 58 | metrics = ['ctx_relevancy'] 59 | llms = [ 60 | {"name": "gpt-4o", "api": "openai", "max_tokens": 1024, "temperature": 0} 61 | # {"name": "claude-3-sonnet-20240229", "api": "anthropic", "max_tokens": 1024, "temperature": 0}, 62 | # {"name": "claude-3-haiku-20240307", "api": "anthropic", "max_tokens": 1024, "temperature": 0} 63 | ] 64 | providers = ['lumina', 'google_scholar', 'semantic_scholar'] 65 | # providers = ['lumina_recursive'] 66 | run_id = str(uuid.uuid4()) 67 | 68 | for question_type in question_types: 69 | main(question_type, metrics, llms, providers, run_id, num_q=300) 70 | -------------------------------------------------------------------------------- /search_benchmark/evals/live_eval.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import sys 3 | import os 4 | import argparse 5 | from dotenv import load_dotenv 6 | 7 | load_dotenv() 8 | 9 | # Add the project root to the Python path 10 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 11 | sys.path.append(project_root) 12 | 13 | from search_benchmark.evals.graphing import load_results_from_db, aggregate_results, create_bar_charts 14 | 15 | def generate_charts(run_id): 16 | if run_id: 17 | 
st.info(f"Generating charts for Run ID: {run_id}") 18 | # Load results from the database 19 | raw_results = load_results_from_db(run_id) 20 | 21 | if raw_results: 22 | st.info(f"Raw results loaded: {len(raw_results)} entries") 23 | # Aggregate the results 24 | aggregated_results = aggregate_results(raw_results) 25 | st.info(f"Aggregated results: {len(aggregated_results)} entries") 26 | 27 | # Create and display bar charts 28 | create_bar_charts(aggregated_results, run_id) 29 | 30 | # Display the generated charts 31 | st.subheader("Generated Charts") 32 | 33 | # Get the list of generated chart files 34 | results_dir = os.path.join(project_root, 'results') 35 | print(results_dir) 36 | st.info(f"Looking for charts in: {results_dir}") 37 | chart_files = [f for f in os.listdir(results_dir) if f".png" in f and run_id in f] 38 | st.info(f"Found {len(chart_files)} chart files") 39 | 40 | if chart_files: 41 | for chart_file in chart_files: 42 | chart_path = os.path.join(results_dir, chart_file) 43 | st.image(chart_path, caption=chart_file, use_column_width=True) 44 | 45 | st.success(f"{len(chart_files)} chart(s) displayed successfully!") 46 | else: 47 | st.warning("No charts were found for the given Run ID.") 48 | else: 49 | st.error("No results found for the given Run ID.") 50 | else: 51 | st.warning("Please enter a Run ID.") 52 | 53 | def main(): 54 | st.title("Search Benchmark Evaluation") 55 | 56 | # Parse command-line arguments 57 | parser = argparse.ArgumentParser() 58 | parser.add_argument("--run_id", type=str, help="Run ID to use") 59 | args = parser.parse_args() 60 | 61 | # Input for run ID, with default value from command-line argument 62 | run_id = st.text_input("Enter Run ID:", value=args.run_id if args.run_id else "") 63 | 64 | # Generate Charts button 65 | if st.button("Generate Charts"): 66 | generate_charts(run_id) 67 | 68 | # Refresh button 69 | if st.button("Refresh"): 70 | generate_charts(run_id) 71 | 72 | if __name__ == "__main__": 73 | main() -------------------------------------------------------------------------------- /search_benchmark/search/semantic_scholar_search.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import time 3 | from config import get_semantic_scholar_api_key 4 | 5 | def get_paper_data(paper_id): 6 | time.sleep(1) 7 | url = f'https://api.semanticscholar.org/graph/v1/paper/{paper_id}' 8 | paper_data_query_params = {'fields': 'title,abstract'} 9 | max_retries = 3 10 | for attempt in range(max_retries): 11 | response = requests.get(url, params=paper_data_query_params) 12 | if response.status_code == 200: 13 | print(f"Retrieved data for paper id: {paper_id}") 14 | return response.json() 15 | elif response.status_code == 429: 16 | print(f"Rate limit exceeded for paper id: {paper_id}. 
Retrying in 1 second...") 17 | time.sleep(1) 18 | else: 19 | break 20 | print(f"Failed to retrieve data for paper id: {paper_id} after {max_retries} attempts") 21 | return None 22 | 23 | def fetch_paper_details(question): 24 | print(f"Question: {question}") 25 | url = 'https://api.semanticscholar.org/graph/v1/paper/search' 26 | query_params = {'query': question, 'limit': 10} 27 | headers = {'x-api-key': get_semantic_scholar_api_key()} 28 | 29 | max_retries = 5 30 | for attempt in range(max_retries): 31 | response = requests.get(url, params=query_params, headers=headers) 32 | if response.status_code == 200: 33 | search_response = response.json() 34 | print(f"Search response received for question: {question}") 35 | 36 | if 'data' in search_response and search_response['data']: 37 | paper_ids = [paper['paperId'] for paper in search_response['data']] 38 | paper_details_list = [get_paper_data(paper_id) for paper_id in paper_ids] 39 | return [details for details in paper_details_list if details] 40 | else: 41 | print("No data available in the response.") 42 | return None 43 | elif response.status_code == 429: 44 | print(f"Rate limit exceeded for question: {question}. Retrying in 1 second...") 45 | time.sleep(1) 46 | else: 47 | print(f"Request failed with status code {response.status_code}: {response.text}") 48 | break 49 | print(f"Failed to fetch paper details for question: {question} after {max_retries} attempts") 50 | return None 51 | 52 | def process_question(question): 53 | paper_details_list = fetch_paper_details(question) 54 | if paper_details_list: 55 | contexts = [ 56 | { 57 | "title": paper_details.get('title', 'No title available'), 58 | "chunks": paper_details.get('abstract', 'No abstract available'), 59 | "type": "semantic_scholar" 60 | } 61 | for paper_details in paper_details_list 62 | ] 63 | print(f"Added {len(contexts)} results for question") 64 | else: 65 | contexts = [{ 66 | "title": "", 67 | "chunks": "", 68 | "type": "semantic_scholar" 69 | }] 70 | print("Added empty result for question") 71 | return contexts 72 | 73 | def main(question): 74 | return process_question(question) 75 | 76 | if __name__ == "__main__": 77 | sample_question = "physics" 78 | result = main(sample_question) 79 | print(result) 80 | -------------------------------------------------------------------------------- /search_benchmark/search/responses.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import os 4 | import time 5 | 6 | # Get the absolute path to the project root directory 7 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | sys.path.append(project_root) 9 | 10 | from search_benchmark.shared.redis_queue import RedisQueue 11 | import search_benchmark.search.lumina_search as lumina_search 12 | import search_benchmark.search.semantic_scholar_search as semantic_scholar_search 13 | import search_benchmark.search.google_scholar_search as google_scholar_search 14 | import search_benchmark.search.exa_search as exa_search 15 | import search_benchmark.search.recursive_search as recursive_search # Importing recursive_search 16 | 17 | def process_search_request(payload): 18 | question = payload['question'] 19 | providers = payload['providers'] 20 | results = {} 21 | 22 | for provider in providers: 23 | if provider == 'lumina': 24 | results['lumina'] = lumina_search.main(question) 25 | elif provider == 'semantic_scholar': 26 | results['semantic_scholar'] = 
semantic_scholar_search.main(question) 27 | elif provider == 'google_scholar': 28 | results['google_scholar'] = google_scholar_search.main(question) 29 | elif provider == 'exa': 30 | results['exa'] = exa_search.main(question) 31 | elif provider == 'lumina_recursive': 32 | results['lumina_recursive'] = recursive_search.main(lumina_search.main, question, recursion_depth=1, page_size=10, page_size_per_recursion=3) # Using recursive_search for lumina_recursive 33 | elif provider == 'semantic_scholar_recursive': 34 | results['semantic_scholar_recursive'] = recursive_search.main(semantic_scholar_search.main, question, recursion_depth=1, page_size=10, page_size_per_recursion=3) # Using recursive_search for semantic_scholar_recursive 35 | elif provider == 'google_scholar_recursive': 36 | results['google_scholar_recursive'] = recursive_search.main(google_scholar_search.main, question, recursion_depth=1, page_size=10, page_size_per_recursion=3) # Using recursive_search for google_scholar_recursive 37 | time.sleep(0.05) # Small sleep after each provider search 38 | 39 | return results 40 | 41 | def listen_to_search_queue(): 42 | redis_queue = RedisQueue('search_queue') 43 | result_queue = RedisQueue('result_queue') 44 | 45 | def callback(body): 46 | payload = json.loads(body) 47 | results = process_search_request(payload) 48 | 49 | for provider, provider_results in results.items(): 50 | # print(f"Results for provider {provider}: {provider_results}") 51 | new_payload = { 52 | 'question': payload['question'], 53 | 'results': json.dumps(provider_results), 54 | 'provider': provider, 55 | 'llms': payload['llms'], 56 | 'metrics': payload['metrics'], 57 | 'run_id': payload['run_id'], 58 | 'question_type': payload['question_type'] 59 | } 60 | 61 | result_queue.send_to_queue(json.dumps(new_payload)) 62 | time.sleep(0.05) # Small sleep after sending each result 63 | 64 | print(f"Processed question: {payload['question']}") 65 | print(f"Results sent to result queue for each provider") 66 | 67 | redis_queue.start_consuming(callback) 68 | 69 | if __name__ == "__main__": 70 | listen_to_search_queue() -------------------------------------------------------------------------------- /compose.yaml: -------------------------------------------------------------------------------- 1 | version: '3.8' 2 | 3 | services: 4 | logging_api: 5 | depends_on: 6 | - redis 7 | - pg 8 | image: akhilesh99/benchmark:latest 9 | command: ["python3", "search_benchmark/ranking/logging_api.py"] 10 | deploy: 11 | mode: replicated 12 | replicas: 0 13 | env_file: 14 | - path: .env 15 | required: false 16 | environment: 17 | - REDIS_URL=redis://redis:6379 18 | - PG_DB_NAME=postgres 19 | - PG_USER=postgres 20 | - PG_PASSWORD=postgres 21 | - PG_HOST=pg 22 | - PG_PORT=5432 23 | - OPENAI_API_KEY=${OPENAI_API_KEY} 24 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 25 | - EXA_API_KEY=${EXA_API_KEY} 26 | - SERP_API_KEY=${SERP_API_KEY} 27 | - SEMANTIC_SCHOLAR_API_KEY=${SEMANTIC_SCHOLAR_API_KEY} 28 | - API_URL=${API_URL} 29 | 30 | questions: 31 | depends_on: 32 | - redis 33 | - pg 34 | image: akhilesh99/benchmark:latest 35 | command: ["python3", "benchmark.py"] 36 | deploy: 37 | mode: replicated 38 | replicas: 1 39 | env_file: 40 | - path: .env 41 | required: false 42 | environment: 43 | - REDIS_URL=redis://redis:6379 44 | - PG_DB_NAME=postgres 45 | - PG_USER=postgres 46 | - PG_PASSWORD=postgres 47 | - PG_HOST=pg 48 | - PG_PORT=5432 49 | - OPENAI_API_KEY=${OPENAI_API_KEY} 50 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 51 | - EXA_API_KEY=${EXA_API_KEY} 52 | - 
SERP_API_KEY=${SERP_API_KEY} 53 | - SEMANTIC_SCHOLAR_API_KEY=${SEMANTIC_SCHOLAR_API_KEY} 54 | - API_URL=${API_URL} 55 | ports: 56 | - "8501:8501" 57 | 58 | responses: 59 | depends_on: 60 | - redis 61 | - pg 62 | image: akhilesh99/benchmark:latest 63 | command: ["python3", "search_benchmark/search/responses.py"] 64 | deploy: 65 | mode: replicated 66 | replicas: 0 67 | env_file: 68 | - path: .env 69 | required: false 70 | environment: 71 | - REDIS_URL=redis://redis:6379 72 | - PG_DB_NAME=postgres 73 | - PG_USER=postgres 74 | - PG_PASSWORD=postgres 75 | - PG_HOST=pg 76 | - PG_PORT=5432 77 | - OPENAI_API_KEY=${OPENAI_API_KEY} 78 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 79 | - EXA_API_KEY=${EXA_API_KEY} 80 | - SERP_API_KEY=${SERP_API_KEY} 81 | - SEMANTIC_SCHOLAR_API_KEY=${SEMANTIC_SCHOLAR_API_KEY} 82 | - API_URL=${API_URL} 83 | 84 | ranking: 85 | depends_on: 86 | - redis 87 | - pg 88 | image: akhilesh99/benchmark:latest 89 | command: ["python3", "search_benchmark/ranking/ranking.py"] 90 | deploy: 91 | mode: replicated 92 | replicas: 0 93 | env_file: 94 | - path: .env 95 | required: false 96 | environment: 97 | - REDIS_URL=redis://redis:6379 98 | - PG_DB_NAME=postgres 99 | - PG_USER=postgres 100 | - PG_PASSWORD=postgres 101 | - PG_HOST=pg 102 | - PG_PORT=5432 103 | - OPENAI_API_KEY=${OPENAI_API_KEY} 104 | - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} 105 | - EXA_API_KEY=${EXA_API_KEY} 106 | - SERP_API_KEY=${SERP_API_KEY} 107 | - SEMANTIC_SCHOLAR_API_KEY=${SEMANTIC_SCHOLAR_API_KEY} 108 | - API_URL=${API_URL} 109 | 110 | redis: 111 | image: redis:alpine 112 | ports: 113 | - "6379:6379" 114 | volumes: 115 | - redis_data:/data 116 | 117 | pg: 118 | image: postgres 119 | shm_size: 128mb 120 | environment: 121 | POSTGRES_PASSWORD: postgres 122 | volumes: 123 | - pg_data:/var/lib/postgresql/data 124 | 125 | adminer: 126 | image: adminer 127 | restart: always 128 | ports: 129 | - "8080:8080" 130 | 131 | 132 | volumes: 133 | redis_data: 134 | pg_data: -------------------------------------------------------------------------------- /search_benchmark/shared/redis_queue.py: -------------------------------------------------------------------------------- 1 | import redis 2 | import sys 3 | import os 4 | import time 5 | from search_benchmark.shared.config import get_redis_url 6 | import dotenv 7 | 8 | class RedisQueue: 9 | def __init__(self, queue='task_queue', model=None): 10 | try: 11 | dotenv.load_dotenv() 12 | timeout = 3600 13 | redis_url = get_redis_url() 14 | self._redis = redis.Redis.from_url(redis_url, socket_timeout=timeout) 15 | self._queue = queue 16 | self.model = model 17 | print(f"Connected to Redis: {redis_url}") 18 | except Exception as e: 19 | print(f"Failed to connect to Redis: {e}") 20 | 21 | def send_to_queue(self, body): 22 | try: 23 | print(f"Sending message to queue: {body}") 24 | self._redis.lpush(self._queue, body) 25 | print(f" [x] Sent {body}") 26 | except Exception as e: 27 | print(f"An error occurred in redis: {e}") 28 | 29 | def start_consuming(self, callback): 30 | print(" [*] Waiting for messages. To exit press CTRL+C") 31 | try: 32 | while True: 33 | body = self._redis.rpop(self._queue) 34 | if body: 35 | retries = 5 36 | while retries > 0: 37 | try: 38 | callback(body) 39 | break 40 | except Exception as e: 41 | retries -= 1 42 | print(f"Callback failed, retrying... ({5 - retries}/5). 
Error: {e}") 43 | if retries == 0: 44 | print("Callback failed after 5 retries.") 45 | else: 46 | # Sleep to prevent tight looping when the queue is empty 47 | time.sleep(1) 48 | except KeyboardInterrupt: 49 | print('\nInterrupted') 50 | try: 51 | sys.exit(0) 52 | except SystemExit: 53 | os._exit(0) 54 | except Exception as e: 55 | print(f"An error occurred in start_consuming: {e}") 56 | 57 | def close_connection(self): 58 | # Redis connection is managed automatically, so there might not be a need to explicitly close it. 59 | print("Connection closed") 60 | def start_consuming_batch_loop(self, callback, count): 61 | print("[*] Waiting for messages. To exit press CTRL+C") 62 | try: 63 | while True: 64 | # Use a pipeline to execute multiple RPOP commands simultaneously 65 | pipeline = self._redis.pipeline() 66 | for _ in range(count): 67 | pipeline.rpop(self._queue) 68 | batch = pipeline.execute() 69 | 70 | # Filter out None values (in case the queue is empty) 71 | batch = [body for body in batch if body] 72 | 73 | if batch: 74 | retries = 5 75 | while retries > 0: 76 | try: 77 | callback(batch) 78 | break 79 | except Exception as e: 80 | retries -= 1 81 | print(f"Callback batch failed, retrying... ({5 - retries}/5). Error: {e}") 82 | if retries == 0: 83 | print("Callback batch failed after 5 retries.") 84 | else: 85 | # Sleep to prevent tight looping when the queue is empty 86 | time.sleep(1) 87 | except KeyboardInterrupt: 88 | print('\nInterrupted') 89 | try: 90 | sys.exit(0) 91 | except SystemExit: 92 | os._exit(0) 93 | except Exception as e: 94 | print(f"An error occurred in start_consuming_batch_loop: {e}") 95 | 96 | # Example callback function 97 | def process_message(message): 98 | print(f"Processing message: {message}") 99 | 100 | # Example usage 101 | if __name__ == "__main__": 102 | queue = RedisQueue('my_task_queue') 103 | queue.send_to_queue('Hello, Redis!') 104 | queue.start_consuming(process_message) -------------------------------------------------------------------------------- /search_benchmark/ranking/ctx_relevancy.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import json 3 | import anthropic 4 | import re 5 | from config import get_openai_api_key, get_anthropic_api_key 6 | from typing import List, Dict 7 | 8 | CONTEXT_RELEVANCE_PROMPT = """Given a question and a single context, determine if this context is absolutely essential and irreplaceable for answering the question. The context contains a 'title', 'chunks', and 'doi'. 9 | 10 | Respond with either '' or ''. Base your decision solely on whether the 'chunks' content provides unique, indispensable information that directly answers the question. 11 | 12 | If no context is provided or if the context is empty, respond with ''. Do not bias any better or worse for length of context. It should not matter. 
13 | 14 | Question: {question} 15 | 16 | Context: 17 | {context}""" 18 | 19 | def context_relevancy_score(question: str, contexts: List[Dict], model: Dict[str, str]) -> float: 20 | print(f"Starting context relevancy scoring for question: {question[:30]}...") 21 | if model["api"] == "openai": 22 | api_key = get_openai_api_key() 23 | client = openai.OpenAI(api_key=api_key) 24 | print("Using OpenAI API") 25 | elif model["api"] == "anthropic": 26 | api_key = get_anthropic_api_key() 27 | client = anthropic.Anthropic(api_key=api_key) 28 | print("Using Anthropic API") 29 | else: 30 | raise ValueError(f"Unsupported API: {model['api']}") 31 | 32 | print(f"Number of contexts: {len(contexts)}") 33 | 34 | highly_relevant_count = 0 35 | 36 | for context in contexts: 37 | prompt = CONTEXT_RELEVANCE_PROMPT.format( 38 | question=question, 39 | context=json.dumps(context, indent=2) 40 | ) 41 | 42 | print(f"Sending request to {model['api']} API...") 43 | print(f"Prompt: {prompt}") 44 | if model["api"] == "openai": 45 | response = client.chat.completions.create( 46 | model=model["name"], 47 | messages=[{"role": "user", "content": prompt}], 48 | max_tokens=model["max_tokens"], 49 | n=1, 50 | stop=None, 51 | temperature=model["temperature"], 52 | ) 53 | response_text = response.choices[0].message.content 54 | elif model["api"] == "anthropic": 55 | response = client.messages.create( 56 | model=model["name"], 57 | max_tokens=model["max_tokens"], 58 | messages=[ 59 | { 60 | "role": "user", 61 | "content": prompt, 62 | } 63 | ], 64 | temperature=model["temperature"], 65 | ) 66 | response_text = response.content[0].text 67 | 68 | print(f"Response from {model['api']} API: {response_text}") 69 | 70 | if '' in response_text.lower(): 71 | highly_relevant_count += 1 72 | 73 | score = highly_relevant_count / len(contexts) if len(contexts) > 0 else 0 74 | print(f"Computed score: {score}") 75 | return score 76 | 77 | def evaluate_context_relevancy(data_list: List[Dict], api_key: str) -> List[Dict]: 78 | print(f"Starting evaluation of context relevancy for {len(data_list)} items...") 79 | results = [] 80 | for data in data_list: 81 | question = data["question"] 82 | results_json = json.loads(data["results"]) 83 | contexts = results_json 84 | provider = data["provider"] 85 | llms = data["llms"] 86 | 87 | for model in llms: 88 | print(f"Processing question: {question[:30]}, provider: {provider}, model: {model['name']}") 89 | score = context_relevancy_score(question, contexts, model) 90 | result = { 91 | "question": data["question"], 92 | "provider": data["provider"], 93 | "model": model["name"], 94 | "score": score 95 | } 96 | results.append(result) 97 | print(f"Completed evaluation for question: {data['question'][:30]}, provider: {data['provider']}, model: {model['name']}") 98 | 99 | print(f"Completed evaluation of context relevancy for all items") 100 | return results 101 | 102 | def batch_evaluate_context_relevancy(data_list: List[Dict], api_key: str) -> List[Dict]: 103 | print(f"Starting batch evaluation of context relevancy for {len(data_list)} items...") 104 | results = evaluate_context_relevancy(data_list, api_key) 105 | print(f"Completed batch evaluation of context relevancy") 106 | return results 107 | -------------------------------------------------------------------------------- /search_benchmark/ranking/ranking.py: -------------------------------------------------------------------------------- 1 | import json 2 | import sys 3 | import os 4 | import time 5 | import uuid 6 | from datetime import datetime 7 | 
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 8 | sys.path.append(project_root) 9 | import concurrent.futures 10 | from search_benchmark.shared.redis_queue import RedisQueue 11 | from search_benchmark.ranking.ctx_relevancy import batch_evaluate_context_relevancy 12 | from search_benchmark.ranking.ctx_precision import batch_evaluate_context_precision 13 | from search_benchmark.ranking.config import get_openai_api_key, get_anthropic_api_key 14 | from search_benchmark.shared.pg import get_db_connection, put_db_connection 15 | 16 | 17 | def create_eval_table_if_not_exists(): 18 | conn = get_db_connection() 19 | try: 20 | with conn.cursor() as cur: 21 | cur.execute(""" 22 | CREATE TABLE IF NOT EXISTS eval ( 23 | id UUID DEFAULT gen_random_uuid() PRIMARY KEY, 24 | created_at TIMESTAMPTZ DEFAULT now() NOT NULL, 25 | run_id UUID NOT NULL, 26 | question TEXT, 27 | metric TEXT, 28 | results TEXT, 29 | score NUMERIC, 30 | provider TEXT, 31 | llm TEXT, 32 | question_type TEXT 33 | ) 34 | """) 35 | conn.commit() 36 | except Exception as e: 37 | conn.rollback() 38 | finally: 39 | put_db_connection(conn) 40 | 41 | def send_eval_row_to_redis(eval_row): 42 | 43 | table_logs_queue = RedisQueue('table_logs') 44 | try: 45 | log_data = { 46 | 'table': 'eval', 47 | 'payload': eval_row 48 | } 49 | table_logs_queue.send_to_queue(json.dumps(log_data)) 50 | except Exception as e: 51 | print(f"Error sending eval row to Redis queue: {e}") 52 | 53 | def process_batch(batch): 54 | 55 | data_list = [] 56 | for payload in batch: 57 | print(f"Debug: Processing payload: {payload}") 58 | question = payload['question'] 59 | results = payload['results'] 60 | provider = payload['provider'] 61 | llms = payload['llms'] 62 | question_type = payload['question_type'] 63 | metrics = payload['metrics'] 64 | data_list.append({ 65 | 'question': question, 66 | 'results': results, 67 | 'provider': provider, 68 | 'llms': llms, 69 | 'question_type': question_type, 70 | 'metrics': metrics 71 | }) 72 | 73 | api_key = get_openai_api_key() if 'openai' in [llm['api'] for data in data_list for llm in data['llms']] else get_anthropic_api_key() 74 | 75 | try: 76 | for metric in data_list[0]['metrics']: 77 | print(f"Debug: Processing metric: {metric}") 78 | if metric == 'ctx_precision': 79 | scores = batch_evaluate_context_precision(data_list, api_key) 80 | else: 81 | scores = batch_evaluate_context_relevancy(data_list, api_key) 82 | 83 | for score_data in scores: 84 | print(f"Debug: Processing score data: {score_data}") 85 | eval_row = { 86 | 'run_id': str(payload['run_id']), 87 | 'question': score_data['question'], 88 | 'metric': metric, 89 | 'results': json.dumps(results), 90 | 'score': score_data['score'], 91 | 'provider': score_data['provider'], 92 | 'llm': str(score_data['model']), 93 | 'question_type': payload['question_type'] 94 | } 95 | try: 96 | send_eval_row_to_redis(eval_row) 97 | except Exception as e: 98 | print(f"Error sending eval row to Redis: {e}") 99 | 100 | print(f"Processed batch of {len(batch)} results") 101 | except Exception as e: 102 | print(f"Error processing batch: {e}") 103 | 104 | def listen_to_result_queue(batch_size=1): 105 | 106 | result_queue = RedisQueue('result_queue') 107 | batch = [] 108 | 109 | def callback(body): 110 | nonlocal batch 111 | print("Debug: Callback function called") 112 | payload = json.loads(body) 113 | batch.append(payload) 114 | 115 | 116 | if len(batch) >= batch_size: 117 | process_batch(batch) 118 | batch = [] 119 | 120 | 
result_queue.start_consuming(callback) 121 | 122 | def main(): 123 | print("Starting ranking process...") 124 | create_eval_table_if_not_exists() 125 | listen_to_result_queue() 126 | print("Ranking process completed.") 127 | 128 | if __name__ == "__main__": 129 | print("Debug: Script started") 130 | main() 131 | print("Debug: Script finished") 132 | -------------------------------------------------------------------------------- /search_benchmark/search/lumina_search.py: -------------------------------------------------------------------------------- 1 | import json 2 | import requests 3 | from config import get_lumina_api_url 4 | 5 | def query_api(question, page_size: int = 10): 6 | print(f"Querying API for question: {question}") 7 | url = f"{get_lumina_api_url()}/search" 8 | headers = {"Content-Type": "application/json"} 9 | data = { 10 | "query": question, 11 | "dataset_id": "c5bbe32b-4fb7-476a-81aa-fe269f67f283", 12 | "page": 1, 13 | "filters": {"must": None, "must_not": None, "should": None}, 14 | "page_size": page_size, 15 | "group_size": 3, 16 | "search_type": "hybrid", 17 | "slim_chunks": False, 18 | "highlight_results": True, 19 | "highlight_threshold": 0.8, 20 | "highlight_delimiters": ["?", ".", "!"], 21 | "highlight_max_length": 20, 22 | "highlight_max_num": 4, 23 | "recency_bias": 3, 24 | "get_total_pages": False, 25 | "score_threshold": 0, 26 | "get_collisions": True 27 | } 28 | response = requests.post(url, headers=headers, json=data) 29 | 30 | if response.status_code == 200: 31 | try: 32 | print("API query successful") 33 | return response.json() 34 | except requests.exceptions.JSONDecodeError: 35 | print(f"Error: Unable to decode JSON. Response content: {response.text}") 36 | return None 37 | else: 38 | print(f"Error: API request failed with status code {response.status_code}. Response content: {response.text}") 39 | return None 40 | 41 | def process_results_lumina(results): 42 | print("Processing Lumina results") 43 | processed_results = [] 44 | doi = "" 45 | for group_chunk in results['group_chunks']: 46 | group_name = group_chunk['group_name'] 47 | group_data = {"title": group_name, "chunks": "", "doi": doi, "type": "lumina"} 48 | for chunk in group_chunk['metadata']: 49 | metadata = chunk['metadata'][0] 50 | doi = metadata['metadata']['doi'] 51 | content_html = metadata['chunk_html'] 52 | if content_html != doi: 53 | group_data["chunks"] += content_html 54 | group_data["doi"] = doi 55 | processed_results.append(group_data) 56 | print(f"Processed {len(processed_results)} results") 57 | return processed_results 58 | 59 | def get_abstract(group_tracking_id): 60 | print(f"Fetching abstract for group_tracking_id: {group_tracking_id}") 61 | url = f"{get_lumina_api_url()}/file/{group_tracking_id}" 62 | response = requests.get(url) 63 | if response.status_code == 200: 64 | file_metadata = response.json() 65 | abstract = file_metadata.get('abstract', None) 66 | print("Abstract fetched successfully" if abstract else "No abstract found") 67 | return abstract 68 | else: 69 | print(f"Failed to fetch abstract. 
Status code: {response.status_code}") 70 | return None 71 | 72 | def process_results_lumina_abstracts(results): 73 | print("Processing Lumina results with abstracts") 74 | processed_results = [] 75 | for group_chunk in results['group_chunks']: 76 | group_name = group_chunk['group_name'] 77 | group_tracking_id = group_chunk['group_tracking_id'] 78 | group_data = {"title": group_name, "chunks": "", "doi": "", "type": "lumina"} 79 | 80 | abstract = get_abstract(group_tracking_id) 81 | if abstract: 82 | group_data["chunks"] = abstract 83 | for chunk in group_chunk['metadata']: 84 | metadata = chunk['metadata'][0] 85 | doi = metadata['metadata']['doi'] 86 | group_data["doi"] = doi 87 | if group_data["doi"]: 88 | break 89 | else: 90 | print("No abstract found, processing chunks") 91 | for chunk in group_chunk['metadata']: 92 | metadata = chunk['metadata'][0] 93 | doi = metadata['metadata']['doi'] 94 | content_html = metadata['chunk_html'] 95 | group_data["chunks"] += content_html 96 | group_data["doi"] = doi 97 | 98 | processed_results.append(group_data) 99 | print(f"Processed {len(processed_results)} results with abstracts") 100 | return processed_results 101 | 102 | def process_question(question, type='full', page_size=10): 103 | print(f"Processing question: {question}") 104 | results = query_api(question, page_size=page_size) 105 | if results is None: 106 | print("No results found for the question") 107 | return [] 108 | if type == 'full': 109 | return process_results_lumina(results) 110 | else: 111 | return process_results_lumina_abstracts(results) 112 | 113 | def main(question, type='full'): 114 | print(f"Processing question: {question}") 115 | results = process_question(question) 116 | return results 117 | 118 | 119 | if __name__ == "__main__": 120 | question = "What is the role of mitochondria in cellular energy production?" 
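# Note: main() never forwards its type argument (process_question is called with its default), so this always runs the 'full' pipeline and each result looks like {"title": <group name>, "chunks": <concatenated chunk_html>, "doi": ..., "type": "lumina"}.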
121 | results = main(question) 122 | print(json.dumps(results, indent=2)) 123 | 124 | -------------------------------------------------------------------------------- /search_benchmark/evals/graphing.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import matplotlib.pyplot as plt 4 | import numpy as np 5 | import sys 6 | from collections import defaultdict 7 | 8 | project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) 9 | sys.path.append(project_root) 10 | 11 | from search_benchmark.shared.pg import get_db_connection, put_db_connection 12 | 13 | def load_results_from_db(run_id): 14 | conn = get_db_connection() 15 | cur = conn.cursor() 16 | 17 | if run_id is None: 18 | query = """ 19 | SELECT provider, llm, score, question_type, metric 20 | FROM eval 21 | """ 22 | cur.execute(query) 23 | else: 24 | query = """ 25 | SELECT provider, llm, score, question_type, metric 26 | FROM eval 27 | WHERE run_id::text = %s 28 | """ 29 | cur.execute(query, (run_id,)) 30 | 31 | results = cur.fetchall() 32 | 33 | cur.close() 34 | put_db_connection(conn) 35 | 36 | if not results: 37 | print(f"No results found" + (f" for run_id: {run_id}" if run_id else "")) 38 | else: 39 | print(f"Results loaded successfully") 40 | 41 | return results 42 | 43 | def aggregate_results(results): 44 | aggregated = defaultdict(lambda: defaultdict(lambda: defaultdict(lambda: defaultdict(list)))) 45 | for provider, llm, score, question_type, metric in results: 46 | aggregated[metric][question_type][provider][llm].append(score) 47 | 48 | final_results = [] 49 | for metric, question_types in aggregated.items(): 50 | for question_type, providers in question_types.items(): 51 | for provider, llm_scores in providers.items(): 52 | for llm, scores in llm_scores.items(): 53 | avg_score = sum(scores) / len(scores) 54 | final_results.append((metric, question_type, provider, llm, avg_score, len(scores))) 55 | 56 | return final_results 57 | 58 | def get_providers_and_llms_for_run(run_id): 59 | conn = get_db_connection() 60 | cursor = conn.cursor() 61 | 62 | query = """ 63 | SELECT DISTINCT provider, llm, question_type, metric 64 | FROM eval 65 | WHERE run_id::text = %s 66 | """ 67 | 68 | cursor.execute(query, (run_id,)) 69 | results = cursor.fetchall() 70 | 71 | providers = set() 72 | llms = set() 73 | question_types = set() 74 | metrics = set() 75 | 76 | for provider, llm, question_type, metric in results: 77 | providers.add(provider) 78 | llms.add(llm) 79 | question_types.add(question_type) 80 | metrics.add(metric) 81 | 82 | cursor.close() 83 | put_db_connection(conn) 84 | 85 | return list(providers), list(llms), list(question_types), list(metrics) 86 | 87 | def create_bar_charts(results, run_id): 88 | providers, llms, question_types, metrics = get_providers_and_llms_for_run(run_id) 89 | 90 | # Define the results directory 91 | results_dir = os.path.join(project_root, 'results') 92 | os.makedirs(results_dir, exist_ok=True) 93 | 94 | for metric in metrics: 95 | for question_type in question_types: 96 | filtered_results = [r for r in results if r[0] == metric and r[1] == question_type] 97 | 98 | if not filtered_results: 99 | print(f"No results found for metric: {metric} and question type: {question_type}. 
Skipping...") 100 | continue 101 | fig, ax = plt.subplots(figsize=(15, 10)) 102 | 103 | x = np.arange(len(providers)) 104 | width = 0.2 105 | multiplier = 0 106 | 107 | for llm, color in zip(llms, plt.cm.viridis(np.linspace(0, 1, len(llms)))): 108 | offset = width * multiplier 109 | scores = [next((r[4] for r in filtered_results if r[2] == provider and r[3] == llm), 0) for provider in providers] 110 | rects = ax.bar(x + offset, scores, width, label=llm, color=color) 111 | ax.bar_label(rects, fmt='%.2f', padding=3) 112 | multiplier += 1 113 | 114 | ax.set_ylabel(f'Average {metric.capitalize()} Score') 115 | 116 | # Get number of questions per provider 117 | question_counts = [next((r[5] for r in filtered_results if r[2] == provider), 0) for provider in providers] 118 | 119 | ax.set_title(f'{metric.capitalize()} Scores by Search Engine and LLM\n' 120 | f'(Run ID: {run_id}, Question Type: {question_type}, ' 121 | f'Total Queries: {sum(question_counts)})') 122 | ax.set_xticks(x + width * 1.5) 123 | ax.set_xticklabels([f'{provider}\n({count} questions)' for provider, count in zip(providers, question_counts)]) 124 | ax.legend(loc='upper left', ncol=2) 125 | ax.set_ylim(0, 1) 126 | 127 | plt.tight_layout() 128 | chart_filename = f'{metric}_comparison_{run_id}_{question_type}.png' 129 | chart_path = os.path.join(results_dir, chart_filename) 130 | plt.savefig(chart_path) 131 | plt.close() 132 | 133 | print(f"Bar chart created and saved as '{chart_path}'") 134 | 135 | def main(run_id): 136 | raw_results = load_results_from_db(run_id) 137 | if not raw_results: 138 | print("No results to process. Exiting.") 139 | return 140 | 141 | aggregated_results = aggregate_results(raw_results) 142 | create_bar_charts(aggregated_results, run_id) 143 | 144 | if __name__ == "__main__": 145 | main() -------------------------------------------------------------------------------- /search_benchmark/search/recursive_search.py: -------------------------------------------------------------------------------- 1 | from config import get_anthropic_api_key, get_reranker_url 2 | import anthropic 3 | import re 4 | from typing import List, Callable 5 | import json 6 | import requests 7 | 8 | 9 | def rerank_results(results: List[dict], query: str, batch_size: int = 20): 10 | def batch(iterable, n=1): 11 | l = len(iterable) 12 | for ndx in range(0, l, n): 13 | yield iterable[ndx:min(ndx + n, l)] 14 | 15 | # Truncate to 350 words 16 | texts = [' '.join(f"{res['title']} {res['chunks']}".split()[:350]) 17 | for res in results] 18 | all_index_score_pairs = [] 19 | for result_batch in batch(texts, batch_size): 20 | payload = { 21 | "query": query, 22 | "texts": result_batch, 23 | "raw_scores": False 24 | } 25 | 26 | response = requests.post(get_reranker_url(), json=payload) 27 | response.raise_for_status() 28 | rerank_results = response.json() 29 | 30 | index_score_pairs = [(item["index"], item["score"]) 31 | for item in rerank_results] 32 | all_index_score_pairs.extend(index_score_pairs) 33 | 34 | all_index_score_pairs.sort(key=lambda x: x[1], reverse=True) 35 | sorted_results = [results[index] for index, _ in all_index_score_pairs] 36 | return sorted_results 37 | 38 | 39 | def get_new_questions(question: str, result: dict): 40 | api_key = get_anthropic_api_key() 41 | client = anthropic.Anthropic(api_key=api_key) 42 | 43 | prompt = f""" 44 | Based on the user's query: "{question}", 45 | 46 | the search result is: 47 | {result} 48 | 49 | Identify parts of the user's query that were unanswered or need further refinement, and suggest a 
refined search query to help find better search results. 50 | There should be variation in length, complexity, and specificity across the queries. 51 | The query must be based on the detailed concepts, key-terms, hard values and facts in the result you've been provided. 52 | Wrap it in <new_query></new_query> tags. 53 | """ 54 | response = client.messages.create( 55 | model="claude-3-haiku-20240307", 56 | max_tokens=1024, 57 | messages=[{"role": "user", "content": prompt}], 58 | temperature=1 59 | ) 60 | refined_query = response.content[0].text 61 | matches = re.findall(r'<new_query>(.*?)</new_query>', refined_query, re.DOTALL) 62 | if matches: 63 | return [query.strip() for query in matches][:1] 64 | return [] 65 | 66 | 67 | def recursion_pattern(question: str, results: List[dict]): 68 | new_questions = [] 69 | results = [{k: v for k, v in res.items() if k != 'type'} 70 | for res in results] 71 | for i, result in enumerate(results): 72 | print(f"Generating new questions for result {i+1} of {len(results)}") 73 | try: 74 | llm_questions = get_new_questions(question, result) 75 | new_questions.extend(llm_questions) 76 | except Exception as e: 77 | print(e) 78 | continue 79 | print(f"Generated {len(new_questions)} new questions") 80 | return new_questions 81 | 82 | 83 | def recursive_search(search_function: Callable, questions: List[str], recursion_depth: int, page_size: int): 84 | print(f"\n------------------") 85 | print(f"Recursion depth: {recursion_depth}") 86 | print(f"------------------") 87 | new_questions = [] 88 | results = [] 89 | for i, question in enumerate(questions): 90 | print(f"\nSearching for question {i+1} of {len(questions)}") 91 | try: 92 | search_results = search_function(question)[:page_size] 93 | except Exception as e: 94 | print(e) 95 | continue 96 | results.extend([{**res, 'question': question} for res in search_results]) 97 | new_questions.extend(recursion_pattern(question, search_results)) 98 | if recursion_depth > 1: 99 | q, r = recursive_search( 100 | search_function, new_questions, recursion_depth-1, page_size) 101 | new_questions.extend(q) 102 | results.extend(r) 103 | return new_questions, results 104 | 105 | 106 | def main(search_function: Callable, question: str, recursion_depth: int, page_size: int = 10, page_size_per_recursion: int = 10): 107 | _, results = recursive_search( 108 | search_function, [question], recursion_depth, page_size_per_recursion) 109 | 110 | # Filter out results with the same chunk 111 | unique_chunks = set() 112 | filtered_results = [] 113 | for result in results: 114 | chunk = result.get('chunks') 115 | if chunk not in unique_chunks: 116 | unique_chunks.add(chunk) 117 | filtered_results.append(result) 118 | print(f"Filtered {len(results)} results to {len(filtered_results)} unique chunks") 119 | try: 120 | reranked_results = rerank_results(filtered_results, question)[:page_size] 121 | except Exception as e: 122 | print(e) 123 | reranked_results = filtered_results[:page_size] 124 | 125 | return reranked_results 126 | 127 | 128 | if __name__ == "__main__": 129 | from google_scholar_search import main as google_scholar_search 130 | question = "How does bibliometric analysis help in understanding the impact of a particular research topic?" 
131 | recursion_depth = 1 132 | page_size = 10 133 | page_size_per_recursion = 2 134 | 135 | reranked_results = main( 136 | google_scholar_search, question, recursion_depth, page_size, page_size_per_recursion) 137 | print(reranked_results) 138 | with open("reranked_results.json", "w") as f: 139 | json.dump(reranked_results, f) 140 | -------------------------------------------------------------------------------- /search_benchmark/ranking/ctx_precision.py: -------------------------------------------------------------------------------- 1 | import openai 2 | import json 3 | import anthropic 4 | import re 5 | from config import get_openai_api_key, get_anthropic_api_key 6 | from typing import List, Dict 7 | 8 | CONTEXT_PRECISION_PROMPT = """Given a question and a context, evaluate each sentence in the context for its relevance and criticality in answering the question. 9 | 10 | For each sentence, respond with either '<critical>' or '<not_critical>'. A sentence is critical if it provides unique, indispensable information that directly contributes to answering the question. 11 | 12 | Be extremely selective. If there's any doubt about the critical nature of the sentence, mark it as '<not_critical>'. 13 | 14 | Respond in the following format: 15 | 1. <critical> or <not_critical> 16 | 2. <critical> or <not_critical> 17 | 3. <critical> or <not_critical> 18 | ... 19 | 20 | Question: {question} 21 | 22 | Context: 23 | {context}""" 24 | 25 | def simple_sentence_tokenize(text): 26 | return re.split(r'(?<=[.!?])\s+', text) 27 | 28 | def context_precision_score(question: str, contexts: List[Dict], model: Dict[str, str]) -> float: 29 | print(f"Starting context precision scoring for question: {question[:30]}...") 30 | 31 | if not contexts: 32 | print("No contexts available. Returning precision score of 0.") 33 | return 0.0 34 | 35 | if model["api"] == "openai": 36 | api_key = get_openai_api_key() 37 | client = openai.OpenAI(api_key=api_key) 38 | print("Using OpenAI API") 39 | elif model["api"] == "anthropic": 40 | api_key = get_anthropic_api_key() 41 | client = anthropic.Anthropic(api_key=api_key) 42 | print("Using Anthropic API") 43 | else: 44 | raise ValueError(f"Unsupported API: {model['api']}") 45 | 46 | print(f"Number of contexts: {len(contexts)}") 47 | 48 | max_chars = 50000 49 | truncated_contexts = [] 50 | current_chars = 0 51 | 52 | for context in contexts: 53 | context_json = json.dumps(context, indent=2) 54 | if current_chars + len(context_json) <= max_chars: 55 | truncated_contexts.append(context) 56 | current_chars += len(context_json) 57 | else: 58 | break 59 | 60 | print(f"Number of truncated contexts: {len(truncated_contexts)}") 61 | 62 | if not truncated_contexts: 63 | print("No valid contexts after truncation. Returning precision score of 0.") 64 | return 0.0 65 | 66 | context_scores = [] 67 | 68 | for context in truncated_contexts: 69 | chunks = context.get('chunks', '') 70 | if not chunks.strip(): 71 | print("Empty chunks in context. Adding score of 0 for this context.") 72 | context_scores.append(0.0) 73 | continue 74 | 75 | sentences = simple_sentence_tokenize(chunks) 76 | total_sentences = len(sentences) 77 | 78 | if total_sentences == 0: 79 | print("No sentences in context. 
Adding score of 0 for this context.") 80 | context_scores.append(0.0) 81 | continue 82 | 83 | prompt = CONTEXT_PRECISION_PROMPT.format( 84 | question=question, 85 | context=chunks 86 | ) 87 | 88 | print(f"Sending request to {model['api']} API...") 89 | if model["api"] == "openai": 90 | response = client.chat.completions.create( 91 | model=model["name"], 92 | messages=[{"role": "user", "content": prompt}], 93 | max_tokens=model["max_tokens"], 94 | n=1, 95 | stop=None, 96 | temperature=model["temperature"], 97 | ) 98 | response_text = response.choices[0].message.content 99 | elif model["api"] == "anthropic": 100 | response = client.messages.create( 101 | model=model["name"], 102 | max_tokens=model["max_tokens"], 103 | messages=[ 104 | { 105 | "role": "user", 106 | "content": prompt, 107 | } 108 | ], 109 | temperature=model["temperature"], 110 | ) 111 | response_text = response.content[0].text 112 | 113 | print(f"Response from {model['api']} API: {response_text}") 114 | 115 | critical_sentences = len(re.findall(r'<critical>', response_text, re.IGNORECASE)) 116 | context_score = critical_sentences / total_sentences 117 | context_scores.append(min(context_score, 1.0)) # Ensure the score does not exceed 1 118 | 119 | precision_score = sum(context_scores) / len(context_scores) if context_scores else 0.0 120 | print(f"Computed precision score: {precision_score}") 121 | return precision_score 122 | 123 | def evaluate_context_precision(data_list: List[Dict], api_key: str) -> List[Dict]: 124 | print(f"Starting evaluation of context precision for {len(data_list)} items...") 125 | results = [] 126 | for data in data_list: 127 | question = data["question"] 128 | results_json = json.loads(data["results"]) 129 | contexts = results_json 130 | provider = data["provider"] 131 | llms = data["llms"] 132 | 133 | for model in llms: 134 | print(f"Processing question: {question[:30]}, provider: {provider}, model: {model['name']}") 135 | score = context_precision_score(question, contexts, model) 136 | result = { 137 | "question": data["question"], 138 | "provider": data["provider"], 139 | "model": model["name"], 140 | "score": score 141 | } 142 | results.append(result) 143 | print(f"Completed evaluation for question: {data['question'][:30]}, provider: {data['provider']}, model: {model['name']}") 144 | 145 | print(f"Completed evaluation of context precision for all items") 146 | return results 147 | 148 | def batch_evaluate_context_precision(data_list: List[Dict], api_key: str) -> List[Dict]: 149 | print(f"Starting batch evaluation of context precision for {len(data_list)} items...") 150 | results = evaluate_context_precision(data_list, api_key) 151 | print(f"Completed batch evaluation of context precision") 152 | return results -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # An Open Source Evaluation for Search APIs 2 | 3 | This repository presents a comprehensive benchmark designed to evaluate the performance of various search engines. You can plug and play search APIs; the benchmark works natively with Lumina, Exa, Semantic Scholar, and the SERP API. To begin, we are comparing the efficacy of research paper search engines. We specifically compare Lumina, Semantic Scholar, and Google Scholar (via SERP), focusing on two key metrics: Context Relevance and Context Precision. 
By employing large language models (LLMs) as evaluators, we assess context relevancy and context precision for the top 10 search results returned by each search provider. We aim to keep the evaluation as fair as possible. We evaluate the search results returned by each provider and use zero-shot search (no recursion or LLM query improvement) as the default method. 4 | 5 | 6 | # Our most recent result - Lumina is up to 11x better. 7 | 8 | Our most recent result is a comparison between Lumina Base, Lumina Recursive, Semantic Scholar, and Google Scholar. 9 | 10 | ![Benchmark Results](search_benchmark/results/August-03-2024.png) 11 | 12 | We measured context relevancy for each search provider's top 10 search results. 13 | - Lumina Base: For our base search, we received a 4.8x multiple over Google Scholar and an 8x multiple over Semantic Scholar on our generated questions dataset (~2470 queries each). 14 | - Lumina Recursive: We received a 6.8x multiple over Google Scholar and an 11.3x multiple over Semantic Scholar on our generated questions dataset (~2470 queries each). 15 | 16 | **Lumina consistently delivers 2-3 highly relevant results for every query - outperforming Google Scholar and Semantic Scholar, which provide 1 highly relevant result for 50% and 30% of the queries, respectively.** 17 | 18 | 19 | # Running the Benchmark 20 | 21 | This repo requires a `.env` file with API keys for each of these services. To get an API_URL for Lumina and gain access to our scientific search API, you can book a meeting with me at https://cal.com/ishaank99/lumina-api. 22 | 23 | We set up a local `postgres` instance to log the benchmark results, and a local `redis` instance for communication between the benchmark and the services. To run the benchmark with recursion, you will need to host a `reranker` service. We use the BGE Large reranker. By default, this is turned off. 24 | 25 | You can pull the benchmark image with the following command from the root dir of the project: 26 | ``` 27 | docker pull index.docker.io/akhilesh99/benchmark:latest 28 | ``` 29 | 30 | 31 | 1. Clone the repo and cd into it 32 | ``` 33 | git clone https://github.com/lumina-chat/benchmark.git 34 | cd benchmark 35 | ``` 36 | 37 | 2. Set environment variables in .env in the root of the project. 38 | 39 | 3. Pull the benchmark image from Docker Hub with: 40 | ``` 41 | docker pull index.docker.io/akhilesh99/benchmark:latest 42 | ``` 43 | 4. Run `docker compose up -d` to start the benchmark. This will start all of the services defined in the `compose.yaml` file. 44 | 45 | ``` 46 | docker compose up -d 47 | ``` 48 | 5. Run `docker compose logs -f questions`. This will print a Streamlit link to the benchmark dashboard, where you can view progress. 49 | 50 | 6. To stop the benchmark, run `docker compose down`. 51 | 52 | # Components 53 | 54 | ## `.env` 55 | 56 | We set up API keys, postgres, redis, and config for the benchmark in this file. You should make a `.env` file at the root of the repo with these variables. We use the `config.py` file to access these variables, and the `.env` file to set them. The `python-dotenv` package is used to load the environment variables from the .env file. 
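A minimal sketch of that accessor pattern (illustrative only; the function names and defaults below are assumptions rather than this repo's actual `config.py`):

```
# Illustrative config.py-style accessors; names and defaults are examples only.
import os
from dotenv import load_dotenv

# Load variables from the .env file at the project root into the process environment.
load_dotenv()

def get_redis_url() -> str:
    return os.environ["REDIS_URL"]

def get_pg_host() -> str:
    # Falling back to the compose service name "pg" is an assumption for illustration.
    return os.getenv("PG_HOST", "pg")
```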
These include: 57 | 58 | - REDIS_URL 59 | - PG_DB_NAME 60 | - PG_USER 61 | - PG_PASSWORD 62 | - PG_HOST 63 | - PG_PORT 64 | - OPENAI_API_KEY 65 | - ANTHROPIC_API_KEY 66 | - EXA_API_KEY 67 | - SERP_API_KEY 68 | - SEMANTIC_SCHOLAR_API_KEY 69 | - API_URL (for lumina) 70 | - EXA_URL ("https://api.exa.ai/search") 71 | - EXA_CONTENT_URL ("https://api.exa.ai/contents") 72 | - IMG_URL ("index.docker.io/username/img:tag") 73 | - IMG_NAME ("username/img:tag") 74 | - RERANKER_URL (host a reranker if you want to do recursive search) 75 | - QUESTION_TYPES=user_queries,generated_questions 76 | - METRICS=ctx_relevancy,ctx_precision 77 | - LLMS=[{"name": "gpt-4o", "api": "openai", "max_tokens": 1024, "temperature": 0}, {"name": "claude-3-haiku-20240307", "api": "anthropic", "max_tokens": 1024, "temperature": 0}] 78 | - PROVIDERS=lumina,google_scholar,semantic_scholar,exa 79 | - NUM_Q=500 80 | 81 | (if you want recursion, add "_recursive" to the end of the provider name, e.g. lumina_recursive,google_scholar_recursive,semantic_scholar_recursive) 82 | 83 | ## `compose.yaml` 84 | 85 | The `compose.yaml` file orchestrates the deployment of all services required for the benchmark. It defines the configuration for each service, including dependencies, environment variables, and the number of replicas to run. This setup allows for efficient communication between the benchmark and the various search providers, as well as logging and data storage through Redis and PostgreSQL. 86 | 87 | - **logging_api**: Handles logging of benchmark results and depends on Redis and PostgreSQL for data storage. 88 | - **questions**: Runs the benchmark process, sending questions to the configured providers and processing their responses. 89 | - **responses**: Manages the responses from the search providers, processing and storing the results. 90 | - **ranking**: Responsible for ranking the responses received from the providers based on the defined metrics. 91 | - **redis**: Provides a Redis instance for message queuing and inter-service communication. 92 | - **pg**: Sets up a PostgreSQL database for logging benchmark results and storing relevant data. 93 | - **adminer**: A web-based database management tool for interacting with the PostgreSQL database. 94 | 95 | This setup allows for efficient benchmarking and evaluation of different search APIs. 96 | ## `benchmark.py` 97 | 98 | The `benchmark.py` script is run separately and performs the actual benchmarking with the following parameters: 99 | 100 | - Question types: `generated_questions` and `user_queries` 101 | - Metrics: `ctx_relevancy` 102 | - LLMs: Any OpenAI or Anthropic model 103 | - Providers: Lumina, Google Scholar, and Semantic Scholar 104 | 105 | You can also create your own custom question datasets for benchmarking. Simply add your JSONL file to the `search_benchmark/dataset` folder and use its name (without the .jsonl extension) as a question type when running the benchmark (see the sketch below). 106 | 107 | ### Question Types 108 | 109 | The script uses two question types: `generated_questions` and `user_queries`. These correspond to JSONL files located in the `search_benchmark/dataset` folder. Each file contains a set of questions used for the benchmark. 110 | 111 | - `generated_questions`: 9k AI-generated questions for benchmarking 112 | - `user_queries`: 9k real user queries from SciSpace for more realistic testing 113 | 114 | You don't need to run all of the questions; you can specify the number of questions in the `benchmark.py` file. 
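As a concrete illustration, here is a hypothetical snippet that writes a custom question file; the `question` field name is an assumption, so mirror the schema of the existing `generated_questions.jsonl` and `user_queries.jsonl` files before relying on it:

```
# Hypothetical example: creates a dataset usable as the question type "my_custom_set".
# The "question" field name is an assumption; match the schema of the existing JSONL files.
import json

questions = [
    "What are the main drivers of antibiotic resistance in hospital settings?",
    "How do transformer models capture long-range dependencies in text?",
]

with open("search_benchmark/dataset/my_custom_set.jsonl", "w") as f:
    for q in questions:
        f.write(json.dumps({"question": q}) + "\n")
```

The file name without the `.jsonl` extension then becomes the question type you pass to the benchmark.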
115 | You can modify these files or add new ones to customize the benchmark according to your needs. 116 | 117 | ### Recursive search 118 | 119 | The recursive search algorithm enhances search results by using an LLM to generate new questions based on initial search results. This process helps to fill gaps in the original results and provide more comprehensive coverage of the topic. 120 | 121 | 1. **Initial Search**: 122 | - Perform an initial search using the provided question. 123 | - Limit results to the specified `page_size_per_recursion`. 124 | 125 | 2. **Generate New Questions**: 126 | - For each search result: 127 | - Use an LLM to analyze the result and the original question. 128 | - Generate new, more specific questions that address unanswered aspects. The prompt is as follows: 129 | ``` 130 | Based on the user's query: "{question}", 131 | 132 | the search result is: 133 | {result} 134 | 135 | Identify parts of the user's query that were unanswered or need further refinement, and suggest a refined search query to help find better search results. 136 | There should be variation in length, complexity, and specificity across the queries. 137 | The query must be based on the detailed concepts, key-terms, hard values and facts in the result you've been provided. 138 | Wrap it in <new_query></new_query> tags. 139 | ``` 140 | 141 | 3. **Recursive Search**: 142 | - Perform searches using the newly generated questions. 143 | - Repeat steps 1-3 until `recursion_depth` is reached. 144 | 145 | 4. **Result Processing**: 146 | - Combine results from all recursion levels. 147 | - Remove duplicate results based on the content of the chunks. 148 | 149 | 5. **Reranking**: 150 | - Use a reranker model to sort the combined results. 151 | - Return the top `page_size` results. 152 | 153 | 154 | # Notes 155 | 156 | To access Adminer, visit localhost:8080 and use the following credentials to log in: 157 | 158 | - System: PostgreSQL 159 | - Server: pg 160 | - Username: postgres 161 | - Password: postrges 162 | - Database: postgres 163 | --------------------------------------------------------------------------------