├── .env.example ├── .gitignore ├── CONTRIBUTING.md ├── Dockerfile ├── LICENSE ├── README.md ├── app ├── __init__.py ├── api │ ├── __init__.py │ ├── read.py │ └── search.py ├── core │ ├── config.py │ └── security.py ├── main.py └── utils │ ├── brave.py │ ├── duckduckgo.py │ ├── html_parser.py │ ├── search_client.py │ ├── searx.py │ └── web_fetcher.py ├── docker-compose.yml ├── docs ├── README.md ├── architecture.md ├── integration_guide.md ├── self_hosting.md └── use_cases.md ├── requirements.txt ├── run.py └── test_duckduckgo.py /.env.example: -------------------------------------------------------------------------------- 1 | # Debug mode (True/False) 2 | DEBUG=True 3 | 4 | # Port to run the API on 5 | PORT=8000 6 | 7 | # Authentication settings 8 | AUTH_ENABLED=True 9 | # Comma-separated list of valid API keys (leave empty to use default key only) 10 | # API_KEYS= 11 | # DEFAULT_API_KEY= 12 | 13 | # Search provider (searxng, duckduckgo, brave) 14 | SEARCH_PROVIDER=duckduckgo 15 | 16 | # SearXNG settings 17 | # SEARXNG_INSTANCE_URL= 18 | # SEARXNG_AUTH_USERNAME= 19 | # SEARXNG_AUTH_PASSWORD= 20 | # SEARXNG_TIMEOUT=10 21 | # SEARXNG_MAX_RESULTS=10 22 | 23 | # DuckDuckGo settings 24 | DUCKDUCKGO_TIMEOUT=10 25 | DUCKDUCKGO_MAX_RESULTS=10 26 | 27 | # Brave Search settings (requires API key) 28 | # BRAVE_API_KEY= 29 | # BRAVE_TIMEOUT=10 30 | # BRAVE_MAX_RESULTS=10 -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .env 3 | __pycache__ -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to SURF 2 | 3 | Thank you for your interest in contributing to SURF! This document provides guidelines and instructions for contributing to the project. 4 | 5 | ## Code of Conduct 6 | 7 | By participating in this project, you agree to uphold our Code of Conduct, which expects all participants to treat each other with respect and professionalism. 8 | 9 | ## How Can I Contribute? 10 | 11 | ### Reporting Bugs 12 | 13 | 1. Before submitting a bug report, please check if the issue has already been reported. 14 | 2. Use the bug report template and provide all the requested information. 15 | 3. Include detailed steps to reproduce the bug. 16 | 4. Include relevant logs and error messages. 17 | 5. Describe what you expected to happen and what actually happened. 18 | 19 | ### Suggesting Enhancements 20 | 21 | 1. Check if your idea has already been suggested. 22 | 2. Use the feature request template and provide all the requested information. 23 | 3. Describe the enhancement you're suggesting in detail. 24 | 4. Explain why this enhancement would be useful to most SURF users. 25 | 26 | ### Code Contributions 27 | 28 | #### Local Development Setup 29 | 30 | 1. Fork the repository. 31 | 2. Clone your fork locally: 32 | ``` 33 | git clone https://github.com/44za12/surf.git 34 | cd surf 35 | ``` 36 | 3. Set up the development environment: 37 | ``` 38 | uv venv 39 | source .venv/bin/activate # On Windows: .venv\Scripts\activate 40 | uv pip install fastapi uvicorn aiohttp python-dotenv beautifulsoup4 markdown pytest 41 | ``` 42 | 4. Create a branch for your changes: 43 | ``` 44 | git checkout -b feature/your-feature-name 45 | ``` 46 | 47 | #### Making Changes 48 | 49 | 1. Follow the coding style and conventions used in the project. 50 | 2. 
Write or update tests for your changes. 51 | 3. Document your changes (docstrings, comments, update README if necessary). 52 | 4. Ensure all tests pass: 53 | ``` 54 | pytest 55 | ``` 56 | 57 | #### Submitting a Pull Request 58 | 59 | 1. Push your changes to your fork: 60 | ``` 61 | git push origin feature/your-feature-name 62 | ``` 63 | 2. Submit a pull request to the main repository. 64 | 3. Follow the pull request template and provide all the requested information. 65 | 4. Wait for a maintainer to review your pull request. 66 | 5. Make any requested changes and update your pull request. 67 | 68 | ## Style Guides 69 | 70 | ### Git Commit Messages 71 | 72 | * Use the present tense ("Add feature" not "Added feature") 73 | * Use the imperative mood ("Move cursor to..." not "Moves cursor to...") 74 | * Limit the first line to 72 characters or less 75 | * Reference issues and pull requests after the first line 76 | 77 | ### Python Style Guide 78 | 79 | We follow [PEP 8](https://www.python.org/dev/peps/pep-0008/) for Python code style. Additionally: 80 | 81 | * Use 4 spaces for indentation (not tabs) 82 | * Use docstrings for all modules, classes, and functions 83 | * Type hints are encouraged 84 | * Line length should be limited to 88 characters (compatible with Black) 85 | 86 | ### Documentation Style Guide 87 | 88 | * Use Markdown for documentation 89 | * Keep documentation up-to-date with code changes 90 | * Be clear and concise 91 | * Include examples where appropriate 92 | 93 | ## Project Structure 94 | 95 | The project is structured as follows: 96 | 97 | ``` 98 | app/ 99 | ├── api/ # API route handlers 100 | ├── core/ # Core configuration 101 | ├── utils/ # Utility functions 102 | ├── main.py # Main FastAPI application 103 | docs/ # Documentation 104 | tests/ # Tests 105 | ``` 106 | 107 | ## Testing 108 | 109 | We use pytest for testing. All code contributions should include appropriate tests. 110 | 111 | To run the tests: 112 | 113 | ``` 114 | pytest 115 | ``` 116 | 117 | ## Documentation 118 | 119 | We encourage improvements to the documentation. If you find something unclear or missing, please submit a pull request with your improvements. 120 | 121 | ## Questions? 122 | 123 | If you have any questions about contributing, please open an issue or reach out to the maintainers. 124 | 125 | Thank you for contributing to SURF! -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install dependencies 6 | COPY requirements.txt . 7 | RUN pip install --no-cache-dir -r requirements.txt 8 | 9 | # Copy application files 10 | COPY . . 
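# Note: requirements.txt is copied and installed before this full source copy, so Docker caches the dependency layer; routine code changes only rebuild from this step onward.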
11 | 12 | # Create a non-root user to run the application 13 | RUN useradd -m surf && \ 14 | chown -R surf:surf /app 15 | 16 | USER surf 17 | 18 | # Expose the port 19 | EXPOSE 8000 20 | 21 | # Run the application 22 | CMD ["python", "run.py"] -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2025 Aazar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # SURF API 2 | 3 | ![License](https://img.shields.io/badge/license-MIT-blue.svg) 4 | 5 | SURF (Search Utility & Reading Framework) is an elegant, self-deployable API that bridges the gap between Large Language Models and the web. With minimal setup, deploy a robust solution that enables any LLM to search the web and process content in a format optimized for context windows. 
6 | 7 | ## ✨ Key Features 8 | 9 | - **Powerful HTML Processing**: 10 | - Advanced processing with support for tables, images, and complex layouts 11 | - Clean content extraction from web pages with noise removal 12 | - Multi-format support (HTML, plain text, JSON, and more) 13 | 14 | - **Intelligent Web Search**: Leverage multiple search providers: 15 | - Choose between DuckDuckGo (default), SearXNG, or Brave Search for web searches 16 | - Privacy-respecting searches through configurable instances 17 | - High-quality results using native ranking algorithms 18 | - Flexible output formats for seamless LLM integration 19 | - Customizable result count and presentation format 20 | 21 | - **Designed for LLMs**: 22 | - Content optimized for LLM context windows 23 | - Structured data for easy comprehension by AI models 24 | - Consistent formatting for reliable parsing 25 | - Customizable output formats (JSON, Markdown) 26 | 27 | - **Developer-Friendly**: 28 | - Simple REST API with intuitive endpoints 29 | - Comprehensive documentation and integration guides 30 | - Authentication-ready with secure API keys 31 | - Fully self-hostable with minimal dependencies 32 | 33 | - **Model Context Protocol (MCP) Integration**: 34 | - Easy implementation of MCP servers for standardized AI access 35 | - Simplified interfaces for search and content reading 36 | - Compatible with all MCP clients like Claude Desktop 37 | - Rapid development of AI tools with web access capabilities 38 | 39 | ## 📚 Documentation 40 | 41 | Comprehensive documentation is available in the `docs/` directory: 42 | 43 | - [Documentation Index](docs/README.md) - Start here for a complete overview 44 | - [Architecture Overview](docs/architecture.md) - Learn about the system design 45 | - [Integration Guide](docs/integration_guide.md) - Detailed instructions for connecting with LLMs 46 | - [Use Cases & Applications](docs/use_cases.md) - Explore real-world applications 47 | - [Self-Hosting Guide](docs/self_hosting.md) - Deploy SURF on your own infrastructure 48 | 49 | ## 💻 Quick Start 50 | 51 | ### Installation 52 | 53 | ```bash 54 | # Clone the repository 55 | git clone https://github.com/44za12/surf.git 56 | cd surf 57 | 58 | # Create a virtual environment 59 | python -m venv venv 60 | source venv/bin/activate # On Windows: venv\Scripts\activate 61 | 62 | # Install dependencies 63 | pip install -r requirements.txt 64 | 65 | # Start the server 66 | python run.py 67 | ``` 68 | 69 | The API server will be available at http://localhost:8000. 70 | 71 | ### Basic Usage 72 | 73 | #### Read a webpage 74 | 75 | ```bash 76 | curl "http://localhost:8000/read/https%3A%2F%2Fen.wikipedia.org%2Fwiki%2FPython_%28programming_language%29" 77 | ``` 78 | 79 | #### Search the web 80 | 81 | ```bash 82 | curl "http://localhost:8000/search?q=latest+AI+research+papers" 83 | ``` 84 | 85 | ## 📋 API Reference 86 | 87 | ### GET /read/{url} 88 | 89 | Fetches, cleans and processes web content. 90 | 91 | - **URL Parameters**: 92 | - `url`: URL-encoded address of the content to read 93 | - **Query Parameters**: 94 | - `format`: Output format (json or md, default: json) 95 | 96 | ### GET /search 97 | 98 | Searches the web and returns relevant results. 
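For example, a request that limits results to the past week and returns Markdown might look like this (a sketch assuming a local instance on the default port with authentication enabled; `YOUR_API_KEY` is a placeholder for a key from your configuration, and the parameters are described below):

```bash
curl -H "X-API-Key: YOUR_API_KEY" \
  "http://localhost:8000/search?q=large+language+models&format=md&max_results=3&language=en-US&time_range=week"
```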
99 | 100 | - **Query Parameters**: 101 | - `q`: Search query 102 | - `format`: Output format (json or md, default: json) 103 | - `max_results`: Number of results (1-10, default: 5) 104 | - `language`: Language code (e.g., en-US, fr-FR) 105 | - `time_range`: Time filter (day, week, month, year) 106 | 107 | ## 🧠 LLM Integration Strategies 108 | 109 | SURF API is designed to be easily integrated with any LLM system. Here are some recommended integration patterns: 110 | 111 | 1. **Tool-based integration**: Configure SURF endpoints as tools in your LLM tool library 112 | 2. **Retrieval Augmentation**: Use the search and read endpoints for RAG (Retrieval-Augmented Generation) 113 | 3. **Direct Context Injection**: Insert search results or web content directly into your prompts 114 | 4. **Multi-step workflow**: First search for relevant sources, then read specific pages based on search results 115 | 5. **Model Context Protocol (MCP)**: Create MCP servers that leverage SURF for web access, allowing standardized integration with compatible AI systems 116 | 117 | For detailed MCP implementation examples, see our [Integration Guide](docs/integration_guide.md#integration-with-model-context-protocol-mcp). 118 | 119 | ## 🚀 Deployment Options 120 | 121 | SURF can be deployed in multiple ways depending on your requirements: 122 | 123 | ### 🐳 Docker (Recommended) 124 | 125 | Deploy with Docker Compose for the simplest setup: 126 | 127 | ```bash 128 | # Clone the repository 129 | git clone https://github.com/44za12/surf.git 130 | cd surf 131 | 132 | # Start the container 133 | docker-compose up -d 134 | ``` 135 | 136 | For more details, see the [Self-Hosting Guide](docs/self_hosting.md). 137 | 138 | ### 💻 Bare Metal 139 | 140 | Install directly on your server: 141 | 142 | ```bash 143 | # Create a virtual environment 144 | python -m venv venv 145 | source venv/bin/activate 146 | 147 | # Install dependencies 148 | pip install -r requirements.txt 149 | 150 | # Start the server 151 | python run.py 152 | ``` 153 | 154 | ### ☁️ Cloud Platforms 155 | 156 | SURF works well on: 157 | - Digital Ocean droplets 158 | - AWS EC2 or Lightsail instances 159 | - Azure VMs 160 | - Google Cloud VMs 161 | - PaaS platforms like Heroku, Railway, and Render 162 | 163 | Full deployment instructions are available in the [Self-Hosting Guide](docs/self_hosting.md). 
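Whichever option you choose, a quick way to confirm the instance is running is to query the root endpoint, which reports the API name, version, and whether authentication is enabled (a minimal smoke test assuming the default port; note that `/read` and `/search` additionally require the `X-API-Key` header when authentication is enabled):

```bash
curl http://localhost:8000/
# Expected response (abbreviated): {"name": "SURF API", "version": "1.0.0", ..., "auth_status": "enabled", ...}
```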
164 | 165 | ## 📐 Configuration Options 166 | 167 | All configuration is managed through environment variables or the `.env` file: 168 | 169 | ### API Settings 170 | - `DEBUG`: Enable/disable debug mode (default: False) 171 | - `PORT`: Port to run the API on (default: 8000) 172 | 173 | ### Security Settings 174 | - `AUTH_ENABLED`: Enable/disable API key authentication (default: True) 175 | - `API_KEYS`: Comma-separated list of valid API keys 176 | - `DEFAULT_API_KEY`: A default API key to use (auto-generated if not specified) 177 | 178 | ### Search Provider Settings 179 | - `SEARCH_PROVIDER`: The search provider to use (`searxng`, `duckduckgo`, or `brave`, default: `duckduckgo`) 180 | 181 | ### SearXNG Settings 182 | - `SEARXNG_INSTANCE_URL`: SearXNG instance URL (default: https://searx.be) 183 | - `SEARXNG_AUTH_USERNAME`: Username for SearXNG authentication (optional) 184 | - `SEARXNG_AUTH_PASSWORD`: Password for SearXNG authentication (optional) 185 | - `SEARXNG_TIMEOUT`: Request timeout in seconds (default: 10) 186 | - `SEARXNG_MAX_RESULTS`: Maximum search results to fetch (default: 10) 187 | 188 | ### DuckDuckGo Settings 189 | - `DUCKDUCKGO_TIMEOUT`: Request timeout in seconds (default: 10) 190 | - `DUCKDUCKGO_MAX_RESULTS`: Maximum search results to fetch (default: 10) 191 | 192 | ### Brave Search Settings 193 | - `BRAVE_API_KEY`: API key for Brave Search (required for Brave Search) 194 | - `BRAVE_TIMEOUT`: Request timeout in seconds (default: 10) 195 | - `BRAVE_MAX_RESULTS`: Maximum search results to fetch (default: 10) 196 | 197 | ## 🚀 Advanced Usage 198 | 199 | ### Testing the HTML Parser 200 | 201 | ``` 202 | python run.py --test --url https://example.com 203 | ``` 204 | 205 | This command tests the HTML parser with a specific URL and displays the processed content. 206 | 207 | ### Custom SearXNG Instance 208 | 209 | For full privacy control, you can set up your own SearXNG instance and configure SURF to use it: 210 | 211 | 1. Deploy SearXNG using their [official documentation](https://searxng.github.io/searxng/) 212 | 2. Update your `.env` file with your instance URL: 213 | ``` 214 | SEARXNG_INSTANCE_URL=https://your-searxng-instance.com 215 | ``` 216 | 217 | ### Using Different Search Providers 218 | 219 | SURF supports multiple search providers that you can configure: 220 | 221 | #### DuckDuckGo (Default, No API key required) 222 | 223 | DuckDuckGo is the default search provider and requires no API key or special setup. 224 | 225 | #### SearXNG 226 | 227 | To use SearXNG instead of DuckDuckGo: 228 | 229 | ``` 230 | SEARCH_PROVIDER=searxng 231 | SEARXNG_INSTANCE_URL=https://your-searxng-instance.com 232 | ``` 233 | 234 | #### Brave Search (API key required) 235 | 236 | 1. Get a Brave Search API key from [Brave Search API](https://brave.com/search/api/) 237 | 2. Configure SURF to use Brave Search: 238 | ``` 239 | SEARCH_PROVIDER=brave 240 | BRAVE_API_KEY=your-api-key-here 241 | ``` 242 | 243 | ## 📜 License 244 | 245 | This project is licensed under the MIT License - see the LICENSE file for details. 246 | 247 | ## 🤝 Contributing 248 | 249 | Contributions to SURF are welcome! Please feel free to submit a Pull Request. 
-------------------------------------------------------------------------------- /app/__init__.py: -------------------------------------------------------------------------------- 1 | # SURF API -------------------------------------------------------------------------------- /app/api/__init__.py: -------------------------------------------------------------------------------- 1 | from . import read, search -------------------------------------------------------------------------------- /app/api/read.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, HTTPException, Query, Depends 2 | from enum import Enum 3 | import json 4 | import re 5 | 6 | from app.utils.web_fetcher import WebFetcher 7 | from app.utils.html_parser import HTMLCleaner 8 | from app.core.security import require_api_key 9 | 10 | 11 | # Define output format enum 12 | class OutputFormat(str, Enum): 13 | json = "json" 14 | markdown = "md" 15 | 16 | 17 | # Create router 18 | router = APIRouter(prefix="/read", tags=["read"]) 19 | 20 | 21 | async def extract_text_content(content, content_type, url): 22 | """Extract text content from non-HTML sources.""" 23 | # Basic processing for plain text 24 | if 'text/plain' in content_type: 25 | lines = content.split('\n') 26 | 27 | # Try to extract a title from the first non-empty line 28 | title = "Text Document" 29 | for line in lines: 30 | if line.strip(): 31 | title = line.strip() 32 | break 33 | 34 | # Return the content with minimal formatting 35 | return { 36 | "title": title, 37 | "url": url, 38 | "content": content 39 | } 40 | 41 | # Basic processing for JSON 42 | elif 'application/json' in content_type: 43 | try: 44 | # Try to parse JSON and pretty-print it 45 | json_data = json.loads(content) 46 | formatted_json = json.dumps(json_data, indent=2) 47 | 48 | # Try to find a title in common JSON fields 49 | title = "JSON Document" 50 | for field in ['title', 'name', 'id', 'key']: 51 | if isinstance(json_data, dict) and field in json_data: 52 | title = f"JSON: {json_data[field]}" 53 | break 54 | 55 | return { 56 | "title": title, 57 | "url": url, 58 | "content": f"```json\n{formatted_json}\n```" 59 | } 60 | except json.JSONDecodeError: 61 | # If JSON parsing fails, return as plain text 62 | return { 63 | "title": "Invalid JSON Document", 64 | "url": url, 65 | "content": content 66 | } 67 | 68 | # For other formats, return a simple representation 69 | else: 70 | return { 71 | "title": f"Document ({content_type})", 72 | "url": url, 73 | "content": f"Content type '{content_type}' not fully supported. Raw content:\n\n```\n{content[:5000]}\n```\n\n(Content may be truncated)" 74 | } 75 | 76 | 77 | @router.get("/{url:path}") 78 | async def read_url( 79 | url: str, 80 | format: OutputFormat = Query(OutputFormat.json, description="Output format (json or md)"), 81 | api_key: str = require_api_key 82 | ): 83 | """ 84 | Fetch and process content from a URL. 
85 | 86 | Args: 87 | url: URL to read content from (as path parameter) 88 | format: Output format (json or markdown) 89 | api_key: API key for authentication 90 | 91 | Returns: 92 | Processed content in requested format 93 | """ 94 | try: 95 | # Make sure URL is properly formatted 96 | if not url.startswith(("http://", "https://")): 97 | url = f"https://{url}" 98 | 99 | # Fetch content 100 | fetch_result = await WebFetcher.fetch_url(url) 101 | 102 | if not fetch_result: 103 | raise HTTPException(status_code=404, detail=f"Failed to fetch URL or URL not found: {url}") 104 | 105 | content, content_type = fetch_result 106 | 107 | # Process content based on content type 108 | if 'text/html' in content_type: 109 | processed_content = await HTMLCleaner.process_html(content, url) 110 | else: 111 | # For non-HTML content, apply basic processing 112 | processed_content = await extract_text_content(content, content_type, url) 113 | 114 | # Return in requested format 115 | if format == OutputFormat.markdown: 116 | return processed_content["content"] 117 | else: 118 | # JSON format (default) 119 | return processed_content 120 | except Exception as e: 121 | # Log the error and provide a meaningful response 122 | error_detail = f"Error processing URL {url}: {str(e)}" 123 | print(error_detail) # In production, use proper logging 124 | raise HTTPException(status_code=500, detail=error_detail) -------------------------------------------------------------------------------- /app/api/search.py: -------------------------------------------------------------------------------- 1 | from fastapi import APIRouter, HTTPException, Query, Depends 2 | from enum import Enum 3 | 4 | from app.utils.search_client import SearchClientFactory 5 | from app.core.config import settings, SearchProvider 6 | from app.core.security import require_api_key 7 | 8 | 9 | # Define output format enum 10 | class OutputFormat(str, Enum): 11 | json = "json" 12 | markdown = "md" 13 | 14 | 15 | # Create router 16 | router = APIRouter(prefix="/search", tags=["search"]) 17 | 18 | 19 | # Helper function to format results as markdown 20 | def format_results_as_markdown(results: list) -> str: 21 | """Format search results as markdown.""" 22 | if not results: 23 | return "No results found." 24 | 25 | markdown = "# Search Results\n\n" 26 | 27 | for i, result in enumerate(results, 1): 28 | markdown += f"## {i}. {result.get('title', 'No Title')}\n\n" 29 | markdown += f"**URL**: {result.get('url', 'No URL')}\n\n" 30 | markdown += f"{result.get('snippet', 'No description available.')}\n\n" 31 | markdown += "---\n\n" 32 | 33 | return markdown 34 | 35 | 36 | @router.get("") 37 | async def search( 38 | q: str = Query(..., description="Search query"), 39 | format: OutputFormat = Query(OutputFormat.json, description="Output format (json or md)"), 40 | max_results: int = Query(5, description="Maximum number of results to return", ge=1, le=10), 41 | language: str = Query("en-US", description="Search language (e.g., en-US, fr-FR)"), 42 | time_range: str = Query(None, description="Optional time range for search results (day, week, month, year)"), 43 | api_key: str = require_api_key 44 | ): 45 | """ 46 | Search the web using the configured search provider. 
47 | 48 | Args: 49 | q: Search query 50 | format: Output format (json or markdown) 51 | max_results: Maximum number of results to return (1-10) 52 | language: Language code for search results (e.g., en-US, fr-FR) 53 | time_range: Optional time filter (day, week, month, year) 54 | api_key: API key for authentication 55 | 56 | Returns: 57 | Search results in requested format 58 | 59 | Note: 60 | This endpoint uses the configured search provider (SearXNG, DuckDuckGo, or Brave Search). 61 | The current provider is: {settings.search.provider} 62 | """ 63 | try: 64 | # Get appropriate search client based on configuration 65 | search_client = SearchClientFactory.get_client() 66 | 67 | # Perform search 68 | results = await search_client.search( 69 | q, 70 | num_results=max_results, 71 | language=language, 72 | time_range=time_range 73 | ) 74 | 75 | if not results: 76 | if format == OutputFormat.markdown: 77 | return "No search results found." 78 | else: 79 | return {"results": [], "query": q} 80 | 81 | # Return in requested format 82 | if format == OutputFormat.markdown: 83 | return format_results_as_markdown(results) 84 | else: 85 | # JSON format (default) 86 | return { 87 | "results": results, 88 | "query": q, 89 | "provider": str(settings.search.provider) 90 | } 91 | 92 | except Exception as e: 93 | # Log the error and provide a meaningful response 94 | error_detail = f"Search error for query '{q}': {str(e)}" 95 | print(error_detail) # In production, use proper logging 96 | raise HTTPException(status_code=500, detail=error_detail) -------------------------------------------------------------------------------- /app/core/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import secrets 3 | from enum import Enum 4 | from typing import Optional, List 5 | from pydantic import BaseModel 6 | from dotenv import load_dotenv 7 | 8 | # Load environment variables from .env file 9 | load_dotenv() 10 | 11 | class SearchProvider(str, Enum): 12 | """Supported search providers.""" 13 | SEARXNG = "searxng" 14 | DUCKDUCKGO = "duckduckgo" 15 | BRAVE = "brave" 16 | 17 | class SearXNGConfig(BaseModel): 18 | """Configuration for SearXNG search engine.""" 19 | instance_url: str = os.getenv("SEARXNG_INSTANCE_URL", "https://searx.be") 20 | auth_username: Optional[str] = os.getenv("SEARXNG_AUTH_USERNAME") 21 | auth_password: Optional[str] = os.getenv("SEARXNG_AUTH_PASSWORD") 22 | timeout: int = int(os.getenv("SEARXNG_TIMEOUT", "10")) 23 | max_results: int = int(os.getenv("SEARXNG_MAX_RESULTS", "10")) 24 | 25 | class BraveSearchConfig(BaseModel): 26 | """Configuration for Brave Search API.""" 27 | api_key: Optional[str] = os.getenv("BRAVE_API_KEY") 28 | timeout: int = int(os.getenv("BRAVE_TIMEOUT", "10")) 29 | max_results: int = int(os.getenv("BRAVE_MAX_RESULTS", "10")) 30 | 31 | class DuckDuckGoConfig(BaseModel): 32 | """Configuration for DuckDuckGo search.""" 33 | timeout: int = int(os.getenv("DUCKDUCKGO_TIMEOUT", "10")) 34 | max_results: int = int(os.getenv("DUCKDUCKGO_MAX_RESULTS", "10")) 35 | 36 | class SearchConfig(BaseModel): 37 | """Search configuration.""" 38 | provider: SearchProvider = os.getenv("SEARCH_PROVIDER", SearchProvider.DUCKDUCKGO) 39 | searxng: SearXNGConfig = SearXNGConfig() 40 | brave: BraveSearchConfig = BraveSearchConfig() 41 | duckduckgo: DuckDuckGoConfig = DuckDuckGoConfig() 42 | 43 | class SecurityConfig(BaseModel): 44 | """Security configuration.""" 45 | # Read API keys from environment variable - comma-separated list 46 | api_keys: 
List[str] = [k.strip() for k in os.getenv("API_KEYS", "").split(",") if k.strip()] 47 | # Generate a random default key if none provided 48 | default_key: str = os.getenv("DEFAULT_API_KEY", secrets.token_urlsafe(32)) 49 | # Whether API key auth is enabled 50 | auth_enabled: bool = os.getenv("AUTH_ENABLED", "True").lower() == "true" 51 | 52 | def __init__(self, **data): 53 | super().__init__(**data) 54 | # Add the default key to api_keys if auth is enabled and no keys are provided 55 | if self.auth_enabled and not self.api_keys: 56 | self.api_keys.append(self.default_key) 57 | 58 | class Settings(BaseModel): 59 | """Application settings.""" 60 | app_name: str = "SURF API" 61 | debug: bool = os.getenv("DEBUG", "False").lower() == "true" 62 | port: int = int(os.getenv("PORT", "8000")) 63 | search: SearchConfig = SearchConfig() 64 | searxng: SearXNGConfig = SearXNGConfig() # For backward compatibility 65 | security: SecurityConfig = SecurityConfig() 66 | 67 | # Create global settings object 68 | settings = Settings() -------------------------------------------------------------------------------- /app/core/security.py: -------------------------------------------------------------------------------- 1 | from fastapi import Depends, HTTPException, Security, status 2 | from fastapi.security import APIKeyHeader 3 | from app.core.config import settings 4 | 5 | # API key header name 6 | API_KEY_NAME = "X-API-Key" 7 | api_key_header = APIKeyHeader(name=API_KEY_NAME, auto_error=False) 8 | 9 | async def get_api_key(api_key_header: str = Security(api_key_header)): 10 | """ 11 | Dependency for validating the API key. 12 | 13 | Args: 14 | api_key_header: The API key from the request header 15 | 16 | Returns: 17 | The validated API key 18 | 19 | Raises: 20 | HTTPException: If authentication is required and the API key is invalid 21 | """ 22 | # If auth is disabled, bypass validation 23 | if not settings.security.auth_enabled: 24 | return None 25 | 26 | # Check if API key is provided 27 | if not api_key_header: 28 | raise HTTPException( 29 | status_code=status.HTTP_401_UNAUTHORIZED, 30 | detail="API key is missing", 31 | headers={"WWW-Authenticate": "APIKey"}, 32 | ) 33 | 34 | # Validate the API key 35 | if api_key_header not in settings.security.api_keys: 36 | raise HTTPException( 37 | status_code=status.HTTP_401_UNAUTHORIZED, 38 | detail="Invalid API key", 39 | headers={"WWW-Authenticate": "APIKey"}, 40 | ) 41 | 42 | return api_key_header 43 | 44 | # Convenient dependency for routes 45 | require_api_key = Depends(get_api_key) -------------------------------------------------------------------------------- /app/main.py: -------------------------------------------------------------------------------- 1 | from fastapi import FastAPI, Request, Depends 2 | from fastapi.responses import JSONResponse 3 | from fastapi.middleware.cors import CORSMiddleware 4 | from fastapi.openapi.utils import get_openapi 5 | import logging 6 | import sys 7 | 8 | from app.core.config import settings 9 | from app.api import read, search 10 | from app.core.security import API_KEY_NAME 11 | 12 | 13 | # Configure logging 14 | def setup_logging(): 15 | """Set up logging configuration.""" 16 | log_level = logging.DEBUG if settings.debug else logging.INFO 17 | 18 | # Configure root logger 19 | logging.basicConfig( 20 | level=log_level, 21 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', 22 | datefmt='%Y-%m-%d %H:%M:%S', 23 | handlers=[logging.StreamHandler(sys.stdout)] 24 | ) 25 | 26 | # Set levels for external libraries 
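    # (uvicorn's access/error loggers and aiohttp are kept at WARNING/ERROR below so per-request noise does not drown out application-level messages, even when DEBUG is enabled)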
27 | logging.getLogger("uvicorn").setLevel(logging.WARNING) 28 | logging.getLogger("uvicorn.access").setLevel(logging.WARNING) 29 | logging.getLogger("uvicorn.error").setLevel(logging.ERROR) 30 | logging.getLogger("aiohttp").setLevel(logging.WARNING) 31 | 32 | return logging.getLogger("app") 33 | 34 | # Initialize logger 35 | logger = setup_logging() 36 | 37 | # Create FastAPI app 38 | app = FastAPI( 39 | title=settings.app_name, 40 | version="1.0.0", 41 | description="SURF API for web content retrieval and search", 42 | docs_url="/docs", 43 | redoc_url="/redoc", 44 | debug=settings.debug 45 | ) 46 | 47 | # Add CORS middleware 48 | app.add_middleware( 49 | CORSMiddleware, 50 | allow_origins=["*"], # In production, restrict to specific origins 51 | allow_credentials=True, 52 | allow_methods=["*"], 53 | allow_headers=["*"], 54 | ) 55 | 56 | # Include routers 57 | app.include_router(read.router) 58 | app.include_router(search.router) 59 | 60 | # Custom OpenAPI schema to document API key usage 61 | def custom_openapi(): 62 | if app.openapi_schema: 63 | return app.openapi_schema 64 | 65 | openapi_schema = get_openapi( 66 | title=app.title, 67 | version=app.version, 68 | description=app.description, 69 | routes=app.routes, 70 | ) 71 | 72 | # Add API key security scheme 73 | openapi_schema["components"] = openapi_schema.get("components", {}) 74 | openapi_schema["components"]["securitySchemes"] = { 75 | "APIKeyHeader": { 76 | "type": "apiKey", 77 | "in": "header", 78 | "name": API_KEY_NAME, 79 | "description": f"API key to authenticate requests. Example: {API_KEY_NAME}: YOUR_API_KEY" 80 | } 81 | } 82 | 83 | # Add security requirement to all endpoints 84 | if settings.security.auth_enabled: 85 | openapi_schema["security"] = [{"APIKeyHeader": []}] 86 | 87 | # Add note about authentication to description 88 | auth_note = f""" 89 | ## Authentication 90 | 91 | This API requires authentication using an API key. Include your API key in the `{API_KEY_NAME}` header with your requests. 92 | 93 | Example: `{API_KEY_NAME}: YOUR_API_KEY` 94 | """ 95 | 96 | openapi_schema["info"]["description"] = app.description + auth_note 97 | 98 | app.openapi_schema = openapi_schema 99 | return app.openapi_schema 100 | 101 | app.openapi = custom_openapi 102 | 103 | # Global exception handler 104 | @app.exception_handler(Exception) 105 | async def global_exception_handler(request: Request, exc: Exception): 106 | logger.error(f"Unhandled exception: {str(exc)}", exc_info=True) 107 | return JSONResponse( 108 | status_code=500, 109 | content={"detail": f"An unexpected error occurred: {str(exc)}"} 110 | ) 111 | 112 | # Root route 113 | @app.get("/", tags=["Root"]) 114 | async def root(): 115 | """ 116 | Root endpoint returns basic information about the API 117 | """ 118 | auth_status = "enabled" if settings.security.auth_enabled else "disabled" 119 | return { 120 | "name": settings.app_name, 121 | "version": "1.0.0", 122 | "description": "API for web content retrieval and search", 123 | "auth_status": auth_status, 124 | "endpoints": { 125 | "read": "/read/{url}", 126 | "search": "/search?q={query}" 127 | } 128 | } 129 | 130 | # Display API keys in debug mode when server starts 131 | @app.on_event("startup") 132 | async def startup_event(): 133 | if settings.debug and settings.security.auth_enabled: 134 | keys_info = "\n".join([f"- {key}" for key in settings.security.api_keys]) 135 | print(f"\n\nAuth is ENABLED. 
Available API key(s):\n{keys_info}\n") 136 | if settings.security.default_key in settings.security.api_keys: 137 | print(f"Default key was auto-generated. Use this for testing: {settings.security.default_key}\n\n") 138 | elif settings.debug: 139 | print("\n\nAuth is DISABLED. No API key required for requests.\n\n") -------------------------------------------------------------------------------- /app/utils/brave.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import json 3 | from typing import List, Dict, Any, Optional 4 | from app.core.config import settings 5 | 6 | 7 | class BraveSearchClient: 8 | """Client for interacting with Brave Search API.""" 9 | 10 | def __init__(self): 11 | """Initialize Brave Search client with configured settings.""" 12 | self.api_endpoint = "https://api.search.brave.com/res/v1/web/search" 13 | self.api_key = settings.search.brave.api_key 14 | self.timeout = settings.search.brave.timeout 15 | 16 | if not self.api_key: 17 | print("WARNING: Brave Search API key is not configured. API calls will fail.") 18 | 19 | async def search(self, query: str, num_results: int = None, language: str = None, time_range: str = None) -> list: 20 | """ 21 | Perform a search query against the Brave Search API. 22 | 23 | Args: 24 | query: The search query 25 | num_results: Number of results to return, defaults to configured max_results 26 | language: Language code for search results (e.g., en-US, fr-FR) 27 | time_range: Optional time filter (day, week, month, year) 28 | 29 | Returns: 30 | List of search results 31 | """ 32 | if not self.api_key: 33 | print("ERROR: Brave Search API key is not configured") 34 | return [] 35 | 36 | if num_results is None: 37 | num_results = settings.search.brave.max_results 38 | 39 | # Prepare search parameters 40 | params = { 41 | "q": query, 42 | "count": num_results 43 | } 44 | 45 | # Add language if specified 46 | if language: 47 | params["country"] = self._extract_country_code(language) 48 | 49 | # Add time range if specified 50 | if time_range: 51 | params["freshness"] = self._convert_time_range(time_range) 52 | 53 | # Set up timeout 54 | timeout = aiohttp.ClientTimeout(total=self.timeout) 55 | 56 | try: 57 | async with aiohttp.ClientSession(timeout=timeout) as session: 58 | try: 59 | # Make the request 60 | async with session.get( 61 | self.api_endpoint, 62 | params=params, 63 | headers={ 64 | "Accept": "application/json", 65 | "X-Subscription-Token": self.api_key 66 | } 67 | ) as response: 68 | if response.status != 200: 69 | error_text = await response.text() 70 | print(f"Brave Search API error: Status {response.status}, Response: {error_text[:200]}") 71 | raise ValueError(f"Brave Search failed with status code: {response.status}") 72 | 73 | try: 74 | response_data = await response.json() 75 | except Exception as e: 76 | error_text = await response.text() 77 | print(f"Failed to parse Brave Search JSON response: {str(e)}") 78 | print(f"Response text: {error_text[:200]}") 79 | return [] 80 | 81 | # Process and normalize results 82 | return self._process_brave_results(response_data) 83 | 84 | except aiohttp.ClientError as e: 85 | print(f"Brave Search connection error: {str(e)}") 86 | return [] 87 | 88 | except Exception as e: 89 | print(f"Brave Search error: {str(e)}") 90 | return [] 91 | 92 | def _process_brave_results(self, response_data: Dict[str, Any]) -> List[Dict[str, str]]: 93 | """ 94 | Process Brave Search API response to extract and normalize search results. 
95 | 96 | Args: 97 | response_data: JSON response from Brave Search API 98 | 99 | Returns: 100 | List of search results with title, URL, and snippet 101 | """ 102 | results = [] 103 | 104 | web_results = response_data.get("web", {}).get("results", []) 105 | for result in web_results: 106 | results.append({ 107 | "title": result.get("title", ""), 108 | "url": result.get("url", ""), 109 | "snippet": result.get("description", "") 110 | }) 111 | 112 | return results 113 | 114 | def _extract_country_code(self, language: str) -> str: 115 | """ 116 | Extract country code from language code. 117 | 118 | Args: 119 | language: Language code (e.g., 'en-US') 120 | 121 | Returns: 122 | Country code (e.g., 'US') 123 | """ 124 | if "-" in language: 125 | return language.split("-")[1].upper() 126 | return "US" # Default 127 | 128 | def _convert_time_range(self, time_range: str) -> str: 129 | """ 130 | Convert time range to Brave Search API time range parameter. 131 | 132 | Args: 133 | time_range: Time range (day, week, month, year) 134 | 135 | Returns: 136 | Brave Search time range parameter 137 | """ 138 | time_map = { 139 | "day": "pd", # past day 140 | "week": "pw", # past week 141 | "month": "pm", # past month 142 | "year": "py" # past year 143 | } 144 | return time_map.get(time_range, "") -------------------------------------------------------------------------------- /app/utils/duckduckgo.py: -------------------------------------------------------------------------------- 1 | import aiohttp 2 | import re 3 | import json 4 | from typing import List, Dict, Any 5 | from urllib.parse import unquote, urlencode 6 | from app.core.config import settings 7 | 8 | 9 | class DuckDuckGoClient: 10 | """Client for interacting with DuckDuckGo search engine without relying on additional libraries.""" 11 | 12 | def __init__(self): 13 | """Initialize DuckDuckGo client with configured settings.""" 14 | self.search_url = "https://html.duckduckgo.com/html/" 15 | self.timeout = settings.search.duckduckgo.timeout 16 | 17 | async def search(self, query: str, num_results: int = None, language: str = None, time_range: str = None) -> list: 18 | """ 19 | Perform a search query against DuckDuckGo. 
20 | 21 | Args: 22 | query: The search query 23 | num_results: Number of results to return, defaults to configured max_results 24 | language: Language code for search results (e.g., en-US, fr-FR) 25 | time_range: Optional time filter (day, week, month, year) 26 | 27 | Returns: 28 | List of search results 29 | """ 30 | if num_results is None: 31 | num_results = settings.search.duckduckgo.max_results 32 | 33 | # Prepare search parameters 34 | params = { 35 | "q": query, 36 | "kl": self._convert_language(language) if language else "wt-wt", # Default to international 37 | } 38 | 39 | # Add time range if specified 40 | if time_range: 41 | params["df"] = self._convert_time_range(time_range) 42 | 43 | # Set up timeout 44 | timeout = aiohttp.ClientTimeout(total=self.timeout) 45 | 46 | try: 47 | async with aiohttp.ClientSession(timeout=timeout) as session: 48 | try: 49 | # Make the request 50 | async with session.post( 51 | self.search_url, 52 | data=params, 53 | headers={ 54 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", 55 | "Accept": "text/html,application/xhtml+xml,application/xml", 56 | } 57 | ) as response: 58 | if response.status != 200: 59 | error_text = await response.text() 60 | print(f"DuckDuckGo error: Status {response.status}, Response: {error_text[:200]}") 61 | raise ValueError(f"DuckDuckGo search failed with status code: {response.status}") 62 | 63 | html_content = await response.text() 64 | 65 | # Parse the HTML response 66 | results = self._parse_html_results(html_content) 67 | return results[:num_results] 68 | 69 | except aiohttp.ClientError as e: 70 | print(f"DuckDuckGo connection error: {str(e)}") 71 | return [] 72 | 73 | except Exception as e: 74 | print(f"DuckDuckGo search error: {str(e)}") 75 | return [] 76 | 77 | def _parse_html_results(self, html_content: str) -> List[Dict[str, str]]: 78 | """ 79 | Parse the HTML response from DuckDuckGo to extract search results. 80 | 81 | Args: 82 | html_content: HTML content from DuckDuckGo 83 | 84 | Returns: 85 | List of search results with title, URL, and snippet 86 | """ 87 | results = [] 88 | 89 | try: 90 | # Extract titles 91 | title_matches = re.findall(r'([^<]+)', html_content) 92 | 93 | # Extract snippets 94 | snippet_matches = re.findall(r'([^<]+(?:<[^>]+>[^<]+)*)', html_content) 95 | 96 | # Process results 97 | for i, (url, title) in enumerate(title_matches): 98 | snippet = "" 99 | if i < len(snippet_matches): 100 | # Remove HTML tags from snippet 101 | snippet = re.sub(r'<[^>]+>', ' ', snippet_matches[i]) 102 | 103 | # Clean up title and snippet 104 | title = title.strip() 105 | snippet = snippet.strip() 106 | 107 | # Replace HTML entities 108 | for entity, char in [('"', '"'), ('&', '&'), ('<', '<'), ('>', '>'), (''', "'")]: 109 | title = title.replace(entity, char) 110 | snippet = snippet.replace(entity, char) 111 | 112 | results.append({ 113 | "title": title, 114 | "url": url, 115 | "snippet": snippet 116 | }) 117 | 118 | return results 119 | except Exception as e: 120 | print(f"Error parsing DuckDuckGo HTML: {str(e)}") 121 | return [] 122 | 123 | def _convert_language(self, language: str) -> str: 124 | """ 125 | Convert ISO language code to DuckDuckGo's language code. 
126 | 127 | Args: 128 | language: Language code (e.g., 'en-US') 129 | 130 | Returns: 131 | DuckDuckGo language code 132 | """ 133 | # Map common language codes to DuckDuckGo region codes 134 | language_map = { 135 | "en-US": "us-en", 136 | "en-GB": "uk-en", 137 | "en-CA": "ca-en", 138 | "fr-FR": "fr-fr", 139 | "de-DE": "de-de", 140 | "es-ES": "es-es", 141 | "it-IT": "it-it", 142 | "ja-JP": "jp-jp", 143 | } 144 | 145 | # Default to world-wide if not found 146 | return language_map.get(language, "wt-wt") 147 | 148 | def _convert_time_range(self, time_range: str) -> str: 149 | """ 150 | Convert time range to DuckDuckGo's time range parameter. 151 | 152 | Args: 153 | time_range: Time range (day, week, month, year) 154 | 155 | Returns: 156 | DuckDuckGo time range parameter 157 | """ 158 | time_map = { 159 | "day": "d", 160 | "week": "w", 161 | "month": "m", 162 | "year": "y" 163 | } 164 | return time_map.get(time_range, "") -------------------------------------------------------------------------------- /app/utils/html_parser.py: -------------------------------------------------------------------------------- 1 | import re 2 | from bs4 import BeautifulSoup 3 | import markdown 4 | from urllib.parse import urljoin 5 | 6 | 7 | class HTMLCleaner: 8 | """HTML cleaner and converter for content retrieval.""" 9 | 10 | @staticmethod 11 | async def clean_html(html_content: str, base_url: str = '') -> BeautifulSoup: 12 | """ 13 | Clean HTML by removing unnecessary elements. 14 | 15 | Args: 16 | html_content: The raw HTML content 17 | base_url: Base URL for resolving relative links 18 | 19 | Returns: 20 | BeautifulSoup object with clean HTML 21 | """ 22 | # Create BeautifulSoup object 23 | soup = BeautifulSoup(html_content, 'html.parser') 24 | 25 | # Remove script, style, svg, img, iframe, form elements 26 | for element in soup.find_all(['script', 'style', 'svg', 'iframe', 'form', 27 | 'noscript', 'canvas', 'video', 'audio', 'source']): 28 | element.decompose() 29 | 30 | # Remove comments 31 | for comment in soup.find_all(string=lambda text: isinstance(text, str) and ' B[Web Content Retrieval] 39 | A --> C[HTML Processing] 40 | A --> D[Search Integration] 41 | B --> E[WebFetcher] 42 | C --> F[HTMLCleaner] 43 | D --> G[SearXNGClient] 44 | F --> H[Content Transformation] 45 | G --> I[Web Search] 46 | ``` 47 | 48 | If the diagram above doesn't display properly, you can view it here: 49 | 50 | ![SURF Architecture](https://mermaid.ink/img/pako:eNptUk1rwzAM_StGUAg0bbfDDiPrDmVQxt52CrZiC1Y7ne2MrOS_T3bbj7GC_KT33pM-3JFwZJQY4WWMixy0wd2aV0i5x5Jx3HzNdsjJxCdOaQDmc680kjn2SukmwzhUhltaK2cVOBDOCO64KUjhWIc2daLGJNQWkdOaDID-GpIsZXVPczqJJ4LbUvG0KDR30pLSOnZWMkeRI5pSJVibqVMOrfwx_3NeaMP3F35cPXHbLywQc3bFO-Wde2yyyJmjtVTqR3aWNezp_oi_L1fHuSCKEKW2eISTMlIUokPaYem7_xB7Sb_AjpHfXx0s4tyAMcf5OyyFW7IC6KZvsNJ5nBRXuJcmQtMdTI8nqGWFOoJZL2rQSuJriH6LA8MrmlFSzMmhcMlGBsdfJKCxZA) 51 | 52 | ### Components 53 | 54 | 1. **FastAPI Application Layer** 55 | - Handles HTTP requests and routing 56 | - Manages API endpoints and request validation 57 | - Provides OpenAPI documentation 58 | 59 | 2. **Web Content Retrieval** 60 | - `WebFetcher`: Fetches content from web URLs 61 | - Handles different content types, timeouts, and errors 62 | - Implements rate limiting and size constraints 63 | 64 | 3. **HTML Processing** 65 | - `HTMLCleaner`: Transforms HTML into clean, structured content 66 | - Handles tables, lists, code blocks, and other formatting 67 | - Extracts key metadata like page title 68 | 69 | 4. 
**Search Integration** 70 | - `SearchClientFactory`: Provides the appropriate search client based on configuration 71 | - Multiple search provider support: 72 | - `DuckDuckGoClient`: Default search provider with no API key required 73 | - `SearXNGClient`: Private metasearch engine option 74 | - `BraveSearchClient`: Commercial search API option 75 | - Normalized results across all providers for consistent API response 76 | 77 | 5. **Configuration Management** 78 | - Environment-based configuration 79 | - Sensible defaults with override capability 80 | 81 | ## Data Flow 82 | 83 | ### Read Endpoint Flow 84 | 85 | 1. Client sends request to `/read/{url}?format=[json|md]` 86 | 2. URL is validated and normalized 87 | 3. `WebFetcher` retrieves content from the URL 88 | 4. Content type is detected and appropriate processor is selected 89 | 5. For HTML content: 90 | - `HTMLCleaner` removes unnecessary elements 91 | - Content is transformed to markdown 92 | - Title, URL, and processed content are gathered 93 | 6. For non-HTML content: 94 | - Content is processed according to its type 95 | - JSON is pretty-printed, text is passed through, etc. 96 | 7. Response is formatted according to requested format and returned 97 | 98 | Sequence diagram for the read endpoint: 99 | 100 | ```mermaid 101 | sequenceDiagram 102 | Client->>+SURF API: GET /read/{url} 103 | SURF API->>+WebFetcher: fetch_url(url) 104 | WebFetcher->>+External Website: HTTP GET 105 | External Website-->>-WebFetcher: Raw Content 106 | WebFetcher-->>-SURF API: content, content_type 107 | 108 | alt is HTML content 109 | SURF API->>+HTMLCleaner: process_html(content, url) 110 | HTMLCleaner->>HTMLCleaner: clean_html() 111 | HTMLCleaner->>HTMLCleaner: extract_title() 112 | HTMLCleaner->>HTMLCleaner: html_to_markdown() 113 | HTMLCleaner-->>-SURF API: title, url, content 114 | else is other content 115 | SURF API->>+SURF API: extract_text_content(content, content_type, url) 116 | SURF API-->>-SURF API: title, url, content 117 | end 118 | 119 | SURF API-->>-Client: Response (JSON or Markdown) 120 | ``` 121 | 122 | If the diagram above doesn't display properly, you can view it here: 123 | 124 | ![Read Endpoint Flow](https://mermaid.ink/img/pako:eNqNksFuwjAMhl_F-AyI0sJt3Ci0ExJCQrvsdGhJtqJBkyrJQEXdd-9QukGhTNMusfL7__zbindqFalSG-aTT9alHiR_4M6nHUr-h63j-PF9sqNK5jFxyj1wP3fKCznhmCVXc_RRa_5h25sYD0fBCLPUC8ecW01Nzux0cNrHwPIYrEYpCqwxHEgbxVOZWMdhGDr8klrAPl4lNF0ZmAJZ8rRVg6yqN0nTHpAz1KSsLWfLi-8DUXRKlUCqJGufHpGBPT83YJeHj7-Rn-M6S6ioiFaQnYCe3XTLSPbGCqwOXVuPHbgOGxBP0mUxIY9jzlTAA4Uy1G3TxjJXzv8v1ZdQaUZrOXa1UwE5HhpfO5mYZGnJ60MHY6BjMjvkpyKVGi0Ky5gUFTcvWs1UqRzN9oDk0kZlcV_KkVVBa_W3W2LH82PKNxzf_QJx9-KV) 125 | 126 | ### Search Endpoint Flow 127 | 128 | When a user sends a request to the `/search` endpoint: 129 | 130 | 1. Request is received and validated 131 | 2. Query is validated 132 | 3. Based on configuration, the appropriate search client is selected via `SearchClientFactory` 133 | 4. The search client performs the search with the query 134 | 5. Search results are fetched, normalized, and returned 135 | 6. 
Results are formatted according to requested format and returned 136 | 137 | Sequence diagram for the search endpoint: 138 | 139 | ```mermaid 140 | sequenceDiagram 141 | Client->>+SURF API: GET /search?q={query} 142 | SURF API->>+SearchClientFactory: get_client() 143 | SearchClientFactory-->>-SURF API: Selected Search Client 144 | SURF API->>+Search Client: search(query) 145 | alt SearXNG Provider 146 | Search Client->>+SearXNG: HTTP GET with query 147 | SearXNG-->>-Search Client: Search Results 148 | else DuckDuckGo Provider 149 | Search Client->>+DuckDuckGo: HTTP Request 150 | DuckDuckGo-->>-Search Client: Search Results 151 | else Brave Search Provider 152 | Search Client->>+Brave API: HTTP GET with API key 153 | Brave API-->>-Search Client: Search Results 154 | end 155 | Search Client-->>-SURF API: Normalized Results 156 | SURF API-->>-Client: Response (JSON or Markdown) 157 | ``` 158 | 159 | If the diagram above doesn't display properly, you can view it here: 160 | 161 | ![Search Endpoint Flow](https://mermaid.ink/img/pako:eNplkM1qwzAQhF9F7CoQJ3ZyyCF03IMphULvvQhrbSuw_kBaQ4l59yqxnaTsTbszn2Z2RzoQMaTK-RgdPXF2wSXhgJTcY-c5fn7P9ihJaZOg1AN3cuZe0AgHUPW9D0nrgNO2TWXXt-a0VIGPSGscp4GPylk_WN85DOXg0GJKxI7k2eCY4Jakt4mSx8_HZsFhUzP07jUIx3CmkRHshavPMfvosNWS9BzjRLi6aGm5oLukeXxdxz7XT92zewtT_wO8MIsnfaTjnDPP0EGNZpIz5MzOEudnk_voG93UouKlk55k0K1b4znWEe3dqI7I5-Bs-Dy0WteK5VBs5Izpj_sD8ZNM8Q) 162 | 163 | ## Key Classes and Interactions 164 | 165 | ### `WebFetcher` 166 | 167 | Responsible for retrieving content from URLs: 168 | 169 | ```python 170 | class WebFetcher: 171 | MAX_CONTENT_SIZE = 10 * 1024 * 1024 172 | TEXT_CONTENT_TYPES = [...] 173 | 174 | @staticmethod 175 | async def fetch_url(url: str, timeout: int = 30) -> Optional[Tuple[str, str]]: 176 | # ...implementation... 177 | ``` 178 | 179 | ### `HTMLCleaner` 180 | 181 | Processes HTML content for optimal LLM consumption: 182 | 183 | ```python 184 | class HTMLCleaner: 185 | @staticmethod 186 | async def clean_html(html_content: str, base_url: str = '') -> BeautifulSoup: 187 | # ...implementation... 188 | 189 | @staticmethod 190 | async def html_to_markdown(soup: BeautifulSoup) -> str: 191 | # ...implementation... 192 | 193 | @classmethod 194 | async def process_html(cls, html_content: str, base_url: str = '') -> dict: 195 | # ...implementation... 196 | ``` 197 | 198 | ### `SearchClientFactory` 199 | 200 | Factory for creating search client instances based on configuration: 201 | 202 | ```python 203 | class SearchClientFactory: 204 | @staticmethod 205 | def get_client() -> Union[SearXNGClient, DuckDuckGoClient, BraveSearchClient]: 206 | # ...implementation... 207 | ``` 208 | 209 | ### `SearXNGClient`, `DuckDuckGoClient`, and `BraveSearchClient` 210 | 211 | Handle web search functionality through different providers: 212 | 213 | ```python 214 | class SearXNGClient: 215 | def __init__(self): 216 | # ...implementation... 217 | async def search(self, query: str, num_results: int = None) -> list: 218 | # ...implementation... 219 | 220 | class DuckDuckGoClient: 221 | def __init__(self): 222 | # ...implementation... 223 | async def search(self, query: str, num_results: int = None) -> list: 224 | # ...implementation... 225 | 226 | class BraveSearchClient: 227 | def __init__(self): 228 | # ...implementation... 229 | async def search(self, query: str, num_results: int = None) -> list: 230 | # ...implementation... 231 | ``` 232 | 233 | ## Optimization Strategies 234 | 235 | SURF implements several optimization strategies: 236 | 237 | 1. 
**Asynchronous Processing**: All I/O operations use `async`/`await` for optimal performance 238 | 2. **Content Cleaning**: Removes unnecessary elements from HTML to reduce tokens used by LLMs 239 | 3. **Timeout Management**: Configurable timeouts prevent hanging on slow websites 240 | 4. **Size Limits**: Content size limits prevent memory issues with large pages 241 | 5. **Text-Only Focus**: Binary content is recognized and handled appropriately 242 | 6. **Error Handling**: Comprehensive error handling with graceful fallbacks 243 | 244 | ## Security Considerations 245 | 246 | 1. **URL Validation**: All URLs are validated before processing 247 | 2. **Content Type Checking**: Only appropriate content types are processed 248 | 3. **Size Limitations**: Prevents denial-of-service through massive content 249 | 4. **Authentication**: Support for SearXNG authentication 250 | 5. **No Direct Content Execution**: Content is never executed, only processed as text 251 | 252 | ## Extensibility 253 | 254 | SURF is designed for extensibility: 255 | 256 | 1. **Modular Design**: Each component can be replaced or extended 257 | 2. **Clear Interfaces**: Components interact through well-defined interfaces 258 | 3. **Configuration System**: Extensive configuration options without code changes 259 | 4. **Content Type Handlers**: System for handling different content types 260 | 5. **Format Converters**: Flexible output format system 261 | 262 | ## Deployment Considerations 263 | 264 | ### Resource Requirements 265 | 266 | - **CPU**: Minimal for API serving, moderate for content processing 267 | - **Memory**: Depends on maximum content size (default 10MB) 268 | - **Disk**: Minimal (~50MB for code and dependencies) 269 | - **Network**: Moderate bandwidth for web content retrieval 270 | 271 | ### Scaling 272 | 273 | SURF can be scaled in several ways: 274 | 275 | 1. **Horizontal Scaling**: Deploy multiple instances behind a load balancer 276 | 2. **Vertical Scaling**: Increase resources for handling more concurrent requests 277 | 3. **Caching Layer**: Add Redis or similar for caching frequently accessed content 278 | 279 | ### Monitoring 280 | 281 | Important metrics to monitor: 282 | 283 | 1. **Request Latency**: Time to process requests 284 | 2. **Error Rates**: Failed requests by type 285 | 3. **Content Size**: Distribution of retrieved content sizes 286 | 4. **External Service Health**: Search provider availability (SearXNG, DuckDuckGo, Brave) 287 | 5. **Memory Usage**: Especially during large content processing -------------------------------------------------------------------------------- /docs/integration_guide.md: -------------------------------------------------------------------------------- 1 | # SURF Integration Guide 2 | 3 | This guide provides detailed instructions for integrating SURF API with various LLM systems and frameworks. 
4 | 5 | ## Table of Contents 6 | 7 | - [Basic Integration Concepts](#basic-integration-concepts) 8 | - [Integration with LangChain](#integration-with-langchain) 9 | - [Integration with LlamaIndex](#integration-with-llamaindex) 10 | - [Direct Integration with OpenAI API](#direct-integration-with-openai-api) 11 | - [Integration with Hugging Face Transformers](#integration-with-hugging-face-transformers) 12 | - [Integration with Model Context Protocol (MCP)](#integration-with-model-context-protocol-mcp) 13 | - [Custom LLM Integration](#custom-llm-integration) 14 | - [Troubleshooting](#troubleshooting) 15 | 16 | ## Basic Integration Concepts 17 | 18 | SURF API provides two primary endpoints that can be used with any LLM system: 19 | 20 | 1. **Content Reading** (`/read/{url}`): Fetches and processes web content for LLM consumption 21 | 2. **Web Search** (`/search?q={query}`): Performs web searches and returns relevant results 22 | 23 | The most common integration patterns are: 24 | 25 | ### 1. Tool-Based Integration 26 | 27 | Configure SURF endpoints as tools that your LLM can use to access web information: 28 | 29 | ```python 30 | tools = [ 31 | { 32 | "type": "function", 33 | "function": { 34 | "name": "search_web", 35 | "description": "Search the web for information on a topic", 36 | "parameters": { 37 | "type": "object", 38 | "properties": { 39 | "query": { 40 | "type": "string", 41 | "description": "The search query" 42 | }, 43 | "max_results": { 44 | "type": "integer", 45 | "description": "Maximum number of results to return (1-10)", 46 | "default": 5 47 | } 48 | }, 49 | "required": ["query"] 50 | } 51 | } 52 | }, 53 | { 54 | "type": "function", 55 | "function": { 56 | "name": "read_webpage", 57 | "description": "Read and process a webpage for relevant information", 58 | "parameters": { 59 | "type": "object", 60 | "properties": { 61 | "url": { 62 | "type": "string", 63 | "description": "The URL of the webpage to read" 64 | } 65 | }, 66 | "required": ["url"] 67 | } 68 | } 69 | } 70 | ] 71 | ``` 72 | 73 | ### 2. Retrieval Augmentation (RAG) 74 | 75 | Use SURF as a retriever in a Retrieval-Augmented Generation system: 76 | 77 | ```python 78 | def retrieve_from_web(query): 79 | # Search the web using SURF 80 | search_results = requests.get( 81 | "http://localhost:8000/search", 82 | params={"q": query, "format": "json", "max_results": 3} 83 | ).json() 84 | 85 | # For each result, fetch the content 86 | documents = [] 87 | for result in search_results["results"]: 88 | try: 89 | content = requests.get( 90 | f"http://localhost:8000/read/{result['url']}", 91 | params={"format": "json"} 92 | ).json() 93 | documents.append({ 94 | "title": content["title"], 95 | "content": content["content"], 96 | "url": content["url"] 97 | }) 98 | except Exception as e: 99 | print(f"Error retrieving {result['url']}: {e}") 100 | 101 | return documents 102 | ``` 103 | 104 | ### 3. Direct Context Injection 105 | 106 | Add web content directly into your prompts: 107 | 108 | ```python 109 | def generate_response_with_web_info(user_query): 110 | # Search for relevant information 111 | search_results = requests.get( 112 | "http://localhost:8000/search", 113 | params={"q": user_query, "format": "json", "max_results": 2} 114 | ).json() 115 | 116 | # Create prompt with context 117 | prompt = f""" 118 | User query: {user_query} 119 | 120 | Relevant information from the web: 121 | 122 | """ 123 | 124 | for i, result in enumerate(search_results["results"], 1): 125 | prompt += f"{i}. 
{result['title']} ({result['url']}): {result['snippet']}\n\n" 126 | 127 | prompt += "Based on the above information, please provide a comprehensive answer to the user's query." 128 | 129 | # Send to LLM 130 | response = llm.generate(prompt) 131 | return response 132 | ``` 133 | 134 | ## Integration with LangChain 135 | 136 | [LangChain](https://github.com/langchain-ai/langchain) is a popular framework for developing applications with LLMs. Here's how to integrate SURF: 137 | 138 | ### Creating Custom Tools 139 | 140 | ```python 141 | from langchain.tools import Tool 142 | from langchain.agents import AgentType, initialize_agent 143 | from langchain.llms import OpenAI 144 | import requests 145 | 146 | # Define SURF tools 147 | def search_web(query): 148 | """Search the web for information.""" 149 | response = requests.get( 150 | "http://localhost:8000/search", 151 | params={"q": query, "format": "json", "max_results": 5} 152 | ) 153 | return response.json() 154 | 155 | def read_webpage(url): 156 | """Read and process a webpage.""" 157 | if not url.startswith(("http://", "https://")): 158 | url = f"https://{url}" 159 | response = requests.get( 160 | f"http://localhost:8000/read/{url}", 161 | params={"format": "json"} 162 | ) 163 | return response.json() 164 | 165 | # Create LangChain tools 166 | tools = [ 167 | Tool( 168 | name="SearchWeb", 169 | func=search_web, 170 | description="Useful for searching the web for information on a topic. Input should be a search query." 171 | ), 172 | Tool( 173 | name="ReadWebpage", 174 | func=read_webpage, 175 | description="Useful for reading and extracting information from a webpage. Input should be a URL." 176 | ) 177 | ] 178 | 179 | # Initialize agent 180 | llm = OpenAI(temperature=0) 181 | agent = initialize_agent( 182 | tools, 183 | llm, 184 | agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, 185 | verbose=True 186 | ) 187 | 188 | # Use the agent 189 | result = agent.run("What are the latest developments in quantum computing?") 190 | ``` 191 | 192 | ### Creating a Custom Retriever 193 | 194 | ```python 195 | from langchain.retrievers import BaseRetriever 196 | from langchain.schema import Document 197 | import requests 198 | 199 | class SURFRetriever(BaseRetriever): 200 | """Retriever that uses SURF to search the web.""" 201 | 202 | def __init__(self, max_results=5): 203 | super().__init__() 204 | self.max_results = max_results 205 | 206 | def _get_relevant_documents(self, query): 207 | # Search for information 208 | search_results = requests.get( 209 | "http://localhost:8000/search", 210 | params={"q": query, "format": "json", "max_results": self.max_results} 211 | ).json() 212 | 213 | # Convert to documents 214 | documents = [] 215 | for result in search_results["results"]: 216 | documents.append( 217 | Document( 218 | page_content=result["snippet"], 219 | metadata={"title": result["title"], "url": result["url"]} 220 | ) 221 | ) 222 | 223 | return documents 224 | ``` 225 | 226 | ## Integration with LlamaIndex 227 | 228 | [LlamaIndex](https://github.com/jerryjliu/llama_index) is a data framework for LLM applications. 
Here's how to integrate SURF: 229 | 230 | ### Creating a Custom Reader 231 | 232 | ```python 233 | from llama_index.readers.base import BaseReader 234 | from llama_index.schema import Document 235 | import requests 236 | 237 | class SURFReader(BaseReader): 238 | """Reader that fetches web content using SURF.""" 239 | 240 | def load_data(self, urls): 241 | """Load data from the given URLs.""" 242 | documents = [] 243 | 244 | for url in urls: 245 | try: 246 | response = requests.get( 247 | f"http://localhost:8000/read/{url}", 248 | params={"format": "json"} 249 | ) 250 | data = response.json() 251 | 252 | documents.append( 253 | Document( 254 | text=data["content"], 255 | metadata={ 256 | "title": data["title"], 257 | "url": data["url"] 258 | } 259 | ) 260 | ) 261 | except Exception as e: 262 | print(f"Error loading {url}: {e}") 263 | 264 | return documents 265 | ``` 266 | 267 | ### Creating a Custom Retriever 268 | 269 | ```python 270 | from llama_index.retrievers import BaseRetriever 271 | from llama_index.schema import NodeWithScore, QueryBundle, TextNode 272 | import requests 273 | 274 | class SURFWebRetriever(BaseRetriever): 275 | """Retriever that searches the web using SURF.""" 276 | 277 | def __init__(self, max_results=5): 278 | """Initialize the retriever.""" 279 | self.max_results = max_results 280 | 281 | def _retrieve(self, query_bundle: QueryBundle): 282 | """Retrieve nodes given query.""" 283 | search_results = requests.get( 284 | "http://localhost:8000/search", 285 | params={"q": query_bundle.query_str, "format": "json", "max_results": self.max_results} 286 | ).json() 287 | 288 | nodes = [] 289 | for i, result in enumerate(search_results["results"]): 290 | node = NodeWithScore( 291 | node=TextNode( 292 | text=result["snippet"], 293 | metadata={ 294 | "title": result["title"], 295 | "url": result["url"] 296 | } 297 | ), 298 | score=1.0 / (i + 1) # Simple ranking based on position 299 | ) 300 | nodes.append(node) 301 | 302 | return nodes 303 | ``` 304 | 305 | ## Direct Integration with OpenAI API 306 | 307 | ### Function Calling with SURF 308 | 309 | ```python 310 | import openai 311 | import requests 312 | import json 313 | # Set your OpenAI API key 314 | openai.api_key = "your-api-key" 315 | 316 | # Define functions for web access 317 | functions = [ 318 | { 319 | "name": "search_web", 320 | "description": "Search the web for information on a specific topic", 321 | "parameters": { 322 | "type": "object", 323 | "properties": { 324 | "query": { 325 | "type": "string", 326 | "description": "The search query" 327 | }, 328 | "max_results": { 329 | "type": "integer", 330 | "description": "Maximum number of results to return", 331 | "default": 5 332 | } 333 | }, 334 | "required": ["query"] 335 | } 336 | }, 337 | { 338 | "name": "read_webpage", 339 | "description": "Read and process a webpage", 340 | "parameters": { 341 | "type": "object", 342 | "properties": { 343 | "url": { 344 | "type": "string", 345 | "description": "The URL of the webpage to read" 346 | } 347 | }, 348 | "required": ["url"] 349 | } 350 | } 351 | ] 352 | 353 | # Function implementations 354 | def search_web(query, max_results=5): 355 | response = requests.get( 356 | "http://localhost:8000/search", 357 | params={"q": query, "format": "json", "max_results": max_results} 358 | ) 359 | return response.json() 360 | 361 | def read_webpage(url): 362 | response = requests.get( 363 | f"http://localhost:8000/read/{url}", 364 | params={"format": "json"} 365 | ) 366 | return response.json() 367 | 368 | # Process user query 369 | def process_query(user_query): 370 | 
messages = [{"role": "user", "content": user_query}] 371 | 372 | while True: 373 | # Get response from OpenAI 374 | response = openai.ChatCompletion.create( 375 | model="gpt-3.5-turbo", 376 | messages=messages, 377 | functions=functions, 378 | function_call="auto" 379 | ) 380 | 381 | response_message = response["choices"][0]["message"] 382 | messages.append(response_message) 383 | 384 | # Check if function call is requested 385 | if response_message.get("function_call"): 386 | function_name = response_message["function_call"]["name"] 387 | function_args = json.loads(response_message["function_call"]["arguments"]) 388 | 389 | # Execute the function 390 | if function_name == "search_web": 391 | function_response = search_web( 392 | function_args.get("query"), 393 | function_args.get("max_results", 5) 394 | ) 395 | elif function_name == "read_webpage": 396 | function_response = read_webpage(function_args.get("url")) 397 | else: 398 | function_response = {"error": "Function not found"} 399 | 400 | # Add function response to messages 401 | messages.append({ 402 | "role": "function", 403 | "name": function_name, 404 | "content": json.dumps(function_response) 405 | }) 406 | else: 407 | # Return the final response 408 | return response_message["content"] 409 | ``` 410 | 411 | ## Integration with Hugging Face Transformers 412 | 413 | ### Using SURF with Hugging Face Pipeline 414 | 415 | ```python 416 | from transformers import pipeline 417 | import requests 418 | 419 | def get_web_content(url): 420 | """Fetch content from a webpage using SURF.""" 421 | response = requests.get( 422 | f"http://localhost:8000/read/{url}", 423 | params={"format": "json"} 424 | ) 425 | return response.json() 426 | 427 | # Initialize a summarization pipeline 428 | summarizer = pipeline("summarization", model="facebook/bart-large-cnn") 429 | 430 | # Get web content and summarize it 431 | def summarize_webpage(url): 432 | # Get content from SURF 433 | content = get_web_content(url) 434 | 435 | # Extract the text content 436 | text = content["content"] 437 | 438 | # Summarize in chunks if necessary (BART has a 1024 token limit) 439 | max_chunk_length = 1000 440 | chunks = [text[i:i+max_chunk_length] for i in range(0, len(text), max_chunk_length)] 441 | 442 | summaries = [] 443 | for chunk in chunks: 444 | summary = summarizer(chunk, max_length=150, min_length=30, do_sample=False) 445 | summaries.append(summary[0]["summary_text"]) 446 | 447 | return { 448 | "title": content["title"], 449 | "url": content["url"], 450 | "summary": " ".join(summaries) 451 | } 452 | ``` 453 | 454 | ## Integration with Model Context Protocol (MCP) 455 | 456 | [Model Context Protocol (MCP)](https://modelcontextprotocol.io) is an open standard developed by Anthropic that streamlines the integration of AI assistants with external data sources and tools. SURF can be implemented as an MCP server, allowing any MCP-compatible AI assistant to leverage its web search and content reading capabilities. 
457 | 458 | ### Creating a SURF MCP Server 459 | 460 | ```python 461 | from mcp import MCPServer, ServerSchema, RequestContext 462 | import requests 463 | import json 464 | 465 | # Define the schema for your SURF MCP server 466 | schema = ServerSchema( 467 | name="surf", 468 | description="Search utility and reading framework for web access", 469 | capabilities=[ 470 | { 471 | "name": "search_web", 472 | "description": "Search the web for information", 473 | "parameters": { 474 | "type": "object", 475 | "properties": { 476 | "query": { 477 | "type": "string", 478 | "description": "The search query" 479 | }, 480 | "max_results": { 481 | "type": "integer", 482 | "description": "Maximum number of results (1-10)", 483 | "default": 5 484 | } 485 | }, 486 | "required": ["query"] 487 | } 488 | }, 489 | { 490 | "name": "read_content", 491 | "description": "Fetch and process web content", 492 | "parameters": { 493 | "type": "object", 494 | "properties": { 495 | "url": { 496 | "type": "string", 497 | "description": "URL to read" 498 | } 499 | }, 500 | "required": ["url"] 501 | } 502 | } 503 | ] 504 | ) 505 | 506 | # Define handler functions for each capability 507 | async def search_web(context: RequestContext, params: dict): 508 | query = params.get("query") 509 | max_results = params.get("max_results", 5) 510 | 511 | # Call SURF API for search 512 | response = requests.get( 513 | "http://localhost:8000/search", 514 | params={"q": query, "format": "json", "max_results": max_results} 515 | ) 516 | 517 | return response.json() 518 | 519 | async def read_content(context: RequestContext, params: dict): 520 | url = params.get("url") 521 | 522 | # Call SURF API to read content 523 | response = requests.get( 524 | f"http://localhost:8000/read/{url}", 525 | params={"format": "json"} 526 | ) 527 | 528 | return response.json() 529 | 530 | # Create and start MCP server 531 | server = MCPServer(schema=schema) 532 | server.register_capability("search_web", search_web) 533 | server.register_capability("read_content", read_content) 534 | 535 | # Start the server 536 | server.start() 537 | ``` 538 | 539 | ### Using SURF with Anthropic's Claude 540 | 541 | Claude Desktop application supports MCP integration. Once you have your SURF MCP server running: 542 | 543 | 1. Open Claude Desktop 544 | 2. Go to Settings > Model Context Protocol 545 | 3. Add your local SURF MCP server 546 | 4. Start a new conversation and use the SURF capabilities: 547 | 548 | Example prompt: 549 | ``` 550 | Can you search for the latest news about quantum computing and summarize the key points? 551 | ``` 552 | 553 | When Claude recognizes it needs web information, it will use your SURF MCP server to search and retrieve content. 
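If Claude does not appear to pick up the capabilities, a quick way to isolate the problem is to exercise the handlers from the server sketch above directly, without any MCP client in the loop. The snippet below reuses the `search_web` and `read_content` coroutines defined earlier and assumes the SURF API is running on localhost:8000; since those handlers never touch their context argument, `None` is passed purely for testing.

```python
# Standalone check of the search_web and read_content handlers defined above.
# None stands in for the unused RequestContext; the query is a placeholder.
import asyncio

async def smoke_test():
    results = await search_web(None, {"query": "quantum computing news", "max_results": 3})
    for item in results["results"]:
        print(item["title"], "-", item["url"])

    first_url = results["results"][0]["url"]
    page = await read_content(None, {"url": first_url})
    print(page["title"])

asyncio.run(smoke_test())
```

If the handlers return results here but Claude still cannot use them, the issue is most likely in the MCP server registration or the Claude Desktop configuration rather than in SURF itself.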
554 | 555 | ### Benefits of MCP Integration 556 | 557 | - **Standardized Integration**: Connect SURF to any MCP-compatible AI assistant 558 | - **Enhanced Context**: Provide AI models with up-to-date web information 559 | - **Seamless User Experience**: Users interact naturally with the AI, which handles the web access behind the scenes 560 | - **Future-proof**: Join the growing MCP ecosystem with hundreds of tools and data sources 561 | 562 | ### Creating Custom MCP Clients 563 | 564 | You can create custom applications that leverage both SURF and MCP: 565 | 566 | ```python 567 | from mcp import MCPClient 568 | import asyncio 569 | 570 | async def main(): 571 | # Connect to SURF MCP server 572 | client = MCPClient("http://localhost:5000") 573 | 574 | # Call search capability 575 | search_results = await client.call( 576 | "search_web", 577 | {"query": "latest AI research", "max_results": 3} 578 | ) 579 | 580 | # Process and display results 581 | for i, result in enumerate(search_results["results"], 1): 582 | print(f"{i}. {result['title']} - {result['url']}") 583 | 584 | # Get full content of the first result 585 | if i == 1: 586 | content = await client.call( 587 | "read_content", 588 | {"url": result['url']} 589 | ) 590 | 591 | print(f"\nExcerpt from {content['title']}:\n") 592 | print(content['content'][:500] + "...\n") 593 | 594 | asyncio.run(main()) 595 | ``` 596 | 597 | ## Custom LLM Integration 598 | 599 | ### Creating a Proxy API for Local LLMs 600 | 601 | ```python 602 | from fastapi import FastAPI, Request 603 | import requests 604 | import os 605 | import json 606 | from transformers import pipeline 607 | 608 | # Initialize FastAPI 609 | app = FastAPI() 610 | 611 | # Initialize a text-generation pipeline with a local model 612 | generator = pipeline("text-generation", model="TheBloke/Llama-2-7B-Chat-GGUF", device_map="auto") 613 | 614 | # Initialize SURF client functions 615 | def search_web(query, max_results=5): 616 | response = requests.get( 617 | "http://localhost:8000/search", 618 | params={"q": query, "format": "json", "max_results": max_results} 619 | ) 620 | return response.json() 621 | 622 | def read_webpage(url): 623 | response = requests.get( 624 | f"http://localhost:8000/read/{url}", 625 | params={"format": "json"} 626 | ) 627 | return response.json() 628 | 629 | @app.post("/chat/completions") 630 | async def chat_completions(request: Request): 631 | data = await request.json() 632 | 633 | messages = data.get("messages", []) 634 | user_message = next((m for m in reversed(messages) if m["role"] == "user"), None) 635 | 636 | if not user_message: 637 | return {"error": "No user message found"} 638 | 639 | # Check if the message contains commands for web search or reading 640 | query = user_message["content"].lower() 641 | response_content = "" 642 | 643 | if "search for:" in query: 644 | search_query = query.split("search for:")[1].strip() 645 | search_results = search_web(search_query) 646 | 647 | # Format search results 648 | response_content = f"Here are the search results for '{search_query}':\n\n" 649 | for i, result in enumerate(search_results["results"], 1): 650 | response_content += f"{i}. 
{result['title']}\n {result['url']}\n {result['snippet']}\n\n" 651 | 652 | elif "read webpage:" in query: 653 | url = query.split("read webpage:")[1].strip() 654 | webpage_content = read_webpage(url) 655 | 656 | # Format webpage content 657 | response_content = f"Content from {webpage_content['title']} ({webpage_content['url']}):\n\n" 658 | response_content += webpage_content['content'][:1000] # Truncate for brevity 659 | response_content += "\n\n(Content truncated for brevity)" 660 | 661 | else: 662 | # Regular text generation for other queries 663 | prompt = "\n".join([f"{m['role']}: {m['content']}" for m in messages]) 664 | generated = generator(prompt, max_length=500, do_sample=True, temperature=0.7) 665 | response_content = generated[0]["generated_text"].split("assistant:")[-1].strip() 666 | 667 | return { 668 | "id": "chatcmpl-" + os.urandom(12).hex(), 669 | "object": "chat.completion", 670 | "created": int(time.time()), 671 | "model": "local-llm", 672 | "choices": [ 673 | { 674 | "index": 0, 675 | "message": { 676 | "role": "assistant", 677 | "content": response_content 678 | }, 679 | "finish_reason": "stop" 680 | } 681 | ], 682 | "usage": { 683 | "prompt_tokens": len(prompt), 684 | "completion_tokens": len(response_content), 685 | "total_tokens": len(prompt) + len(response_content) 686 | } 687 | } 688 | ``` 689 | 690 | ## Troubleshooting 691 | 692 | ### Common Issues and Solutions 693 | 694 | 1. **Connection Errors** 695 | 696 | If you're experiencing connection errors to the SURF API: 697 | 698 | ``` 699 | Error: Connection refused 700 | ``` 701 | 702 | **Solution**: Ensure the SURF API is running and accessible at the expected URL. Check if the port is correct and not blocked by a firewall. 703 | 704 | 2. **Rate Limiting or Timeout Issues** 705 | 706 | If SearXNG requests are timing out: 707 | 708 | ``` 709 | Error: Timeout when contacting SearXNG 710 | ``` -------------------------------------------------------------------------------- /docs/self_hosting.md: -------------------------------------------------------------------------------- 1 | # Self-Hosting Guide for SURF 2 | 3 | This guide will help you deploy SURF on your own infrastructure, whether you're running it on a personal server, a cloud VPS, or in a containerized environment. 4 | 5 | ## Table of Contents 6 | 7 | - [Requirements](#requirements) 8 | - [Deployment Options](#deployment-options) 9 | - [Docker Deployment](#docker-deployment) 10 | - [Bare Metal Deployment](#bare-metal-deployment) 11 | - [Cloud Platform Deployment](#cloud-platform-deployment) 12 | - [Security Considerations](#security-considerations) 13 | - [Reverse Proxy Configuration](#reverse-proxy-configuration) 14 | - [Environment Variables](#environment-variables) 15 | - [Monitoring](#monitoring) 16 | - [Troubleshooting](#troubleshooting) 17 | 18 | ## Requirements 19 | 20 | Before deploying SURF, ensure your system meets these minimum requirements: 21 | 22 | - **CPU**: 1+ cores (2+ recommended for production) 23 | - **RAM**: 512MB minimum (1GB+ recommended) 24 | - **Disk**: 1GB for application and dependencies 25 | - **Network**: Public internet access for search functionality 26 | - **Operating System**: Linux (recommended), macOS, or Windows with WSL2 27 | 28 | ## Deployment Options 29 | 30 | ### Docker Deployment 31 | 32 | Using Docker is the simplest way to deploy SURF. 
33 | 34 | #### Step 1: Create a Docker Compose File 35 | 36 | Create a file named `docker-compose.yml`: 37 | 38 | ```yaml 39 | version: '3' 40 | 41 | services: 42 | surf: 43 | build: 44 | context: . 45 | dockerfile: Dockerfile 46 | restart: unless-stopped 47 | ports: 48 | - "8000:8000" 49 | environment: 50 | - DEBUG=False 51 | - AUTH_ENABLED=True 52 | - API_KEYS=your-secure-api-key-here 53 | - SEARCH_PROVIDER=duckduckgo 54 | - DUCKDUCKGO_MAX_RESULTS=10 55 | volumes: 56 | - ./data:/app/data 57 | ``` 58 | 59 | #### Step 2: Create a Dockerfile 60 | 61 | Create a file named `Dockerfile`: 62 | 63 | ```dockerfile 64 | FROM python:3.12-slim 65 | 66 | WORKDIR /app 67 | 68 | # Install dependencies 69 | RUN pip install --no-cache-dir fastapi uvicorn aiohttp python-dotenv beautifulsoup4 markdown 70 | 71 | # Copy application files 72 | COPY . /app/ 73 | 74 | # Expose the port 75 | EXPOSE 8000 76 | 77 | # Run the application 78 | CMD ["python", "run.py"] 79 | ``` 80 | 81 | #### Step 3: Build and Start the Container 82 | 83 | ```bash 84 | docker-compose up -d 85 | ``` 86 | 87 | Your SURF API will be available at `http://localhost:8000`. 88 | 89 | ### Bare Metal Deployment 90 | 91 | For direct installation on a server without containers: 92 | 93 | #### Step 1: Ensure Python 3.12+ is Installed 94 | 95 | ```bash 96 | python3 --version 97 | # Should be 3.12 or higher 98 | ``` 99 | 100 | If not, install Python 3.12+: 101 | 102 | ```bash 103 | # For Ubuntu/Debian 104 | sudo apt update 105 | sudo apt install software-properties-common 106 | sudo add-apt-repository ppa:deadsnakes/ppa 107 | sudo apt install python3.12 python3.12-venv python3.12-dev 108 | ``` 109 | 110 | #### Step 2: Clone the Repository 111 | 112 | ```bash 113 | git clone https://github.com/44za12/surf.git 114 | cd surf 115 | ``` 116 | 117 | #### Step 3: Set Up a Virtual Environment 118 | 119 | ```bash 120 | python3.12 -m venv venv 121 | source venv/bin/activate # On Windows: venv\Scripts\activate 122 | ``` 123 | 124 | #### Step 4: Install Dependencies 125 | 126 | ```bash 127 | pip install fastapi uvicorn aiohttp python-dotenv beautifulsoup4 markdown 128 | ``` 129 | 130 | #### Step 5: Configure Environment Variables 131 | 132 | ```bash 133 | cp .env.example .env 134 | ``` 135 | 136 | Edit the `.env` file with your preferred settings. 137 | 138 | #### Step 6: Run as a Service with Systemd (Linux) 139 | 140 | Create a systemd service file: 141 | 142 | ```bash 143 | sudo nano /etc/systemd/system/surf.service 144 | ``` 145 | 146 | Add the following content (adjust paths as needed): 147 | 148 | ``` 149 | [Unit] 150 | Description=SURF API 151 | After=network.target 152 | 153 | [Service] 154 | User=yourusername 155 | WorkingDirectory=/path/to/surf 156 | ExecStart=/path/to/surf/venv/bin/python run.py 157 | Restart=on-failure 158 | RestartSec=5 159 | Environment=PATH=/path/to/surf/venv/bin:/usr/bin:/bin 160 | 161 | [Install] 162 | WantedBy=multi-user.target 163 | ``` 164 | 165 | Enable and start the service: 166 | 167 | ```bash 168 | sudo systemctl enable surf 169 | sudo systemctl start surf 170 | sudo systemctl status surf 171 | ``` 172 | 173 | ### Cloud Platform Deployment 174 | 175 | #### Digital Ocean, AWS Lightsail, or similar VPS 176 | 177 | 1. Create a VPS with at least 1GB RAM running Ubuntu 22.04+ 178 | 2. Follow the Bare Metal Deployment steps above 179 | 3. Configure firewall to allow traffic on port 8000 (or your chosen port) 180 | 181 | #### Heroku Deployment 182 | 183 | 1. 
Create a `Procfile` in your project root: 184 | ``` 185 | web: python run.py 186 | ``` 187 | 188 | 2. Add a `runtime.txt` file: 189 | ``` 190 | python-3.12.0 191 | ``` 192 | 193 | 3. Deploy using the Heroku CLI: 194 | ```bash 195 | heroku create your-surf-instance 196 | git push heroku main 197 | ``` 198 | 199 | #### Railway, Render, or similar PaaS 200 | 201 | 1. Connect your GitHub repository 202 | 2. Set environment variables in the platform dashboard 203 | 3. Set the start command to `python run.py` 204 | 205 | ## Security Considerations 206 | 207 | ### API Keys 208 | 209 | Always use strong, unique API keys in production: 210 | 211 | ``` 212 | API_KEYS=your-long-random-string-here,another-key-for-different-user 213 | ``` 214 | 215 | You can generate secure keys with: 216 | 217 | ```bash 218 | openssl rand -base64 32 219 | ``` 220 | 221 | ### HTTPS 222 | 223 | In production, always use HTTPS. Set up a reverse proxy like Nginx or Caddy with Let's Encrypt. 224 | 225 | ### Rate Limiting 226 | 227 | Consider implementing rate limiting at the reverse proxy level to prevent abuse. 228 | 229 | ## Reverse Proxy Configuration 230 | 231 | ### Nginx 232 | 233 | Create a configuration file `/etc/nginx/sites-available/surf`: 234 | 235 | ```nginx 236 | server { 237 | listen 80; 238 | server_name api.yourdomain.com; 239 | 240 | location / { 241 | proxy_pass http://localhost:8000; 242 | proxy_http_version 1.1; 243 | proxy_set_header Upgrade $http_upgrade; 244 | proxy_set_header Connection 'upgrade'; 245 | proxy_set_header Host $host; 246 | proxy_cache_bypass $http_upgrade; 247 | } 248 | } 249 | ``` 250 | 251 | Enable the site and set up HTTPS with Certbot: 252 | 253 | ```bash 254 | sudo ln -s /etc/nginx/sites-available/surf /etc/nginx/sites-enabled/ 255 | sudo nginx -t 256 | sudo systemctl restart nginx 257 | sudo certbot --nginx -d api.yourdomain.com 258 | ``` 259 | 260 | ### Caddy (Simpler alternative with automatic HTTPS) 261 | 262 | Create a `Caddyfile`: 263 | 264 | ``` 265 | api.yourdomain.com { 266 | reverse_proxy localhost:8000 267 | } 268 | ``` 269 | 270 | ## Environment Variables 271 | 272 | Here's a summary of important environment variables for production: 273 | 274 | ``` 275 | # Security 276 | AUTH_ENABLED=True 277 | API_KEYS=your-secure-api-key-1,your-secure-api-key-2 278 | DEBUG=False 279 | 280 | # Search Provider 281 | SEARCH_PROVIDER=duckduckgo 282 | 283 | # Performance 284 | PORT=8000 285 | 286 | # If using SearXNG 287 | SEARXNG_INSTANCE_URL=https://your-private-searxng-instance.com 288 | SEARXNG_TIMEOUT=15 289 | 290 | # If using Brave Search 291 | BRAVE_API_KEY=your-brave-api-key 292 | ``` 293 | 294 | ## Monitoring 295 | 296 | ### Basic Monitoring with Uptime Checks 297 | 298 | Use a service like UptimeRobot, StatusCake, or Pingdom to monitor your API endpoint. 299 | 300 | ### Advanced Monitoring 301 | 302 | For production deployments, consider setting up: 303 | 304 | 1. Prometheus for metrics collection 305 | 2. Grafana for visualization 306 | 3. AlertManager for notifications 307 | 308 | ## Troubleshooting 309 | 310 | ### Common Issues 311 | 312 | 1. **API returns 500 errors**: 313 | - Check the application logs: `sudo journalctl -u surf` 314 | - Ensure all dependencies are installed 315 | 316 | 2. **Search fails but the API is running**: 317 | - Check internet connectivity from your server 318 | - Verify your search provider configuration 319 | 320 | 3. **High memory usage**: 321 | - Adjust the `MAX_CONTENT_SIZE` in `app/utils/web_fetcher.py` to a lower value 322 | 323 | 4. 
**Slow response times**: 324 | - Increase the number of workers in uvicorn by modifying the run.py file 325 | - Add `--workers 4` to the uvicorn command for multi-core systems 326 | 327 | ### Getting Help 328 | 329 | If you encounter issues not covered here: 330 | 331 | 1. Check the [GitHub Issues](https://github.com/44za12/surf/issues) for similar problems 332 | 2. Start a new discussion in the repository 333 | 3. Join our community chat for real-time assistance -------------------------------------------------------------------------------- /docs/use_cases.md: -------------------------------------------------------------------------------- 1 | # SURF API: Use Cases and Applications 2 | 3 | SURF API bridges the gap between LLMs and the web, enabling a new class of applications. This document outlines key use cases, benefits, and real-world applications. 4 | 5 | ## Key Benefits 6 | 7 | ### For Developers 8 | 9 | 1. **Simplified Web Integration** 10 | - Eliminate complex web scraping code 11 | - Standardized content processing across different sites 12 | - No need to handle different HTML structures manually 13 | 14 | 2. **Enhanced LLM Performance** 15 | - Provide clean, structured data to your LLMs 16 | - Reduce token usage by removing unnecessary content 17 | - Better formatting preserves semantic structure 18 | 19 | 3. **Privacy and Control** 20 | - Self-hosted solution keeps user queries private 21 | - Control over what content is accessed and how 22 | - No dependency on third-party web access APIs 23 | 24 | 4. **Flexibility** 25 | - Multiple output formats (JSON, markdown) 26 | - Configurable content processing options 27 | - Easily integrates with any LLM framework 28 | 29 | ### For End Users 30 | 31 | 1. **Up-to-date Information** 32 | - Access to current information beyond the LLM's training data 33 | - Real-time web search capabilities 34 | - Access to the latest articles, documentation, and content 35 | 36 | 2. **Better Answers** 37 | - Responses based on factual, current web content 38 | - Citations and sources for information 39 | - Reduced hallucinations from LLMs 40 | 41 | 3. **Tool-like Capabilities** 42 | - Web research assistant capabilities 43 | - Information gathering from multiple sources 44 | - Content summarization and analysis 45 | 46 | ## Use Cases 47 | 48 | ### 1. Research Assistant Applications 49 | 50 | **Scenario**: A user needs to research a complex topic with many facets. 51 | 52 | **Implementation**: 53 | 1. User submits a research query 54 | 2. Application uses SURF to search for relevant sources 55 | 3. For each promising source, the content is fetched and processed 56 | 4. The LLM analyzes, synthesizes, and summarizes the information 57 | 5. The user receives a comprehensive research report with citations 58 | 59 | **Benefits**: 60 | - Access to up-to-date information 61 | - Multiple source validation 62 | - Proper citations and attribution 63 | - Structured, comprehensive results 64 | 65 | ### 2. Knowledge Base Enhancement 66 | 67 | **Scenario**: An enterprise has internal documentation but needs to supplement it with external information. 68 | 69 | **Implementation**: 70 | 1. When user queries the knowledge base 71 | 2. System first searches internal sources 72 | 3. If information is insufficient, SURF searches the web 73 | 4. External information is blended with internal knowledge 74 | 5. 
Response clearly differentiates between internal and external knowledge 75 | 76 | **Benefits**: 77 | - Extends internal knowledge bases 78 | - Keeps information current 79 | - Clear source attribution 80 | - Consistent formatting of internal and external information 81 | 82 | ### 3. Technical Documentation Assistant 83 | 84 | **Scenario**: Developers need help understanding and implementing technical solutions. 85 | 86 | **Implementation**: 87 | 1. Developer asks a coding or technical question 88 | 2. System searches for relevant documentation and tutorials 89 | 3. SURF fetches and processes the content, preserving code blocks and tables 90 | 4. LLM synthesizes a solution based on multiple documentation sources 91 | 5. Developer receives contextual, accurate guidance 92 | 93 | **Benefits**: 94 | - Code examples are properly formatted 95 | - Technical tables are preserved 96 | - Solutions based on current documentation 97 | - Multiple sources for better answers 98 | 99 | ### 4. News Analysis and Summarization 100 | 101 | **Scenario**: Users want to stay informed about a topic with analysis of recent developments. 102 | 103 | **Implementation**: 104 | 1. User requests news on a specific topic 105 | 2. SURF searches for recent news articles 106 | 3. Content from multiple sources is fetched and processed 107 | 4. LLM analyzes, compares, and summarizes the perspectives 108 | 5. User receives a balanced overview with links to original sources 109 | 110 | **Benefits**: 111 | - Multiple source perspective 112 | - Up-to-date information 113 | - Reduced bias through multi-source analysis 114 | - Original sources available for deeper reading 115 | 116 | ### 5. Fact-Checking and Verification 117 | 118 | **Scenario**: Users want to verify claims or statements. 119 | 120 | **Implementation**: 121 | 1. User submits a claim to verify 122 | 2. SURF searches for relevant information 123 | 3. Multiple sources are fetched and processed 124 | 4. LLM analyzes the consistency and credibility of information 125 | 5. User receives a verification result with supporting evidence 126 | 127 | **Benefits**: 128 | - Multiple source verification 129 | - Access to current information 130 | - Clear presentation of evidence 131 | - Reduced LLM hallucination 132 | 133 | ## Real-World Applications 134 | 135 | ### Educational Tools 136 | 137 | **Example**: A study assistant that helps students research topics, understand concepts, and find supplementary resources. 138 | 139 | **How SURF Helps**: 140 | - Fetches and processes educational content from various sources 141 | - Preserves mathematical formulas and scientific notation 142 | - Structures information in a learning-friendly format 143 | - Provides current, accurate information for projects and assignments 144 | 145 | ### Business Intelligence 146 | 147 | **Example**: A market research tool that gathers and analyzes information about competitors, trends, and industry developments. 148 | 149 | **How SURF Helps**: 150 | - Searches for current market information 151 | - Processes business news, reports, and analyses 152 | - Extracts structured data from different sources 153 | - Enables continuous monitoring of market changes 154 | 155 | ### Healthcare Information Systems 156 | 157 | **Example**: A clinical information assistant that helps healthcare professionals stay updated on research and treatment guidelines. 
158 | 159 | **How SURF Helps**: 160 | - Searches medical journals and trusted health sources 161 | - Preserves critical data tables and research findings 162 | - Extracts structured information from clinical guidelines 163 | - Provides current information beyond the LLM's training cutoff 164 | 165 | ### Legal Research 166 | 167 | **Example**: A legal research assistant that helps lawyers find relevant cases, statutes, and legal analyses. 168 | 169 | **How SURF Helps**: 170 | - Searches legal databases and resources 171 | - Preserves citation formats and legal terminology 172 | - Structures complex legal documents for easier analysis 173 | - Provides up-to-date legal information and precedents 174 | 175 | ### Content Creation Support 176 | 177 | **Example**: A content creation assistant that helps writers research topics, find statistics, and verify information. 178 | 179 | **How SURF Helps**: 180 | - Gathers information from multiple sources 181 | - Extracts statistics, quotes, and key facts 182 | - Provides proper attribution for content 183 | - Ensures factual accuracy in created content 184 | 185 | ## Integration Strategies 186 | 187 | ### 1. Tool-Based Approach 188 | 189 | Implement SURF as a tool that your LLM can call when it needs information: 190 | 191 | ```python 192 | def answer_with_web_info(question): 193 | # First try to answer with local knowledge 194 | initial_answer = llm.generate(f"Answer this question: {question}") 195 | 196 | # Check if confidence is low or needs verification 197 | if "I don't know" in initial_answer or "I'm not sure" in initial_answer: 198 | # Search for information 199 | search_results = requests.get( 200 | "http://localhost:8000/search", 201 | params={"q": question, "format": "json", "max_results": 3} 202 | ).json() 203 | 204 | # Get content from top results 205 | sources = [] 206 | for result in search_results["results"][:2]: 207 | content = requests.get( 208 | f"http://localhost:8000/read/{result['url']}", 209 | params={"format": "json"} 210 | ).json() 211 | sources.append(content) 212 | 213 | # Create prompt with sources 214 | source_text = "\n\n".join([ 215 | f"SOURCE: {source['title']}\n{source['content']}" 216 | for source in sources 217 | ]) 218 | 219 | prompt = f""" 220 | Question: {question} 221 | 222 | Please answer the question based on these sources: 223 | 224 | {source_text} 225 | 226 | Answer: 227 | """ 228 | 229 | # Generate answer with sources 230 | return llm.generate(prompt) 231 | 232 | return initial_answer 233 | ``` 234 | 235 | ### 2. 
RAG Architecture 236 | 237 | Implement a Retrieval-Augmented Generation system using SURF as the retriever: 238 | 239 | ```python 240 | class SURFRetriever: 241 | def __init__(self, search_results_count=3, content_results_count=2): 242 | self.search_results_count = search_results_count 243 | self.content_results_count = content_results_count 244 | 245 | async def retrieve(self, query): 246 | # Search the web 247 | search_results = requests.get( 248 | "http://localhost:8000/search", 249 | params={"q": query, "format": "json", "max_results": self.search_results_count} 250 | ).json() 251 | 252 | # Get content from top results 253 | documents = [] 254 | for result in search_results["results"][:self.content_results_count]: 255 | try: 256 | content = requests.get( 257 | f"http://localhost:8000/read/{result['url']}", 258 | params={"format": "json"} 259 | ).json() 260 | 261 | documents.append({ 262 | "title": content["title"], 263 | "content": content["content"], 264 | "url": content["url"] 265 | }) 266 | except Exception as e: 267 | print(f"Error retrieving {result['url']}: {e}") 268 | 269 | return documents 270 | 271 | # Usage in RAG system 272 | retriever = SURFRetriever() 273 | documents = await retriever.retrieve("latest developments in quantum computing") 274 | 275 | # Generate with retrieved context 276 | context = "\n\n".join([f"SOURCE: {doc['title']} ({doc['url']})\n{doc['content']}" for doc in documents]) 277 | response = llm.generate(f"Context:\n{context}\n\nBased on the above information, what are the latest developments in quantum computing?") 278 | ``` 279 | 280 | ### 3. Hybrid Approach 281 | 282 | Combine local knowledge with web information: 283 | 284 | ```python 285 | def hybrid_answer(question): 286 | # 1. Try to answer with local knowledge 287 | local_answer = llm.generate( 288 | f"Question: {question}\nAnswer using only your built-in knowledge. If you're unsure, say 'NEED_WEB_SEARCH'." 289 | ) 290 | 291 | # 2. If local knowledge is insufficient, use web search 292 | if "NEED_WEB_SEARCH" in local_answer: 293 | # Use SURF to get web information 294 | web_info = get_web_information(question) 295 | 296 | # Generate answer with web info 297 | web_answer = llm.generate( 298 | f"Question: {question}\nInformation from the web: {web_info}\nAnswer based on this information:" 299 | ) 300 | 301 | # Return with attribution 302 | return f"{web_answer}\n\nThis answer is based on current web information." 303 | 304 | return local_answer 305 | ``` 306 | 307 | ### 4. Model Context Protocol (MCP) Integration 308 | 309 | [Model Context Protocol (MCP)](https://modelcontextprotocol.io) is an open standard that enables AI assistants to interact with external data sources and tools in a standardized way. SURF makes implementing MCP servers remarkably easy, allowing your AI applications to access web information through a standardized interface. 
310 | 311 | **Example**: Creating a SURF-based MCP server for web search and content retrieval: 312 | 313 | ```python 314 | from mcp import MCPServer, ServerSchema, RequestContext 315 | import requests 316 | 317 | # Define SURF MCP server with minimal boilerplate 318 | schema = ServerSchema( 319 | name="surf-web-access", 320 | description="Web search and content reading via SURF", 321 | capabilities=[ 322 | { 323 | "name": "search_web", 324 | "description": "Search the web for information", 325 | "parameters": { 326 | "type": "object", 327 | "properties": { 328 | "query": {"type": "string", "description": "Search query"} 329 | }, 330 | "required": ["query"] 331 | } 332 | }, 333 | { 334 | "name": "read_webpage", 335 | "description": "Read content from a webpage", 336 | "parameters": { 337 | "type": "object", 338 | "properties": { 339 | "url": {"type": "string", "description": "URL to read"} 340 | }, 341 | "required": ["url"] 342 | } 343 | } 344 | ] 345 | ) 346 | 347 | # Simple handler that directly leverages SURF API 348 | async def search_web(context: RequestContext, params: dict): 349 | response = requests.get( 350 | "http://localhost:8000/search", 351 | params={"q": params["query"], "format": "json"} 352 | ) 353 | return response.json() 354 | 355 | async def read_webpage(context: RequestContext, params: dict): 356 | response = requests.get( 357 | f"http://localhost:8000/read/{params['url']}", 358 | params={"format": "json"} 359 | ) 360 | return response.json() 361 | 362 | # Create and start MCP server 363 | server = MCPServer(schema=schema) 364 | server.register_capability("search_web", search_web) 365 | server.register_capability("read_webpage", read_webpage) 366 | server.start() 367 | ``` 368 | 369 | **Benefits of SURF-based MCP Servers**: 370 | 371 | 1. **Simplified Implementation**: SURF handles the complex web processing, allowing your MCP server to be clean and focused 372 | 2. **Standardized Integration**: Any MCP-compatible AI assistant (like Claude) can use your server 373 | 3. **Enhanced AI Capabilities**: Give AI models access to up-to-date web information and content 374 | 4. **Rapid Development**: Create powerful MCPs with minimal code by leveraging SURF's robust web processing 375 | 376 | **Real-world Applications**: 377 | 378 | - **Knowledge Management Systems**: Create an MCP that searches both internal documentation and the web 379 | - **Research Assistants**: Enable AI to gather information from multiple sources through a unified MCP interface 380 | - **Customer Support**: Let AI representatives look up product information, policies, and external references 381 | - **Content Creation**: Provide AI with the ability to research topics and gather accurate information 382 | 383 | By leveraging SURF to create MCP servers, developers can quickly enable AI systems to interact with the web in a standardized, secure way, without needing to implement complex web crawling or content processing logic. 384 | 385 | ## Best Practices 386 | 387 | 1. **Clear Source Attribution** 388 | - Always provide sources for information retrieved from the web 389 | - Include URLs and titles when presenting information to users 390 | 391 | 2. **Multiple Source Verification** 392 | - Use multiple sources to verify information 393 | - Compare information across different sources for accuracy 394 | 395 | 3. **Content Freshness Awareness** 396 | - Check publication dates when available 397 | - Prioritize recent sources for time-sensitive topics 398 | 399 | 4. 
**Error Handling** 400 | - Implement robust error handling for network issues 401 | - Have fallback strategies when web search fails 402 | 403 | 5. **User Transparency** 404 | - Clearly indicate when information comes from the web 405 | - Distinguish between the LLM's knowledge and web-retrieved information -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | fastapi==0.105.0 2 | uvicorn==0.24.0 3 | aiohttp==3.9.1 4 | python-dotenv==1.0.0 5 | beautifulsoup4==4.12.2 6 | markdown==3.5.2 7 | pydantic==2.5.2 8 | certifi==2023.11.17 9 | urllib3==2.1.0 10 | lxml==4.9.3 -------------------------------------------------------------------------------- /run.py: -------------------------------------------------------------------------------- 1 | import uvicorn 2 | import argparse 3 | import asyncio 4 | from app.core.config import settings 5 | from app.utils.web_fetcher import WebFetcher 6 | from app.utils.html_parser import HTMLCleaner 7 | 8 | 9 | async def test_html_parser(url="https://en.wikipedia.org/wiki/Markdown"): 10 | """Test the HTML parser with a sample URL.""" 11 | print(f"\n\nTesting HTML Parser with URL: {url}") 12 | print("=" * 80) 13 | 14 | # Fetch the URL 15 | result = await WebFetcher.fetch_url(url) 16 | if not result: 17 | print(f"Failed to fetch URL: {url}") 18 | return 19 | 20 | content, content_type = result 21 | 22 | # Process the HTML 23 | if 'text/html' in content_type: 24 | processed = await HTMLCleaner.process_html(content, url) 25 | 26 | # Print the title 27 | print(f"Title: {processed['title']}") 28 | print(f"URL: {processed['url']}") 29 | print("\nContent Preview (first 1000 characters):") 30 | print("-" * 80) 31 | print(processed['content'][:1000] + "...") 32 | print("-" * 80) 33 | 34 | # Check for tables 35 | if "| ---" in processed['content']: 36 | table_start = processed['content'].find("|") 37 | table_end = processed['content'].find("\n\n", table_start) 38 | if table_end == -1: 39 | table_end = table_start + 500 40 | 41 | print("\nTable Found:") 42 | print("-" * 80) 43 | print(processed['content'][table_start:table_end]) 44 | print("-" * 80) 45 | else: 46 | print("\nNo tables found in the content.") 47 | else: 48 | print(f"URL did not return HTML content: {content_type}") 49 | 50 | 51 | def main(): 52 | """Main entry point for the application.""" 53 | # Parse command line arguments 54 | parser = argparse.ArgumentParser(description="Run the SURF API server") 55 | parser.add_argument("--test", action="store_true", help="Test HTML parser with a URL") 56 | parser.add_argument("--url", type=str, help="URL to test with HTML parser") 57 | args = parser.parse_args() 58 | 59 | if args.test: 60 | # Run the test in an asyncio event loop 61 | asyncio.run(test_html_parser(args.url)) 62 | return 63 | 64 | # Run the FastAPI application directly without asyncio 65 | # Uvicorn manages its own event loop 66 | port = int(settings.port) if hasattr(settings, 'port') else 8000 67 | uvicorn.run( 68 | "app.main:app", 69 | host="0.0.0.0", 70 | port=port, 71 | reload=settings.debug 72 | ) 73 | 74 | 75 | if __name__ == "__main__": 76 | main() -------------------------------------------------------------------------------- /test_duckduckgo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | """Test script for DuckDuckGo search client.""" 3 | 4 | import asyncio 5 | import sys 6 | from app.utils.duckduckgo 
import DuckDuckGoClient 7 | 8 | async def test_duckduckgo_search(query, num_results=5): 9 | """Test DuckDuckGo search with a given query.""" 10 | print(f"Searching DuckDuckGo for: {query}") 11 | client = DuckDuckGoClient() 12 | results = await client.search(query, num_results=num_results) 13 | 14 | if not results: 15 | print("No results found.") 16 | return 17 | 18 | print(f"Found {len(results)} results:") 19 | for i, result in enumerate(results, 1): 20 | print(f"\n{i}. {result.get('title', 'No Title')}") 21 | print(f" URL: {result.get('url', 'No URL')}") 22 | print(f" Snippet: {result.get('snippet', 'No snippet')[:100]}...") 23 | 24 | if __name__ == "__main__": 25 | if len(sys.argv) < 2: 26 | print("Usage: python test_duckduckgo.py <query> [num_results]") 27 | sys.exit(1) 28 | 29 | query = sys.argv[1] 30 | num_results = int(sys.argv[2]) if len(sys.argv) > 2 else 5 31 | 32 | asyncio.run(test_duckduckgo_search(query, num_results)) --------------------------------------------------------------------------------