├── .DS_Store
├── examples
    ├── __init__.py
    ├── .DS_Store
    ├── firecrawl_automated_whitepaper_tracking
    │   ├── tests
    │   │   ├── __init__.py
    │   │   ├── test_webhook.py
    │   │   └── test_semantic_filter.py
    │   ├── .DS_Store
    │   ├── test.png
    │   ├── images
    │   │   └── hf-daily-papers-github-logo.png
    │   ├── __init__.py
    │   ├── .gitignore
    │   ├── .env_example
    │   ├── pyproject.toml
    │   ├── paper-tracker.yml
    │   ├── category_prompt.py
    │   ├── logging_config.py
    │   ├── discord_notifications.py
    │   ├── hf_white_paper_tracker.py
    │   ├── semantic_filter.py
    │   ├── x_post_v3.py
    │   ├── x_post.py
    │   ├── firecrawl_crawl_extract.py
    │   ├── supabase_db.py
    │   ├── README.md
    │   └── x_post_v2.py
    └── .gitignore
├── .github
    ├── .DS_Store
    └── workflows
    │   └── paper-tracker.yml
├── images
    └── firecrawl-quickstarts-github-cover.png
├── .gitignore
├── README.md
├── llm_extract_tutorial.ipynb
├── crawl_and_extract_with_openai_o1.ipynb
├── crawl_and_extract_with_xai_grok.ipynb
└── claude_researcher_with_map.ipynb

/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/.DS_Store
--------------------------------------------------------------------------------
/examples/__init__.py:
--------------------------------------------------------------------------------
1 | __doc__ = """Empty file to mark directory as Python package"""
2 |
--------------------------------------------------------------------------------
/.github/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/.github/.DS_Store
--------------------------------------------------------------------------------
/examples/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/.DS_Store
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/tests/__init__.py:
--------------------------------------------------------------------------------
1 | __doc__ = """Empty file to mark directory as Python package"""
2 |
--------------------------------------------------------------------------------
/images/firecrawl-quickstarts-github-cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/images/firecrawl-quickstarts-github-cover.png
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/firecrawl_automated_whitepaper_tracking/.DS_Store
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/test.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/firecrawl_automated_whitepaper_tracking/test.png
--------------------------------------------------------------------------------
/examples/firecrawl_automated_whitepaper_tracking/images/hf-daily-papers-github-logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/alexfazio/firecrawl-quickstarts/main/examples/firecrawl_automated_whitepaper_tracking/images/hf-daily-papers-github-logo.png -------------------------------------------------------------------------------- /examples/.gitignore: -------------------------------------------------------------------------------- 1 | # Python cache files 2 | __pycache__/ 3 | *.pyc 4 | *.pyo 5 | *.pyd 6 | 7 | # Logs 8 | logs/ 9 | *.log 10 | **/logs/ 11 | **/logs/** 12 | 13 | # macOS system files 14 | .DS_Store 15 | .DS_Store? 16 | ._* 17 | 18 | # IDE 19 | .idea/ 20 | *.iml -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/__init__.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | Empty __init__.py file to ensure this directory is recognized as a package/module, 3 | allowing imports like "from category_prompt import DESIRED_CATEGORY" in discord_notifications.py. 4 | """ -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | .env 3 | 4 | # PyCharm 5 | .idea/ 6 | *.iml 7 | *.iws 8 | *.ipr 9 | *.xml 10 | .idea_modules/ 11 | *.pyc 12 | __pycache__/ 13 | *.pyo 14 | *.pyd 15 | .Python 16 | .python-version 17 | 18 | # GitHub Actions 19 | .github/workflows/.env 20 | 21 | # Temporary files 22 | *.log 23 | *.json 24 | 25 | # Add this near the top, before the *.pyc line 26 | !__init__.py 27 | 28 | # Database migrations 29 | migrate_to_v1.py 30 | migrate_to_v2.py 31 | migrate_to_v3.py 32 | revert_to_v2.py 33 | migrate_make_dates_nullable.py 34 | 35 | # Logs directory - comprehensive exclusion 36 | logs/ 37 | *.log 38 | examples/logs/ 39 | examples/logs/** 40 | **/logs/ 41 | **/logs/** -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/.env_example: -------------------------------------------------------------------------------- 1 | # Firecrawl API credentials 2 | FIRECRAWL_API_KEY=your_firecrawl_api_key 3 | 4 | # Database configuration 5 | POSTGRES_URL=postgresql://user:password@host:port/database 6 | 7 | # Discord webhook for notifications 8 | DISCORD_WEBHOOK_URL=your_discord_webhook_url 9 | 10 | # OpenAI API credentials 11 | OPENAI_API_KEY=your_openai_api_key 12 | 13 | # X (Twitter) API credentials 14 | X_API_KEY=your_x_api_key 15 | X_API_SECRET=your_x_api_secret 16 | X_OAUTH2_CLIENT_ID=your_oauth2_client_id 17 | X_OAUTH2_CLIENT_SECRET=your_oauth2_client_secret 18 | 19 | # The following X OAuth tokens will be auto-generated after local authorization 20 | # X_ACCESS_TOKEN= 21 | # X_REFRESH_TOKEN= 22 | # X_TOKEN_EXPIRES_IN= 23 | # X_TOKEN_SCOPE= -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "firecrawl-automated-whitepaper-tracking" 3 | version = "0.1.0" 4 | description = "" 5 | authors = ["alexfazio "] 6 | license = "MIT" 7 | readme = "README.md" 8 | package-mode = false 9 | 10 | [tool.poetry.dependencies] 11 | python = "3.10.15" 12 | firecrawl-py = "1.6.8" 13 | pydantic = "2.10.4" 14 | psycopg2-binary = "2.9.10" 15 | python-dotenv = "1.0.1" 16 | sqlalchemy = "*" 17 | pandas = "2.2.3" 18 | 
plotly = "5.24.1" 19 | aiohttp = "3.11.11" 20 | altair = "5.5.0" 21 | openai = ">=0.28.0" 22 | pytz = "2024.1" 23 | requests-oauthlib = "^2.0.0" 24 | requests = "^2.32.3" 25 | playwright = "^1.49.1" 26 | 27 | [build-system] 28 | requires = ["poetry-core"] 29 | build-backend = "poetry.core.masonry.api" 30 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/tests/test_webhook.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for testing the Discord webhook.""" 2 | 3 | import os 4 | from dotenv import load_dotenv 5 | import requests 6 | import json 7 | 8 | def test_discord_webhook(): 9 | """Test the Discord webhook configuration by sending a test message.""" 10 | # Load environment variables 11 | load_dotenv() 12 | 13 | webhook_url = os.getenv('DISCORD_WEBHOOK_URL') 14 | if not webhook_url: 15 | raise ValueError("DISCORD_WEBHOOK_URL not found in .env file") 16 | 17 | # Test message 18 | message = { 19 | "content": "🎉 Webhook test successful! Your Discord notifications are working.", 20 | "embeds": [{ 21 | "title": "Test Embed", 22 | "description": "This is a test message to verify the webhook configuration.", 23 | "color": 5814783, # A nice blue color 24 | "fields": [ 25 | { 26 | "name": "Status", 27 | "value": "✅ Connected", 28 | "inline": True 29 | } 30 | ] 31 | }] 32 | } 33 | 34 | try: 35 | response = requests.post( 36 | webhook_url, 37 | data=json.dumps(message), 38 | headers={'Content-Type': 'application/json'}, 39 | timeout=10 # 10 seconds timeout 40 | ) 41 | response.raise_for_status() 42 | print("Test message sent successfully!") 43 | return True 44 | except requests.exceptions.RequestException as e: 45 | print(f"Error sending test message: {e}") 46 | return False 47 | 48 | if __name__ == "__main__": 49 | test_discord_webhook() 50 | -------------------------------------------------------------------------------- /.github/workflows/paper-tracker.yml: -------------------------------------------------------------------------------- 1 | name: Paper Tracker 2 | 3 | on: 4 | schedule: 5 | - cron: '0 */12 * * *' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | track-papers: 10 | runs-on: ubuntu-latest 11 | defaults: 12 | run: 13 | working-directory: examples/firecrawl_automated_whitepaper_tracking 14 | 15 | steps: 16 | - name: Check out repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: '3.10.15' 23 | cache: 'pip' 24 | 25 | - name: Install Poetry 26 | run: | 27 | curl -sSL https://install.python-poetry.org | python3 - 28 | poetry --version 29 | 30 | - name: Install dependencies 31 | run: | 32 | poetry install 33 | 34 | - name: Test Database Connection (Optional) 35 | env: 36 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Must be port 6543 now 37 | run: | 38 | echo "Testing database connection with psql..." 
39 | sudo apt-get update && sudo apt-get install -y postgresql-client 40 | psql "$POSTGRES_URL" -c '\conninfo' 41 | 42 | - name: Run paper tracker 43 | env: 44 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 45 | FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} 46 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Also port 6543 here 47 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 48 | run: | 49 | poetry run python hf_white_paper_tracker.py 50 | 51 | - name: Notify on Failure 52 | if: failure() 53 | env: 54 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 55 | run: | 56 | curl -H "Content-Type: application/json" -X POST \ 57 | -d '{"content":"⚠️ Paper Tracker workflow failed! Please check the GitHub Actions logs."}' \ 58 | $DISCORD_WEBHOOK_URL 59 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/paper-tracker.yml: -------------------------------------------------------------------------------- 1 | name: Paper Tracker 2 | 3 | on: 4 | schedule: 5 | - cron: '0 */12 * * *' 6 | workflow_dispatch: 7 | 8 | jobs: 9 | track-papers: 10 | runs-on: ubuntu-latest 11 | defaults: 12 | run: 13 | working-directory: examples/firecrawl_automated_whitepaper_tracking 14 | 15 | steps: 16 | - name: Check out repository 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: '3.10' 23 | cache: 'pip' 24 | 25 | - name: Install Poetry 26 | run: | 27 | curl -sSL https://install.python-poetry.org | python3 - 28 | poetry --version 29 | 30 | - name: Install dependencies 31 | run: | 32 | poetry install 33 | 34 | - name: Test Database Connection (Optional) 35 | env: 36 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Must be port 6543 now 37 | run: | 38 | echo "Testing database connection with psql..." 39 | sudo apt-get update && sudo apt-get install -y postgresql-client 40 | psql "$POSTGRES_URL" -c '\conninfo' 41 | 42 | - name: Run paper tracker 43 | env: 44 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 45 | FIRECRAWL_API_KEY: ${{ secrets.FIRECRAWL_API_KEY }} 46 | POSTGRES_URL: ${{ secrets.POSTGRES_URL }} # <-- Also port 6543 here 47 | OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }} 48 | run: | 49 | poetry run python hf_white_paper_tracker.py 50 | 51 | - name: Notify on Failure 52 | if: failure() 53 | env: 54 | DISCORD_WEBHOOK_URL: ${{ secrets.DISCORD_WEBHOOK_URL }} 55 | run: | 56 | curl -H "Content-Type: application/json" -X POST \ 57 | -d '{"content":"⚠️ Paper Tracker workflow failed! 
Please check the GitHub Actions logs."}' \ 58 | $DISCORD_WEBHOOK_URL 59 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider 2 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 3 | 4 | # User-specific stuff 5 | .idea/**/workspace.xml 6 | .idea/**/tasks.xml 7 | .idea/**/usage.statistics.xml 8 | .idea/**/dictionaries 9 | .idea/**/shelf 10 | 11 | # AWS User-specific 12 | .idea/**/aws.xml 13 | 14 | # Generated files 15 | .idea/**/contentModel.xml 16 | 17 | # Sensitive or high-churn files 18 | .idea/**/dataSources/ 19 | .idea/**/dataSources.ids 20 | .idea/**/dataSources.local.xml 21 | .idea/**/sqlDataSources.xml 22 | .idea/**/dynamic.xml 23 | .idea/**/uiDesigner.xml 24 | .idea/**/dbnavigator.xml 25 | 26 | # Gradle 27 | .idea/**/gradle.xml 28 | .idea/**/libraries 29 | 30 | # Gradle and Maven with auto-import 31 | # When using Gradle or Maven with auto-import, you should exclude module files, 32 | # since they will be recreated, and may cause churn. Uncomment if using 33 | # auto-import. 34 | # .idea/artifacts 35 | # .idea/compiler.xml 36 | # .idea/jarRepositories.xml 37 | # .idea/modules.xml 38 | # .idea/*.iml 39 | # .idea/modules 40 | # *.iml 41 | # *.ipr 42 | 43 | # CMake 44 | cmake-build-*/ 45 | 46 | # Mongo Explorer plugin 47 | .idea/**/mongoSettings.xml 48 | 49 | # File-based project format 50 | *.iws 51 | 52 | # IntelliJ 53 | out/ 54 | 55 | # mpeltonen/sbt-idea plugin 56 | .idea_modules/ 57 | 58 | # JIRA plugin 59 | atlassian-ide-plugin.xml 60 | 61 | # Cursive Clojure plugin 62 | .idea/replstate.xml 63 | 64 | # SonarLint plugin 65 | .idea/sonarlint/ 66 | 67 | # Crashlytics plugin (for Android Studio and IntelliJ) 68 | com_crashlytics_export_strings.xml 69 | crashlytics.properties 70 | crashlytics-build.properties 71 | fabric.properties 72 | 73 | # Editor-based Rest Client 74 | .idea/httpRequests 75 | 76 | # Android studio 3.1+ serialized cache file 77 | .idea/caches/build_file_checksums.ser 78 | 79 | # Files for environment variables, excluding examples 80 | *.env 81 | !.env.example 82 | 83 | # macOS system files 84 | .DS_Store 85 | .DS_Store? 86 | ._* 87 | 88 | # PyCharm 89 | .idea/ 90 | *.iml 91 | *.iws 92 | *.ipr 93 | *.xml 94 | .idea_modules/ 95 | *.pyc 96 | __pycache__/ 97 | *.pyo 98 | *.pyd 99 | .Python 100 | .python-version 101 | 102 | # VSCode 103 | .vscode/* 104 | !.vscode/extensions.json 105 | !.vscode/launch.json 106 | !.vscode/tasks.json 107 | *.code-workspace 108 | .history/ 109 | .settings/ 110 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/category_prompt.py: -------------------------------------------------------------------------------- 1 | __doc__ = """ 2 | This file defines the DESIRED_CATEGORY as a single, long string 3 | that describes the category of interest. 4 | """ 5 | 6 | DESIRED_CATEGORY = """ 7 | **Definition of “AI Agents”** 8 | An "AI Agent" is any system in which a large language model (LLM): 9 | 1. Maintains Dynamic Control over how tasks are accomplished, including 10 | which tools or APIs are used and in what sequence. 11 | 2. Plans, Reasons, and Adapts its approach based on user goals and 12 | feedback from its environment (e.g., tool outputs, code execution, 13 | external data). 14 | 3. 
Acts Autonomously or Semi-Autonomously in open-ended or complex tasks 15 | that cannot be fully decomposed in advance. 16 | 4. Demonstrates Decision-Making beyond hardcoded or strictly 17 | human-defined workflow paths, such as deciding what to do next at 18 | each step (versus executing a single, fixed script). 19 | 20 | **Core Criterion for Classification** 21 | A white paper belongs to the “AI Agents” category if its primary focus 22 | describes, evaluates, measures, or demonstrates LLM-based systems that 23 | exhibit or aim to exhibit one or more of the above qualities. This includes 24 | systems that: 25 | • Show Partial, Incremental, or Full Autonomy in real-world or 26 | simulated tasks. 27 | • Employ LLMs to Dynamically Decide how to use tools (e.g., web 28 | browsing, code writing, system commands). 29 | • Investigate, Benchmark, or Compare the performance of such agentic 30 | systems, even if only a subset of tasks is completed autonomously. 31 | • Provide Frameworks for Building or Testing agentic capabilities in 32 | LLMs (e.g., multi-step planning, chain-of-thought reasoning, 33 | environment/tool usage). 34 | 35 | **Clarifications to Prevent Underclassification** 36 | • Partial Autonomy Counts: Papers need not demonstrate 100% autonomous 37 | task completion. Even if an LLM handles only a fraction of tasks 38 | without human intervention, it can still qualify if the system’s 39 | goal or design involves adaptive or autonomous capabilities. 40 | • Research or Benchmarking is Included: Papers that focus on measuring, 41 | experimenting with, or benchmarking LLM agents should be classified 42 | as “AI Agents” if they revolve around agentic behavior, even if the 43 | research finds current systems are limited or only partially 44 | successful. 45 | • Use of Tools or Environment: If the paper describes LLMs selecting 46 | and executing code, commands, or API calls at their own discretion 47 | (i.e., not merely a single-step prompt for code generation), it 48 | likely falls under agentic systems. 49 | • Evaluation of Agent Performance: Studies that assess the 50 | effectiveness, reliability, or scalability of AI agents in performing 51 | tasks should be included if they address the agent’s ability to 52 | autonomously manage and execute tasks. 53 | • Integration with External Systems: Papers that explore how AI agents 54 | interact with external systems, databases, or APIs to accomplish 55 | tasks should be considered relevant. 56 | 57 | **Exclusion Criterion** 58 | A paper should not be classified under “AI Agents” if it only: 59 | • Discusses Static or Single-Step LLM Prompts that generate answers, 60 | translations, or content without autonomy or iterative 61 | decision-making. 62 | • Describes Purely Human-Orchestrated Pipelines where the LLM’s role is 63 | strictly predefined at each step (no dynamic path-finding, tool 64 | selection, or open-ended planning). 65 | • Focuses on General LLM Usage (e.g., chatbots, Q&A systems) without 66 | discussing autonomy, adaptive behavior, or iterative tool usage. 
67 | 68 | **Likely Categories for Agentic Systems Papers** 69 | Based on Anthropic’s blog post, these arXiv categories are the most likely 70 | homes for papers on agentic LLM systems: 71 | • Multiagent Systems (cs.MA) – Most directly relevant 72 | • Artificial Intelligence (cs.AI) 73 | • Computation and Language (cs.CL) 74 | • Machine Learning (cs.LG) 75 | • Human-Computer Interaction (cs.HC) 76 | • Software Engineering (cs.SE) 77 | """ -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/logging_config.py: -------------------------------------------------------------------------------- 1 | """Module for configuring logging across the application.""" 2 | 3 | import logging 4 | from logging.handlers import RotatingFileHandler 5 | from datetime import datetime 6 | from functools import wraps # Needed for the decorator 7 | from pathlib import Path 8 | 9 | def setup_base_logging( 10 | logger_name: str, 11 | log_file: str = None, 12 | log_level: int = logging.INFO, 13 | format_string: str = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 14 | ) -> logging.Logger: 15 | """Configure base logging with both file and console handlers. 16 | 17 | Args: 18 | logger_name (str): Name of the logger to configure 19 | log_file (str, optional): Path to log file. If None, only console logging is used 20 | log_level (int, optional): Logging level. Defaults to INFO 21 | format_string (str, optional): Format string for log messages 22 | 23 | Returns: 24 | logging.Logger: Configured logger instance 25 | """ 26 | # Create logs directory relative to the examples directory 27 | logs_dir = Path(__file__).parent.parent / 'logs' 28 | logs_dir.mkdir(parents=True, exist_ok=True) 29 | 30 | logger = logging.getLogger(logger_name) 31 | logger.setLevel(log_level) 32 | 33 | # Clear any existing handlers 34 | if logger.hasHandlers(): 35 | logger.handlers.clear() 36 | 37 | # Create formatter 38 | formatter = logging.Formatter(format_string) 39 | 40 | # Create and configure console handler 41 | console_handler = logging.StreamHandler() 42 | console_handler.setLevel(log_level) 43 | console_handler.setFormatter(formatter) 44 | logger.addHandler(console_handler) 45 | 46 | # Add file handler if log_file is specified 47 | if log_file: 48 | log_path = logs_dir / log_file 49 | file_handler = RotatingFileHandler( 50 | log_path, 51 | maxBytes=5*1024*1024, # 5MB 52 | backupCount=5 53 | ) 54 | file_handler.setLevel(log_level) 55 | file_handler.setFormatter(formatter) 56 | logger.addHandler(file_handler) 57 | 58 | return logger 59 | 60 | def setup_crawler_logging() -> logging.Logger: 61 | """ 62 | Configure logging for the crawler with file output in the specified logs directory. 
63 | 64 | Returns: 65 | logging.Logger: Configured logger instance 66 | """ 67 | timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") 68 | return setup_base_logging( 69 | logger_name="hf_paper_tracker", 70 | log_file=f"paper_tracker_{timestamp}.log" 71 | ) 72 | 73 | def setup_semantic_filter_logging() -> logging.Logger: 74 | """Configure logging specifically for the semantic filter module.""" 75 | timestamp = datetime.now().strftime("%Y%m%d") 76 | return setup_base_logging( 77 | logger_name='semantic_filter', 78 | log_file=f'semantic_filter_{timestamp}.log', 79 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 80 | ) 81 | 82 | def setup_database_logging() -> logging.Logger: 83 | """Configure logging specifically for the database module.""" 84 | return setup_base_logging( 85 | logger_name='database', 86 | log_file='database.log' 87 | ) 88 | 89 | def log_function_call(func): 90 | """Decorator to log entry and exit of functions.""" 91 | @wraps(func) 92 | def wrapper(*args, **kwargs): 93 | logger = logging.getLogger('semantic_filter') # or whichever logger name you prefer 94 | logger.info("Entering %s", func.__name__) 95 | try: 96 | result = func(*args, **kwargs) 97 | logger.info("Exiting %s successfully", func.__name__) 98 | return result 99 | except Exception as e: 100 | logger.error("Error in %s: %s", func.__name__, str(e)) 101 | raise 102 | return wrapper 103 | 104 | # TODO: change logging style to display logging level before module name 105 | # e.g. 2024-12-24 12:05:18,885 - INFO - hf_paper_tracker - Extracting paper details from 106 | # instead of 2024-12-24 12:05:18,885 - hf_paper_tracker - INFO - Extracting paper details from 107 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/discord_notifications.py: -------------------------------------------------------------------------------- 1 | """Module for sending notifications about new research papers to Discord.""" 2 | 3 | import os 4 | import asyncio 5 | import aiohttp 6 | from dotenv import load_dotenv 7 | from logging_config import setup_base_logging, log_function_call 8 | 9 | # Configure logging using the centralized configuration 10 | logger = setup_base_logging( 11 | logger_name="discord_notifier", 12 | log_file="discord_notifications.log", 13 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 14 | ) 15 | 16 | load_dotenv() 17 | 18 | @log_function_call 19 | async def send_paper_notification( 20 | paper_title: str, 21 | authors: list, 22 | abstract: str, 23 | upvotes: int, 24 | comments: int, 25 | url: str, 26 | pdf_url: str = None, 27 | arxiv_url: str = None, 28 | github_url: str = None 29 | ): 30 | """Send a new paper notification to Discord""" 31 | logger.info(f"Preparing notification for paper: {paper_title}") 32 | 33 | # Create links section 34 | links = [] 35 | if pdf_url: 36 | links.append(f"[📄 PDF]({pdf_url})") 37 | logger.debug("Added PDF link to notification") 38 | if arxiv_url: 39 | links.append(f"[📝 arXiv]({arxiv_url})") 40 | logger.debug("Added arXiv link to notification") 41 | if github_url: 42 | links.append(f"[💻 GitHub]({github_url})") 43 | logger.debug("Added GitHub link to notification") 44 | 45 | # Truncate abstract if needed 46 | truncated_abstract = abstract[:500] + ('...' 
if len(abstract) > 500 else '') 47 | logger.debug(f"Abstract truncated from {len(abstract)} to {len(truncated_abstract)} chars") 48 | 49 | message = { 50 | "embeds": [ 51 | { 52 | "title": "📚 New Paper Published!", 53 | "description": f"**{paper_title}**\n\n" 54 | f"**Authors:** {', '.join(authors)}\n\n" 55 | f"**Abstract:**\n{truncated_abstract}\n\n" 56 | f"**Stats:** ⬆️ {upvotes} | 💬 {comments}\n\n" 57 | f"**Links:**\n{' • '.join(links)}\n\n" 58 | f"[View on HuggingFace]({url})", 59 | "color": 5814783, # HF's purple color 60 | } 61 | ] 62 | } 63 | 64 | webhook_url = os.getenv("DISCORD_WEBHOOK_URL") 65 | if not webhook_url: 66 | logger.error("Discord webhook URL not found in environment variables") 67 | return 68 | 69 | try: 70 | logger.info("Sending notification to Discord webhook") 71 | async with aiohttp.ClientSession() as session: 72 | async with session.post(webhook_url, json=message) as response: 73 | if response.status == 204: # Discord returns 204 on success 74 | logger.info("Successfully sent Discord notification") 75 | else: 76 | response_text = await response.text() 77 | logger.error(f"Discord API returned status {response.status}: {response_text}") 78 | 79 | except aiohttp.ClientError as e: 80 | logger.error(f"Network error sending Discord notification: {str(e)}", exc_info=True) 81 | except Exception as e: 82 | logger.error(f"Unexpected error sending Discord notification: {str(e)}", exc_info=True) 83 | 84 | if __name__ == "__main__": 85 | logger.info("Starting Discord notification test") 86 | try: 87 | # Test notification 88 | asyncio.run( 89 | send_paper_notification( 90 | paper_title="Test Paper Title", 91 | authors=["Author 1", "Author 2"], 92 | abstract="This is a test abstract for the paper notification system.", 93 | upvotes=10, 94 | comments=5, 95 | url="https://huggingface.co/papers/test", 96 | pdf_url="https://example.com/test.pdf", 97 | arxiv_url="https://arxiv.org/abs/test", 98 | github_url="https://github.com/test/repo" 99 | ) 100 | ) 101 | logger.info("Test notification completed") 102 | except Exception as e: 103 | logger.error("Test notification failed:", exc_info=True) 104 | 105 | # TODO: implement discord button for feedback about relevancy of notifications, 106 | # this must be fed back into the database for subsequent refinement of prompts 107 | 108 | # TODO: implement admin-only error notifications in Discord - errors should only be 109 | # visible to channel administrators to avoid cluttering the main feed 110 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/hf_white_paper_tracker.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | Entry point module for the Hugging Face White Paper Tracker. 4 | Handles command line arguments and initiates the paper tracking process. 
5 | """ 6 | 7 | import os 8 | import sys 9 | import argparse 10 | import asyncio 11 | import requests 12 | from typing import Optional 13 | from sqlalchemy.exc import SQLAlchemyError 14 | 15 | # Add project root to Python path 16 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")) 17 | sys.path.insert(0, project_root) 18 | 19 | # Now we can import our modules 20 | from examples.firecrawl_automated_whitepaper_tracking.firecrawl_crawl_extract import ( 21 | extract_paper_urls, 22 | process_paper_batch, 23 | get_todays_papers_url 24 | ) 25 | from examples.firecrawl_automated_whitepaper_tracking.supabase_db import Database 26 | from examples.firecrawl_automated_whitepaper_tracking.logging_config import setup_crawler_logging 27 | 28 | # Initialize logger 29 | logger = setup_crawler_logging() 30 | 31 | def verify_database_connection(db: Database) -> tuple[bool, str]: 32 | """Test database connection and return status.""" 33 | logger.debug("Verifying database connection...") 34 | try: 35 | db.get_all_papers() 36 | return True, "Database connection successful" 37 | except SQLAlchemyError as e: 38 | logger.error("Database connection failed: %s", str(e)) 39 | return False, f"Database connection failed: {str(e)}" 40 | 41 | def verify_database_version(db: Database) -> tuple[bool, str]: 42 | """Verify database schema version matches required version.""" 43 | logger.debug("Verifying database schema version...") 44 | try: 45 | session = db.session_factory() 46 | db._check_schema_version() 47 | session.close() 48 | return True, f"Database schema version verified" 49 | except RuntimeError as e: 50 | logger.error("Database version check failed: %s", str(e)) 51 | return False, str(e) 52 | 53 | def perform_startup_checks(db: Database) -> None: 54 | """Perform all startup checks before proceeding.""" 55 | # Database connection check 56 | connection_ok, connection_msg = verify_database_connection(db) 57 | logger.info(connection_msg) 58 | if not connection_ok: 59 | raise RuntimeError(connection_msg) 60 | 61 | # Database version check 62 | version_ok, version_msg = verify_database_version(db) 63 | logger.info(version_msg) 64 | if not version_ok: 65 | raise RuntimeError(version_msg) 66 | 67 | def run_paper_tracker(url: Optional[str] = None, date: Optional[str] = None) -> None: 68 | """ 69 | Main function to run the paper tracking process. 
70 | 71 | Args: 72 | url (Optional[str]): Full URL to crawl (e.g., https://huggingface.co/papers?date=2024-12-19) 73 | date (Optional[str]): Date in YYYY-MM-DD format (e.g., 2024-12-19) 74 | """ 75 | # Initialize database first 76 | db = Database(os.getenv("POSTGRES_URL")) 77 | logger.info("Database connection initialized") 78 | 79 | # Perform startup checks before proceeding 80 | perform_startup_checks(db) 81 | 82 | # Determine which URL to use 83 | if url: 84 | papers_url = url 85 | logger.info("Using provided full URL: %s", papers_url) 86 | elif date: 87 | papers_url = f"https://huggingface.co/papers?date={date}" 88 | logger.info("Using URL for specified date: %s", papers_url) 89 | else: 90 | papers_url = get_todays_papers_url() 91 | logger.info("Using today's papers URL: %s", papers_url) 92 | 93 | urls = extract_paper_urls(papers_url) 94 | logger.info("Found %d papers to process", len(urls)) 95 | 96 | try: 97 | asyncio.run(process_paper_batch(urls, db)) 98 | except (SQLAlchemyError, requests.RequestException, ValueError) as e: 99 | logger.error("Critical error in main process: %s", str(e), exc_info=True) 100 | raise 101 | 102 | if __name__ == "__main__": 103 | # Set up argument parser 104 | parser = argparse.ArgumentParser(description='Crawl and extract papers from HuggingFace.') 105 | parser.add_argument('--url', type=str, 106 | help='Full URL to crawl (e.g., https://huggingface.co/papers?date=2024-12-19)') 107 | parser.add_argument('--date', type=str, 108 | help='Date in YYYY-MM-DD format (e.g., 2024-12-19)') 109 | 110 | args = parser.parse_args() 111 | run_paper_tracker(url=args.url, date=args.date) 112 | 113 | # TODO: Include a Bluesky API call to publish the paper's posts to Bluesky. This will require a new 114 | # llm flow to generate the post content and a new function to send the post to Bluesky. 115 | # TODO: test db connection and add check for db versoin matching supabase_db.py before running any modules -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/semantic_filter.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for semantic filtering of research papers using OpenAI's API.""" 2 | 3 | import os 4 | import json 5 | 6 | from json import JSONDecodeError 7 | from pydantic import BaseModel, ValidationError 8 | from dotenv import load_dotenv 9 | 10 | import openai 11 | from logging_config import setup_semantic_filter_logging, log_function_call 12 | from category_prompt import DESIRED_CATEGORY 13 | 14 | # Load environment variables 15 | load_dotenv() 16 | 17 | # Configure OpenAI API key 18 | openai.api_key = os.getenv('OPENAI_API_KEY') 19 | 20 | # Configure logging using centralized configuration 21 | logger = setup_semantic_filter_logging() 22 | logger.info("Using OpenAI version: %s", openai.__version__) 23 | 24 | client = openai.OpenAI() 25 | 26 | class CategoryMatch(BaseModel): 27 | """ 28 | Pydantic model for paper category classification results. 29 | We validate the structured output from the model here, ensuring 30 | it has a boolean 'belongs_to_category' and a float 'confidence'. 31 | """ 32 | belongs_to_category: bool 33 | confidence: float 34 | 35 | @log_function_call 36 | def should_process(paper_details: dict, is_new_paper: bool) -> tuple[bool, float]: 37 | """ 38 | Determine if a paper should be processed (notifications, posts, etc.) 
39 | 40 | Args: 41 | paper_details (dict): Dictionary containing paper details 42 | is_new_paper (bool): Whether this is a new paper or an update 43 | 44 | Returns: 45 | tuple[bool, float]: (should_process, confidence) 46 | - should_process: True if paper should be processed 47 | - confidence: Model's confidence score 48 | """ 49 | if not is_new_paper: 50 | return False, 0.0 51 | 52 | # Get semantic classification 53 | belongs, confidence = belongs_to_category( 54 | paper_details["paper_title"], 55 | paper_details["abstract_body"], 56 | DESIRED_CATEGORY 57 | ) 58 | 59 | # Only process if both belongs is True AND confidence is high enough 60 | return belongs and confidence > 0.8, confidence 61 | 62 | @log_function_call 63 | def belongs_to_category(paper_title: str, paper_abstract: str, desired_category: str) -> tuple[bool, float]: 64 | """ 65 | Determine if a paper belongs to a specific category using 66 | an OpenAI model that supports structured JSON outputs. 67 | 68 | Returns: 69 | tuple: (belongs_to_category: bool, confidence: float) 70 | """ 71 | logger.info("Analyzing paper: '%s' for category '%s'", paper_title, desired_category) 72 | 73 | system_instructions = ( 74 | "You are a research paper classifier. " 75 | "Given: a desired_category, a paper_title, and a paper_abstract, " 76 | "determine if the paper belongs to the desired_category. " 77 | "Output only valid JSON with the exact format: " 78 | "{ \"belongs_to_category\": boolean, \"confidence\": float }. " 79 | "Where 'belongs_to_category' is True if the paper belongs to the specified desired_category, " 80 | "otherwise False, and 'confidence' is a float between 0 and 1. No additional keys or text." 81 | ) 82 | 83 | user_prompt = ( 84 | f"desired_category: {desired_category}\n" 85 | f"paper_title: {paper_title}\n" 86 | f"paper_abstract: {paper_abstract}" 87 | ) 88 | 89 | try: 90 | response = client.chat.completions.create( 91 | model="gpt-4o-mini", 92 | messages=[ 93 | {"role": "system", "content": system_instructions}, 94 | {"role": "user", "content": user_prompt}, 95 | ], 96 | temperature=0.7 97 | ) 98 | # Add detailed response logging 99 | logger.debug("Full API response: %s", response) 100 | 101 | message_content = response.choices[0].message.content.strip() 102 | logger.debug("Raw message content: %s", message_content) 103 | 104 | if not message_content: 105 | logger.error("Empty response from model") 106 | return False, 0.0 107 | 108 | parsed_args = json.loads(message_content) 109 | 110 | classification = CategoryMatch(**parsed_args) 111 | logger.info( 112 | "Classification result: belongs=%s, confidence=%s", 113 | classification.belongs_to_category, 114 | classification.confidence 115 | ) 116 | return classification.belongs_to_category, classification.confidence 117 | 118 | except (JSONDecodeError, ValidationError) as e: 119 | logger.error("Error parsing or validating classification result: %s", e) 120 | return False, 0.0 121 | 122 | if __name__ == "__main__": 123 | logger.info("Starting semantic filter test") 124 | 125 | # Test the classifier 126 | TEST_TITLE = "Building Reliable LLM Agents: A Study in Reinforcement Learning" 127 | TEST_ABSTRACT = ( 128 | "This paper explores methods for creating more reliable AI agents using LLMs and RL..." 
129 | ) 130 | CATEGORY = "LLM Agents" 131 | result = belongs_to_category(TEST_TITLE, TEST_ABSTRACT, CATEGORY) 132 | logger.info("Test result for category '%s': %s", CATEGORY, result) 133 | 134 | # TODO: implement the Instructor library for structured outputs to enhance 135 | # the flexibility of model switching 136 | # TODO: add examples to system prompt of abstract that are known to be 137 | # in the category (agents) 138 | # TODO: store paper category relevance evaluations and confidence scores in the database 139 | # to develop more accurate relevance response evaluations in the future 140 | # using the OpenAI Evals platform. 141 | # TODO: implement error handling for OpenAI API credit exhaustion and send admin-only 142 | # notifications to Discord using discord_notifications.py's webhook. Research needed: 143 | # Discord webhook might not support role-based visibility (@admin mentions) directly - 144 | # may need to create a separate admin-only channel or explore Discord bot implementation 145 | # TODO: swap out gpt-4o for gpt-4o-mini in the semantic filter to save on cost. 146 | # ensure to get the correct model name from the API docs 147 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/x_post_v3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import base64 3 | import hashlib 4 | import secrets 5 | import webbrowser 6 | from http.server import HTTPServer, BaseHTTPRequestHandler 7 | from urllib.parse import parse_qs, urlparse 8 | import requests 9 | import logging 10 | import re 11 | from dotenv import load_dotenv 12 | from requests_oauthlib import OAuth2Session 13 | import time 14 | 15 | # Configure logging 16 | logging.basicConfig( 17 | level=logging.DEBUG, 18 | format='%(asctime)s - %(levelname)s - %(message)s' 19 | ) 20 | logger = logging.getLogger(__name__) 21 | 22 | # Load environment variables 23 | load_dotenv() 24 | 25 | # API endpoints - Updated to correct domains 26 | MEDIA_ENDPOINT_URL = 'https://api.x.com/2/media/upload' 27 | POST_TO_X_URL = 'https://api.x.com/2/tweets' 28 | AUTH_URL = "https://twitter.com/i/oauth2/authorize" 29 | TOKEN_URL = "https://api.twitter.com/2/oauth2/token" 30 | CALLBACK_URL = "http://127.0.0.1:8000/callback" 31 | 32 | class CallbackHandler(BaseHTTPRequestHandler): 33 | """Handle OAuth callback""" 34 | code = None 35 | 36 | def do_GET(self): 37 | """Process callback GET request""" 38 | query = parse_qs(urlparse(self.path).query) 39 | CallbackHandler.code = query.get('code', [None])[0] 40 | 41 | self.send_response(200) 42 | self.send_header('Content-type', 'text/html') 43 | self.end_headers() 44 | self.wfile.write(b"Authorization successful! 
You can close this window.") 45 | 46 | class XPost: 47 | def __init__(self): 48 | client_id = os.getenv('X_OAUTH2_CLIENT_ID') 49 | client_secret = os.getenv('X_OAUTH2_CLIENT_SECRET') 50 | self.access_token = os.getenv('X_OAUTH2_ACCESS_TOKEN') 51 | refresh_token = os.getenv('X_OAUTH2_REFRESH_TOKEN') 52 | 53 | # Initialize OAuth2 session 54 | self.oauth = OAuth2Session( 55 | client_id, 56 | token={ 57 | 'access_token': self.access_token, 58 | 'refresh_token': refresh_token, 59 | 'token_type': 'bearer' 60 | } 61 | ) 62 | 63 | self.headers = { 64 | "Authorization": f"Bearer {self.access_token}", 65 | "Content-Type": "application/json", 66 | "User-Agent": "XPostBot" 67 | } 68 | 69 | def upload_image(self, image_path): 70 | """Upload an image using the v2 endpoint with multi-step process""" 71 | logger.info(f"Attempting to upload image: {image_path}") 72 | 73 | if not os.path.exists(image_path): 74 | raise FileNotFoundError(f"Image file not found: {image_path}") 75 | 76 | total_bytes = os.path.getsize(image_path) 77 | 78 | # Step 1: INIT 79 | init_data = { 80 | 'command': 'INIT', 81 | 'media_type': 'image/png', 82 | 'total_bytes': total_bytes, 83 | 'media_category': 'tweet_image' 84 | } 85 | 86 | response = self.oauth.post(MEDIA_ENDPOINT_URL, params=init_data, headers=self.headers) 87 | if response.status_code < 200 or response.status_code > 299: # Accept any 2xx status 88 | logger.error(f"Media INIT failed: {response.text}") 89 | raise Exception(f"Failed to initialize media upload: {response.text}") 90 | 91 | media_id = response.json()['data']['id'] 92 | 93 | # Step 2: APPEND 94 | with open(image_path, 'rb') as file: 95 | chunk = file.read() 96 | files = {'media': ('chunk', chunk, 'application/octet-stream')} 97 | data = { 98 | 'command': 'APPEND', 99 | 'media_id': media_id, 100 | 'segment_index': 0 101 | } 102 | 103 | response = self.oauth.post(MEDIA_ENDPOINT_URL, data=data, files=files) 104 | if response.status_code < 200 or response.status_code > 299: # Accept any 2xx status 105 | logger.error(f"Media APPEND failed: {response.text}") 106 | raise Exception(f"Failed to append media: {response.text}") 107 | 108 | # Step 3: FINALIZE 109 | finalize_data = { 110 | 'command': 'FINALIZE', 111 | 'media_id': media_id 112 | } 113 | 114 | response = self.oauth.post(MEDIA_ENDPOINT_URL, params=finalize_data, headers=self.headers) 115 | if response.status_code < 200 or response.status_code > 299: # Accept any 2xx status 116 | logger.error(f"Media FINALIZE failed: {response.text}") 117 | raise Exception(f"Failed to finalize media: {response.text}") 118 | 119 | logger.info(f"Successfully uploaded media with ID: {media_id}") 120 | return media_id 121 | 122 | def create_tweet(self, text, media_id): 123 | """Create a tweet with text and attached media""" 124 | logger.info("Attempting to create tweet") 125 | 126 | payload = { 127 | 'text': text, 128 | 'media': { 129 | 'media_ids': [media_id] 130 | } 131 | } 132 | 133 | response = self.oauth.post(POST_TO_X_URL, json=payload, headers=self.headers) 134 | 135 | logger.debug(f"Response status code: {response.status_code}") 136 | logger.debug(f"Response headers: {response.headers}") 137 | logger.debug(f"Response body: {response.text}") 138 | 139 | if response.status_code == 429: 140 | reset_time = int(response.headers.get('x-app-limit-24hour-reset', 0)) 141 | wait_seconds = max(reset_time - int(time.time()), 0) 142 | logger.warning(f"Rate limit exceeded. 
Waiting {wait_seconds} seconds...") 143 | time.sleep(wait_seconds) 144 | # Retry the request 145 | return self.create_tweet(text, media_id) 146 | elif response.status_code != 201: 147 | logger.error(f"Tweet creation failed with status {response.status_code}") 148 | logger.error(f"Response: {response.text}") 149 | raise Exception(f"Failed to create tweet: {response.text}") 150 | 151 | return response.json() 152 | 153 | def main(): 154 | try: 155 | poster = XPost() 156 | 157 | # Use absolute path or correct relative path to your image 158 | image_path = "test.png" # Update this to your image path 159 | logger.info(f"Starting process with image: {image_path}") 160 | 161 | # Upload image 162 | media_id = poster.upload_image(image_path) 163 | 164 | # Create tweet with image 165 | tweet_text = "Testing X API v2 with an image attachment! 🚀" 166 | result = poster.create_tweet(tweet_text, media_id) 167 | 168 | logger.info("Tweet posted successfully!") 169 | logger.info(f"Result: {result}") 170 | 171 | except Exception as e: 172 | logger.error(f"Error: {str(e)}", exc_info=True) 173 | raise 174 | 175 | if __name__ == "__main__": 176 | main() -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Firecrawl Quickstarts Logo 5 | 6 | 7 | 8 | Firecrawl Quickstarts is an independent and unofficial collection of projects designed to help developers quickly get started with building applications using the Firecrawl API. Each quickstart provides a foundation that you can easily build upon and customize for your specific needs. This repository is not affiliated with, endorsed by, or officially supported by Firecrawl. 9 | 10 | ## Getting Started 11 | 12 | To use these quickstarts, you'll need a Firecrawl API key. If you don't have one yet, you can sign up for free at [firecrawl.dev](https://firecrawl.dev). 13 | 14 | ## Available Quickstarts 15 | 16 | ### Eventbrite AI Event Scout 17 | 18 | Automated discovery of AI/ML events across major cities 19 | 20 | - Scrapes events from 50+ global cities daily 21 | - Confidence scoring for relevance filtering 22 | - NLP filtering for relevant content 23 | - Automated Discord alerts with event details 24 | 25 | [Go to Eventbrite AI Event Scout](./events-scout-examples/eventbrite.ipynb) 26 | 27 | ### Luma AI Event Discovery 28 | 29 | Automated tracking of tech meetups across 60+ global cities 30 | 31 | - Scrapes and structures event data (titles/dates/locations) 32 | - Confidence scoring for relevance filtering 33 | - NLP filtering for relevant events 34 | - Automated Discord alerts with event details 35 | 36 | [Go to Luma AI Event Discovery](./events-scout-examples/luma.ipynb) 37 | 38 | ### Firecrawl Web Crawling with OpenAI and Anthropic 39 | 40 | This quickstart introduces how to integrate Firecrawl with OpenAI and Anthropic models to search and extract information based on specific user objectives. Learn to map a website, identify relevant pages, and retrieve content aligned with the objective. Ideal for targeted information gathering. 41 | 42 | [Go to Firecrawl Web Crawling with OpenAI and Anthropic](./claude_researcher_with_map.ipynb) 43 | 44 | ### Integrating OpenAI o1 Models with Firecrawl 45 | 46 | Explore how to enhance the Firecrawl web crawling process with OpenAI’s o1 reasoning models.
This quickstart guides you in using these advanced models to generate search parameters, map sites, and validate extracted content, enhancing the precision and relevance of data extraction. 47 | 48 | [Go to Integrating OpenAI o1 Models with Firecrawl](./crawl_and_extract_with_openai_o1.ipynb) 49 | 50 | ### Building a Web Crawler with Grok-2 and Firecrawl 51 | 52 | Combine Grok-2’s AI-powered understanding with Firecrawl’s search to create an intelligent web crawler. This quickstart demonstrates building a targeted crawler that finds and processes structured data on web pages, with output in JSON format for seamless data handling. 53 | 54 | [Go to Building a Web Crawler with Grok-2 and Firecrawl](./crawl_and_extract_with_xai_grok.ipynb) 55 | 56 | ### Firecrawl Map Endpoint Quickstart 57 | 58 | Learn how to use Firecrawl's Map endpoint to create comprehensive sitemaps from single URLs. This quickstart is perfect for efficiently gathering website structures, enabling tasks such as content mapping, SEO analysis, and scalable web data extraction. 59 | 60 | [Go to Firecrawl Map Endpoint Quickstart](./firecrawl_map_endpoint_tutorial.ipynb) 61 | 62 | ### Job Board Scraping with Firecrawl and OpenAI 63 | 64 | Automate job listing extraction and analysis with Firecrawl and OpenAI’s Structured Outputs. This quickstart demonstrates scraping job boards, extracting structured job details, and matching listings to a user’s resume with schema-compliant outputs for reliable data processing. 65 | 66 | [Go to Job Board Scraping with Firecrawl and OpenAI](./job_scraping_tutorial.ipynb) 67 | 68 | ### Firecrawl LLM Extract Tutorial 69 | 70 | Learn how to use Firecrawl’s LLM-powered data extraction features. This quickstart covers extracting structured data from web pages, with options for schema-defined and prompt-only extraction, making it adaptable for diverse data formats and applications. 71 | 72 | [Go to Firecrawl LLM Extract Tutorial](./llm_extract_tutorial.ipynb) 73 | 74 | ## General Usage 75 | 76 | Each quickstart project is a Jupyter notebook designed to be easily opened and run on Google Colab. To get started, follow these steps: 77 | 78 | 1. **Open the Repository in Google Colab** 79 | 80 | Each notebook has a link to open directly in Google Colab. Click on the link for the quickstart you want to explore. 81 | 82 | 2. **Set Up Your Firecrawl API Key** 83 | 84 | Each notebook requires a Firecrawl API key. Once you've created your key (available [here](https://firecrawl.dev)), enter it in the notebook when prompted or set it as an environment variable as directed in the notebook. 85 | 86 | 3. **Run Each Notebook Cell Sequentially** 87 | 88 | Follow the instructions within each notebook, running cells in order. The notebooks will guide you through each step, from setting up the environment to executing web scraping or extraction tasks. 89 | 90 | 4. **View Results and Experiment** 91 | 92 | The notebooks are designed to be interactive. You can modify the code cells, adjust parameters, or try different objectives to explore Firecrawl’s capabilities further. 93 | 94 | Each notebook includes explanations and usage examples to help you understand and customize your setup. 
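If you prefer to experiment outside a notebook, the snippet below sketches the minimal setup every quickstart assumes: the API key read from the `FIRECRAWL_API_KEY` environment variable and a first scrape and map request. It is an illustrative, unofficial sketch that assumes the `firecrawl-py` v1 SDK (the version pinned in the whitepaper-tracking example's `pyproject.toml`); the parameters each notebook actually passes may differ.

```python
# Illustrative sketch only: assumes the firecrawl-py v1 SDK and that
# FIRECRAWL_API_KEY is already set in your environment (e.g. via a .env file).
import os

from firecrawl import FirecrawlApp

app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])

# Scrape a single page; most notebooks start from markdown output like this.
scrape_result = app.scrape_url("https://firecrawl.dev", params={"formats": ["markdown"]})
print(scrape_result)

# Map a site to enumerate its URLs before choosing which pages to scrape in depth.
map_result = app.map_url("https://firecrawl.dev")
print(map_result)
```

If both calls return data, your key is working and any of the notebooks above can be run with the same credentials.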
95 | 96 | ## Explore Further 97 | 98 | To deepen your understanding of working with Firecrawl and its API, check out these resources: 99 | 100 | - [**Firecrawl Documentation**](https://docs.firecrawl.dev) - Comprehensive guides and API references 101 | - [**Firecrawl SDKs**](https://docs.firecrawl.dev/sdks/overview) - Explore the official SDKs for [Python](https://docs.firecrawl.dev/sdks/python), [Node.js](https://docs.firecrawl.dev/sdks/node), [Go](https://docs.firecrawl.dev/sdks/go), and [Rust](https://docs.firecrawl.dev/sdks/rust) 102 | - [**LLM Framework Integrations**](https://docs.firecrawl.dev/integrations/overview) - Learn how to use Firecrawl with frameworks like LangChain and Llama Index 103 | - [**Firecrawl API Reference**](https://docs.firecrawl.dev/api-reference/introduction) - Detailed API endpoints and parameters 104 | 105 | ## Contributing 106 | 107 | We welcome contributions to the Firecrawl Quickstarts repository! If you have ideas for new quickstart projects or improvements to existing ones, please open an issue or submit a pull request. 108 | 109 | ## Community and Support 110 | 111 | - Join the [Firecrawl Discord community](https://discord.com/invite/gSmWdAkdwd) for discussions and support 112 | - Follow Firecrawl on [Twitter](https://twitter.com/firecrawl_dev) and [LinkedIn](https://www.linkedin.com/company/104100957) for updates 113 | - Check out the [Firecrawl Support Documentation](https://docs.firecrawl.dev) for additional help 114 | 115 | ## License 116 | 117 | This project is licensed under the [MIT](https://opensource.org/licenses/MIT) License - see the [LICENSE](LICENSE) file for details. 118 | 119 | --- 120 | 121 | *It is the sole responsibility of the end users to respect websites' policies when scraping, searching, and crawling with Firecrawl. Users are advised to adhere to the applicable privacy policies and terms of use of the websites prior to initiating any scraping activities. By default, Firecrawl respects the directives specified in the websites' robots.txt files when crawling.
By utilizing Firecrawl, you expressly agree to comply with these conditions.* 123 | 124 | [↑ Back to Top ↑](#firecrawl-quickstarts) 125 | 126 | Copyright (c) 2024-present, Alex Fazio 127 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/x_post.py: -------------------------------------------------------------------------------- 1 | """Module for posting research papers to X (Twitter).""" 2 | 3 | import os 4 | import base64 5 | import hashlib 6 | import secrets 7 | import webbrowser 8 | from http.server import HTTPServer, BaseHTTPRequestHandler 9 | from urllib.parse import parse_qs, urlparse 10 | from dotenv import load_dotenv, set_key, find_dotenv 11 | import requests 12 | from typing import Optional 13 | from logging_config import setup_base_logging, log_function_call 14 | 15 | # Configure logging using the centralized configuration 16 | logger = setup_base_logging( 17 | logger_name="x_poster", 18 | log_file="x_poster.log", 19 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 20 | ) 21 | 22 | load_dotenv() 23 | 24 | AUTH_URL = "https://twitter.com/i/oauth2/authorize" 25 | TOKEN_URL = "https://api.twitter.com/2/oauth2/token" 26 | CALLBACK_URL = "http://127.0.0.1:8000/callback" 27 | X_API_URL = "https://api.twitter.com/2/tweets" 28 | 29 | @log_function_call 30 | def generate_pkce_pair(): 31 | """Generate PKCE code verifier and challenge""" 32 | code_verifier = secrets.token_urlsafe(64) 33 | code_challenge = base64.urlsafe_b64encode( 34 | hashlib.sha256(code_verifier.encode()).digest() 35 | ).rstrip(b'=').decode() 36 | 37 | logger.debug(f"Generated PKCE - Verifier: {len(code_verifier)} chars, Challenge: {len(code_challenge)} chars") 38 | return code_verifier, code_challenge 39 | 40 | class CallbackHandler(BaseHTTPRequestHandler): 41 | """Handle OAuth callback""" 42 | code = None 43 | 44 | def log_message(self, format, *args): 45 | """Override to use our logger""" 46 | logger.info(f"OAuth Callback: {format%args}") 47 | 48 | def do_GET(self): 49 | """Process callback GET request""" 50 | logger.info(f"Received callback request: {self.path}") 51 | 52 | query = parse_qs(urlparse(self.path).query) 53 | CallbackHandler.code = query.get('code', [None])[0] 54 | 55 | if CallbackHandler.code: 56 | logger.info("Successfully received authorization code") 57 | else: 58 | logger.error(f"No authorization code in callback. Query params: {query}") 59 | 60 | # Log any error parameters 61 | if 'error' in query: 62 | logger.error(f"Error in callback: {query['error']}") 63 | if 'error_description' in query: 64 | logger.error(f"Error description: {query['error_description']}") 65 | 66 | self.send_response(200) 67 | self.send_header('Content-type', 'text/html') 68 | self.end_headers() 69 | self.wfile.write(b"Authorization successful! 
You can close this window.") 70 | 71 | @log_function_call 72 | def load_stored_tokens(): 73 | """Load stored OAuth tokens from .env""" 74 | dotenv_path = find_dotenv() 75 | load_dotenv(dotenv_path) 76 | 77 | access_token = os.getenv('X_ACCESS_TOKEN') 78 | refresh_token = os.getenv('X_REFRESH_TOKEN') 79 | 80 | if access_token and refresh_token: 81 | logger.debug("Successfully loaded stored tokens") 82 | return { 83 | 'access_token': access_token, 84 | 'refresh_token': refresh_token, 85 | 'expires_in': os.getenv('X_TOKEN_EXPIRES_IN'), 86 | 'scope': os.getenv('X_TOKEN_SCOPE') 87 | } 88 | logger.warning("No stored tokens found") 89 | return None 90 | 91 | @log_function_call 92 | def save_tokens(tokens): 93 | """Save OAuth tokens to .env""" 94 | dotenv_path = find_dotenv() 95 | 96 | try: 97 | set_key(dotenv_path, 'X_ACCESS_TOKEN', tokens['access_token']) 98 | set_key(dotenv_path, 'X_REFRESH_TOKEN', tokens['refresh_token']) 99 | set_key(dotenv_path, 'X_TOKEN_EXPIRES_IN', str(tokens['expires_in'])) 100 | set_key(dotenv_path, 'X_TOKEN_SCOPE', tokens['scope']) 101 | load_dotenv(dotenv_path) 102 | logger.info("Successfully saved tokens to .env") 103 | except Exception as e: 104 | logger.error(f"Failed to save tokens: {str(e)}") 105 | raise 106 | 107 | def refresh_access_token(refresh_token): 108 | """Get new access token using refresh token""" 109 | auth = ( 110 | os.getenv('X_OAUTH2_CLIENT_ID'), 111 | os.getenv('X_OAUTH2_CLIENT_SECRET') 112 | ) 113 | 114 | data = { 115 | 'refresh_token': refresh_token, 116 | 'grant_type': 'refresh_token' 117 | } 118 | 119 | response = requests.post(TOKEN_URL, auth=auth, data=data) 120 | return response.json() 121 | 122 | def get_oauth2_token(): 123 | """Get OAuth 2.0 token, using stored tokens if available""" 124 | # Try to load stored tokens 125 | tokens = load_stored_tokens() 126 | 127 | if tokens: 128 | logger.debug(f"Found stored tokens with keys: {list(tokens.keys())}") 129 | if 'refresh_token' in tokens: 130 | try: 131 | logger.info("Attempting to refresh token...") 132 | new_tokens = refresh_access_token(tokens['refresh_token']) 133 | logger.debug(f"Refresh response keys: {list(new_tokens.keys())}") 134 | save_tokens(new_tokens) 135 | return new_tokens['access_token'] 136 | except Exception as e: 137 | logger.error(f"Token refresh failed: {e}") 138 | else: 139 | logger.error("Stored tokens missing refresh_token") 140 | 141 | # If no stored tokens or refresh failed, do full authorization 142 | logger.info("Starting OAuth 2.0 PKCE flow") 143 | code_verifier, code_challenge = generate_pkce_pair() 144 | 145 | auth_params = { 146 | 'response_type': 'code', 147 | 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 148 | 'redirect_uri': CALLBACK_URL, 149 | 'scope': 'tweet.write tweet.read users.read offline.access', 150 | 'code_challenge': code_challenge, 151 | 'code_challenge_method': 'S256', 152 | 'state': secrets.token_urlsafe(32) 153 | } 154 | 155 | logger.info("Starting local callback server") 156 | server = HTTPServer(('127.0.0.1', 8000), CallbackHandler) 157 | 158 | auth_url = f"{AUTH_URL}?{'&'.join(f'{k}={v}' for k,v in auth_params.items())}" 159 | logger.info(f"Opening authorization URL: {auth_url}") 160 | webbrowser.open(auth_url) 161 | 162 | logger.info("Waiting for callback...") 163 | server.handle_request() 164 | 165 | if not CallbackHandler.code: 166 | raise Exception("Failed to get authorization code") 167 | 168 | logger.info("Exchanging code for token") 169 | token_data = { 170 | 'code': CallbackHandler.code, 171 | 'grant_type': 'authorization_code', 172 
| 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 173 | 'redirect_uri': CALLBACK_URL, 174 | 'code_verifier': code_verifier 175 | } 176 | 177 | auth = ( 178 | os.getenv('X_OAUTH2_CLIENT_ID'), 179 | os.getenv('X_OAUTH2_CLIENT_SECRET') 180 | ) 181 | 182 | response = requests.post(TOKEN_URL, auth=auth, data=token_data) 183 | if response.status_code != 200: 184 | raise Exception(f"Token exchange failed: {response.text}") 185 | 186 | token_json = response.json() 187 | logger.debug(f"Received token response with keys: {list(token_json.keys())}") 188 | save_tokens(token_json) 189 | return token_json['access_token'] 190 | 191 | def format_post( 192 | paper_title: str, 193 | authors: list, 194 | url: str, 195 | pdf_url: Optional[str] = None, 196 | arxiv_url: Optional[str] = None, 197 | github_url: Optional[str] = None 198 | ) -> str: 199 | """Format paper details into X post text""" 200 | # Start with title and truncate if needed 201 | post = f"📚 {paper_title[:100]}{'...' if len(paper_title) > 100 else ''}\n\n" 202 | 203 | # Add authors (limited to first 2 if many) 204 | if len(authors) > 2: 205 | authors_text = f"by {', '.join(authors[:2])} et al." 206 | else: 207 | authors_text = f"by {', '.join(authors)}" 208 | post += f"{authors_text}\n\n" 209 | 210 | # Add links 211 | post += f"🔗 {url}" 212 | if pdf_url: 213 | post += f"\n📄 {pdf_url}" 214 | if arxiv_url: 215 | post += f"\n📝 {arxiv_url}" 216 | if github_url: 217 | post += f"\n💻 {github_url}" 218 | 219 | return post 220 | 221 | def post_paper( 222 | paper_title: str, 223 | authors: list, 224 | url: str, 225 | pdf_url: Optional[str] = None, 226 | arxiv_url: Optional[str] = None, 227 | github_url: Optional[str] = None 228 | ): 229 | """Post paper to X""" 230 | logger.info(f"Attempting to post paper: {paper_title}") 231 | 232 | # Get OAuth 2.0 token 233 | try: 234 | token = get_oauth2_token() 235 | logger.info("Successfully obtained OAuth token") 236 | except Exception as e: 237 | logger.error(f"Failed to get OAuth token: {str(e)}") 238 | return None 239 | 240 | post_text = format_post( 241 | paper_title, authors, url, 242 | pdf_url, arxiv_url, github_url 243 | ) 244 | 245 | headers = { 246 | 'Authorization': f'Bearer {token}', 247 | 'Content-Type': 'application/json' 248 | } 249 | 250 | payload = {"text": post_text} 251 | 252 | try: 253 | logger.info("Sending post request to X API") 254 | response = requests.post(X_API_URL, json=payload, headers=headers) 255 | 256 | logger.debug(f"Post response status: {response.status_code}") 257 | logger.debug(f"Post response headers: {response.headers}") 258 | 259 | if response.status_code != 201: 260 | logger.error(f"Error posting to X: {response.text}") 261 | else: 262 | logger.info("Successfully posted to X") 263 | 264 | return response.json() 265 | except Exception as e: 266 | logger.error(f"Error posting to X: {str(e)}", exc_info=True) 267 | return None 268 | 269 | if __name__ == "__main__": 270 | logger.info("Starting X post test") 271 | try: 272 | response = post_paper( 273 | paper_title="Test Paper Title", 274 | authors=["Author 1", "Author 2"], 275 | url="https://huggingface.co/papers/test", 276 | pdf_url="https://example.com/test.pdf", 277 | arxiv_url="https://arxiv.org/abs/test", 278 | github_url="https://github.com/test/repo" 279 | ) 280 | 281 | if response and 'data' in response: 282 | logger.info("✅ Post successful!") 283 | logger.info(f"Tweet ID: {response['data']['id']}") 284 | logger.info(f"Tweet text: {response['data']['text']}") 285 | else: 286 | logger.error("❌ Post failed!") 287 | 
logger.error(f"Response: {response}") 288 | 289 | except Exception as e: 290 | logger.error("❌ Error during posting:") 291 | logger.error(f"Error details: {str(e)}", exc_info=True) -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/tests/test_semantic_filter.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for testing the semantic filter.""" 2 | 3 | import os 4 | import sys 5 | 6 | # Add the project root to the Python path 7 | project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../../")) 8 | sys.path.insert(0, project_root) 9 | 10 | from examples.firecrawl_automated_whitepaper_tracking.semantic_filter import belongs_to_category 11 | from examples.firecrawl_automated_whitepaper_tracking.category_prompt import DESIRED_CATEGORY 12 | 13 | def test_belongs_to_category(): 14 | """ 15 | This test ensures that belongs_to_category returns a boolean indicating if the paper 16 | likely belongs to the specified category, based on the model's classification. 17 | Also prints confidence scores for analysis. 18 | """ 19 | 20 | # Our test cases with inputs and expected outputs 21 | test_cases = [ 22 | ( 23 | "PaliGemma 2: A Family of Versatile VLMs for Transfer", 24 | """PaliGemma 2 is an upgrade of the PaliGemma open Vision-Language Model (VLM) based on the Gemma 2 family of language models. We combine the SigLIP-So400m vision encoder that was also used by PaliGemma with the whole range of Gemma 2 models, from the 2B one all the way up to the 27B model. We train these models at three resolutions (224px, 448px, and 896px) in multiple stages to equip them with broad knowledge for transfer via fine-tuning. The resulting family of base models covering different model sizes and resolutions allows us to investigate factors impacting transfer performance (such as learning rate) and to analyze the interplay between the type of task, model size, and resolution. We further increase the number and breadth of transfer tasks beyond the scope of PaliGemma including different OCR-related tasks such as table structure recognition, molecular structure recognition, music score recognition, as well as long fine-grained captioning and radiography report generation, on which PaliGemma 2 obtains state-of-the-art results.""", 25 | DESIRED_CATEGORY, 26 | False 27 | ), 28 | ( 29 | "From Generation to Judgment: Opportunities and Challenges of LLM-as-a-judge", 30 | """Assessment and evaluation have long been critical challenges in artificial intelligence (AI) and natural language processing (NLP). However, traditional methods, whether matching-based or embedding-based, often fall short of judging subtle attributes and delivering satisfactory results. Recent advancements in Large Language Models (LLMs) inspire the "LLM-as-a-judge" paradigm, where LLMs are leveraged to perform scoring, ranking, or selection across various tasks and applications. This paper provides a comprehensive survey of LLM-based judgment and assessment, offering an in-depth overview to advance this emerging field. We begin by giving detailed definitions from both input and output perspectives. Then we introduce a comprehensive taxonomy to explore LLM-as-a-judge from three dimensions: what to judge, how to judge and where to judge. 
Finally, we compile benchmarks for evaluating LLM-as-a-judge and highlight key challenges and promising directions, aiming to provide valuable insights and inspire future research in this promising research area. Paper list and more resources about LLM-as-a-judge can be found at https://github.com/llm-as-a-judge/Awesome-LLM-as-a-judge and https://llm-as-a-judge.github.io.""", 31 | DESIRED_CATEGORY, 32 | False 33 | ), 34 | ( 35 | "Evaluation Agent: Efficient and Promptable Evaluation Framework for Visual Generative Models", 36 | """Recent advancements in visual generative models have enabled high-quality image and video generation, opening diverse applications. However, evaluating these models often demands sampling hundreds or thousands of images or videos, making the process computationally expensive, especially for diffusion-based models with inherently slow sampling. Moreover, existing evaluation methods rely on rigid pipelines that overlook specific user needs and provide numerical results without clear explanations. In contrast, humans can quickly form impressions of a model's capabilities by observing only a few samples. To mimic this, we propose the Evaluation Agent framework, which employs human-like strategies for efficient, dynamic, multi-round evaluations using only a few samples per round, while offering detailed, user-tailored analyses. It offers four key advantages: 1) efficiency, 2) promptable evaluation tailored to diverse user needs, 3) explainability beyond single numerical scores, and 4) scalability across various models and tools. Experiments show that Evaluation Agent reduces evaluation time to 10% of traditional methods while delivering comparable results. The Evaluation Agent framework is fully open-sourced to advance research in visual generative models and their efficient evaluation.""", 37 | DESIRED_CATEGORY, 38 | True 39 | ), 40 | ( 41 | "TheAgentCompany: Benchmarking LLM Agents on Consequential Real World Tasks", 42 | """We interact with computers on an everyday basis, be it in everyday life or work, and many aspects of work can be done entirely with access to a computer and the Internet. At the same time, thanks to improvements in large language models (LLMs), there has also been a rapid development in AI agents that interact with and affect change in their surrounding environments. But how performant are AI agents at helping to accelerate or even autonomously perform work-related tasks? The answer to this question has important implications for both industry looking to adopt AI into their workflows, and for economic policy to understand the effects that adoption of AI may have on the labor market. To measure the progress of these LLM agents' performance on performing real-world professional tasks, in this paper, we introduce TheAgentCompany, an extensible benchmark for evaluating AI agents that interact with the world in similar ways to those of a digital worker: by browsing the Web, writing code, running programs, and communicating with other coworkers. We build a self-contained environment with internal web sites and data that mimics a small software company environment, and create a variety of tasks that may be performed by workers in such a company. We test baseline agents powered by both closed API-based and open-weights language models (LMs), and find that with the most competitive agent, 24% of the tasks can be completed autonomously. 
This paints a nuanced picture on task automation with LM agents -- in a setting simulating a real workplace, a good portion of simpler tasks could be solved autonomously, but more difficult long-horizon tasks are still beyond the reach of current systems.""", 43 | DESIRED_CATEGORY, 44 | True 45 | ), 46 | ( 47 | "GUI Agents: A Survey", 48 | """Graphical User Interface (GUI) agents, powered by Large Foundation Models, have emerged as a transformative approach to automating human-computer interaction. These agents autonomously interact with digital systems or software applications via GUIs, emulating human actions such as clicking, typing, and navigating visual elements across diverse platforms. Motivated by the growing interest and fundamental importance of GUI agents, we provide a comprehensive survey that categorizes their benchmarks, evaluation metrics, architectures, and training methods. We propose a unified framework that delineates their perception, reasoning, planning, and acting capabilities. Furthermore, we identify important open challenges and discuss key future directions. Finally, this work serves as a basis for practitioners and researchers to gain an intuitive understanding of current progress, techniques, benchmarks, and critical open problems that remain to be addressed.""", 49 | DESIRED_CATEGORY, 50 | True 51 | ), 52 | ( 53 | "Aguvis: Unified Pure Vision Agents for Autonomous GUI Interaction", 54 | """Graphical User Interfaces (GUIs) are critical to human-computer interaction, yet automating GUI tasks remains challenging due to the complexity and variability of visual environments. Existing approaches often rely on textual representations of GUIs, which introduce limitations in generalization, efficiency, and scalability. In this paper, we introduce Aguvis, a unified pure vision-based framework for autonomous GUI agents that operates across various platforms. Our approach leverages image-based observations, and grounding instructions in natural language to visual elements, and employs a consistent action space to ensure cross-platform generalization. To address the limitations of previous work, we integrate explicit planning and reasoning within the model, enhancing its ability to autonomously navigate and interact with complex digital environments. We construct a large-scale dataset of GUI agent trajectories, incorporating multimodal reasoning and grounding, and employ a two-stage training pipeline that first focuses on general GUI grounding, followed by planning and reasoning. Through comprehensive experiments, we demonstrate that Aguvis surpasses previous state-of-the-art methods in both offline and real-world online scenarios, achieving, to our knowledge, the first fully autonomous pure vision GUI agent capable of performing tasks independently without collaboration with external closed-source models. 
We open-sourced all datasets, models, and training recipes to facilitate future research at https://aguvis-project.github.io/.""", 55 | DESIRED_CATEGORY, 56 | True 57 | ), 58 | ] 59 | 60 | # Track failed tests 61 | failed_tests = [] 62 | 63 | # Run each test case 64 | for paper_title, paper_abstract, desired_category, expected_boolean in test_cases: 65 | try: 66 | # Get the raw response from the model 67 | result, confidence = belongs_to_category(paper_title, paper_abstract, desired_category) 68 | 69 | # Print the confidence score and classification result 70 | print(f"\nPaper: {paper_title[:50]}...") 71 | print(f"Expected category match: {expected_boolean}") 72 | print(f"Actual category match: {result}") 73 | print(f"Confidence score: {confidence:.2f}") 74 | 75 | assert isinstance(result, bool), "The result should be a boolean." 76 | if result != expected_boolean: 77 | failed_tests.append({ 78 | 'title': paper_title[:50], 79 | 'expected': expected_boolean, 80 | 'got': result 81 | }) 82 | 83 | except Exception as e: 84 | failed_tests.append({ 85 | 'title': paper_title[:50], 86 | 'error': str(e) 87 | }) 88 | 89 | # Print summary at the end 90 | print("\n=== Test Summary ===") 91 | if not failed_tests: 92 | print("All tests passed successfully!") 93 | else: 94 | print(f"Failed tests ({len(failed_tests)}):") 95 | for test in failed_tests: 96 | if 'error' in test: 97 | print(f"- {test['title']}: {test['error']}") 98 | else: 99 | print(f"- {test['title']}: expected {test['expected']}, got {test['got']}") 100 | raise AssertionError("Some tests failed. See summary above.") 101 | 102 | if __name__ == "__main__": 103 | test_belongs_to_category() 104 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/firecrawl_crawl_extract.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for crawling and extracting data from Hugging Face papers using Firecrawl. 2 | 3 | Handles paper metadata extraction and processing for the notification system. 4 | """ 5 | 6 | import asyncio 7 | import os 8 | import re 9 | from datetime import datetime 10 | from typing import Dict, Any 11 | 12 | # Third-party imports 13 | import pytz 14 | import requests 15 | from sqlalchemy.exc import SQLAlchemyError 16 | from pydantic import BaseModel 17 | from firecrawl import FirecrawlApp 18 | from dotenv import load_dotenv 19 | from supabase_db import Database 20 | from semantic_filter import should_process 21 | from discord_notifications import send_paper_notification 22 | from x_post import post_paper 23 | from logging_config import setup_crawler_logging 24 | 25 | # Initialize logger 26 | logger = setup_crawler_logging() 27 | 28 | # Load environment variables 29 | load_dotenv() 30 | 31 | # Validate required environment variables 32 | if not os.getenv("POSTGRES_URL"): 33 | raise ValueError("POSTGRES_URL environment variable not set") 34 | if not os.getenv("FIRECRAWL_API_KEY"): 35 | raise ValueError("FIRECRAWL_API_KEY environment variable not set") 36 | 37 | def extract_paper_urls(target_url: str) -> list: 38 | """ 39 | Extract all paper source URLs from a given target URL using Firecrawl.
40 | 41 | Args: 42 | target_url (str): The URL to crawl for paper sources 43 | 44 | Returns: 45 | list: A list of extracted source URLs, excluding daily papers URLs 46 | """ 47 | logger.info("Starting URL extraction from: %s", target_url) 48 | exclude_url_pattern = ( 49 | r"^https://huggingface\.co/papers\?date=" 50 | r"\d{4}-(?:0[1-9]|1[0-2])-(?:0[1-9]|[12]\d|3[01])$" 51 | ) 52 | def get_all_source_urls(json_data: Dict[str, Any]) -> list: 53 | extracted_urls = [] 54 | logger.debug("Processing JSON data with %d entries", len(json_data.get('data', []))) 55 | if "data" in json_data: 56 | for entry in json_data["data"]: 57 | if "metadata" in entry and "sourceURL" in entry["metadata"]: 58 | url = entry["metadata"]["sourceURL"] 59 | if not re.match(exclude_url_pattern, url): 60 | extracted_urls.append(url) 61 | if "next" in json_data and json_data["next"]: 62 | logger.debug("Found next page: %s", json_data['next']) 63 | next_page_url = json_data["next"] 64 | response = requests.get(next_page_url) # noqa 65 | if response.ok: 66 | next_json_data = response.json() 67 | extracted_urls.extend(get_all_source_urls(next_json_data)) 68 | return extracted_urls 69 | 70 | load_dotenv() 71 | api_key = os.getenv("FIRECRAWL_API_KEY") 72 | app = FirecrawlApp(api_key=api_key) 73 | params = { 74 | 'limit': 30, 75 | 'excludePaths': ['papers$'], 76 | 'includePaths': ['papers/*'], 77 | 'ignoreSitemap': True, 78 | 'scrapeOptions': { 79 | 'formats': ['markdown', 'links', 'html'], 80 | 'onlyMainContent': False, 81 | 'includeTags': ['a'] 82 | } 83 | } 84 | logger.info("Crawling URL with params: %s", params) 85 | crawl_result = app.crawl_url(target_url, params=params) 86 | urls = get_all_source_urls(crawl_result) 87 | logger.info("Extracted %d paper URLs", len(urls)) 88 | return urls 89 | 90 | async def extract_paper_details(url: str) -> dict: 91 | """Extract paper details from a given URL using FirecrawlApp. 92 | 93 | This async function handles the extraction of metadata from individual paper pages. 94 | It uses the FirecrawlApp to scrape structured data according to the ExtractSchema. 95 | The synchronous FirecrawlApp calls are run in a separate thread using asyncio.to_thread. 96 | 97 | Args: 98 | url (str): The URL of the paper to extract details from. 99 | 100 | Returns: 101 | dict: Extracted paper details including title, upvotes, comments, and URLs. 
102 | """ 103 | logger.info("Extracting paper details from: %s", url) 104 | # Initialize the FirecrawlApp with your API key 105 | app = FirecrawlApp(api_key=os.getenv("FIRECRAWL_API_KEY")) 106 | 107 | class ExtractSchema(BaseModel): # noqa 108 | """Schema for extracting paper details from Hugging Face papers.""" 109 | paper_title: str 110 | number_of_upvotes: int 111 | number_of_comments: int 112 | view_pdf_url: str 113 | view_arxiv_page_url: str 114 | authors: str 115 | abstract_body: str 116 | # Publication date represents when the paper was originally published (e.g., on arXiv) 117 | utc_publication_date_day: int 118 | utc_publication_date_month: int 119 | utc_publication_date_year: int 120 | # Submission date represents when the paper was submitted/added to 121 | # HuggingFace's daily papers page 122 | utc_submission_date_day: int 123 | utc_submission_date_month: int 124 | utc_submission_date_year: int 125 | github_repo_url: str 126 | 127 | # Run the synchronous scrape_url method in a separate thread 128 | data = await asyncio.to_thread( 129 | app.scrape_url, 130 | url, 131 | { 132 | 'formats': ['extract'], 133 | 'extract': { 134 | 'schema': ExtractSchema.model_json_schema(), 135 | } 136 | } 137 | ) 138 | logger.debug("Raw extraction data: %s", data['extract']) 139 | return data['extract'] 140 | 141 | async def process_paper_batch(urls: list[str], db: Database, batch_size: int = 5): 142 | """Process papers in batches to avoid overwhelming resources""" 143 | for i in range(0, len(urls), batch_size): 144 | batch = urls[i:i + batch_size] 145 | tasks = [] 146 | for url in batch: 147 | tasks.append(extract_paper_details(url)) 148 | 149 | details_list = await asyncio.gather(*tasks, return_exceptions=True) 150 | 151 | for url, details in zip(batch, details_list): 152 | current_time = datetime.now() 153 | paper_data = { 154 | "url": url, 155 | "extraction_success": True, 156 | "extraction_error": None, 157 | "last_extraction_attempt": current_time, 158 | "notification_sent": False 159 | } 160 | 161 | if isinstance(details, Exception): 162 | logger.error(f"Error processing {url}: {details}") 163 | paper_data.update({ 164 | "extraction_success": False, 165 | "extraction_error": str(details) 166 | }) 167 | try: 168 | db.add_paper(paper_data) 169 | except SQLAlchemyError as e: 170 | logger.error(f"Database error storing failed paper {url}: {e}") 171 | continue 172 | 173 | try: 174 | paper_data.update(details) 175 | is_new_paper = db.add_paper(paper_data) 176 | 177 | # Use should_process from semantic_filter 178 | should_process_paper, confidence = should_process(details, is_new_paper) 179 | 180 | if should_process_paper: 181 | # Send Discord notification 182 | notification_success = await send_paper_notification( 183 | paper_title=details["paper_title"], 184 | authors=details["authors"].split(", "), 185 | abstract=details["abstract_body"], 186 | upvotes=details["number_of_upvotes"], 187 | comments=details["number_of_comments"], 188 | url=url, 189 | pdf_url=details["view_pdf_url"], 190 | arxiv_url=details["view_arxiv_page_url"], 191 | github_url=details["github_repo_url"] 192 | ) 193 | 194 | if notification_success: 195 | try: 196 | db.update_notification_status(url, True) 197 | except SQLAlchemyError as e: 198 | logger.error(f"Failed to update notification status for {url}: {e}") 199 | 200 | # Post to X 201 | try: 202 | x_response = post_paper( 203 | paper_title=details["paper_title"], 204 | authors=details["authors"].split(", "), 205 | url=url, 206 | pdf_url=details["view_pdf_url"], 207 | 
arxiv_url=details["view_arxiv_page_url"], 208 | github_url=details["github_repo_url"] 209 | ) 210 | if x_response and 'data' in x_response: 211 | logger.info(f"Successfully posted paper to X: {url}") 212 | # TODO: Update x_post_sent status in DB once column is added 213 | else: 214 | logger.error(f"Failed to post paper to X: {url}") 215 | except Exception as e: 216 | logger.error(f"Error posting to X for {url}: {e}") 217 | 218 | except Exception as e: 219 | logger.error(f"Error processing details for {url}: {e}") 220 | paper_data.update({ 221 | "extraction_success": False, 222 | "extraction_error": str(e) 223 | }) 224 | try: 225 | db.add_paper(paper_data) 226 | except SQLAlchemyError as db_error: 227 | logger.error(f"Database error storing error state for {url}: {db_error}") 228 | 229 | def get_todays_papers_url() -> str: 230 | """ 231 | Returns today's HuggingFace papers URL using San Francisco timezone. 232 | 233 | Returns: 234 | str: URL for today's papers webpage in format https://huggingface.co/papers?date=YYYY-MM-DD 235 | """ 236 | sf_tz = pytz.timezone('America/Los_Angeles') 237 | today = datetime.now(sf_tz).strftime('%Y-%m-%d') 238 | return f"https://huggingface.co/papers?date={today}" 239 | 240 | # TODO: create a streamlit ui to set environment variables and desired categories for the semantic filter 241 | # TODO: make the extract_paper_details function async so details are extracted in parallel 242 | # TODO: make all functions async to avoid redudant code 243 | # TODO: for each url extracted by the crawler it should be verified if it exists already in 244 | # the database before passing it to the extract_paper_details function 245 | # TODO: update the extract_paper_details function to process new papers only if the number of 246 | # found papers for the specific date is greater than the number of papers already found for that date 247 | # within the database. so this will need a new specific database table to store the number of papers 248 | # found for each date. 249 | # TODO: implement an improve error handling system for the extract_paper_details function which will 250 | # add papers that failed to be processed to the database but include a column to indicate that the 251 | # paper was not processed successfully, so that a retry can be performed when cron jobs are re-run. 252 | # this will require changing the database structure and schema, as well as some of the existing logic 253 | # that verifies if a paper should be processed or not. 254 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/supabase_db.py: -------------------------------------------------------------------------------- 1 | __doc__ = """Module for interacting with the supabase database using SQLAlchemy.""" 2 | 3 | from datetime import datetime, timedelta 4 | from sqlalchemy import ( 5 | create_engine, Column, String, Integer, DateTime, Text, ARRAY, text, Boolean 6 | ) 7 | from sqlalchemy.orm import sessionmaker, declarative_base 8 | from logging_config import setup_database_logging 9 | from sqlalchemy.exc import SQLAlchemyError 10 | 11 | # Configure logging using centralized configuration 12 | logger = setup_database_logging() 13 | 14 | Base = declarative_base() 15 | 16 | class Paper(Base): 17 | """SQLAlchemy model for storing research papers from the Hugging Face daily papers page. 
18 | Each paper entry includes its URL, title, authors, abstract, associated URLs (PDF, arXiv, GitHub), 19 | publication and submission dates, as well as current engagement metrics (upvotes and comments).""" 20 | __tablename__ = "papers" 21 | url = Column(String, primary_key=True) 22 | title = Column(String, nullable=False) 23 | authors = Column(ARRAY(String), nullable=False) 24 | abstract = Column(Text, nullable=False) 25 | pdf_url = Column(String) 26 | arxiv_url = Column(String) 27 | github_url = Column(String) 28 | publication_date = Column(DateTime, nullable=False) 29 | submission_date = Column(DateTime, nullable=False) 30 | upvotes = Column(Integer, default=0) 31 | comments = Column(Integer, default=0) 32 | last_updated = Column(DateTime, default=datetime.now, onupdate=datetime.now) 33 | notification_sent = Column(Boolean, default=False) 34 | extraction_success = Column(Boolean, default=True) 35 | extraction_error = Column(Text, nullable=True) 36 | last_extraction_attempt = Column(DateTime, default=datetime.now) 37 | 38 | 39 | class Database: 40 | """Class for interacting with the database using SQLAlchemy.""" 41 | CURRENT_SCHEMA_VERSION = 2 42 | 43 | def __init__(self, connection_string, skip_version_check=False): 44 | logger.info("Initializing Database connection") 45 | if not connection_string: 46 | logger.error("Database connection string is not set") 47 | raise ValueError("Database connection string is not set") 48 | 49 | # Ensure sslmode=require is appended 50 | if '?' not in connection_string: 51 | connection_string += '?sslmode=require' 52 | elif 'sslmode' not in connection_string: 53 | connection_string += '&sslmode=require' 54 | 55 | logger.debug("Creating engine with connection string: %s", connection_string.split('?')[0]) 56 | self.engine = create_engine( 57 | connection_string, 58 | pool_pre_ping=True # Pre-ping helps keep connections alive 59 | ) 60 | 61 | logger.info("Creating database tables if they don't exist") 62 | Base.metadata.create_all(self.engine) 63 | self.session_factory = sessionmaker(bind=self.engine) 64 | logger.info("Database initialization complete") 65 | 66 | # Only check version if not skipped 67 | if not skip_version_check: 68 | self._check_schema_version() 69 | 70 | def _check_schema_version(self): 71 | """Verify database schema version is compatible.""" 72 | session = self.session_factory() 73 | try: 74 | # Create version table if it doesn't exist 75 | session.execute(text(""" 76 | CREATE TABLE IF NOT EXISTS schema_version ( 77 | version INTEGER PRIMARY KEY, 78 | applied_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP 79 | ) 80 | """)) 81 | 82 | # Get current version 83 | result = session.execute(text( 84 | "SELECT version FROM schema_version ORDER BY version DESC LIMIT 1" 85 | )) 86 | db_version = result.scalar() or 0 87 | 88 | if db_version == 0: 89 | # New database, set initial version 90 | session.execute(text( 91 | "INSERT INTO schema_version (version) VALUES (:version)" 92 | ), {"version": self.CURRENT_SCHEMA_VERSION}) 93 | session.commit() 94 | elif db_version < self.CURRENT_SCHEMA_VERSION: 95 | logger.error( 96 | "Database schema version %d is older than required version %d. " 97 | "Please run migrations.", 98 | db_version, self.CURRENT_SCHEMA_VERSION 99 | ) 100 | raise RuntimeError("Database schema needs migration") 101 | elif db_version > self.CURRENT_SCHEMA_VERSION: 102 | logger.error( 103 | "Database schema version %d is newer than supported version %d. 
" 104 | "Please update the application.", 105 | db_version, self.CURRENT_SCHEMA_VERSION 106 | ) 107 | raise RuntimeError("Database schema version not supported") 108 | 109 | logger.info("Database schema version: %d", db_version) 110 | 111 | except Exception as e: 112 | logger.error("Error checking schema version: %s", str(e)) 113 | raise 114 | finally: 115 | session.close() 116 | 117 | def get_all_papers(self): 118 | """Get all papers from the database""" 119 | logger.info("Fetching all papers from database") 120 | session = self.session_factory() 121 | try: 122 | papers = session.query(Paper).all() 123 | logger.info("Retrieved %d papers from database", len(papers)) 124 | return papers 125 | except Exception as e: 126 | logger.error("Error fetching papers: %s", str(e)) 127 | raise 128 | finally: 129 | session.close() 130 | 131 | def add_paper(self, paper_data): 132 | """Add or update a paper and its current metrics. 133 | 134 | Returns: 135 | bool: True if this is a new paper, False if it's an update 136 | """ 137 | logger.info("Adding/updating paper: %s", paper_data['url']) 138 | session = self.session_factory() 139 | try: 140 | # Check if paper already exists 141 | existing_paper = session.query(Paper).filter( 142 | Paper.url == paper_data["url"] 143 | ).first() 144 | 145 | is_new_paper = existing_paper is None 146 | action = "Adding new" if is_new_paper else "Updating existing" 147 | logger.info("%s paper: %s", action, paper_data['url']) 148 | 149 | if paper_data.get("extraction_success"): 150 | # Date handling with logging 151 | try: 152 | publication_date = datetime( 153 | paper_data["utc_publication_date_year"], 154 | paper_data["utc_publication_date_month"], 155 | paper_data["utc_publication_date_day"] 156 | ) 157 | except ValueError as e: 158 | logger.warning("Invalid publication date in %s: %s", paper_data['url'], e) 159 | publication_date = datetime.now() 160 | 161 | try: 162 | submission_date = datetime( 163 | paper_data["utc_submission_date_year"], 164 | paper_data["utc_submission_date_month"], 165 | paper_data["utc_submission_date_day"] 166 | ) 167 | except ValueError as e: 168 | logger.warning("Invalid submission date in %s: %s", paper_data['url'], e) 169 | submission_date = datetime.now() 170 | 171 | # Create/update the paper entry with metrics 172 | paper = Paper( 173 | url=paper_data["url"], 174 | title=paper_data["paper_title"], 175 | authors=paper_data["authors"].split(", "), 176 | abstract=paper_data["abstract_body"], 177 | pdf_url=paper_data.get("view_pdf_url"), 178 | arxiv_url=paper_data.get("view_arxiv_page_url"), 179 | github_url=paper_data.get("github_repo_url"), 180 | publication_date=publication_date, 181 | submission_date=submission_date, 182 | upvotes=paper_data.get("number_of_upvotes", 0), 183 | comments=paper_data.get("number_of_comments", 0) 184 | ) 185 | else: 186 | # Handle failed extraction 187 | paper = Paper( 188 | url=paper_data["url"], 189 | extraction_success=False, 190 | extraction_error=paper_data.get("extraction_error"), 191 | last_extraction_attempt=paper_data["last_extraction_attempt"], 192 | notification_sent=paper_data.get("notification_sent", False) 193 | ) 194 | 195 | logger.debug("Merging paper data for %s", paper_data['url']) 196 | session.merge(paper) 197 | session.commit() 198 | logger.info("Successfully %s paper for %s", 199 | 'added' if is_new_paper else 'updated', paper_data['url']) 200 | 201 | return is_new_paper 202 | except Exception as e: 203 | logger.error("Error adding/updating paper %s: %s", paper_data['url'], str(e)) 204 | 
raise 205 | finally: 206 | session.close() 207 | 208 | def update_notification_status(self, url: str, status: bool) -> bool: 209 | """Update the notification status for a paper. Returns True if successful.""" 210 | logger.info("Updating notification status for %s to %s", url, status) 211 | session = self.session_factory() 212 | try: 213 | paper = session.query(Paper).filter(Paper.url == url).first() 214 | if not paper: 215 | logger.error("Paper not found: %s", url) 216 | return False 217 | paper.notification_sent = status 218 | session.commit() 219 | logger.info("Successfully updated notification status") 220 | return True 221 | except SQLAlchemyError as e: 222 | session.rollback() 223 | logger.error("Error updating notification status: %s", str(e)) 224 | return False 225 | finally: 226 | session.close() 227 | 228 | def get_failed_extractions(self, min_age_hours: int = 1): 229 | """Get papers that failed extraction and haven't been retried recently.""" 230 | logger.info("Fetching failed extractions older than %d hours", min_age_hours) 231 | session = self.session_factory() 232 | try: 233 | retry_cutoff = datetime.now() - timedelta(hours=min_age_hours) 234 | papers = session.query(Paper).filter( 235 | Paper.extraction_success == False, 236 | Paper.last_extraction_attempt < retry_cutoff 237 | ).all() 238 | logger.info("Found %d failed extractions eligible for retry", len(papers)) 239 | return papers 240 | except SQLAlchemyError as e: 241 | logger.error("Error fetching failed extractions: %s", str(e)) 242 | return [] 243 | finally: 244 | session.close() 245 | 246 | 247 | if __name__ == "__main__": 248 | from dotenv import load_dotenv 249 | import os 250 | 251 | load_dotenv() 252 | logger.info("Starting database module directly") 253 | 254 | # Initialize database connection 255 | db = Database(os.getenv("POSTGRES_URL")) 256 | 257 | # Create a test paper entry 258 | test_paper = { 259 | "url": "https://test.paper/123", 260 | "paper_title": "Test Paper for Database Module", 261 | "authors": "John Doe, Jane Smith", 262 | "abstract_body": "This is a test paper to verify database functionality.", 263 | "view_pdf_url": "https://test.paper/123/pdf", 264 | "view_arxiv_page_url": "https://arxiv.org/abs/test.123", 265 | "github_repo_url": "https://github.com/test/repo", 266 | "utc_publication_date_year": 2024, 267 | "utc_publication_date_month": 3, 268 | "utc_publication_date_day": 15, 269 | "utc_submission_date_year": 2024, 270 | "utc_submission_date_month": 3, 271 | "utc_submission_date_day": 1, 272 | "number_of_upvotes": 42, 273 | "number_of_comments": 7 274 | } 275 | 276 | try: 277 | # Test adding a paper 278 | logger.info("Testing paper addition...") 279 | is_new = db.add_paper(test_paper) 280 | logger.info("Paper added successfully (new: %s)", is_new) 281 | 282 | # Test retrieving all papers 283 | logger.info("Testing paper retrieval...") 284 | papers = db.get_all_papers() 285 | logger.info("Retrieved %d papers", len(papers)) 286 | 287 | logger.info("Test completed successfully! ✅") 288 | 289 | except (SQLAlchemyError, ValueError) as e: 290 | logger.error("Test failed! ❌ Error: %s", str(e)) 291 | 292 | # TODO: stream the DB contents to a Notion database a la Chief AI Officer database 293 | # TODO: make db entries nullable. this will require db migrations. 
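# Illustrative sketch only (not part of this repo's migration scripts): one way the
# nullable-entries migration described in the TODO above could look, assuming a plain
# SQLAlchemy connection to the same POSTGRES_URL. The column list mirrors the NOT NULL
# fields on the Paper model; the names and approach here are assumptions.
#
#   from sqlalchemy import create_engine, text
#
#   engine = create_engine(os.getenv("POSTGRES_URL"))
#   with engine.begin() as conn:
#       # Relax NOT NULL so failed-extraction rows can be stored without full metadata
#       for col in ("title", "authors", "abstract", "publication_date", "submission_date"):
#           conn.execute(text(f"ALTER TABLE papers ALTER COLUMN {col} DROP NOT NULL"))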
294 | # TODO: add a column storing wether a post on x has been made for a paper 295 | -------------------------------------------------------------------------------- /llm_extract_tutorial.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "intro-section" 17 | }, 18 | "source": [ 19 | "# Firecrawl LLM Extract Tutorial\n", 20 | "\n", 21 | "By Alex Fazio (https://twitter.com/alxfazio)\n", 22 | "\n", 23 | "Github repo: https://github.com/alexfazio/firecrawl-cookbook\n", 24 | "\n", 25 | "This Jupyter notebook demonstrates how to use Firecrawl's LLM Extract feature to extract structured data from web pages. By the end of this tutorial, you'll be able to:\n", 26 | "\n", 27 | "1. Set up the Firecrawl environment\n", 28 | "2. Extract data using a schema\n", 29 | "3. Extract data using prompts without a schema\n", 30 | "\n", 31 | "This cookbook is designed for developers who want to efficiently extract structured data from web pages using LLMs." 32 | ] 33 | }, 34 | { 35 | "cell_type": "markdown", 36 | "metadata": { 37 | "id": "requirements-section" 38 | }, 39 | "source": [ 40 | "## Requirements\n", 41 | "\n", 42 | "Before proceeding, ensure you have:\n", 43 | "\n", 44 | "- **Firecrawl API key**: Required for accessing the Firecrawl service\n", 45 | "- Python environment with required packages\n", 46 | "\n", 47 | "We'll be using the following packages:\n", 48 | "- `firecrawl`: For interacting with the Firecrawl API\n", 49 | "- `pydantic`: For schema definition" 50 | ] 51 | }, 52 | { 53 | "cell_type": "markdown", 54 | "metadata": { 55 | "id": "setup-section" 56 | }, 57 | "source": [ 58 | "## Setup\n", 59 | "\n", 60 | "First, let's install the required packages:" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 1, 66 | "metadata": { 67 | "colab": { 68 | "base_uri": "https://localhost:8080/" 69 | }, 70 | "id": "ytS0D_edJQIH", 71 | "outputId": "2cf0e258-8ae4-4718-b883-bdd4c5b35230" 72 | }, 73 | "outputs": [ 74 | { 75 | "output_type": "stream", 76 | "name": "stdout", 77 | "text": [ 78 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/164.1 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m6.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 79 | "\u001b[?25h" 80 | ] 81 | } 82 | ], 83 | "source": [ 84 | "%pip install firecrawl-py pydantic --quiet" 85 | ] 86 | }, 87 | { 88 | "cell_type": "markdown", 89 | "metadata": { 90 | "id": "api-key-section" 91 | }, 92 | "source": [ 93 | "Next, let's set up our Firecrawl API key:" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 2, 99 | "metadata": { 100 | "colab": { 101 | "base_uri": "https://localhost:8080/" 102 | }, 103 | "id": "5S7s7PEmJQII", 104 | "outputId": "c42bd26a-3f6b-47c5-9391-3d3dc306d033" 105 | }, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "Enter your Firecrawl API key: ··········\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "from getpass import getpass\n", 117 | "api_key = getpass(\"Enter your Firecrawl API key: \")" 118 | ] 119 | }, 120 | { 121 | "cell_type": "markdown", 122 | "metadata": { 123 | 
"id": "schema-extraction-section" 124 | }, 125 | "source": [ 126 | "## Extracting Data with Schema\n", 127 | "\n", 128 | "Let's start by importing the required libraries and defining our schema for extraction:" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 3, 134 | "metadata": { 135 | "colab": { 136 | "base_uri": "https://localhost:8080/" 137 | }, 138 | "id": "KgQoSEgdJQII", 139 | "outputId": "452e9c86-3c14-4c28-ef9a-e4ccedc17a4b" 140 | }, 141 | "outputs": [ 142 | { 143 | "output_type": "stream", 144 | "name": "stdout", 145 | "text": [ 146 | "{'company_mission': \"Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to\", 'supports_sso': True, 'is_open_source': False, 'is_in_yc': True}\n" 147 | ] 148 | } 149 | ], 150 | "source": [ 151 | "from firecrawl import FirecrawlApp\n", 152 | "from pydantic import BaseModel, Field\n", 153 | "\n", 154 | "# Initialize the FirecrawlApp with your API key\n", 155 | "app = FirecrawlApp(api_key=api_key)\n", 156 | "\n", 157 | "class ExtractSchema(BaseModel):\n", 158 | " company_mission: str\n", 159 | " supports_sso: bool\n", 160 | " is_open_source: bool\n", 161 | " is_in_yc: bool\n", 162 | "\n", 163 | "data = app.scrape_url('https://docs.firecrawl.dev/', {\n", 164 | " 'formats': ['extract'],\n", 165 | " 'extract': {\n", 166 | " 'schema': ExtractSchema.model_json_schema(),\n", 167 | " }\n", 168 | "})\n", 169 | "\n", 170 | "print(data['extract'])" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": { 176 | "id": "prompt-extraction-section" 177 | }, 178 | "source": [ 179 | "## Extracting Data without Schema\n", 180 | "\n", 181 | "Firecrawl also supports extraction using just a prompt, allowing the LLM to determine the structure:" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 7, 187 | "metadata": { 188 | "colab": { 189 | "base_uri": "https://localhost:8080/" 190 | }, 191 | "id": "JU57r_bfJQIJ", 192 | "outputId": "49cc9378-6854-4565-dfd6-00cd9e2177c7" 193 | }, 194 | "outputs": [ 195 | { 196 | "output_type": "stream", 197 | "name": "stdout", 198 | "text": [ 199 | "{\"success\":true,\"data\":{\"extract\":{\"company_mission\":\"Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to\"},\"metadata\":{\"title\":\"Quickstart | Firecrawl\",\"description\":\"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\"language\":\"en\",\"ogLocaleAlternate\":[],\"viewport\":\"width=device-width\",\"msapplication-config\":\"https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/_generated/favicon/browserconfig.xml?v=3\",\"apple-mobile-web-app-title\":\"Firecrawl Docs\",\"application-name\":\"Firecrawl Docs\",\"msapplication-TileColor\":\"#000\",\"theme-color\":\"#ffffff\",\"charset\":\"utf-8\",\"og:type\":\"website\",\"og:site_name\":\"Firecrawl Docs\",\"twitter:card\":\"summary_large_image\",\"og:title\":\"Quickstart | Firecrawl\",\"twitter:title\":\"Firecrawl Docs\",\"og:image\":\"/images/og.png\",\"twitter:image\":\"/images/og.png\",\"og:description\":\"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\"og:url\":\"https://docs.firecrawl.dev/introduction\",\"next-head-count\":\"25\",\"sourceURL\":\"https://docs.firecrawl.dev/\",\"statusCode\":200}}}" 200 | ] 201 | } 202 | ], 203 | "source": [ 204 | "# Method 1: Using curl with a properly formatted command string\n", 205 | "curl_command = f'''\n", 206 | "curl -X POST 
https://api.firecrawl.dev/v1/scrape \\\n", 207 | " -H 'Content-Type: application/json' \\\n", 208 | " -H 'Authorization: Bearer {api_key}' \\\n", 209 | " -d '{{\n", 210 | " \"url\": \"https://docs.firecrawl.dev/\",\n", 211 | " \"formats\": [\"extract\"],\n", 212 | " \"extract\": {{\n", 213 | " \"prompt\": \"Extract the company mission from the page.\"\n", 214 | " }}\n", 215 | " }}'\n", 216 | "'''\n", 217 | "\n", 218 | "!{curl_command}" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "source": [ 224 | "# Method 2: Alternative approach using requests library\n", 225 | "import requests\n", 226 | "import json\n", 227 | "\n", 228 | "url = \"https://api.firecrawl.dev/v1/scrape\"\n", 229 | "headers = {\n", 230 | " \"Content-Type\": \"application/json\",\n", 231 | " \"Authorization\": f\"Bearer {api_key}\"\n", 232 | "}\n", 233 | "payload = {\n", 234 | " \"url\": \"https://docs.firecrawl.dev/\",\n", 235 | " \"formats\": [\"extract\"],\n", 236 | " \"extract\": {\n", 237 | " \"prompt\": \"Extract the company mission from the page.\"\n", 238 | " }\n", 239 | "}\n", 240 | "\n", 241 | "response = requests.post(url, headers=headers, json=payload)\n", 242 | "print(json.dumps(response.json(), indent=2))" 243 | ], 244 | "metadata": { 245 | "colab": { 246 | "base_uri": "https://localhost:8080/" 247 | }, 248 | "id": "SJZydfRNT1Sc", 249 | "outputId": "3aed84ce-26f3-4714-b625-b73944031788" 250 | }, 251 | "execution_count": 8, 252 | "outputs": [ 253 | { 254 | "output_type": "stream", 255 | "name": "stdout", 256 | "text": [ 257 | "{\n", 258 | " \"success\": true,\n", 259 | " \"data\": {\n", 260 | " \"extract\": {\n", 261 | " \"company_mission\": \"Train a secure AI on your technical resources that answers customer and employee questions so your team doesn't have to\"\n", 262 | " },\n", 263 | " \"metadata\": {\n", 264 | " \"title\": \"Quickstart | Firecrawl\",\n", 265 | " \"description\": \"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\n", 266 | " \"language\": \"en\",\n", 267 | " \"ogLocaleAlternate\": [],\n", 268 | " \"viewport\": \"width=device-width\",\n", 269 | " \"msapplication-config\": \"https://mintlify.s3-us-west-1.amazonaws.com/firecrawl/_generated/favicon/browserconfig.xml?v=3\",\n", 270 | " \"apple-mobile-web-app-title\": \"Firecrawl Docs\",\n", 271 | " \"application-name\": \"Firecrawl Docs\",\n", 272 | " \"msapplication-TileColor\": \"#000\",\n", 273 | " \"theme-color\": \"#ffffff\",\n", 274 | " \"charset\": \"utf-8\",\n", 275 | " \"og:type\": \"website\",\n", 276 | " \"og:site_name\": \"Firecrawl Docs\",\n", 277 | " \"twitter:card\": \"summary_large_image\",\n", 278 | " \"og:title\": \"Quickstart | Firecrawl\",\n", 279 | " \"twitter:title\": \"Firecrawl Docs\",\n", 280 | " \"og:image\": \"/images/og.png\",\n", 281 | " \"twitter:image\": \"/images/og.png\",\n", 282 | " \"og:description\": \"Firecrawl allows you to turn entire websites into LLM-ready markdown\",\n", 283 | " \"og:url\": \"https://docs.firecrawl.dev/introduction\",\n", 284 | " \"next-head-count\": \"25\",\n", 285 | " \"sourceURL\": \"https://docs.firecrawl.dev/\",\n", 286 | " \"statusCode\": 200\n", 287 | " }\n", 288 | " }\n", 289 | "}\n" 290 | ] 291 | } 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": { 297 | "id": "closing-section" 298 | }, 299 | "source": [ 300 | "## Next Steps\n", 301 | "\n", 302 | "You've now learned how to:\n", 303 | "1. Set up Firecrawl for data extraction\n", 304 | "2. Extract data using a defined schema\n", 305 | "3. 
Extract data using prompts without a schema\n", 306 | "\n", 307 | "For more information about the extract format and additional features, visit the [Firecrawl documentation](https://docs.firecrawl.dev/features/extract)." 308 | ] 309 | } 310 | ], 311 | "metadata": { 312 | "kernelspec": { 313 | "display_name": "Python 3", 314 | "language": "python", 315 | "name": "python3" 316 | }, 317 | "language_info": { 318 | "codemirror_mode": { 319 | "name": "ipython", 320 | "version": 3 321 | }, 322 | "file_extension": ".py", 323 | "mimetype": "text/x-python", 324 | "name": "python", 325 | "nbconvert_exporter": "python", 326 | "pygments_lexer": "ipython3", 327 | "version": "3.8.0" 328 | }, 329 | "colab": { 330 | "provenance": [], 331 | "include_colab_link": true 332 | } 333 | }, 334 | "nbformat": 4, 335 | "nbformat_minor": 0 336 | } -------------------------------------------------------------------------------- /crawl_and_extract_with_openai_o1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "id": "ULHdYNR8QlPF" 17 | }, 18 | "source": [ 19 | "# Integrating OpenAI's o1 Reasoning Models with Firecrawl: A Step-by-Step Guide\n", 20 | "\n", 21 | "By Alex Fazio (https://twitter.com/alxfazio)\n", 22 | "\n", 23 | "Github repo: https://github.com/alexfazio/firecrawl-cookbook\n", 24 | "\n", 25 | "OpenAI has recently unveiled its o1 series models, marking a significant leap in the realm of complex reasoning with AI. These models are designed to \"think before they answer,\" producing extensive internal chains of thought before responding. In this guide, we'll explore how to integrate these powerful models into your applications, with a practical example of crawling a website using the o1-preview model.\n", 26 | "\n", 27 | "**This Jupyter notebook** demonstrates how to integrate OpenAI's o1 reasoning models with Firecrawl technology to perform complex tasks like crawling a website and extracting specific information.\n", 28 | "\n", 29 | "By the end of this notebook, you'll be able to:\n", 30 | "\n", 31 | "- Set up the Firecrawl and OpenAI environments\n", 32 | "- Use the o1-preview model to enhance the crawling process\n", 33 | "- Crawl a website and generate a list of relevant URLs based on a given objective\n", 34 | "- Extract content from crawled pages in Markdown\n", 35 | "- Evaluate the extracted content using the o1 reasoning model to check if it meets the specified objective\n", 36 | "\n", 37 | "This guide is designed for developers and data scientists who want to leverage advanced AI reasoning capabilities and web crawling technology to efficiently gather and analyze information from the web.\n", 38 | "\n", 39 | "## Requirements\n", 40 | "\n", 41 | "Before proceeding, ensure you have the following:\n", 42 | "\n", 43 | "- Firecrawl API key: Essential for accessing the Firecrawl service\n", 44 | "- OpenAI API key: Required for using the o1 reasoning models" 45 | ] 46 | }, 47 | { 48 | "cell_type": "markdown", 49 | "metadata": { 50 | "id": "sypSu88zQlPG" 51 | }, 52 | "source": [ 53 | "## Introduction to o1 Models\n", 54 | "\n", 55 | "The o1 models are large language models trained with reinforcement learning to excel in complex reasoning tasks. 
There are two models available:\n", 56 | "\n", 57 | "- **o1-preview**: An early preview designed for reasoning about hard problems using broad general knowledge.\n", 58 | "- **o1-mini**: A faster, cost-effective version ideal for coding, math, and science tasks that don't require extensive general knowledge.\n", 59 | "\n", 60 | "While these models offer significant advancements, they are not intended to replace GPT-4o in all use cases. If your application requires image inputs, function calling, or consistent fast response times, GPT-4o and GPT-4o mini remain the optimal choices." 61 | ] 62 | }, 63 | { 64 | "cell_type": "markdown", 65 | "metadata": { 66 | "id": "DlPHD-WNQlPG" 67 | }, 68 | "source": [ 69 | "## Prerequisites\n", 70 | "\n", 71 | "First, let's install the required libraries:" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 1, 77 | "metadata": { 78 | "colab": { 79 | "base_uri": "https://localhost:8080/" 80 | }, 81 | "id": "METpmDKFQlPH", 82 | "outputId": "b32a1fc8-b6ee-4268-e4b3-719a770d5a03" 83 | }, 84 | "source": [ 85 | "%pip install -q firecrawl-py openai python-dotenv" 86 | ], 87 | "outputs": [ 88 | { 89 | "output_type": "stream", 90 | "name": "stdout", 91 | "text": [ 92 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/386.9 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m \u001b[32m378.9/386.9 kB\u001b[0m \u001b[31m20.7 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m386.9/386.9 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 93 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/76.4 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m76.4/76.4 kB\u001b[0m \u001b[31m4.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 94 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/78.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.0/78.0 kB\u001b[0m \u001b[31m3.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 95 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/325.2 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m325.2/325.2 kB\u001b[0m \u001b[31m10.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 96 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m7.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 97 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m58.3/58.3 kB\u001b[0m \u001b[31m3.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 98 | "\u001b[?25h" 99 | ] 100 | } 101 | ] 102 | }, 103 | { 104 | "cell_type": "markdown", 105 | "metadata": { 106 | "id": "PzI4OO5EQlPH" 107 | }, 108 | "source": [ 109 | "## Step 1: Import Necessary Libraries" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "id": "nEBEShvoQlPH" 117 | }, 118 | "source": [ 119 | "import os\n", 120 | "from firecrawl import FirecrawlApp\n", 121 | "import 
json\n", 122 | "from dotenv import load_dotenv\n", 123 | "from openai import OpenAI" 124 | ], 125 | "outputs": [] 126 | }, 127 | { 128 | "cell_type": "markdown", 129 | "metadata": { 130 | "id": "HKWKvisAQlPH" 131 | }, 132 | "source": [ 133 | "## Step 2: Load Environment Variables\n", 134 | "\n", 135 | "For Google Colab, we'll set the environment variables directly instead of using a .env file. In practice, you should never expose your API keys in your notebook." 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": null, 141 | "metadata": { 142 | "id": "G8vBvKDiQlPI" 143 | }, 144 | "source": [ 145 | "# For development, use environment variables\n", 146 | "os.environ['FIRECRAWL_API_KEY'] = 'your_firecrawl_api_key_here'\n", 147 | "os.environ['OPENAI_API_KEY'] = 'your_openai_api_key_here'\n", 148 | "\n", 149 | "# Retrieve API keys from environment variables\n", 150 | "firecrawl_api_key = os.getenv(\"FIRECRAWL_API_KEY\")\n", 151 | "openai_api_key = os.getenv(\"OPENAI_API_KEY\")" 152 | ], 153 | "outputs": [] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": { 158 | "id": "_jNb62ZnQlPI" 159 | }, 160 | "source": [ 161 | "## Step 3: Initialize the FirecrawlApp and OpenAI Client" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": null, 167 | "metadata": { 168 | "id": "xRUYdpUsQlPI" 169 | }, 170 | "source": [ 171 | "# Initialize the FirecrawlApp and OpenAI client\n", 172 | "app = FirecrawlApp(api_key=firecrawl_api_key)\n", 173 | "client = OpenAI(api_key=openai_api_key)" 174 | ], 175 | "outputs": [] 176 | }, 177 | { 178 | "cell_type": "markdown", 179 | "metadata": { 180 | "id": "ZYIFGPKwQlPI" 181 | }, 182 | "source": [ 183 | "## Step 4: Define the Objective and URL" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": null, 189 | "metadata": { 190 | "id": "NcEE8cljQlPI" 191 | }, 192 | "source": [ 193 | "url = \"https://example.com\"\n", 194 | "objective = \"Find the contact email for customer support\"" 195 | ], 196 | "outputs": [] 197 | }, 198 | { 199 | "cell_type": "markdown", 200 | "metadata": { 201 | "id": "0oYsHFCzQlPI" 202 | }, 203 | "source": [ 204 | "## Step 5: Determine the Search Parameter Using o1-preview" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": { 211 | "id": "jIvi4bTRQlPI" 212 | }, 213 | "source": [ 214 | "map_prompt = f\"\"\"\n", 215 | "The map function generates a list of URLs from a website and accepts a search parameter. Based on the objective: {objective}, suggest a 1-2 word search parameter to find the needed information. 
Only respond with 1-2 words.\n", 216 | "\"\"\"\n", 217 | "\n", 218 | "# OpenAI API call\n", 219 | "completion = client.chat.completions.create(\n", 220 | " model=\"o1-preview\",\n", 221 | " messages=[\n", 222 | " {\"role\": \"user\", \"content\": map_prompt}\n", 223 | " ]\n", 224 | ")\n", 225 | "\n", 226 | "map_search_parameter = completion.choices[0].message.content.strip()\n", 227 | "print(f\"Search parameter: {map_search_parameter}\")" 228 | ], 229 | "outputs": [] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": { 234 | "id": "DrYCmYtEQlPI" 235 | }, 236 | "source": [ 237 | "## Step 6: Map the Website Using the Search Parameter" 238 | ] 239 | }, 240 | { 241 | "cell_type": "code", 242 | "execution_count": null, 243 | "metadata": { 244 | "id": "Zir5O-HbQlPI" 245 | }, 246 | "source": [ 247 | "map_website = app.map_url(url, params={\"search\": map_search_parameter})\n", 248 | "print(\"Mapped URLs:\", map_website)" 249 | ], 250 | "outputs": [] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": { 255 | "id": "ejGIGykTQlPI" 256 | }, 257 | "source": [ 258 | "## Step 7: Scrape the Top Pages and Check for the Objective" 259 | ] 260 | }, 261 | { 262 | "cell_type": "code", 263 | "execution_count": null, 264 | "metadata": { 265 | "id": "MZHYRT36QlPI" 266 | }, 267 | "source": [ 268 | "# Get top 3 links\n", 269 | "top_links = map_website[:3] if isinstance(map_website, list) else []\n", 270 | "\n", 271 | "for link in top_links:\n", 272 | " # Scrape the page\n", 273 | " scrape_result = app.scrape_url(link, params={'formats': ['markdown']})\n", 274 | "\n", 275 | " # Check if objective is met\n", 276 | " check_prompt = f\"\"\"\n", 277 | " Given the following scraped content and objective, determine if the objective is met with high confidence.\n", 278 | " If it is, extract the relevant information in a simple and concise JSON format.\n", 279 | " If the objective is not met with high confidence, respond with 'Objective not met'.\n", 280 | "\n", 281 | " Objective: {objective}\n", 282 | " Scraped content: {scrape_result['markdown']}\n", 283 | " \"\"\"\n", 284 | "\n", 285 | " completion = client.chat.completions.create(\n", 286 | " model=\"o1-preview\",\n", 287 | " messages=[\n", 288 | " {\"role\": \"user\", \"content\": check_prompt}\n", 289 | " ]\n", 290 | " )\n", 291 | "\n", 292 | " result = completion.choices[0].message.content.strip()\n", 293 | "\n", 294 | " if result != \"Objective not met\":\n", 295 | " try:\n", 296 | " extracted_info = json.loads(result)\n", 297 | " break\n", 298 | " except json.JSONDecodeError:\n", 299 | " continue\n", 300 | "else:\n", 301 | " extracted_info = None" 302 | ], 303 | "outputs": [] 304 | }, 305 | { 306 | "cell_type": "markdown", 307 | "metadata": { 308 | "id": "JNBIwUbhQlPJ" 309 | }, 310 | "source": [ 311 | "## Step 8: Display the Extracted Information" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "id": "Ivv4ze9OQlPJ" 319 | }, 320 | "source": [ 321 | "if extracted_info:\n", 322 | " print(\"Extracted Information:\")\n", 323 | " print(json.dumps(extracted_info, indent=2))\n", 324 | "else:\n", 325 | " print(\"Objective not met with the available content.\")" 326 | ], 327 | "outputs": [] 328 | }, 329 | { 330 | "cell_type": "markdown", 331 | "metadata": { 332 | "id": "39Sb3XlFQlPJ" 333 | }, 334 | "source": [ 335 | "## Conclusion\n", 336 | "\n", 337 | "In this notebook, we've explored how to integrate OpenAI's new o1 reasoning models into your applications to perform complex tasks 
like crawling a website and extracting specific information. The o1 models showcase impressive capabilities in reasoning and problem-solving, making them valuable tools for developers tackling challenging AI tasks.\n", 338 | "\n", 339 | "Whether you're working on advanced coding problems, mathematical computations, or intricate scientific queries, the o1 models can significantly enhance your application's reasoning abilities.\n", 340 | "\n", 341 | "Happy coding!" 342 | ] 343 | } 344 | ], 345 | "metadata": { 346 | "kernelspec": { 347 | "display_name": "Python 3", 348 | "language": "python", 349 | "name": "python3" 350 | }, 351 | "language_info": { 352 | "codemirror_mode": { 353 | "name": "ipython", 354 | "version": 3 355 | }, 356 | "file_extension": ".py", 357 | "mimetype": "text/x-python", 358 | "name": "python", 359 | "nbconvert_exporter": "python", 360 | "pygments_lexer": "ipython3", 361 | "version": "3.8.0" 362 | }, 363 | "colab": { 364 | "provenance": [], 365 | "include_colab_link": true 366 | } 367 | }, 368 | "nbformat": 4, 369 | "nbformat_minor": 0 370 | } 371 | -------------------------------------------------------------------------------- /crawl_and_extract_with_xai_grok.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "private_outputs": true, 7 | "provenance": [], 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": { 32 | "id": "intro-section" 33 | }, 34 | "source": [ 35 | "# Building a Web Crawler with Grok-2 and Firecrawl\n", 36 | "\n", 37 | "By Alex Fazio (https://twitter.com/alxfazio)\n", 38 | "\n", 39 | "Github repo: https://github.com/alexfazio/firecrawl-cookbook\n", 40 | "\n", 41 | "This Jupyter notebook demonstrates how to combine Grok-2's language model capabilities with Firecrawl's web scraping features to build an intelligent web crawler that can extract structured data from websites.\n", 42 | "\n", 43 | "By the end of this notebook, you'll be able to:\n", 44 | "\n", 45 | "1. Set up the Grok-2 and Firecrawl environment\n", 46 | "2. Build a targeted web crawler that understands content\n", 47 | "3. Extract and process structured data from websites\n", 48 | "4. Export the processed content in JSON format\n", 49 | "\n", 50 | "This cookbook is designed for developers and data scientists who want to build advanced web crawlers with AI-powered content understanding." 
51 | ] 52 | }, 53 | { 54 | "cell_type": "markdown", 55 | "metadata": { 56 | "id": "9rQmTgiMVk5X" 57 | }, 58 | "source": [ 59 | "## Setup\n", 60 | "\n", 61 | "First, let's install the required packages:" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "id": "zBE3KvuKVk5X" 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "%pip install firecrawl-py requests python-dotenv --quiet" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": { 79 | "id": "q2GWsM_gVk5X" 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "import os\n", 84 | "import json\n", 85 | "import requests\n", 86 | "from dotenv import load_dotenv\n", 87 | "from firecrawl import FirecrawlApp" 88 | ] 89 | }, 90 | { 91 | "cell_type": "markdown", 92 | "metadata": { 93 | "id": "VqpYy-sxVk5Y" 94 | }, 95 | "source": [ 96 | "## Initialize Environment\n", 97 | "\n", 98 | "Enter your API keys securely:" 99 | ] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "execution_count": null, 104 | "metadata": { 105 | "id": "Wrec0L1sVk5Y" 106 | }, 107 | "outputs": [], 108 | "source": [ 109 | "from getpass import getpass\n", 110 | "\n", 111 | "# Securely get API keys\n", 112 | "grok_api_key = getpass(\"Enter your Grok-2 API key: \")\n", 113 | "firecrawl_api_key = getpass(\"Enter your Firecrawl API key: \")\n", 114 | "\n", 115 | "# Initialize FirecrawlApp\n", 116 | "app = FirecrawlApp(api_key=firecrawl_api_key)" 117 | ] 118 | }, 119 | { 120 | "cell_type": "markdown", 121 | "metadata": { 122 | "id": "uiFw3MfjVk5Y" 123 | }, 124 | "source": [ 125 | "## Define Grok-2 API Interaction\n", 126 | "\n", 127 | "Let's create a function to handle interactions with the Grok-2 API, including comprehensive error handling and debugging information:" 128 | ] 129 | }, 130 | { 131 | "cell_type": "code", 132 | "execution_count": null, 133 | "metadata": { 134 | "id": "PTc5bc85Vk5Y" 135 | }, 136 | "outputs": [], 137 | "source": [ 138 | "def grok_completion(prompt):\n", 139 | " url = \"https://api.x.ai/v1/chat/completions\"\n", 140 | " headers = {\n", 141 | " \"Content-Type\": \"application/json\",\n", 142 | " \"Authorization\": f\"Bearer {grok_api_key}\"\n", 143 | " }\n", 144 | " data = {\n", 145 | " \"messages\": [\n", 146 | " {\"role\": \"system\", \"content\": \"You are a helpful assistant.\"},\n", 147 | " {\"role\": \"user\", \"content\": prompt}\n", 148 | " ],\n", 149 | " \"model\": \"grok-beta\",\n", 150 | " \"stream\": False,\n", 151 | " \"temperature\": 0\n", 152 | " }\n", 153 | "\n", 154 | " try:\n", 155 | " response = requests.post(url, headers=headers, json=data)\n", 156 | " print(f\"\\nAPI Response Status Code: {response.status_code}\")\n", 157 | "\n", 158 | " if response.status_code != 200:\n", 159 | " print(f\"Error Response: {response.text}\")\n", 160 | " return None\n", 161 | "\n", 162 | " response_data = response.json()\n", 163 | " print(\"\\nFull API Response:\")\n", 164 | " print(json.dumps(response_data, indent=2))\n", 165 | "\n", 166 | " if 'choices' not in response_data:\n", 167 | " print(\"\\nWarning: 'choices' key not found in response\")\n", 168 | " print(\"Available keys:\", list(response_data.keys()))\n", 169 | " return None\n", 170 | "\n", 171 | " if not response_data['choices']:\n", 172 | " print(\"\\nWarning: 'choices' array is empty\")\n", 173 | " return None\n", 174 | "\n", 175 | " choice = response_data['choices'][0]\n", 176 | " if 'message' not in choice:\n", 177 | " print(\"\\nWarning: 'message' key not found in first choice\")\n", 178 | " 
print(\"Available keys in choice:\", list(choice.keys()))\n", 179 | " return None\n", 180 | "\n", 181 | " if 'content' not in choice['message']:\n", 182 | " print(\"\\nWarning: 'content' key not found in message\")\n", 183 | " print(\"Available keys in message:\", list(choice['message'].keys()))\n", 184 | " return None\n", 185 | "\n", 186 | " return choice['message']['content']\n", 187 | "\n", 188 | " except requests.exceptions.RequestException as e:\n", 189 | " print(f\"\\nRequest Error: {str(e)}\")\n", 190 | " return None\n", 191 | " except json.JSONDecodeError as e:\n", 192 | " print(f\"\\nJSON Decode Error: {str(e)}\")\n", 193 | " print(\"Raw Response:\", response.text)\n", 194 | " return None\n", 195 | " except Exception as e:\n", 196 | " print(f\"\\nUnexpected Error: {str(e)}\")\n", 197 | " return None" 198 | ] 199 | }, 200 | { 201 | "cell_type": "markdown", 202 | "metadata": { 203 | "id": "Wecuh9TbVk5Y" 204 | }, 205 | "source": [ 206 | "## Website Crawling Functions\n", 207 | "\n", 208 | "This function combines Grok-2's understanding with Firecrawl's search capabilities to find relevant pages. It:\n", 209 | "\n", 210 | "1. Uses Grok-2 to distill the user's objective into a focused search term\n", 211 | "2. Enforces strict formatting rules for consistent search terms\n", 212 | "3. Cleans and normalizes the search output\n", 213 | "4. Uses Firecrawl's map endpoint to discover relevant pages\n", 214 | "\n", 215 | "The function takes a broad objective (e.g., \"Find articles about startup investments\") and converts it into an optimized search term (e.g., \"startup funding\") to ensure targeted results.\n", 216 | "\n", 217 | "Note: The function limits search terms to 2 words maximum for optimal performance with Firecrawl's search algorithm." 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": null, 223 | "metadata": { 224 | "id": "cGA5X6PDVk5Y" 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "def find_relevant_pages(objective, url):\n", 229 | " prompt = f\"\"\"Based on the objective '{objective}', provide ONLY a 1-2 word search term to locate relevant information on the website.\n", 230 | "\n", 231 | "Rules:\n", 232 | "- Return ONLY the search term, nothing else\n", 233 | "- Maximum 2 words\n", 234 | "- No punctuation or formatting\n", 235 | "- No explanatory text\"\"\"\n", 236 | "\n", 237 | " search_term = grok_completion(prompt)\n", 238 | "\n", 239 | " if search_term is None:\n", 240 | " print(\"Failed to get search term from Grok-2 API\")\n", 241 | " return []\n", 242 | "\n", 243 | " # Clean up the search term\n", 244 | " search_term = search_term.strip().replace('\"', '').replace('*', '')\n", 245 | " words = search_term.split()\n", 246 | " if len(words) > 2:\n", 247 | " search_term = \" \".join(words[:2])\n", 248 | "\n", 249 | " print(f\"Using search term: '{search_term}'\")\n", 250 | "\n", 251 | " try:\n", 252 | " map_result = app.map_url(url, params={\"search\": search_term})\n", 253 | " return map_result.get(\"links\", [])\n", 254 | " except Exception as e:\n", 255 | " print(f\"Error mapping URL: {str(e)}\")\n", 256 | " return []" 257 | ] 258 | }, 259 | { 260 | "cell_type": "markdown", 261 | "source": [ 262 | "## Content Extraction and Processing\n", 263 | "\n", 264 | "This function handles the extraction and intelligent processing of content from each webpage. It:\n", 265 | "\n", 266 | "1. Scrapes content from each relevant page\n", 267 | "2. Uses Grok-2 to analyze the content against our objective\n", 268 | "3. 
Extracts structured data in JSON format\n", 269 | "4. Handles various edge cases and errors\n", 270 | "\n", 271 | "The function processes up to 3 pages and returns the first successful match, using Grok-2 to determine relevance and extract specific data points." 272 | ], 273 | "metadata": { 274 | "id": "XSuxErSsYH1L" 275 | } 276 | }, 277 | { 278 | "cell_type": "code", 279 | "execution_count": null, 280 | "metadata": { 281 | "id": "lffHeKe-Vk5Y" 282 | }, 283 | "outputs": [], 284 | "source": [ 285 | "def extract_data_from_pages(links, objective):\n", 286 | " for link in links[:3]:\n", 287 | " try:\n", 288 | " print(f\"\\nProcessing link: {link}\")\n", 289 | " scrape_result = app.scrape_url(link, params={'formats': ['markdown']})\n", 290 | " content = scrape_result.get('markdown', '')\n", 291 | "\n", 292 | " if not content:\n", 293 | " print(\"No content extracted from page\")\n", 294 | " continue\n", 295 | "\n", 296 | " prompt = f\"\"\"Given the following content, extract the information related to the objective '{objective}' in JSON format. If not found, reply 'Objective not met'.\n", 297 | "\n", 298 | "Content: {content}\n", 299 | "\n", 300 | "Remember:\n", 301 | "- Only return JSON if the objective is met.\n", 302 | "- Do not include any extra text or markdown formatting.\n", 303 | "- Do not wrap the JSON in code blocks.\n", 304 | "\"\"\"\n", 305 | " result = grok_completion(prompt)\n", 306 | "\n", 307 | " if result is None:\n", 308 | " print(\"Failed to get response from Grok-2 API\")\n", 309 | " continue\n", 310 | "\n", 311 | " result = result.strip()\n", 312 | "\n", 313 | " # Handle case where response is wrapped in code blocks\n", 314 | " if result.startswith(\"```\") and result.endswith(\"```\"):\n", 315 | " # Remove the code block markers and any language identifier\n", 316 | " result = result.split(\"\\n\", 1)[1].rsplit(\"\\n\", 1)[0]\n", 317 | "\n", 318 | " if result != \"Objective not met\":\n", 319 | " try:\n", 320 | " data = json.loads(result)\n", 321 | " return data\n", 322 | " except json.JSONDecodeError as e:\n", 323 | " print(f\"Error parsing JSON response: {str(e)}\")\n", 324 | " print(\"Raw response:\", result)\n", 325 | " continue\n", 326 | " else:\n", 327 | " print(\"Objective not met for this page\")\n", 328 | "\n", 329 | " except Exception as e:\n", 330 | " print(f\"Error processing page: {str(e)}\")\n", 331 | " continue\n", 332 | "\n", 333 | " return None" 334 | ] 335 | }, 336 | { 337 | "cell_type": "markdown", 338 | "metadata": { 339 | "id": "hT7hScXvVk5Y" 340 | }, 341 | "source": [ 342 | "## Main Execution\n", 343 | "\n", 344 | "Let's create and run the main function that ties everything together:" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "id": "C_wDgTBOVk5Y" 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "import pprint\n", 356 | "\n", 357 | "def main():\n", 358 | " url = input(\"Enter the website URL to crawl: \")\n", 359 | " objective = input(\"Enter your data extraction objective: \")\n", 360 | "\n", 361 | " print(\"\\nFinding relevant pages...\")\n", 362 | " links = find_relevant_pages(objective, url)\n", 363 | "\n", 364 | " if not links:\n", 365 | " print(\"No relevant pages found.\")\n", 366 | " return\n", 367 | "\n", 368 | " print(f\"\\nFound {len(links)} relevant pages:\")\n", 369 | " for i, link in enumerate(links[:3], 1):\n", 370 | " pprint.pprint(f\"{i}. 
{link}\")\n", 371 | "\n", 372 | " print(\"\\nExtracting data from pages...\")\n", 373 | " data = extract_data_from_pages(links, objective)\n", 374 | "\n", 375 | " if data:\n", 376 | " print(\"\\nData extracted successfully:\")\n", 377 | " pprint.pprint(json.dumps(data, indent=2))\n", 378 | " else:\n", 379 | " print(\"Could not find data matching the objective.\")" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": null, 385 | "metadata": { 386 | "id": "TtIGh9jSVk5Z" 387 | }, 388 | "outputs": [], 389 | "source": [ 390 | "# Run the crawler\n", 391 | "main()" 392 | ] 393 | }, 394 | { 395 | "cell_type": "markdown", 396 | "metadata": { 397 | "id": "PZPOttGoVk5Z" 398 | }, 399 | "source": [ 400 | "## What's Next?\n", 401 | "\n", 402 | "Now that you have a working web crawler, consider these enhancements:\n", 403 | "\n", 404 | "1. Add error handling and retries\n", 405 | "2. Implement concurrent processing\n", 406 | "3. Add content filtering and validation\n", 407 | "4. Create custom extraction rules\n", 408 | "\n", 409 | "The combination of Grok-2 and Firecrawl offers powerful possibilities for intelligent web scraping and content analysis.\n", 410 | "\n", 411 | "## Additional Resources\n", 412 | "\n", 413 | "- [x.ai Grok-2 API Documentation](https://api.x.ai/docs)\n", 414 | "- [Firecrawl Python Library Documentation](https://docs.firecrawl.dev)\n", 415 | "- [Example Code Repository](https://github.com/example/web-crawler)" 416 | ] 417 | } 418 | ] 419 | } 420 | -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | Firecrawl Quickstarts Logo 5 | 6 | 7 | 8 | # Hugging Face "Daily Papers" Tracker 9 | 10 | This system provides **automated notifications** about the latest white papers published on the [Hugging Face Daily Papers](https://huggingface.co/papers) page. Using **Firecrawl's semantic crawling and scraping capabilities** (Crawl and Extract APIs), it fetches and processes new publications daily. The system uses semantic filtering to determine which papers are most relevant to the user's interests, based on a user-defined category prompt, and delivers notifications directly to Discord. 11 | 12 | ### Key Features 13 | - **Daily Notifications**: Receive real-time updates about the latest papers that match your research interests. 14 | - **Firecrawl Integration**: 15 | - **Crawl API**: Retrieves the list of newly published papers from the Hugging Face Daily Papers page. 16 | - **Extract API**: Extracts structured and semantically enriched data from each paper for filtering and analysis. 17 | - **Semantic Filtering**: Matches papers to the user's category of interest using a customizable category prompt. 18 | - **Customizable Interests**: Easily define your research area by editing the `category_prompt` file. 19 | - **Default Configuration**: Preconfigured to track papers related to AI Agents, but can be adapted for any topic. 20 | 21 | ### How It Works 22 | 1. **Crawl**: The system uses the Firecrawl Crawl API to retrieve today's list of papers from the Hugging Face Daily Papers page. 23 | 2. **Extract**: It processes and extracts structured semantic data from each paper using the Firecrawl Extract API. 24 | 3. **Store**: All extracted paper data is stored in a **Supabase database** for future reference and analysis. 25 | 4. 
**Filter**: Semantic filtering identifies papers relevant to the category defined in `category_prompt`. 26 | 5. **Notify**: Sends summaries of the filtered papers directly to a Discord channel. 27 | 28 | ### System Architecture 29 | ```mermaid 30 | flowchart TB 31 | subgraph entry ["Entry Points"] 32 | CLI["CLI Arguments\n--url or --date"] 33 | GHA["GitHub Actions\nScheduled/Manual Trigger"] 34 | end 35 | 36 | subgraph hf_white_paper_tracker ["hf_white_paper_tracker.py"] 37 | A[Initialize Logger] --> B[Initialize Database] 38 | B --> C[Verify DB Connection/Version] 39 | C --> D{"URL Source?"} 40 | D -->|CLI Args| E1["Use Provided URL"] 41 | D -->|No Args| E2["Get Today's URL"] 42 | E1 --> F[Run Paper Tracker] 43 | E2 --> F 44 | end 45 | 46 | subgraph firecrawl_crawl_extract ["firecrawl_crawl_extract.py"] 47 | G[Extract Paper URLs] --> H[Process Paper Batch] 48 | H --> I[Extract Paper Details] 49 | end 50 | 51 | subgraph semantic_filter ["semantic_filter.py"] 52 | J[Load Category Definition] --> K[Classify Paper] 53 | K --> L{Meets Criteria?} 54 | end 55 | 56 | subgraph discord_notifications ["discord_notifications.py"] 57 | M[Format Message] --> N[Send to Discord] 58 | end 59 | 60 | subgraph supabase_db ["supabase_db.py"] 61 | O[Store Paper Data] --> P[Update Status] 62 | end 63 | 64 | %% Module connections 65 | CLI -->|Optional| hf_white_paper_tracker 66 | GHA -->|Scheduled| hf_white_paper_tracker 67 | F -->|Calls| G 68 | I -->|Passes paper data| J 69 | L -->|Yes| M 70 | L -->|No| O 71 | N -->|Update notification status| P 72 | 73 | %% Environment dependencies 74 | env[".env File"] -.->|Config| hf_white_paper_tracker 75 | category[category_prompt.py] -.->|Definition| semantic_filter 76 | 77 | classDef module fill:#f9f,stroke:#333,stroke-width:2px; 78 | class hf_white_paper_tracker,firecrawl_crawl_extract,semantic_filter,discord_notifications,supabase_db module; 79 | classDef entry fill:#aaf,stroke:#333,stroke-width:2px; 80 | class CLI,GHA entry; 81 | ``` 82 | 83 | ### Setup 84 | 1. Configure your Firecrawl API keys. 85 | 2. Modify the `category_prompt` to specify your topic of interest. 86 | 3. Set up a Discord webhook for receiving notifications. 87 | 4. Run the system to start tracking and getting updates. 88 | 89 | ## Setup 90 | 91 | 1. Clone the repository 92 | 93 | 2. Install Poetry (if not already installed): 94 | ```bash 95 | curl -sSL https://install.python-poetry.org | python3 - 96 | ``` 97 | 98 | 3. Install dependencies: 99 | ```bash 100 | poetry install 101 | ``` 102 | 103 | 4. Ensure PostgreSQL is installed: 104 | - PostgreSQL must be installed and pg_config must be in your PATH. 105 | - On macOS (using Homebrew): 106 | ```bash 107 | brew install postgresql 108 | ``` 109 | - Verify pg_config: 110 | ```bash 111 | pg_config --version 112 | ``` 113 | 114 | 5. Set up Supabase Database: 115 | 1. Create a Supabase account at [Supabase](https://supabase.com) 116 | 2. Create a new project 117 | 3. Go to Project Settings > Database 118 | 4. Note down: 119 | - Database password (set during project creation) 120 | - Connection string/URI 121 | - Project reference ID 122 | 5. Enable the PostgREST API: 123 | - Go to Project Settings > API 124 | - Ensure PostgREST is enabled 125 | 6. Configure database tables: 126 | - The application will automatically create the required tables: 127 | - `papers`: Stores paper information and tracking status 128 | - `schema_version`: Manages database migrations 129 | 7. 
Important connection string notes: 130 | - For local development, use port 5432: 131 | ``` 132 | POSTGRES_URL=postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres?sslmode=require 133 | ``` 134 | - For GitHub Actions, use port 6543: 135 | ``` 136 | POSTGRES_URL=postgresql://postgres.[YOUR-PROJECT-REF]:[YOUR-PASSWORD]@aws-0-[REGION].pooler.supabase.com:6543/postgres?sslmode=require 137 | ``` 138 | - Remember to URL-encode special characters in your password: 139 | - `#` → `%23` 140 | - `$` → `%24` 141 | - `^` → `%5E` 142 | - `&` → `%26` 143 | - `@` stays as `@` 144 | 145 | 6. Set up Discord Notifications: 146 | 1. Create a Discord server (skip if you already have one) 147 | 2. Create a channel for paper notifications 148 | 3. Configure the webhook: 149 | - Go to Server Settings > Integrations 150 | - Click on "Create Webhook" (or edit an existing one) 151 | - Set a name for your webhook (e.g., "Paper Tracker") 152 | - Select the channel where notifications should be sent 153 | - Copy the Webhook URL 154 | 4. Important webhook URL format: 155 | ``` 156 | https://discord.com/api/webhooks/{webhook.id}/{webhook.token} 157 | ``` 158 | 5. The notifications will include: 159 | - Paper title 160 | - Authors 161 | - Abstract (first 500 characters) 162 | - Engagement stats (upvotes and comments) 163 | - Links to: 164 | - PDF version 165 | - arXiv page 166 | - GitHub repository (if available) 167 | - Original HuggingFace post 168 | 6. Security notes: 169 | - Keep your webhook URL private 170 | - The webhook URL contains a secret token 171 | - If compromised, you can regenerate the webhook token in Discord 172 | - Add the URL to your `.env` file: 173 | ``` 174 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/YOUR_WEBHOOK_ID/YOUR_WEBHOOK_TOKEN 175 | ``` 176 | 177 | 7. Configure environment variables: 178 | ```bash 179 | cp .env.example .env 180 | ``` 181 | Then edit `.env` with: 182 | 183 | a. Discord Webhook URL: 184 | 1. Go to your Discord server 185 | 2. Edit a channel > Integrations > Create Webhook 186 | 3. Copy the Webhook URL 187 | 4. Add to `.env`: 188 | ``` 189 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/YOUR_WEBHOOK_ID/YOUR_WEBHOOK_TOKEN 190 | ``` 191 | 192 | b. Firecrawl API Key: 193 | 1. Sign up at [Firecrawl](https://firecrawl.co) 194 | 2. Go to API Keys section 195 | 3. Create a new API key 196 | 4. Add to `.env`: 197 | ``` 198 | FIRECRAWL_API_KEY=fc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 199 | ``` 200 | 201 | c. Supabase Database URL: 202 | 1. Use the connection string from your Supabase setup (Step 5) 203 | 2. Add to `.env`: 204 | ``` 205 | POSTGRES_URL= 206 | ``` 207 | 208 | d. OpenAI API Key: 209 | 1. Sign up at [OpenAI](https://platform.openai.com) 210 | 2. Go to API Keys section 211 | 3. Create a new API key 212 | 4. Add to `.env`: 213 | ``` 214 | OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 215 | ``` 216 | 217 | 8. Configure paper category filtering: 218 | ```bash 219 | cp category_prompt.example.py category_prompt.py 220 | ``` 221 | Then edit `category_prompt.py` to define your `DESIRED_CATEGORY`. This string defines what papers are considered relevant and will trigger notifications. 222 | 223 | The default configuration is set up for "AI Agents" papers, but you can modify it for your needs. 
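
For reference, a minimal sketch of what `category_prompt.py` might contain is shown below. Only the `DESIRED_CATEGORY` name comes from the setup steps above; the category wording itself is an illustration, not the repository's actual default prompt, so adapt it to your own research interests.

```python
# category_prompt.py -- illustrative sketch, not the repository's default text.
# DESIRED_CATEGORY is the name referenced in the setup steps above; the
# description below is an example only and should be edited to taste.

DESIRED_CATEGORY = """
AI Agents: papers about autonomous or semi-autonomous LLM-based agents,
including planning, tool use, multi-agent collaboration, and agent
benchmarks or evaluation. Papers that only mention agents in passing
should not be treated as a match.
"""
```
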
224 | 225 | The semantic filter uses this definition to determine: 226 | - Which papers trigger Discord notifications 227 | - Classification confidence threshold (default: 0.8) 228 | - Categorization criteria for the LLM-based filter 229 | 230 | Your final `.env` file should look like: 231 | ``` 232 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/1234567890/abcdef... 233 | FIRECRAWL_API_KEY=fc-f6ff27d623e548f390bdc0b9debefe59 234 | POSTGRES_URL=postgresql://postgres:mypassword123@db.abcdefghijklm.supabase.co:5432/postgres?sslmode=require 235 | OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 236 | ``` 237 | 238 | TODO: Create a `.env.example` file with placeholder values after testing is complete, to serve as a template for new users. 239 | TODO: Add to readme how to use the main function firecrawl_crawl_extract.py with arguments to set 240 | the date for the papers to extract. 241 | 242 | ## Database Schema 243 | 244 | The application uses a PostgreSQL database with the following schema: 245 | 246 | ### Papers Table 247 | 248 | | Column | Type | Description | 249 | |------------------------|-----------|-------------------------------------------------------| 250 | | url | String | Primary key - unique paper URL | 251 | | title | String | Paper title | 252 | | authors | String[] | Array of author names | 253 | | abstract | Text | Paper abstract | 254 | | pdf_url | String | URL to PDF version | 255 | | arxiv_url | String | URL to arXiv page | 256 | | github_url | String | URL to GitHub repository | 257 | | publication_date | DateTime | Original publication date | 258 | | submission_date | DateTime | Date added to HuggingFace | 259 | | upvotes | Integer | Current number of upvotes (default: 0) | 260 | | comments | Integer | Current number of comments (default: 0) | 261 | | last_updated | DateTime | Last time the record was updated | 262 | | notification_sent | Boolean | Whether notification was sent (default: false) | 263 | | extraction_success | Boolean | Whether extraction succeeded (default: true) | 264 | | extraction_error | Text | Error message if extraction failed (nullable) | 265 | | last_extraction_attempt| DateTime | When the last extraction was attempted | 266 | 267 | ### Schema Version Table 268 | 269 | | Column | Type | Description | 270 | |-----------|-----------|--------------------------------------| 271 | | version | Integer | Schema version number | 272 | | applied_at| Timestamp | When this version was applied | 273 | 274 | ## Deployment Options 275 | 276 | ### Local Deployment 277 | Follow the setup instructions above for running the tracker locally. 278 | 279 | ### GitHub Actions Deployment (Recommended) 280 | This project includes GitHub Actions workflow configuration for automated paper tracking. The workflow: 281 | - Runs every 12 hours automatically 282 | - Can be triggered manually via workflow_dispatch 283 | - Sends notifications on both success and failure 284 | 285 | To set up with GitHub Actions: 286 | 287 | 1. Fork/clone this repository to your GitHub account 288 | 289 | 2. 
Set up GitHub Secrets: 290 | Go to your repository's Settings > Secrets and Variables > Actions and add the following secrets: 291 | 292 | ``` 293 | DISCORD_WEBHOOK_URL=https://discord.com/api/webhooks/YOUR_WEBHOOK_ID/YOUR_WEBHOOK_TOKEN 294 | FIRECRAWL_API_KEY=fc-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 295 | POSTGRES_URL=postgresql://postgres:[YOUR-PASSWORD]@db.[YOUR-PROJECT-REF].supabase.co:5432/postgres?sslmode=require 296 | OPENAI_API_KEY=sk-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx 297 | ``` 298 | 299 | Note: Make sure your Supabase database URL uses port 6543 for GitHub Actions compatibility. 300 | 301 | 3. Configure your tracking preferences: 302 | - Fork the repository 303 | - Edit `category_prompt.py` to define your paper interests 304 | - Commit the changes to your repository 305 | 306 | 4. Enable GitHub Actions: 307 | - Go to your repository's Actions tab 308 | - Enable workflows if they're not already enabled 309 | - The `Paper Tracker` workflow will now run automatically every 12 hours 310 | 311 | 5. Monitor the workflow: 312 | - Check the Actions tab for run history and logs 313 | - Failed runs will send notifications to your configured Discord channel 314 | - Successful runs will send notifications only for relevant papers 315 | 316 | The workflow is defined in `.github/workflows/paper-tracker.yml` and includes: 317 | - Python 3.10 setup 318 | - Poetry dependency management 319 | - Database connection testing 320 | - Automatic error notifications via Discord 321 | 322 | You can also trigger the workflow manually from the Actions tab using the "Run workflow" button. 323 | 324 | ## TODOs 325 | 326 | [ ] Update the GitHub Actions workflow to trigger based on actual changes to the 327 | Hugging Face Daily Papers page instead of using a scheduled cron job. This can be accomplished by: 328 | 1. Creating a GitHub Action that checks for changes to the page content/hash 329 | 2. Only triggering the main paper tracking workflow when changes are detected 330 | 3. This will ensure: 331 | - More timely updates as papers are published 332 | - More efficient use of API credits by avoiding unnecessary checks 333 | - Reduced latency between paper publication and notification 334 | [ ] Implement Hugging Face Daily Papers API to reduce API costs in calling scraping, and crawling models. 335 | - API: https://huggingface.co/api/daily_papers 336 | - Docs: https://huggingface.co/docs/hub/en/api#paper-pages-api 337 | 338 | ### X (Twitter) API Setup 339 | 340 | The system uses X's OAuth 2.0 for posting updates. This requires a one-time local authorization: 341 | 342 | 1. Configure X API credentials in `.env`: 343 | ``` 344 | X_OAUTH2_CLIENT_ID=your_client_id 345 | X_OAUTH2_CLIENT_SECRET=your_client_secret 346 | ``` 347 | 348 | 2. Run the script locally ONCE to authorize: 349 | ```bash 350 | python x_post.py 351 | ``` 352 | This will: 353 | - Open a browser window 354 | - Prompt for X account authorization 355 | - Store OAuth tokens in your `.env` file: 356 | ``` 357 | X_ACCESS_TOKEN=... 358 | X_REFRESH_TOKEN=... 359 | X_TOKEN_EXPIRES_IN=... 360 | X_TOKEN_SCOPE=... 361 | ``` 362 | 363 | 3. After authorization: 364 | - The tokens are stored in `.env` 365 | - Copy the updated `.env` to your server 366 | - Subsequent runs will use stored tokens 367 | - No browser interaction needed 368 | - Works in automated environments 369 | 370 | 4. To disable X posting: 371 | - Comment out the token-related variables in `.env`: 372 | ``` 373 | # X_ACCESS_TOKEN=... 374 | # X_REFRESH_TOKEN=... 375 | # X_TOKEN_EXPIRES_IN=... 
376 | # X_TOKEN_SCOPE=... 377 | ``` 378 | - Uncomment to re-enable posting 379 | - Original credentials are preserved for future use 380 | 381 | Note: Keep your `.env` file secure and never commit it to version control. 382 | 383 | ## Local Testing with Historical Dates 384 | 385 | The system supports testing with historical paper data using the `--date` argument: 386 | 387 | ```bash 388 | # Format: YYYY-MM-DD 389 | python hf_white_paper_tracker.py --date 2024-03-15 390 | ``` 391 | 392 | This is particularly useful for: 393 | - Testing the system with known paper data 394 | - Backfilling missing papers from specific dates 395 | - Debugging extraction issues with historical content 396 | - Verifying database updates without waiting for new papers 397 | 398 | You can also provide a full URL using the `--url` argument: 399 | ```bash 400 | python hf_white_paper_tracker.py --url "https://huggingface.co/papers?date=2024-03-15" 401 | ``` 402 | 403 | The system prioritizes arguments in this order: 404 | 1. `--url` (if provided) 405 | 2. `--date` (if provided) 406 | 3. Today's date (default) 407 | 408 | This flexibility makes local development and testing much more efficient, as you don't need to wait for new papers to be published to verify your changes. -------------------------------------------------------------------------------- /examples/firecrawl_automated_whitepaper_tracking/x_post_v2.py: -------------------------------------------------------------------------------- 1 | """Module for posting research papers to X (Twitter).""" 2 | 3 | import os 4 | import base64 5 | import hashlib 6 | import secrets 7 | import webbrowser 8 | from http.server import HTTPServer, BaseHTTPRequestHandler 9 | from urllib.parse import parse_qs, urlparse 10 | from dotenv import load_dotenv, set_key, find_dotenv 11 | import requests 12 | from typing import Optional 13 | from logging_config import setup_base_logging, log_function_call 14 | import tempfile 15 | from playwright.sync_api import sync_playwright 16 | import mimetypes 17 | import random 18 | from datetime import datetime 19 | from requests_oauthlib import OAuth1 20 | import time 21 | 22 | # Configure logging using the centralized configuration 23 | logger = setup_base_logging( 24 | logger_name="x_poster", 25 | log_file="x_poster.log", 26 | format_string='%(asctime)s - %(levelname)s - %(funcName)s - %(message)s' 27 | ) 28 | 29 | load_dotenv() 30 | 31 | AUTH_URL = "https://twitter.com/i/oauth2/authorize" 32 | TOKEN_URL = "https://api.twitter.com/2/oauth2/token" 33 | CALLBACK_URL = "http://127.0.0.1:8000/callback" 34 | X_API_URL = "https://api.twitter.com/2/tweets" 35 | MEDIA_UPLOAD_URL = "https://upload.twitter.com/1.1/media/upload.json" 36 | POST_UPDATE_URL = "https://api.twitter.com/1.1/statuses/update.json" # v1.1 endpoint 37 | PDF_SCREENSHOT_SIZE = 1080 # 1:1 ratio 38 | PDF_LOAD_TIMEOUT = 30000 # 30 seconds 39 | 40 | @log_function_call 41 | def generate_pkce_pair(): 42 | """Generate PKCE code verifier and challenge""" 43 | code_verifier = secrets.token_urlsafe(64) 44 | code_challenge = base64.urlsafe_b64encode( 45 | hashlib.sha256(code_verifier.encode()).digest() 46 | ).rstrip(b'=').decode() 47 | 48 | logger.debug(f"Generated PKCE - Verifier: {len(code_verifier)} chars, Challenge: {len(code_challenge)} chars") 49 | return code_verifier, code_challenge 50 | 51 | class CallbackHandler(BaseHTTPRequestHandler): 52 | """Handle OAuth callback""" 53 | code = None 54 | 55 | def log_message(self, format, *args): 56 | """Override to use our logger""" 57 | 
logger.info(f"OAuth Callback: {format%args}") 58 | 59 | def do_GET(self): 60 | """Process callback GET request""" 61 | logger.info(f"Received callback request: {self.path}") 62 | 63 | query = parse_qs(urlparse(self.path).query) 64 | CallbackHandler.code = query.get('code', [None])[0] 65 | 66 | if CallbackHandler.code: 67 | logger.info("Successfully received authorization code") 68 | else: 69 | logger.error(f"No authorization code in callback. Query params: {query}") 70 | 71 | # Log any error parameters 72 | if 'error' in query: 73 | logger.error(f"Error in callback: {query['error']}") 74 | if 'error_description' in query: 75 | logger.error(f"Error description: {query['error_description']}") 76 | 77 | self.send_response(200) 78 | self.send_header('Content-type', 'text/html') 79 | self.end_headers() 80 | self.wfile.write(b"Authorization successful! You can close this window.") 81 | 82 | @log_function_call 83 | def load_stored_tokens(): 84 | """Load stored OAuth 2.0 tokens from .env""" 85 | dotenv_path = find_dotenv() 86 | load_dotenv(dotenv_path) 87 | 88 | # Use OAuth 2.0 specific variables 89 | access_token = os.getenv('X_OAUTH2_ACCESS_TOKEN') 90 | refresh_token = os.getenv('X_OAUTH2_REFRESH_TOKEN') 91 | 92 | if access_token and refresh_token: 93 | logger.debug("Successfully loaded stored OAuth 2.0 tokens") 94 | return { 95 | 'access_token': access_token, 96 | 'refresh_token': refresh_token, 97 | 'expires_in': os.getenv('X_OAUTH2_TOKEN_EXPIRES_IN'), 98 | 'scope': os.getenv('X_OAUTH2_TOKEN_SCOPE') 99 | } 100 | logger.warning("No stored OAuth 2.0 tokens found") 101 | return None 102 | 103 | @log_function_call 104 | def save_tokens(tokens): 105 | """Save OAuth 2.0 tokens to .env""" 106 | dotenv_path = find_dotenv() 107 | 108 | try: 109 | # Use different variable names for OAuth 2.0 110 | set_key(dotenv_path, 'X_OAUTH2_ACCESS_TOKEN', tokens['access_token']) 111 | set_key(dotenv_path, 'X_OAUTH2_REFRESH_TOKEN', tokens['refresh_token']) 112 | set_key(dotenv_path, 'X_OAUTH2_TOKEN_EXPIRES_IN', str(tokens['expires_in'])) 113 | set_key(dotenv_path, 'X_OAUTH2_TOKEN_SCOPE', tokens['scope']) 114 | load_dotenv(dotenv_path) 115 | logger.info("Successfully saved OAuth 2.0 tokens to .env") 116 | except Exception as e: 117 | logger.error(f"Failed to save tokens: {str(e)}") 118 | raise 119 | 120 | def refresh_access_token(refresh_token): 121 | """Get new access token using refresh token""" 122 | auth = ( 123 | os.getenv('X_OAUTH2_CLIENT_ID'), 124 | os.getenv('X_OAUTH2_CLIENT_SECRET') 125 | ) 126 | 127 | data = { 128 | 'refresh_token': refresh_token, 129 | 'grant_type': 'refresh_token' 130 | } 131 | 132 | response = requests.post(TOKEN_URL, auth=auth, data=data) 133 | return response.json() 134 | 135 | def get_oauth2_token(): 136 | """Get OAuth 2.0 token, using stored tokens if available""" 137 | # Try to load stored tokens 138 | tokens = load_stored_tokens() 139 | 140 | if tokens: 141 | logger.debug(f"Found stored tokens with keys: {list(tokens.keys())}") 142 | if 'refresh_token' in tokens: 143 | try: 144 | logger.info("Attempting to refresh token...") 145 | new_tokens = refresh_access_token(tokens['refresh_token']) 146 | logger.debug(f"Refresh response keys: {list(new_tokens.keys())}") 147 | save_tokens(new_tokens) 148 | return new_tokens['access_token'] 149 | except Exception as e: 150 | logger.error(f"Token refresh failed: {e}") 151 | else: 152 | logger.error("Stored tokens missing refresh_token") 153 | 154 | # If no stored tokens or refresh failed, do full authorization 155 | logger.info("Starting OAuth 
2.0 PKCE flow") 156 | code_verifier, code_challenge = generate_pkce_pair() 157 | 158 | auth_params = { 159 | 'response_type': 'code', 160 | 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 161 | 'redirect_uri': CALLBACK_URL, 162 | 'scope': 'tweet.write tweet.read users.read offline.access', 163 | 'code_challenge': code_challenge, 164 | 'code_challenge_method': 'S256', 165 | 'state': secrets.token_urlsafe(32) 166 | } 167 | 168 | logger.info("Starting local callback server") 169 | server = HTTPServer(('127.0.0.1', 8000), CallbackHandler) 170 | 171 | auth_url = f"{AUTH_URL}?{'&'.join(f'{k}={v}' for k,v in auth_params.items())}" 172 | logger.info(f"Opening authorization URL: {auth_url}") 173 | webbrowser.open(auth_url) 174 | 175 | logger.info("Waiting for callback...") 176 | server.handle_request() 177 | 178 | if not CallbackHandler.code: 179 | raise Exception("Failed to get authorization code") 180 | 181 | logger.info("Exchanging code for token") 182 | token_data = { 183 | 'code': CallbackHandler.code, 184 | 'grant_type': 'authorization_code', 185 | 'client_id': os.getenv('X_OAUTH2_CLIENT_ID'), 186 | 'redirect_uri': CALLBACK_URL, 187 | 'code_verifier': code_verifier 188 | } 189 | 190 | auth = ( 191 | os.getenv('X_OAUTH2_CLIENT_ID'), 192 | os.getenv('X_OAUTH2_CLIENT_SECRET') 193 | ) 194 | 195 | response = requests.post(TOKEN_URL, auth=auth, data=token_data) 196 | if response.status_code != 200: 197 | raise Exception(f"Token exchange failed: {response.text}") 198 | 199 | token_json = response.json() 200 | logger.debug(f"Received token response with keys: {list(token_json.keys())}") 201 | save_tokens(token_json) 202 | return token_json['access_token'] 203 | 204 | def format_post( 205 | paper_title: str, 206 | authors: list, 207 | url: str, 208 | pdf_url: Optional[str] = None, 209 | arxiv_url: Optional[str] = None, 210 | github_url: Optional[str] = None 211 | ) -> str: 212 | """Format paper details into X post text""" 213 | # Start with title and truncate if needed 214 | post = f"📚 {paper_title[:100]}{'...' if len(paper_title) > 100 else ''}\n\n" 215 | 216 | # Add authors (limited to first 2 if many) 217 | if len(authors) > 2: 218 | authors_text = f"by {', '.join(authors[:2])} et al." 
219 | else: 220 | authors_text = f"by {', '.join(authors)}" 221 | post += f"{authors_text}\n\n" 222 | 223 | # Add links 224 | post += f"🔗 {url}" 225 | if pdf_url: 226 | post += f"\n📄 {pdf_url}" 227 | if arxiv_url: 228 | post += f"\n📝 {arxiv_url}" 229 | if github_url: 230 | post += f"\n💻 {github_url}" 231 | 232 | return post 233 | 234 | @log_function_call 235 | def capture_pdf_screenshot(pdf_url: str) -> Optional[bytes]: 236 | """Capture first page screenshot of PDF using Playwright.""" 237 | logger.info(f"Attempting to capture screenshot of PDF: {pdf_url}") 238 | 239 | try: 240 | with sync_playwright() as p: 241 | logger.debug("Launching browser...") 242 | browser = p.chromium.launch(headless=False) # Run in non-headless mode for debugging 243 | 244 | logger.debug("Creating new page with viewport size: %dx%d", 245 | PDF_SCREENSHOT_SIZE, PDF_SCREENSHOT_SIZE) 246 | page = browser.new_page( 247 | viewport={'width': PDF_SCREENSHOT_SIZE, 'height': PDF_SCREENSHOT_SIZE} 248 | ) 249 | page.set_default_timeout(PDF_LOAD_TIMEOUT) 250 | 251 | logger.debug("Loading PDF...") 252 | page.goto(pdf_url, wait_until='networkidle') # Wait for network to be idle 253 | logger.info("✅ PDF loaded successfully") 254 | 255 | # Add small delay to ensure PDF is rendered 256 | logger.debug("Waiting for PDF to render...") 257 | page.wait_for_timeout(2000) # 2 second delay 258 | 259 | logger.debug("Taking screenshot...") 260 | screenshot = page.screenshot() 261 | 262 | # Validate screenshot data 263 | if screenshot and len(screenshot) > 0: 264 | logger.info(f"✅ Screenshot captured successfully! Size: {len(screenshot)} bytes") 265 | else: 266 | logger.error("❌ Screenshot data is empty!") 267 | return None 268 | 269 | browser.close() 270 | logger.info("Browser closed successfully") 271 | return screenshot 272 | except Exception as e: 273 | logger.error(f"❌ Failed to capture PDF screenshot: {str(e)}") 274 | logger.error("Exception details:", exc_info=True) 275 | return None 276 | 277 | @log_function_call 278 | def check_media_status(media_id: str, auth: OAuth1) -> bool: 279 | """Check if media has finished processing.""" 280 | status_url = f"https://upload.twitter.com/1.1/media/upload.json?command=STATUS&media_id={media_id}" 281 | 282 | try: 283 | response = requests.get(status_url, auth=auth) 284 | if response.status_code == 200: 285 | processing_info = response.json().get('processing_info', {}) 286 | state = processing_info.get('state') 287 | 288 | if state == 'succeeded': 289 | return True 290 | elif state == 'pending': 291 | time.sleep(3) # Wait before checking again 292 | return check_media_status(media_id, auth) 293 | else: 294 | logger.error(f"Media processing failed: {processing_info}") 295 | return False 296 | except Exception as e: 297 | logger.error(f"Error checking media status: {str(e)}") 298 | return False 299 | 300 | @log_function_call 301 | def upload_media(image_data: bytes) -> Optional[str]: 302 | """Upload media using v1.1 API with OAuth 1.0a.""" 303 | if not image_data: 304 | logger.error("❌ No image data provided for upload") 305 | return None 306 | 307 | # Create OAuth 1.0a auth 308 | auth = OAuth1( 309 | client_key=os.getenv('X_API_KEY'), 310 | client_secret=os.getenv('X_API_SECRET'), 311 | resource_owner_key=os.getenv('X_ACCESS_TOKEN'), 312 | resource_owner_secret=os.getenv('X_ACCESS_TOKEN_SECRET') 313 | ) 314 | 315 | try: 316 | logger.info("📤 Uploading media using v1.1 API...") 317 | files = {'media': image_data} 318 | response = requests.post(MEDIA_UPLOAD_URL, auth=auth, files=files) 319 | 320 | if 
response.status_code != 200: 321 | logger.error(f"❌ Media upload failed: {response.status_code}") 322 | logger.error(f"Response: {response.text}") 323 | return None 324 | 325 | media_id = response.json()['media_id_string'] 326 | logger.info(f"✅ Media upload successful! (ID: {media_id})") 327 | return media_id 328 | 329 | except Exception as e: 330 | logger.error(f"❌ Failed to upload media: {str(e)}") 331 | logger.error("Exception details:", exc_info=True) 332 | return None 333 | 334 | @log_function_call 335 | def wait_for_rate_limit(response) -> bool: 336 | """Handle rate limit waiting. Returns True if waited, False if no wait needed.""" 337 | try: 338 | # Only proceed if we have rate limit headers 339 | if 'x-rate-limit-remaining' not in response.headers: 340 | return False 341 | 342 | limit = int(response.headers.get('x-rate-limit-limit', 0)) 343 | remaining = int(response.headers.get('x-rate-limit-remaining', 0)) 344 | reset_time = int(response.headers.get('x-rate-limit-reset', 0)) 345 | 346 | logger.info("Rate Limit Status:") 347 | logger.info(f"- Limit: {limit} requests") 348 | logger.info(f"- Remaining: {remaining} requests") 349 | 350 | # Only wait if we're actually rate limited 351 | if response.status_code == 429: 352 | current_time = time.time() 353 | wait_seconds = max(reset_time - current_time, 0) 354 | 355 | reset_datetime = datetime.fromtimestamp(reset_time) 356 | logger.info(f"Rate limit reset time: {reset_datetime}") 357 | 358 | if wait_seconds > 0: 359 | logger.info(f"Waiting {wait_seconds:.0f} seconds...") 360 | time.sleep(wait_seconds) 361 | return True 362 | 363 | return False 364 | 365 | except Exception as e: 366 | logger.error(f"Error in rate limit handling: {str(e)}") 367 | return False 368 | 369 | @log_function_call 370 | def post_paper(paper_title: str, authors: list, url: str, pdf_url: Optional[str] = None, 371 | arxiv_url: Optional[str] = None, github_url: Optional[str] = None, max_retries: int = 3): 372 | """Post paper using v2 API.""" 373 | logger.info(f"🔄 Starting post process for: {paper_title}") 374 | 375 | # Step 1: Upload media using v1.1 API if PDF URL provided 376 | media_id = None 377 | if pdf_url: 378 | logger.info("📸 Capturing PDF screenshot...") 379 | screenshot = capture_pdf_screenshot(pdf_url) 380 | if screenshot: 381 | media_id = upload_media(screenshot) # Uses v1.1 API with OAuth 1.0a 382 | 383 | # Step 2: Get OAuth 2.0 token for v2 API post 384 | token = get_oauth2_token() 385 | if not token: 386 | logger.error("❌ Failed to get OAuth 2.0 token") 387 | return None 388 | 389 | # Format post text 390 | post_text = format_post(paper_title, authors, url, pdf_url, arxiv_url, github_url) 391 | 392 | # Prepare v2 API payload 393 | payload = { 394 | "text": post_text 395 | } 396 | 397 | # Add media if available 398 | if media_id: 399 | payload["media"] = { 400 | "media_ids": [str(media_id)] 401 | } 402 | 403 | headers = { 404 | "Authorization": f"Bearer {token}", 405 | "Content-Type": "application/json" 406 | } 407 | 408 | # Post using v2 API 409 | retry_count = 0 410 | while retry_count < max_retries: 411 | try: 412 | logger.info("📝 Creating post using v2 API...") 413 | response = requests.post(X_API_URL, json=payload, headers=headers) 414 | 415 | logger.debug(f"Response status: {response.status_code}") 416 | logger.debug(f"Response headers: {dict(response.headers)}") 417 | logger.debug(f"Response body: {response.text}") # Log full response 418 | 419 | # Log rate limit info regardless of status 420 | for header, value in response.headers.items(): 421 | 
if 'rate' in header.lower(): 422 | logger.info(f"Rate limit info - {header}: {value}") 423 | 424 | if response.status_code == 429: # True rate limit 425 | if wait_for_rate_limit(response): 426 | retry_count += 1 427 | continue 428 | else: 429 | logger.error("❌ True rate limit hit") 430 | return {"error": "rate_limit", "response": response.text} 431 | 432 | elif response.status_code == 201: # Success 433 | logger.info("✅ Post successful!") 434 | return response.json() 435 | 436 | else: # Other errors 437 | error_data = response.json() 438 | logger.error(f"❌ Post failed with status {response.status_code}") 439 | logger.error(f"Error details: {error_data}") 440 | return {"error": "api_error", "response": error_data} 441 | 442 | except Exception as e: 443 | logger.error(f"❌ Error posting: {str(e)}") 444 | logger.error("Exception details:", exc_info=True) 445 | break 446 | 447 | retry_count += 1 448 | 449 | return None 450 | 451 | if __name__ == "__main__": 452 | # Random paper title components 453 | adjectives = ["Novel", "Advanced", "Innovative", "Comprehensive", "Efficient"] 454 | topics = ["Machine Learning", "Neural Networks", "Deep Learning", "AI", "Data Science"] 455 | methods = ["Framework", "Approach", "Methodology", "System", "Architecture"] 456 | 457 | # Random author names 458 | first_names = ["James", "Maria", "John", "Sarah", "Michael", "Emma", "David", "Lisa"] 459 | last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis"] 460 | 461 | # Generate random title and authors 462 | random_title = f"{random.choice(adjectives)} {random.choice(topics)} {random.choice(methods)} [{datetime.now().strftime('%H:%M:%S')}]" 463 | random_authors = [ 464 | f"{random.choice(first_names)} {random.choice(last_names)}", 465 | f"{random.choice(first_names)} {random.choice(last_names)}" 466 | ] 467 | 468 | logger.info("Starting X post test") 469 | try: 470 | response = post_paper( 471 | paper_title=random_title, 472 | authors=random_authors, 473 | url="https://huggingface.co/papers/test", 474 | pdf_url="https://arxiv.org/pdf/2401.00935", # Test PDF URL 475 | arxiv_url="https://arxiv.org/abs/test", 476 | github_url="https://github.com/test/repo" 477 | ) 478 | 479 | if response and 'data' in response: 480 | logger.info("✅ Post successful!") 481 | logger.info(f"Tweet ID: {response['data']['id']}") 482 | logger.info(f"Tweet text: {response['data']['text']}") 483 | else: 484 | logger.error("❌ Post failed!") 485 | logger.error(f"Response: {response}") 486 | 487 | except Exception as e: 488 | logger.error("❌ Error during posting:") 489 | logger.error(f"Error details: {str(e)}", exc_info=True) 490 | -------------------------------------------------------------------------------- /claude_researcher_with_map.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "id": "view-in-github", 7 | "colab_type": "text" 8 | }, 9 | "source": [ 10 | "\"Open" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "id": "8f015733", 16 | "metadata": { 17 | "id": "8f015733" 18 | }, 19 | "source": [ 20 | "# Firecrawl Web Crawling with OpenAI and Anthropic\n", 21 | "This notebook demonstrates how to use the Firecrawl API along with OpenAI's Anthropic to search for specific information on a website. It takes a user-defined objective and website URL, then attempts to find relevant pages and extract information based on the objective.\n", 22 | "\n", 23 | "### Requirements\n", 24 | "1. 
**Firecrawl API key**: Obtain from your Firecrawl account.\n", 25 | "2. **Anthropic API key**: Obtain from Anthropic if you're leveraging their models.\n", 26 | "3. **AgentOps API key**: If using AgentOps, include its API key.\n", 27 | "\n", 28 | "Set up your API keys as environment variables or directly in the notebook for ease of access.\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "TuuO7HAFyuq9", 35 | "metadata": { 36 | "id": "TuuO7HAFyuq9", 37 | "colab": { 38 | "base_uri": "https://localhost:8080/" 39 | }, 40 | "outputId": "f025d6d5-8b78-4e21-a464-da21217fa515" 41 | }, 42 | "outputs": [ 43 | { 44 | "output_type": "stream", 45 | "name": "stdout", 46 | "text": [ 47 | "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/946.0 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[90m╺\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m450.6/946.0 kB\u001b[0m \u001b[31m13.6 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m942.1/946.0 kB\u001b[0m \u001b[31m18.5 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m946.0/946.0 kB\u001b[0m \u001b[31m12.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 48 | "\u001b[?25h\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/50.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m50.8/50.8 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 49 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.0/53.0 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 50 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m288.2/288.2 kB\u001b[0m \u001b[31m9.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 51 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m164.1/164.1 kB\u001b[0m \u001b[31m10.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 52 | "\u001b[?25h" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "%pip install -q firecrawl-py anthropic agentops" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 3, 63 | "id": "8e2be400", 64 | "metadata": { 65 | "id": "8e2be400" 66 | }, 67 | "outputs": [], 68 | "source": [ 69 | "from getpass import getpass\n", 70 | "from firecrawl import FirecrawlApp\n", 71 | "import os, re, json, anthropic, agentops" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "RJZQ-gYpGOl9", 78 | "metadata": { 79 | "colab": { 80 | "base_uri": "https://localhost:8080/" 81 | }, 82 | "id": "RJZQ-gYpGOl9", 83 | "outputId": "fa8698d2-7126-4a89-f9f8-46c0b79e7d16" 84 | }, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "FIRECRAWL_API_KEY··········\n", 91 | "ANTHROPIC_API_KEY··········\n" 92 | ] 93 | }, 94 | { 95 | "output_type": "stream", 96 | "name": "stderr", 97 | "text": [ 98 | "DEBUG:httpx:load_ssl_context verify=True cert=None trust_env=True http2=False\n", 99 | "DEBUG:httpx:load_verify_locations cafile='/usr/local/lib/python3.10/dist-packages/certifi/cacert.pem'\n" 100 | ] 101 | }, 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 
| "AGENTOPS_API_KEY··········\n" 107 | ] 108 | } 109 | ], 110 | "source": [ 111 | "# Initialize the FirecrawlApp, OpenAI client, and AgentOps\n", 112 | "app = FirecrawlApp(api_key=getpass('FIRECRAWL_API_KEY'))\n", 113 | "client = anthropic.Anthropic(api_key=getpass('ANTHROPIC_API_KEY'))\n", 114 | "AGENTOPS_API_KEY = getpass('AGENTOPS_API_KEY')" 115 | ] 116 | }, 117 | { 118 | "cell_type": "markdown", 119 | "id": "c625af64", 120 | "metadata": { 121 | "id": "c625af64" 122 | }, 123 | "source": [ 124 | "### Custom Color-Coded Logging Configuration\n", 125 | "This cell sets up a custom logging configuration with color-coded output for different log levels, enhancing readability for various messages.\n", 126 | "\n", 127 | "The `CustomFormatter` class applies specific colors to log levels (DEBUG, INFO, WARNING, ERROR, CRITICAL) and resets colors after each log message.\n", 128 | "\n", 129 | "A `StreamHandler` is added to the root logger with this custom formatter, displaying messages in the notebook's output stream.\n" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 17, 135 | "id": "98ec7066", 136 | "metadata": { 137 | "id": "98ec7066" 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "import logging\n", 142 | "\n", 143 | "# Set up colored logging\n", 144 | "class CustomFormatter(logging.Formatter):\n", 145 | " COLORS = {\n", 146 | " 'DEBUG': '\\033[94m', # Blue\n", 147 | " 'INFO': '\\033[92m', # Green\n", 148 | " 'WARNING': '\\033[93m', # Yellow\n", 149 | " 'ERROR': '\\033[91m', # Red\n", 150 | " 'CRITICAL': '\\033[95m' # Magenta\n", 151 | " }\n", 152 | " RESET = '\\033[0m'\n", 153 | " FORMAT = \"[%(levelname)s] %(message)s\"\n", 154 | "\n", 155 | " def format(self, record):\n", 156 | " log_color = self.COLORS.get(record.levelname, self.RESET)\n", 157 | " log_fmt = log_color + self.FORMAT + self.RESET\n", 158 | " formatter = logging.Formatter(log_fmt)\n", 159 | " return formatter.format(record)\n", 160 | "\n", 161 | "# Configure the root logger\n", 162 | "logger = logging.getLogger()\n", 163 | "logger.setLevel(logging.INFO)\n", 164 | "\n", 165 | "ch = logging.StreamHandler()\n", 166 | "ch.setLevel(logging.INFO)\n", 167 | "ch.setFormatter(CustomFormatter())\n", 168 | "\n", 169 | "# Add handler if not already added\n", 170 | "if not logger.hasHandlers():\n", 171 | " logger.addHandler(ch)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "id": "e732d54d", 177 | "metadata": { 178 | "id": "e732d54d" 179 | }, 180 | "source": [ 181 | "### Step 1: Finding the Relevant Page\n", 182 | "The function `find_relevant_page_via_map` takes an objective and a website URL. 
It then uses the Anthropic client to generate search parameters for the Firecrawl API to map the website and identify relevant pages based on the objective.\n" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 18, 188 | "id": "13f88d4a", 189 | "metadata": { 190 | "id": "13f88d4a" 191 | }, 192 | "outputs": [], 193 | "source": [ 194 | "# Find the page that most likely contains the objective using Firecrawl's Map\n", 195 | "def find_relevant_page_via_map(objective, url, app, client):\n", 196 | " \"\"\"\n", 197 | " Identifies the page most likely to contain the specified objective using Firecrawl's Map.\n", 198 | "\n", 199 | " Args:\n", 200 | " objective (str): The objective to search for within the website pages.\n", 201 | " url (str): The base URL of the website to be crawled.\n", 202 | " app (object): The application instance for conducting the crawl.\n", 203 | " client (object): The client used to make requests to the pages.\n", 204 | "\n", 205 | " Returns:\n", 206 | " str or None: Returns the URL of the page that most likely contains the objective if found; otherwise, returns None.\n", 207 | " \"\"\"\n", 208 | " try:\n", 209 | " logger.info(f\"{Colors.CYAN}Objective: {objective}{Colors.RESET}\")\n", 210 | " logger.info(f\"{Colors.CYAN}Initiating search on the website: {url}{Colors.RESET}\")\n", 211 | "\n", 212 | " map_prompt = f\"\"\"\n", 213 | " The map function generates a list of URLs from a website and accepts a search parameter.\n", 214 | " Based on the objective of: {objective}, suggest a 1-2 word search parameter.\n", 215 | " \"\"\"\n", 216 | "\n", 217 | " completion = client.messages.create(\n", 218 | " model='claude-3-5-sonnet-20241022',\n", 219 | " max_tokens=1000,\n", 220 | " temperature=0,\n", 221 | " system=\"Expert web crawler\",\n", 222 | " messages=[{'role': 'user', 'content': map_prompt}]\n", 223 | " )\n", 224 | "\n", 225 | " map_search_parameter = completion.content[0].text\n", 226 | " map_website = app.map_url(url, params={'search': map_search_parameter})\n", 227 | "\n", 228 | " logger.info(f\"{Colors.GREEN}Mapping completed. Links found: {len(map_website['links'])}{Colors.RESET}\")\n", 229 | " return map_website['links']\n", 230 | " except Exception as e:\n", 231 | " logger.info(f\"{Colors.RED}Error: {str(e)}{Colors.RESET}\")\n", 232 | " return None" 233 | ] 234 | }, 235 | { 236 | "cell_type": "markdown", 237 | "id": "c49f669d", 238 | "metadata": { 239 | "id": "c49f669d" 240 | }, 241 | "source": [ 242 | "### Step 2: Examining Top Pages using Firewcrawl's [Map](https://docs.firecrawl.dev/features/map)\n", 243 | "The function `find_objective_in_top_pages` examines the top pages from the website map, attempting to fulfill the user's objective using scraped content. If the objective is met, it returns the relevant data in JSON format.\n", 244 | "\n", 245 | "**Note:** Firecrawl's Map Response will be an ordered list from the most relevant to the least relevant. 
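Because the list comes back already ranked, the end-to-end flow of this notebook can be sketched in a few lines. The objective and URL here are placeholders, and both helper functions are the ones defined in the surrounding cells:

```python
# Hypothetical inputs -- replace with your own objective and target site.
objective = "Find the contact email for customer support"
url = "https://example.com"

# Map the site and get a ranked list of candidate links (Step 1).
links = find_relevant_page_via_map(objective, url, app, client)

# Scrape the top-ranked pages and try to satisfy the objective (Step 2).
if links:
    extracted = find_objective_in_top_pages(links, objective, app, client)
    print(extracted)  # a JSON-like dict if the objective was met, otherwise None
```

Returning to how that ranked list is used: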
By selecting only the first two elements (`[:2]`), the function focuses on analyzing just the two most relevant pages identified during the mapping stage.\n" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 19, 251 | "id": "812fd739", 252 | "metadata": { 253 | "id": "812fd739" 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "def find_objective_in_top_pages(map_website, objective, app, client):\n", 258 | " \"\"\"\n", 259 | " Scrapes the top 2 pages returned by the map step to check if the specified objective is met.\n", 260 | "\n", 261 | " Args:\n", 262 | " map_website (list): The list of URLs returned by the map step.\n", 263 | " objective (str): The objective to look for within the pages.\n", 264 | " app (object): The FirecrawlApp instance used for batch scraping.\n", 265 | " client (object): The Anthropic client used to analyze the scraped content.\n", 266 | "\n", 267 | " Returns:\n", 268 | " dict or None: Returns a JSON object if the objective is found within the top 2 pages; otherwise, returns None.\n", 269 | " \"\"\"\n", 270 | " try:\n", 271 | " # Get top 2 links from the map result\n", 272 | " top_links = map_website[:2]\n", 273 | " logger.info(f\"{Colors.CYAN}Analyzing the {len(top_links)} top links: {top_links}{Colors.RESET}\")\n", 274 | "\n", 275 | " batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})\n", 276 | " logger.info(f\"{Colors.GREEN}Batch scraping completed.{Colors.RESET}\")\n", 277 | "\n", 278 | " for scrape_result in batch_scrape_result['data']:\n", 279 | " check_prompt = f\"\"\"\n", 280 | " Given the following scraped content and objective, determine if the objective is met.\n", 281 | " If it is, extract the relevant information in a simple and concise JSON format. Use only the necessary fields and avoid nested structures if possible.\n", 282 | " If the objective is not met with confidence, respond with 'Objective not met'.\n", 283 | "\n", 284 | " Objective: {objective}\n", 285 | " Scraped content: {scrape_result['markdown']}\n", 286 | "\n", 287 | " Remember:\n", 288 | " 1. Only return JSON if you are confident the objective is fully met.\n", 289 | " 2. Keep the JSON structure as simple and flat as possible.\n", 290 | " 3. Do not include any explanations or markdown formatting in your response.\n", 291 | " \"\"\"\n", 292 | "\n", 293 | " completion = client.messages.create(\n", 294 | " model=\"claude-3-5-sonnet-20241022\",\n", 295 | " max_tokens=1000,\n", 296 | " temperature=0,\n", 297 | " system=\"You are an expert web crawler. Respond with the relevant information in JSON format.\",\n", 298 | " messages=[\n", 299 | " {\n", 300 | " \"role\": \"user\",\n", 301 | " \"content\": [\n", 302 | " {\n", 303 | " \"type\": \"text\",\n", 304 | " \"text\": check_prompt\n", 305 | " }\n", 306 | " ]\n", 307 | " }\n", 308 | " ]\n", 309 | " )\n", 310 | "\n", 311 | " result = completion.content[0].text\n", 312 | " if result and result != 'Objective not met':\n", 313 | " try:\n", 314 | " return json.loads(result)\n", 315 | " except json.JSONDecodeError as e:\n", 316 | " logger.info(f\"{Colors.RED}JSON parsing error: {e}. 
Raw result: {result}{Colors.RESET}\")\n", 317 | " continue # Skip to the next result if parsing fails\n", 318 | "\n", 319 | " logger.info(f\"{Colors.RED}Objective not met in examined content.{Colors.RESET}\")\n", 320 | " return None\n", 321 | " except Exception as e:\n", 322 | " logger.info(f\"{Colors.RED}Error during analysis: {str(e)}{Colors.RESET}\")\n", 323 | " return None" 324 | ] 325 | }, 326 | { 327 | "cell_type": "markdown", 328 | "id": "l3WzoekQEFCq", 329 | "metadata": { 330 | "id": "l3WzoekQEFCq" 331 | }, 332 | "source": [ 333 | "### Step 3: Find and Extract Information\n", 334 | "\n", 335 | "This function aims to find and extract information related to a given `objective` from the top-ranked pages of a website.\n", 336 | "\n", 337 | "**Functionality:**\n", 338 | "\n", 339 | "1. **Selects Top Links:** It selects the top two URLs from the `map_website` list, assuming they are the most relevant to the objective.\n", 340 | "2. **Scrapes Content:** It uses the `app.batch_scrape_urls` function to scrape content from these selected URLs in Markdown format.\n", 341 | "3. **Analyzes Content:** For each scraped page, it constructs a prompt for the Anthropic Claude model. This prompt asks the model to determine if the scraped content fulfills the `objective`. If it does, the model is asked to extract the relevant information and format it as JSON.\n", 342 | "4. **Extracts JSON:** The function uses a regular expression to identify JSON-like blocks within the Anthropic model's response and parses the first match with `json.loads`, moving on to the next page if parsing fails." 343 | ] 344 | }, 345 | { 346 | "cell_type": "markdown", 347 | "id": "Kc31wkA3EEtl", 348 | "metadata": { 349 | "id": "Kc31wkA3EEtl" 350 | }, 351 | "source": [] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 20, 356 | "id": "02DvK01wELut", 357 | "metadata": { 358 | "id": "02DvK01wELut" 359 | }, 360 | "outputs": [], 361 | "source": [ 362 | "def find_objective_in_top_pages(map_website, objective, app, client):\n", 363 | " \"\"\"\n", 364 | " Scrapes the top 2 pages returned by the map step to determine if the specified objective is present.\n", 365 | "\n", 366 | " Args:\n", 367 | " map_website (list): The list of mapped URLs that guides the scraping.\n", 368 | " objective (str): The objective or target content to search for on the pages.\n", 369 | " app (object): The FirecrawlApp instance used for executing the scraping process.\n", 370 | " client (object): The Anthropic client used to analyze the scraped content.\n", 371 | "\n", 372 | " Returns:\n", 373 | " dict or None: Returns a JSON object containing the found objective details if located on one of the top 2 pages; otherwise, returns None.\n", 374 | " \"\"\"\n", 375 | " try:\n", 376 | " top_links = map_website[:2]\n", 377 | " logger.info(f\"{Colors.CYAN}Analyzing top links: {top_links}{Colors.RESET}\")\n", 378 | "\n", 379 | " batch_scrape_result = app.batch_scrape_urls(top_links, {'formats': ['markdown']})\n", 380 | " logger.info(f\"{Colors.GREEN}Batch scraping completed.{Colors.RESET}\")\n", 381 | "\n", 382 | " # Regex pattern to match JSON-like blocks in the response\n", 383 | " json_pattern = r\"\\{(?:[^{}]|(?:\\{[^{}]*\\}))*\\}\"\n", 384 | "\n", 385 | " for scrape_result in batch_scrape_result['data']:\n", 386 | " check_prompt = f\"\"\"\n", 387 | " Given scraped content and objective, determine if the objective is met.\n", 388 | " Extract relevant information in simple JSON if met.\n", 389 | " Objective: {objective}\n", 390 | " Scraped content: {scrape_result['markdown']}\n", 391 | " \"\"\"\n", 392 | "\n", 393 | " completion = client.messages.create(\n", 394 
| " model='claude-3-5-sonnet-20241022',\n", 395 | " max_tokens=1000,\n", 396 | " temperature=0,\n", 397 | " system=\"Expert web crawler\",\n", 398 | " messages=[{'role': 'user', 'content': check_prompt}]\n", 399 | " )\n", 400 | "\n", 401 | " result = completion.content[0].text\n", 402 | " # Search for JSON-like block in the result text\n", 403 | " json_match = re.search(json_pattern, result, re.DOTALL)\n", 404 | " if json_match:\n", 405 | " try:\n", 406 | " return json.loads(json_match.group(0))\n", 407 | " except json.JSONDecodeError as e:\n", 408 | " logger.info(f\"{Colors.RED}JSON parsing error: {e}. Raw result: {json_match.group(0)}{Colors.RESET}\")\n", 409 | " continue # Skip to the next result if parsing fails\n", 410 | " else:\n", 411 | " logger.info(f\"{Colors.YELLOW}No JSON found in the response. Raw result: {result}{Colors.RESET}\")\n", 412 | "\n", 413 | " logger.info(f\"{Colors.RED}Objective not met in examined content.{Colors.RESET}\")\n", 414 | " return None\n", 415 | " except Exception as e:\n", 416 | " logger.info(f\"{Colors.RED}Error during analysis: {str(e)}{Colors.RESET}\")\n", 417 | " return None" 418 | ] 419 | }, 420 | { 421 | "cell_type": "markdown", 422 | "id": "9fc4cef6", 423 | "metadata": { 424 | "id": "9fc4cef6" 425 | }, 426 | "source": [ 427 | "### Step 4: Executing the Main Function\n", 428 | "The main function prompts for user input (website URL and objective), calls the `find_relevant_page_via_map` and `find_objective_in_top_pages` functions, and displays results accordingly.\n" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 21, 434 | "id": "e3721623", 435 | "metadata": { 436 | "id": "e3721623" 437 | }, 438 | "outputs": [], 439 | "source": [ 440 | "# Main function to execute the process\n", 441 | "def main():\n", 442 | " url = input(f\"{Colors.BLUE}Enter website URL:{Colors.RESET}\") or \"https://www.firecrawl.dev/\"\n", 443 | " objective = input(f\"{Colors.BLUE}Enter objective:{Colors.RESET}\") or \"find pricing plans\"\n", 444 | "\n", 445 | " map_website = find_relevant_page_via_map(objective, url, app, client)\n", 446 | "\n", 447 | " if map_website:\n", 448 | " result = find_objective_in_top_pages(map_website, objective, app, client)\n", 449 | " if result:\n", 450 | " logger.info(f\"{Colors.GREEN}Objective met. 
Extracted info:{Colors.RESET}\")\n", 451 | " logger.info(f\"{Colors.MAGENTA}{json.dumps(result, indent=2)}{Colors.RESET}\")\n", 452 | " else:\n", 453 | " logger.info(f\"{Colors.RED}Objective not fulfilled with available content.{Colors.RESET}\")\n", 454 | " else:\n", 455 | " logger.info(f\"{Colors.RED}No relevant pages identified.{Colors.RESET}\")" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": 22, 461 | "id": "0cr6H3nlBSMG", 462 | "metadata": { 463 | "colab": { 464 | "base_uri": "https://localhost:8080/" 465 | }, 466 | "id": "0cr6H3nlBSMG", 467 | "outputId": "452e3ae2-fd24-44a9-d0e1-8086f1204ef2" 468 | }, 469 | "outputs": [ 470 | { 471 | "name": "stdout", 472 | "output_type": "stream", 473 | "text": [ 474 | "\u001b[94mEnter website URL:\u001b[0mhttps://www.firecrawl.dev/\n", 475 | "\u001b[94mEnter objective:\u001b[0myes or no: is firecrawl backed by y combinator?\n" 476 | ] 477 | }, 478 | { 479 | "output_type": "stream", 480 | "name": "stderr", 481 | "text": [ 482 | "INFO:root:\u001b[96mObjective: yes or no: is firecrawl backed by y combinator?\u001b[0m\n", 483 | "INFO:root:\u001b[96mInitiating search on the website: https://www.firecrawl.dev/\u001b[0m\n", 484 | "INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages \"HTTP/1.1 200 OK\"\n", 485 | "INFO:root:\u001b[92mMapping completed. Links found: 42\u001b[0m\n", 486 | "INFO:root:\u001b[96mAnalyzing top links: ['https://www.firecrawl.dev/blog/your-ip-has-been-temporarily-blocked-or-banned', 'https://www.firecrawl.dev/blog/how-to-quickly-install-beautifulsoup-with-python']\u001b[0m\n", 487 | "INFO:root:\u001b[92mBatch scraping completed.\u001b[0m\n", 488 | "INFO:httpx:HTTP Request: POST https://api.anthropic.com/v1/messages \"HTTP/1.1 200 OK\"\n", 489 | "INFO:root:\u001b[92mObjective met. Extracted info:\u001b[0m\n", 490 | "INFO:root:\u001b[95m{\n", 491 | " \"can_determine\": false,\n", 492 | " \"reason\": \"No mention of Y Combinator backing in the scraped content\"\n", 493 | "}\u001b[0m\n" 494 | ] 495 | } 496 | ], 497 | "source": [ 498 | "main()" 499 | ] 500 | } 501 | ], 502 | "metadata": { 503 | "colab": { 504 | "provenance": [], 505 | "include_colab_link": true 506 | }, 507 | "kernelspec": { 508 | "display_name": "Python 3", 509 | "name": "python3" 510 | }, 511 | "language_info": { 512 | "name": "python" 513 | } 514 | }, 515 | "nbformat": 4, 516 | "nbformat_minor": 5 517 | } --------------------------------------------------------------------------------